syntaxmatrix-1.4.6-py3-none-any.whl → syntaxmatrix-2.5.5.4-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- syntaxmatrix/__init__.py +13 -8
- syntaxmatrix/agentic/__init__.py +0 -0
- syntaxmatrix/agentic/agent_tools.py +24 -0
- syntaxmatrix/agentic/agents.py +810 -0
- syntaxmatrix/agentic/code_tools_registry.py +37 -0
- syntaxmatrix/agentic/model_templates.py +1790 -0
- syntaxmatrix/auth.py +308 -14
- syntaxmatrix/commentary.py +328 -0
- syntaxmatrix/core.py +993 -375
- syntaxmatrix/dataset_preprocessing.py +218 -0
- syntaxmatrix/db.py +92 -95
- syntaxmatrix/display.py +95 -121
- syntaxmatrix/generate_page.py +634 -0
- syntaxmatrix/gpt_models_latest.py +46 -0
- syntaxmatrix/history_store.py +26 -29
- syntaxmatrix/kernel_manager.py +96 -17
- syntaxmatrix/llm_store.py +1 -1
- syntaxmatrix/plottings.py +6 -0
- syntaxmatrix/profiles.py +64 -8
- syntaxmatrix/project_root.py +55 -43
- syntaxmatrix/routes.py +5072 -1398
- syntaxmatrix/session.py +19 -0
- syntaxmatrix/settings/logging.py +40 -0
- syntaxmatrix/settings/model_map.py +300 -33
- syntaxmatrix/settings/prompts.py +273 -62
- syntaxmatrix/settings/string_navbar.py +3 -3
- syntaxmatrix/static/docs.md +272 -0
- syntaxmatrix/static/icons/favicon.png +0 -0
- syntaxmatrix/static/icons/hero_bg.jpg +0 -0
- syntaxmatrix/templates/dashboard.html +608 -147
- syntaxmatrix/templates/docs.html +71 -0
- syntaxmatrix/templates/error.html +2 -3
- syntaxmatrix/templates/login.html +1 -0
- syntaxmatrix/templates/register.html +1 -0
- syntaxmatrix/ui_modes.py +14 -0
- syntaxmatrix/utils.py +2482 -159
- syntaxmatrix/vectorizer.py +16 -12
- {syntaxmatrix-1.4.6.dist-info → syntaxmatrix-2.5.5.4.dist-info}/METADATA +20 -17
- syntaxmatrix-2.5.5.4.dist-info/RECORD +68 -0
- syntaxmatrix/model_templates.py +0 -30
- syntaxmatrix/static/icons/favicon.ico +0 -0
- syntaxmatrix-1.4.6.dist-info/RECORD +0 -54
- {syntaxmatrix-1.4.6.dist-info → syntaxmatrix-2.5.5.4.dist-info}/WHEEL +0 -0
- {syntaxmatrix-1.4.6.dist-info → syntaxmatrix-2.5.5.4.dist-info}/licenses/LICENSE.txt +0 -0
- {syntaxmatrix-1.4.6.dist-info → syntaxmatrix-2.5.5.4.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,810 @@
+# syntaxmatrix/agents.py
+from __future__ import annotations
+import os, re, json, textwrap, requests
+import pandas as pd
+
+from typing import Optional, List
+
+from syntaxmatrix import utils
+from syntaxmatrix.settings.model_map import GPT_MODELS_LATEST
+from .. import profiles as _prof
+from ..gpt_models_latest import set_args as _set_args, extract_output_text as _out
+from google.genai import types
+import tiktoken
+from google.genai.errors import APIError
+
+
+def token_calculator(total_input_content, llm_profile):
+
+    _client = llm_profile["client"]
+    _model = llm_profile["model"]
+    _provider = llm_profile["provider"].lower()
+
+    if _provider == "google":
+        tok = _client.models.count_tokens(
+            model=_model,
+            contents=total_input_content
+        )
+        input_prompt_tokens = tok.total_tokens
+        return input_prompt_tokens
+
+    elif _provider == "anthropic":
+        tok = _client.beta.messages.count_tokens(
+            model=_model,
+            system="calculate the total token for the given prompt",
+            messages=[{"role": "user", "content": total_input_content}]
+        )
+        input_prompt_tokens = tok.input_tokens
+        return input_prompt_tokens
+
+    else:
+        enc = tiktoken.encoding_for_model(_model)
+        input_prompt_tokens = len(enc.encode(total_input_content))
+        return input_prompt_tokens
+
+def mlearning_agent(user_prompt, system_prompt, coding_profile, temperature=0.1, max_tokens=4096):
+    """
+    Returns:
+      (text, usage_dict)
+
+    usage_dict schema (best-effort, depending on provider):
+    {
+        "provider": str,
+        "model": str,
+        "input_tokens": int|None,
+        "output_tokens": int|None,
+        "total_tokens": int|None,
+        "error": str|None
+    }
+    """
+
+    # coding_profile['client'] = _prof.get_client(coding_profile)
+    _client = coding_profile["client"]
+    _provider = coding_profile["provider"].lower()
+    _model = coding_profile["model"]
+
+    usage = {
+        "provider": _provider,
+        "model": _model,
+        "input_tokens": None,
+        "output_tokens": None,
+        "total_tokens": None,
+    }
+
+    def _clean_text(t):
+        if t is None:
+            return ""
+        if not isinstance(t, str):
+            t = str(t)
+        return t.strip()
+
+    def _get_usage_val(u, keys):
+        """Read usage fields from dicts or objects, resiliently."""
+        if u is None:
+            return None
+        for k in keys:
+            try:
+                if isinstance(u, dict) and k in u:
+                    return u[k]
+                if hasattr(u, k):
+                    return getattr(u, k)
+            except Exception:
+                continue
+        return None
+
+    # Google
+    def google_generate_code():
+        nonlocal usage
+        """
+        Generates content using the Gemini API and calculates token usage
+        including Context Overhead for consistency.
+        """
+
+        try:
+            # 1. Client Initialization
+            config = types.GenerateContentConfig(
+                system_instruction=system_prompt,
+                temperature=temperature,
+                max_output_tokens=max_tokens,
+            )
+
+            # 2. API Call
+            resp = _client.models.generate_content(
+                model=_model,
+                contents=[user_prompt],
+                config=config,
+            )
+
+            # 3. Token Usage Capture and Context Overhead Calculation
+            um = resp.usage_metadata
+            usage["input_tokens"] = um.prompt_token_count
+            usage["output_tokens"] = um.thoughts_token_count
+            usage["total_tokens"] = um.total_token_count
+
+            # 4. Response Extraction (same robust logic as before)
+            text = getattr(resp, "text", None)
+            if isinstance(text, str) and text.strip():
+                return text.strip()
+
+            chunks = []
+            candidates = getattr(resp, "candidates", None) or []
+            for cand in candidates:
+                content = getattr(cand, "content", None)
+                if content:
+                    parts = getattr(content, "parts", None) or []
+                    for part in parts:
+                        t = getattr(part, "text", None)
+                        if t:
+                            chunks.append(str(t))
+
+            text = "\n".join(chunks).strip()
+            if text:
+                return text
+
+            # 5. Handle blocked response
+            fb = getattr(resp, "prompt_feedback", None)
+            block_reason = getattr(fb, "block_reason", None) if fb else None
+            if block_reason and block_reason != types.BlockedReason.REASON_UNSPECIFIED:
+                raise RuntimeError(f"{_model} blocked the response. Reason: {block_reason.name}")
+            raise RuntimeError(f"{_model} failed to return content due to insufficient data.")
+
+        except APIError as e:
+            error_msg = f"Gemini API Error: {e}"
+
+        except Exception as e:
+            error_msg = f"An unexpected error occurred during API call or processing: {e}"
+
+        # --- Return the error message wrapped in the required output code structure ---
+        msg = f"I smxAI have instructed {error_msg}\n"
+        return (
+            f"# {msg}\n"
+            "from syntaxmatrix.display import show\n"
+            f"show({msg!r})\n"
+        )
+
+    # OpenAI Responses API
+    def gpt_models_latest_generate_code():
+        nonlocal usage
+
+        def reasoning_and_verbosity():
+            reasoning_effort, verbosity = "medium", "medium"
+            if _model == "gpt-5-nano":
+                reasoning_effort, verbosity = "low", "low"
+            elif _model in ["gpt-5-mini", "gpt-5-codex-mini"]:
+                reasoning_effort, verbosity = "medium", "medium"
+            elif _model in ["gpt-5", "gpt-5-codex", "gpt-5-pro"]:
+                reasoning_effort, verbosity = "high", "high"
+            return (reasoning_effort, verbosity)
+        try:
+            args = _set_args(
+                model=_model,
+                instructions=system_prompt,
+                input=user_prompt,
+                previous_id=None,
+                store=False,
+                reasoning_effort=reasoning_and_verbosity()[0],
+                verbosity=reasoning_and_verbosity()[1],
+            )
+            resp = _client.responses.create(**args)
+
+            um = resp.usage
+            usage["input_tokens"] = um.input_tokens
+            usage["output_tokens"] = um.output_tokens
+            usage["total_tokens"] = um.total_tokens
+
+            code = _out(resp).strip()
+            if code:
+                return code
+
+            # Try to surface any block reason (safety / policy / etc.)
+            block_reason = None
+            output = resp.get("output")
+            for item in output:
+                fr = getattr(item, "finish_reason", None)
+                if fr and fr != "stop":
+                    block_reason = fr
+                    break
+            if block_reason:
+                raise RuntimeError(f"{_model} stopped with reason: {block_reason}")
+            raise RuntimeError(f"{_model} returned an empty response in this section due to insufficient data.")
+
+        except APIError as e:
+            # IMPORTANT: return VALID PYTHON so the dashboard can show the error
+            msg = f"I smxAI have instructed {e}"
+            return (
+                f"# {msg}\n"
+                "from syntaxmatrix.display import show\n"
+                f"show({msg!r})\n"
+            )
+
+        except Exception as e:
+            # IMPORTANT: return VALID PYTHON so the dashboard can show the error
+            msg = f"I smxAI have instructed {e}"
+            return (
+                f"# {msg}\n"
+                "from syntaxmatrix.display import show\n"
+                f"show({msg!r})\n"
+            )
+
+    # Anthropic
+    def anthropic_generate_code():
+        nonlocal usage
+        try:
+            resp = _client.messages.create(
+                model=_model,
+                max_tokens=max_tokens,
+                temperature=temperature,
+                system=system_prompt,
+                messages=[
+                    {"role": "user", "content": user_prompt}
+                ]
+            )
+
+            um = resp.usage
+            usage["input_tokens"] = um.input_tokens
+            usage["output_tokens"] = um.output_tokens
+            usage["total_tokens"] = um.input_tokens + um.output_tokens
+
+            # Extract plain text from Claude-style content blocks
+            text_blocks = []
+            content = getattr(resp, "content", None) or []
+            for block in content:
+                t = getattr(block, "text", None)
+                if not t and isinstance(block, dict):
+                    t = (block.get("text") or "").strip()
+                if t:
+                    text_blocks.append(str(t))
+
+            text = "\n".join(text_blocks).strip()
+            if text:
+                return text
+
+            stop_reason = getattr(resp, "stop_reason", None)
+            if stop_reason and stop_reason != "end_turn":
+                raise RuntimeError(f"{_model} stopped with reason: {stop_reason}")
+            raise RuntimeError(f"{_model} returned an empty response in this section due to insufficient data.")
+
+        except Exception as e:
+            msg = f"I smxAI have instructed {e}\n"
+            return (
+                f"# {msg}\n"
+                "from syntaxmatrix.display import show\n"
+                f"show({msg!r})\n"
+            )
+
+    # OpenAI Chat Completions
+    def openai_sdk_generate_code():
+        nonlocal usage
+        try:
+            resp = _client.chat.completions.create(
+                model=_model,
+                messages=[
+                    {"role": "system", "content": system_prompt},
+                    {"role": "user", "content": user_prompt},
+                ],
+                temperature=temperature,
+                max_tokens=max_tokens,
+            )
+
+
+
+            um = resp.usage
+            usage["input_tokens"] = um.prompt_tokens
+            usage["output_tokens"] = um.completion_tokens
+            usage["total_tokens"] = um.total_tokens
+
+            text = resp.choices[0].message.content
+            if text:
+                return text
+
+            # Try to surface any block reason (safety / policy / etc.)
+            block_reason = None
+            choices = getattr(resp, "choices", None) or []
+            if choices:
+                first = choices[0]
+                fr = getattr(first, "finish_reason", None)
+                if fr and fr != "stop":
+                    block_reason = fr
+
+            if block_reason:
+                raise RuntimeError(f"{_model} stopped with reason: {block_reason}")
+            # Fallback: nothing useful came back
+            raise RuntimeError(f"{_model} returned nothing in this section due to insufficient data.")
+
+        except Exception as e:
+            # IMPORTANT: return VALID PYTHON so the dashboard can show the error
+            msg = f"I smxAI have instructed {e}"
+            return (
+                f"# {msg}\n"
+                "from syntaxmatrix.display import show\n"
+                f"show({msg!r})\n"
+            )
+
+    # print("TTOOKKEENN: ", token_calculator(system_prompt + user_prompt, coding_profile))
+
+    if _provider == "google":
+        code = google_generate_code()
+    elif _provider == "openai" and _model in GPT_MODELS_LATEST:
+        code = gpt_models_latest_generate_code()
+    elif _provider == "anthropic":
+        code = anthropic_generate_code()
+    else:
+        code = openai_sdk_generate_code()
+
+    return code, usage
+
+
+def refine_question_agent(raw_question: str, dataset_context: str | None = None) -> str:
+
+    def response_agent(user_prompt, system_prompt, llm_profile, temp=0.0, max_tokens=128):
+        _profile = llm_profile
+
+        _client = _profile["client"]
+        _provider = _profile["provider"].lower()
+        _model = _profile["model"]
+
+        # Google GenAI
+        if _provider == "google":
+            resp = _client.models.generate_content(
+                model=_model,
+                contents=system_prompt + "\n\n" + user_prompt,
+            )
+            text = resp.text
+            return text.strip()
+
+        # OpenAI
+        elif _provider == "openai" and _model in GPT_MODELS_LATEST:
+
+            def reasoning_and_verbosity():
+                reasoning_effort, verbosity = "medium", "medium"
+                if _model == "gpt-5-nano":
+                    if max_tokens <= 256:
+                        reasoning_effort = "minimal"
+                    else: reasoning_effort = "low"
+                elif _model in ["gpt-5-mini", "gpt-5-codex-mini"]:
+                    verbosity = "medium"
+                elif _model in ["gpt-5", "gpt-5-codex", "gpt-5-pro"]:
+                    reasoning_effort = "high"
+                    verbosity = "high"
+                return (reasoning_effort, verbosity)
+
+            args = _set_args(
+                model=_model,
+                instructions=system_prompt,
+                input=user_prompt,
+                previous_id=None,
+                store=False,
+                reasoning_effort=reasoning_and_verbosity()[0],
+                verbosity=reasoning_and_verbosity()[1],
+            )
+            resp = _client.responses.create(**args)
+            txt = _out(resp)
+            return txt
+
+        # Anthropic
+        elif _provider == "anthropic":
+            try:
+                resp = _client.messages.create(
+                    model=_model,
+                    system=system_prompt,
+                    messages=[{"role": "user", "content": user_prompt}],
+                    temperature=0.2,
+                    max_tokens= max_tokens,
+                )
+
+                # Extract plain text from Claude's content blocks
+                text = ""
+                content = getattr(resp, "content", None)
+                if content and isinstance(content, list):
+                    parts = []
+                    for block in content:
+                        # blocks typically like {"type": "text", "text": "..."}
+                        t = getattr(block, "text", None)
+                        if not t and isinstance(block, dict):
+                            t = block.get("text")
+                        if t:
+                            parts.append(t)
+                    text = " ".join(parts)
+                return text
+            except Exception:
+                pass
+
+        # OpenAI SDK Compartible (Chat Completions)
+        else:
+            resp = _client.chat.completions.create(
+                model=_model,
+                messages=[
+                    {"role": "system", "content": system_prompt},
+                    {"role": "user", "content": user_prompt},
+                ],
+                temperature=temp,
+                max_tokens=max_tokens,
+            )
+            text = resp.choices[0].message.content
+            return text
+
+        return "Configure LLM Profiles or contact your administrator."
+
+    system_prompt = (
+        "You rewrite user questions into specification Machine Learning (ML) job description. "
+        "If a dataset summary is provided, use it to respect column and help you redefine the question. "
+        "DO NOT write andy prelude or preamble"
+    )
+
+    user_prompt = f"User question:\n{raw_question}\n\n"
+    if dataset_context:
+        user_prompt += f"Dataset summary:\n{dataset_context}\n"
+
+    _refiner_profile = _prof.get_profile("classification") or _prof.get_profile("admin")
+    if not _refiner_profile:
+        return "ERROR"
+
+    _refiner_profile['client'] = _prof.get_client(_refiner_profile)
+
+    refined_question = response_agent(user_prompt, system_prompt, _refiner_profile, temp=0.0, max_tokens=128)
+    return refined_question
+
+
+def classify_ml_job_agent(refined_question, dataset_profile):
+    """
+    Instructs an LLM (gemini-2.5-flash) to analyze a task description
+    and return a list of associated machine learning job/task types.
+    This version uses a highly extensive, generalized list of ML jobs
+    to ensure robustness across all domains (NLP, CV, RL, etc.).
+
+    Args:
+        task_description: The detailed description of the statistical/ML task.
+
+    Returns:
+        A list of strings identifying the relevant ML jobs. Returns an empty
+        list if the API call fails or the output cannot be parsed.
+    """
+
+    def ml_response(user_prompt, system_prompt, profile):
+        _profile = profile # _prof.get_profile["admin"]
+
+        _client = _profile["client"]
+        _provider = _profile["provider"].lower()
+        _model = _profile["model"]
+
+        prompt = user_prompt + "\n\n" + system_prompt
+
+        # Google GenAI
+        if _provider == "google":
+            from google.genai.errors import APIError
+
+            config=dict(
+                temperature=0.0,
+                response_mime_type="application/json",
+                # Enforcing a JSON array of strings structure for reliable parsing
+                response_schema={
+                    "type": "array",
+                    "items": {"type": "string"}
+                }
+            )
+            try:
+                response = _client.models.generate_content(
+                    model=_model,
+                    contents=prompt,
+                    config=config,
+                )
+                json_string = response.text.strip()
+                ml_jobs = json.loads(json_string)
+
+                if not isinstance(ml_jobs, list) or not all(isinstance(job, str) for job in ml_jobs):
+                    return []
+                return ml_jobs
+
+            except APIError as e:
+                return [f"An API error occurred: {e}"]
+            except json.JSONDecodeError as e:
+                if 'response' in locals():
+                    return [f"Raw response text: {response.text}"]
+            except Exception as e:
+                return [f"An unexpected error occurred: {e}"]
+
+        elif _provider == "openai" and _model in GPT_MODELS_LATEST:
+
+            def reasoning_and_verbosity():
+                reasoning_effort, verbosity = "medium", "medium"
+                if _model == "gpt-5-nano":
+                    reasoning_effort = "low"
+                elif _model in ["gpt-5-mini", "gpt-5-codex-mini"]:
+                    verbosity = "medium"
+                elif _model in ["gpt-5", "gpt-5-codex", "gpt-5-pro"]:
+                    reasoning_effort = "high"
+                    verbosity = "high"
+                return (reasoning_effort, verbosity)
+
+            args = _set_args(
+                model=_model,
+                instructions=system_prompt,
+                input=user_prompt,
+                previous_id=None,
+                store=False,
+                reasoning_effort=reasoning_and_verbosity()[0],
+                verbosity=reasoning_and_verbosity()[1],
+            )
+            resp = _client.responses.create(**args)
+            txt = _out(resp)
+            return txt
+
+        elif _provider == "anthropic":
+            try:
+                resp = _client.messages.create(
+                    model=_model,
+                    system=system_prompt,
+                    messages=[{"role": "user", "content": user_prompt}],
+                    temperature=0.0,
+                    max_tokens= 128,
+                )
+
+                # Extract plain text from Claude's content blocks
+                text = ""
+                content = getattr(resp, "content", None)
+                if content and isinstance(content, list):
+                    parts = []
+                    for block in content:
+                        # blocks typically like {"type": "text", "text": "..."}
+                        t = getattr(block, "text", None)
+                        if not t and isinstance(block, dict):
+                            t = block.get("text")
+                        if t:
+                            parts.append(t)
+                    text = " ".join(parts)
+                return text
+            except Exception:
+                pass
+
+        else:
+            resp = _client.chat.completions.create(
+                model=_model,
+                messages=[
+                    {"role": "system", "content": system_prompt},
+                    {"role": "user", "content": user_prompt},
+                ],
+                temperature=0.0,
+                max_tokens=128,
+            )
+            text = resp.choices[0].message.content
+            return text
+
+        return "Configure LLM Profiles or contact your administrator."
+
+    system_prompt = (
+        "You are a strict machine learning task classifier for an ML workbench.\n"
+        "Your job is to label the user's task desc. with all relevant tags from a fixed list.\n\n"
+    )
+
+    # --- 1. Define the Master List of ML Tasks (Generalized) ---
+    ml_task_list = [
+        # Supervised Learning
+        "classification", "regression", "ranking", "object_detection", "image_segmentation",
+
+        # Unsupervised Learning
+        "clustering", "dimensionality_reduction", "anomaly_detection", "association_rule_mining",
+
+        # Sequential/Time Data
+        "time_series_forecasting", "sequence_labeling", "survival_analysis",
+
+        # Specialized Domains
+        "natural_language_processing", "computer_vision", "reinforcement_learning",
+        "generative_modeling", "causal_inference", "risk_modeling", "graph_analysis",
+
+        # Foundational/Pipeline Steps
+        "feature_engineering", "statistical_inference", "data_preprocessing",
+        "model_validation", "hyperparameter_tuning"
+    ]
+
+    # --- 2. Construct the Generalized Prompt for the LLM ---
+    task_description = refined_question
+
+    user_prompt = f"""
+    Analyze the following task description:
+    ---
+    {task_description}
+    ---
+
+    If the Dataset Profile is provided, use its info, together with the task description, to make your job types
+    Identify and select ALL job types from the provided, extensive list that are directly
+    relevant to achieving the goals outlined in the task description (either as the
+    core goal, prerequisites, or essential steps).
+
+    ML Jobs List: {', '.join(ml_task_list)}
+
+    Respond ONLY with a valid JSON array of strings containing the selected ML job names.
+    Example Response: ["natural_language_processing", "classification", "feature_engineering"]
+    """
+
+    if dataset_profile:
+        user_prompt += f"\nDataset profile:\n{dataset_profile}\n"
+
+    llm_profile = _prof.get_profile("classification") or _prof.get_profile("admin")
+    if not llm_profile:
+        return "ERROR"
+
+    llm_profile['client'] = _prof.get_client(llm_profile)
+
+    # Extract raw content
+    tasks = ml_response(user_prompt, system_prompt, llm_profile)
+    return tasks
+
+
+def text_formatter_agent(text):
+    """
+    Parses an ML job description using the Gemini API with Structured JSON Output.
+    """
+
+    def generate_formatted_report(data):
+        """
+        Generates a formatted string of the structured data in a clean,
+        document-like format mimicking the requested list structure.
+
+        Returns:
+            str: The complete formatted report as a string.
+        """
+        if not data:
+            return "No data to display."
+
+        output_lines = []
+
+        # --- Helper Functions ---
+        def clean_md(text):
+            """Removes markdown bold syntax."""
+            return text.replace("**", "")
+
+        def format_smart_list_item(prefix, item_text, width=80):
+            """
+            Content-agnostic list formatter.
+            Detects 'Header: Description' patterns and formats them inline.
+            Returns the formatted string.
+            """
+            cleaned = clean_md(item_text)
+
+            # Check for "Header: Description" pattern
+            # We look for a colon appearing early in the string (e.g., within first 60 chars)
+            colon_match = re.match(r"^([^:]{1,60}):\s*(.*)", cleaned, re.DOTALL)
+
+            if colon_match:
+                header = colon_match.group(1).strip()
+                description = colon_match.group(2).strip()
+
+                # Format: PREFIX HEADER: Description
+                full_line = f"{prefix} {header.upper()}: {description}\n"
+            else:
+                # Format: PREFIX Content
+                full_line = f"{prefix} {cleaned}\n"
+
+            # Calculate hanging indent (aligning with the start of the text after the prefix)
+            # Length of prefix + 1 space
+            indent_width = len(prefix) + 1
+            hanging_indent = " " * indent_width
+
+            return textwrap.fill(
+                full_line,
+                width=width,
+                subsequent_indent=hanging_indent
+            )
+
+        # --- Report Construction ---
+
+        # 1. Title
+        title = clean_md(data.get("project_title", "Project Report"))
+        output_lines.append("\n" + "=" * 80)
+        output_lines.append(f"{title.center(80)}")
+        output_lines.append("=" * 80 + "\n")
+
+        # 2. Project Goal
+        output_lines.append("PROJECT GOAL\n")
+        output_lines.append("-" * 12)
+        goal = clean_md(data.get("project_goal", ""))
+        output_lines.append(textwrap.fill(goal, width=80))
+        output_lines.append("") # Adds a blank line
+
+        # 3. Key Objectives
+        if data.get("key_objectives"):
+            output_lines.append("KEY OBJECTIVES & STRATEGIC INSIGHTS")
+            output_lines.append("-" * 35)
+            for item in data["key_objectives"]:
+                output_lines.append(format_smart_list_item("•", item))
+            output_lines.append("")
+
+        # 4. ML Tasks (Numbered List)
+        if data.get("ml_tasks"):
+            output_lines.append("ML EXECUTION TASKS")
+            output_lines.append("-" * 18)
+            for i, task in enumerate(data["ml_tasks"], 1):
+                # Using i. as prefix
+                output_lines.append(format_smart_list_item(f"{i}.", task))
+            output_lines.append("")
+
+        # 5. Deliverables
+        if data.get("expected_deliverables"):
+            output_lines.append("EXPECTED DELIVERABLES")
+            output_lines.append("-" * 21)
+            for item in data["expected_deliverables"]:
+                output_lines.append(format_smart_list_item("•", item))
+            output_lines.append("")
+
+        # Join all lines with newlines
+        return "\n".join(output_lines)
+
+    formatter_profile = _prof.get_profile("classification") or _prof.get_profile("classification")
+    _api_key = formatter_profile["api_key"]
+    _provider = formatter_profile["provider"]
+    _model = formatter_profile["model"]
+
+    # 1. Define the Schema for strict JSON enforcement
+    schema = {
+        "type": "OBJECT",
+        "properties": {
+            "project_title": {"type": "STRING"},
+            "project_goal": {"type": "STRING"},
+            "key_objectives": {
+                "type": "ARRAY",
+                "items": {"type": "STRING"}
+            },
+            "data_inputs": {
+                "type": "OBJECT",
+                "properties": {
+                    "description_items": {
+                        "type": "ARRAY",
+                        "items": {"type": "STRING"}
+                    },
+                    "extracted_features": {
+                        "type": "ARRAY",
+                        "items": {"type": "STRING"},
+                        "description": "List of specific column names or features mentioned (e.g. Age, BMI)"
+                    }
+                }
+            },
+            "ml_tasks": {
+                "type": "ARRAY",
+                "items": {"type": "STRING"}
+            },
+            "expected_deliverables": {
+                "type": "ARRAY",
+                "items": {"type": "STRING"}
+            }
+        },
+        "required": ["project_title", "project_goal", "key_objectives", "data_inputs", "ml_tasks"]
+    }
+
+    # 2. Construct the API Request
+    url = f"https://generativelanguage.googleapis.com/v1beta/models/{_model}:generateContent?key={_api_key}"
+
+    headers = {"Content-Type": "application/json"}
+
+    payload = {
+        "contents": [{
+            "parts": [{
+                "text": f"Extract the structured data from the following ML Job Description:\n\n{text}"
+            }]
+        }],
+        "generationConfig": {
+            "responseMimeType": "application/json",
+            "responseSchema": schema
+        }
+    }
+
+    try:
+        response = requests.post(url, headers=headers, json=payload)
+        response.raise_for_status()
+
+        result_json = response.json()
+
+        # 4. Extract and Parse Content
+        raw_text_response = result_json["candidates"][0]["content"]["parts"][0]["text"]
+        parsed_data = json.loads(raw_text_response)
+
+        report = generate_formatted_report(parsed_data)
+        return parsed_data
+
+    except requests.exceptions.RequestException as e:
+        if 'response' in locals() and response is not None:
+            return (f"API Request Failed: {e}\n\nResponse info: {response.text}")
+        return None
+    except (KeyError, IndexError, json.JSONDecodeError) as e:
+        return f"Parsing Failed: {e}"
+
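For orientation, the new `mlearning_agent` in `syntaxmatrix/agentic/agents.py` returns a `(code, usage)` pair and dispatches on the profile's `provider`/`model` keys. A minimal calling sketch, assuming an OpenAI-backed profile; the model name and client wiring here are illustrative only (in the package the client is normally attached via `profiles.get_client`), not part of the diff above:

```python
# Illustrative sketch — assumes an OpenAI client; not taken from the package diff.
from openai import OpenAI
from syntaxmatrix.agentic.agents import mlearning_agent

coding_profile = {
    "provider": "openai",
    "model": "gpt-4o-mini",   # example model; anything outside GPT_MODELS_LATEST falls through to Chat Completions
    "client": OpenAI(),       # in the real flow this is set by profiles.get_client(...)
}

code, usage = mlearning_agent(
    user_prompt="Train a classifier on the uploaded dataset.",
    system_prompt="Return only runnable Python.",
    coding_profile=coding_profile,
)

# usage follows the best-effort schema documented in the function's docstring
print(usage["input_tokens"], usage["output_tokens"], usage["total_tokens"])
print(code)
```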