syntaxmatrix-2.5.6.1-py3-none-any.whl → syntaxmatrix-2.5.8-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- syntaxmatrix/agentic/agents.py +80 -116
- syntaxmatrix/core.py +3 -3
- syntaxmatrix/routes.py +192 -6
- syntaxmatrix/settings/model_map.py +4 -3
- syntaxmatrix/templates/dashboard.html +206 -87
- syntaxmatrix/templates/dataset_resize.html +535 -0
- syntaxmatrix/utils.py +9 -0
- {syntaxmatrix-2.5.6.1.dist-info → syntaxmatrix-2.5.8.dist-info}/METADATA +1 -1
- {syntaxmatrix-2.5.6.1.dist-info → syntaxmatrix-2.5.8.dist-info}/RECORD +12 -11
- {syntaxmatrix-2.5.6.1.dist-info → syntaxmatrix-2.5.8.dist-info}/WHEEL +0 -0
- {syntaxmatrix-2.5.6.1.dist-info → syntaxmatrix-2.5.8.dist-info}/licenses/LICENSE.txt +0 -0
- {syntaxmatrix-2.5.6.1.dist-info → syntaxmatrix-2.5.8.dist-info}/top_level.txt +0 -0
syntaxmatrix/agentic/agents.py
CHANGED
@@ -42,7 +42,7 @@ def token_calculator(total_input_content, llm_profile):
     input_prompt_tokens = len(enc.encode(total_input_content))
     return input_prompt_tokens
 
-def mlearning_agent(user_prompt, system_prompt, coding_profile
+def mlearning_agent(user_prompt, system_prompt, coding_profile):
     """
     Returns:
         (text, usage_dict)
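The context lines above show token_calculator estimating prompt size with enc.encode(). A minimal standalone sketch of that counting pattern, assuming a tiktoken encoder (the encoding name is an assumption; the package builds enc elsewhere from the LLM profile):

```python
# Hedged sketch of prompt-token counting with tiktoken.
# "cl100k_base" is an assumed encoding; token_calculator's real `enc` comes from llm_profile.
import tiktoken

def count_prompt_tokens(total_input_content: str) -> int:
    enc = tiktoken.get_encoding("cl100k_base")
    return len(enc.encode(total_input_content))

print(count_prompt_tokens("Summarise the uploaded dataset and suggest a model."))
```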
@@ -95,72 +95,41 @@ def mlearning_agent(user_prompt, system_prompt, coding_profile, temperature=0.1,
     # Google
     def google_generate_code():
         nonlocal usage
-
-
-
-
-
-
-
-
-
-
-
-        )
+        config = types.GenerateContentConfig(
+            system_instruction=system_prompt,
+            # Optional: Force the model to generate a Python code block as JSON
+            response_mime_type="application/json",
+            response_schema=types.Schema(
+                type=types.Type.OBJECT,
+                properties={
+                    "code": types.Schema(type=types.Type.STRING, description="The runnable Python code."),
+                    "explanation": types.Schema(type=types.Type.STRING, description="A brief explanation of the code."),
+                },
+                required=["code"]
+            ),
+        )
 
-
-
+        try:
+            response = _client.models.generate_content(
                 model=_model,
-                contents=
+                contents=user_prompt,
                 config=config,
             )
+        except Exception as e:
+            return f"An error occurred during API call: {e}"
 
-
-
-
-
-
-
-        # 4. Response Extraction (same robust logic as before)
-        text = getattr(resp, "text", None)
-        if isinstance(text, str) and text.strip():
-            return text.strip()
-
-        chunks = []
-        candidates = getattr(resp, "candidates", None) or []
-        for cand in candidates:
-            content = getattr(cand, "content", None)
-            if content:
-                parts = getattr(content, "parts", None) or []
-                for part in parts:
-                    t = getattr(part, "text", None)
-                    if t:
-                        chunks.append(str(t))
-
-        text = "\n".join(chunks).strip()
-        if text:
-            return text
-
-        # 5. Handle blocked response
-        fb = getattr(resp, "prompt_feedback", None)
-        block_reason = getattr(fb, "block_reason", None) if fb else None
-        if block_reason and block_reason != types.BlockedReason.REASON_UNSPECIFIED:
-            raise RuntimeError(f"{_model} blocked the response. Reason: {block_reason.name}")
-        raise RuntimeError(f"{_model} failed to return content due to insufficient data.")
+        # 3. Token Usage Capture and Context Overhead Calculation
+        um = response.usage_metadata
+        usage["input_tokens"] = um.prompt_token_count
+        usage["output_tokens"] = um.candidates_token_count + um.thoughts_token_count
+        usage["total_tokens"] = um.total_token_count
 
-
-
-
+        try:
+            # The response text will be a JSON string due to the config.
+            response_json = json.loads(response.text)
+            return response_json.get("code", "Error: Code field not found in response.")
         except Exception as e:
-
-
-            # --- Return the error message wrapped in the required output code structure ---
-            msg = f"I smxAI have instructed {error_msg}\n"
-            return (
-                f"# {msg}\n"
-                "from syntaxmatrix.display import show\n"
-                f"show({msg!r})\n"
-            )
+            return f"Error parsing response as JSON: {e}\nRaw Response: {response.text}"
 
     # OpenAI Responses API
     def gpt_models_latest_generate_code():
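The rewritten google_generate_code path forces a JSON reply with a required "code" field and reads token counts from usage_metadata. A self-contained sketch of the same pattern with the google-genai SDK follows; the client construction, model id, and prompts are assumptions for illustration, not taken from the package:

```python
# Hedged sketch: structured JSON code generation with the google-genai SDK.
# Client setup, model id, and prompts are illustrative assumptions.
import json
from google import genai
from google.genai import types

client = genai.Client()  # assumes GEMINI_API_KEY is set in the environment

config = types.GenerateContentConfig(
    system_instruction="Return runnable Python code for the user's request.",
    response_mime_type="application/json",
    response_schema=types.Schema(
        type=types.Type.OBJECT,
        properties={
            "code": types.Schema(type=types.Type.STRING),
            "explanation": types.Schema(type=types.Type.STRING),
        },
        required=["code"],
    ),
)

response = client.models.generate_content(
    model="gemini-2.5-flash",  # assumed model id
    contents="Plot a histogram of the 'age' column of df.",
    config=config,
)

um = response.usage_metadata
print(um.prompt_token_count, um.total_token_count)
print(json.loads(response.text)["code"])
```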
@@ -170,7 +139,7 @@ def mlearning_agent(user_prompt, system_prompt, coding_profile, temperature=0.1,
         reasoning_effort, verbosity = "medium", "medium"
         if _model == "gpt-5-nano":
             reasoning_effort, verbosity = "low", "low"
-        elif _model in ["gpt-5-mini", "gpt-5-codex
+        elif _model in ["gpt-5-mini", "gpt-5-mini-codex"]:
             reasoning_effort, verbosity = "medium", "medium"
         elif _model in ["gpt-5", "gpt-5-codex", "gpt-5-pro"]:
             reasoning_effort, verbosity = "high", "high"

@@ -194,19 +163,7 @@ def mlearning_agent(user_prompt, system_prompt, coding_profile, temperature=0.1,
 
             code = _out(resp).strip()
             if code:
-                return code
-
-            # Try to surface any block reason (safety / policy / etc.)
-            block_reason = None
-            output = resp.get("output")
-            for item in output:
-                fr = getattr(item, "finish_reason", None)
-                if fr and fr != "stop":
-                    block_reason = fr
-                    break
-            if block_reason:
-                raise RuntimeError(f"{_model} stopped with reason: {block_reason}")
-            raise RuntimeError(f"{_model} returned an empty response in this section due to insufficient data.")
+                return code
 
         except APIError as e:
             # IMPORTANT: return VALID PYTHON so the dashboard can show the error

@@ -225,15 +182,14 @@ def mlearning_agent(user_prompt, system_prompt, coding_profile, temperature=0.1,
                 "from syntaxmatrix.display import show\n"
                 f"show({msg!r})\n"
             )
-
+
     # Anthropic
     def anthropic_generate_code():
         nonlocal usage
         try:
             resp = _client.messages.create(
                 model=_model,
-
-                temperature=temperature,
+                temperature=0,
                 system=system_prompt,
                 messages=[
                     {"role": "user", "content": user_prompt}

@@ -276,40 +232,43 @@ def mlearning_agent(user_prompt, system_prompt, coding_profile, temperature=0.1,
     def openai_sdk_generate_code():
         nonlocal usage
         try:
-
+            response = None
+            if _model == "deepseek-reasoner":
+                response = _client.chat.completions.create(
+                    model=_model,
+                    messages=[
+                        {"role": "system", "content": system_prompt},
+                        {"role": "user", "content": user_prompt},
+                    ],
+                    extra_body={"thinking": {"type": "enabled"}},
+                    temperature=0,
+                    stream=False
+                )
+            else:
+                response = _client.chat.completions.create(
                     model=_model,
                     messages=[
                         {"role": "system", "content": system_prompt},
                         {"role": "user", "content": user_prompt},
                     ],
-                temperature=
-
+                    temperature=0,
+                    stream=False
                 )
+            content = response.choices[0].message.content
 
-
-
-            um = resp.usage
+            um = response.usage
             usage["input_tokens"] = um.prompt_tokens
             usage["output_tokens"] = um.completion_tokens
             usage["total_tokens"] = um.total_tokens
 
-
-
-
-
-
-
-
-
-            first = choices[0]
-            fr = getattr(first, "finish_reason", None)
-            if fr and fr != "stop":
-                block_reason = fr
-
-            if block_reason:
-                raise RuntimeError(f"{_model} stopped with reason: {block_reason}")
-            # Fallback: nothing useful came back
-            raise RuntimeError(f"{_model} returned nothing in this section due to insufficient data.")
+            code_match = re.search(r"```(?:python)?\n(.*?)```", content, re.DOTALL)
+
+            if code_match:
+                return code_match.group(1).strip()
+            else:
+                # If no markdown blocks are found, return the raw content
+                # (assuming the model obeyed instructions to output only code)
+                return content.strip()
 
         except Exception as e:
             # IMPORTANT: return VALID PYTHON so the dashboard can show the error
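The new openai_sdk_generate_code branch returns whatever sits inside a fenced Python block in the chat reply, or the raw text if the model answered with bare code. A small standalone sketch of that extraction step (the sample reply is invented):

```python
# Hedged sketch of pulling a fenced Python block out of an LLM chat reply.
# Mirrors the regex shown in the diff; the sample reply is for illustration only.
import re

def extract_code(content: str) -> str:
    match = re.search(r"```(?:python)?\n(.*?)```", content, re.DOTALL)
    if match:
        return match.group(1).strip()
    # No fenced block found: assume the reply is already bare code.
    return content.strip()

reply = "Here is the script:\n```python\nprint('hello')\n```\nDone."
print(extract_code(reply))  # -> print('hello')
```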
@@ -318,9 +277,7 @@ def mlearning_agent(user_prompt, system_prompt, coding_profile, temperature=0.1,
                 f"# {msg}\n"
                 "from syntaxmatrix.display import show\n"
                 f"show({msg!r})\n"
-            )
-
-    # print("TTOOKKEENN: ", token_calculator(system_prompt + user_prompt, coding_profile))
+            )
 
     if _provider == "google":
         code = google_generate_code()

@@ -427,9 +384,11 @@ def refine_question_agent(raw_question: str, dataset_context: str | None = None)
 
     system_prompt = ("""
     - You are a Machine Learning (ML) and Data Science (DS) expert.
-    -
-    -
-    -
+    - Your goal is to use the provided dataset summary to convert given question into clear ML job specifications.
+    - Use the provided dataset summary to respect columns and aid you in properly refining the user question.
+    - Include chronological outline in order to guide a code generator to avoid falling off tracks.
+    - DO NOT include any prelude or preamble. Just the refined tasks.
+    - If and only if the dataset summary columns are not relevant to your desired columns that you deduced by analysing the question, and you suspect that the wrong dataset was used in the dataset summary, stop and just say: 'incompatible'.
     """)
 
     user_prompt = f"User question:\n{raw_question}\n\n"

@@ -447,7 +406,7 @@ def refine_question_agent(raw_question: str, dataset_context: str | None = None)
 
 
 def classify_ml_job_agent(refined_question, dataset_profile):
-
+    import ast
     def ml_response(user_prompt, system_prompt, profile):
         _profile = profile  # _prof.get_profile["admin"]
 

@@ -562,8 +521,7 @@ def classify_ml_job_agent(refined_question, dataset_profile):
     system_prompt = ("""
     You are a strict machine learning task classifier for an ML workbench.
     Your goal is to correctly label the user's task specifications with the most relevant tags from a fixed list.
-    You Must always have 'data_preprocessing' as the 1st tag. Then add
-    If no relevant tag, default to "data_preprocessing" and return that alone.
+    You Must always have 'data_preprocessing' as the 1st tag. Then add all other relevant tags.
     You should return only your list of tags, no prelude or preamble.
     """)
 

@@ -583,15 +541,14 @@ def classify_ml_job_agent(refined_question, dataset_profile):
         "generative_modeling", "causal_inference", "risk_modeling", "graph_analysis",
 
         # Foundational/Pipeline Steps
-        "feature_engineering", "statistical_inference", "
-        "model_validation", "hyperparameter_tuning"
+        "data_preprocessing", "feature_engineering", "statistical_inference", "clustering", "hyperparameter_tuning"
     ]
 
     # --- 2. Construct the Generalized Prompt for the LLM ---
     task_description = refined_question
 
     user_prompt = f"""
-    Analyze the following task description:
+    Analyze and classify the following task description:
     ---
     {task_description}
     ---

@@ -604,7 +561,7 @@ def classify_ml_job_agent(refined_question, dataset_profile):
     ML Jobs List: {', '.join(ml_task_list)}
 
     Respond ONLY with a valid JSON array of strings containing the selected ML job names.
-    Example Response: ["
+    Example Response: ["data_preprocessing", "regression", "classification", "feature_engineering"]
     """
 
     if dataset_profile:

@@ -612,13 +569,20 @@ def classify_ml_job_agent(refined_question, dataset_profile):
 
     llm_profile = _prof.get_profile("classification") or _prof.get_profile("admin")
     if not llm_profile:
-        return
+        return (
+            "<div class='smx-alert smx-alert-warn'>"
+            "No LLM profile is configured for Classification. Please, do that in the Admin panel or contact your Administrator."
+            "</div>"
+        )
+
 
     llm_profile['client'] = _prof.get_client(llm_profile)
 
-    # Extract raw content
     tasks = ml_response(user_prompt, system_prompt, llm_profile)
-
+    try:
+        return ast.literal_eval(tasks)
+    except Exception:
+        return tasks
 
 
 def text_formatter_agent(text):
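classify_ml_job_agent now asks the classifier for a JSON-style array of tags and parses the reply with ast.literal_eval, falling back to the raw string. A short sketch of that parse-or-passthrough behaviour (sample replies invented):

```python
# Hedged sketch of the tag-parsing step used after the classifier LLM responds.
import ast

def parse_tags(reply: str):
    try:
        return ast.literal_eval(reply)   # e.g. '["data_preprocessing", "regression"]' -> list
    except Exception:
        return reply                     # anything unparseable is passed through unchanged

print(parse_tags('["data_preprocessing", "regression"]'))
print(parse_tags("I could not classify this task."))
```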
syntaxmatrix/core.py
CHANGED
@@ -599,7 +599,7 @@ class SyntaxMUI:
         from syntaxmatrix.gpt_models_latest import extract_output_text as _out, set_args
 
         if not self._classification_profile:
-            classification_profile = _prof.get_profile('classification') or _prof.get_profile('
+            classification_profile = _prof.get_profile('classification') or _prof.get_profile('admin')
             if not classification_profile:
                 return {"Error": "Set a profile for Classification"}
             self._classification_profile = classification_profile

@@ -1317,11 +1317,11 @@ class SyntaxMUI:
         """)
 
         if not self._coding_profile:
-            coding_profile = _prof.get_profile("coding") or _prof.get_profile("admin")
+            coding_profile = _prof.get_profile("coding")  # or _prof.get_profile("admin")
             if not coding_profile:
                 return (
                     '<div class="smx-alert smx-alert-warn">'
-                    'No LLM profile configured for <code>coding</code>
+                    'No LLM profile configured for <code>coding</code> <br>'
                     'Please, add the LLM profile inside the admin panel or contact your Administrator.'
                     '</div>'
                 )
syntaxmatrix/routes.py
CHANGED
@@ -65,6 +65,7 @@ _CLIENT_DIR = detect_project_root()
 _stream_q = queue.Queue()
 _stream_cancelled = {}
 _last_result_html = {}    # { session_id: html_doc }
+_last_resized_csv = {}    # { resize_id: bytes for last resized CSV per browser session }
 
 # single, reused formatter: inline styles, padding, rounded corners, scroll
 _FMT = _HtmlFmt(

@@ -3047,7 +3048,7 @@ def setup_routes(smx):
                 }) + "\n\n"
 
         except GeneratorExit:
-
+            return "Client aborted the stream."
         except Exception as e:
             smx.error(f"Stream error: {e}")
             yield "data: " + json.dumps({"event": "error", "error": str(e)}) + "\n\n"

@@ -5610,8 +5611,19 @@ def setup_routes(smx):
         dataset_profile = f"modality: tabular; columns: {columns_summary}"
 
         refined_question = refine_question_agent(askai_question, dataset_context)
-        tags =
-
+        tags = []
+        if refined_question.lower() == "incompatible" or refined_question.lower() == "mismatch":
+            return ("""
+                <div style="position: fixed; top: 50%; left: 50%; transform: translate(-50%, -50%); text-align: center;">
+                    <h1 style="margin: 0 0 10px 0;">Oops: Context mismatch</h1>
+                    <p style="margin: 0;">Please, upload the proper dataset for solution to your query.</p>
+                    <br>
+                    <a class='button' href='/dashboard' style='text-decoration:none;'>Return</a>
+                </div>
+            """)
+        else:
+            tags = classify_ml_job_agent(refined_question, dataset_profile)
+
         ai_code = smx.ai_generate_code(refined_question, tags, df)
         llm_usage = smx.get_last_llm_usage()
         ai_code = auto_inject_template(ai_code, tags, df)
@@ -6513,8 +6525,8 @@
             cell["highlighted_code"] = Markup(_pygmentize(cell["code"]))
 
         highlighted_ai_code = _pygmentize(ai_code)
-
-
+        smxAI = "Orion"
+
         return render_template(
             "dashboard.html",
             section=section,

@@ -6525,7 +6537,8 @@
             highlighted_ai_code=highlighted_ai_code if ai_code else None,
             askai_question=smx.sanitize_rough_to_markdown_task(askai_question),
             refined_question=refined_question,
-            tasks=
+            tasks=tags,
+            smxAI=smxAI,
             data_cells=data_cells,
             session_id=session_id,
             llm_usage=llm_usage

@@ -6589,6 +6602,179 @@
         # go back to the dashboard; dashboard() will auto-select the next file
         return redirect(url_for("dashboard"))
 
+    # ── DATASET RESIZE (independent helper page) -------------------------
+
+
+    @smx.app.route("/dataset/resize", methods=["GET", "POST"])
+    def dataset_resize():
+        """
+        User uploads any CSV and picks a target size (percentage of rows).
+        We keep the last resized CSV in memory and expose a download link.
+        """
+        # One id per browser session to index _last_resized_csv
+        resize_id = session.get("dataset_resize_id")
+        if not resize_id:
+            resize_id = str(uuid.uuid4())
+            session["dataset_resize_id"] = resize_id
+
+        resize_info = None  # stats we pass down to the template
+
+        if request.method == "POST":
+            file = request.files.get("dataset_file")
+            target_pct_raw = (request.form.get("target_pct") or "").strip()
+            strat_col = (request.form.get("strat_col") or "").strip()
+
+            error_msg = None
+            df = None
+
+            # --- Basic validation ---
+            if not file or file.filename == "":
+                error_msg = "Please choose a CSV file."
+            elif not file.filename.lower().endswith(".csv"):
+                error_msg = "Only CSV files are supported."
+
+            # --- Read CSV into a DataFrame ---
+            if not error_msg:
+                try:
+                    df = pd.read_csv(file)
+                except Exception as e:
+                    error_msg = f"Could not read CSV: {e}"
+
+            # --- Parse target percentage ---
+            pct = None
+            if not error_msg:
+                try:
+                    pct = float(target_pct_raw)
+                except Exception:
+                    error_msg = "Target size must be a number between 1 and 100."
+
+            if not error_msg and (pct <= 0 or pct > 100):
+                error_msg = "Target size must be between 1 and 100."
+
+            if error_msg:
+                flash(error_msg, "error")
+            else:
+                frac = pct / 100.0
+                n_orig = len(df)
+                n_target = max(1, int(round(n_orig * frac)))
+
+                df_resized = None
+                used_strat = False
+
+                # --- Advanced: stratified sampling by a column (behind 'Show advanced options') ---
+                if strat_col and strat_col in df.columns and n_orig > 0:
+                    used_strat = True
+                    groups = df.groupby(strat_col, sort=False)
+
+                    # First pass: proportional allocation with rounding and minimum 1 per non-empty group
+                    allocations = {}
+                    total_alloc = 0
+                    for key, group in groups:
+                        size = len(group)
+                        if size <= 0:
+                            allocations[key] = 0
+                            continue
+                        alloc = int(round(size * frac))
+                        if alloc == 0 and size > 0:
+                            alloc = 1
+                        if alloc > size:
+                            alloc = size
+                        allocations[key] = alloc
+                        total_alloc += alloc
+
+                    keys = list(allocations.keys())
+
+                    # Adjust downwards if we overshot
+                    if total_alloc > n_target:
+                        idx = 0
+                        while total_alloc > n_target and any(v > 1 for v in allocations.values()):
+                            k = keys[idx % len(keys)]
+                            if allocations[k] > 1:
+                                allocations[k] -= 1
+                                total_alloc -= 1
+                            idx += 1
+
+                    # Adjust upwards if we undershot and we still have room in groups
+                    if total_alloc < n_target and keys:
+                        idx = 0
+                        while total_alloc < n_target:
+                            k = keys[idx % len(keys)]
+                            group_size = len(groups.get_group(k))
+                            if allocations[k] < group_size:
+                                allocations[k] += 1
+                                total_alloc += 1
+                            idx += 1
+                            if idx > len(keys) * 3:
+                                break
+
+                    sampled_parts = []
+                    for key, group in groups:
+                        n_g = allocations.get(key, 0)
+                        if n_g > 0:
+                            sampled_parts.append(group.sample(n=n_g, random_state=0))
+
+                    if sampled_parts:
+                        df_resized = (
+                            pd.concat(sampled_parts, axis=0)
+                            .sample(frac=1.0, random_state=0)
+                            .reset_index(drop=True)
+                        )
+
+                # --- Default: simple random sample over all rows ---
+                if df_resized is None:
+                    if n_target >= n_orig:
+                        df_resized = df.copy()
+                    else:
+                        df_resized = df.sample(n=n_target, random_state=0).reset_index(drop=True)
+                    if strat_col and strat_col not in df.columns:
+                        flash(
+                            f"Column '{strat_col}' not found. Used simple random sampling instead.",
+                            "warning",
+                        )
+
+                # --- Serialise to CSV in memory and stash in _last_resized_csv ---
+                buf = _std_io.BytesIO()
+                df_resized.to_csv(buf, index=False)
+                buf.seek(0)
+                _last_resized_csv[resize_id] = buf.getvalue()
+
+                resize_info = {
+                    "rows_in": n_orig,
+                    "rows_out": len(df_resized),
+                    "pct": pct,
+                    "used_strat": used_strat,
+                    "strat_col": strat_col if used_strat else "",
+                }
+                flash("Dataset resized successfully. Use the download link below.", "success")
+
+        return render_template("dataset_resize.html", resize_info=resize_info)
+
+    @smx.app.route("/dataset/resize/download", methods=["GET"])
+    def download_resized_dataset():
+        """Download the last resized dataset for this browser session as a CSV."""
+        resize_id = session.get("dataset_resize_id")
+        if not resize_id:
+            return ("No resized dataset available.", 404)
+
+        data = _last_resized_csv.get(resize_id)
+        if not data:
+            return ("No resized dataset available.", 404)
+
+        buf = _std_io.BytesIO(data)
+        buf.seek(0)
+        stamp = datetime.now().strftime("%Y%m%d-%H%M%S-%f")
+        filename = f"resized_dataset_{stamp}.csv"
+
+        # Drop it from memory once downloaded
+        _last_resized_csv.pop(resize_id, None)
+
+        return send_file(
+            buf,
+            mimetype="text/csv; charset=utf-8",
+            as_attachment=True,
+            download_name=filename,
+        )
+
 
 def _pdf_fallback_reportlab(full_html: str):
     """ReportLab fallback: extract text + base64 <img> and lay them out."""
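The new /dataset/resize route shrinks an uploaded CSV to a target percentage and can keep per-group proportions when a stratification column is given. A compact pandas sketch of the same proportional idea (invented data, and without the route's overshoot and undershoot adjustments):

```python
# Hedged sketch of proportional (stratified) downsampling with pandas.
# The DataFrame and column names are invented for illustration.
import pandas as pd

df = pd.DataFrame({
    "label": ["a"] * 60 + ["b"] * 30 + ["c"] * 10,
    "value": range(100),
})

frac = 0.2  # keep roughly 20% of rows, at least one row per group

parts = [
    grp.sample(n=max(1, int(round(len(grp) * frac))), random_state=0)
    for _, grp in df.groupby("label", sort=False)
]
resized = pd.concat(parts).sample(frac=1.0, random_state=0).reset_index(drop=True)

print(len(resized))                               # ~20 rows
print(resized["label"].value_counts().to_dict())  # proportions roughly preserved
```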
syntaxmatrix/settings/model_map.py
CHANGED
@@ -36,12 +36,14 @@ PROVIDERS_MODELS = {
 
     ],
     #4
-    "deepseek": [
+    "deepseek": [
+        "deepseek-reasoner",
         "deepseek-chat",
     ],
     #5
     "moonshot": [
-        "kimi-k2-
+        "kimi-k2-thinking",
+        "kimi-k2-instruct",
     ],
     #6
     "alibaba": [

@@ -57,7 +59,6 @@ PROVIDERS_MODELS = {
         "claude-sonnet-4-5",
         "claude-sonnet-4-0",
         "claude-3-5-haiku-latest",
-        "claude-3-haiku-20240307",
     ]
 }
 