syntaxmatrix 2.5.4__py3-none-any.whl → 2.5.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,853 @@
1
+ # syntaxmatrix/agents.py
2
+ from __future__ import annotations
3
+ import os, re, json, textwrap, requests
4
+ import pandas as pd
5
+
6
+ from typing import Optional, List
7
+
8
+ from syntaxmatrix import utils
9
+ from syntaxmatrix.settings.model_map import GPT_MODELS_LATEST
10
+ from .. import profiles as _prof
11
+ from ..gpt_models_latest import set_args as _set_args, extract_output_text as _out
12
+ from google.genai import types
13
+ import tiktoken
14
+
15
+
16
+ def token_calculator(total_input_content, llm_profile):
17
+
18
+ _client = llm_profile["client"]
19
+ _model = llm_profile["model"]
20
+ _provider = llm_profile["provider"].lower()
21
+
22
+ if _provider == "google":
23
+ tok = _client.models.count_tokens(
24
+ model=_model,
25
+ contents=total_input_content
26
+ )
27
+ input_prompt_tokens = tok.total_tokens
28
+ return input_prompt_tokens
29
+
30
+ elif _provider == "anthropic":
31
+ tok = _client.beta.messages.count_tokens(
32
+ model=_model,
33
+ system="calculate the total token for the given prompt",
34
+ messages=[{"role": "user", "content": total_input_content}]
35
+ )
36
+ input_prompt_tokens = tok.input_tokens
37
+ return input_prompt_tokens
38
+
39
+ else:
40
+ enc = tiktoken.encoding_for_model(_model)
41
+ input_prompt_tokens = len(enc.encode(total_input_content))
42
+ return input_prompt_tokens
43
+
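A minimal call sketch for token_calculator, assuming this module is importable as syntaxmatrix.agents and the model name is one tiktoken recognises; the "client" entry is unused on the tiktoken path, so None is enough here:

    from syntaxmatrix.agents import token_calculator

    profile = {"provider": "openai", "model": "gpt-4o-mini", "client": None}
    n_in = token_calculator("Summarise this dataset in one line.", profile)
    print(n_in)  # integer prompt-token count computed with tiktoken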
44
+
45
+ def mlearning_agent(user_prompt, system_prompt, coding_profile, temperature=0.1, max_tokens=4096):
46
+ """
47
+ Returns:
48
+ (text, usage_dict)
49
+
50
+ usage_dict schema (best-effort, depending on provider):
51
+ {
52
+ "provider": str,
53
+ "model": str,
54
+ "input_tokens": int|None,
55
+ "output_tokens": int|None,
56
+ "total_tokens": int|None,
57
+ "error": str|None
58
+ }
59
+ """
60
+
61
+ # coding_profile['client'] = _prof.get_client(coding_profile)
62
+ _client = coding_profile["client"]
63
+ _provider = coding_profile["provider"].lower()
64
+ _model = coding_profile["model"]
65
+
66
+ usage = {
67
+ "provider": _provider,
68
+ "model": _model,
69
+ "input_tokens": None,
70
+ "output_tokens": None,
71
+ "total_tokens": None,
72
+ }
73
+
74
+ def _clean_text(t):
75
+ if t is None:
76
+ return ""
77
+ if not isinstance(t, str):
78
+ t = str(t)
79
+ return t.strip()
80
+
81
+ def _get_usage_val(u, keys):
82
+ """Read usage fields from dicts or objects, resiliently."""
83
+ if u is None:
84
+ return None
85
+ for k in keys:
86
+ try:
87
+ if isinstance(u, dict) and k in u:
88
+ return u[k]
89
+ if hasattr(u, k):
90
+ return getattr(u, k)
91
+ except Exception:
92
+ continue
93
+ return None
94
+
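The helper above reads a usage field from either a dict or an attribute-style SDK object; a standalone restatement of the same lookup pattern, with hypothetical inputs, shows the behaviour:

    class _Usage:                       # attribute-style usage record
        input_tokens = 12

    def _read(u, keys):                 # same pattern as _get_usage_val
        for k in keys:
            if isinstance(u, dict) and k in u:
                return u[k]
            if hasattr(u, k):
                return getattr(u, k)
        return None

    print(_read(_Usage(), ["prompt_tokens", "input_tokens"]))  # 12
    print(_read({"total_tokens": 30}, ["total_tokens"]))       # 30
    print(_read(None, ["anything"]))                           # None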
95
+ # Google
96
+ def google_generate_code():
97
+ try:
98
+ config = types.GenerateContentConfig(
99
+ system_instruction=system_prompt,
100
+ temperature=temperature,
101
+ max_output_tokens=max_tokens,
102
+ response_mime_type="text/plain",
103
+ )
104
+
105
+ resp = _client.models.generate_content(
106
+ model=_model,
107
+ contents=user_prompt,
108
+ config=config
109
+ )
110
+ # --- capture usage for Gemini / python-genai ---
111
+ um = getattr(resp, "usage_metadata", None) or getattr(resp, "usageMetadata", None)
112
+ if um is not None:
113
+ # Try all known names, then fall back to arithmetic
114
+ prompt = _get_usage_val(um, [
115
+ "prompt_token_count",
116
+ "promptTokenCount",
117
+ "input_tokens",
118
+ ])
119
+ candidates = _get_usage_val(um, [
120
+ "candidates_token_count",
121
+ "candidatesTokenCount",
122
+ "output_tokens",
123
+ ])
124
+ total = _get_usage_val(um, [
125
+ "total_token_count",
126
+ "totalTokenCount",
127
+ "total_tokens",
128
+ ])
129
+
130
+ usage["input_tokens"] = prompt
131
+ usage["output_tokens"] = candidates
132
+ usage["total_tokens"] = total
133
+
134
+ # Fallback: if output not provided but prompt + total exist,
135
+ # derive it as total - prompt
136
+ if usage["output_tokens"] is None and prompt is not None and total is not None:
137
+ try:
138
+ usage["output_tokens"] = int(total) - int(prompt)
139
+ except Exception:
140
+ # if it somehow fails, just leave it as None
141
+ pass
142
+
143
+ # 1) Fast path: SDK convenience property
144
+ text = getattr(resp, "text", None)
145
+ if isinstance(text, str) and text.strip():
146
+ return text.strip()
147
+ chunks = []
148
+ candidates = getattr(resp, "candidates", None) or []
149
+ for cand in candidates:
150
+ content = getattr(cand, "content", None)
151
+ parts = getattr(content, "parts", None) or []
152
+ for part in parts:
153
+ t = getattr(part, "text", None)
154
+ if t:
155
+ chunks.append(str(t))
156
+ text = "\n".join(chunks).strip()
157
+ if text:
158
+ return text
159
+
160
+ # Try to surface any block reason (safety / policy / etc.)
161
+ fb = getattr(resp, "prompt_feedback", None)
162
+ block_reason = getattr(fb, "block_reason", None) if fb else None
163
+ if block_reason:
164
+ raise RuntimeError(f"{_model} to blocked the response. Reason: {block_reason}")
165
+ raise RuntimeError(f"{_model} to say nothing in this section due to insufficient data.")
166
+
167
+ except Exception as e:
168
+ msg = f"I smxAI have instructed {e}\n"
169
+ return (
170
+ f"# {msg}\n"
171
+ "from syntaxmatrix.display import show\n"
172
+ f"show({msg!r})\n"
173
+ )
174
+
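The Gemini branch derives a missing output count by plain subtraction; a tiny sketch of that rule with illustrative numbers:

    prompt_tokens, total_tokens, output_tokens = 120, 180, None
    if output_tokens is None and prompt_tokens is not None and total_tokens is not None:
        output_tokens = int(total_tokens) - int(prompt_tokens)  # 60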
175
+ # OpenAI Responses API
176
+ def gpt_models_latest_generate_code():
177
+ nonlocal usage
178
+
179
+ def reasoning_and_verbosity():
180
+ reasoning_effort, verbosity = "medium", "medium"
181
+ if _model == "gpt-5-nano":
182
+ reasoning_effort, verbosity = "low", "low"
183
+ elif _model in ["gpt-5-mini", "gpt-5-codex-mini"]:
184
+ reasoning_effort, verbosity = "medium", "medium"
185
+ elif _model in ["gpt-5", "gpt-5-codex", "gpt-5-pro"]:
186
+ reasoning_effort, verbosity = "high", "high"
187
+ return (reasoning_effort, verbosity)
188
+ try:
189
+ args = _set_args(
190
+ model=_model,
191
+ instructions=system_prompt,
192
+ input=user_prompt,
193
+ previous_id=None,
194
+ store=False,
195
+ reasoning_effort=reasoning_and_verbosity()[0],
196
+ verbosity=reasoning_and_verbosity()[1],
197
+ )
198
+ resp = _client.responses.create(**args)
199
+
200
+ # --- Capture token usage (prompt, completion, and total tokens) ---
201
+ u = getattr(resp, "usage", None)
202
+ # If usage is not None, extract tokens directly from the response
203
+ if u is not None:
204
+ usage["input_tokens"] = getattr(u, "prompt_tokens", None) or getattr(u, "promptTokenCount", None)
205
+ usage["output_tokens"] = getattr(u, "completion_tokens", None) or getattr(u, "completionTokenCount", None)
206
+ usage["total_tokens"] = getattr(u, "total_tokens", None) or getattr(u, "totalTokenCount", None)
207
+
208
+ # --- If missing input/output tokens, fallback logic ---
209
+ if usage["input_tokens"] is None or usage["output_tokens"] is None:
210
+ # Use the raw response and fallback to manually calculate tokens
211
+ prompt_text = user_prompt
212
+ output_text = _out(resp).strip()
213
+
214
+ # Calculate input tokens based on the prompt
215
+ encoding = tiktoken.get_encoding("cl100k_base")  # approximate count; not necessarily the served model's tokenizer
216
+ usage["input_tokens"] = len(encoding.encode(prompt_text))
217
+
218
+ # Calculate output tokens based on the model's response
219
+ usage["output_tokens"] = len(encoding.encode(output_text))
220
+
221
+ # Total tokens is the sum of input + output
222
+ usage["total_tokens"] = usage["input_tokens"] + usage["output_tokens"]
223
+
224
+ # If tokens are still missing, surface the issue
225
+ if usage["input_tokens"] is None or usage["output_tokens"] is None:
226
+ raise RuntimeError(f"Missing token data in OpenAI response for model {_model}. Tokens: {usage}")
227
+
228
+ code = _out(resp).strip()
229
+ if code:
230
+ return code
231
+
232
+ # Try to surface any block reason (safety / policy / etc.)
233
+ block_reason = None
234
+ output = getattr(resp, "output", None) or []
235
+ for item in output:
236
+ fr = getattr(item, "finish_reason", None)
237
+ if fr and fr != "stop":
238
+ block_reason = fr
239
+ break
240
+
241
+ if block_reason:
242
+ raise RuntimeError(f"{_model} stopped with reason: {block_reason}")
243
+ raise RuntimeError(f"{_model} returned an empty response in this section due to insufficient data.")
244
+
245
+ except Exception as e:
246
+ # IMPORTANT: return VALID PYTHON so the dashboard can show the error
247
+ msg = f"I smxAI have instructed {e}"
248
+ return (
249
+ f"# {msg}\n"
250
+ "from syntaxmatrix.display import show\n"
251
+ f"show({msg!r})\n"
252
+ )
253
+
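When the Responses API omits usage, the branch above falls back to counting tokens locally; a standalone sketch of that path (cl100k_base is an approximation, not necessarily the served model's tokenizer):

    import tiktoken

    enc = tiktoken.get_encoding("cl100k_base")
    prompt_text = "Train a churn model on df."
    output_text = "import pandas as pd\n..."
    usage = {
        "input_tokens": len(enc.encode(prompt_text)),
        "output_tokens": len(enc.encode(output_text)),
    }
    usage["total_tokens"] = usage["input_tokens"] + usage["output_tokens"]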
254
+ # Anthropic
255
+ def anthropic_generate_code():
256
+ try:
257
+ resp = _client.messages.create(
258
+ model=_model,
259
+ max_tokens=max_tokens,
260
+ temperature=temperature,
261
+ system=system_prompt,
262
+ messages=[
263
+ {"role": "user", "content": user_prompt}
264
+ ]
265
+ )
266
+ # usage on Anthropic responses is an object with input_tokens / output_tokens
267
+ um = getattr(resp, "usage", None)
268
+ if um is not None:
269
+ # Try all known names, then fall back to arithmetic
270
+ prompt = _get_usage_val(um, [
271
+ "prompt_token_count",
272
+ "promptTokenCount",
273
+ "input_tokens",
274
+ ])
275
+ candidates = _get_usage_val(um, [
276
+ "candidates_token_count",
277
+ "candidatesTokenCount",
278
+ "output_tokens",
279
+ ])
280
+ total = _get_usage_val(um, [
281
+ "total_token_count",
282
+ "totalTokenCount",
283
+ "total_tokens",
284
+ ])
285
+
286
+ usage["input_tokens"] = prompt
287
+ usage["output_tokens"] = candidates
288
+ usage["total_tokens"] = total
289
+
290
+ if usage["output_tokens"] is None and prompt is not None and total is not None:
291
+ try:
292
+ usage["output_tokens"] = int(total) - int(prompt)
293
+ except Exception:
294
+ # if it somehow fails, just leave it as None
295
+ pass
296
+
297
+ # Extract plain text from Claude-style content blocks
298
+ text_blocks = []
299
+ content = getattr(resp, "content", None) or []
300
+ for block in content:
301
+ t = getattr(block, "text", None)
302
+ if not t and isinstance(block, dict):
303
+ t = (block.get("text") or "").strip()
304
+ if t:
305
+ text_blocks.append(str(t))
306
+
307
+ text = "\n".join(text_blocks).strip()
308
+ if text:
309
+ return text
310
+
311
+ stop_reason = getattr(resp, "stop_reason", None)
312
+ if stop_reason and stop_reason != "end_turn":
313
+ raise RuntimeError(f"{_model} stopped with reason: {stop_reason}")
314
+ raise RuntimeError(f"{_model} returned an empty response in this section due to insufficient data.")
315
+
316
+ except Exception as e:
317
+ msg = f"I smxAI have instructed {e}\n"
318
+ return (
319
+ f"# {msg}\n"
320
+ "from syntaxmatrix.display import show\n"
321
+ f"show({msg!r})\n"
322
+ )
323
+
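The Anthropic branch accepts either attribute-style or dict-style content blocks; a small sketch with hypothetical dict blocks shows what the extraction loop produces:

    content = [
        {"type": "text", "text": "import pandas as pd"},
        {"type": "text", "text": "df = pd.read_csv('data.csv')"},
    ]
    text_blocks = [b.get("text") for b in content if isinstance(b, dict) and b.get("text")]
    code = "\n".join(text_blocks).strip()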
324
+ # OpenAI Chat Completions
325
+ def openai_sdk_generate_code():
326
+ try:
327
+ resp = _client.chat.completions.create(
328
+ model=_model,
329
+ messages=[
330
+ {"role": "system", "content": system_prompt},
331
+ {"role": "user", "content": user_prompt},
332
+ ],
333
+ temperature=temperature,
334
+ max_tokens=max_tokens,
335
+ )
336
+ um = getattr(resp, "usage", None)
337
+ usage["input_tokens"] = _get_usage_val(um, ["prompt_tokens", "input_tokens"])
338
+ usage["output_tokens"] = _get_usage_val(um, ["completion_tokens", "output_tokens"])
339
+ usage["total_tokens"] = _get_usage_val(um, ["total_tokens"])
340
+
341
+ text = resp.choices[0].message.content
342
+ if text:
343
+ return text
344
+
345
+ # Try to surface any block reason (safety / policy / etc.)
346
+ block_reason = None
347
+ choices = getattr(resp, "choices", None) or []
348
+ if choices:
349
+ first = choices[0]
350
+ fr = getattr(first, "finish_reason", None)
351
+ if fr and fr != "stop":
352
+ block_reason = fr
353
+
354
+ if block_reason:
355
+ raise RuntimeError(f"{_model} stopped with reason: {block_reason}")
356
+ # Fallback: nothing useful came back
357
+ raise RuntimeError(f"{_model} returned nothing in this section due to insufficient data.")
358
+
359
+ except Exception as e:
360
+ # IMPORTANT: return VALID PYTHON so the dashboard can show the error
361
+ msg = f"I smxAI have instructed {e}"
362
+ return (
363
+ f"# {msg}\n"
364
+ "from syntaxmatrix.display import show\n"
365
+ f"show({msg!r})\n"
366
+ )
367
+
368
+ if _provider == "google":
369
+ code = google_generate_code()
370
+ elif _provider == "openai" and _model in GPT_MODELS_LATEST:
371
+ code = gpt_models_latest_generate_code()
372
+ elif _provider == "anthropic":
373
+ code = anthropic_generate_code()
374
+ else:
375
+ code = openai_sdk_generate_code()
376
+
377
+ return code, usage
378
+
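A minimal call sketch for mlearning_agent, reusing the profile helpers this module already imports; the prompts and the "admin" profile name are placeholders, and the second return value is the best-effort usage dict described in the docstring:

    profile = _prof.get_profile("admin")
    profile["client"] = _prof.get_client(profile)
    code, usage = mlearning_agent(
        user_prompt="Train a baseline churn classifier on df and report accuracy.",
        system_prompt="Return only runnable Python.",
        coding_profile=profile,
    )
    print(usage["input_tokens"], usage["output_tokens"], usage["total_tokens"])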
379
+
380
+ def refine_question_agent(raw_question: str, dataset_context: str | None = None) -> str:
381
+
382
+ def response_agent(user_prompt, system_prompt, llm_profile, temp=0.0, max_tokens=128):
383
+ _profile = llm_profile
384
+
385
+ _client = _profile["client"]
386
+ _provider = _profile["provider"].lower()
387
+ _model = _profile["model"]
388
+
389
+ # Google GenAI
390
+ if _provider == "google":
391
+ resp = _client.models.generate_content(
392
+ model=_model,
393
+ contents=system_prompt + "\n\n" + user_prompt,
394
+ )
395
+ text = resp.text
396
+ return text.strip()
397
+
398
+ # OpenAI
399
+ elif _provider == "openai" and _model in GPT_MODELS_LATEST:
400
+
401
+ def reasoning_and_verbosity():
402
+ reasoning_effort, verbosity = "medium", "medium"
403
+ if _model == "gpt-5-nano":
404
+ if max_tokens <= 256:
405
+ reasoning_effort = "minimal"
406
+ else: reasoning_effort = "low"
407
+ elif _model in ["gpt-5-mini", "gpt-5-codex-mini"]:
408
+ verbosity = "medium"
409
+ elif _model in ["gpt-5", "gpt-5-codex", "gpt-5-pro"]:
410
+ reasoning_effort = "high"
411
+ verbosity = "high"
412
+ return (reasoning_effort, verbosity)
413
+
414
+ args = _set_args(
415
+ model=_model,
416
+ instructions=system_prompt,
417
+ input=user_prompt,
418
+ previous_id=None,
419
+ store=False,
420
+ reasoning_effort=reasoning_and_verbosity()[0],
421
+ verbosity=reasoning_and_verbosity()[1],
422
+ )
423
+ resp = _client.responses.create(**args)
424
+ txt = _out(resp)
425
+ return txt
426
+
427
+ # Anthropic
428
+ elif _provider == "anthropic":
429
+ try:
430
+ resp = _client.messages.create(
431
+ model=_model,
432
+ system=system_prompt,
433
+ messages=[{"role": "user", "content": user_prompt}],
434
+ temperature=0.2,
435
+ max_tokens= max_tokens,
436
+ )
437
+
438
+ # Extract plain text from Claude's content blocks
439
+ text = ""
440
+ content = getattr(resp, "content", None)
441
+ if content and isinstance(content, list):
442
+ parts = []
443
+ for block in content:
444
+ # blocks typically like {"type": "text", "text": "..."}
445
+ t = getattr(block, "text", None)
446
+ if not t and isinstance(block, dict):
447
+ t = block.get("text")
448
+ if t:
449
+ parts.append(t)
450
+ text = " ".join(parts)
451
+ return text
452
+ except Exception:
453
+ pass
454
+
455
+ # OpenAI SDK compatible (Chat Completions)
456
+ else:
457
+ resp = _client.chat.completions.create(
458
+ model=_model,
459
+ messages=[
460
+ {"role": "system", "content": system_prompt},
461
+ {"role": "user", "content": user_prompt},
462
+ ],
463
+ temperature=temp,
464
+ max_tokens=max_tokens,
465
+ )
466
+ text = resp.choices[0].message.content
467
+ return text
468
+
469
+ return "Configure LLM Profiles or contact your administrator."
470
+
471
+ system_prompt = (
472
+ "You rewrite user questions into specification Machine Learning (ML) job description. "
473
+ "If a dataset summary is provided, use it to respect column and help you redefine the question. "
474
+ "DO NOT write andy prelude or preamble"
475
+ )
476
+
477
+ user_prompt = f"User question:\n{raw_question}\n\n"
478
+ if dataset_context:
479
+ user_prompt += f"Dataset summary:\n{dataset_context}\n"
480
+
481
+ _refiner_profile = _prof.get_profile("classification") or _prof.get_profile("admin")
482
+ if not _refiner_profile:
483
+ return "ERROR"
484
+
485
+ _refiner_profile['client'] = _prof.get_client(_refiner_profile)
486
+
487
+ refined_question = response_agent(user_prompt, system_prompt, _refiner_profile, temp=0.0, max_tokens=128)
488
+ return refined_question
489
+
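A usage sketch; the exact wording of the refined specification depends on the configured profile, so the output noted here is only indicative:

    refined = refine_question_agent(
        "which customers will churn?",
        dataset_context="columns: customer_id, tenure, monthly_charges, churn",
    )
    # e.g. a short ML job specification framing churn as binary classification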
490
+
491
+ def classify_ml_job_agent(refined_question, dataset_profile):
492
+ """
493
+ Instructs the configured LLM (e.g. gemini-2.5-flash) to analyze a task description
494
+ and return a list of associated machine learning job/task types.
495
+ This version uses a highly extensive, generalized list of ML jobs
496
+ to ensure robustness across all domains (NLP, CV, RL, etc.).
497
+
498
+ Args:
499
+ refined_question: The detailed description of the statistical/ML task.
+ dataset_profile: Optional dataset summary used to ground the classification.
500
+
501
+ Returns:
502
+ A list of strings identifying the relevant ML jobs. Returns an empty
503
+ list if the API call fails or the output cannot be parsed.
504
+ """
505
+
506
+ def ml_response(user_prompt, system_prompt, profile):
507
+ _profile = profile # _prof.get_profile["admin"]
508
+
509
+ _client = _profile["client"]
510
+ _provider = _profile["provider"].lower()
511
+ _model = _profile["model"]
512
+
513
+ prompt = user_prompt + "\n\n" + system_prompt
514
+
515
+ # Google GenAI
516
+ if _provider == "google":
517
+ from google.genai.errors import APIError
518
+
519
+ config=dict(
520
+ temperature=0.0,
521
+ response_mime_type="application/json",
522
+ # Enforcing a JSON array of strings structure for reliable parsing
523
+ response_schema={
524
+ "type": "array",
525
+ "items": {"type": "string"}
526
+ }
527
+ )
528
+ try:
529
+ response = _client.models.generate_content(
530
+ model=_model,
531
+ contents=prompt,
532
+ config=config,
533
+ )
534
+ json_string = response.text.strip()
535
+ ml_jobs = json.loads(json_string)
536
+
537
+ if not isinstance(ml_jobs, list) or not all(isinstance(job, str) for job in ml_jobs):
538
+ return []
539
+ return ml_jobs
540
+
541
+ except APIError as e:
542
+ return [f"An API error occurred: {e}"]
543
+ except json.JSONDecodeError as e:
544
+ if 'response' in locals():
545
+ return [f"Raw response text: {response.text}"]
546
+ except Exception as e:
547
+ return [f"An unexpected error occurred: {e}"]
548
+
549
+ elif _provider == "openai" and _model in GPT_MODELS_LATEST:
550
+
551
+ def reasoning_and_verbosity():
552
+ reasoning_effort, verbosity = "medium", "medium"
553
+ if _model == "gpt-5-nano":
554
+ reasoning_effort = "low"
555
+ elif _model in ["gpt-5-mini", "gpt-5-codex-mini"]:
556
+ verbosity = "medium"
557
+ elif _model in ["gpt-5", "gpt-5-codex", "gpt-5-pro"]:
558
+ reasoning_effort = "high"
559
+ verbosity = "high"
560
+ return (reasoning_effort, verbosity)
561
+
562
+ args = _set_args(
563
+ model=_model,
564
+ instructions=system_prompt,
565
+ input=user_prompt,
566
+ previous_id=None,
567
+ store=False,
568
+ reasoning_effort=reasoning_and_verbosity()[0],
569
+ verbosity=reasoning_and_verbosity()[1],
570
+ )
571
+ resp = _client.responses.create(**args)
572
+ txt = _out(resp)
573
+ return txt
574
+
575
+ elif _provider == "anthropic":
576
+ try:
577
+ resp = _client.messages.create(
578
+ model=_model,
579
+ system=system_prompt,
580
+ messages=[{"role": "user", "content": user_prompt}],
581
+ temperature=0.0,
582
+ max_tokens= 128,
583
+ )
584
+
585
+ # Extract plain text from Claude's content blocks
586
+ text = ""
587
+ content = getattr(resp, "content", None)
588
+ if content and isinstance(content, list):
589
+ parts = []
590
+ for block in content:
591
+ # blocks typically like {"type": "text", "text": "..."}
592
+ t = getattr(block, "text", None)
593
+ if not t and isinstance(block, dict):
594
+ t = block.get("text")
595
+ if t:
596
+ parts.append(t)
597
+ text = " ".join(parts)
598
+ return text
599
+ except Exception:
600
+ pass
601
+
602
+ else:
603
+ resp = _client.chat.completions.create(
604
+ model=_model,
605
+ messages=[
606
+ {"role": "system", "content": system_prompt},
607
+ {"role": "user", "content": user_prompt},
608
+ ],
609
+ temperature=0.0,
610
+ max_tokens=128,
611
+ )
612
+ text = resp.choices[0].message.content
613
+ return text
614
+
615
+ return "Configure LLM Profiles or contact your administrator."
616
+
617
+ system_prompt = (
618
+ "You are a strict machine learning task classifier for an ML workbench.\n"
619
+ "Your job is to label the user's task desc. with all relevant tags from a fixed list.\n\n"
620
+ )
621
+
622
+ # --- 1. Define the Master List of ML Tasks (Generalized) ---
623
+ ml_task_list = [
624
+ # Supervised Learning
625
+ "classification", "regression", "ranking", "object_detection", "image_segmentation",
626
+
627
+ # Unsupervised Learning
628
+ "clustering", "dimensionality_reduction", "anomaly_detection", "association_rule_mining",
629
+
630
+ # Sequential/Time Data
631
+ "time_series_forecasting", "sequence_labeling", "survival_analysis",
632
+
633
+ # Specialized Domains
634
+ "natural_language_processing", "computer_vision", "reinforcement_learning",
635
+ "generative_modeling", "causal_inference", "risk_modeling", "graph_analysis",
636
+
637
+ # Foundational/Pipeline Steps
638
+ "feature_engineering", "statistical_inference", "data_preprocessing",
639
+ "model_validation", "hyperparameter_tuning"
640
+ ]
641
+
642
+ # --- 2. Construct the Generalized Prompt for the LLM ---
643
+ task_description = refined_question
644
+
645
+ user_prompt = f"""
646
+ Analyze the following task description:
647
+ ---
648
+ {task_description}
649
+ ---
650
+
651
+ If a dataset profile is provided, use it together with the task description to inform your selection.
652
+ Identify and select ALL job types from the provided, extensive list that are directly
653
+ relevant to achieving the goals outlined in the task description (either as the
654
+ core goal, prerequisites, or essential steps).
655
+
656
+ ML Jobs List: {', '.join(ml_task_list)}
657
+
658
+ Respond ONLY with a valid JSON array of strings containing the selected ML job names.
659
+ Example Response: ["natural_language_processing", "classification", "feature_engineering"]
660
+ """
661
+
662
+ if dataset_profile:
663
+ user_prompt += f"\nDataset profile:\n{dataset_profile}\n"
664
+
665
+ llm_profile = _prof.get_profile("classification") or _prof.get_profile("admin")
666
+ if not llm_profile:
667
+ return "ERROR"
668
+
669
+ llm_profile['client'] = _prof.get_client(llm_profile)
670
+
671
+ # Extract raw content
672
+ tasks = ml_response(user_prompt, system_prompt, llm_profile)
673
+ return tasks
674
+
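A call sketch; note that the Google branch returns a parsed list while the other branches return raw model text, so callers may still need json.loads on non-Google profiles:

    jobs = classify_ml_job_agent(refined, dataset_profile=None)
    # Google path: e.g. ["classification", "feature_engineering", "model_validation"]
    # Other providers: a JSON-array string to be parsed by the caller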
675
+
676
+ def text_formatter_agent(text):
677
+ """
678
+ Parses an ML job description using the Gemini API with Structured JSON Output.
679
+ """
680
+
681
+ def generate_formatted_report(data):
682
+ """
683
+ Generates a formatted string of the structured data in a clean,
684
+ document-like format mimicking the requested list structure.
685
+
686
+ Returns:
687
+ str: The complete formatted report as a string.
688
+ """
689
+ if not data:
690
+ return "No data to display."
691
+
692
+ output_lines = []
693
+
694
+ # --- Helper Functions ---
695
+ def clean_md(text):
696
+ """Removes markdown bold syntax."""
697
+ return text.replace("**", "")
698
+
699
+ def format_smart_list_item(prefix, item_text, width=80):
700
+ """
701
+ Content-agnostic list formatter.
702
+ Detects 'Header: Description' patterns and formats them inline.
703
+ Returns the formatted string.
704
+ """
705
+ cleaned = clean_md(item_text)
706
+
707
+ # Check for "Header: Description" pattern
708
+ # We look for a colon appearing early in the string (e.g., within first 60 chars)
709
+ colon_match = re.match(r"^([^:]{1,60}):\s*(.*)", cleaned, re.DOTALL)
710
+
711
+ if colon_match:
712
+ header = colon_match.group(1).strip()
713
+ description = colon_match.group(2).strip()
714
+
715
+ # Format: PREFIX HEADER: Description
716
+ full_line = f"{prefix} {header.upper()}: {description}\n"
717
+ else:
718
+ # Format: PREFIX Content
719
+ full_line = f"{prefix} {cleaned}\n"
720
+
721
+ # Calculate hanging indent (aligning with the start of the text after the prefix)
722
+ # Length of prefix + 1 space
723
+ indent_width = len(prefix) + 1
724
+ hanging_indent = " " * indent_width
725
+
726
+ return textwrap.fill(
727
+ full_line,
728
+ width=width,
729
+ subsequent_indent=hanging_indent
730
+ )
731
+
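The smart list formatter upper-cases a leading "Header:" segment and wraps with a hanging indent; an equivalent standalone sketch of that transform:

    import re, textwrap

    item = "Data cleaning: impute missing BMI values"
    m = re.match(r"^([^:]{1,60}):\s*(.*)", item, re.DOTALL)
    line = f"• {m.group(1).strip().upper()}: {m.group(2).strip()}"
    print(textwrap.fill(line, width=60, subsequent_indent="  "))
    # • DATA CLEANING: impute missing BMI values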
732
+ # --- Report Construction ---
733
+
734
+ # 1. Title
735
+ title = clean_md(data.get("project_title", "Project Report"))
736
+ output_lines.append("\n" + "=" * 80)
737
+ output_lines.append(f"{title.center(80)}")
738
+ output_lines.append("=" * 80 + "\n")
739
+
740
+ # 2. Project Goal
741
+ output_lines.append("PROJECT GOAL\n")
742
+ output_lines.append("-" * 12)
743
+ goal = clean_md(data.get("project_goal", ""))
744
+ output_lines.append(textwrap.fill(goal, width=80))
745
+ output_lines.append("") # Adds a blank line
746
+
747
+ # 3. Key Objectives
748
+ if data.get("key_objectives"):
749
+ output_lines.append("KEY OBJECTIVES & STRATEGIC INSIGHTS")
750
+ output_lines.append("-" * 35)
751
+ for item in data["key_objectives"]:
752
+ output_lines.append(format_smart_list_item("•", item))
753
+ output_lines.append("")
754
+
755
+ # 4. ML Tasks (Numbered List)
756
+ if data.get("ml_tasks"):
757
+ output_lines.append("ML EXECUTION TASKS")
758
+ output_lines.append("-" * 18)
759
+ for i, task in enumerate(data["ml_tasks"], 1):
760
+ # Using i. as prefix
761
+ output_lines.append(format_smart_list_item(f"{i}.", task))
762
+ output_lines.append("")
763
+
764
+ # 5. Deliverables
765
+ if data.get("expected_deliverables"):
766
+ output_lines.append("EXPECTED DELIVERABLES")
767
+ output_lines.append("-" * 21)
768
+ for item in data["expected_deliverables"]:
769
+ output_lines.append(format_smart_list_item("•", item))
770
+ output_lines.append("")
771
+
772
+ # Join all lines with newlines
773
+ return "\n".join(output_lines)
774
+
775
+ formatter_profile = _prof.get_profile("classification") or _prof.get_profile("admin")
776
+ _api_key = formatter_profile["api_key"]
777
+ _provider = formatter_profile["provider"]
778
+ _model = formatter_profile["model"]
779
+
780
+ # 1. Define the Schema for strict JSON enforcement
781
+ schema = {
782
+ "type": "OBJECT",
783
+ "properties": {
784
+ "project_title": {"type": "STRING"},
785
+ "project_goal": {"type": "STRING"},
786
+ "key_objectives": {
787
+ "type": "ARRAY",
788
+ "items": {"type": "STRING"}
789
+ },
790
+ "data_inputs": {
791
+ "type": "OBJECT",
792
+ "properties": {
793
+ "description_items": {
794
+ "type": "ARRAY",
795
+ "items": {"type": "STRING"}
796
+ },
797
+ "extracted_features": {
798
+ "type": "ARRAY",
799
+ "items": {"type": "STRING"},
800
+ "description": "List of specific column names or features mentioned (e.g. Age, BMI)"
801
+ }
802
+ }
803
+ },
804
+ "ml_tasks": {
805
+ "type": "ARRAY",
806
+ "items": {"type": "STRING"}
807
+ },
808
+ "expected_deliverables": {
809
+ "type": "ARRAY",
810
+ "items": {"type": "STRING"}
811
+ }
812
+ },
813
+ "required": ["project_title", "project_goal", "key_objectives", "data_inputs", "ml_tasks"]
814
+ }
815
+
816
+ # 2. Construct the API Request
817
+ url = f"https://generativelanguage.googleapis.com/v1beta/models/{_model}:generateContent?key={_api_key}"
818
+
819
+ headers = {"Content-Type": "application/json"}
820
+
821
+ payload = {
822
+ "contents": [{
823
+ "parts": [{
824
+ "text": f"Extract the structured data from the following ML Job Description:\n\n{text}"
825
+ }]
826
+ }],
827
+ "generationConfig": {
828
+ "responseMimeType": "application/json",
829
+ "responseSchema": schema
830
+ }
831
+ }
832
+
833
+ try:
834
+ response = requests.post(url, headers=headers, json=payload)
835
+ response.raise_for_status()
836
+
837
+ result_json = response.json()
838
+
839
+ # 4. Extract and Parse Content
840
+ raw_text_response = result_json["candidates"][0]["content"]["parts"][0]["text"]
841
+ parsed_data = json.loads(raw_text_response)
842
+
843
+ report = generate_formatted_report(parsed_data)
844
+ return parsed_data
845
+
846
+ except requests.exceptions.RequestException as e:
847
+ if 'response' in locals() and response is not None:
848
+ return (f"API Request Failed: {e}\n\nResponse info: {response.text}")
849
+ return None
850
+ except (KeyError, IndexError, json.JSONDecodeError) as e:
851
+ return f"Parsing Failed: {e}"
852
+
853
+
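For reference, a sketch of the structured object text_formatter_agent aims to return; the field names follow the schema above and the values are illustrative only:

    parsed_data = {
        "project_title": "Churn Risk Modelling",
        "project_goal": "Predict which customers are likely to churn next quarter.",
        "key_objectives": ["Model interpretability: rank the top churn drivers"],
        "data_inputs": {
            "description_items": ["Monthly billing extract"],
            "extracted_features": ["tenure", "monthly_charges"],
        },
        "ml_tasks": ["Train and validate a binary classifier"],
        "expected_deliverables": ["Scored customer list with churn probabilities"],
    }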