syntaxmatrix 2.5.5__py3-none-any.whl → 2.5.5.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,819 @@
1
+ # syntaxmatrix/agents.py
2
+ from __future__ import annotations
3
+ import os, re, json, textwrap, requests
4
+ import pandas as pd
5
+
6
+ from typing import Optional, List
7
+
8
+ from syntaxmatrix import utils
9
+ from syntaxmatrix.settings.model_map import GPT_MODELS_LATEST
10
+ from .. import profiles as _prof
11
+ from ..gpt_models_latest import set_args as _set_args, extract_output_text as _out
12
+ from google.genai import types
13
+ import tiktoken
14
+ from google.genai.errors import APIError
15
+
16
+
17
+ def token_calculator(total_input_content, llm_profile):
18
+
19
+ _client = llm_profile["client"]
20
+ _model = llm_profile["model"]
21
+ _provider = llm_profile["provider"].lower()
22
+
23
+ if _provider == "google":
24
+ tok = _client.models.count_tokens(
25
+ model=_model,
26
+ contents=total_input_content
27
+ )
28
+ input_prompt_tokens = tok.total_tokens
29
+ return input_prompt_tokens
30
+
31
+ elif _provider == "anthropic":
32
+ tok = _client.beta.messages.count_tokens(
33
+ model=_model,
34
+ system="calculate the total token for the given prompt",
35
+ messages=[{"role": "user", "content": total_input_content}]
36
+ )
37
+ input_prompt_tokens = tok.input_tokens
38
+ return input_prompt_tokens
39
+
40
+ else:
41
+ try:
+ enc = tiktoken.encoding_for_model(_model)
+ except KeyError:
+ # tiktoken raises KeyError for model names it does not know; fall back to a general-purpose encoding.
+ enc = tiktoken.get_encoding("cl100k_base")
42
+ input_prompt_tokens = len(enc.encode(total_input_content))
43
+ return input_prompt_tokens
44
+
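+ # Usage sketch (illustrative, not part of the published module): llm_profile is a dict with
+ # "client", "model" and "provider" keys, as built elsewhere via _prof.get_client(...), e.g.:
+ #   profile = {"client": client, "model": "gpt-4o-mini", "provider": "openai"}   # placeholder values
+ #   n_tokens = token_calculator("Summarise the uploaded dataset.", profile)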
45
+ def mlearning_agent(user_prompt, system_prompt, coding_profile, temperature=0.1, max_tokens=4096):
46
+ """
47
+ Returns:
48
+ (text, usage_dict)
49
+
50
+ usage_dict schema (best-effort, depending on provider):
51
+ {
52
+ "provider": str,
53
+ "model": str,
54
+ "input_tokens": int|None,
55
+ "output_tokens": int|None,
56
+ "total_tokens": int|None,
57
+ "error": str|None
58
+ }
59
+ """
60
+
61
+ # coding_profile['client'] = _prof.get_client(coding_profile)
62
+ _client = coding_profile["client"]
63
+ _provider = coding_profile["provider"].lower()
64
+ _model = coding_profile["model"]
65
+
66
+ usage = {
67
+ "provider": _provider,
68
+ "model": _model,
69
+ "input_tokens": None,
70
+ "output_tokens": None,
71
+ "total_tokens": None,
72
+ }
73
+
74
+ def _clean_text(t):
75
+ if t is None:
76
+ return ""
77
+ if not isinstance(t, str):
78
+ t = str(t)
79
+ return t.strip()
80
+
81
+ def _get_usage_val(u, keys):
82
+ """Read usage fields from dicts or objects, resiliently."""
83
+ if u is None:
84
+ return None
85
+ for k in keys:
86
+ try:
87
+ if isinstance(u, dict) and k in u:
88
+ return u[k]
89
+ if hasattr(u, k):
90
+ return getattr(u, k)
91
+ except Exception:
92
+ continue
93
+ return None
94
+
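+ # e.g. _get_usage_val(resp.usage, ["input_tokens", "prompt_tokens"]) reads the first field that
+ # exists, whether the usage object is a dict or an SDK object (illustrative sketch).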
95
+ # Google
96
+ def google_generate_code():
97
+ """
98
+ Generates content with the Gemini API and captures token usage
99
+ from the response's usage_metadata.
100
+ """
101
+ nonlocal usage
102
+
103
+ try:
104
+ # 1. Generation configuration
105
+ config = types.GenerateContentConfig(
106
+ system_instruction=system_prompt,
107
+ temperature=temperature,
108
+ max_output_tokens=max_tokens,
109
+ )
110
+
111
+ # 2. API Call
112
+ resp = _client.models.generate_content(
113
+ model=_model,
114
+ contents=[user_prompt],
115
+ config=config,
116
+ )
117
+
118
+ print("\n888888888888888888888888888888\n")
119
+ print("RESPONSE:\n", resp)
120
+ print("\n888888888888888888888888888888\n")
121
+
122
+ print("\n888888888888888888888888888888\n")
123
+ print("USAGEMETADATA:\n", resp.usage)
124
+ print("\n888888888888888888888888888888\n")
125
+
126
+ # 3. Token usage capture
127
+ um = getattr(resp, "usage_metadata", None)
128
+ if um is not None:
129
+ usage["input_tokens"] = _get_usage_val(um, ["prompt_token_count", "promptTokenCount", "input_tokens"])
130
+ usage["output_tokens"] = _get_usage_val(um, ["candidates_token_count", "candidatesTokenCount", "output_tokens"])
131
+ usage["total_tokens"] = _get_usage_val(um, ["total_token_count", "totalTokenCount", "total_tokens"])
132
+
133
+ # 4. Response Extraction (same robust logic as before)
134
+ text = getattr(resp, "text", None)
135
+ if isinstance(text, str) and text.strip():
136
+ return text.strip()
137
+
138
+ chunks = []
139
+ candidates = getattr(resp, "candidates", None) or []
140
+ for cand in candidates:
141
+ content = getattr(cand, "content", None)
142
+ if content:
143
+ parts = getattr(content, "parts", None) or []
144
+ for part in parts:
145
+ t = getattr(part, "text", None)
146
+ if t:
147
+ chunks.append(str(t))
148
+
149
+ text = "\n".join(chunks).strip()
150
+ if text:
151
+ return text
152
+
153
+ # 5. Handle blocked response
154
+ fb = getattr(resp, "prompt_feedback", None)
155
+ block_reason = getattr(fb, "block_reason", None) if fb else None
156
+ if block_reason and block_reason != types.BlockedReason.BLOCKED_REASON_UNSPECIFIED:
157
+ raise RuntimeError(f"{_model} blocked the response. Reason: {block_reason.name}")
158
+ raise RuntimeError(f"{_model} failed to return content due to insufficient data.")
159
+
160
+ except APIError as e:
161
+ error_msg = f"Gemini API Error: {e}"
162
+
163
+ except Exception as e:
164
+ error_msg = f"An unexpected error occurred during API call or processing: {e}"
165
+
166
+ # --- Return the error message wrapped in the required output code structure ---
167
+ msg = f"I smxAI have instructed {error_msg}\n"
168
+ return (
169
+ f"# {msg}\n"
170
+ "from syntaxmatrix.display import show\n"
171
+ f"show({msg!r})\n"
172
+ )
173
+
174
+ # OpenAI Responses API
175
+ def gpt_models_latest_generate_code():
176
+ nonlocal usage
177
+
178
+ def reasoning_and_verbosity():
179
+ reasoning_effort, verbosity = "medium", "medium"
180
+ if _model == "gpt-5-nano":
181
+ reasoning_effort, verbosity = "low", "low"
182
+ elif _model in ["gpt-5-mini", "gpt-5-codex-mini"]:
183
+ reasoning_effort, verbosity = "medium", "medium"
184
+ elif _model in ["gpt-5", "gpt-5-codex", "gpt-5-pro"]:
185
+ reasoning_effort, verbosity = "high", "high"
186
+ return (reasoning_effort, verbosity)
187
+ try:
188
+ args = _set_args(
189
+ model=_model,
190
+ instructions=system_prompt,
191
+ input=user_prompt,
192
+ previous_id=None,
193
+ store=False,
194
+ reasoning_effort=reasoning_and_verbosity()[0],
195
+ verbosity=reasoning_and_verbosity()[1],
196
+ )
197
+ resp = _client.responses.create(**args)
198
+
199
+ um = resp.usage
200
+ usage["input_tokens"] = um.input_tokens
201
+ usage["output_tokens"] = um.output_tokens
202
+ usage["total_tokens"] = um.total_tokens
203
+
204
+ code = _out(resp).strip()
205
+ if code:
206
+ return code
207
+
208
+ # Try to surface any block reason (safety / policy / etc.)
209
+ block_reason = None
210
+ output = getattr(resp, "output", None) or []
211
+ for item in output:
212
+ fr = getattr(item, "finish_reason", None)
213
+ if fr and fr != "stop":
214
+ block_reason = fr
215
+ break
216
+ if block_reason:
217
+ raise RuntimeError(f"{_model} stopped with reason: {block_reason}")
218
+ raise RuntimeError(f"{_model} returned an empty response in this section due to insufficient data.")
219
+
220
+ except APIError as e:
221
+ # IMPORTANT: return VALID PYTHON so the dashboard can show the error
222
+ msg = f"I smxAI have instructed {e}"
223
+ return (
224
+ f"# {msg}\n"
225
+ "from syntaxmatrix.display import show\n"
226
+ f"show({msg!r})\n"
227
+ )
228
+
229
+ except Exception as e:
230
+ # IMPORTANT: return VALID PYTHON so the dashboard can show the error
231
+ msg = f"I smxAI have instructed {e}"
232
+ return (
233
+ f"# {msg}\n"
234
+ "from syntaxmatrix.display import show\n"
235
+ f"show({msg!r})\n"
236
+ )
237
+
238
+ # Anthropic
239
+ def anthropic_generate_code():
240
+ nonlocal usage
241
+ try:
242
+ resp = _client.messages.create(
243
+ model=_model,
244
+ max_tokens=max_tokens,
245
+ temperature=temperature,
246
+ system=system_prompt,
247
+ messages=[
248
+ {"role": "user", "content": user_prompt}
249
+ ]
250
+ )
251
+
252
+ um = resp.usage
253
+ usage["input_tokens"] = um.input_tokens
254
+ usage["output_tokens"] = um.output_tokens
255
+ usage["total_tokens"] = um.input_tokens + um.output_tokens
256
+
257
+ # Extract plain text from Claude-style content blocks
258
+ text_blocks = []
259
+ content = getattr(resp, "content", None) or []
260
+ for block in content:
261
+ t = getattr(block, "text", None)
262
+ if not t and isinstance(block, dict):
263
+ t = (block.get("text") or "").strip()
264
+ if t:
265
+ text_blocks.append(str(t))
266
+
267
+ text = "\n".join(text_blocks).strip()
268
+ if text:
269
+ return text
270
+
271
+ stop_reason = getattr(resp, "stop_reason", None)
272
+ if stop_reason and stop_reason != "end_turn":
273
+ raise RuntimeError(f"{_model} stopped with reason: {stop_reason}")
274
+ raise RuntimeError(f"{_model} returned an empty response in this section due to insufficient data.")
275
+
276
+ except Exception as e:
277
+ msg = f"I smxAI have instructed {e}\n"
278
+ return (
279
+ f"# {msg}\n"
280
+ "from syntaxmatrix.display import show\n"
281
+ f"show({msg!r})\n"
282
+ )
283
+
284
+ # OpenAI Chat Completions
285
+ def openai_sdk_generate_code():
286
+ nonlocal usage
287
+ try:
288
+ resp = _client.chat.completions.create(
289
+ model=_model,
290
+ messages=[
291
+ {"role": "system", "content": system_prompt},
292
+ {"role": "user", "content": user_prompt},
293
+ ],
294
+ temperature=temperature,
295
+ max_tokens=max_tokens,
296
+ )
297
+
300
+ um = resp.usage
301
+ usage["input_tokens"] = um.prompt_tokens
302
+ usage["output_tokens"] = um.completion_tokens
303
+ usage["total_tokens"] = um.total_tokens
304
+
305
+ text = resp.choices[0].message.content
306
+ if text:
307
+ return text
308
+
309
+ # Try to surface any block reason (safety / policy / etc.)
310
+ block_reason = None
311
+ choices = getattr(resp, "choices", None) or []
312
+ if choices:
313
+ first = choices[0]
314
+ fr = getattr(first, "finish_reason", None)
315
+ if fr and fr != "stop":
316
+ block_reason = fr
317
+
318
+ if block_reason:
319
+ raise RuntimeError(f"{_model} stopped with reason: {block_reason}")
320
+ # Fallback: nothing useful came back
321
+ raise RuntimeError(f"{_model} returned nothing in this section due to insufficient data.")
322
+
323
+ except Exception as e:
324
+ # IMPORTANT: return VALID PYTHON so the dashboard can show the error
325
+ msg = f"I smxAI have instructed {e}"
326
+ return (
327
+ f"# {msg}\n"
328
+ "from syntaxmatrix.display import show\n"
329
+ f"show({msg!r})\n"
330
+ )
331
+
332
+ # print("TTOOKKEENN: ", token_calculator(system_prompt + user_prompt, coding_profile))
333
+
334
+ if _provider == "google":
335
+ code = google_generate_code()
336
+ elif _provider == "openai" and _model in GPT_MODELS_LATEST:
337
+ code = gpt_models_latest_generate_code()
338
+ elif _provider == "anthropic":
339
+ code = anthropic_generate_code()
340
+ else:
341
+ code = openai_sdk_generate_code()
342
+
343
+ return code, usage
344
+
345
+
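+ # Usage sketch (hedged; the profile name and prompts below are placeholders, not values used by the package):
+ #   coding_profile = _prof.get_profile("coding") or _prof.get_profile("admin")
+ #   coding_profile["client"] = _prof.get_client(coding_profile)
+ #   code, usage = mlearning_agent(
+ #       user_prompt="Train a classifier on df and report accuracy.",
+ #       system_prompt="Return only runnable Python.",
+ #       coding_profile=coding_profile,
+ #   )
+ #   # usage -> {"provider": ..., "model": ..., "input_tokens": ..., "output_tokens": ..., "total_tokens": ...}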
346
+ def refine_question_agent(raw_question: str, dataset_context: str | None = None) -> str:
347
+
348
+ def response_agent(user_prompt, system_prompt, llm_profile, temp=0.0, max_tokens=128):
349
+ _profile = llm_profile
350
+
351
+ _client = _profile["client"]
352
+ _provider = _profile["provider"].lower()
353
+ _model = _profile["model"]
354
+
355
+ # Google GenAI
356
+ if _provider == "google":
357
+ resp = _client.models.generate_content(
358
+ model=_model,
359
+ contents=system_prompt + "\n\n" + user_prompt,
360
+ )
361
+ text = resp.text
362
+ return text.strip()
363
+
364
+ # OpenAI
365
+ elif _provider == "openai" and _model in GPT_MODELS_LATEST:
366
+
367
+ def reasoning_and_verbosity():
368
+ reasoning_effort, verbosity = "medium", "medium"
369
+ if _model == "gpt-5-nano":
370
+ if max_tokens <= 256:
371
+ reasoning_effort = "minimal"
372
+ else: reasoning_effort = "low"
373
+ elif _model in ["gpt-5-mini", "gpt-5-codex-mini"]:
374
+ verbosity = "medium"
375
+ elif _model in ["gpt-5", "gpt-5-codex", "gpt-5-pro"]:
376
+ reasoning_effort = "high"
377
+ verbosity = "high"
378
+ return (reasoning_effort, verbosity)
379
+
380
+ args = _set_args(
381
+ model=_model,
382
+ instructions=system_prompt,
383
+ input=user_prompt,
384
+ previous_id=None,
385
+ store=False,
386
+ reasoning_effort=reasoning_and_verbosity()[0],
387
+ verbosity=reasoning_and_verbosity()[1],
388
+ )
389
+ resp = _client.responses.create(**args)
390
+ txt = _out(resp)
391
+ return txt
392
+
393
+ # Anthropic
394
+ elif _provider == "anthropic":
395
+ try:
396
+ resp = _client.messages.create(
397
+ model=_model,
398
+ system=system_prompt,
399
+ messages=[{"role": "user", "content": user_prompt}],
400
+ temperature=0.2,
401
+ max_tokens= max_tokens,
402
+ )
403
+
404
+ # Extract plain text from Claude's content blocks
405
+ text = ""
406
+ content = getattr(resp, "content", None)
407
+ if content and isinstance(content, list):
408
+ parts = []
409
+ for block in content:
410
+ # blocks typically like {"type": "text", "text": "..."}
411
+ t = getattr(block, "text", None)
412
+ if not t and isinstance(block, dict):
413
+ t = block.get("text")
414
+ if t:
415
+ parts.append(t)
416
+ text = " ".join(parts)
417
+ return text
418
+ except Exception:
419
+ pass
420
+
421
+ # OpenAI SDK Compatible (Chat Completions)
422
+ else:
423
+ resp = _client.chat.completions.create(
424
+ model=_model,
425
+ messages=[
426
+ {"role": "system", "content": system_prompt},
427
+ {"role": "user", "content": user_prompt},
428
+ ],
429
+ temperature=temp,
430
+ max_tokens=max_tokens,
431
+ )
432
+ text = resp.choices[0].message.content
433
+ return text
434
+
435
+ return "Configure LLM Profiles or contact your administrator."
436
+
437
+ system_prompt = (
438
+ "You rewrite user questions into specification Machine Learning (ML) job description. "
439
+ "If a dataset summary is provided, use it to respect column and help you redefine the question. "
440
+ "DO NOT write andy prelude or preamble"
441
+ )
442
+
443
+ user_prompt = f"User question:\n{raw_question}\n\n"
444
+ if dataset_context:
445
+ user_prompt += f"Dataset summary:\n{dataset_context}\n"
446
+
447
+ _refiner_profile = _prof.get_profile("classification") or _prof.get_profile("admin")
448
+ if not _refiner_profile:
449
+ return "ERROR"
450
+
451
+ _refiner_profile['client'] = _prof.get_client(_refiner_profile)
452
+
453
+ refined_question = response_agent(user_prompt, system_prompt, _refiner_profile, temp=0.0, max_tokens=128)
454
+ return refined_question
455
+
456
+
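+ # Example call (sketch; the question and dataset summary are invented):
+ #   refined = refine_question_agent(
+ #       "can you predict churn?",
+ #       dataset_context="columns: tenure, monthly_charges, churn",
+ #   )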
457
+ def classify_ml_job_agent(refined_question, dataset_profile):
458
+ """
459
+ Instructs the configured LLM to analyze a task description
460
+ and return a list of associated machine learning job/task types.
461
+ This version uses a highly extensive, generalized list of ML jobs
462
+ to ensure robustness across all domains (NLP, CV, RL, etc.).
463
+
464
+ Args:
465
+ refined_question: The detailed description of the statistical/ML task.
+ dataset_profile: Optional dataset profile/summary used to ground the classification.
466
+
467
+ Returns:
468
+ A list of strings identifying the relevant ML jobs. Returns an empty
469
+ list if the API call fails or the output cannot be parsed.
470
+ """
471
+
472
+ def ml_response(user_prompt, system_prompt, profile):
473
+ _profile = profile  # e.g. _prof.get_profile("admin")
474
+
475
+ _client = _profile["client"]
476
+ _provider = _profile["provider"].lower()
477
+ _model = _profile["model"]
478
+
479
+ prompt = user_prompt + "\n\n" + system_prompt
480
+
481
+ # Google GenAI
482
+ if _provider == "google":
483
+ from google.genai.errors import APIError
484
+
485
+ config=dict(
486
+ temperature=0.0,
487
+ response_mime_type="application/json",
488
+ # Enforcing a JSON array of strings structure for reliable parsing
489
+ response_schema={
490
+ "type": "array",
491
+ "items": {"type": "string"}
492
+ }
493
+ )
494
+ try:
495
+ response = _client.models.generate_content(
496
+ model=_model,
497
+ contents=prompt,
498
+ config=config,
499
+ )
500
+ json_string = response.text.strip()
501
+ ml_jobs = json.loads(json_string)
502
+
503
+ if not isinstance(ml_jobs, list) or not all(isinstance(job, str) for job in ml_jobs):
504
+ return []
505
+ return ml_jobs
506
+
507
+ except APIError as e:
508
+ return [f"An API error occurred: {e}"]
509
+ except json.JSONDecodeError as e:
510
+ if 'response' in locals():
511
+ return [f"Failed to parse model output as JSON: {e}. Raw response text: {response.text}"]
+ return [f"Failed to parse model output as JSON: {e}"]
512
+ except Exception as e:
513
+ return [f"An unexpected error occurred: {e}"]
514
+
515
+ elif _provider == "openai" and _model in GPT_MODELS_LATEST:
516
+
517
+ def reasoning_and_verbosity():
518
+ reasoning_effort, verbosity = "medium", "medium"
519
+ if _model == "gpt-5-nano":
520
+ reasoning_effort = "low"
521
+ elif _model in ["gpt-5-mini", "gpt-5-codex-mini"]:
522
+ verbosity = "medium"
523
+ elif _model in ["gpt-5", "gpt-5-codex", "gpt-5-pro"]:
524
+ reasoning_effort = "high"
525
+ verbosity = "high"
526
+ return (reasoning_effort, verbosity)
527
+
528
+ args = _set_args(
529
+ model=_model,
530
+ instructions=system_prompt,
531
+ input=user_prompt,
532
+ previous_id=None,
533
+ store=False,
534
+ reasoning_effort=reasoning_and_verbosity()[0],
535
+ verbosity=reasoning_and_verbosity()[1],
536
+ )
537
+ resp = _client.responses.create(**args)
538
+ txt = _out(resp)
539
+ return txt
540
+
541
+ elif _provider == "anthropic":
542
+ try:
543
+ resp = _client.messages.create(
544
+ model=_model,
545
+ system=system_prompt,
546
+ messages=[{"role": "user", "content": user_prompt}],
547
+ temperature=0.0,
548
+ max_tokens= 128,
549
+ )
550
+
551
+ # Extract plain text from Claude's content blocks
552
+ text = ""
553
+ content = getattr(resp, "content", None)
554
+ if content and isinstance(content, list):
555
+ parts = []
556
+ for block in content:
557
+ # blocks typically like {"type": "text", "text": "..."}
558
+ t = getattr(block, "text", None)
559
+ if not t and isinstance(block, dict):
560
+ t = block.get("text")
561
+ if t:
562
+ parts.append(t)
563
+ text = " ".join(parts)
564
+ return text
565
+ except Exception:
566
+ pass
567
+
568
+ else:
569
+ resp = _client.chat.completions.create(
570
+ model=_model,
571
+ messages=[
572
+ {"role": "system", "content": system_prompt},
573
+ {"role": "user", "content": user_prompt},
574
+ ],
575
+ temperature=0.0,
576
+ max_tokens=128,
577
+ )
578
+ text = resp.choices[0].message.content
579
+ return text
580
+
581
+ return "Configure LLM Profiles or contact your administrator."
582
+
583
+ system_prompt = (
584
+ "You are a strict machine learning task classifier for an ML workbench.\n"
585
+ "Your job is to label the user's task desc. with all relevant tags from a fixed list.\n\n"
586
+ )
587
+
588
+ # --- 1. Define the Master List of ML Tasks (Generalized) ---
589
+ ml_task_list = [
590
+ # Supervised Learning
591
+ "classification", "regression", "ranking", "object_detection", "image_segmentation",
592
+
593
+ # Unsupervised Learning
594
+ "clustering", "dimensionality_reduction", "anomaly_detection", "association_rule_mining",
595
+
596
+ # Sequential/Time Data
597
+ "time_series_forecasting", "sequence_labeling", "survival_analysis",
598
+
599
+ # Specialized Domains
600
+ "natural_language_processing", "computer_vision", "reinforcement_learning",
601
+ "generative_modeling", "causal_inference", "risk_modeling", "graph_analysis",
602
+
603
+ # Foundational/Pipeline Steps
604
+ "feature_engineering", "statistical_inference", "data_preprocessing",
605
+ "model_validation", "hyperparameter_tuning"
606
+ ]
607
+
608
+ # --- 2. Construct the Generalized Prompt for the LLM ---
609
+ task_description = refined_question
610
+
611
+ user_prompt = f"""
612
+ Analyze the following task description:
613
+ ---
614
+ {task_description}
615
+ ---
616
+
617
+ If a Dataset Profile is provided, use its information, together with the task description, to inform your selection.
618
+ Identify and select ALL job types from the provided, extensive list that are directly
619
+ relevant to achieving the goals outlined in the task description (either as the
620
+ core goal, prerequisites, or essential steps).
621
+
622
+ ML Jobs List: {', '.join(ml_task_list)}
623
+
624
+ Respond ONLY with a valid JSON array of strings containing the selected ML job names.
625
+ Example Response: ["natural_language_processing", "classification", "feature_engineering"]
626
+ """
627
+
628
+ if dataset_profile:
629
+ user_prompt += f"\nDataset profile:\n{dataset_profile}\n"
630
+
631
+ llm_profile = _prof.get_profile("classification") or _prof.get_profile("admin")
632
+ if not llm_profile:
633
+ return "ERROR"
634
+
635
+ llm_profile['client'] = _prof.get_client(llm_profile)
636
+
637
+ # Extract raw content
638
+ tasks = ml_response(user_prompt, system_prompt, llm_profile)
639
+ return tasks
640
+
641
+
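+ # Example (sketch): the return type is provider-dependent. The Google structured-output path
+ # returns a Python list, while the other providers return the raw text of a JSON array that the
+ # caller still needs to parse, e.g.:
+ #   jobs = classify_ml_job_agent(refined_question, dataset_profile=profile_summary)
+ #   # e.g. ["classification", "feature_engineering", "model_validation"]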
642
+ def text_formatter_agent(text):
643
+ """
644
+ Parses an ML job description using the Gemini API with Structured JSON Output.
645
+ """
646
+
647
+ def generate_formatted_report(data):
648
+ """
649
+ Generates a formatted string of the structured data in a clean,
650
+ document-like format mimicking the requested list structure.
651
+
652
+ Returns:
653
+ str: The complete formatted report as a string.
654
+ """
655
+ if not data:
656
+ return "No data to display."
657
+
658
+ output_lines = []
659
+
660
+ # --- Helper Functions ---
661
+ def clean_md(text):
662
+ """Removes markdown bold syntax."""
663
+ return text.replace("**", "")
664
+
665
+ def format_smart_list_item(prefix, item_text, width=80):
666
+ """
667
+ Content-agnostic list formatter.
668
+ Detects 'Header: Description' patterns and formats them inline.
669
+ Returns the formatted string.
670
+ """
671
+ cleaned = clean_md(item_text)
672
+
673
+ # Check for "Header: Description" pattern
674
+ # We look for a colon appearing early in the string (e.g., within first 60 chars)
675
+ colon_match = re.match(r"^([^:]{1,60}):\s*(.*)", cleaned, re.DOTALL)
676
+
677
+ if colon_match:
678
+ header = colon_match.group(1).strip()
679
+ description = colon_match.group(2).strip()
680
+
681
+ # Format: PREFIX HEADER: Description
682
+ full_line = f"{prefix} {header.upper()}: {description}\n"
683
+ else:
684
+ # Format: PREFIX Content
685
+ full_line = f"{prefix} {cleaned}\n"
686
+
687
+ # Calculate hanging indent (aligning with the start of the text after the prefix)
688
+ # Length of prefix + 1 space
689
+ indent_width = len(prefix) + 1
690
+ hanging_indent = " " * indent_width
691
+
692
+ return textwrap.fill(
693
+ full_line,
694
+ width=width,
695
+ subsequent_indent=hanging_indent
696
+ )
697
+
698
+ # --- Report Construction ---
699
+
700
+ # 1. Title
701
+ title = clean_md(data.get("project_title", "Project Report"))
702
+ output_lines.append("\n" + "=" * 80)
703
+ output_lines.append(f"{title.center(80)}")
704
+ output_lines.append("=" * 80 + "\n")
705
+
706
+ # 2. Project Goal
707
+ output_lines.append("PROJECT GOAL\n")
708
+ output_lines.append("-" * 12)
709
+ goal = clean_md(data.get("project_goal", ""))
710
+ output_lines.append(textwrap.fill(goal, width=80))
711
+ output_lines.append("") # Adds a blank line
712
+
713
+ # 3. Key Objectives
714
+ if data.get("key_objectives"):
715
+ output_lines.append("KEY OBJECTIVES & STRATEGIC INSIGHTS")
716
+ output_lines.append("-" * 35)
717
+ for item in data["key_objectives"]:
718
+ output_lines.append(format_smart_list_item("•", item))
719
+ output_lines.append("")
720
+
721
+ # 4. ML Tasks (Numbered List)
722
+ if data.get("ml_tasks"):
723
+ output_lines.append("ML EXECUTION TASKS")
724
+ output_lines.append("-" * 18)
725
+ for i, task in enumerate(data["ml_tasks"], 1):
726
+ # Using i. as prefix
727
+ output_lines.append(format_smart_list_item(f"{i}.", task))
728
+ output_lines.append("")
729
+
730
+ # 5. Deliverables
731
+ if data.get("expected_deliverables"):
732
+ output_lines.append("EXPECTED DELIVERABLES")
733
+ output_lines.append("-" * 21)
734
+ for item in data["expected_deliverables"]:
735
+ output_lines.append(format_smart_list_item("•", item))
736
+ output_lines.append("")
737
+
738
+ # Join all lines with newlines
739
+ return "\n".join(output_lines)
740
+
741
+ formatter_profile = _prof.get_profile("classification") or _prof.get_profile("admin")
742
+ _api_key = formatter_profile["api_key"]
743
+ _provider = formatter_profile["provider"]
744
+ _model = formatter_profile["model"]
745
+
746
+ # 1. Define the Schema for strict JSON enforcement
747
+ schema = {
748
+ "type": "OBJECT",
749
+ "properties": {
750
+ "project_title": {"type": "STRING"},
751
+ "project_goal": {"type": "STRING"},
752
+ "key_objectives": {
753
+ "type": "ARRAY",
754
+ "items": {"type": "STRING"}
755
+ },
756
+ "data_inputs": {
757
+ "type": "OBJECT",
758
+ "properties": {
759
+ "description_items": {
760
+ "type": "ARRAY",
761
+ "items": {"type": "STRING"}
762
+ },
763
+ "extracted_features": {
764
+ "type": "ARRAY",
765
+ "items": {"type": "STRING"},
766
+ "description": "List of specific column names or features mentioned (e.g. Age, BMI)"
767
+ }
768
+ }
769
+ },
770
+ "ml_tasks": {
771
+ "type": "ARRAY",
772
+ "items": {"type": "STRING"}
773
+ },
774
+ "expected_deliverables": {
775
+ "type": "ARRAY",
776
+ "items": {"type": "STRING"}
777
+ }
778
+ },
779
+ "required": ["project_title", "project_goal", "key_objectives", "data_inputs", "ml_tasks"]
780
+ }
781
+
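+ # Example of a payload the schema above is meant to enforce (illustrative values only):
+ # {
+ #   "project_title": "Churn Prediction",
+ #   "project_goal": "Predict customer churn from account features.",
+ #   "key_objectives": ["Identify drivers of churn"],
+ #   "data_inputs": {"description_items": ["customer accounts table"], "extracted_features": ["tenure", "monthly_charges"]},
+ #   "ml_tasks": ["classification"],
+ #   "expected_deliverables": ["trained model", "evaluation report"]
+ # }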
782
+ # 2. Construct the API Request
783
+ url = f"https://generativelanguage.googleapis.com/v1beta/models/{_model}:generateContent?key={_api_key}"
784
+
785
+ headers = {"Content-Type": "application/json"}
786
+
787
+ payload = {
788
+ "contents": [{
789
+ "parts": [{
790
+ "text": f"Extract the structured data from the following ML Job Description:\n\n{text}"
791
+ }]
792
+ }],
793
+ "generationConfig": {
794
+ "responseMimeType": "application/json",
795
+ "responseSchema": schema
796
+ }
797
+ }
798
+
799
+ try:
800
+ response = requests.post(url, headers=headers, json=payload)
801
+ response.raise_for_status()
802
+
803
+ result_json = response.json()
804
+
805
+ # 4. Extract and Parse Content
806
+ raw_text_response = result_json["candidates"][0]["content"]["parts"][0]["text"]
807
+ parsed_data = json.loads(raw_text_response)
808
+
809
+ report = generate_formatted_report(parsed_data)
810
+ return parsed_data
811
+
812
+ except requests.exceptions.RequestException as e:
813
+ if 'response' in locals() and response is not None:
814
+ return (f"API Request Failed: {e}\n\nResponse info: {response.text}")
815
+ return None
816
+ except (KeyError, IndexError, json.JSONDecodeError) as e:
817
+ return f"Parsing Failed: {e}"
818
+
819
+
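+ # Usage sketch (not part of the released file): text_formatter_agent returns the parsed dict on
+ # success, and an error string or None on failure, so callers should type-check the result:
+ #   result = text_formatter_agent(job_description_text)
+ #   if isinstance(result, dict):
+ #       print(result["project_title"])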