syntaxmatrix 2.3.5__py3-none-any.whl → 2.5.5.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,810 @@
+ # syntaxmatrix/agents.py
+ from __future__ import annotations
+ import os, re, json, textwrap, requests
+ import pandas as pd
+
+ from typing import Optional, List
+
+ from syntaxmatrix import utils
+ from syntaxmatrix.settings.model_map import GPT_MODELS_LATEST
+ from .. import profiles as _prof
+ from ..gpt_models_latest import set_args as _set_args, extract_output_text as _out
+ from google.genai import types
+ import tiktoken
+ from google.genai.errors import APIError
+
+
+ def token_calculator(total_input_content, llm_profile):
+
+     _client = llm_profile["client"]
+     _model = llm_profile["model"]
+     _provider = llm_profile["provider"].lower()
+
+     if _provider == "google":
+         tok = _client.models.count_tokens(
+             model=_model,
+             contents=total_input_content
+         )
+         input_prompt_tokens = tok.total_tokens
+         return input_prompt_tokens
+
+     elif _provider == "anthropic":
+         tok = _client.beta.messages.count_tokens(
+             model=_model,
+             system="calculate the total token for the given prompt",
+             messages=[{"role": "user", "content": total_input_content}]
+         )
+         input_prompt_tokens = tok.input_tokens
+         return input_prompt_tokens
+
+     else:
+         enc = tiktoken.encoding_for_model(_model)
+         input_prompt_tokens = len(enc.encode(total_input_content))
+         return input_prompt_tokens
+
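+ # Illustrative usage only: the profile dict mirrors the shape used throughout
+ # this module ("provider", "model", "client"); the client object and model name
+ # below are hypothetical placeholders, not values shipped with the package.
+ #
+ #   profile = {"provider": "openai", "model": "gpt-4o-mini", "client": openai_client}
+ #   n_tokens = token_calculator("Summarise the sales dataset by region.", profile)
+ #   # -> tiktoken-based count for OpenAI-compatible providers
+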
+ def mlearning_agent(user_prompt, system_prompt, coding_profile, temperature=0.1, max_tokens=4096):
+     """
+     Returns:
+       (text, usage_dict)
+
+     usage_dict schema (best-effort, depending on provider):
+       {
+         "provider": str,
+         "model": str,
+         "input_tokens": int|None,
+         "output_tokens": int|None,
+         "total_tokens": int|None,
+         "error": str|None
+       }
+     """
+
+     # coding_profile['client'] = _prof.get_client(coding_profile)
+     _client = coding_profile["client"]
+     _provider = coding_profile["provider"].lower()
+     _model = coding_profile["model"]
+
+     usage = {
+         "provider": _provider,
+         "model": _model,
+         "input_tokens": None,
+         "output_tokens": None,
+         "total_tokens": None,
+     }
+
+     def _clean_text(t):
+         if t is None:
+             return ""
+         if not isinstance(t, str):
+             t = str(t)
+         return t.strip()
+
+     def _get_usage_val(u, keys):
+         """Read usage fields from dicts or objects, resiliently."""
+         if u is None:
+             return None
+         for k in keys:
+             try:
+                 if isinstance(u, dict) and k in u:
+                     return u[k]
+                 if hasattr(u, k):
+                     return getattr(u, k)
+             except Exception:
+                 continue
+         return None
+
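+     # Illustrative only: _get_usage_val returns the first matching key from a
+     # dict or the first matching attribute from an object, e.g.
+     #   _get_usage_val({"prompt_tokens": 12}, ["input_tokens", "prompt_tokens"])  # -> 12
+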
+     # Google
+     def google_generate_code():
+         """
+         Generates content using the Gemini API and captures token usage
+         from the response's usage metadata.
+         """
+         nonlocal usage
+
+         try:
+             # 1. Generation config
+             config = types.GenerateContentConfig(
+                 system_instruction=system_prompt,
+                 temperature=temperature,
+                 max_output_tokens=max_tokens,
+             )
+
+             # 2. API call
+             resp = _client.models.generate_content(
+                 model=_model,
+                 contents=[user_prompt],
+                 config=config,
+             )
+
+             # 3. Token usage capture
+             um = resp.usage_metadata
+             usage["input_tokens"] = um.prompt_token_count
+             usage["output_tokens"] = um.candidates_token_count  # generated-output tokens, per the usage_dict schema
+             usage["total_tokens"] = um.total_token_count
+
+             # 4. Response extraction
+             text = getattr(resp, "text", None)
+             if isinstance(text, str) and text.strip():
+                 return text.strip()
+
+             chunks = []
+             candidates = getattr(resp, "candidates", None) or []
+             for cand in candidates:
+                 content = getattr(cand, "content", None)
+                 if content:
+                     parts = getattr(content, "parts", None) or []
+                     for part in parts:
+                         t = getattr(part, "text", None)
+                         if t:
+                             chunks.append(str(t))
+
+             text = "\n".join(chunks).strip()
+             if text:
+                 return text
+
+             # 5. Handle blocked response
+             fb = getattr(resp, "prompt_feedback", None)
+             block_reason = getattr(fb, "block_reason", None) if fb else None
+             if block_reason and block_reason != types.BlockedReason.BLOCKED_REASON_UNSPECIFIED:
+                 raise RuntimeError(f"{_model} blocked the response. Reason: {block_reason.name}")
+             raise RuntimeError(f"{_model} failed to return content due to insufficient data.")
+
+         except APIError as e:
+             error_msg = f"Gemini API Error: {e}"
+
+         except Exception as e:
+             error_msg = f"An unexpected error occurred during API call or processing: {e}"
+
+         # --- Return the error message wrapped in the required output code structure ---
+         msg = f"I smxAI have instructed {error_msg}\n"
+         return (
+             f"# {msg}\n"
+             "from syntaxmatrix.display import show\n"
+             f"show({msg!r})\n"
+         )
+
+     # OpenAI Responses API
+     def gpt_models_latest_generate_code():
+         nonlocal usage
+
+         def reasoning_and_verbosity():
+             reasoning_effort, verbosity = "medium", "medium"
+             if _model == "gpt-5-nano":
+                 reasoning_effort, verbosity = "low", "low"
+             elif _model in ["gpt-5-mini", "gpt-5-codex-mini"]:
+                 reasoning_effort, verbosity = "medium", "medium"
+             elif _model in ["gpt-5", "gpt-5-codex", "gpt-5-pro"]:
+                 reasoning_effort, verbosity = "high", "high"
+             return (reasoning_effort, verbosity)
+
+         try:
+             args = _set_args(
+                 model=_model,
+                 instructions=system_prompt,
+                 input=user_prompt,
+                 previous_id=None,
+                 store=False,
+                 reasoning_effort=reasoning_and_verbosity()[0],
+                 verbosity=reasoning_and_verbosity()[1],
+             )
+             resp = _client.responses.create(**args)
+
+             um = resp.usage
+             usage["input_tokens"] = um.input_tokens
+             usage["output_tokens"] = um.output_tokens
+             usage["total_tokens"] = um.total_tokens
+
+             code = _out(resp).strip()
+             if code:
+                 return code
+
+             # Try to surface any block reason (safety / policy / etc.)
+             block_reason = None
+             output = getattr(resp, "output", None) or []
+             for item in output:
+                 fr = getattr(item, "finish_reason", None)
+                 if fr and fr != "stop":
+                     block_reason = fr
+                     break
+             if block_reason:
+                 raise RuntimeError(f"{_model} stopped with reason: {block_reason}")
+             raise RuntimeError(f"{_model} returned an empty response in this section due to insufficient data.")
+
+         except Exception as e:
+             # A single handler suffices here; the google.genai APIError imported
+             # above does not apply to OpenAI Responses calls.
+             # IMPORTANT: return VALID PYTHON so the dashboard can show the error
+             msg = f"I smxAI have instructed {e}"
+             return (
+                 f"# {msg}\n"
+                 "from syntaxmatrix.display import show\n"
+                 f"show({msg!r})\n"
+             )
+
+     # Anthropic
+     def anthropic_generate_code():
+         nonlocal usage
+         try:
+             resp = _client.messages.create(
+                 model=_model,
+                 max_tokens=max_tokens,
+                 temperature=temperature,
+                 system=system_prompt,
+                 messages=[
+                     {"role": "user", "content": user_prompt}
+                 ]
+             )
+
+             um = resp.usage
+             usage["input_tokens"] = um.input_tokens
+             usage["output_tokens"] = um.output_tokens
+             usage["total_tokens"] = um.input_tokens + um.output_tokens
+
+             # Extract plain text from Claude-style content blocks
+             text_blocks = []
+             content = getattr(resp, "content", None) or []
+             for block in content:
+                 t = getattr(block, "text", None)
+                 if not t and isinstance(block, dict):
+                     t = (block.get("text") or "").strip()
+                 if t:
+                     text_blocks.append(str(t))
+
+             text = "\n".join(text_blocks).strip()
+             if text:
+                 return text
+
+             stop_reason = getattr(resp, "stop_reason", None)
+             if stop_reason and stop_reason != "end_turn":
+                 raise RuntimeError(f"{_model} stopped with reason: {stop_reason}")
+             raise RuntimeError(f"{_model} returned an empty response in this section due to insufficient data.")
+
+         except Exception as e:
+             msg = f"I smxAI have instructed {e}\n"
+             return (
+                 f"# {msg}\n"
+                 "from syntaxmatrix.display import show\n"
+                 f"show({msg!r})\n"
+             )
+
+     # OpenAI Chat Completions
+     def openai_sdk_generate_code():
+         nonlocal usage
+         try:
+             resp = _client.chat.completions.create(
+                 model=_model,
+                 messages=[
+                     {"role": "system", "content": system_prompt},
+                     {"role": "user", "content": user_prompt},
+                 ],
+                 temperature=temperature,
+                 max_tokens=max_tokens,
+             )
+
+             um = resp.usage
+             usage["input_tokens"] = um.prompt_tokens
+             usage["output_tokens"] = um.completion_tokens
+             usage["total_tokens"] = um.total_tokens
+
+             text = resp.choices[0].message.content
+             if text:
+                 return text
+
+             # Try to surface any block reason (safety / policy / etc.)
+             block_reason = None
+             choices = getattr(resp, "choices", None) or []
+             if choices:
+                 first = choices[0]
+                 fr = getattr(first, "finish_reason", None)
+                 if fr and fr != "stop":
+                     block_reason = fr
+
+             if block_reason:
+                 raise RuntimeError(f"{_model} stopped with reason: {block_reason}")
+             # Fallback: nothing useful came back
+             raise RuntimeError(f"{_model} returned nothing in this section due to insufficient data.")
+
+         except Exception as e:
+             # IMPORTANT: return VALID PYTHON so the dashboard can show the error
+             msg = f"I smxAI have instructed {e}"
+             return (
+                 f"# {msg}\n"
+                 "from syntaxmatrix.display import show\n"
+                 f"show({msg!r})\n"
+             )
+
+     # print("TTOOKKEENN: ", token_calculator(system_prompt + user_prompt, coding_profile))
+
+     if _provider == "google":
+         code = google_generate_code()
+     elif _provider == "openai" and _model in GPT_MODELS_LATEST:
+         code = gpt_models_latest_generate_code()
+     elif _provider == "anthropic":
+         code = anthropic_generate_code()
+     else:
+         code = openai_sdk_generate_code()
+
+     return code, usage
+
+
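+ # Illustrative usage only (all values hypothetical): mlearning_agent returns the
+ # generated code plus a best-effort usage dict, whichever provider is configured.
+ #
+ #   code, usage = mlearning_agent(
+ #       user_prompt="Train a churn classifier on df and report ROC-AUC.",
+ #       system_prompt="Return only runnable Python.",
+ #       coding_profile=coding_profile,   # {"provider", "model", "client", ...}
+ #   )
+ #   print(usage["total_tokens"])
+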
+ def refine_question_agent(raw_question: str, dataset_context: str | None = None) -> str:
+
+     def response_agent(user_prompt, system_prompt, llm_profile, temp=0.0, max_tokens=128):
+         _profile = llm_profile
+
+         _client = _profile["client"]
+         _provider = _profile["provider"].lower()
+         _model = _profile["model"]
+
+         # Google GenAI
+         if _provider == "google":
+             resp = _client.models.generate_content(
+                 model=_model,
+                 contents=system_prompt + "\n\n" + user_prompt,
+             )
+             text = resp.text or ""
+             return text.strip()
+
+         # OpenAI
+         elif _provider == "openai" and _model in GPT_MODELS_LATEST:
+
+             def reasoning_and_verbosity():
+                 reasoning_effort, verbosity = "medium", "medium"
+                 if _model == "gpt-5-nano":
+                     if max_tokens <= 256:
+                         reasoning_effort = "minimal"
+                     else:
+                         reasoning_effort = "low"
+                 elif _model in ["gpt-5-mini", "gpt-5-codex-mini"]:
+                     verbosity = "medium"
+                 elif _model in ["gpt-5", "gpt-5-codex", "gpt-5-pro"]:
+                     reasoning_effort = "high"
+                     verbosity = "high"
+                 return (reasoning_effort, verbosity)
+
+             args = _set_args(
+                 model=_model,
+                 instructions=system_prompt,
+                 input=user_prompt,
+                 previous_id=None,
+                 store=False,
+                 reasoning_effort=reasoning_and_verbosity()[0],
+                 verbosity=reasoning_and_verbosity()[1],
+             )
+             resp = _client.responses.create(**args)
+             txt = _out(resp)
+             return txt
+
+         # Anthropic
+         elif _provider == "anthropic":
+             try:
+                 resp = _client.messages.create(
+                     model=_model,
+                     system=system_prompt,
+                     messages=[{"role": "user", "content": user_prompt}],
+                     temperature=0.2,
+                     max_tokens=max_tokens,
+                 )
+
+                 # Extract plain text from Claude's content blocks
+                 text = ""
+                 content = getattr(resp, "content", None)
+                 if content and isinstance(content, list):
+                     parts = []
+                     for block in content:
+                         # blocks typically like {"type": "text", "text": "..."}
+                         t = getattr(block, "text", None)
+                         if not t and isinstance(block, dict):
+                             t = block.get("text")
+                         if t:
+                             parts.append(t)
+                     text = " ".join(parts)
+                 return text
+             except Exception:
+                 pass  # fall through to the default message below
+
+         # OpenAI SDK Compatible (Chat Completions)
+         else:
+             resp = _client.chat.completions.create(
+                 model=_model,
+                 messages=[
+                     {"role": "system", "content": system_prompt},
+                     {"role": "user", "content": user_prompt},
+                 ],
+                 temperature=temp,
+                 max_tokens=max_tokens,
+             )
+             text = resp.choices[0].message.content
+             return text
+
+         return "Configure LLM Profiles or contact your administrator."
+
+     system_prompt = (
+         "You rewrite user questions into a specific Machine Learning (ML) job description. "
+         "If a dataset summary is provided, use it to respect column names and to help you redefine the question. "
+         "DO NOT write any prelude or preamble."
+     )
+
+     user_prompt = f"User question:\n{raw_question}\n\n"
+     if dataset_context:
+         user_prompt += f"Dataset summary:\n{dataset_context}\n"
+
+     _refiner_profile = _prof.get_profile("classification") or _prof.get_profile("admin")
+     if not _refiner_profile:
+         return "ERROR"
+
+     _refiner_profile['client'] = _prof.get_client(_refiner_profile)
+
+     refined_question = response_agent(user_prompt, system_prompt, _refiner_profile, temp=0.0, max_tokens=128)
+     return refined_question
+
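+ # Illustrative usage only (hypothetical inputs): the refiner turns a loose user
+ # question into an ML job description, optionally grounded in a dataset summary.
+ #
+ #   refined = refine_question_agent(
+ #       "which customers will leave?",
+ #       dataset_context="columns: tenure, monthly_charges, churn (0/1)",
+ #   )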
+
+ def classify_ml_job_agent(refined_question, dataset_profile):
+     """
+     Instructs an LLM to analyze a task description and return a list of
+     associated machine learning job/task types. A broad, generalized list of
+     ML jobs is used to keep the classifier robust across domains (NLP, CV, RL, etc.).
+
+     Args:
+         refined_question: The detailed description of the statistical/ML task.
+         dataset_profile: Optional dataset summary used to ground the classification.
+
+     Returns:
+         A list of strings identifying the relevant ML jobs. Returns an empty
+         list if the API call fails or the output cannot be parsed.
+     """
+
+     def ml_response(user_prompt, system_prompt, profile):
+         _profile = profile
+
+         _client = _profile["client"]
+         _provider = _profile["provider"].lower()
+         _model = _profile["model"]
+
+         prompt = user_prompt + "\n\n" + system_prompt
+
+         # Google GenAI
+         if _provider == "google":
+             from google.genai.errors import APIError
+
+             config = dict(
+                 temperature=0.0,
+                 response_mime_type="application/json",
+                 # Enforcing a JSON array of strings structure for reliable parsing
+                 response_schema={
+                     "type": "array",
+                     "items": {"type": "string"}
+                 }
+             )
+             try:
+                 response = _client.models.generate_content(
+                     model=_model,
+                     contents=prompt,
+                     config=config,
+                 )
+                 json_string = response.text.strip()
+                 ml_jobs = json.loads(json_string)
+
+                 if not isinstance(ml_jobs, list) or not all(isinstance(job, str) for job in ml_jobs):
+                     return []
+                 return ml_jobs
+
+             except APIError as e:
+                 return [f"An API error occurred: {e}"]
+             except json.JSONDecodeError as e:
+                 if 'response' in locals():
+                     return [f"Raw response text: {response.text}"]
+                 return [f"JSON parsing failed: {e}"]
+             except Exception as e:
+                 return [f"An unexpected error occurred: {e}"]
+
+         elif _provider == "openai" and _model in GPT_MODELS_LATEST:
+
+             def reasoning_and_verbosity():
+                 reasoning_effort, verbosity = "medium", "medium"
+                 if _model == "gpt-5-nano":
+                     reasoning_effort = "low"
+                 elif _model in ["gpt-5-mini", "gpt-5-codex-mini"]:
+                     verbosity = "medium"
+                 elif _model in ["gpt-5", "gpt-5-codex", "gpt-5-pro"]:
+                     reasoning_effort = "high"
+                     verbosity = "high"
+                 return (reasoning_effort, verbosity)
+
+             args = _set_args(
+                 model=_model,
+                 instructions=system_prompt,
+                 input=user_prompt,
+                 previous_id=None,
+                 store=False,
+                 reasoning_effort=reasoning_and_verbosity()[0],
+                 verbosity=reasoning_and_verbosity()[1],
+             )
+             resp = _client.responses.create(**args)
+             txt = _out(resp)
+             return txt
+
+         elif _provider == "anthropic":
+             try:
+                 resp = _client.messages.create(
+                     model=_model,
+                     system=system_prompt,
+                     messages=[{"role": "user", "content": user_prompt}],
+                     temperature=0.0,
+                     max_tokens=128,
+                 )
+
+                 # Extract plain text from Claude's content blocks
+                 text = ""
+                 content = getattr(resp, "content", None)
+                 if content and isinstance(content, list):
+                     parts = []
+                     for block in content:
+                         # blocks typically like {"type": "text", "text": "..."}
+                         t = getattr(block, "text", None)
+                         if not t and isinstance(block, dict):
+                             t = block.get("text")
+                         if t:
+                             parts.append(t)
+                     text = " ".join(parts)
+                 return text
+             except Exception:
+                 pass  # fall through to the default message below
+
+         else:
+             resp = _client.chat.completions.create(
+                 model=_model,
+                 messages=[
+                     {"role": "system", "content": system_prompt},
+                     {"role": "user", "content": user_prompt},
+                 ],
+                 temperature=0.0,
+                 max_tokens=128,
+             )
+             text = resp.choices[0].message.content
+             return text
+
+         return "Configure LLM Profiles or contact your administrator."
+
+     system_prompt = (
+         "You are a strict machine learning task classifier for an ML workbench.\n"
+         "Your job is to label the user's task description with all relevant tags from a fixed list.\n\n"
+     )
+
+     # --- 1. Define the Master List of ML Tasks (Generalized) ---
+     ml_task_list = [
+         # Supervised Learning
+         "classification", "regression", "ranking", "object_detection", "image_segmentation",
+
+         # Unsupervised Learning
+         "clustering", "dimensionality_reduction", "anomaly_detection", "association_rule_mining",
+
+         # Sequential/Time Data
+         "time_series_forecasting", "sequence_labeling", "survival_analysis",
+
+         # Specialized Domains
+         "natural_language_processing", "computer_vision", "reinforcement_learning",
+         "generative_modeling", "causal_inference", "risk_modeling", "graph_analysis",
+
+         # Foundational/Pipeline Steps
+         "feature_engineering", "statistical_inference", "data_preprocessing",
+         "model_validation", "hyperparameter_tuning"
+     ]
+
+     # --- 2. Construct the Generalized Prompt for the LLM ---
+     task_description = refined_question
+
+     user_prompt = f"""
+     Analyze the following task description:
+     ---
+     {task_description}
+     ---
+
+     If a dataset profile is provided, use it together with the task description when selecting job types.
+     Identify and select ALL job types from the provided, extensive list that are directly
+     relevant to achieving the goals outlined in the task description (either as the
+     core goal, prerequisites, or essential steps).
+
+     ML Jobs List: {', '.join(ml_task_list)}
+
+     Respond ONLY with a valid JSON array of strings containing the selected ML job names.
+     Example Response: ["natural_language_processing", "classification", "feature_engineering"]
+     """
+
+     if dataset_profile:
+         user_prompt += f"\nDataset profile:\n{dataset_profile}\n"
+
+     llm_profile = _prof.get_profile("classification") or _prof.get_profile("admin")
+     if not llm_profile:
+         return "ERROR"
+
+     llm_profile['client'] = _prof.get_client(llm_profile)
+
+     # Extract raw content
+     tasks = ml_response(user_prompt, system_prompt, llm_profile)
+     return tasks
+
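+ # Illustrative usage only: classify_ml_job_agent is expected to return a JSON
+ # list of tags drawn from ml_task_list (the exact output depends on the model),
+ # e.g. ["classification", "feature_engineering", "model_validation"].
+ #
+ #   tags = classify_ml_job_agent(refined_question, dataset_profile)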
+
+ def text_formatter_agent(text):
+     """
+     Parses an ML job description using the Gemini API with Structured JSON Output.
+     """
+
+     def generate_formatted_report(data):
+         """
+         Generates a formatted string of the structured data in a clean,
+         document-like format mimicking the requested list structure.
+
+         Returns:
+             str: The complete formatted report as a string.
+         """
+         if not data:
+             return "No data to display."
+
+         output_lines = []
+
+         # --- Helper Functions ---
+         def clean_md(text):
+             """Removes markdown bold syntax."""
+             return text.replace("**", "")
+
+         def format_smart_list_item(prefix, item_text, width=80):
+             """
+             Content-agnostic list formatter.
+             Detects 'Header: Description' patterns and formats them inline.
+             Returns the formatted string.
+             """
+             cleaned = clean_md(item_text)
+
+             # Check for "Header: Description" pattern
+             # We look for a colon appearing early in the string (e.g., within first 60 chars)
+             colon_match = re.match(r"^([^:]{1,60}):\s*(.*)", cleaned, re.DOTALL)
+
+             if colon_match:
+                 header = colon_match.group(1).strip()
+                 description = colon_match.group(2).strip()
+
+                 # Format: PREFIX HEADER: Description
+                 full_line = f"{prefix} {header.upper()}: {description}\n"
+             else:
+                 # Format: PREFIX Content
+                 full_line = f"{prefix} {cleaned}\n"
+
+             # Calculate hanging indent (aligning with the start of the text after the prefix)
+             # Length of prefix + 1 space
+             indent_width = len(prefix) + 1
+             hanging_indent = " " * indent_width
+
+             return textwrap.fill(
+                 full_line,
+                 width=width,
+                 subsequent_indent=hanging_indent
+             )
+
+         # --- Report Construction ---
+
+         # 1. Title
+         title = clean_md(data.get("project_title", "Project Report"))
+         output_lines.append("\n" + "=" * 80)
+         output_lines.append(f"{title.center(80)}")
+         output_lines.append("=" * 80 + "\n")
+
+         # 2. Project Goal
+         output_lines.append("PROJECT GOAL\n")
+         output_lines.append("-" * 12)
+         goal = clean_md(data.get("project_goal", ""))
+         output_lines.append(textwrap.fill(goal, width=80))
+         output_lines.append("")  # Adds a blank line
+
+         # 3. Key Objectives
+         if data.get("key_objectives"):
+             output_lines.append("KEY OBJECTIVES & STRATEGIC INSIGHTS")
+             output_lines.append("-" * 35)
+             for item in data["key_objectives"]:
+                 output_lines.append(format_smart_list_item("•", item))
+             output_lines.append("")
+
+         # 4. ML Tasks (Numbered List)
+         if data.get("ml_tasks"):
+             output_lines.append("ML EXECUTION TASKS")
+             output_lines.append("-" * 18)
+             for i, task in enumerate(data["ml_tasks"], 1):
+                 # Using i. as prefix
+                 output_lines.append(format_smart_list_item(f"{i}.", task))
+             output_lines.append("")
+
+         # 5. Deliverables
+         if data.get("expected_deliverables"):
+             output_lines.append("EXPECTED DELIVERABLES")
+             output_lines.append("-" * 21)
+             for item in data["expected_deliverables"]:
+                 output_lines.append(format_smart_list_item("•", item))
+             output_lines.append("")
+
+         # Join all lines with newlines
+         return "\n".join(output_lines)
+
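+     # Illustrative only (hypothetical data): generate_formatted_report renders the
+     # parsed dict as a plain-text report with TITLE / PROJECT GOAL / KEY OBJECTIVES /
+     # ML EXECUTION TASKS / EXPECTED DELIVERABLES sections, e.g.
+     #   generate_formatted_report({"project_title": "Churn Study",
+     #                              "project_goal": "Predict churn",
+     #                              "ml_tasks": ["Train a classifier"]})
+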
732
+ formatter_profile = _prof.get_profile("classification") or _prof.get_profile("classification")
733
+ _api_key = formatter_profile["api_key"]
734
+ _provider = formatter_profile["provider"]
735
+ _model = formatter_profile["model"]
736
+
737
+ # 1. Define the Schema for strict JSON enforcement
738
+ schema = {
739
+ "type": "OBJECT",
740
+ "properties": {
741
+ "project_title": {"type": "STRING"},
742
+ "project_goal": {"type": "STRING"},
743
+ "key_objectives": {
744
+ "type": "ARRAY",
745
+ "items": {"type": "STRING"}
746
+ },
747
+ "data_inputs": {
748
+ "type": "OBJECT",
749
+ "properties": {
750
+ "description_items": {
751
+ "type": "ARRAY",
752
+ "items": {"type": "STRING"}
753
+ },
754
+ "extracted_features": {
755
+ "type": "ARRAY",
756
+ "items": {"type": "STRING"},
757
+ "description": "List of specific column names or features mentioned (e.g. Age, BMI)"
758
+ }
759
+ }
760
+ },
761
+ "ml_tasks": {
762
+ "type": "ARRAY",
763
+ "items": {"type": "STRING"}
764
+ },
765
+ "expected_deliverables": {
766
+ "type": "ARRAY",
767
+ "items": {"type": "STRING"}
768
+ }
769
+ },
770
+ "required": ["project_title", "project_goal", "key_objectives", "data_inputs", "ml_tasks"]
771
+ }
772
+
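+     # Illustrative only: a minimal response conforming to the schema above might look like
+     #   {"project_title": "Churn Study", "project_goal": "Predict churn",
+     #    "key_objectives": ["Quantify churn drivers"],
+     #    "data_inputs": {"description_items": [], "extracted_features": ["tenure"]},
+     #    "ml_tasks": ["Train and validate a classifier"]}
+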
+     # 2. Construct the API Request
+     url = f"https://generativelanguage.googleapis.com/v1beta/models/{_model}:generateContent?key={_api_key}"
+
+     headers = {"Content-Type": "application/json"}
+
+     payload = {
+         "contents": [{
+             "parts": [{
+                 "text": f"Extract the structured data from the following ML Job Description:\n\n{text}"
+             }]
+         }],
+         "generationConfig": {
+             "responseMimeType": "application/json",
+             "responseSchema": schema
+         }
+     }
+
+     try:
+         response = requests.post(url, headers=headers, json=payload)
+         response.raise_for_status()
+
+         result_json = response.json()
+
+         # 3. Extract and Parse Content
+         raw_text_response = result_json["candidates"][0]["content"]["parts"][0]["text"]
+         parsed_data = json.loads(raw_text_response)
+
+         # The formatted report is built for display purposes; the structured dict is returned.
+         report = generate_formatted_report(parsed_data)
+         return parsed_data
+
+     except requests.exceptions.RequestException as e:
+         if 'response' in locals() and response is not None:
+             return (f"API Request Failed: {e}\n\nResponse info: {response.text}")
+         return None
+     except (KeyError, IndexError, json.JSONDecodeError) as e:
+         return f"Parsing Failed: {e}"
+
+
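+ # Illustrative usage only (hypothetical input): text_formatter_agent returns the
+ # parsed JSON dict (project_title, project_goal, key_objectives, data_inputs,
+ # ml_tasks, ...), which generate_formatted_report() can render for display.
+ #
+ #   parsed = text_formatter_agent("Predict churn from the telco dataset ...")
+ #   if isinstance(parsed, dict):
+ #       print(sorted(parsed.keys()))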