crewlyze 3.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48) hide show
  1. package/.dockerignore +12 -0
  2. package/.gitattributes +2 -0
  3. package/CHANGELOG.md +86 -0
  4. package/Dockerfile +21 -0
  5. package/LICENSE +21 -0
  6. package/README.md +139 -0
  7. package/USAGE.md +106 -0
  8. package/agents/__init__.py +0 -0
  9. package/agents/cleaner.py +38 -0
  10. package/agents/insights.py +44 -0
  11. package/agents/relation.py +36 -0
  12. package/agents/visualizer.py +41 -0
  13. package/assets/badge_crewai.svg +4 -0
  14. package/assets/badge_matplotlib.svg +4 -0
  15. package/assets/badge_ollama.svg +4 -0
  16. package/assets/badge_pandas.svg +4 -0
  17. package/assets/badge_seaborn.svg +4 -0
  18. package/assets/branding_image.png +0 -0
  19. package/assets/complete_workflow.svg +216 -0
  20. package/assets/favicon.png +0 -0
  21. package/assets/logo.png +0 -0
  22. package/assets/stars.svg +12 -0
  23. package/bin/crewlyze.js +79 -0
  24. package/config/README.md +129 -0
  25. package/config/__init__.py +1 -0
  26. package/config/context.py +16 -0
  27. package/config/llm_config.py +300 -0
  28. package/config/metrics_tracker.py +70 -0
  29. package/crew.py +870 -0
  30. package/crewlyze-3.1.0.tgz +0 -0
  31. package/fix_syntax.py +54 -0
  32. package/main.py +1279 -0
  33. package/package.json +22 -0
  34. package/pyproject.toml +32 -0
  35. package/requirements.txt +33 -0
  36. package/tools/__init__.py +0 -0
  37. package/tools/dataset_tools.py +803 -0
  38. package/ui/__init__.py +3 -0
  39. package/ui/copilot.py +200 -0
  40. package/ui/export.py +800 -0
  41. package/update_appjs.py +54 -0
  42. package/update_llm.py +21 -0
  43. package/update_main.py +20 -0
  44. package/web/app.js +3142 -0
  45. package/web/index.html +1105 -0
  46. package/web/style.css +2561 -0
  47. package/workflows/__init__.py +0 -0
  48. package/workflows/pipeline.py +254 -0
@@ -0,0 +1,803 @@
1
+ # Crewlyze
2
+ # Copyright (c) 2025 Sowmiyan S
3
+ # Licensed under the MIT License
4
+
5
+ """
6
+ Dataset tools for CrewAI agents.
7
+
8
+ Security note
9
+ -------------
10
+ All LLM-generated code is executed in a separate child process (subprocess),
11
+ never via exec() in the parent process. This keeps generated code away from the
12
+ parent's globals and in-memory state, but it is not a sandbox: the child runs as the same OS user and inherits the parent's environment variables and filesystem access.
13
+
14
+ Performance note
15
+ ----------------
16
+ build_dataset_profile() is a pure Python function (not a CrewAI tool) that
17
+ pre-computes a compact dataset summary and returns it as a string. Injecting
18
+ this string into task descriptions eliminates 6-8 LLM round-trips per run that
19
+ agents would otherwise spend calling read_dataset_head / get_dataset_info /
20
+ get_correlation_matrix before they can act.
21
+
22
+ generate_plotly_charts() parses the relation-agent output and produces
23
+ interactive Plotly figures directly in Python — no LLM, no subprocess, no PNG
24
+ file I/O. This replaces static matplotlib PNGs with zoomable, hoverable charts.
25
+ """
26
+
27
+ import os
28
+ import re
29
+ import sys
30
+ import textwrap
31
+ import tempfile
32
+ import subprocess
33
+
34
+ import pandas as pd
35
+ from typing import Optional
36
+ from crewai.tools import tool
37
+
38
+
39
+
40
+ # ---------------------------------------------------------------------------
41
+ # Robust CSV Reader Helper
42
+ # ---------------------------------------------------------------------------
43
+
44
def read_csv_robust(file_path: str, **kwargs) -> pd.DataFrame:
    """Read a CSV file, tolerating encoding problems and malformed rows.

    Strategy:
      1. Try a strict parse with each candidate encoding.
      2. If every strict parse fails, print a warning to stdout (so it shows
         up in the user-facing logs) and retry while skipping bad lines.
      3. If that fails too, run one plain ``pd.read_csv`` so the real error
         propagates to the caller.

    Args:
        file_path: Path of the CSV file to read.
        **kwargs: Extra keyword arguments forwarded to ``pd.read_csv``.
            An explicit ``encoding`` is honoured instead of the built-in
            candidate list; an explicit ``on_bad_lines`` disables step 2.

    Returns:
        The parsed DataFrame.

    Raises:
        OSError: (including FileNotFoundError) immediately — retrying with a
            different encoding cannot fix a missing or unreadable file.
        pandas.errors.EmptyDataError: immediately, for the same reason.
        Exception: whatever ``pd.read_csv`` raises on the final attempt.
    """
    kwargs = dict(kwargs)
    caller_encoding = kwargs.pop("encoding", None)
    if caller_encoding is not None:
        encodings = [caller_encoding]
    else:
        # latin1 maps every byte to a character and therefore never fails to
        # decode; it must come last or the encodings after it are unreachable.
        encodings = ["utf-8", "utf-8-sig", "cp1252", "latin1"]

    # Pass 1: strict parsing with each candidate encoding.
    for encoding in encodings:
        try:
            return pd.read_csv(file_path, encoding=encoding, **kwargs)
        except (OSError, pd.errors.EmptyDataError):
            raise
        except Exception:
            continue

    # Pass 2: lenient parsing, unless the caller chose their own policy.
    if "on_bad_lines" not in kwargs:
        print(f"[Warning] Encountered formatting issues reading {file_path}. Attempting to parse by skipping malformed lines...", file=sys.stdout)
        sys.stdout.flush()
        for encoding in encodings:
            try:
                return pd.read_csv(
                    file_path, encoding=encoding, on_bad_lines="skip", **kwargs
                )
            except Exception:
                continue

    # Final attempt: let the underlying error bubble up to the caller.
    if caller_encoding is not None:
        kwargs["encoding"] = caller_encoding
    return pd.read_csv(file_path, **kwargs)
72
+
73
+
74
+ # ---------------------------------------------------------------------------
75
+ # Internal helpers
76
+ # ---------------------------------------------------------------------------
77
+
78
def _strip_markdown_fences(code: str) -> str:
    """Extract runnable code from LLM output that may be wrapped in markdown.

    If the text contains a complete fenced block (three backticks, optionally
    tagged ``python`` or ``py``), the contents of the first such block are
    returned. Otherwise a dangling opening and/or closing fence is removed.
    The result is dedented and stripped.

    Only the newline that ends the fence line is consumed, never the
    indentation of the first code line: eating that indentation before
    ``textwrap.dedent`` runs would leave the first line at column 0 while the
    remaining lines stay indented, producing an IndentationError.

    Args:
        code: Raw text returned by the LLM.

    Returns:
        The code with fences removed and common indentation stripped.
    """
    # Optional language tag; \b keeps "py" from matching a prefix of real code.
    fence_lang = r"(?:(?:python|py)\b)?"

    match = re.search(
        r"```[ \t]*" + fence_lang + r"[ \t]*\r?\n?(.*?)```",
        code,
        re.DOTALL | re.IGNORECASE,
    )
    if match:
        code = match.group(1)
    else:
        # Unbalanced fences: drop a leading and/or trailing fence if present.
        code = re.sub(
            r"^\s*```[ \t]*" + fence_lang + r"[ \t]*\r?\n?",
            "",
            code,
            flags=re.IGNORECASE,
        )
        code = re.sub(r"\n?```\s*$", "", code)

    return textwrap.dedent(code).strip()
91
+
92
+
93
def _df_to_markdown(df: "pd.DataFrame", index: bool = True) -> str:
    """Convert a pandas DataFrame to a compact GitHub-style markdown table.

    Pure Python rendering — avoids the optional ``tabulate`` dependency.
    Every cell is left-aligned and padded to its column width. Missing values
    render as empty cells. Pipe characters are escaped and line breaks are
    replaced by spaces so a cell can never break the table layout.

    Args:
        df: DataFrame to render.
        index: If True, include the DataFrame index as the first column(s).

    Returns:
        A multi-line markdown string, or ``"*(empty)*"`` when *df* is None or
        has no rows/columns.
    """
    if df is None or df.empty:
        return "*(empty)*"

    df_display = df.reset_index() if index else df

    def _escape(text: str) -> str:
        # Keep each cell on one line and stop "|" from opening a new column.
        return (
            text.replace("|", "\\|")
            .replace("\r\n", " ")
            .replace("\n", " ")
            .replace("\r", " ")
        )

    def _cell_text(cell) -> str:
        # pd.isna() returns an array for list-like cells, so only ask it
        # about scalars.
        if cell is None or (pd.api.types.is_scalar(cell) and pd.isna(cell)):
            return ""
        return _escape(str(cell))

    headers = [_escape(str(h)) for h in df_display.columns]

    # itertuples keeps each column's own dtype; DataFrame.values would upcast
    # integer columns to float whenever a float column is present.
    rows = [
        [_cell_text(cell) for cell in record]
        for record in df_display.itertuples(index=False, name=None)
    ]

    # Column widths — at least as wide as the header.
    col_widths = [len(h) for h in headers]
    for row in rows:
        for i, cell in enumerate(row):
            col_widths[i] = max(col_widths[i], len(cell))

    def _row_str(cells: list, widths: list) -> str:
        padded = [cell.ljust(w) for cell, w in zip(cells, widths)]
        return "| " + " | ".join(padded) + " |"

    separator = "|" + "|".join("-" * (w + 2) for w in col_widths) + "|"

    lines = [_row_str(headers, col_widths), separator]
    lines.extend(_row_str(row, col_widths) for row in rows)
    return "\n".join(lines)
144
+
145
+
146
def _heal_script_code(script: str, error_output: str) -> str:
    """Ask the configured LLM to repair a script that failed at runtime.

    Args:
        script: Source of the Python script that failed.
        error_output: Combined output / traceback captured from the failed run.

    Returns:
        The LLM's corrected script with markdown fences stripped. If anything
        goes wrong (import failure, LLM error, unexpected response) the error
        is printed and the original *script* is returned unchanged.
    """
    try:
        from crewai import LLM
        from config.llm_config import get_llm_params

        # Build the model client from the project's LLM configuration.
        healer = LLM(**get_llm_params())

        repair_prompt = f"""You are a senior python debugger.
The following python script failed during execution with an error/traceback.

--- FAIL SCRIPT ---
{script}

--- ERROR OUTPUT ---
{error_output}

Task:
Analyze the error carefully. Identify why it failed (e.g., missing imports, undefined variables, incorrect pandas api calls, type mismatches).
Rewrite the script to fix this error, ensuring you preserve the original logic and functional goal.
Do NOT omit the imports, variables, or functions that are set up.
For visualization scripts, make sure 'save_chart(filename)' is called correctly.
For data cleaning scripts, make sure 'df.to_csv(FILE_PATH, index=False)' is kept at the end.

Format:
Return ONLY the corrected, ready-to-run python script inside a markdown python block:
```python
# corrected code here
```
Do not include any other explanations, intros, or markdown outside the code block.
"""
        messages = [{"role": "user", "content": repair_prompt}]
        return _strip_markdown_fences(healer.call(messages))
    except Exception as e:
        # Healing is best-effort: fall back to the untouched script.
        print(f"Self-healing error: {e}")
        return script
186
+
187
+
188
# Only plain identifiers may be handed to pip; this rejects option-like
# strings ("--index-url=...") and anything else unusual in the error text.
_SAFE_MODULE_NAME_RE = re.compile(r"^[A-Za-z_][A-Za-z0-9_]*$")


def _run_in_subprocess(script: str, timeout: int = 120, is_healed_attempt: bool = False) -> tuple[bool, str]:
    """Write *script* to a temp file and run it in a child Python process.

    On failure of a first attempt, two recovery steps are tried:
      1. If the output reports ``ModuleNotFoundError``, pip-install the
         missing top-level package and re-run the script.
      2. Otherwise (or if step 1 did not help) ask the LLM to repair the
         script via ``_heal_script_code`` and run the repaired version.
    Recovery runs are made with ``is_healed_attempt=True`` so they never
    recurse further.

    Args:
        script: Python source to execute.
        timeout: Seconds before the child process is abandoned.
        is_healed_attempt: True when this call is itself a recovery run; all
            recovery logic is skipped.

    Returns:
        ``(success, output)`` where *output* is the child's combined
        stdout + stderr (or ``"(no output)"``), or a description of the
        timeout / launch failure.
    """
    with tempfile.NamedTemporaryFile(
        mode="w", suffix=".py", delete=False, encoding="utf-8"
    ) as tmp:
        tmp.write(script)
        tmp_path = tmp.name

    try:
        proc = subprocess.run(
            [sys.executable, tmp_path],
            capture_output=True,
            text=True,
            timeout=timeout,
        )
        output = (proc.stdout + proc.stderr).strip()
        success = proc.returncode == 0

        # If execution failed and we haven't already tried to self-heal:
        if not success and not is_healed_attempt:
            # 1. Package dependency healing (ModuleNotFoundError)
            match_module = re.search(r"ModuleNotFoundError:\s*No module named\s*['\"]([^'\"]+)['\"]", output)
            if match_module:
                # "pkg.sub" is a submodule: the installable unit is "pkg".
                module_name = match_module.group(1).split(".", 1)[0]
                # SECURITY NOTE: the name comes from the output of
                # LLM-generated code and the import name is not guaranteed to
                # be the PyPI distribution name. Validation limits the damage
                # but auto-installing remains a supply-chain risk.
                if not _SAFE_MODULE_NAME_RE.match(module_name):
                    print(f"[Auto-Healing System] Refusing to pip install unexpected module name {module_name!r}.")
                    sys.stdout.flush()
                else:
                    print(f"\n[Auto-Healing System] Missing module '{module_name}'. Attempting pip install...\n")
                    sys.stdout.flush()
                    try:
                        install = subprocess.run(
                            [sys.executable, "-m", "pip", "install", module_name],
                            capture_output=True,
                        )
                        if install.returncode == 0:
                            # Retry execution after package install
                            success_pkg, output_pkg = _run_in_subprocess(script, timeout=timeout, is_healed_attempt=True)
                            if success_pkg:
                                print(f"[Auto-Healing System] Installed '{module_name}' and executed successfully!")
                                sys.stdout.flush()
                                return True, f"[Auto-Healing system installed missing package '{module_name}']\n\nOutput:\n{output_pkg}"
                        else:
                            # No point re-running the script: the import
                            # would fail in exactly the same way.
                            print(f"[Auto-Healing System] Failed to install package: pip exited with code {install.returncode}")
                            sys.stdout.flush()
                    except Exception as pkg_err:
                        print(f"[Auto-Healing System] Failed to install package: {pkg_err}")
                        sys.stdout.flush()

            # 2. Logic/Code healing (LLM repair)
            print(f"\n[Auto-Healing System] Executing python script failed with error. Attempting self-healing...\nError details:\n{output}\n")
            sys.stdout.flush()
            healed_script = _heal_script_code(script, output)
            if healed_script and healed_script != script:
                success_h, output_h = _run_in_subprocess(healed_script, timeout=timeout, is_healed_attempt=True)
                if success_h:
                    print("[Auto-Healing System] Code repaired and executed successfully!")
                    sys.stdout.flush()
                    return True, f"[Auto-Healing system resolved a code error!]\nOriginal Error:\n{output}\n\nSuccessful Execution Output:\n{output_h}"
                else:
                    print("[Auto-Healing System] Attempted repair but healed script still failed.")
                    sys.stdout.flush()

        return success, output or "(no output)"
    except subprocess.TimeoutExpired:
        return False, f"Execution timed out after {timeout}s."
    except Exception as e:
        return False, f"Failed to launch subprocess: {e}"
    finally:
        # The temp file was created with delete=False, so remove it manually.
        try:
            os.unlink(tmp_path)
        except OSError:
            pass
253
+
254
+
255
+ # ---------------------------------------------------------------------------
256
+ # Pure Python helpers — NOT CrewAI tools, called directly from run_crew()
257
+ # ---------------------------------------------------------------------------
258
+
259
def build_dataset_profile(csv_path: str, max_rows: int = 5000) -> str:
    """Build a compact, token-efficient dataset profile string.

    Injecting this into task descriptions eliminates the need for agents to
    call read_dataset_head / get_dataset_info / get_correlation_matrix —
    saving 6-8 LLM round-trips per pipeline run.

    The profile contains the dataset shape, a per-column summary (dtype,
    missing percentage, min/mean/max or top values), the strongest numeric
    correlations, and the first five rows as a markdown table.

    Args:
        csv_path: Path to the CSV file.
        max_rows: Row cap for profiling large files (default 5000).

    Returns:
        A markdown-formatted string safe for embedding in task descriptions,
        or ``"[Profile unavailable: ...]"`` if the file cannot be read.
    """
    try:
        # Read one extra row so a file with exactly max_rows rows is not
        # mistaken for a truncated sample.
        df = read_csv_robust(csv_path, nrows=max_rows + 1)
    except Exception as exc:
        return f"[Profile unavailable: {exc}]"

    truncated = len(df) > max_rows
    if truncated:
        df = df.iloc[:max_rows]

    lines: list[str] = []
    sampled = len(df)

    # Shape
    note = " (sample — file is larger)" if truncated else ""
    lines.append(
        f"**Dataset shape**: {sampled} rows × {len(df.columns)} columns{note}"
    )
    lines.append("")

    # Column summary
    lines.append("**Columns** (name | dtype | missing% | stats/top values):")
    for col in df.columns:
        series = df[col]
        dtype = series.dtype
        miss_pct = round(series.isnull().sum() / max(len(df), 1) * 100, 1)
        # Booleans count as "numeric" in pandas, but min/mean/max is not a
        # useful summary and their scalars may not accept the ".4g" format.
        is_plain_numeric = (
            pd.api.types.is_numeric_dtype(dtype)
            and not pd.api.types.is_bool_dtype(dtype)
        )
        desc = None
        if is_plain_numeric:
            try:
                desc = (
                    f"min={series.min():.4g}, "
                    f"mean={series.mean():.4g}, "
                    f"max={series.max():.4g}"
                )
            except (TypeError, ValueError):
                # e.g. an all-<NA> nullable column, or a complex dtype.
                desc = "—"
        if desc is None:
            tops = series.dropna().value_counts().head(3).index.tolist()
            desc = ", ".join(str(v) for v in tops) or "—"
        lines.append(f"  - {col}: {dtype} | missing={miss_pct}% | {desc}")
    lines.append("")

    # Top correlations (numeric only)
    numeric_df = df.select_dtypes(include=["number"])
    if len(numeric_df.columns) >= 2:
        try:
            corr = numeric_df.corr().unstack().reset_index()
            corr.columns = ["A", "B", "r"]
            # Keep each unordered pair once and drop the diagonal.
            corr = corr[corr["A"] < corr["B"]].copy()
            # Constant or empty columns yield NaN correlations; skip them.
            corr = corr.dropna(subset=["r"])
            corr["abs_r"] = corr["r"].abs()
            top5 = corr.nlargest(5, "abs_r")[["A", "B", "r"]]
            if not top5.empty:
                lines.append("**Top correlations**:")
                for _, row in top5.iterrows():
                    lines.append(f"  - {row['A']} ↔ {row['B']}: r={row['r']:.3f}")
                lines.append("")
        except Exception:
            # Deliberately best-effort: the profile is still useful without
            # the correlation section (e.g. non-comparable column labels).
            pass

    # Sample rows
    lines.append("**Sample rows (first 5)**:")
    lines.append(_df_to_markdown(df.head(5), index=False))

    return "\n".join(lines)
326
+
327
+
328
def _parse_relation_line(line: str):
    """Parse one ``X: a | Y: b | Type: t`` line into ``(x, y, plot_type)``.

    Returns None when the line is not a relation line or is malformed. The
    plot type is lower-cased and defaults to ``"scatter"`` when absent.
    """
    line = line.strip()
    if not (line and "|" in line and "X:" in line):
        return None
    try:
        parts = [p.strip() for p in line.lstrip("- ").split("|")]
        x_col = parts[0].split(":", 1)[1].strip()
        y_col = parts[1].split(":", 1)[1].strip()
        ptype = parts[2].split(":", 1)[1].strip().lower() if len(parts) > 2 else "scatter"
    except (IndexError, ValueError):
        return None
    return x_col, y_col, ptype


def _make_relation_figure(px, df, sample, x_col, y_col, ptype, title, color):
    """Build the Plotly Express figure for a single parsed relation."""
    scatter_markers = dict(size=6, line=dict(width=0.5, color="rgba(255,255,255,0.3)"))

    # Specific chart kinds are tested before the generic word "plot";
    # otherwise "bar plot", "box plot" and "line plot" would all be drawn
    # as scatter charts.
    if "scatter" in ptype:
        fig = px.scatter(
            sample, x=x_col, y=y_col, title=title,
            color_discrete_sequence=[color], opacity=0.75,
        )
        fig.update_traces(marker=scatter_markers)
    elif "bar" in ptype:
        # Mean of y per distinct x value (no binning is applied).
        agg = sample.groupby(x_col)[y_col].mean().reset_index()
        fig = px.bar(
            agg.head(25), x=x_col, y=y_col, title=title,
            color_discrete_sequence=[color],
        )
    elif "line" in ptype:
        fig = px.line(
            sample.sort_values(x_col), x=x_col, y=y_col, title=title,
            color_discrete_sequence=[color],
        )
    elif "box" in ptype:
        # A numeric x would create one box per distinct value, so it is
        # dropped and a single box of y is drawn instead.
        fig = px.box(
            sample, x=x_col if not pd.api.types.is_numeric_dtype(df[x_col]) else None,
            y=y_col, title=title,
            color_discrete_sequence=[color],
        )
    elif "hist" in ptype:
        fig = px.histogram(
            sample, x=x_col, nbins=30,
            title=f"Distribution of {x_col}",
            color_discrete_sequence=[color],
        )
    elif "plot" in ptype:
        # Generic "plot" with no recognised kind: styled scatter.
        fig = px.scatter(
            sample, x=x_col, y=y_col, title=title,
            color_discrete_sequence=[color], opacity=0.75,
        )
        fig.update_traces(marker=scatter_markers)
    else:
        # Default: scatter
        fig = px.scatter(
            sample, x=x_col, y=y_col, title=title,
            color_discrete_sequence=[color], opacity=0.75,
        )
    return fig


def generate_plotly_charts(csv_path: str, relations_text: str, max_rows: int = 5000) -> list:
    """Parse agent relation output and generate interactive Plotly figures.

    Each relation line of the form ``X: <col> | Y: <col> | Type: <kind>`` is
    turned into one figure. If no line yields a figure, a fallback set is
    generated from the dataset itself: histograms of the first numeric
    columns, scatter plots of adjacent numeric pairs, and one
    categorical × numeric bar chart.

    No LLM calls and no subprocess are involved; the only I/O is reading the
    CSV.

    Args:
        csv_path: Path to the cleaned CSV file.
        relations_text: Raw text output from the relation agent.
        max_rows: Row cap for chart data (default 5000 for rendering speed).

    Returns:
        List of dicts with keys ``"title"``, ``"fig"`` (a Plotly figure),
        ``"x"``, ``"y"`` and ``"type"``. Empty if plotly is unavailable or
        the CSV cannot be read.
    """
    try:
        import plotly.express as px
    except ImportError:
        return []

    try:
        df = read_csv_robust(csv_path, nrows=max_rows)
    except Exception:
        return []

    # Dark-theme layout matching the app's "Obsidian & Electric Violet" aesthetic
    _dark = dict(
        paper_bgcolor="rgba(9,9,11,0.0)",
        plot_bgcolor="rgba(15,23,42,0.4)",
        font_color="#e2e8f0",
        title_font_color="#a78bfa",
        title_font_size=15,
        xaxis=dict(
            gridcolor="rgba(255,255,255,0.07)",
            zerolinecolor="rgba(255,255,255,0.1)",
            title_font_color="#94a3b8",
        ),
        yaxis=dict(
            gridcolor="rgba(255,255,255,0.07)",
            zerolinecolor="rgba(255,255,255,0.1)",
            title_font_color="#94a3b8",
        ),
        margin=dict(l=50, r=20, t=55, b=50),
        hoverlabel=dict(bgcolor="rgba(15,23,42,0.95)", font_color="#e2e8f0"),
    )
    _colors = ["#a78bfa", "#6366f1", "#22d3ee", "#e879f9", "#34d399"]

    figures = []

    # Tolerate a missing relation-agent output: fall through to the fallback.
    for line in (relations_text or "").split("\n"):
        parsed = _parse_relation_line(line)
        if parsed is None:
            continue
        x_col, y_col, ptype = parsed

        if x_col not in df.columns or y_col not in df.columns:
            continue

        # Guard: skip trivial same-column pairs
        if x_col == y_col:
            continue

        title = f"{x_col} vs {y_col}"
        color = _colors[len(figures) % len(_colors)]

        try:
            sample = df[[x_col, y_col]].dropna().head(2000)
            if sample.empty:
                continue

            fig = _make_relation_figure(px, df, sample, x_col, y_col, ptype, title, color)
            fig.update_layout(**_dark)
            figures.append({"title": title, "fig": fig, "x": x_col, "y": y_col, "type": ptype})

        except Exception as _chart_err:  # log but continue
            print(f"[Plotly] Skipping {title!r}: {_chart_err}")
            continue

    # Fallback: if no relation lines were parseable, auto-generate charts
    # from the first few numeric column pairs in the dataset.
    if not figures:
        numeric_cols = df.select_dtypes(include=["number"]).columns.tolist()
        cat_cols = df.select_dtypes(include=["object", "category"]).columns.tolist()

        # Number of histograms added; offsets the colour cycle for scatters.
        pair_count = 0
        for i, col in enumerate(numeric_cols[:3]):
            color = _colors[i % len(_colors)]
            # Histogram for each numeric col
            try:
                fig = px.histogram(df[[col]].dropna().head(3000), x=col,
                                   nbins=30, title=f"Distribution of {col}",
                                   color_discrete_sequence=[color])
                fig.update_layout(**_dark)
                figures.append({"title": f"Distribution of {col}", "fig": fig, "x": col, "y": col, "type": "histogram"})
                pair_count += 1
            except Exception:
                continue

        # Scatter for first 2 numeric pairs
        for i in range(min(len(numeric_cols) - 1, 2)):
            xc, yc = numeric_cols[i], numeric_cols[i + 1]
            color = _colors[(pair_count + i) % len(_colors)]
            try:
                sample = df[[xc, yc]].dropna().head(2000)
                fig = px.scatter(sample, x=xc, y=yc, title=f"{xc} vs {yc}",
                                 color_discrete_sequence=[color], opacity=0.75)
                fig.update_layout(**_dark)
                figures.append({"title": f"{xc} vs {yc}", "fig": fig, "x": xc, "y": yc, "type": "scatter"})
            except Exception:
                continue

        # Bar for first categorical × numeric
        if cat_cols and numeric_cols:
            cc, nc = cat_cols[0], numeric_cols[0]
            try:
                agg = df[[cc, nc]].dropna().groupby(cc)[nc].mean().reset_index()
                fig = px.bar(agg.head(20), x=cc, y=nc, title=f"{nc} by {cc}",
                             color_discrete_sequence=[_colors[0]])
                fig.update_layout(**_dark)
                figures.append({"title": f"{nc} by {cc}", "fig": fig, "x": cc, "y": nc, "type": "bar"})
            except Exception:
                # Best-effort: the other fallback charts are still returned.
                pass

    return figures
498
+
499
+
500
+ # ---------------------------------------------------------------------------
501
+ # CrewAI tools — used by agents at runtime as fallback / code generation aid
502
+ # ---------------------------------------------------------------------------
503
+
504
def _resolve_session_csv(file_path: Optional[str]) -> str:
    """Return the CSV path a tool should operate on.

    *file_path* is used when it is a non-empty string other than ``"none"``
    that does not contain ``"properties"`` (presumably a guard against the
    agent passing tool-schema text instead of a path — verify against the
    agent traces). Otherwise the active session's CSV is used, falling back
    to the ``CURRENT_SESSION_CSV`` environment variable, then to ``""``.
    """
    from config.context import current_session_csv

    fp = file_path
    if not fp or not isinstance(fp, str) or fp.lower() == "none" or "properties" in str(fp):
        fp = current_session_csv.get() or os.getenv("CURRENT_SESSION_CSV", "")
    return fp


class DatasetTools:
    """CrewAI tools that let agents inspect, clean and plot the session CSV.

    The functions take no ``self``; each is wrapped by the ``@tool``
    decorator and referenced as a class attribute.
    """

    @tool("Read Dataset Head")
    def read_dataset_head(file_path: Optional[str] = None) -> str:
        """Reads the first 10 rows of the dataset to understand its structure.
        Uses nrows=10 so the entire file is never loaded into memory.
        If file_path is not specified or is invalid, the active session's CSV will be used.
        """
        try:
            df = read_csv_robust(_resolve_session_csv(file_path), nrows=10)
            return _df_to_markdown(df, index=False)
        except Exception as e:
            return f"Error reading file: {e}"

    @tool("Get Dataset Info")
    def get_dataset_info(file_path: Optional[str] = None) -> str:
        """Returns basic information about the dataset: shape, columns, data types,
        and missing-value counts.
        If file_path is not specified or is invalid, the active session's CSV will be used.
        """
        try:
            df = read_csv_robust(_resolve_session_csv(file_path))
            lines = [f"Shape: {df.shape}", "\nColumns and Types:"]
            for col, dtype in df.dtypes.items():
                missing = df[col].isnull().sum()
                lines.append(f"  - {col}: {dtype} (Missing: {missing})")
            return "\n".join(lines)
        except Exception as e:
            return f"Error analyzing file: {e}"

    @tool("Get Correlation Matrix")
    def get_correlation_matrix(file_path: Optional[str] = None) -> str:
        """Returns the top-20 strongest column-pair correlations (by absolute value).
        If file_path is not specified or is invalid, the active session's CSV will be used.
        """
        try:
            df = read_csv_robust(_resolve_session_csv(file_path))
            numeric_df = df.select_dtypes(include=["number"])
            if numeric_df.empty:
                return "No numeric columns found."

            corr = numeric_df.corr()
            unstacked = (
                corr.unstack()
                .reset_index()
                .rename(columns={"level_0": "Col_A", "level_1": "Col_B", 0: "Correlation"})
            )
            # Keep each unordered pair once and drop the diagonal. The copy
            # makes the column assignment below act on an independent frame
            # rather than on a slice of `unstacked`.
            unstacked = unstacked[unstacked["Col_A"] < unstacked["Col_B"]].copy()
            unstacked["AbsCorr"] = unstacked["Correlation"].abs()
            top = (
                unstacked.sort_values("AbsCorr", ascending=False)
                .head(20)
                .drop(columns=["AbsCorr"])
                .reset_index(drop=True)
            )
            return _df_to_markdown(top, index=False)
        except Exception as e:
            return f"Error calculating correlation: {e}"

    @tool("Clean Dataset with Python Code")
    def clean_dataset_with_python(file_path: Optional[str] = None, python_code: Optional[str] = None) -> str:
        """Cleans the dataset by executing *python_code* in an isolated subprocess.
        The file_path parameter is optional and defaults to the active session's dataset CSV.

        Your code must:
          1. Read the CSV: df = pd.read_csv(FILE_PATH)   # FILE_PATH is pre-set
          2. Perform cleaning on df
          3. Save: df.to_csv(FILE_PATH, index=False)

        Do NOT include markdown code fences. Do NOT use any other file paths.
        """
        # Swap if python_code is not specified but file_path contains code
        if not python_code:
            if file_path and ("import " in file_path or "df[" in file_path or "\n" in file_path):
                python_code = file_path
                file_path = None
            else:
                return "Error: python_code is required."

        fp = _resolve_session_csv(file_path)
        if not fp:
            # Without a target file the script can only fail; return early
            # instead of spending a subprocess run and an LLM repair attempt.
            return "Error: no dataset CSV path is available for cleaning."

        clean_code = _strip_markdown_fences(python_code)

        # Preamble: load df and pin every pd.read_csv call to FILE_PATH
        # (any arguments the generated code passes are ignored).
        # Epilogue: always persist df back to FILE_PATH.
        script = textwrap.dedent(f"""\
            import os
            import pandas as pd

            FILE_PATH = {repr(str(fp))}
            df = pd.read_csv(FILE_PATH)

            # Safeguard: redirect all read_csv calls to FILE_PATH
            _orig_read_csv = pd.read_csv
            def custom_read_csv(*args, **kwargs):
                return _orig_read_csv(FILE_PATH)
            pd.read_csv = custom_read_csv
            """) + "\n" + clean_code + "\n" + textwrap.dedent("""\
            df.to_csv(FILE_PATH, index=False)
            print("Dataset cleaned and saved successfully.")
            """)

        success, output = _run_in_subprocess(script)
        if success:
            return f"Dataset cleaned successfully.\n{output}"
        return f"Error executing cleaning code:\n{output}"

    @tool("Execute Visualization Code")
    def execute_visualization_code(python_code: Optional[str] = None, **kwargs) -> str:
        """Executes Python plotting code to generate and save PNG visual charts.

        The code runs in a pre-configured Python environment where:
          - 'df' is a pre-loaded pandas DataFrame containing the cleaned dataset.
          - 'OUTPUT_DIR' is a pre-defined string representing the output folder path.
          - 'save_chart(filename)' is a helper function to save the current plot into OUTPUT_DIR.
          - Libraries 'pandas', 'matplotlib.pyplot as plt', and 'seaborn as sns' are already imported.

        Example usage:
          plt.figure(figsize=(10, 6))
          sns.scatterplot(data=df, x='column_x', y='column_y')
          plt.title('Relationship Title')
          save_chart('chart_name.png')
          plt.close()
        """
        if not python_code:
            # The agent sometimes sends the code under a different argument
            # name; take the first string value that looks like code.
            for k, v in kwargs.items():
                if v and isinstance(v, str) and ("plt." in v or "sns." in v or "import " in v or "\n" in v):
                    python_code = v
                    break
            if not python_code:
                return "Error: python_code is required."

        clean_code = _strip_markdown_fences(python_code)
        from config.context import current_session_csv, current_session_output_dir
        csv_path = current_session_csv.get() or os.getenv("CURRENT_SESSION_CSV", "")
        output_dir = current_session_output_dir.get() or os.getenv("CURRENT_SESSION_OUTPUT_DIR", "")

        # Fallbacks if env vars are missing
        if not csv_path:
            csv_path = "data/sessions/default/cleaned.csv"
        if not output_dir:
            output_dir = "outputs/default"

        script = textwrap.dedent(f"""\
            import os
            import pandas as pd
            import matplotlib
            matplotlib.use('Agg')
            import matplotlib.pyplot as plt
            import seaborn as sns
            import textwrap

            CSV_PATH = {repr(csv_path)}
            OUTPUT_DIR = {repr(output_dir)}

            os.makedirs(OUTPUT_DIR, exist_ok=True)
            df = pd.read_csv(CSV_PATH)

            # Safeguard: redirect all read_csv calls to the cleaned CSV path
            _orig_read_csv = pd.read_csv
            def custom_read_csv(*args, **kwargs):
                return _orig_read_csv(CSV_PATH)
            pd.read_csv = custom_read_csv

            def save_chart(filename):
                if not filename.endswith('.png'):
                    filename += '.png'
                path = os.path.join(OUTPUT_DIR, filename)
                plt.savefig(path, bbox_inches='tight', dpi=180)
                print(f"Saved chart: {{filename}}")
            """) + "\n" + clean_code

        success, output = _run_in_subprocess(script)
        if success:
            return f"Visualization executed successfully. Output:\n{output}"
        return f"Error executing visualization code:\n{output}"
693
+
694
+
695
# Strings treated as "missing" when cleaning a numeric-looking column
# (exact whole-string match, case-insensitive).
_NUMERIC_PLACEHOLDER_RE = re.compile(
    r'^\s*(?:nan|null|n/a|na|\?|none|-|missing|empty)\s*$', re.IGNORECASE
)
_NUMERIC_LITERAL_RE = re.compile(r'^-?\d+(?:\.\d+)?$')
_ISO_DATE_RE = re.compile(r'^\d{4}[-/]\d{1,2}[-/]\d{1,2}')
_DMY_DATE_RE = re.compile(r'^\d{1,2}[-/]\d{1,2}[-/]\d{4}')
_MONTH_NAME_RE = re.compile(
    r'(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)', re.IGNORECASE
)
_BOOL_LABELS = {
    'yes': True, 'no': False, 'y': True, 'n': False,
    'true': True, 'false': False, 't': True, 'f': False,
    '1': True, '0': False,
}


def _sample_looks_date_like(sample: pd.Series) -> bool:
    """Return True if more than half of *sample* matches a date pattern."""
    date_like_count = 0
    for val in sample:
        val_clean = val.strip()
        # Skip 4-digit years (e.g. 1990) so they are not turned into dates.
        if val_clean.isdigit() and len(val_clean) == 4:
            continue
        # YYYY-MM-DD, DD/MM/YYYY, or wordy dates like "01 Jul 2026"
        if (_ISO_DATE_RE.match(val_clean)
                or _DMY_DATE_RE.match(val_clean)
                or _MONTH_NAME_RE.search(val_clean)):
            date_like_count += 1
    return date_like_count > len(sample) * 0.5


def _sample_looks_numeric_like(sample: pd.Series) -> bool:
    """Return True if more than 80% of *sample* is a number or a placeholder."""
    numeric_like_count = 0
    for val in sample:
        val_clean = val.strip().lower()
        if not val_clean or val_clean in {'nan', 'null', 'n/a', 'na', '?', 'none', '-', '.', 'missing', 'empty'}:
            numeric_like_count += 1
            continue
        # Drop currency symbols, percent signs, commas and whitespace.
        cleaned_val = re.sub(r'[\$,%\s]', '', val_clean)
        if _NUMERIC_LITERAL_RE.match(cleaned_val):
            numeric_like_count += 1
    return numeric_like_count > len(sample) * 0.8


def _convert_datetime_column(series: pd.Series):
    """Return *series* parsed as datetimes, or None if too few values parse."""
    try:
        converted = pd.to_datetime(series, errors='coerce', format='mixed')
    except (ValueError, TypeError):
        # Older pandas does not understand format='mixed'.
        converted = pd.to_datetime(series, errors='coerce')
    # Require that more than 70% of the non-null values actually parsed.
    if not converted.isnull().all() and converted.notnull().sum() > len(series.dropna()) * 0.7:
        return converted
    return None


def _convert_numeric_column(series: pd.Series):
    """Return ``(converted, label)`` for a numeric-looking column, or None.

    The caller's decision is based on a sample only, so the 80% threshold is
    re-checked here against the whole column; otherwise non-numeric values
    outside the sample would silently become NaN.
    """
    present = series.notna()
    cleaned = series.astype(str).str.strip()
    # Strip any surrounding quotes first.
    cleaned = cleaned.str.replace(r'^["\']|["\']$', '', regex=True)
    # Blank out placeholders so pd.to_numeric turns them into NaN.
    cleaned = cleaned.str.replace(_NUMERIC_PLACEHOLDER_RE, '', regex=True)
    cleaned = cleaned.str.replace(r'[\$,%\s]', '', regex=True)
    converted = pd.to_numeric(cleaned, errors='coerce')
    if converted.isnull().all():
        return None

    # A value is accounted for if it parsed or was a recognised placeholder.
    accounted = (converted.notna() | (cleaned == '')) & present
    if accounted.sum() <= present.sum() * 0.8:
        return None

    non_null_converted = converted.dropna()
    if not non_null_converted.empty and (non_null_converted % 1 == 0).all():
        if converted.isnull().any():
            return converted.astype('Int64'), "Nullable Integer"
        return converted.astype(int), "Integer"
    return converted, "Float"


def _convert_boolean_column(series: pd.Series, sample: pd.Series):
    """Return *series* mapped to booleans, or None if it is not binary.

    The sample must contain at least two distinct binary labels, and every
    non-null value of the full column must be a known label; otherwise
    values outside the sample would silently become NaN.
    """
    unique_vals = set(sample.str.lower().str.strip())
    if not unique_vals.issubset(_BOOL_LABELS.keys()) or len(unique_vals) < 2:
        return None
    mapped = series.astype(str).str.lower().str.strip().map(_BOOL_LABELS)
    if mapped[series.notna()].isna().any():
        return None
    return mapped


def auto_coerce_types(df: pd.DataFrame) -> tuple[pd.DataFrame, list[str]]:
    """Detect text columns that hold dates, numbers or booleans and convert them.

    For every object/string column the first 200 non-null values are sampled
    and three heuristics are tried in order: datetime, numeric (currency,
    percentages, thousands separators, missing-value placeholders), boolean
    (yes/no, true/false, y/n, t/f, 1/0). The first conversion that succeeds
    wins; columns that match nothing are left untouched. The input DataFrame
    is never modified.

    Args:
        df: DataFrame to analyse.

    Returns:
        ``(converted_df, actions)`` where *actions* is a list of
        human-readable descriptions of each conversion performed.
    """
    actions = []
    df = df.copy()

    for col in df.columns:
        series = df[col]

        # Skip empty columns
        if series.isnull().all():
            continue

        # Only text columns need coercion: classic object dtype and the
        # dedicated pandas string dtype.
        dtype = series.dtype
        if not (dtype == 'object' or isinstance(dtype, pd.StringDtype)):
            continue

        sample_non_null = series.dropna().head(200).astype(str)
        if sample_non_null.empty:
            continue

        # 1. Dates
        if _sample_looks_date_like(sample_non_null):
            try:
                converted = _convert_datetime_column(series)
            except Exception:
                converted = None  # best-effort: fall through to next heuristic
            if converted is not None:
                df[col] = converted
                actions.append(f"Converted column '{col}' to Datetime (detected date-like patterns)")
                continue

        # 2. Numbers stored as text
        if _sample_looks_numeric_like(sample_non_null):
            try:
                result = _convert_numeric_column(series)
            except Exception:
                result = None  # best-effort: fall through to next heuristic
            if result is not None:
                converted, label = result
                df[col] = converted
                actions.append(f"Converted column '{col}' to {label} (cleaned currency/delimiters/nulls)")
                continue

        # 3. Binary labels
        try:
            converted = _convert_boolean_column(series, sample_non_null)
        except Exception:
            converted = None  # best-effort: leave the column untouched
        if converted is not None:
            df[col] = converted
            actions.append(f"Converted column '{col}' to Boolean (detected binary labels)")

    return df, actions