crewlyze 3.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.dockerignore +12 -0
- package/.gitattributes +2 -0
- package/CHANGELOG.md +86 -0
- package/Dockerfile +21 -0
- package/LICENSE +21 -0
- package/README.md +139 -0
- package/USAGE.md +106 -0
- package/agents/__init__.py +0 -0
- package/agents/cleaner.py +38 -0
- package/agents/insights.py +44 -0
- package/agents/relation.py +36 -0
- package/agents/visualizer.py +41 -0
- package/assets/badge_crewai.svg +4 -0
- package/assets/badge_matplotlib.svg +4 -0
- package/assets/badge_ollama.svg +4 -0
- package/assets/badge_pandas.svg +4 -0
- package/assets/badge_seaborn.svg +4 -0
- package/assets/branding_image.png +0 -0
- package/assets/complete_workflow.svg +216 -0
- package/assets/favicon.png +0 -0
- package/assets/logo.png +0 -0
- package/assets/stars.svg +12 -0
- package/bin/crewlyze.js +79 -0
- package/config/README.md +129 -0
- package/config/__init__.py +1 -0
- package/config/context.py +16 -0
- package/config/llm_config.py +300 -0
- package/config/metrics_tracker.py +70 -0
- package/crew.py +870 -0
- package/crewlyze-3.1.0.tgz +0 -0
- package/fix_syntax.py +54 -0
- package/main.py +1279 -0
- package/package.json +22 -0
- package/pyproject.toml +32 -0
- package/requirements.txt +33 -0
- package/tools/__init__.py +0 -0
- package/tools/dataset_tools.py +803 -0
- package/ui/__init__.py +3 -0
- package/ui/copilot.py +200 -0
- package/ui/export.py +800 -0
- package/update_appjs.py +54 -0
- package/update_llm.py +21 -0
- package/update_main.py +20 -0
- package/web/app.js +3142 -0
- package/web/index.html +1105 -0
- package/web/style.css +2561 -0
- package/workflows/__init__.py +0 -0
- package/workflows/pipeline.py +254 -0
|
@@ -0,0 +1,803 @@
|
|
|
1
|
+
# Crewlyze
|
|
2
|
+
# Copyright (c) 2025 Sowmiyan S
|
|
3
|
+
# Licensed under the MIT License
|
|
4
|
+
|
|
5
|
+
"""
|
|
6
|
+
Dataset tools for CrewAI agents.
|
|
7
|
+
|
|
8
|
+
Security note
|
|
9
|
+
-------------
|
|
10
|
+
All LLM-generated code is executed in an isolated child process (subprocess),
|
|
11
|
+
never via exec() in the parent process. This narrows the attack surface: the child
|
|
12
|
+
cannot read the parent's globals or in-memory state (it does inherit the parent's environment variables and filesystem permissions).
|
|
13
|
+
|
|
14
|
+
Performance note
|
|
15
|
+
----------------
|
|
16
|
+
build_dataset_profile() is a pure Python function (not a CrewAI tool) that
|
|
17
|
+
pre-computes a compact dataset summary and returns it as a string. Injecting
|
|
18
|
+
this string into task descriptions eliminates 6-8 LLM round-trips per run that
|
|
19
|
+
agents would otherwise spend calling read_dataset_head / get_dataset_info /
|
|
20
|
+
get_correlation_matrix before they can act.
|
|
21
|
+
|
|
22
|
+
generate_plotly_charts() parses the relation-agent output and produces
|
|
23
|
+
interactive Plotly figures directly in Python — no LLM, no subprocess, no PNG
|
|
24
|
+
file I/O. This replaces static matplotlib PNGs with zoomable, hoverable charts.
|
|
25
|
+
"""
|
|
26
|
+
|
|
27
|
+
import os
|
|
28
|
+
import re
|
|
29
|
+
import sys
|
|
30
|
+
import textwrap
|
|
31
|
+
import tempfile
|
|
32
|
+
import subprocess
|
|
33
|
+
|
|
34
|
+
import pandas as pd
|
|
35
|
+
from typing import Optional
|
|
36
|
+
from crewai.tools import tool
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
# ---------------------------------------------------------------------------
|
|
41
|
+
# Robust CSV Reader Helper
|
|
42
|
+
# ---------------------------------------------------------------------------
|
|
43
|
+
|
|
44
|
+
def read_csv_robust(file_path: str, **kwargs) -> pd.DataFrame:
    """Read a CSV file, tolerating unknown encodings and malformed rows.

    Strategy:
      1. Strict parse with each candidate encoding in turn.
      2. If every strict parse fails, print a warning to stdout (so it shows
         up in the user-facing logs) and retry with ``on_bad_lines='skip'``.
      3. If that fails too, run one plain ``pd.read_csv`` so the underlying
         error reaches the caller.

    Args:
        file_path: Path of the CSV file.
        **kwargs: Forwarded to ``pandas.read_csv``. An explicit ``encoding``
            is tried before the built-in candidates; an explicit
            ``on_bad_lines`` is honoured in the strict pass only.

    Returns:
        The parsed DataFrame.

    Raises:
        FileNotFoundError: If *file_path* does not exist.
        Exception: Whatever ``pandas.read_csv`` raises when no strategy works.
    """
    options = dict(kwargs)
    requested = options.pop("encoding", None)

    # Strict decoders go first. latin1 maps every byte to a character and so
    # never fails to decode; anything listed after it would be unreachable.
    candidates = []
    for name in (requested, "utf-8-sig", "utf-8", "cp1252", "latin1"):
        if name and name not in candidates:
            candidates.append(name)

    for encoding in candidates:
        try:
            return pd.read_csv(file_path, encoding=encoding, **options)
        except FileNotFoundError:
            # A missing file will not be fixed by another encoding.
            raise
        except Exception:
            continue

    # Strict parsing failed for every encoding: retry while skipping bad rows.
    print(f"[Warning] Encountered formatting issues reading {file_path}. Attempting to parse by skipping malformed lines...", file=sys.stdout)
    sys.stdout.flush()
    lenient = {**options, "on_bad_lines": "skip"}
    for encoding in candidates:
        try:
            return pd.read_csv(file_path, encoding=encoding, **lenient)
        except Exception:
            continue

    # Last resort: one untouched call so the real error bubbles up.
    return pd.read_csv(file_path, **kwargs)
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
# ---------------------------------------------------------------------------
|
|
75
|
+
# Internal helpers
|
|
76
|
+
# ---------------------------------------------------------------------------
|
|
77
|
+
|
|
78
|
+
def _strip_markdown_fences(code: str) -> str:
|
|
79
|
+
"""Remove leading/trailing markdown code fences from LLM output."""
|
|
80
|
+
# Find any python code blocks: ```python ... ```
|
|
81
|
+
match = re.search(r"```(?:python)?\s*(.*?)\s*```", code, re.DOTALL | re.IGNORECASE)
|
|
82
|
+
if match:
|
|
83
|
+
code = match.group(1)
|
|
84
|
+
else:
|
|
85
|
+
code = code.strip()
|
|
86
|
+
code = re.sub(r"^```(?:python)?\s*\n?", "", code)
|
|
87
|
+
code = re.sub(r"\n?```\s*$", "", code)
|
|
88
|
+
|
|
89
|
+
import textwrap
|
|
90
|
+
return textwrap.dedent(code).strip()
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def _df_to_markdown(df: "pd.DataFrame", index: bool = True) -> str:
|
|
94
|
+
"""Convert a pandas DataFrame to a compact GitHub-style markdown table.
|
|
95
|
+
|
|
96
|
+
Pure Python fallback — avoids the optional ``tabulate`` dependency.
|
|
97
|
+
Numeric cells are right-aligned; all others are left-aligned.
|
|
98
|
+
|
|
99
|
+
Args:
|
|
100
|
+
df : DataFrame to render.
|
|
101
|
+
index : If True, include the DataFrame index as the first column.
|
|
102
|
+
|
|
103
|
+
Returns:
|
|
104
|
+
A multi-line markdown string.
|
|
105
|
+
"""
|
|
106
|
+
import pandas as pd
|
|
107
|
+
|
|
108
|
+
if df is None or df.empty:
|
|
109
|
+
return "*(empty)*"
|
|
110
|
+
|
|
111
|
+
df_display = df.reset_index() if index else df.copy()
|
|
112
|
+
|
|
113
|
+
headers = [str(h) for h in list(df_display.columns)]
|
|
114
|
+
rows = []
|
|
115
|
+
for row in df_display.values.tolist():
|
|
116
|
+
formatted_row = []
|
|
117
|
+
for cell in row:
|
|
118
|
+
if cell is None or pd.isna(cell):
|
|
119
|
+
formatted_row.append("")
|
|
120
|
+
else:
|
|
121
|
+
formatted_row.append(str(cell))
|
|
122
|
+
rows.append(formatted_row)
|
|
123
|
+
|
|
124
|
+
# Column widths — at least as wide as the header
|
|
125
|
+
col_widths = [len(h) for h in headers]
|
|
126
|
+
for row in rows:
|
|
127
|
+
for i, cell in enumerate(row):
|
|
128
|
+
col_widths[i] = max(col_widths[i], len(cell))
|
|
129
|
+
|
|
130
|
+
def _row_str(cells: list, widths: list) -> str:
|
|
131
|
+
parts = []
|
|
132
|
+
for cell, w in zip(cells, widths):
|
|
133
|
+
parts.append(cell.ljust(w))
|
|
134
|
+
return "| " + " | ".join(parts) + " |"
|
|
135
|
+
|
|
136
|
+
sep_parts = ["-" * w for w in col_widths]
|
|
137
|
+
separator = "|" + "|".join(f"-{s}-" for s in sep_parts) + "|"
|
|
138
|
+
|
|
139
|
+
lines = [_row_str(headers, col_widths), separator]
|
|
140
|
+
for row in rows:
|
|
141
|
+
lines.append(_row_str(row, col_widths))
|
|
142
|
+
|
|
143
|
+
return "\n".join(lines)
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
def _heal_script_code(script: str, error_output: str) -> str:
|
|
147
|
+
"""Use the configured LLM to self-heal/correct a failing python script."""
|
|
148
|
+
try:
|
|
149
|
+
from crewai import LLM
|
|
150
|
+
from config.llm_config import get_llm_params
|
|
151
|
+
|
|
152
|
+
# Get LLM instance
|
|
153
|
+
llm = LLM(**get_llm_params())
|
|
154
|
+
|
|
155
|
+
prompt = f"""You are a senior python debugger.
|
|
156
|
+
The following python script failed during execution with an error/traceback.
|
|
157
|
+
|
|
158
|
+
--- FAIL SCRIPT ---
|
|
159
|
+
{script}
|
|
160
|
+
|
|
161
|
+
--- ERROR OUTPUT ---
|
|
162
|
+
{error_output}
|
|
163
|
+
|
|
164
|
+
Task:
|
|
165
|
+
Analyze the error carefully. Identify why it failed (e.g., missing imports, undefined variables, incorrect pandas api calls, type mismatches).
|
|
166
|
+
Rewrite the script to fix this error, ensuring you preserve the original logic and functional goal.
|
|
167
|
+
Do NOT omit the imports, variables, or functions that are set up.
|
|
168
|
+
For visualization scripts, make sure 'save_chart(filename)' is called correctly.
|
|
169
|
+
For data cleaning scripts, make sure 'df.to_csv(FILE_PATH, index=False)' is kept at the end.
|
|
170
|
+
|
|
171
|
+
Format:
|
|
172
|
+
Return ONLY the corrected, ready-to-run python script inside a markdown python block:
|
|
173
|
+
```python
|
|
174
|
+
# corrected code here
|
|
175
|
+
```
|
|
176
|
+
Do not include any other explanations, intros, or markdown outside the code block.
|
|
177
|
+
"""
|
|
178
|
+
# Call LLM
|
|
179
|
+
response = llm.call([{"role": "user", "content": prompt}])
|
|
180
|
+
corrected_code = _strip_markdown_fences(response)
|
|
181
|
+
return corrected_code
|
|
182
|
+
except Exception as e:
|
|
183
|
+
# If healing fails, return original script
|
|
184
|
+
print(f"Self-healing error: {e}")
|
|
185
|
+
return script
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
def _run_in_subprocess(script: str, timeout: int = 120, is_healed_attempt: bool = False) -> tuple[bool, str]:
|
|
189
|
+
"""
|
|
190
|
+
Write *script* to a temp file and execute it in an isolated subprocess.
|
|
191
|
+
Includes auto-dependency healing to download missing packages if needed.
|
|
192
|
+
"""
|
|
193
|
+
with tempfile.NamedTemporaryFile(
|
|
194
|
+
mode="w", suffix=".py", delete=False, encoding="utf-8"
|
|
195
|
+
) as tmp:
|
|
196
|
+
tmp.write(script)
|
|
197
|
+
tmp_path = tmp.name
|
|
198
|
+
|
|
199
|
+
try:
|
|
200
|
+
proc = subprocess.run(
|
|
201
|
+
[sys.executable, tmp_path],
|
|
202
|
+
capture_output=True,
|
|
203
|
+
text=True,
|
|
204
|
+
timeout=timeout,
|
|
205
|
+
)
|
|
206
|
+
output = (proc.stdout + proc.stderr).strip()
|
|
207
|
+
success = proc.returncode == 0
|
|
208
|
+
|
|
209
|
+
# If execution failed and we haven't already tried to self-heal:
|
|
210
|
+
if not success and not is_healed_attempt:
|
|
211
|
+
# 1. Package dependency healing (ModuleNotFoundError)
|
|
212
|
+
match_module = re.search(r"ModuleNotFoundError:\s*No module named\s*['\"]([^'\"]+)['\"]", output)
|
|
213
|
+
if match_module:
|
|
214
|
+
module_name = match_module.group(1)
|
|
215
|
+
print(f"\n[Auto-Healing System] Missing module '{module_name}'. Attempting pip install...\n")
|
|
216
|
+
sys.stdout.flush()
|
|
217
|
+
try:
|
|
218
|
+
subprocess.run([sys.executable, "-m", "pip", "install", module_name], capture_output=True)
|
|
219
|
+
# Retry execution after package install
|
|
220
|
+
success_pkg, output_pkg = _run_in_subprocess(script, timeout=timeout, is_healed_attempt=True)
|
|
221
|
+
if success_pkg:
|
|
222
|
+
print(f"[Auto-Healing System] Installed '{module_name}' and executed successfully!")
|
|
223
|
+
sys.stdout.flush()
|
|
224
|
+
return True, f"[Auto-Healing system installed missing package '{module_name}']\n\nOutput:\n{output_pkg}"
|
|
225
|
+
except Exception as pkg_err:
|
|
226
|
+
print(f"[Auto-Healing System] Failed to install package: {pkg_err}")
|
|
227
|
+
sys.stdout.flush()
|
|
228
|
+
|
|
229
|
+
# 2. Logic/Code healing (LLM repair)
|
|
230
|
+
print(f"\n[Auto-Healing System] Executing python script failed with error. Attempting self-healing...\nError details:\n{output}\n")
|
|
231
|
+
sys.stdout.flush()
|
|
232
|
+
healed_script = _heal_script_code(script, output)
|
|
233
|
+
if healed_script and healed_script != script:
|
|
234
|
+
success_h, output_h = _run_in_subprocess(healed_script, timeout=timeout, is_healed_attempt=True)
|
|
235
|
+
if success_h:
|
|
236
|
+
print("[Auto-Healing System] Code repaired and executed successfully!")
|
|
237
|
+
sys.stdout.flush()
|
|
238
|
+
return True, f"[Auto-Healing system resolved a code error!]\nOriginal Error:\n{output}\n\nSuccessful Execution Output:\n{output_h}"
|
|
239
|
+
else:
|
|
240
|
+
print("[Auto-Healing System] Attempted repair but healed script still failed.")
|
|
241
|
+
sys.stdout.flush()
|
|
242
|
+
|
|
243
|
+
return success, output or "(no output)"
|
|
244
|
+
except subprocess.TimeoutExpired:
|
|
245
|
+
return False, f"Execution timed out after {timeout}s."
|
|
246
|
+
except Exception as e:
|
|
247
|
+
return False, f"Failed to launch subprocess: {e}"
|
|
248
|
+
finally:
|
|
249
|
+
try:
|
|
250
|
+
os.unlink(tmp_path)
|
|
251
|
+
except OSError:
|
|
252
|
+
pass
|
|
253
|
+
|
|
254
|
+
|
|
255
|
+
# ---------------------------------------------------------------------------
|
|
256
|
+
# Pure Python helpers — NOT CrewAI tools, called directly from run_crew()
|
|
257
|
+
# ---------------------------------------------------------------------------
|
|
258
|
+
|
|
259
|
+
def build_dataset_profile(csv_path: str, max_rows: int = 5000) -> str:
    """Build a compact, token-efficient dataset profile string.

    The profile contains the shape, a per-column summary (dtype, missing
    percentage, min/mean/max or top values), the strongest pairwise
    correlations between numeric columns, and the first five rows.

    Args:
        csv_path : Path to the CSV file.
        max_rows : Row cap for profiling large files (default 5000).

    Returns:
        A markdown-formatted string for embedding in task descriptions, or
        ``"[Profile unavailable: ...]"`` if the file cannot be read.
    """
    try:
        df = read_csv_robust(csv_path, nrows=max_rows)
    except Exception as exc:
        return f"[Profile unavailable: {exc}]"

    lines: list[str] = []
    sampled = len(df)

    # Shape. Hitting the cap only means the file *may* hold more rows: a file
    # with exactly max_rows rows looks the same from here.
    capped = max_rows is not None and sampled >= max_rows
    note = " (sample — file may be larger)" if capped else ""
    lines.append(
        f"**Dataset shape**: {sampled} rows × {len(df.columns)} columns{note}"
    )
    lines.append("")

    # Column summary
    lines.append("**Columns** (name | dtype | missing% | stats/top values):")
    for col in df.columns:
        series = df[col]
        dtype = series.dtype
        miss_pct = round(series.isnull().sum() / max(len(df), 1) * 100, 1)
        if pd.api.types.is_numeric_dtype(dtype):
            if series.notna().any():
                desc = (
                    f"min={series.min():.4g}, "
                    f"mean={series.mean():.4g}, "
                    f"max={series.max():.4g}"
                )
            else:
                # min/mean/max of an all-missing column would all print "nan".
                desc = "—"
        else:
            tops = series.dropna().value_counts().head(3).index.tolist()
            desc = ", ".join(str(v) for v in tops) or "—"
        lines.append(f" - {col}: {dtype} | missing={miss_pct}% | {desc}")
    lines.append("")

    # Top correlations (numeric only)
    numeric_df = df.select_dtypes(include=["number"])
    if len(numeric_df.columns) >= 2:
        # Built in a scratch list so a failure part-way through never leaves a
        # dangling "**Top correlations**:" header in the profile.
        corr_lines: list[str] = []
        try:
            corr = numeric_df.corr().unstack().reset_index()
            corr.columns = ["A", "B", "r"]
            # A < B keeps each unordered pair once and drops self-pairs.
            corr = corr[corr["A"] < corr["B"]].dropna(subset=["r"]).copy()
            corr["abs_r"] = corr["r"].abs()
            top5 = corr.nlargest(5, "abs_r")[["A", "B", "r"]]
            if not top5.empty:
                corr_lines.append("**Top correlations**:")
                corr_lines.extend(
                    f" - {row['A']} ↔ {row['B']}: r={row['r']:.3f}"
                    for _, row in top5.iterrows()
                )
                corr_lines.append("")
        except Exception:
            # Best effort: the profile is still useful without correlations.
            corr_lines = []
        lines.extend(corr_lines)

    # Sample rows
    lines.append("**Sample rows (first 5)**:")
    lines.append(_df_to_markdown(df.head(5), index=False))

    return "\n".join(lines)
|
|
326
|
+
|
|
327
|
+
|
|
328
|
+
def generate_plotly_charts(csv_path: str, relations_text: str, max_rows: int = 5000) -> list:
    """Parse agent relation output and generate interactive Plotly figures.

    Each usable line of *relations_text* looks like
    ``- X: <column> | Y: <column> | Type: <plot type>`` (the type segment is
    optional and defaults to scatter). Lines naming unknown columns, or the
    same column twice, are skipped. If no line yields a figure, a default set
    is generated: histograms of the first numeric columns, scatters of
    adjacent numeric pairs and one categorical-by-numeric bar chart.

    Args:
        csv_path       : Path to the cleaned CSV file.
        relations_text : Raw text output from the relation agent.
        max_rows       : Row cap for chart data (default 5000 for rendering speed).

    Returns:
        List of dicts:
        ``{"title": str, "fig": Figure, "x": str, "y": str, "type": str}``.
        Returns an empty list if plotly is unavailable or the CSV cannot be read.
    """
    try:
        import plotly.express as px
    except ImportError:
        return []

    try:
        df = read_csv_robust(csv_path, nrows=max_rows)
    except Exception:
        return []

    # Dark-theme layout matching the app's "Obsidian & Electric Violet" aesthetic
    _dark = dict(
        paper_bgcolor="rgba(9,9,11,0.0)",
        plot_bgcolor="rgba(15,23,42,0.4)",
        font_color="#e2e8f0",
        title_font_color="#a78bfa",
        title_font_size=15,
        xaxis=dict(
            gridcolor="rgba(255,255,255,0.07)",
            zerolinecolor="rgba(255,255,255,0.1)",
            title_font_color="#94a3b8",
        ),
        yaxis=dict(
            gridcolor="rgba(255,255,255,0.07)",
            zerolinecolor="rgba(255,255,255,0.1)",
            title_font_color="#94a3b8",
        ),
        margin=dict(l=50, r=20, t=55, b=50),
        hoverlabel=dict(bgcolor="rgba(15,23,42,0.95)", font_color="#e2e8f0"),
    )
    _colors = ["#a78bfa", "#6366f1", "#22d3ee", "#e879f9", "#34d399"]
    max_bars = 25

    def _match_column(name: str):
        """Return the DataFrame column *name* refers to, or None."""
        if name in df.columns:
            return name
        # LLM output often wraps names in backticks, quotes or bold markers.
        cleaned = name.strip("`'\"* ")
        return cleaned if cleaned in df.columns else None

    def _scatter(data, x, y, title, color, styled):
        fig = px.scatter(
            data, x=x, y=y, title=title,
            color_discrete_sequence=[color], opacity=0.75,
        )
        if styled:
            fig.update_traces(marker=dict(size=6, line=dict(width=0.5, color="rgba(255,255,255,0.3)")))
        return fig

    figures = []

    for line in (relations_text or "").split("\n"):
        line = line.strip()
        if not (line and "|" in line and "X:" in line):
            continue
        try:
            parts = [p.strip() for p in line.lstrip("- ").split("|")]
            # Drop empty cells so table-style "| X: a | Y: b |" lines parse too.
            parts = [p for p in parts if p]
            x_raw = parts[0].split(":", 1)[1].strip()
            y_raw = parts[1].split(":", 1)[1].strip()
            # "Type: bar" and a bare "bar" are both accepted.
            ptype = parts[2].split(":", 1)[-1].strip().lower() if len(parts) > 2 else "scatter"
        except (IndexError, ValueError):
            continue

        x_col = _match_column(x_raw)
        y_col = _match_column(y_raw)
        if x_col is None or y_col is None:
            continue

        # Guard: skip trivial same-column pairs
        if x_col == y_col:
            continue

        title = f"{x_col} vs {y_col}"
        color = _colors[len(figures) % len(_colors)]

        try:
            sample = df[[x_col, y_col]].dropna().head(2000)

            if sample.empty:
                continue

            x_is_numeric = pd.api.types.is_numeric_dtype(sample[x_col])

            # Specific chart words are tested before the generic word "plot",
            # otherwise "box plot" / "bar plot" / "line plot" all become scatters.
            if "scatter" in ptype:
                fig = _scatter(sample, x_col, y_col, title, color, styled=True)
            elif "bar" in ptype:
                if x_is_numeric and sample[x_col].nunique() > max_bars:
                    # numeric x with many distinct values → bin it, then aggregate
                    bins = pd.cut(sample[x_col], bins=max_bars)
                    agg = sample.groupby(bins, observed=True)[y_col].mean().reset_index()
                    # Interval labels are shown as plain text on the axis.
                    agg[x_col] = agg[x_col].astype(str)
                else:
                    agg = sample.groupby(x_col)[y_col].mean().reset_index()
                fig = px.bar(
                    agg.head(max_bars), x=x_col, y=y_col, title=title,
                    color_discrete_sequence=[color],
                )
            elif "line" in ptype:
                fig = px.line(
                    sample.sort_values(x_col), x=x_col, y=y_col, title=title,
                    color_discrete_sequence=[color],
                )
            elif "box" in ptype:
                fig = px.box(
                    sample, x=None if x_is_numeric else x_col,
                    y=y_col, title=title,
                    color_discrete_sequence=[color],
                )
            elif "hist" in ptype:
                fig = px.histogram(
                    sample, x=x_col, nbins=30,
                    title=f"Distribution of {x_col}",
                    color_discrete_sequence=[color],
                )
            else:
                # Default: scatter
                fig = _scatter(sample, x_col, y_col, title, color, styled="plot" in ptype)

            fig.update_layout(**_dark)
            figures.append({"title": title, "fig": fig, "x": x_col, "y": y_col, "type": ptype})

        except Exception as _chart_err:  # log but continue
            print(f"[Plotly] Skipping {title!r}: {_chart_err}")
            continue

    # Fallback: if no relation line produced a figure, auto-generate charts
    # from the first few numeric columns in the dataset.
    if not figures:
        numeric_cols = df.select_dtypes(include=["number"]).columns.tolist()
        cat_cols = df.select_dtypes(include=["object", "category"]).columns.tolist()

        pair_count = 0
        for i, col in enumerate(numeric_cols[:3]):
            color = _colors[i % len(_colors)]
            # Histogram for each numeric col
            try:
                fig = px.histogram(df[[col]].dropna().head(3000), x=col,
                                   nbins=30, title=f"Distribution of {col}",
                                   color_discrete_sequence=[color])
                fig.update_layout(**_dark)
                figures.append({"title": f"Distribution of {col}", "fig": fig, "x": col, "y": col, "type": "histogram"})
                pair_count += 1
            except Exception:
                continue

        # Scatter for first 2 numeric pairs
        for i in range(min(len(numeric_cols) - 1, 2)):
            xc, yc = numeric_cols[i], numeric_cols[i + 1]
            color = _colors[(pair_count + i) % len(_colors)]
            try:
                sample = df[[xc, yc]].dropna().head(2000)
                fig = px.scatter(sample, x=xc, y=yc, title=f"{xc} vs {yc}",
                                 color_discrete_sequence=[color], opacity=0.75)
                fig.update_layout(**_dark)
                figures.append({"title": f"{xc} vs {yc}", "fig": fig, "x": xc, "y": yc, "type": "scatter"})
            except Exception:
                continue

        # Bar for first categorical × numeric
        if cat_cols and numeric_cols:
            cc, nc = cat_cols[0], numeric_cols[0]
            try:
                agg = df[[cc, nc]].dropna().groupby(cc)[nc].mean().reset_index()
                fig = px.bar(agg.head(20), x=cc, y=nc, title=f"{nc} by {cc}",
                             color_discrete_sequence=[_colors[0]])
                fig.update_layout(**_dark)
                figures.append({"title": f"{nc} by {cc}", "fig": fig, "x": cc, "y": nc, "type": "bar"})
            except Exception:
                pass

    return figures
|
|
498
|
+
|
|
499
|
+
|
|
500
|
+
# ---------------------------------------------------------------------------
|
|
501
|
+
# CrewAI tools — used by agents at runtime as fallback / code generation aid
|
|
502
|
+
# ---------------------------------------------------------------------------
|
|
503
|
+
|
|
504
|
+
def _resolve_session_csv(file_path: Optional[str]) -> str:
    """Return *file_path* when it looks usable, else the active session's CSV.

    A path is rejected when it is empty, not a string, the literal text
    "none", or contains "properties" (NOTE(review): presumably a schema
    placeholder passed by the agent instead of a real path — confirm).
    The fallback is the ``current_session_csv`` context value, then the
    ``CURRENT_SESSION_CSV`` environment variable, then an empty string.
    """
    if (
        file_path
        and isinstance(file_path, str)
        and file_path.lower() != "none"
        and "properties" not in file_path
    ):
        return file_path
    from config.context import current_session_csv
    return current_session_csv.get() or os.getenv("CURRENT_SESSION_CSV", "")


class DatasetTools:
    # The docstrings of the @tool functions below are kept verbatim: the
    # decorator may expose them to the agent as the tool description.

    @tool("Read Dataset Head")
    def read_dataset_head(file_path: Optional[str] = None) -> str:
        """Reads the first 10 rows of the dataset to understand its structure.
        Uses nrows=10 so the entire file is never loaded into memory.
        If file_path is not specified or is invalid, the active session's CSV will be used.
        """
        try:
            df = read_csv_robust(_resolve_session_csv(file_path), nrows=10)
            return _df_to_markdown(df, index=False)
        except Exception as e:
            return f"Error reading file: {e}"

    @tool("Get Dataset Info")
    def get_dataset_info(file_path: Optional[str] = None) -> str:
        """Returns basic information about the dataset: shape, columns, data types,
        and missing-value counts.
        If file_path is not specified or is invalid, the active session's CSV will be used.
        """
        try:
            df = read_csv_robust(_resolve_session_csv(file_path))
            lines = [f"Shape: {df.shape}", "\nColumns and Types:"]
            for col, dtype in df.dtypes.items():
                missing = df[col].isnull().sum()
                lines.append(f" - {col}: {dtype} (Missing: {missing})")
            return "\n".join(lines)
        except Exception as e:
            return f"Error analyzing file: {e}"

    @tool("Get Correlation Matrix")
    def get_correlation_matrix(file_path: Optional[str] = None) -> str:
        """Returns the top-20 strongest column-pair correlations (by absolute value).
        If file_path is not specified or is invalid, the active session's CSV will be used.
        """
        try:
            df = read_csv_robust(_resolve_session_csv(file_path))
            numeric_df = df.select_dtypes(include=["number"])
            if numeric_df.empty:
                return "No numeric columns found."
            if len(numeric_df.columns) < 2:
                return "Fewer than two numeric columns; no correlations to report."

            pairs = numeric_df.corr().unstack().reset_index()
            # Positional naming works whatever labels reset_index produced.
            pairs.columns = ["Col_A", "Col_B", "Correlation"]
            # A < B keeps each unordered pair once and drops self-pairs;
            # .copy() so the new column is not written into a slice view.
            pairs = pairs[pairs["Col_A"] < pairs["Col_B"]].copy()
            pairs["AbsCorr"] = pairs["Correlation"].abs()
            top = (
                pairs.sort_values("AbsCorr", ascending=False)
                .head(20)
                .drop(columns=["AbsCorr"])
                .reset_index(drop=True)
            )
            return _df_to_markdown(top, index=False)
        except Exception as e:
            return f"Error calculating correlation: {e}"

    @tool("Clean Dataset with Python Code")
    def clean_dataset_with_python(file_path: Optional[str] = None, python_code: Optional[str] = None) -> str:
        """Cleans the dataset by executing *python_code* in an isolated subprocess.
        The file_path parameter is optional and defaults to the active session's dataset CSV.

        Your code must:
        1. Read the CSV: df = pd.read_csv(FILE_PATH) # FILE_PATH is pre-set
        2. Perform cleaning on df
        3. Save: df.to_csv(FILE_PATH, index=False)

        Do NOT include markdown code fences. Do NOT use any other file paths.
        """
        # Swap if python_code is not specified but file_path contains code
        if not python_code:
            if file_path and ("import " in file_path or "df[" in file_path or "\n" in file_path):
                python_code = file_path
                file_path = None
            else:
                return "Error: python_code is required."

        fp = _resolve_session_csv(file_path)
        if not fp:
            # Without a path the child could only fail on read_csv, which
            # would also trigger a pointless LLM self-healing round.
            return "Error: no dataset path available (no file_path given and no active session CSV)."

        clean_code = _strip_markdown_fences(python_code)

        # Preamble: load df and pin every later read_csv call to FILE_PATH.
        # Epilogue: always persist df back to FILE_PATH.
        script = textwrap.dedent(f"""\
            import os
            import pandas as pd

            FILE_PATH = {repr(str(fp))}
            df = pd.read_csv(FILE_PATH)

            # Safeguard: redirect all read_csv calls to FILE_PATH
            _orig_read_csv = pd.read_csv
            def custom_read_csv(*args, **kwargs):
                return _orig_read_csv(FILE_PATH)
            pd.read_csv = custom_read_csv
        """) + "\n" + clean_code + "\n" + textwrap.dedent(f"""\
            df.to_csv(FILE_PATH, index=False)
            print("Dataset cleaned and saved successfully.")
        """)

        success, output = _run_in_subprocess(script)
        if success:
            return f"Dataset cleaned successfully.\n{output}"
        return f"Error executing cleaning code:\n{output}"

    @tool("Execute Visualization Code")
    def execute_visualization_code(python_code: Optional[str] = None, **kwargs) -> str:
        """Executes Python plotting code to generate and save PNG visual charts.

        The code runs in a pre-configured Python environment where:
        - 'df' is a pre-loaded pandas DataFrame containing the cleaned dataset.
        - 'OUTPUT_DIR' is a pre-defined string representing the output folder path.
        - 'save_chart(filename)' is a helper function to save the current plot into OUTPUT_DIR.
        - Libraries 'pandas', 'matplotlib.pyplot as plt', and 'seaborn as sns' are already imported.

        Example usage:
        plt.figure(figsize=(10, 6))
        sns.scatterplot(data=df, x='column_x', y='column_y')
        plt.title('Relationship Title')
        save_chart('chart_name.png')
        plt.close()
        """
        if not python_code:
            # The agent sometimes sends the code under another argument name:
            # take the first string value that looks like plotting code.
            for k, v in kwargs.items():
                if v and isinstance(v, str) and ("plt." in v or "sns." in v or "import " in v or "\n" in v):
                    python_code = v
                    break
        if not python_code:
            return "Error: python_code is required."

        clean_code = _strip_markdown_fences(python_code)
        from config.context import current_session_csv, current_session_output_dir
        csv_path = current_session_csv.get() or os.getenv("CURRENT_SESSION_CSV", "")
        output_dir = current_session_output_dir.get() or os.getenv("CURRENT_SESSION_OUTPUT_DIR", "")

        # Fallbacks if env vars are missing
        if not csv_path:
            csv_path = "data/sessions/default/cleaned.csv"
        if not output_dir:
            output_dir = "outputs/default"

        # Preamble: headless matplotlib backend, df pre-loaded, read_csv pinned
        # to CSV_PATH, and the save_chart helper named in the docstring.
        script = textwrap.dedent(f"""\
            import os
            import pandas as pd
            import matplotlib
            matplotlib.use('Agg')
            import matplotlib.pyplot as plt
            import seaborn as sns
            import textwrap

            CSV_PATH = {repr(csv_path)}
            OUTPUT_DIR = {repr(output_dir)}

            os.makedirs(OUTPUT_DIR, exist_ok=True)
            df = pd.read_csv(CSV_PATH)

            # Safeguard: redirect all read_csv calls to the cleaned CSV path
            _orig_read_csv = pd.read_csv
            def custom_read_csv(*args, **kwargs):
                return _orig_read_csv(CSV_PATH)
            pd.read_csv = custom_read_csv

            def save_chart(filename):
                if not filename.endswith('.png'):
                    filename += '.png'
                path = os.path.join(OUTPUT_DIR, filename)
                plt.savefig(path, bbox_inches='tight', dpi=180)
                print(f"Saved chart: {{filename}}")
        """) + "\n" + clean_code

        success, output = _run_in_subprocess(script)
        if success:
            return f"Visualization executed successfully. Output:\n{output}"
        return f"Error executing visualization code:\n{output}"
|
|
693
|
+
|
|
694
|
+
|
|
695
|
+
# Number of leading non-null values inspected by the cheap per-column heuristics.
_COERCE_SAMPLE_SIZE = 200
# Share of the sample that must look date-like before a datetime parse is tried.
_DATE_SAMPLE_RATIO = 0.5
# Share of the column's non-null values that must survive the datetime parse.
_DATE_PARSE_RATIO = 0.7
# Share of values (sample first, then whole column) that must be numeric-like.
_NUMERIC_RATIO = 0.8

# Strings treated as "missing" when they make up the whole cell (case-insensitive).
_MISSING_PLACEHOLDERS = frozenset(
    {'nan', 'null', 'n/a', 'na', '?', 'none', '-', '.', 'missing', 'empty'}
)
_PLACEHOLDER_RE = re.compile(
    r'^\s*(?:' + '|'.join(re.escape(p) for p in sorted(_MISSING_PLACEHOLDERS)) + r')\s*$',
    re.IGNORECASE,
)

# Currency symbols, thousands separators, percent signs and whitespace.
_NUMERIC_NOISE = r'[\$,%\s]'
_NUMERIC_NOISE_RE = re.compile(_NUMERIC_NOISE)
_PLAIN_NUMBER_RE = re.compile(r'^-?\d+(?:\.\d+)?$')

# YYYY-MM-DD, DD/MM/YYYY, or wordy dates like "01 Jul 2026".
_YMD_DATE_RE = re.compile(r'^\d{4}[-/]\d{1,2}[-/]\d{1,2}')
_DMY_DATE_RE = re.compile(r'^\d{1,2}[-/]\d{1,2}[-/]\d{4}')
_MONTH_NAME_RE = re.compile(
    r'(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)', re.IGNORECASE
)

_BOOLEAN_LABELS = {
    'yes': True, 'no': False, 'y': True, 'n': False,
    'true': True, 'false': False, 't': True, 'f': False,
    '1': True, '0': False,
}


def _looks_like_date(text: str) -> bool:
    """Return True when *text* matches one of the date patterns."""
    text = text.strip()
    # Bare 4-digit years (e.g. 1990) must stay numeric, not become dates.
    if text.isdigit() and len(text) == 4:
        return False
    return bool(
        _YMD_DATE_RE.match(text)
        or _DMY_DATE_RE.match(text)
        or _MONTH_NAME_RE.search(text)
    )


def _looks_like_number(text: str) -> bool:
    """Return True for blanks, missing-value placeholders and plain numbers."""
    text = text.strip().lower()
    if not text or text in _MISSING_PLACEHOLDERS:
        return True
    return bool(_PLAIN_NUMBER_RE.match(_NUMERIC_NOISE_RE.sub('', text)))


def _coerce_datetime(series: pd.Series) -> "pd.Series | None":
    """Parse *series* as datetimes; return None unless >70% of values parse."""
    try:
        try:
            converted = pd.to_datetime(series, errors='coerce', format='mixed')
        except (ValueError, TypeError):
            # Fall back for pandas builds that reject format='mixed'.
            converted = pd.to_datetime(series, errors='coerce')
        if converted.notnull().sum() > series.notnull().sum() * _DATE_PARSE_RATIO:
            return converted
    except Exception:
        # Best-effort coercion: an unparseable column is simply left as-is.
        pass
    return None


def _coerce_numeric(series: pd.Series) -> "tuple[pd.Series, str] | None":
    """
    Strip currency/percent/delimiter noise and convert *series* to numbers.

    Returns (converted_series, kind) where kind is 'Nullable Integer',
    'Integer' or 'Float', or None when the column should be left untouched.
    """
    try:
        cleaned = series.astype(str).str.strip()
        # Strip any surrounding quotes first.
        cleaned = cleaned.str.replace(r'^["\']|["\']$', '', regex=True)
        # Whole-cell placeholders become empty strings, which to_numeric maps to NaN.
        cleaned = cleaned.str.replace(_PLACEHOLDER_RE, '', regex=True)
        cleaned = cleaned.str.replace(_NUMERIC_NOISE, '', regex=True)
        converted = pd.to_numeric(cleaned, errors='coerce')
        if converted.isnull().all():
            return None

        # The caller only inspected a sample. Apply the same ratio to the whole
        # column so real text further down is not silently coerced to NaN.
        had_value = series.notnull()
        has_text = cleaned.fillna('').ne('')
        unparseable = int((converted.isnull() & had_value & has_text).sum())
        total = int(had_value.sum())
        if not (total - unparseable) > total * _NUMERIC_RATIO:
            return None

        if (converted.dropna() % 1 == 0).all():
            if converted.isnull().any():
                # Plain int cannot hold NaN; use the nullable integer dtype.
                return converted.astype('Int64'), 'Nullable Integer'
            return converted.astype(int), 'Integer'
        return converted, 'Float'
    except Exception:
        # Best-effort coercion: any failure leaves the column unchanged.
        return None


def _coerce_boolean(series: pd.Series) -> "pd.Series | None":
    """Map yes/no style labels to booleans; None if any non-null value is not a label."""
    try:
        mapped = series.astype(str).str.lower().str.strip().map(_BOOLEAN_LABELS)
        # Unmapped labels come back as NaN. Refuse the conversion rather than
        # discard values that were outside the inspected sample.
        if int(mapped.notnull().sum()) != int(series.notnull().sum()):
            return None
        return mapped
    except Exception:
        return None


def auto_coerce_types(df: pd.DataFrame) -> tuple[pd.DataFrame, list[str]]:
    """
    Detect text columns that really hold dates, numbers or booleans and convert them.

    Each object/string column is tested in this order, using its first 200
    non-null values as a sample: datetime, numeric, boolean. The first
    conversion that succeeds wins. Numeric and boolean conversions are also
    checked against the whole column, so values outside the sample are not
    silently replaced by NaN. The input frame is not modified.

    Args:
        df: DataFrame to analyze.

    Returns:
        (converted_df, list_of_actions): a converted copy of ``df`` and one
        human-readable message per converted column.
    """
    actions: list[str] = []
    df = df.copy()

    for col in df.columns:
        series = df[col]

        # Skip empty columns.
        if series.isnull().all():
            continue

        # Only text columns need coercion (object dtype or pandas string dtype).
        if not (series.dtype == 'object' or isinstance(series.dtype, pd.StringDtype)):
            continue

        sample = series.dropna().head(_COERCE_SAMPLE_SIZE).astype(str)
        if sample.empty:
            continue

        # 1. Datetime.
        if sum(map(_looks_like_date, sample)) > len(sample) * _DATE_SAMPLE_RATIO:
            as_dates = _coerce_datetime(series)
            if as_dates is not None:
                df[col] = as_dates
                actions.append(f"Converted column '{col}' to Datetime (detected date-like patterns)")
                continue

        # 2. Numeric stored as text (currency "$1,000", percentage "95%", placeholders).
        if sum(map(_looks_like_number, sample)) > len(sample) * _NUMERIC_RATIO:
            numeric = _coerce_numeric(series)
            if numeric is not None:
                as_numbers, kind = numeric
                df[col] = as_numbers
                actions.append(f"Converted column '{col}' to {kind} (cleaned currency/delimiters/nulls)")
                continue

        # 3. Boolean labels (Yes/No, True/False, Y/N, 1/0). At least two distinct
        # labels are required so a constant column is not treated as boolean.
        labels = set(sample.str.lower().str.strip())
        if len(labels) >= 2 and labels.issubset(_BOOLEAN_LABELS):
            as_bools = _coerce_boolean(series)
            if as_bools is not None:
                df[col] = as_bools
                actions.append(f"Converted column '{col}' to Boolean (detected binary labels)")
                continue

    return df, actions
|