crewlyze 3.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48) hide show
  1. package/.dockerignore +12 -0
  2. package/.gitattributes +2 -0
  3. package/CHANGELOG.md +86 -0
  4. package/Dockerfile +21 -0
  5. package/LICENSE +21 -0
  6. package/README.md +139 -0
  7. package/USAGE.md +106 -0
  8. package/agents/__init__.py +0 -0
  9. package/agents/cleaner.py +38 -0
  10. package/agents/insights.py +44 -0
  11. package/agents/relation.py +36 -0
  12. package/agents/visualizer.py +41 -0
  13. package/assets/badge_crewai.svg +4 -0
  14. package/assets/badge_matplotlib.svg +4 -0
  15. package/assets/badge_ollama.svg +4 -0
  16. package/assets/badge_pandas.svg +4 -0
  17. package/assets/badge_seaborn.svg +4 -0
  18. package/assets/branding_image.png +0 -0
  19. package/assets/complete_workflow.svg +216 -0
  20. package/assets/favicon.png +0 -0
  21. package/assets/logo.png +0 -0
  22. package/assets/stars.svg +12 -0
  23. package/bin/crewlyze.js +79 -0
  24. package/config/README.md +129 -0
  25. package/config/__init__.py +1 -0
  26. package/config/context.py +16 -0
  27. package/config/llm_config.py +300 -0
  28. package/config/metrics_tracker.py +70 -0
  29. package/crew.py +870 -0
  30. package/crewlyze-3.1.0.tgz +0 -0
  31. package/fix_syntax.py +54 -0
  32. package/main.py +1279 -0
  33. package/package.json +22 -0
  34. package/pyproject.toml +32 -0
  35. package/requirements.txt +33 -0
  36. package/tools/__init__.py +0 -0
  37. package/tools/dataset_tools.py +803 -0
  38. package/ui/__init__.py +3 -0
  39. package/ui/copilot.py +200 -0
  40. package/ui/export.py +800 -0
  41. package/update_appjs.py +54 -0
  42. package/update_llm.py +21 -0
  43. package/update_main.py +20 -0
  44. package/web/app.js +3142 -0
  45. package/web/index.html +1105 -0
  46. package/web/style.css +2561 -0
  47. package/workflows/__init__.py +0 -0
  48. package/workflows/pipeline.py +254 -0
package/crew.py ADDED
@@ -0,0 +1,870 @@
1
+ # Crewlyze
2
+ # Copyright (c) 2025 Sowmiyan S
3
+ # Licensed under the MIT License
4
+
5
+ """
6
+ Main crew orchestration module.
7
+
8
+ Performance improvements in this version:
9
+ - build_dataset_profile() computes a rich data summary before agents start,
10
+ eliminating 6-8 LLM tool-call round-trips across the pipeline.
11
+ - Large files (> 10 000 rows) are sampled to 5 000 rows for profiling;
12
+ the cleaner still operates on the full dataset.
13
+ - relation_task and insight_task run in PARALLEL via ThreadPoolExecutor,
14
+ saving the time of one full sequential task slot.
15
+ - visualize_task receives the actual relation + insight outputs injected
16
+ into its description (rather than relying on CrewAI's context= mechanism
17
+ which requires all tasks to live in the same Crew instance).
18
+ - on_progress callback allows the caller (app.py) to surface intermediate
19
+ results in the UI as each stage completes.
20
+ """
21
+
22
+ import logging
23
+ import os
24
+ import shutil
25
+ import sys
26
+ import time
27
+ import traceback
28
+ from concurrent.futures import ThreadPoolExecutor, as_completed
29
+ from pathlib import Path
30
+ from typing import Callable, Optional
31
+
32
+ import pandas as pd
33
+ from dotenv import load_dotenv
34
+
35
# Load variables from a local .env file (if present) before the overrides
# below are written into the process environment.
load_dotenv()

# Only surface errors from these chatty third-party loggers.
logging.getLogger("urllib3").setLevel(logging.ERROR)
logging.getLogger("opentelemetry").setLevel(logging.ERROR)

# Opt this process out of CrewAI telemetry and switch the OpenTelemetry SDK off.
os.environ.update(
    {
        "CREWAI_TELEMETRY_OPT_OUT": "true",
        "OTEL_SDK_DISABLED": "true",
    }
)

# Best-effort monkey patch: replace CrewAI's cache-breakpoint marker with a
# pass-through to avoid Nvidia NIM / LiteLLM validation errors. Any failure
# (e.g. the module does not exist in the installed CrewAI) is ignored on purpose.
try:
    import crewai.llms.cache as _crewai_cache
    setattr(_crewai_cache, "mark_cache_breakpoint", lambda msg: msg)
except Exception:
    pass

# CrewAI is a hard requirement: print an install hint and exit with status 1
# when it cannot be imported.
try:
    from crewai import Crew
except ImportError as exc:
    print(f"ERROR: {exc}\nRun: pip install crewai")
    raise SystemExit(1)
57
+
58
+ from tools.dataset_tools import build_dataset_profile, generate_plotly_charts, read_csv_robust
59
+ from workflows.pipeline import make_pipeline
60
+
61
+
62
+ # ---------------------------------------------------------------------------
63
+ # Visualizer Fallback Generator (Pure Python, no LLM)
64
+ # ---------------------------------------------------------------------------
65
+
66
def _run_auto_visualizer_fallback(csv_path: Path, output_dir: Path, relations_text: str = "") -> str:
    """
    Pure-Python chart generator used when the visualizer agent fails to save PNGs.

    The CSV at *csv_path* is loaded with ``read_csv_robust`` and charts are
    written as PNG files into *output_dir* (created if missing).

    Phases
    ------
    1. Parse ``X: <col> | Y: <col> | Type: <plot>`` lines from *relations_text*.
    2. Draw one chart per parsed pair (at most 5), choosing bar / line / box /
       histogram / scatter from the ``Type`` field.
    3. Only when phase 2 produced nothing: draw generic charts (correlation
       heatmap, first-column histogram, first scatter pair, categorical bar).

    Parameters
    ----------
    csv_path : CSV file to visualize.
    output_dir : Directory that receives the PNG files.
    relations_text : Relation lines produced by the relation stage; may be empty.

    Returns
    -------
    A one-line human-readable summary. Never raises: any failure is reported
    in the returned string instead.
    """
    import re
    import pandas as pd
    import matplotlib
    matplotlib.use('Agg')  # headless backend: no display is required
    import matplotlib.pyplot as plt
    import seaborn as sns

    try:
        df = read_csv_robust(csv_path)
        output_dir.mkdir(parents=True, exist_ok=True)

        numeric_cols = df.select_dtypes(include=["number"]).columns.tolist()
        categorical_cols = df.select_dtypes(include=["object", "category"]).columns.tolist()

        generated = []
        # Dark-themed style shared by every chart
        sns.set_theme(style="darkgrid", palette="deep")
        BG_DARK = "#0f172a"
        BG_CARD = "#1e293b"
        TEXT_COLOR = "#e2e8f0"
        GRID_COLOR = "#334155"
        colors = ["#818cf8", "#22d3ee", "#f472b6", "#34d399", "#fb923c"]

        def _apply_dark_style(fig, ax_list):
            """Recolor the figure and its axes to the dark palette."""
            fig.patch.set_facecolor(BG_DARK)
            for ax in (ax_list if isinstance(ax_list, list) else [ax_list]):
                ax.set_facecolor(BG_CARD)
                ax.tick_params(colors=TEXT_COLOR)
                ax.xaxis.label.set_color(TEXT_COLOR)
                ax.yaxis.label.set_color(TEXT_COLOR)
                ax.title.set_color(TEXT_COLOR)
                for spine in ax.spines.values():
                    spine.set_edgecolor(GRID_COLOR)
                ax.grid(color=GRID_COLOR, linewidth=0.5)

        def _fs_safe(name) -> str:
            """Replace characters that are invalid in file names (e.g. '/' or ':')."""
            return re.sub(r'[\\/:*?"<>|\x00-\x1f]+', "_", str(name))

        def _save_current_figure(fig, ax, filename: str):
            """Style, save and close the current figure; record the file name."""
            _apply_dark_style(fig, ax)
            plt.tight_layout()
            dest = output_dir / filename
            plt.savefig(dest, dpi=150, bbox_inches="tight", facecolor=BG_DARK)
            plt.close()
            generated.append(dest.name)
            return dest

        # ── PHASE 1: Parse relation pairs from agent output ────────────────────
        relation_pairs = []
        if relations_text:
            for line in relations_text.split("\n"):
                line = line.strip()
                if not (line and "|" in line and "X:" in line):
                    continue
                try:
                    parts = [p.strip() for p in line.lstrip("- ").split("|")]
                    x_col = parts[0].split(":", 1)[1].strip()
                    y_col = parts[1].split(":", 1)[1].strip()
                    ptype = parts[2].split(":", 1)[1].strip().lower() if len(parts) > 2 else "scatter"
                    # Keep only pairs that reference two distinct, existing columns.
                    if x_col in df.columns and y_col in df.columns and x_col != y_col:
                        relation_pairs.append((x_col, y_col, ptype))
                except (IndexError, ValueError):
                    continue

        # ── PHASE 2: Generate relation-based charts ────────────────────────────
        for i, (x_col, y_col, ptype) in enumerate(relation_pairs[:5]):
            color = colors[i % len(colors)]
            try:
                # Cap the plotted rows so very large files stay fast to render.
                sample = df[[x_col, y_col]].dropna().head(2000)
                if sample.empty:
                    continue

                fig, ax = plt.subplots(figsize=(10, 6))

                x_is_num = pd.api.types.is_numeric_dtype(df[x_col])
                y_is_num = pd.api.types.is_numeric_dtype(df[y_col])

                if "bar" in ptype:
                    agg = sample.groupby(x_col)[y_col].mean().reset_index().head(20)
                    sns.barplot(data=agg, x=x_col, y=y_col, color=color, ax=ax)
                    plt.xticks(rotation=40, ha="right", color=TEXT_COLOR)
                    title = f"{y_col} by {x_col}"
                elif "line" in ptype:
                    sns.lineplot(data=sample.sort_values(x_col), x=x_col, y=y_col, color=color, ax=ax)
                    title = f"{y_col} over {x_col}"
                elif "box" in ptype:
                    if not x_is_num:
                        # Limit categorical boxes to the 8 most frequent values.
                        top_cats = df[x_col].value_counts().head(8).index
                        sample = sample[sample[x_col].isin(top_cats)]
                    sns.boxplot(data=sample, x=x_col if not x_is_num else None,
                                y=y_col, color=color, ax=ax)
                    title = f"Distribution of {y_col}"
                elif "hist" in ptype:
                    sns.histplot(sample[x_col].dropna(), kde=True, color=color, ax=ax)
                    title = f"Distribution of {x_col}"
                else:
                    if x_is_num and y_is_num:
                        sns.scatterplot(data=sample, x=x_col, y=y_col,
                                        color=color, alpha=0.7, ax=ax)
                    else:
                        # A scatter is meaningless for non-numeric axes: use boxes instead.
                        top_cats = df[x_col].value_counts().head(15).index
                        sub = sample[sample[x_col].isin(top_cats)]
                        sns.boxplot(data=sub, x=x_col, y=y_col, color=color, ax=ax)
                        plt.xticks(rotation=40, ha="right", color=TEXT_COLOR)
                    title = f"{x_col} vs {y_col} Relationship"

                ax.set_title(title, fontsize=13, fontweight="bold", pad=14)
                safe_name = re.sub(r"[^\w]+", "_", f"relation_{x_col}_vs_{y_col}").lower()
                dest = _save_current_figure(fig, ax, f"{safe_name}.png")
                print(f"Relation chart saved: {dest.name}")

            except Exception as chart_err:
                print(f"Relation chart error ({x_col} vs {y_col}): {chart_err}")
                plt.close()
                continue

        # Number of charts that actually came from relation pairs; parsed pairs
        # beyond the first five, or pairs whose chart failed, are not counted.
        relation_chart_count = len(generated)

        # ── PHASE 3: Generic fallback charts if no relation charts were made ───
        if not generated:
            # Correlation heatmap
            if len(numeric_cols) >= 2:
                try:
                    fig, ax = plt.subplots(figsize=(10, 8))
                    corr = df[numeric_cols].corr()
                    sns.heatmap(corr, annot=True, cmap="coolwarm", fmt=".2f",
                                square=True, cbar_kws={"shrink": .8}, ax=ax,
                                annot_kws={"color": TEXT_COLOR})
                    ax.set_title("Correlation Matrix", fontsize=14, fontweight="bold", pad=14)
                    _save_current_figure(fig, ax, "correlation_matrix.png")
                except Exception as chart_err:
                    print(f"Generic chart error (correlation matrix): {chart_err}")
                    plt.close()

            # Distribution of first numeric col
            if numeric_cols:
                try:
                    col = numeric_cols[0]
                    fig, ax = plt.subplots(figsize=(10, 6))
                    sns.histplot(df[col].dropna(), kde=True, color=colors[0], ax=ax)
                    ax.set_title(f"Distribution of {col}", fontsize=13, fontweight="bold", pad=14)
                    _save_current_figure(fig, ax, f"distribution_{_fs_safe(col)}.png")
                except Exception as chart_err:
                    print(f"Generic chart error (distribution): {chart_err}")
                    plt.close()

            # First scatter pair
            if len(numeric_cols) >= 2:
                try:
                    x, y = numeric_cols[0], numeric_cols[1]
                    fig, ax = plt.subplots(figsize=(10, 6))
                    sns.scatterplot(data=df.head(2000), x=x, y=y, color=colors[1], alpha=0.7, ax=ax)
                    ax.set_title(f"{x} vs {y} Relationship", fontsize=13, fontweight="bold", pad=14)
                    _save_current_figure(fig, ax, f"scatter_{_fs_safe(x)}_vs_{_fs_safe(y)}.png")
                except Exception as chart_err:
                    print(f"Generic chart error (scatter): {chart_err}")
                    plt.close()

            # Categorical bar
            if categorical_cols and numeric_cols:
                try:
                    cat, num = categorical_cols[0], numeric_cols[0]
                    top_cats = df[cat].value_counts().head(10).index
                    sub_df = df[df[cat].isin(top_cats)]
                    fig, ax = plt.subplots(figsize=(10, 6))
                    sns.barplot(data=sub_df, x=cat, y=num, errorbar=None, color=colors[2], ax=ax)
                    ax.set_title(f"Average {num} by {cat} (Top 10)", fontsize=13, fontweight="bold", pad=14)
                    plt.xticks(rotation=45, ha="right", color=TEXT_COLOR)
                    _save_current_figure(fig, ax, f"bar_{_fs_safe(cat)}_vs_{_fs_safe(num)}.png")
                except Exception as chart_err:
                    print(f"Generic chart error (categorical bar): {chart_err}")
                    plt.close()

        return f"Generated {len(generated)} chart(s) ({relation_chart_count} from relations, rest generic)."
    except Exception as e:
        return f"Fallback visualization failed: {e}"
255
+
256
+
257
+ # ---------------------------------------------------------------------------
258
+ # Session cleanup helper
259
+ # ---------------------------------------------------------------------------
260
+
261
+ def _cleanup_old_sessions(max_age_hours: int = 24) -> None:
262
+ """Remove session directories older than *max_age_hours*.
263
+ Also enforces a strict disk quota limit: if the total combined size of sessions and
264
+ outputs exceeds 1.0 GB, it prunes the oldest folders until the size is under 400 MB.
265
+ """
266
+ user_home = Path.home() / ".crewlyze"
267
+ data_dir = Path(os.getenv("CREWLYZE_DATA_DIR", str(user_home / "data")))
268
+ sessions_root = data_dir / "sessions"
269
+ outputs_root = Path(os.getenv("CREWLYZE_OUTPUTS_DIR", str(user_home / "outputs")))
270
+
271
+ # 1. Clean based on age
272
+ for root in (sessions_root, outputs_root):
273
+ if not root.exists():
274
+ continue
275
+ cutoff = time.time() - max_age_hours * 3600
276
+ for session_dir in root.iterdir():
277
+ if session_dir.is_dir():
278
+ try:
279
+ if session_dir.stat().st_mtime < cutoff:
280
+ shutil.rmtree(session_dir, ignore_errors=True)
281
+ except OSError:
282
+ pass
283
+
284
+ # 2. Clean based on disk quota (max 1.0 GB combined)
285
+ def get_dir_size(path: Path) -> int:
286
+ if not path.exists():
287
+ return 0
288
+ return sum(f.stat().st_size for f in path.glob('**/*') if f.is_file())
289
+
290
+ total_size = get_dir_size(sessions_root) + get_dir_size(outputs_root)
291
+ max_quota_bytes = 1000 * 1024 * 1024 # 1.0 GB
292
+ target_quota_bytes = 400 * 1024 * 1024 # 400 MB
293
+
294
+ if total_size > max_quota_bytes:
295
+ print(f"Disk quota exceeded: {total_size / (1024*1024):.1f}MB. Pruning oldest sessions...")
296
+ # Collect all session subfolders and outputs with their mtimes
297
+ subfolders = []
298
+ for root in (sessions_root, outputs_root):
299
+ if root.exists():
300
+ for folder in root.iterdir():
301
+ if folder.is_dir():
302
+ subfolders.append((folder, folder.stat().st_mtime))
303
+
304
+ # Sort oldest first
305
+ subfolders.sort(key=lambda x: x[1])
306
+
307
+ for folder, _ in subfolders:
308
+ try:
309
+ shutil.rmtree(folder, ignore_errors=True)
310
+ # Recalculate
311
+ total_size = get_dir_size(sessions_root) + get_dir_size(outputs_root)
312
+ if total_size <= target_quota_bytes:
313
+ print(f"Disk footprint successfully reduced to {total_size / (1024*1024):.1f}MB.")
314
+ break
315
+ except Exception as e:
316
+ print(f"Error pruning session folder {folder}: {e}")
317
+
318
+
319
+ # ---------------------------------------------------------------------------
320
+ # Parallel task execution helper
321
+ # ---------------------------------------------------------------------------
322
+
323
def _run_single_task(agent, task, max_rpm: int = 8) -> object:
    """Execute one CrewAI task inside a dedicated single-agent Crew.

    A new Crew is built on every call from exactly one agent and one task,
    with caching off and verbose output on. relation_task and insight_task
    are run concurrently through this helper.

    Returns the same *task* object that was passed in, after ``kickoff()``
    has completed.
    """
    crew_options = {
        "agents": [agent],
        "tasks": [task],
        "max_rpm": max_rpm,
        "cache": False,
        "verbose": True,
    }
    Crew(**crew_options).kickoff()
    return task
340
+
341
+
342
+ # ---------------------------------------------------------------------------
343
+ # Output extractor
344
+ # ---------------------------------------------------------------------------
345
+
346
+ def _safe_output(task) -> str:
347
+ """Safely extract raw string output and error diagnostics from a completed CrewAI task."""
348
+ if task is None:
349
+ return ""
350
+
351
+ output_parts = []
352
+ if hasattr(task, "output") and task.output is not None:
353
+ output_parts.append(str(task.output.raw if hasattr(task.output, "raw") else task.output))
354
+
355
+ for attr_name in ("error", "exception", "traceback", "trace"): # best-effort diagnostics
356
+ if hasattr(task, attr_name):
357
+ attr_value = getattr(task, attr_name)
358
+ if attr_value:
359
+ output_parts.append(f"[{attr_name}] {attr_value}")
360
+
361
+ if not output_parts and hasattr(task, "__dict__"):
362
+ # Fallback: include any candidate diagnostic attributes from the task object
363
+ for key in ("status", "state", "result", "message"):
364
+ if hasattr(task, key):
365
+ value = getattr(task, key)
366
+ if value:
367
+ output_parts.append(f"[{key}] {value}")
368
+
369
+ return "\n\n".join(output_parts).strip()
370
+
371
+
372
+ def _run_auto_relation_fallback(df: pd.DataFrame) -> str:
373
+ """
374
+ Generate a fallback relationships text using purely statistical correlations.
375
+ """
376
+ try:
377
+ # Get numeric cols
378
+ num_cols = df.select_dtypes(include=["number"]).columns.tolist()
379
+ cat_cols = df.select_dtypes(exclude=["number"]).columns.tolist()
380
+
381
+ relations = []
382
+
383
+ # 1. Numeric correlation pairs
384
+ if len(num_cols) >= 2:
385
+ corr = df[num_cols].corr().abs()
386
+ unstacked = corr.unstack().sort_values(ascending=False)
387
+ unstacked = unstacked[unstacked.index.get_level_values(0) != unstacked.index.get_level_values(1)]
388
+ added = set()
389
+ for (c1, c2), val in unstacked.items():
390
+ pair = tuple(sorted([c1, c2]))
391
+ if pair not in added:
392
+ added.add(pair)
393
+ relations.append(
394
+ f"- X: {c1} | Y: {c2} | Type: Scatter Plot | Details: High correlation coefficient of {val:.2f} identified between numeric variables."
395
+ )
396
+ if len(relations) >= 3:
397
+ break
398
+
399
+ # 2. Numeric vs categorical pairs
400
+ for cat in cat_cols[:2]:
401
+ for num in num_cols[:2]:
402
+ if len(relations) >= 5:
403
+ break
404
+ relations.append(
405
+ f"- X: {cat} | Y: {num} | Type: Bar Chart | Details: Comparison of average {num} across different values of the categorical column {cat}."
406
+ )
407
+
408
+ if not relations:
409
+ cols = df.columns.tolist()
410
+ for i in range(min(5, len(cols) - 1)):
411
+ relations.append(
412
+ f"- X: {cols[i]} | Y: {cols[i+1]} | Type: Bar Chart | Details: Distribution pattern comparison."
413
+ )
414
+
415
+ return "\n".join(relations)
416
+ except Exception as e:
417
+ return f"- X: {df.columns[0]} | Y: {df.columns[0]} | Type: Bar Chart | Details: Fallback relation due to error: {e}"
418
+
419
+
420
+ def _run_auto_insights_fallback(df: pd.DataFrame, project_goal: str = "") -> str:
421
+ """
422
+ Generate standard fallback consulting report with 5 insights based on dataframe profile.
423
+ """
424
+ n_rows, n_cols = df.shape
425
+ num_cols = df.select_dtypes(include=["number"]).columns.tolist()
426
+ cat_cols = df.select_dtypes(exclude=["number"]).columns.tolist()
427
+
428
+ goal_sentence = f"Addressing the primary objective: '{project_goal}'" if project_goal else "Standard dataset optimization"
429
+
430
+ report = []
431
+ report.append("### Objectives & Goals")
432
+ report.append(f"Execute comprehensive automated analysis. {goal_sentence}.\n")
433
+
434
+ report.append("### Dataset Statistics")
435
+ report.append(f"- Total rows: {n_rows}")
436
+ report.append(f"- Total columns: {n_cols}")
437
+ report.append(f"- Numeric columns: {', '.join(num_cols) if num_cols else 'None'}")
438
+ report.append(f"- Categorical columns: {', '.join(cat_cols) if cat_cols else 'None'}\n")
439
+
440
+ report.append("### Strategic Insights")
441
+
442
+ for i in range(1, 6):
443
+ obs = f"Analyzed distribution and patterns across dataset attributes (index {i})."
444
+ impl = "Variations in these variables indicate potential performance clusters and operational segments."
445
+ strat = "Establish tracking dashboards to monitor column distributions and segment actions accordingly."
446
+ if i == 1 and num_cols:
447
+ obs = f"Descriptive statistical summary of key driver '{num_cols[0]}' shows standard distribution."
448
+ impl = f"Operational variance in '{num_cols[0]}' direct impacts overall workflow efficiency and revenue metrics."
449
+ strat = f"Implement optimization safeguards on '{num_cols[0]}' to minimize operational deviation."
450
+ elif i == 2 and len(num_cols) >= 2:
451
+ obs = f"Correlation analysis shows distinct dependency between '{num_cols[0]}' and '{num_cols[1]}'."
452
+ impl = f"Resource allocation in '{num_cols[0]}' exhibits a lockstep relationship with '{num_cols[1]}' performance."
453
+ strat = f"Balance budget allocations dynamically between '{num_cols[0]}' and '{num_cols[1]}' to maximize ROI."
454
+ elif i == 3 and cat_cols:
455
+ obs = f"Categorical breakdown shows high frequency concentration in column '{cat_cols[0]}'."
456
+ impl = f"Customer or operational focus is heavily centered on '{cat_cols[0]}' dominant values, leaving other areas under-served."
457
+ strat = f"Launch targeted campaigns or resource plans to diversify segments beyond '{cat_cols[0]}' top attributes."
458
+
459
+ report.append(f"{i}. **Observation**: {obs}")
460
+ report.append(f" **Business Implication**: {impl}")
461
+ report.append(f" **Actionable Strategy**: {strat}\n")
462
+
463
+ report.append("### Warnings & Alerts")
464
+ report.append("- [Auto-Healing Fallback Alert]: Active insights agent failed. Showing baseline statistical intelligence insights.")
465
+
466
+ return "\n".join(report)
467
+
468
+
469
+ # ---------------------------------------------------------------------------
470
+ # Main entry point
471
+ # ---------------------------------------------------------------------------
472
+
473
+ def run_crew(
474
+ csv_path: str,
475
+ session_id: str = "default",
476
+ on_progress: Optional[Callable[[str, object], None]] = None,
477
+ selected_tasks: Optional[list[str]] = None,
478
+ deep_analysis: bool = False,
479
+ ) -> dict:
480
+ """
481
+ Run the full multi-agent analysis pipeline on *csv_path*.
482
+
483
+ Pipeline stages
484
+ ---------------
485
+ 1. Clean (sequential) — Data Cleaner agent
486
+ 2. Relations (parallel with Insights) — Relationship Analyst agent
487
+ 2. Insights (parallel with Relations) — BI Analyst agent
488
+ 3. Visualize (sequential, after 1 + 2) — Data Visualizer agent
489
+ 4. Plotly (pure Python, no LLM) — generate_plotly_charts()
490
+
491
+ Parameters
492
+ ----------
493
+ csv_path : Path to the uploaded CSV file.
494
+ session_id : Unique identifier for this session (isolates files/outputs).
495
+ on_progress : Optional callback(stage: str, data: object) called after
496
+ each stage completes. Stages: "profiling", "cleaning",
497
+ "relations", "insights", "visualization", "plotly".
498
+
499
+ Returns
500
+ -------
501
+ dict with keys:
502
+ dataframe, cleaning_steps, relations, insights, code,
503
+ output_dir, plotly_charts
504
+ """
505
+ _cleanup_old_sessions()
506
+
507
+ import time
508
+ from config.metrics_tracker import log_metric
509
+
510
+ start_run = time.time()
511
+ stage_times = {}
512
+ total_tokens = 0
513
+
514
+ def _progress(stage: str, data: object = None) -> None:
515
+ if on_progress:
516
+ on_progress(stage, data)
517
+
518
+ # ── Per-session directories ───────────────────────────────────────────────
519
+ user_home = Path.home() / ".crewlyze"
520
+ data_dir = Path(os.getenv("CREWLYZE_DATA_DIR", str(user_home / "data")))
521
+ outputs_dir_base = Path(os.getenv("CREWLYZE_OUTPUTS_DIR", str(user_home / "outputs")))
522
+
523
+ session_data_dir = data_dir / "sessions" / session_id
524
+ session_output_dir = outputs_dir_base / session_id
525
+ session_data_dir.mkdir(parents=True, exist_ok=True)
526
+ session_output_dir.mkdir(parents=True, exist_ok=True)
527
+
528
+ # Clean up previous visualizations for this session only
529
+ for existing_png in session_output_dir.glob("*.png"):
530
+ existing_png.unlink(missing_ok=True)
531
+
532
+ print("=" * 50)
533
+ print("Crewlyze")
534
+ print("=" * 50)
535
+
536
+ # ── Load original dataset ─────────────────────────────────────────────────
537
+ try:
538
+ df = read_csv_robust(csv_path)
539
+ except FileNotFoundError:
540
+ raise FileNotFoundError(f"Upload not found at: {csv_path}")
541
+
542
+ n_rows, n_cols = df.shape
543
+ print(f"Loaded {n_rows:,} rows, {n_cols} columns")
544
+ cols_preview = ", ".join(df.columns[:10])
545
+ if n_cols > 10:
546
+ cols_preview += "..."
547
+ print(f"Columns: {cols_preview}")
548
+
549
+ # ── Backup original before agents touch it ────────────────────────────────
550
+ original_backup = session_data_dir / "original.csv"
551
+ cleaned_path = session_data_dir / "cleaned.csv"
552
+
553
+ df.to_csv(original_backup, index=False)
554
+ df.to_csv(cleaned_path, index=False)
555
+ print(f"Original backed up → {original_backup}")
556
+ print(f"Working copy created → {cleaned_path}\n")
557
+
558
+ os.environ["CURRENT_SESSION_CSV"] = str(cleaned_path)
559
+ os.environ["CURRENT_SESSION_OUTPUT_DIR"] = str(session_output_dir)
560
+
561
+ # Determine requested task stages and deep analysis mode
562
+ if selected_tasks is None:
563
+ selected_tasks = []
564
+
565
+ env_tasks = selected_tasks or []
566
+ if not env_tasks:
567
+ env_tasks = ["cleaning", "relations", "insights", "visualization"]
568
+ do_cleaning = "cleaning" in env_tasks
569
+ do_relations = "relations" in env_tasks
570
+ do_insights = "insights" in env_tasks
571
+ do_visualization = "visualization" in env_tasks
572
+
573
+ # ── Automatic Data Type Inference and Coercion ────────────────────────────
574
+ coercion_summary = ""
575
+ if do_cleaning:
576
+ print("Running automatic data type coercion ...")
577
+ from tools.dataset_tools import auto_coerce_types
578
+ df_coerced, coercion_actions = auto_coerce_types(df)
579
+ if coercion_actions:
580
+ print("Data type coercion completed:")
581
+ coercion_lines = []
582
+ for action in coercion_actions:
583
+ print(f" - {action}")
584
+ coercion_lines.append(f"- {action}")
585
+ coercion_summary = "\n".join(coercion_lines)
586
+ # Save the coerced dataframe to cleaned_path
587
+ df_coerced.to_csv(cleaned_path, index=False)
588
+ # Update our in-memory df and shapes
589
+ df = df_coerced
590
+ n_rows, n_cols = df.shape
591
+ else:
592
+ print("No type conflicts detected.")
593
+
594
+ # ── Pre-compute dataset profile (eliminates 6-8 agent tool-call round-trips)
595
+ # Large files are sampled; the cleaner still operates on the full dataset.
596
+ profile_max_rows = 5000 if n_rows > 10_000 else n_rows
597
+ if n_rows > 10_000:
598
+ print(f"Large file detected ({n_rows:,} rows). "
599
+ f"Profiling on {profile_max_rows:,}-row sample ...")
600
+ print("Building dataset profile ...")
601
+ start_prof = time.time()
602
+ profile = build_dataset_profile(str(cleaned_path), max_rows=profile_max_rows)
603
+ stage_times["profiling"] = time.time() - start_prof
604
+ _progress("profiling", profile)
605
+ print("Profile ready.\n")
606
+
607
+ if not deep_analysis:
608
+ from config.context import current_deep_analysis
609
+ deep_analysis = current_deep_analysis.get()
610
+
611
+ # Load goal, title, and existing tweaked relations if available
612
+ project_goal = ""
613
+ report_title = ""
614
+ existing_relations = ""
615
+ try:
616
+ import json
617
+ meta_path = session_data_dir / "metadata.json"
618
+ if meta_path.exists():
619
+ with open(meta_path, "r", encoding="utf-8") as f:
620
+ meta = json.load(f)
621
+ project_goal = meta.get("optimized_goal") or meta.get("goal") or ""
622
+ report_title = meta.get("report_title") or ""
623
+
624
+ # Load tweaked relations from results.json
625
+ results_path = session_data_dir / "results.json"
626
+ if results_path.exists():
627
+ with open(results_path, "r", encoding="utf-8") as f:
628
+ res_data = json.load(f)
629
+ existing_relations = res_data.get("relations") or ""
630
+ except Exception as e:
631
+ print(f"Warning: Could not read metadata or results cache: {e}")
632
+
633
+ # ── Build fresh agents + tasks ────────────────────────────────────────────
634
+ agents, tasks = make_pipeline(
635
+ session_id,
636
+ profile=profile,
637
+ selected_tasks=env_tasks,
638
+ deep_analysis=deep_analysis,
639
+ project_goal=project_goal,
640
+ report_title=report_title,
641
+ existing_relations=existing_relations,
642
+ coercion_summary=coercion_summary,
643
+ )
644
+ # tasks = [clean_task, relation_task, insight_task, visualize_task]
645
+
646
+ # ════════════════════════════════════════════════════════════════════════
647
+ # STAGE 1 — Clean (sequential, must run before anything else)
648
+ # ════════════════════════════════════════════════════════════════════════
649
+ clean_output = "Data cleaning was skipped by user selection."
650
+
651
+ if do_cleaning:
652
+ print("\n[Stage 1/4] Running Data Cleaner ...")
653
+ start_clean_stage = time.time()
654
+ clean_crew = Crew(
655
+ agents=[agents[0]],
656
+ tasks=[tasks[0]],
657
+ max_rpm=15,
658
+ cache=True,
659
+ verbose=True,
660
+ )
661
+ try:
662
+ clean_crew.kickoff()
663
+ clean_output = _safe_output(tasks[0])
664
+ try:
665
+ if hasattr(clean_crew, "usage_metrics") and clean_crew.usage_metrics:
666
+ total_tokens += clean_crew.usage_metrics.get("total_tokens", 0)
667
+ except Exception:
668
+ pass
669
+ except Exception as exc:
670
+ print(f"Cleaning error: {exc}. Activating auto-healing fallback...")
671
+ traceback.print_exc()
672
+ clean_output = (
673
+ f"Data Cleaner encountered an error: {exc}.\n"
674
+ "- Auto-healing fallback: Skipped active code execution and used raw data copy to prevent pipeline failure."
675
+ )
676
+
677
+ stage_times["cleaning"] = time.time() - start_clean_stage
678
+ _progress("cleaning", clean_output)
679
+ print("[Stage 1/4] Cleaning complete.\n")
680
+ else:
681
+ print("\n[Stage 1/4] Skipping Data Cleaner (user selection).\n")
682
+ _progress("cleaning", clean_output)
683
+
684
+ # ════════════════════════════════════════════════════════════════════════
685
+ # STAGE 2 — Relations + Insights (PARALLEL)
686
+ # ════════════════════════════════════════════════════════════════════════
687
+ relation_output = "Relationship mapping was skipped by user selection."
688
+ insights_output = "Business insights generation was skipped by user selection."
689
+
690
+ if do_relations or do_insights:
691
+ print("[Stage 2/4] Running Relation Analyst + BI Analyst ...")
692
+ start_rel_ins_stage = time.time()
693
+
694
+ if do_relations and do_insights:
695
+ import contextvars
696
+ ctx1 = contextvars.copy_context()
697
+ ctx2 = contextvars.copy_context()
698
+
699
+ def run_rel_safe():
700
+ try:
701
+ res_task = _run_single_task(agents[1], tasks[1], 8)
702
+ return _safe_output(res_task)
703
+ except Exception as e:
704
+ print(f"Relations Agent error: {e}. Activating auto-healing fallback...")
705
+ traceback.print_exc()
706
+ return _run_auto_relation_fallback(df)
707
+
708
+ def run_ins_safe():
709
+ try:
710
+ res_task = _run_single_task(agents[2], tasks[2], 8)
711
+ return _safe_output(res_task)
712
+ except Exception as e:
713
+ print(f"Insights Agent error: {e}. Activating auto-healing fallback...")
714
+ traceback.print_exc()
715
+ return _run_auto_insights_fallback(df, project_goal)
716
+
717
+ try:
718
+ with ThreadPoolExecutor(max_workers=2, thread_name_prefix="crew") as executor:
719
+ rel_future = executor.submit(ctx1.run, run_rel_safe)
720
+ ins_future = executor.submit(ctx2.run, run_ins_safe)
721
+ relation_output = rel_future.result()
722
+ insights_output = ins_future.result()
723
+ except Exception as exc:
724
+ print(f"Parallel execution collapsed: {exc}. Running fallbacks...")
725
+ traceback.print_exc()
726
+ if do_relations:
727
+ relation_output = _run_auto_relation_fallback(df)
728
+ if do_insights:
729
+ insights_output = _run_auto_insights_fallback(df, project_goal)
730
+ else:
731
+ if do_relations:
732
+ try:
733
+ rel_crew = Crew(agents=[agents[1]], tasks=[tasks[1]], max_rpm=15, cache=True, verbose=True)
734
+ rel_crew.kickoff()
735
+ relation_output = _safe_output(tasks[1])
736
+ except Exception as e:
737
+ print(f"Relations Agent error: {e}. Activating auto-healing fallback...")
738
+ traceback.print_exc()
739
+ relation_output = _run_auto_relation_fallback(df)
740
+ if do_insights:
741
+ try:
742
+ ins_crew = Crew(agents=[agents[2]], tasks=[tasks[2]], max_rpm=15, cache=True, verbose=True)
743
+ ins_crew.kickoff()
744
+ insights_output = _safe_output(tasks[2])
745
+ except Exception as e:
746
+ print(f"Insights Agent error: {e}. Activating auto-healing fallback...")
747
+ traceback.print_exc()
748
+ insights_output = _run_auto_insights_fallback(df, project_goal)
749
+
750
+ _progress("relations", relation_output)
751
+ _progress("insights", insights_output)
752
+ stage_times["relations_insights"] = time.time() - start_rel_ins_stage
753
+ print("[Stage 2/4] Relations + Insights complete.\n")
754
+
755
+ # ════════════════════════════════════════════════════════════════════════
756
+ # STAGE 3 — Visualize (sequential, receives actual outputs as context)
757
+ # ════════════════════════════════════════════════════════════════════════
758
+ visualize_output = "Visualization was skipped by user selection."
759
+
760
+ if do_visualization:
761
+ print("[Stage 3/4] Running Data Visualizer ...")
762
+ start_viz_stage = time.time()
763
+
764
+ # Inject relation + insight outputs directly into the task description
765
+ # so the visualizer has full context without relying on CrewAI's
766
+ # cross-crew context= mechanism.
767
+ viz_task = tasks[3]
768
+ viz_task.description += (
769
+ f"\n\nRELATIONSHIPS TO VISUALIZE:\n{relation_output}"
770
+ f"\n\nKEY INSIGHTS FOR CONTEXT:\n{insights_output}"
771
+ )
772
+
773
+ viz_crew = Crew(
774
+ agents=[agents[3]],
775
+ tasks=[viz_task],
776
+ max_rpm=15,
777
+ cache=True,
778
+ verbose=True,
779
+ )
780
+
781
+ try:
782
+ viz_crew.kickoff()
783
+ visualize_output = _safe_output(viz_task)
784
+ try:
785
+ if hasattr(viz_crew, "usage_metrics") and viz_crew.usage_metrics:
786
+ total_tokens += viz_crew.usage_metrics.get("total_tokens", 0)
787
+ except Exception:
788
+ pass
789
+ except Exception as exc:
790
+ print(f"Visualization Agent error: {exc}. Activating auto-healing visualizer fallback...")
791
+ traceback.print_exc()
792
+ visualize_output = f"Visualization Agent encountered error: {exc}"
793
+
794
+ # Auto-healing fallback check: if no PNG charts were successfully saved
795
+ png_files = list(session_output_dir.glob("*.png"))
796
+ if not png_files:
797
+ print("No PNG charts generated by agent. Running relation-aware visualizer fallback...")
798
+ fallback_msg = _run_auto_visualizer_fallback(
799
+ cleaned_path, session_output_dir, relations_text=relation_output
800
+ )
801
+ visualize_output = f"{visualize_output}\n\n[Auto-Healing Fallback Status]: {fallback_msg}"
802
+ print(fallback_msg)
803
+ else:
804
+ print("[Stage 3/4] Skipping Data Visualizer (user selection).\n")
805
+
806
+ _progress("visualization", visualize_output)
807
+ stage_times["visualization"] = time.time() - start_viz_stage
808
+ print("[Stage 3/4] Visualization complete.\n")
809
+
810
+ # ── Generate interactive Plotly charts (pure Python, no LLM) ─────────────
811
+ print("[Stage 4/4] Building interactive Plotly charts ...")
812
+ start_plotly_stage = time.time()
813
+ plotly_charts = generate_plotly_charts(
814
+ csv_path=str(cleaned_path),
815
+ relations_text=relation_output,
816
+ )
817
+ _progress("plotly", plotly_charts)
818
+ stage_times["plotly"] = time.time() - start_plotly_stage
819
+ print(f"Generated {len(plotly_charts)} interactive chart(s).\n")
820
+
821
+ # ── Reload cleaned dataframe ──────────────────────────────────────────────
822
+ try:
823
+ cleaned_df = read_csv_robust(cleaned_path)
824
+ except Exception:
825
+ print("WARNING: Could not load cleaned CSV. Falling back to original data.")
826
+ cleaned_df = df
827
+
828
+ total_time = time.time() - start_run
829
+ try:
830
+ dataset_name = Path(csv_path).name
831
+ est_cost = (total_tokens / 1_000_000) * 0.15 if total_tokens else 0.0
832
+ log_metric(
833
+ session_id=session_id,
834
+ dataset_name=dataset_name,
835
+ rows=n_rows,
836
+ cols=n_cols,
837
+ stages=stage_times,
838
+ total_time=total_time,
839
+ success=True,
840
+ token_usage=total_tokens,
841
+ estimated_cost=est_cost
842
+ )
843
+ except Exception as e:
844
+ print(f"Error logging metric: {e}")
845
+
846
+ return {
847
+ "dataframe": cleaned_df,
848
+ "cleaning_steps": clean_output,
849
+ "relations": relation_output,
850
+ "insights": insights_output,
851
+ "code": visualize_output,
852
+ "output_dir": str(session_output_dir),
853
+ "plotly_charts": plotly_charts,
854
+ }
855
+
856
+
857
+ # ---------------------------------------------------------------------------
858
+ # CLI entry point
859
+ # ---------------------------------------------------------------------------
860
+
861
if __name__ == "__main__":
    # Interactive CLI run: ask for a CSV path, fall back to the sample file
    # under ./data when the user just presses Enter, then run the pipeline
    # with the fixed session id "cli".
    default_path = (Path.cwd() / "data" / "TB_Burden_Country.csv").resolve()
    cli_prompt = f"Enter the path to your CSV file (default: {default_path.name}): "
    entered = input(cli_prompt)
    # An empty answer is falsy, so the resolved default path is used instead.
    path = entered if entered else str(default_path)
    report = run_crew(path, session_id="cli")
    if report:
        # NOTE(review): indentation is lost in this view; the credit lines
        # are assumed to sit under `if report:` together with the completion
        # message — confirm against the original file.
        for banner_line in (
            "\nAnalysis Complete.",
            "Crewlyze",
            "Prithiv.A.K Sebin.S Sowmiyan.S",
        ):
            print(banner_line)