npm - crewlyze - Versions diffs - 3.1.0 - Mend

crewlyze 3.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (48) hide show

package/.dockerignore +12 -0
package/.gitattributes +2 -0
package/CHANGELOG.md +86 -0
package/Dockerfile +21 -0
package/LICENSE +21 -0
package/README.md +139 -0
package/USAGE.md +106 -0
package/agents/__init__.py +0 -0
package/agents/cleaner.py +38 -0
package/agents/insights.py +44 -0
package/agents/relation.py +36 -0
package/agents/visualizer.py +41 -0
package/assets/badge_crewai.svg +4 -0
package/assets/badge_matplotlib.svg +4 -0
package/assets/badge_ollama.svg +4 -0
package/assets/badge_pandas.svg +4 -0
package/assets/badge_seaborn.svg +4 -0
package/assets/branding_image.png +0 -0
package/assets/complete_workflow.svg +216 -0
package/assets/favicon.png +0 -0
package/assets/logo.png +0 -0
package/assets/stars.svg +12 -0
package/bin/crewlyze.js +79 -0
package/config/README.md +129 -0
package/config/__init__.py +1 -0
package/config/context.py +16 -0
package/config/llm_config.py +300 -0
package/config/metrics_tracker.py +70 -0
package/crew.py +870 -0
package/crewlyze-3.1.0.tgz +0 -0
package/fix_syntax.py +54 -0
package/main.py +1279 -0
package/package.json +22 -0
package/pyproject.toml +32 -0
package/requirements.txt +33 -0
package/tools/__init__.py +0 -0
package/tools/dataset_tools.py +803 -0
package/ui/__init__.py +3 -0
package/ui/copilot.py +200 -0
package/ui/export.py +800 -0
package/update_appjs.py +54 -0
package/update_llm.py +21 -0
package/update_main.py +20 -0
package/web/app.js +3142 -0
package/web/index.html +1105 -0
package/web/style.css +2561 -0
package/workflows/__init__.py +0 -0
package/workflows/pipeline.py +254 -0

package/crew.py ADDED Viewed

@@ -0,0 +1,870 @@
+# Crewlyze
+# Copyright (c) 2025 Sowmiyan S
+# Licensed under the MIT License
+"""
+Main crew orchestration module.
+Performance improvements in this version:
+- build_dataset_profile() computes a rich data summary before agents start,
+  eliminating 6-8 LLM tool-call round-trips across the pipeline.
+- Large files (> 10 000 rows) are sampled to 5 000 rows for profiling;
+  the cleaner still operates on the full dataset.
+- relation_task and insight_task run in PARALLEL via ThreadPoolExecutor,
+  saving the time of one full sequential task slot.
+- visualize_task receives the actual relation + insight outputs injected
+  into its description (rather than relying on CrewAI's context= mechanism
+  which requires all tasks to live in the same Crew instance).
+- on_progress callback allows the caller (app.py) to surface intermediate
+  results in the UI as each stage completes.
+"""
+import logging
+import os
+import shutil
+import sys
+import time
+import traceback
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from pathlib import Path
+from typing import Callable, Optional
+import pandas as pd
+from dotenv import load_dotenv
+load_dotenv()
+# Suppress noisy loggers
+logging.getLogger("urllib3").setLevel(logging.ERROR)
+logging.getLogger("opentelemetry").setLevel(logging.ERROR)
+# Disable CrewAI telemetry
+os.environ["CREWAI_TELEMETRY_OPT_OUT"] = "true"
+os.environ["OTEL_SDK_DISABLED"]        = "true"
+# Monkey patch crewai caching to avoid Nvidia NIM / LiteLLM validation errors
+try:
+    import crewai.llms.cache as _crewai_cache
+    _crewai_cache.mark_cache_breakpoint = lambda msg: msg
+except Exception:
+    pass
+try:
+    from crewai import Crew
+except ImportError as exc:
+    print(f"ERROR: {exc}\nRun: pip install crewai")
+    sys.exit(1)
+from tools.dataset_tools import build_dataset_profile, generate_plotly_charts, read_csv_robust
+from workflows.pipeline import make_pipeline
+# ---------------------------------------------------------------------------
+# Visualizer Fallback Generator (Pure Python, no LLM)
+# ---------------------------------------------------------------------------
+def _run_auto_visualizer_fallback(csv_path: Path, output_dir: Path, relations_text: str = "") -> str:
+    """
+    Pure Python statistical visualizer fallback that runs when the agent fails to save PNGs.
+    Uses discovered relation pairs first (relation-aware), then falls back to generic charts.
+    Creates structured, premium plots with consistent layout styles.
+    """
+    import re
+    import pandas as pd
+    import matplotlib
+    matplotlib.use('Agg')
+    import matplotlib.pyplot as plt
+    import seaborn as sns
+    try:
+        df = read_csv_robust(csv_path)
+        output_dir.mkdir(parents=True, exist_ok=True)
+        numeric_cols = df.select_dtypes(include=["number"]).columns.tolist()
+        categorical_cols = df.select_dtypes(include=["object", "category"]).columns.tolist()
+        generated = []
+        # Dark-themed premium style
+        sns.set_theme(style="darkgrid", palette="deep")
+        BG_DARK = "#0f172a"
+        BG_CARD = "#1e293b"
+        TEXT_COLOR = "#e2e8f0"
+        GRID_COLOR = "#334155"
+        colors = ["#818cf8", "#22d3ee", "#f472b6", "#34d399", "#fb923c"]
+        def _apply_dark_style(fig, ax_list):
+            fig.patch.set_facecolor(BG_DARK)
+            for ax in (ax_list if isinstance(ax_list, list) else [ax_list]):
+                ax.set_facecolor(BG_CARD)
+                ax.tick_params(colors=TEXT_COLOR)
+                ax.xaxis.label.set_color(TEXT_COLOR)
+                ax.yaxis.label.set_color(TEXT_COLOR)
+                ax.title.set_color(TEXT_COLOR)
+                for spine in ax.spines.values():
+                    spine.set_edgecolor(GRID_COLOR)
+                ax.grid(color=GRID_COLOR, linewidth=0.5)
+        # ── PHASE 1: Parse relation pairs from agent output ────────────────────
+        relation_pairs = []
+        if relations_text:
+            for line in relations_text.split("\n"):
+                line = line.strip()
+                if not (line and "|" in line and "X:" in line):
+                    continue
+                try:
+                    parts = [p.strip() for p in line.lstrip("- ").split("|")]
+                    x_col = parts[0].split(":", 1)[1].strip()
+                    y_col = parts[1].split(":", 1)[1].strip()
+                    ptype = parts[2].split(":", 1)[1].strip().lower() if len(parts) > 2 else "scatter"
+                    if x_col in df.columns and y_col in df.columns and x_col != y_col:
+                        relation_pairs.append((x_col, y_col, ptype))
+                except (IndexError, ValueError):
+                    continue
+        # ── PHASE 2: Generate relation-based charts ────────────────────────────
+        for i, (x_col, y_col, ptype) in enumerate(relation_pairs[:5]):
+            color = colors[i % len(colors)]
+            try:
+                sample = df[[x_col, y_col]].dropna().head(2000)
+                if sample.empty:
+                    continue
+                fig, ax = plt.subplots(figsize=(10, 6))
+                x_is_num = pd.api.types.is_numeric_dtype(df[x_col])
+                y_is_num = pd.api.types.is_numeric_dtype(df[y_col])
+                if "bar" in ptype:
+                    agg = sample.groupby(x_col)[y_col].mean().reset_index().head(20)
+                    sns.barplot(data=agg, x=x_col, y=y_col, color=color, ax=ax)
+                    plt.xticks(rotation=40, ha="right", color=TEXT_COLOR)
+                    title = f"{y_col} by {x_col}"
+                elif "line" in ptype:
+                    sns.lineplot(data=sample.sort_values(x_col), x=x_col, y=y_col, color=color, ax=ax)
+                    title = f"{y_col} over {x_col}"
+                elif "box" in ptype:
+                    if not x_is_num:
+                        top_cats = df[x_col].value_counts().head(8).index
+                        sample = sample[sample[x_col].isin(top_cats)]
+                    sns.boxplot(data=sample, x=x_col if not x_is_num else None,
+                                y=y_col, color=color, ax=ax)
+                    title = f"Distribution of {y_col}"
+                elif "hist" in ptype:
+                    sns.histplot(sample[x_col].dropna(), kde=True, color=color, ax=ax)
+                    title = f"Distribution of {x_col}"
+                else:
+                    if x_is_num and y_is_num:
+                        sns.scatterplot(data=sample, x=x_col, y=y_col,
+                                        color=color, alpha=0.7, ax=ax)
+                    else:
+                        top_cats = df[x_col].value_counts().head(15).index
+                        sub = sample[sample[x_col].isin(top_cats)]
+                        sns.boxplot(data=sub, x=x_col, y=y_col, color=color, ax=ax)
+                        plt.xticks(rotation=40, ha="right", color=TEXT_COLOR)
+                    title = f"{x_col} vs {y_col} Relationship"
+                ax.set_title(title, fontsize=13, fontweight="bold", pad=14)
+                _apply_dark_style(fig, ax)
+                plt.tight_layout()
+                safe_name = re.sub(r"[^\w]+", "_", f"relation_{x_col}_vs_{y_col}").lower()
+                dest = output_dir / f"{safe_name}.png"
+                plt.savefig(dest, dpi=150, bbox_inches="tight", facecolor=BG_DARK)
+                plt.close()
+                generated.append(dest.name)
+                print(f"Relation chart saved: {dest.name}")
+            except Exception as chart_err:
+                print(f"Relation chart error ({x_col} vs {y_col}): {chart_err}")
+                plt.close()
+                continue
+        # ── PHASE 3: Generic fallback charts if no relation charts were made ───
+        if not generated:
+            # Correlation heatmap
+            if len(numeric_cols) >= 2:
+                try:
+                    fig, ax = plt.subplots(figsize=(10, 8))
+                    corr = df[numeric_cols].corr()
+                    sns.heatmap(corr, annot=True, cmap="coolwarm", fmt=".2f",
+                                square=True, cbar_kws={"shrink": .8}, ax=ax,
+                                annot_kws={"color": TEXT_COLOR})
+                    ax.set_title("Correlation Matrix", fontsize=14, fontweight="bold", pad=14)
+                    _apply_dark_style(fig, ax)
+                    plt.tight_layout()
+                    dest = output_dir / "correlation_matrix.png"
+                    plt.savefig(dest, dpi=150, bbox_inches="tight", facecolor=BG_DARK)
+                    plt.close()
+                    generated.append(dest.name)
+                except Exception:
+                    plt.close()
+            # Distribution of first numeric col
+            if numeric_cols:
+                try:
+                    col = numeric_cols[0]
+                    fig, ax = plt.subplots(figsize=(10, 6))
+                    sns.histplot(df[col].dropna(), kde=True, color=colors[0], ax=ax)
+                    ax.set_title(f"Distribution of {col}", fontsize=13, fontweight="bold", pad=14)
+                    _apply_dark_style(fig, ax)
+                    plt.tight_layout()
+                    dest = output_dir / f"distribution_{col}.png"
+                    plt.savefig(dest, dpi=150, bbox_inches="tight", facecolor=BG_DARK)
+                    plt.close()
+                    generated.append(dest.name)
+                except Exception:
+                    plt.close()
+            # First scatter pair
+            if len(numeric_cols) >= 2:
+                try:
+                    x, y = numeric_cols[0], numeric_cols[1]
+                    fig, ax = plt.subplots(figsize=(10, 6))
+                    sns.scatterplot(data=df.head(2000), x=x, y=y, color=colors[1], alpha=0.7, ax=ax)
+                    ax.set_title(f"{x} vs {y} Relationship", fontsize=13, fontweight="bold", pad=14)
+                    _apply_dark_style(fig, ax)
+                    plt.tight_layout()
+                    dest = output_dir / f"scatter_{x}_vs_{y}.png"
+                    plt.savefig(dest, dpi=150, bbox_inches="tight", facecolor=BG_DARK)
+                    plt.close()
+                    generated.append(dest.name)
+                except Exception:
+                    plt.close()
+            # Categorical bar
+            if categorical_cols and numeric_cols:
+                try:
+                    cat, num = categorical_cols[0], numeric_cols[0]
+                    top_cats = df[cat].value_counts().head(10).index
+                    sub_df = df[df[cat].isin(top_cats)]
+                    fig, ax = plt.subplots(figsize=(10, 6))
+                    sns.barplot(data=sub_df, x=cat, y=num, errorbar=None, color=colors[2], ax=ax)
+                    ax.set_title(f"Average {num} by {cat} (Top 10)", fontsize=13, fontweight="bold", pad=14)
+                    plt.xticks(rotation=45, ha="right", color=TEXT_COLOR)
+                    _apply_dark_style(fig, ax)
+                    plt.tight_layout()
+                    dest = output_dir / f"bar_{cat}_vs_{num}.png"
+                    plt.savefig(dest, dpi=150, bbox_inches="tight", facecolor=BG_DARK)
+                    plt.close()
+                    generated.append(dest.name)
+                except Exception:
+                    plt.close()
+        return f"Generated {len(generated)} chart(s) ({len(relation_pairs)} from relations, rest generic)."
+    except Exception as e:
+        return f"Fallback visualization failed: {e}"
+# ---------------------------------------------------------------------------
+# Session cleanup helper
+# ---------------------------------------------------------------------------
+def _cleanup_old_sessions(max_age_hours: int = 24) -> None:
+    """Remove session directories older than *max_age_hours*.
+    Also enforces a strict disk quota limit: if the total combined size of sessions and
+    outputs exceeds 1.0 GB, it prunes the oldest folders until the size is under 400 MB.
+    """
+    user_home = Path.home() / ".crewlyze"
+    data_dir = Path(os.getenv("CREWLYZE_DATA_DIR", str(user_home / "data")))
+    sessions_root = data_dir / "sessions"
+    outputs_root  = Path(os.getenv("CREWLYZE_OUTPUTS_DIR", str(user_home / "outputs")))
+    # 1. Clean based on age
+    for root in (sessions_root, outputs_root):
+        if not root.exists():
+            continue
+        cutoff = time.time() - max_age_hours * 3600
+        for session_dir in root.iterdir():
+            if session_dir.is_dir():
+                try:
+                    if session_dir.stat().st_mtime < cutoff:
+                        shutil.rmtree(session_dir, ignore_errors=True)
+                except OSError:
+                    pass
+    # 2. Clean based on disk quota (max 1.0 GB combined)
+    def get_dir_size(path: Path) -> int:
+        if not path.exists():
+            return 0
+        return sum(f.stat().st_size for f in path.glob('**/*') if f.is_file())
+    total_size = get_dir_size(sessions_root) + get_dir_size(outputs_root)
+    max_quota_bytes = 1000 * 1024 * 1024  # 1.0 GB
+    target_quota_bytes = 400 * 1024 * 1024 # 400 MB
+    if total_size > max_quota_bytes:
+        print(f"Disk quota exceeded: {total_size / (1024*1024):.1f}MB. Pruning oldest sessions...")
+        # Collect all session subfolders and outputs with their mtimes
+        subfolders = []
+        for root in (sessions_root, outputs_root):
+            if root.exists():
+                for folder in root.iterdir():
+                    if folder.is_dir():
+                        subfolders.append((folder, folder.stat().st_mtime))
+        # Sort oldest first
+        subfolders.sort(key=lambda x: x[1])
+        for folder, _ in subfolders:
+            try:
+                shutil.rmtree(folder, ignore_errors=True)
+                # Recalculate
+                total_size = get_dir_size(sessions_root) + get_dir_size(outputs_root)
+                if total_size <= target_quota_bytes:
+                    print(f"Disk footprint successfully reduced to {total_size / (1024*1024):.1f}MB.")
+                    break
+            except Exception as e:
+                print(f"Error pruning session folder {folder}: {e}")
+# ---------------------------------------------------------------------------
+# Parallel task execution helper
+# ---------------------------------------------------------------------------
+def _run_single_task(agent, task, max_rpm: int = 8) -> object:
+    """Run a single CrewAI task in its own isolated mini-Crew.
+    Used to execute relation_task and insight_task concurrently.
+    Each call creates a separate Crew instance — no shared state.
+    Returns the task object (with .output populated by kickoff).
+    """
+    mini = Crew(
+        agents=[agent],
+        tasks=[task],
+        max_rpm=max_rpm,
+        cache=False,
+        verbose=True,
+    )
+    mini.kickoff()
+    return task
+# ---------------------------------------------------------------------------
+# Output extractor
+# ---------------------------------------------------------------------------
+def _safe_output(task) -> str:
+    """Safely extract raw string output and error diagnostics from a completed CrewAI task."""
+    if task is None:
+        return ""
+    output_parts = []
+    if hasattr(task, "output") and task.output is not None:
+        output_parts.append(str(task.output.raw if hasattr(task.output, "raw") else task.output))
+    for attr_name in ("error", "exception", "traceback", "trace"):  # best-effort diagnostics
+        if hasattr(task, attr_name):
+            attr_value = getattr(task, attr_name)
+            if attr_value:
+                output_parts.append(f"[{attr_name}] {attr_value}")
+    if not output_parts and hasattr(task, "__dict__"):
+        # Fallback: include any candidate diagnostic attributes from the task object
+        for key in ("status", "state", "result", "message"):
+            if hasattr(task, key):
+                value = getattr(task, key)
+                if value:
+                    output_parts.append(f"[{key}] {value}")
+    return "\n\n".join(output_parts).strip()
+def _run_auto_relation_fallback(df: pd.DataFrame) -> str:
+    """
+    Generate a fallback relationships text using purely statistical correlations.
+    """
+    try:
+        # Get numeric cols
+        num_cols = df.select_dtypes(include=["number"]).columns.tolist()
+        cat_cols = df.select_dtypes(exclude=["number"]).columns.tolist()
+        relations = []
+        # 1. Numeric correlation pairs
+        if len(num_cols) >= 2:
+            corr = df[num_cols].corr().abs()
+            unstacked = corr.unstack().sort_values(ascending=False)
+            unstacked = unstacked[unstacked.index.get_level_values(0) != unstacked.index.get_level_values(1)]
+            added = set()
+            for (c1, c2), val in unstacked.items():
+                pair = tuple(sorted([c1, c2]))
+                if pair not in added:
+                    added.add(pair)
+                    relations.append(
+                        f"- X: {c1} | Y: {c2} | Type: Scatter Plot | Details: High correlation coefficient of {val:.2f} identified between numeric variables."
+                    )
+                    if len(relations) >= 3:
+                        break
+        # 2. Numeric vs categorical pairs
+        for cat in cat_cols[:2]:
+            for num in num_cols[:2]:
+                if len(relations) >= 5:
+                    break
+                relations.append(
+                    f"- X: {cat} | Y: {num} | Type: Bar Chart | Details: Comparison of average {num} across different values of the categorical column {cat}."
+                )
+        if not relations:
+            cols = df.columns.tolist()
+            for i in range(min(5, len(cols) - 1)):
+                relations.append(
+                    f"- X: {cols[i]} | Y: {cols[i+1]} | Type: Bar Chart | Details: Distribution pattern comparison."
+                )
+        return "\n".join(relations)
+    except Exception as e:
+        return f"- X: {df.columns[0]} | Y: {df.columns[0]} | Type: Bar Chart | Details: Fallback relation due to error: {e}"
+def _run_auto_insights_fallback(df: pd.DataFrame, project_goal: str = "") -> str:
+    """
+    Generate standard fallback consulting report with 5 insights based on dataframe profile.
+    """
+    n_rows, n_cols = df.shape
+    num_cols = df.select_dtypes(include=["number"]).columns.tolist()
+    cat_cols = df.select_dtypes(exclude=["number"]).columns.tolist()
+    goal_sentence = f"Addressing the primary objective: '{project_goal}'" if project_goal else "Standard dataset optimization"
+    report = []
+    report.append("### Objectives & Goals")
+    report.append(f"Execute comprehensive automated analysis. {goal_sentence}.\n")
+    report.append("### Dataset Statistics")
+    report.append(f"- Total rows: {n_rows}")
+    report.append(f"- Total columns: {n_cols}")
+    report.append(f"- Numeric columns: {', '.join(num_cols) if num_cols else 'None'}")
+    report.append(f"- Categorical columns: {', '.join(cat_cols) if cat_cols else 'None'}\n")
+    report.append("### Strategic Insights")
+    for i in range(1, 6):
+        obs = f"Analyzed distribution and patterns across dataset attributes (index {i})."
+        impl = "Variations in these variables indicate potential performance clusters and operational segments."
+        strat = "Establish tracking dashboards to monitor column distributions and segment actions accordingly."
+        if i == 1 and num_cols:
+            obs = f"Descriptive statistical summary of key driver '{num_cols[0]}' shows standard distribution."
+            impl = f"Operational variance in '{num_cols[0]}' direct impacts overall workflow efficiency and revenue metrics."
+            strat = f"Implement optimization safeguards on '{num_cols[0]}' to minimize operational deviation."
+        elif i == 2 and len(num_cols) >= 2:
+            obs = f"Correlation analysis shows distinct dependency between '{num_cols[0]}' and '{num_cols[1]}'."
+            impl = f"Resource allocation in '{num_cols[0]}' exhibits a lockstep relationship with '{num_cols[1]}' performance."
+            strat = f"Balance budget allocations dynamically between '{num_cols[0]}' and '{num_cols[1]}' to maximize ROI."
+        elif i == 3 and cat_cols:
+            obs = f"Categorical breakdown shows high frequency concentration in column '{cat_cols[0]}'."
+            impl = f"Customer or operational focus is heavily centered on '{cat_cols[0]}' dominant values, leaving other areas under-served."
+            strat = f"Launch targeted campaigns or resource plans to diversify segments beyond '{cat_cols[0]}' top attributes."
+        report.append(f"{i}. **Observation**: {obs}")
+        report.append(f"   **Business Implication**: {impl}")
+        report.append(f"   **Actionable Strategy**: {strat}\n")
+    report.append("### Warnings & Alerts")
+    report.append("- [Auto-Healing Fallback Alert]: Active insights agent failed. Showing baseline statistical intelligence insights.")
+    return "\n".join(report)
+# ---------------------------------------------------------------------------
+# Main entry point
+# ---------------------------------------------------------------------------
+def run_crew(
+    csv_path:    str,
+    session_id:  str = "default",
+    on_progress: Optional[Callable[[str, object], None]] = None,
+    selected_tasks: Optional[list[str]] = None,
+    deep_analysis: bool = False,
+) -> dict:
+    """
+    Run the full multi-agent analysis pipeline on *csv_path*.
+    Pipeline stages
+    ---------------
+    1. Clean      (sequential)                     — Data Cleaner agent
+    2. Relations  (parallel with Insights)         — Relationship Analyst agent
+    2. Insights   (parallel with Relations)        — BI Analyst agent
+    3. Visualize  (sequential, after 1 + 2)        — Data Visualizer agent
+    4. Plotly     (pure Python, no LLM)            — generate_plotly_charts()
+    Parameters
+    ----------
+    csv_path    : Path to the uploaded CSV file.
+    session_id  : Unique identifier for this session (isolates files/outputs).
+    on_progress : Optional callback(stage: str, data: object) called after
+                  each stage completes. Stages: "profiling", "cleaning",
+                  "relations", "insights", "visualization", "plotly".
+    Returns
+    -------
+    dict with keys:
+        dataframe, cleaning_steps, relations, insights, code,
+        output_dir, plotly_charts
+    """
+    _cleanup_old_sessions()
+    import time
+    from config.metrics_tracker import log_metric
+    start_run = time.time()
+    stage_times = {}
+    total_tokens = 0
+    def _progress(stage: str, data: object = None) -> None:
+        if on_progress:
+            on_progress(stage, data)
+    # ── Per-session directories ───────────────────────────────────────────────
+    user_home = Path.home() / ".crewlyze"
+    data_dir = Path(os.getenv("CREWLYZE_DATA_DIR", str(user_home / "data")))
+    outputs_dir_base = Path(os.getenv("CREWLYZE_OUTPUTS_DIR", str(user_home / "outputs")))
+    session_data_dir   = data_dir / "sessions" / session_id
+    session_output_dir = outputs_dir_base / session_id
+    session_data_dir.mkdir(parents=True, exist_ok=True)
+    session_output_dir.mkdir(parents=True, exist_ok=True)
+    # Clean up previous visualizations for this session only
+    for existing_png in session_output_dir.glob("*.png"):
+        existing_png.unlink(missing_ok=True)
+    print("=" * 50)
+    print("Crewlyze")
+    print("=" * 50)
+    # ── Load original dataset ─────────────────────────────────────────────────
+    try:
+        df = read_csv_robust(csv_path)
+    except FileNotFoundError:
+        raise FileNotFoundError(f"Upload not found at: {csv_path}")
+    n_rows, n_cols = df.shape
+    print(f"Loaded {n_rows:,} rows, {n_cols} columns")
+    cols_preview = ", ".join(df.columns[:10])
+    if n_cols > 10:
+        cols_preview += "..."
+    print(f"Columns: {cols_preview}")
+    # ── Backup original before agents touch it ────────────────────────────────
+    original_backup = session_data_dir / "original.csv"
+    cleaned_path    = session_data_dir / "cleaned.csv"
+    df.to_csv(original_backup, index=False)
+    df.to_csv(cleaned_path, index=False)
+    print(f"Original backed up → {original_backup}")
+    print(f"Working copy created → {cleaned_path}\n")
+    os.environ["CURRENT_SESSION_CSV"] = str(cleaned_path)
+    os.environ["CURRENT_SESSION_OUTPUT_DIR"] = str(session_output_dir)
+    # Determine requested task stages and deep analysis mode
+    if selected_tasks is None:
+        selected_tasks = []
+    env_tasks = selected_tasks or []
+    if not env_tasks:
+        env_tasks = ["cleaning", "relations", "insights", "visualization"]
+    do_cleaning = "cleaning" in env_tasks
+    do_relations = "relations" in env_tasks
+    do_insights = "insights" in env_tasks
+    do_visualization = "visualization" in env_tasks
+    # ── Automatic Data Type Inference and Coercion ────────────────────────────
+    coercion_summary = ""
+    if do_cleaning:
+        print("Running automatic data type coercion ...")
+        from tools.dataset_tools import auto_coerce_types
+        df_coerced, coercion_actions = auto_coerce_types(df)
+        if coercion_actions:
+            print("Data type coercion completed:")
+            coercion_lines = []
+            for action in coercion_actions:
+                print(f"  - {action}")
+                coercion_lines.append(f"- {action}")
+            coercion_summary = "\n".join(coercion_lines)
+            # Save the coerced dataframe to cleaned_path
+            df_coerced.to_csv(cleaned_path, index=False)
+            # Update our in-memory df and shapes
+            df = df_coerced
+            n_rows, n_cols = df.shape
+        else:
+            print("No type conflicts detected.")
+    # ── Pre-compute dataset profile (eliminates 6-8 agent tool-call round-trips)
+    # Large files are sampled; the cleaner still operates on the full dataset.
+    profile_max_rows = 5000 if n_rows > 10_000 else n_rows
+    if n_rows > 10_000:
+        print(f"Large file detected ({n_rows:,} rows). "
+              f"Profiling on {profile_max_rows:,}-row sample ...")
+    print("Building dataset profile ...")
+    start_prof = time.time()
+    profile = build_dataset_profile(str(cleaned_path), max_rows=profile_max_rows)
+    stage_times["profiling"] = time.time() - start_prof
+    _progress("profiling", profile)
+    print("Profile ready.\n")
+    if not deep_analysis:
+        from config.context import current_deep_analysis
+        deep_analysis = current_deep_analysis.get()
+    # Load goal, title, and existing tweaked relations if available
+    project_goal = ""
+    report_title = ""
+    existing_relations = ""
+    try:
+        import json
+        meta_path = session_data_dir / "metadata.json"
+        if meta_path.exists():
+            with open(meta_path, "r", encoding="utf-8") as f:
+                meta = json.load(f)
+                project_goal = meta.get("optimized_goal") or meta.get("goal") or ""
+                report_title = meta.get("report_title") or ""
+        # Load tweaked relations from results.json
+        results_path = session_data_dir / "results.json"
+        if results_path.exists():
+            with open(results_path, "r", encoding="utf-8") as f:
+                res_data = json.load(f)
+                existing_relations = res_data.get("relations") or ""
+    except Exception as e:
+        print(f"Warning: Could not read metadata or results cache: {e}")
+    # ── Build fresh agents + tasks ────────────────────────────────────────────
+    agents, tasks = make_pipeline(
+        session_id,
+        profile=profile,
+        selected_tasks=env_tasks,
+        deep_analysis=deep_analysis,
+        project_goal=project_goal,
+        report_title=report_title,
+        existing_relations=existing_relations,
+        coercion_summary=coercion_summary,
+    )
+    # tasks = [clean_task, relation_task, insight_task, visualize_task]
+    # ════════════════════════════════════════════════════════════════════════
+    # STAGE 1 — Clean (sequential, must run before anything else)
+    # ════════════════════════════════════════════════════════════════════════
+    clean_output = "Data cleaning was skipped by user selection."
+    if do_cleaning:
+        print("\n[Stage 1/4] Running Data Cleaner ...")
+        start_clean_stage = time.time()
+        clean_crew = Crew(
+            agents=[agents[0]],
+            tasks=[tasks[0]],
+            max_rpm=15,
+            cache=True,
+            verbose=True,
+        )
+        try:
+            clean_crew.kickoff()
+            clean_output = _safe_output(tasks[0])
+            try:
+                if hasattr(clean_crew, "usage_metrics") and clean_crew.usage_metrics:
+                    total_tokens += clean_crew.usage_metrics.get("total_tokens", 0)
+            except Exception:
+                pass
+        except Exception as exc:
+            print(f"Cleaning error: {exc}. Activating auto-healing fallback...")
+            traceback.print_exc()
+            clean_output = (
+                f"Data Cleaner encountered an error: {exc}.\n"
+                "- Auto-healing fallback: Skipped active code execution and used raw data copy to prevent pipeline failure."
+            )
+        stage_times["cleaning"] = time.time() - start_clean_stage
+        _progress("cleaning", clean_output)
+        print("[Stage 1/4] Cleaning complete.\n")
+    else:
+        print("\n[Stage 1/4] Skipping Data Cleaner (user selection).\n")
+        _progress("cleaning", clean_output)
+    # ════════════════════════════════════════════════════════════════════════
+    # STAGE 2 — Relations + Insights (PARALLEL)
+    # ════════════════════════════════════════════════════════════════════════
+    relation_output = "Relationship mapping was skipped by user selection."
+    insights_output = "Business insights generation was skipped by user selection."
+    if do_relations or do_insights:
+        print("[Stage 2/4] Running Relation Analyst + BI Analyst ...")
+        start_rel_ins_stage = time.time()
+        if do_relations and do_insights:
+            import contextvars
+            ctx1 = contextvars.copy_context()
+            ctx2 = contextvars.copy_context()
+            def run_rel_safe():
+                try:
+                    res_task = _run_single_task(agents[1], tasks[1], 8)
+                    return _safe_output(res_task)
+                except Exception as e:
+                    print(f"Relations Agent error: {e}. Activating auto-healing fallback...")
+                    traceback.print_exc()
+                    return _run_auto_relation_fallback(df)
+            def run_ins_safe():
+                try:
+                    res_task = _run_single_task(agents[2], tasks[2], 8)
+                    return _safe_output(res_task)
+                except Exception as e:
+                    print(f"Insights Agent error: {e}. Activating auto-healing fallback...")
+                    traceback.print_exc()
+                    return _run_auto_insights_fallback(df, project_goal)
+            try:
+                with ThreadPoolExecutor(max_workers=2, thread_name_prefix="crew") as executor:
+                    rel_future = executor.submit(ctx1.run, run_rel_safe)
+                    ins_future = executor.submit(ctx2.run, run_ins_safe)
+                    relation_output = rel_future.result()
+                    insights_output = ins_future.result()
+            except Exception as exc:
+                print(f"Parallel execution collapsed: {exc}. Running fallbacks...")
+                traceback.print_exc()
+                if do_relations:
+                    relation_output = _run_auto_relation_fallback(df)
+                if do_insights:
+                    insights_output = _run_auto_insights_fallback(df, project_goal)
+        else:
+            if do_relations:
+                try:
+                    rel_crew = Crew(agents=[agents[1]], tasks=[tasks[1]], max_rpm=15, cache=True, verbose=True)
+                    rel_crew.kickoff()
+                    relation_output = _safe_output(tasks[1])
+                except Exception as e:
+                    print(f"Relations Agent error: {e}. Activating auto-healing fallback...")
+                    traceback.print_exc()
+                    relation_output = _run_auto_relation_fallback(df)
+            if do_insights:
+                try:
+                    ins_crew = Crew(agents=[agents[2]], tasks=[tasks[2]], max_rpm=15, cache=True, verbose=True)
+                    ins_crew.kickoff()
+                    insights_output = _safe_output(tasks[2])
+                except Exception as e:
+                    print(f"Insights Agent error: {e}. Activating auto-healing fallback...")
+                    traceback.print_exc()
+                    insights_output = _run_auto_insights_fallback(df, project_goal)
+    _progress("relations", relation_output)
+    _progress("insights", insights_output)
+    stage_times["relations_insights"] = time.time() - start_rel_ins_stage
+    print("[Stage 2/4] Relations + Insights complete.\n")
+    # ════════════════════════════════════════════════════════════════════════
+    # STAGE 3 — Visualize (sequential, receives actual outputs as context)
+    # ════════════════════════════════════════════════════════════════════════
+    visualize_output = "Visualization was skipped by user selection."
+    if do_visualization:
+        print("[Stage 3/4] Running Data Visualizer ...")
+        start_viz_stage = time.time()
+        # Inject relation + insight outputs directly into the task description
+        # so the visualizer has full context without relying on CrewAI's
+        # cross-crew context= mechanism.
+        viz_task = tasks[3]
+        viz_task.description += (
+            f"\n\nRELATIONSHIPS TO VISUALIZE:\n{relation_output}"
+            f"\n\nKEY INSIGHTS FOR CONTEXT:\n{insights_output}"
+        )
+        viz_crew = Crew(
+            agents=[agents[3]],
+            tasks=[viz_task],
+            max_rpm=15,
+            cache=True,
+            verbose=True,
+        )
+        try:
+            viz_crew.kickoff()
+            visualize_output = _safe_output(viz_task)
+            try:
+                if hasattr(viz_crew, "usage_metrics") and viz_crew.usage_metrics:
+                    total_tokens += viz_crew.usage_metrics.get("total_tokens", 0)
+            except Exception:
+                pass
+        except Exception as exc:
+            print(f"Visualization Agent error: {exc}. Activating auto-healing visualizer fallback...")
+            traceback.print_exc()
+            visualize_output = f"Visualization Agent encountered error: {exc}"
+        # Auto-healing fallback check: if no PNG charts were successfully saved
+        png_files = list(session_output_dir.glob("*.png"))
+        if not png_files:
+            print("No PNG charts generated by agent. Running relation-aware visualizer fallback...")
+            fallback_msg = _run_auto_visualizer_fallback(
+                cleaned_path, session_output_dir, relations_text=relation_output
+            )
+            visualize_output = f"{visualize_output}\n\n[Auto-Healing Fallback Status]: {fallback_msg}"
+            print(fallback_msg)
+    else:
+        print("[Stage 3/4] Skipping Data Visualizer (user selection).\n")
+    _progress("visualization", visualize_output)
+    stage_times["visualization"] = time.time() - start_viz_stage
+    print("[Stage 3/4] Visualization complete.\n")
+    # ── Generate interactive Plotly charts (pure Python, no LLM) ─────────────
+    print("[Stage 4/4] Building interactive Plotly charts ...")
+    start_plotly_stage = time.time()
+    plotly_charts = generate_plotly_charts(
+        csv_path=str(cleaned_path),
+        relations_text=relation_output,
+    )
+    _progress("plotly", plotly_charts)
+    stage_times["plotly"] = time.time() - start_plotly_stage
+    print(f"Generated {len(plotly_charts)} interactive chart(s).\n")
+    # ── Reload cleaned dataframe ──────────────────────────────────────────────
+    try:
+        cleaned_df = read_csv_robust(cleaned_path)
+    except Exception:
+        print("WARNING: Could not load cleaned CSV. Falling back to original data.")
+        cleaned_df = df
+    total_time = time.time() - start_run
+    try:
+        dataset_name = Path(csv_path).name
+        est_cost = (total_tokens / 1_000_000) * 0.15 if total_tokens else 0.0
+        log_metric(
+            session_id=session_id,
+            dataset_name=dataset_name,
+            rows=n_rows,
+            cols=n_cols,
+            stages=stage_times,
+            total_time=total_time,
+            success=True,
+            token_usage=total_tokens,
+            estimated_cost=est_cost
+        )
+    except Exception as e:
+        print(f"Error logging metric: {e}")
+    return {
+        "dataframe":      cleaned_df,
+        "cleaning_steps": clean_output,
+        "relations":      relation_output,
+        "insights":       insights_output,
+        "code":           visualize_output,
+        "output_dir":     str(session_output_dir),
+        "plotly_charts":  plotly_charts,
+    }
+# ---------------------------------------------------------------------------
+# CLI entry point
+# ---------------------------------------------------------------------------
+if __name__ == "__main__":
+    default_path = (Path.cwd() / "data" / "TB_Burden_Country.csv").resolve()
+    path = input(
+        f"Enter the path to your CSV file (default: {default_path.name}): "
+    ) or str(default_path)
+    report = run_crew(path, session_id="cli")
+    if report:
+        print("\nAnalysis Complete.")
+        print("Crewlyze")
+        print("Prithiv.A.K  Sebin.S  Sowmiyan.S")