npm - crewlyze - Versions diffs - 3.1.0 - Mend

crewlyze 3.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (48) hide show

package/.dockerignore +12 -0
package/.gitattributes +2 -0
package/CHANGELOG.md +86 -0
package/Dockerfile +21 -0
package/LICENSE +21 -0
package/README.md +139 -0
package/USAGE.md +106 -0
package/agents/__init__.py +0 -0
package/agents/cleaner.py +38 -0
package/agents/insights.py +44 -0
package/agents/relation.py +36 -0
package/agents/visualizer.py +41 -0
package/assets/badge_crewai.svg +4 -0
package/assets/badge_matplotlib.svg +4 -0
package/assets/badge_ollama.svg +4 -0
package/assets/badge_pandas.svg +4 -0
package/assets/badge_seaborn.svg +4 -0
package/assets/branding_image.png +0 -0
package/assets/complete_workflow.svg +216 -0
package/assets/favicon.png +0 -0
package/assets/logo.png +0 -0
package/assets/stars.svg +12 -0
package/bin/crewlyze.js +79 -0
package/config/README.md +129 -0
package/config/__init__.py +1 -0
package/config/context.py +16 -0
package/config/llm_config.py +300 -0
package/config/metrics_tracker.py +70 -0
package/crew.py +870 -0
package/crewlyze-3.1.0.tgz +0 -0
package/fix_syntax.py +54 -0
package/main.py +1279 -0
package/package.json +22 -0
package/pyproject.toml +32 -0
package/requirements.txt +33 -0
package/tools/__init__.py +0 -0
package/tools/dataset_tools.py +803 -0
package/ui/__init__.py +3 -0
package/ui/copilot.py +200 -0
package/ui/export.py +800 -0
package/update_appjs.py +54 -0
package/update_llm.py +21 -0
package/update_main.py +20 -0
package/web/app.js +3142 -0
package/web/index.html +1105 -0
package/web/style.css +2561 -0
package/workflows/__init__.py +0 -0
package/workflows/pipeline.py +254 -0

package/ui/export.py ADDED Viewed

@@ -0,0 +1,800 @@
+# Crewlyze
+# Copyright (c) 2025 Sowmiyan S
+# Licensed under the MIT License
+"""
+PDF export — professional executive report.
+Improvements:
+- Cover page: project title (filename) + timestamp.
+- Data Insights section: per-column min/max/mean/median/std for numerics,
+  top categories for categoricals — placed after the dataset summary.
+- No empty spacers for sections with no content.
+- KeepTogether for insight cards to prevent orphaned headers.
+- export_pdf_cached() wrapper used by app.py.
+"""
+import io
+import re
+from datetime import datetime
+from io import BytesIO
+from pathlib import Path
+import pandas as pd
+from PIL import Image as PILImage
+from reportlab.lib import colors
+from reportlab.lib.pagesizes import letter
+from reportlab.lib.styles import ParagraphStyle, getSampleStyleSheet
+from reportlab.pdfgen import canvas
+from reportlab.platypus import (
+    Image,
+    KeepTogether,
+    PageBreak,
+    Paragraph,
+    SimpleDocTemplate,
+    Spacer,
+    Table,
+    TableStyle,
+)
+# ---------------------------------------------------------------------------
+# Two-pass Canvas — Page X of Y + Corporate Rules
+# ---------------------------------------------------------------------------
+class NumberedCanvas(canvas.Canvas):
+    """Two-pass canvas for dynamic page count and corporate header/footer rules.
+
+    ``showPage`` does not hand the page to the base class; it stores a shallow
+    copy of the instance ``__dict__`` for each page instead. ``save`` then
+    replays every stored copy, draws the decorations (which need the total
+    number of pages) and only at that point calls the base ``showPage``.
+    """
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        # One __dict__ copy per finished page, appended in page order.
+        self._saved_page_states = []
+    def showPage(self):
+        """Store the current page's state and start the next page.
+
+        The base-class ``showPage`` is deliberately not called here; it is
+        deferred to ``save`` so decorations can be drawn first.
+        """
+        self._saved_page_states.append(dict(self.__dict__))
+        self._startPage()
+    def save(self):
+        """Replay every stored page with its decorations, then save the document."""
+        # The total is only known now, after all pages have been recorded.
+        num_pages = len(self._saved_page_states)
+        for state in self._saved_page_states:
+            # Restore the stored state so the decorations are drawn on that page.
+            self.__dict__.update(state)
+            self.draw_page_decorations(num_pages)
+            super().showPage()
+        super().save()
+    def draw_page_decorations(self, page_count):
+        """Draw the watermark on every page and the header/footer on all but page 1.
+
+        Args:
+            page_count: Total number of pages, used in the "Page X of Y" footer.
+        """
+        # saveState/restoreState bracket the drawing so the font, colour and
+        # line settings changed below do not persist on the canvas.
+        self.saveState()
+        # Watermark "crewlyze" logo at top left corner of all pages in lowercase, bold, red
+        self.setFillColor(colors.HexColor("#ff252a"))
+        self.setFont("Helvetica-Bold", 10)
+        self.drawString(54, 752, "crewlyze")
+        if self._pageNumber == 1:
+            self.restoreState()
+            return  # Suppress remaining headers/footers on the title page
+        self.setFont("Helvetica-Bold", 8)
+        self.setFillColor(colors.HexColor("#475569"))
+        self.setStrokeColor(colors.HexColor("#cbd5e1"))
+        self.setLineWidth(0.5)
+        # Header - safe distance from body top (topMargin = 80)
+        # NOTE(review): x = 54 matches the 54-unit left margin export_pdf
+        # configures; 558 presumably mirrors the right margin on a
+        # letter-width page - confirm if the page size ever changes.
+        self.line(54, 745, 558, 745)
+        self.drawRightString(558, 752, "EXECUTIVE ANALYSIS REPORT | CONFIDENTIAL")
+        # Footer - safe distance from body bottom (bottomMargin = 80)
+        self.line(54, 65, 558, 65)
+        self.setFont("Helvetica", 8)
+        self.drawString(54, 50, "Generated by Crewlyze System")
+        self.drawRightString(558, 50, f"Page {self._pageNumber} of {page_count}")
+        self.restoreState()
+# ---------------------------------------------------------------------------
+# Markdown → ReportLab HTML
+# ---------------------------------------------------------------------------
+def _md_to_html(text: str) -> str:
+    if not text:
+        return ""
+    text = re.sub(r"\*\*(.*?)\*\*", r"<b>\1</b>", text)
+    text = re.sub(r"\*(.*?)\*",     r"<i>\1</i>", text)
+    text = (
+        text.replace("&", "&amp;")
+            .replace("<", "&lt;")
+            .replace(">", "&gt;")
+    )
+    text = (
+        text.replace("&lt;b&gt;",  "<b>").replace("&lt;/b&gt;", "</b>")
+            .replace("&lt;i&gt;",  "<i>").replace("&lt;/i&gt;", "</i>")
+    )
+    return text.strip()
+def _clean_ai_artifacts(text: str) -> str:
+    """Remove AI reasoning artifacts like Thought, Action, Route, Response logs from raw text."""
+    if not text:
+        return ""
+    lines = text.split("\n")
+    cleaned_lines = []
+    for line in lines:
+        l_strip = line.strip()
+        # skip lines that start with thought, action, observation, response, etc.
+        if re.match(r'^(thought|action|observation|route|call|api_key|response):\s*', l_strip, re.IGNORECASE):
+            continue
+        cleaned_lines.append(line)
+    return "\n".join(cleaned_lines).strip()
+def _parse_insight_fields(text: str) -> dict:
+    """Extract Observation, Business Implication, and Actionable Strategy from insight text."""
+    obs = ""
+    imp = ""
+    strat = ""
+    # Clean text first
+    text_clean = _clean_ai_artifacts(text)
+    # Try parsing using regex
+    obs_m = re.search(r"\*\*Observation\*\*:\s*(.*?)(?=\*\*Business Implication\*\*|\*\*Actionable Strategy\*\*|$)", text_clean, re.DOTALL | re.IGNORECASE)
+    imp_m = re.search(r"\*\*Business Implication\*\*:\s*(.*?)(?=\*\*Observation\*\*|\*\*Actionable Strategy\*\*|$)", text_clean, re.DOTALL | re.IGNORECASE)
+    strat_m = re.search(r"\*\*Actionable Strategy\*\*:\s*(.*?)(?=\*\*Observation\*\*|\*\*Business Implication\*\*|$)", text_clean, re.DOTALL | re.IGNORECASE)
+    if obs_m:
+        obs = obs_m.group(1).strip()
+    if imp_m:
+        imp = imp_m.group(1).strip()
+    if strat_m:
+        strat = strat_m.group(1).strip()
+    # Loose match fallbacks
+    if not obs:
+        obs_m = re.search(r"Observation:\s*(.*?)(?=Business Implication|Actionable Strategy|$)", text_clean, re.DOTALL | re.IGNORECASE)
+        if obs_m: obs = obs_m.group(1).strip()
+    if not imp:
+        imp_m = re.search(r"Business Implication:\s*(.*?)(?=Observation|Actionable Strategy|$)", text_clean, re.DOTALL | re.IGNORECASE)
+        if imp_m: imp = imp_m.group(1).strip()
+    if not strat:
+        strat_m = re.search(r"Actionable Strategy:\s*(.*?)(?=Observation|Business Implication|$)", text_clean, re.DOTALL | re.IGNORECASE)
+        if strat_m: strat = strat_m.group(1).strip()
+    return {
+        "observation": obs or text_clean,
+        "implication": imp,
+        "strategy": strat
+    }
+def _find_matching_chart(insight_text: str, png_files: list, placed_set: set):
+    """Find a chart from png_files that is relevant to the column names mentioned in insight_text."""
+    text_lower = insight_text.lower()
+    for png in png_files:
+        if png in placed_set:
+            continue
+        # Get words from filename (excluding extension)
+        stem_clean = png.stem.lower().replace("_", " ")
+        # Check if any significant words from the filename are in the insight text
+        words = [w for w in stem_clean.split() if w not in ("vs", "plot", "chart", "scatter", "bar", "box", "line", "distribution", "correlation")]
+        if words and all(w in text_lower for w in words):
+            return png
+    return None
+def _parse_relation_line(line: str) -> dict:
+    """Parse key relations string components into a clean dictionary."""
+    parts = [p.strip() for p in line.split("|")]
+    res = {"x": "N/A", "y": "N/A", "type": "N/A", "details": "N/A"}
+    for part in parts:
+        part_clean = part.strip()
+        if ":" in part_clean:
+            key, val = part_clean.split(":", 1)
+            key = key.strip().lower()
+            val = val.strip()
+            if key == "x":
+                res["x"] = val
+            elif key == "y":
+                res["y"] = val
+            elif key == "type":
+                res["type"] = val
+            elif key == "details":
+                res["details"] = val
+        else:
+            if part_clean.startswith("- "):
+                part_clean = part_clean[2:]
+            if part_clean:
+                res["details"] = part_clean
+    return res
+# ---------------------------------------------------------------------------
+# Data Insights table builder
+# ---------------------------------------------------------------------------
+def _format_number(val) -> str:
+    """Formats numeric values into a clean, human-readable layout without scientific notation."""
+    try:
+        if pd.isna(val):
+            return "—"
+        val_float = float(val)
+        # If it's a float that is actually an integer
+        if val_float.is_integer():
+            return f"{int(val_float):,}"
+        # For very small numbers, use scientific representation safely
+        if abs(val_float) < 0.0001 and val_float != 0:
+            return f"{val_float:.4e}"
+        # For standard floats, format with commas and strip trailing zeros
+        formatted = f"{val_float:,.2f}"
+        if "." in formatted:
+            formatted = formatted.rstrip('0').rstrip('.')
+        return formatted
+    except Exception:
+        return str(val)
+def _build_insights_table(df: pd.DataFrame, body_style, header_style, primary_color, secondary_color) -> list:
+    """
+    Build a Data Insights table showing per-column statistics.
+    Returns a list of flowables (may be empty if no numeric cols).
+    """
+    flowables = []
+    numeric_cols = df.select_dtypes(include=["number"]).columns.tolist()
+    cat_cols     = df.select_dtypes(include=["object", "category"]).columns.tolist()
+    table_header_style = ParagraphStyle("TblHdrWht", parent=body_style,
+        fontName="Helvetica-Bold", fontSize=9, leading=12,
+        textColor=colors.white)
+    # ── Numeric stats table ──────────────────────────────────────────────────
+    if numeric_cols:
+        header_row = [
+            Paragraph("<b>Column</b>", table_header_style),
+            Paragraph("<b>Min</b>",    table_header_style),
+            Paragraph("<b>Max</b>",    table_header_style),
+            Paragraph("<b>Mean</b>",   table_header_style),
+            Paragraph("<b>Median</b>", table_header_style),
+            Paragraph("<b>Std Dev</b>",table_header_style),
+            Paragraph("<b>Missing%</b>", table_header_style),
+        ]
+        rows = [header_row]
+        for col in numeric_cols[:20]:   # cap at 20 columns
+            s      = df[col]
+            miss   = round(s.isnull().sum() / max(len(df), 1) * 100, 1)
+            s_num  = s.dropna()
+            row = [
+                Paragraph(str(col), body_style),
+                Paragraph(_format_number(s_num.min()) if not s_num.empty else "—", body_style),
+                Paragraph(_format_number(s_num.max()) if not s_num.empty else "—", body_style),
+                Paragraph(_format_number(s_num.mean()) if not s_num.empty else "—", body_style),
+                Paragraph(_format_number(s_num.median()) if not s_num.empty else "—", body_style),
+                Paragraph(_format_number(s_num.std()) if len(s_num) > 1 else "—", body_style),
+                Paragraph(f"{miss}%", body_style),
+            ]
+            rows.append(row)
+        tbl = Table(rows, colWidths=[120, 55, 55, 55, 55, 55, 55])
+        tbl.setStyle(TableStyle([
+            ("BACKGROUND",    (0, 0), (-1, 0),  primary_color),
+            ("LINEBELOW",     (0, 0), (-1, 0),  1.2, secondary_color),
+            ("BOX",           (0, 0), (-1, -1), 0.5, colors.HexColor("#cbd5e1")),
+            ("INNERGRID",     (0, 0), (-1, -1), 0.3, colors.HexColor("#e2e8f0")),
+            ("TOPPADDING",    (0, 0), (-1, -1), 5),
+            ("BOTTOMPADDING", (0, 0), (-1, -1), 5),
+            ("LEFTPADDING",   (0, 0), (-1, -1), 6),
+            ("RIGHTPADDING",  (0, 0), (-1, -1), 6),
+            ("VALIGN",        (0, 0), (-1, -1), "MIDDLE"),
+            ("ROWBACKGROUNDS",(0, 1), (-1, -1), [colors.white, colors.HexColor("#f8fafc")]),
+        ]))
+        flowables.append(tbl)
+        flowables.append(Spacer(1, 8))
+    # ── Categorical top-values table ─────────────────────────────────────────
+    if cat_cols:
+        cat_header = [
+            Paragraph("<b>Column</b>",     table_header_style),
+            Paragraph("<b>Top Values (count)</b>", table_header_style),
+            Paragraph("<b>Unique</b>",     table_header_style),
+            Paragraph("<b>Missing%</b>",   table_header_style),
+        ]
+        cat_rows = [cat_header]
+        for col in cat_cols[:10]:
+            s     = df[col]
+            miss  = round(s.isnull().sum() / max(len(df), 1) * 100, 1)
+            unique = s.nunique()
+            top3  = s.value_counts().head(3)
+            top_str = ", ".join(f"{v}({c})" for v, c in top3.items()) if not top3.empty else "—"
+            cat_rows.append([
+                Paragraph(str(col), body_style),
+                Paragraph(top_str[:80], body_style),
+                Paragraph(str(unique), body_style),
+                Paragraph(f"{miss}%", body_style),
+            ])
+        cat_tbl = Table(cat_rows, colWidths=[100, 280, 55, 65])
+        cat_tbl.setStyle(TableStyle([
+            ("BACKGROUND",    (0, 0), (-1, 0),  primary_color),
+            ("LINEBELOW",     (0, 0), (-1, 0),  1.2, secondary_color),
+            ("BOX",           (0, 0), (-1, -1), 0.5, colors.HexColor("#cbd5e1")),
+            ("INNERGRID",     (0, 0), (-1, -1), 0.3, colors.HexColor("#e2e8f0")),
+            ("TOPPADDING",    (0, 0), (-1, -1), 5),
+            ("BOTTOMPADDING", (0, 0), (-1, -1), 5),
+            ("LEFTPADDING",   (0, 0), (-1, -1), 6),
+            ("RIGHTPADDING",  (0, 0), (-1, -1), 6),
+            ("VALIGN",        (0, 0), (-1, -1), "MIDDLE"),
+            ("ROWBACKGROUNDS",(0, 1), (-1, -1), [colors.white, colors.HexColor("#f8fafc")]),
+        ]))
+        flowables.append(cat_tbl)
+    return flowables
+# ---------------------------------------------------------------------------
+# PDF Generation
+# ---------------------------------------------------------------------------
+def export_pdf(result: dict, filename: str = "") -> bytes:
+    """Build and return a professional executive PDF report."""
+    buffer = BytesIO()
+    doc = SimpleDocTemplate(
+        buffer,
+        pagesize=letter,
+        rightMargin=54,
+        leftMargin=54,
+        topMargin=80,      # Increased top margin for header safety
+        bottomMargin=80,   # Increased bottom margin for footer safety
+    )
+    story = []
+    styles = getSampleStyleSheet()
+    primary_color   = colors.HexColor("#4a0404")
+    secondary_color = colors.HexColor("#ff252a")
+    text_color      = colors.HexColor("#0f172a")
+    muted_color     = colors.HexColor("#475569")
+    title_style = ParagraphStyle("DocTitle", parent=styles["Normal"],
+        fontName="Helvetica-Bold", fontSize=24, leading=30,
+        textColor=primary_color, spaceAfter=4)
+    subtitle_style = ParagraphStyle("DocSubTitle", parent=styles["Normal"],
+        fontName="Helvetica", fontSize=11, leading=15,
+        textColor=secondary_color, spaceAfter=6)
+    meta_style = ParagraphStyle("Meta", parent=styles["Normal"],
+        fontName="Helvetica", fontSize=9, leading=13,
+        textColor=muted_color, spaceAfter=12)
+    h1_style = ParagraphStyle("H1", parent=styles["Normal"],
+        fontName="Helvetica-Bold", fontSize=15, leading=20,
+        textColor=primary_color, spaceBefore=16, spaceAfter=8, keepWithNext=True)
+    h2_style = ParagraphStyle("H2", parent=styles["Normal"],
+        fontName="Helvetica-Bold", fontSize=11, leading=15,
+        textColor=secondary_color, spaceBefore=10, spaceAfter=4, keepWithNext=True)
+    body_style = ParagraphStyle("Body", parent=styles["Normal"],
+        fontName="Helvetica", fontSize=10, leading=15,
+        textColor=text_color, spaceAfter=5)
+    header_style = ParagraphStyle("TblHdr", parent=styles["Normal"],
+        fontName="Helvetica-Bold", fontSize=9, leading=12,
+        textColor=primary_color)
+    bullet_style = ParagraphStyle("Bullet", parent=styles["Normal"],
+        fontName="Helvetica", fontSize=9.5, leading=14,
+        textColor=text_color, leftIndent=14, firstLineIndent=-10, spaceAfter=7)
+    # ── Clean all inputs from AI reasoning artifacts ──────────────────────────
+    raw_insights = _clean_ai_artifacts(result.get("insights", "")).strip()
+    cleaning_text = _clean_ai_artifacts(result.get("cleaning_steps", "")).strip()
+    relations_text = _clean_ai_artifacts(result.get("relations", "")).strip()
+    report_title = _clean_ai_artifacts(result.get("report_title") or filename or "Executive Analysis Report").strip()
+    report_goal  = _clean_ai_artifacts(result.get("goal") or "").strip()
+    timestamp    = datetime.now().strftime("%B %d, %Y at %I:%M %p")
+    # ── 1. TITLE PAGE (Page 1) ────────────────────────────────────────────────
+    title_center_style = ParagraphStyle("TitleCenter", parent=title_style, alignment=1, fontSize=26, leading=32)
+    subtitle_center_style = ParagraphStyle("SubCenter", parent=subtitle_style, alignment=1, fontSize=12, leading=16)
+    meta_center_style = ParagraphStyle("MetaCenter", parent=meta_style, alignment=1, fontSize=10, leading=14)
+    story.append(Spacer(1, 140))
+    story.append(Paragraph(f"<b>{report_title.upper()}</b>", title_center_style))
+    story.append(Spacer(1, 10))
+    story.append(Paragraph("Autonomous Business Intelligence &amp; Executive Analysis Suite", subtitle_center_style))
+    story.append(Spacer(1, 40))
+    story.append(Paragraph(f"Dataset Analyzed: <b>{filename or 'dataset.csv'}</b>", meta_center_style))
+    story.append(Paragraph(f"Generated On: {timestamp}", meta_center_style))
+    if report_goal:
+        story.append(Spacer(1, 30))
+        story.append(Paragraph(f"<b>Core Objective:</b> {report_goal}", ParagraphStyle("GoalStyleCenter", parent=body_style, fontName="Helvetica-Oblique", fontSize=9.5, textColor=muted_color, alignment=1)))
+    story.append(PageBreak())
+    # Parse sections from structured insights
+    objectives_text = ""
+    stats_text = ""
+    strategic_text = ""
+    warnings_text = ""
+    if raw_insights:
+        sections = re.split(r"###\s+", raw_insights)
+        for sec in sections:
+            lines = sec.split("\n")
+            if not lines or not lines[0].strip():
+                continue
+            header = lines[0].strip().lower()
+            content = "\n".join(lines[1:]).strip()
+            if "objective" in header or "goal" in header:
+                objectives_text = content
+            elif "stat" in header:
+                stats_text = content
+            elif "insight" in header:
+                strategic_text = content
+            elif "warning" in header or "alert" in header:
+                warnings_text = content
+        if not strategic_text and not objectives_text:
+            strategic_text = raw_insights
+    # ── 2. EXECUTIVE SUMMARY & DATASET OVERVIEW (Page 2) ──────────────────────
+    story.append(Paragraph("Executive Summary &amp; Dataset Overview", h1_style))
+    story.append(Paragraph(
+        "This autonomous executive analysis report presents high-value strategic recommendations, "
+        "data quality cleaning trails, and key visualizations derived from the uploaded dataset.",
+        body_style,
+    ))
+    story.append(Spacer(1, 8))
+    df = result.get("dataframe")
+    if df is not None and isinstance(df, pd.DataFrame):
+        cols_preview = f"{', '.join(str(c) for c in df.columns[:6])}{'...' if len(df.columns) > 6 else ''}"
+        numeric_count = len(df.select_dtypes(include=["number"]).columns)
+        cat_count     = len(df.select_dtypes(include=["object", "category"]).columns)
+        box_data = [
+            [Paragraph("<b>Dataset Summary Metrics</b>", h2_style), Paragraph("", body_style)],
+            [Paragraph("Total Records Analyzed",   body_style), Paragraph(f"<b>{df.shape[0]:,}</b>", body_style)],
+            [Paragraph("Total Columns",            body_style), Paragraph(f"<b>{df.shape[1]}</b>", body_style)],
+            [Paragraph("Numeric Columns",          body_style), Paragraph(f"<b>{numeric_count}</b>", body_style)],
+            [Paragraph("Categorical Columns",      body_style), Paragraph(f"<b>{cat_count}</b>", body_style)],
+            [Paragraph("Columns Sampled",          body_style), Paragraph(cols_preview, body_style)],
+        ]
+        summary_box = Table(box_data, colWidths=[160, 344])
+        summary_box.setStyle(TableStyle([
+            ("BACKGROUND",    (0, 0), (-1, -1), colors.HexColor("#f8fafc")),
+            ("BOX",           (0, 0), (-1, -1), 1, colors.HexColor("#cbd5e1")),
+            ("LINEBELOW",     (0, 0), (-1, 0),  1.5, secondary_color),
+            ("TOPPADDING",    (0, 0), (-1, -1), 6),
+            ("BOTTOMPADDING", (0, 0), (-1, -1), 6),
+            ("LEFTPADDING",   (0, 0), (-1, -1), 12),
+            ("RIGHTPADDING",  (0, 0), (-1, -1), 12),
+            ("VALIGN",        (0, 0), (-1, -1), "MIDDLE"),
+        ]))
+        story.append(summary_box)
+        story.append(Spacer(1, 15))
+    story.append(Spacer(1, 15))
+    # ── 3. STRATEGIC BUSINESS INSIGHTS & PAIRED CHARTS (Page 3+) ──────────────
+    # Locate all saved visualization charts
+    output_dir = result.get("output_dir", Path("outputs"))
+    png_files  = list(Path(output_dir).glob("*.png"))
+    placed_charts = set()
+    if strategic_text:
+        story.append(Paragraph("Strategic Business Insights", h1_style))
+        story.append(Paragraph(
+            "Below are the critical business insights identified from the dataset, paired directly with "
+            "relevant charts indicating data correlations.",
+            body_style,
+        ))
+        story.append(Spacer(1, 8))
+        insight_items = re.split(r"\d+\.\s+", strategic_text)
+        insight_count = 0
+        # Styles for table headers in the 3-column layout
+        obs_hdr_style = ParagraphStyle("ObsHdr", parent=header_style, textColor=colors.HexColor("#0c4a6e"))
+        imp_hdr_style = ParagraphStyle("ImpHdr", parent=header_style, textColor=colors.HexColor("#581c87"))
+        strat_hdr_style = ParagraphStyle("StratHdr", parent=header_style, textColor=colors.HexColor("#14532d"))
+        for item in insight_items:
+            item = item.strip()
+            if not item:
+                continue
+            insight_count += 1
+            fields = _parse_insight_fields(item)
+            # Try to extract insight title if it starts with bold text
+            title = ""
+            first_line = item.split("\n")[0].strip()
+            if first_line and not any(k in first_line for k in ("Observation", "Implication", "Strategy")):
+                title = first_line.replace("**", "").replace("<b>", "").replace("</b>", "").strip()
+            lbl_text = f"<b>Insight {insight_count}: {title}</b>" if title else f"<b>Insight {insight_count}</b>"
+            lbl_para = Paragraph(lbl_text, ParagraphStyle("InsLbl", parent=body_style, fontName="Helvetica-Bold", fontSize=11, textColor=primary_color, spaceBefore=10, spaceAfter=4, keepWithNext=True))
+            # Build 3-column table
+            cell_obs = Paragraph(_md_to_html(fields["observation"]), body_style)
+            cell_imp = Paragraph(_md_to_html(fields["implication"] or "N/A"), body_style)
+            cell_strat = Paragraph(_md_to_html(fields["strategy"] or "N/A"), body_style)
+            tbl_data = [
+                [Paragraph("<b>Observation</b>", obs_hdr_style), Paragraph("<b>Business Implication</b>", imp_hdr_style), Paragraph("<b>Actionable Strategy</b>", strat_hdr_style)],
+                [cell_obs, cell_imp, cell_strat]
+            ]
+            card_table = Table(tbl_data, colWidths=[160, 172, 172])
+            card_table.setStyle(TableStyle([
+                ("BACKGROUND",    (0, 0), (0, 0),  colors.HexColor("#f0f9ff")), # sky blue
+                ("BACKGROUND",    (1, 0), (1, 0),  colors.HexColor("#faf5ff")), # purple
+                ("BACKGROUND",    (2, 0), (2, 0),  colors.HexColor("#f0fdf4")), # green
+                ("BOX",           (0, 0), (-1, -1), 0.5, colors.HexColor("#cbd5e1")),
+                ("INNERGRID",     (0, 0), (-1, -1), 0.3, colors.HexColor("#e2e8f0")),
+                ("VALIGN",        (0, 0), (-1, -1), "TOP"),
+                ("TOPPADDING",    (0, 0), (-1, -1), 6),
+                ("BOTTOMPADDING", (0, 0), (-1, -1), 6),
+                ("LEFTPADDING",   (0, 0), (-1, -1), 8),
+                ("RIGHTPADDING",  (0, 0), (-1, -1), 8),
+            ]))
+            # Look for a matched chart
+            matched_png = _find_matching_chart(item, png_files, placed_charts)
+            fig_title = None
+            img_table = None
+            if matched_png:
+                try:
+                    with PILImage.open(matched_png) as img:
+                        orig_w, orig_h = img.size
+                    max_w, max_h = 440, 240
+                    aspect = orig_h / orig_w
+                    if aspect > (max_h / max_w):
+                        new_h = max_h
+                        new_w = new_h / aspect
+                    else:
+                        new_w = max_w
+                        new_h = new_w * aspect
+                    img_flow  = Image(str(matched_png), width=new_w, height=new_h)
+                    img_table = Table([[img_flow]], colWidths=[504])
+                    img_table.setStyle(TableStyle([
+                        ("ALIGN",         (0, 0), (-1, -1), "CENTER"),
+                        ("VALIGN",        (0, 0), (-1, -1), "MIDDLE"),
+                        ("BOX",           (0, 0), (-1, -1), 0.5, colors.HexColor("#cbd5e1")),
+                        ("BACKGROUND",    (0, 0), (-1, -1), colors.white),
+                        ("TOPPADDING",    (0, 0), (-1, -1), 6),
+                        ("BOTTOMPADDING", (0, 0), (-1, -1), 6),
+                    ]))
+                    fig_title = Paragraph(
+                        f"<i>Supporting Figure: {matched_png.stem.replace('_', ' ').title()}</i>",
+                        ParagraphStyle("FigTitle", parent=body_style, fontName="Helvetica-Oblique", fontSize=8.5, textColor=muted_color, spaceBefore=4, spaceAfter=2)
+                    )
+                    placed_charts.add(matched_png)
+                except Exception as exc:
+                    img_table = Paragraph(f"Could not load matching image {matched_png.name}: {exc}", body_style)
+            story.append(KeepTogether([lbl_para, card_table]))
+            if matched_png:
+                story.append(Spacer(1, 4))
+                if fig_title and img_table:
+                    story.append(KeepTogether([fig_title, img_table]))
+                elif img_table:
+                    story.append(KeepTogether([img_table]))
+            story.append(Spacer(1, 14))
+        story.append(Spacer(1, 10))
+    # ── Warnings & Alerts (if present) ────────────────────────────────────────
+    if warnings_text and not "no warnings" in warnings_text.lower() and not "none" in warnings_text.lower():
+        story.append(Paragraph("Business Risks &amp; Critical Alerts", h1_style))
+        warning_content = [
+            Paragraph(warnings_text.replace("\n", "<br/>"), ParagraphStyle("WarnStyle", parent=body_style, fontSize=9.5, textColor=colors.HexColor("#991b1b")))
+        ]
+        warning_table = Table([[warning_content]], colWidths=[504])
+        warning_table.setStyle(TableStyle([
+            ("BACKGROUND",    (0, 0), (-1, -1), colors.HexColor("#fef2f2")),
+            ("LINELEFT",      (0, 0), (0, -1),  4, colors.HexColor("#f43f5e")),
+            ("LEFTPADDING",   (0, 0), (-1, -1), 12),
+            ("RIGHTPADDING",  (0, 0), (-1, -1), 12),
+            ("TOPPADDING",    (0, 0), (-1, -1), 8),
+            ("BOTTOMPADDING", (0, 0), (-1, -1), 8),
+            ("BOX",           (0, 0), (-1, -1), 0.5, colors.HexColor("#fee2e2")),
+        ]))
+        story.append(KeepTogether([warning_table, Spacer(1, 12)]))
+    story.append(Spacer(1, 15))
+    # ── 4. DATA SUMMARY & METHODOLOGY (Page 4+) ───────────────────────────────
+    story.append(Paragraph("Data Summary &amp; Methodology", h1_style))
+    # Project Objectives
+    if objectives_text:
+        story.append(Paragraph("Project Objectives &amp; Scope", h2_style))
+        story.append(Paragraph(objectives_text.replace("\n", "<br/>"), body_style))
+        story.append(Spacer(1, 8))
+    # Cleaning steps
+    if cleaning_text:
+        story.append(Paragraph("Data Cleaning Audit Trail", h2_style))
+        story.append(Paragraph("Automated type inference, value imputation, and formatting constraints applied to input records:", body_style))
+        for raw in cleaning_text.split("\n"):
+            line = raw.strip().lstrip("-*• ").strip()
+            if line:
+                story.append(Paragraph(f"• &nbsp; {_md_to_html(line)}", bullet_style))
+        story.append(Spacer(1, 8))
+    # Relations
+    if relations_text:
+        story.append(Paragraph("Key Correlation &amp; Relationship Map", h2_style))
+        story.append(Paragraph("Direct associations identified across columns for target visualization selection:", body_style))
+        story.append(Spacer(1, 4))
+        tbl_data = [
+            [
+                Paragraph("<b>Variable X</b>", header_style),
+                Paragraph("<b>Variable Y</b>", header_style),
+                Paragraph("<b>Chart Type</b>", header_style),
+                Paragraph("<b>Business Context &amp; Relevance</b>", header_style)
+            ]
+        ]
+        for raw in relations_text.split("\n"):
+            line = raw.strip().lstrip("-*• ").strip()
+            if not line:
+                continue
+            parsed = _parse_relation_line(line)
+            tbl_data.append([
+                Paragraph(parsed["x"], body_style),
+                Paragraph(parsed["y"], body_style),
+                Paragraph(parsed["type"], body_style),
+                Paragraph(parsed["details"], body_style)
+            ])
+        if len(tbl_data) > 1:
+            rel_table = Table(tbl_data, colWidths=[95, 95, 95, 219])
+            rel_table.setStyle(TableStyle([
+                ("BACKGROUND",    (0, 0), (-1, 0),  colors.HexColor("#f1f5f9")),
+                ("BOX",           (0, 0), (-1, -1), 0.5, colors.HexColor("#cbd5e1")),
+                ("INNERGRID",     (0, 0), (-1, -1), 0.3, colors.HexColor("#cbd5e1")),
+                ("VALIGN",        (0, 0), (-1, -1), "TOP"),
+                ("TOPPADDING",    (0, 0), (-1, -1), 5),
+                ("BOTTOMPADDING", (0, 0), (-1, -1), 5),
+                ("LEFTPADDING",   (0, 0), (-1, -1), 6),
+                ("RIGHTPADDING",  (0, 0), (-1, -1), 6),
+            ]))
+            story.append(rel_table)
+        story.append(Spacer(1, 10))
+    story.append(Spacer(1, 15))
+    # ── 5. APPENDIX (Page 5+) ─────────────────────────────────────────────────
+    story.append(Paragraph("Appendix", h1_style))
+    # Per-column statistical summary
+    if df is not None and isinstance(df, pd.DataFrame):
+        story.append(Paragraph("Per-Column Statistical Summary", h2_style))
+        story.append(Paragraph(
+            "Detailed metric breakdowns for numeric distributions and categorical frequencies:",
+            body_style,
+        ))
+        story.append(Spacer(1, 4))
+        insight_flowables = _build_insights_table(df, body_style, header_style, primary_color, secondary_color)
+        if insight_flowables:
+            story.extend(insight_flowables)
+        story.append(Spacer(1, 10))
+    # Remaining/Unmatched Visualizations
+    unplaced_charts = [png for png in png_files if png not in placed_charts]
+    if unplaced_charts:
+        story.append(Paragraph("Additional Analytical Visualizations", h2_style))
+        story.append(Paragraph(
+            "Supplementary visual mappings of auxiliary data relationships:",
+            body_style,
+        ))
+        story.append(Spacer(1, 6))
+        for png_file in unplaced_charts:
+            try:
+                with PILImage.open(png_file) as img:
+                    orig_w, orig_h = img.size
+                max_w, max_h = 440, 240
+                aspect = orig_h / orig_w
+                if aspect > (max_h / max_w):
+                    new_h = max_h
+                    new_w = new_h / aspect
+                else:
+                    new_w = max_w
+                    new_h = new_w * aspect
+                fig_title = Paragraph(
+                    f"<b>Figure: {png_file.stem.replace('_', ' ').title()}</b>",
+                    ParagraphStyle("FigTitle", parent=body_style, fontName="Helvetica-Bold",
+                                   textColor=primary_color, spaceBefore=8, spaceAfter=4, keepWithNext=True),
+                )
+                img_flow  = Image(str(png_file), width=new_w, height=new_h)
+                img_table = Table([[img_flow]], colWidths=[504])
+                img_table.setStyle(TableStyle([
+                    ("ALIGN",         (0, 0), (-1, -1), "CENTER"),
+                    ("VALIGN",        (0, 0), (-1, -1), "MIDDLE"),
+                    ("BOX",           (0, 0), (-1, -1), 0.5, colors.HexColor("#cbd5e1")),
+                    ("BACKGROUND",    (0, 0), (-1, -1), colors.white),
+                    ("TOPPADDING",    (0, 0), (-1, -1), 6),
+                    ("BOTTOMPADDING", (0, 0), (-1, -1), 6),
+                ]))
+                story.append(KeepTogether([fig_title, img_table, Spacer(1, 10)]))
+            except Exception as exc:
+                story.append(Paragraph(f"Could not load image {png_file.name}: {exc}", body_style))
+    # Conclusion & Next steps styled box
+    conclusion_style = ParagraphStyle("ConclStyle", parent=body_style, fontSize=9.5, textColor=colors.HexColor("#1e293b"))
+    conclusion_content = [
+        Paragraph("<b>Executive Conclusion &amp; Next Steps</b>", ParagraphStyle("ConclHdr", parent=h2_style, fontSize=11, textColor=primary_color, spaceAfter=6)),
+        Paragraph(
+            "The automated data pipeline has successfully validated, cleaned, and evaluated the dataset "
+            "under the specified project guidelines. To maximize return on these insights, management is advised "
+            "to prioritize the Actionable Strategy recommendations outlined in the insights section, address the warnings "
+            "disclosed, and leverage the visual intelligence charts for stakeholder presentations.",
+            conclusion_style
+        )
+    ]
+    conclusion_table = Table([[conclusion_content]], colWidths=[504])
+    conclusion_table.setStyle(TableStyle([
+        ("BACKGROUND",    (0, 0), (-1, -1), colors.HexColor("#f8fafc")), # light slate gray background
+        ("BOX",           (0, 0), (-1, -1), 1, colors.HexColor("#cbd5e1")),
+        ("LINELEFT",      (0, 0), (0, -1),  4, primary_color), # accent left border
+        ("LEFTPADDING",   (0, 0), (-1, -1), 14),
+        ("RIGHTPADDING",  (0, 0), (-1, -1), 14),
+        ("TOPPADDING",    (0, 0), (-1, -1), 10),
+        ("BOTTOMPADDING", (0, 0), (-1, -1), 10),
+    ]))
+    story.append(Spacer(1, 15))
+    story.append(KeepTogether([conclusion_table]))
+    doc.build(story, canvasmaker=NumberedCanvas)
+    pdf_bytes = buffer.getvalue()
+    buffer.close()
+    return pdf_bytes
+# ---------------------------------------------------------------------------
+# Cached wrapper (used by app.py)
+# ---------------------------------------------------------------------------
+_pdf_cache: dict = {}
+def export_pdf_cached(
+    cache_key: str,
+    filename: str = "",
+    result_cleaning:  str = "",
+    result_relations: str = "",
+    result_insights:  str = "",
+    result_code:      str = "",
+    output_dir_str:   str = "outputs",
+    df_csv:           str = "",
+) -> bytes:
+    """
+    Build (or return cached) PDF bytes from serialized result components.
+    Uses an in-process dict cache keyed by content hash to avoid rebuilding
+    identical PDFs on every Streamlit rerun.
+    """
+    if cache_key in _pdf_cache:
+        return _pdf_cache[cache_key]
+    # Reconstruct DataFrame from CSV string
+    df = None
+    if df_csv:
+        try:
+            df = pd.read_csv(io.StringIO(df_csv))
+        except Exception:
+            df = None
+    result = {
+        "dataframe":      df,
+        "cleaning_steps": result_cleaning,
+        "relations":      result_relations,
+        "insights":       result_insights,
+        "code":           result_code,
+        "output_dir":     output_dir_str,
+    }
+    pdf_bytes = export_pdf(result, filename=filename)
+    _pdf_cache[cache_key] = pdf_bytes
+    return pdf_bytes