crewlyze 3.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48) hide show
  1. package/.dockerignore +12 -0
  2. package/.gitattributes +2 -0
  3. package/CHANGELOG.md +86 -0
  4. package/Dockerfile +21 -0
  5. package/LICENSE +21 -0
  6. package/README.md +139 -0
  7. package/USAGE.md +106 -0
  8. package/agents/__init__.py +0 -0
  9. package/agents/cleaner.py +38 -0
  10. package/agents/insights.py +44 -0
  11. package/agents/relation.py +36 -0
  12. package/agents/visualizer.py +41 -0
  13. package/assets/badge_crewai.svg +4 -0
  14. package/assets/badge_matplotlib.svg +4 -0
  15. package/assets/badge_ollama.svg +4 -0
  16. package/assets/badge_pandas.svg +4 -0
  17. package/assets/badge_seaborn.svg +4 -0
  18. package/assets/branding_image.png +0 -0
  19. package/assets/complete_workflow.svg +216 -0
  20. package/assets/favicon.png +0 -0
  21. package/assets/logo.png +0 -0
  22. package/assets/stars.svg +12 -0
  23. package/bin/crewlyze.js +79 -0
  24. package/config/README.md +129 -0
  25. package/config/__init__.py +1 -0
  26. package/config/context.py +16 -0
  27. package/config/llm_config.py +300 -0
  28. package/config/metrics_tracker.py +70 -0
  29. package/crew.py +870 -0
  30. package/crewlyze-3.1.0.tgz +0 -0
  31. package/fix_syntax.py +54 -0
  32. package/main.py +1279 -0
  33. package/package.json +22 -0
  34. package/pyproject.toml +32 -0
  35. package/requirements.txt +33 -0
  36. package/tools/__init__.py +0 -0
  37. package/tools/dataset_tools.py +803 -0
  38. package/ui/__init__.py +3 -0
  39. package/ui/copilot.py +200 -0
  40. package/ui/export.py +800 -0
  41. package/update_appjs.py +54 -0
  42. package/update_llm.py +21 -0
  43. package/update_main.py +20 -0
  44. package/web/app.js +3142 -0
  45. package/web/index.html +1105 -0
  46. package/web/style.css +2561 -0
  47. package/workflows/__init__.py +0 -0
  48. package/workflows/pipeline.py +254 -0
package/ui/export.py ADDED
@@ -0,0 +1,800 @@
1
+ # Crewlyze
2
+ # Copyright (c) 2025 Sowmiyan S
3
+ # Licensed under the MIT License
4
+
5
+ """
6
+ PDF export — professional executive report.
7
+
8
+ Improvements:
9
+ - Cover page: project title (filename) + timestamp.
10
+ - Data Insights section: per-column min/max/mean/median/std for numerics,
11
+ top categories for categoricals — placed after the dataset summary.
12
+ - No empty spacers for sections with no content.
13
+ - KeepTogether for insight cards to prevent orphaned headers.
14
+ - export_pdf_cached() wrapper used by app.py.
15
+ """
16
+
17
+ import io
18
+ import re
19
+ from datetime import datetime
20
+ from io import BytesIO
21
+ from pathlib import Path
22
+
23
+ import pandas as pd
24
+ from PIL import Image as PILImage
25
+ from reportlab.lib import colors
26
+ from reportlab.lib.pagesizes import letter
27
+ from reportlab.lib.styles import ParagraphStyle, getSampleStyleSheet
28
+ from reportlab.pdfgen import canvas
29
+ from reportlab.platypus import (
30
+ Image,
31
+ KeepTogether,
32
+ PageBreak,
33
+ Paragraph,
34
+ SimpleDocTemplate,
35
+ Spacer,
36
+ Table,
37
+ TableStyle,
38
+ )
39
+
40
+
41
+ # ---------------------------------------------------------------------------
42
+ # Two-pass Canvas — Page X of Y + Corporate Rules
43
+ # ---------------------------------------------------------------------------
44
+
45
class NumberedCanvas(canvas.Canvas):
    """Canvas that postpones page output so each page can print "Page X of Y".

    ``showPage`` only records the canvas state; ``save`` replays every recorded
    page once the total is known, drawing the header/footer decorations on top.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # One shallow copy of the canvas ``__dict__`` per completed page.
        self._saved_page_states = []

    def showPage(self):
        """Snapshot the finished page and start a new one without emitting it."""
        snapshot = dict(self.__dict__)
        self._saved_page_states.append(snapshot)
        self._startPage()

    def save(self):
        """Replay all recorded pages with decorations, then write the document."""
        total_pages = len(self._saved_page_states)
        for snapshot in self._saved_page_states:
            # Restore the page exactly as it was when showPage() was called.
            self.__dict__.update(snapshot)
            self.draw_page_decorations(total_pages)
            super().showPage()
        super().save()

    def draw_page_decorations(self, page_count):
        """Draw the watermark on every page and header/footer on pages after the first.

        Args:
            page_count: Total number of pages, used for the "Page X of Y" label.
        """
        self.saveState()

        # "crewlyze" watermark: top-left corner of every page, bold, red.
        self.setFillColor(colors.HexColor("#ff252a"))
        self.setFont("Helvetica-Bold", 10)
        self.drawString(54, 752, "crewlyze")

        # The title page (page 1) carries the watermark only.
        if self._pageNumber != 1:
            self.setFont("Helvetica-Bold", 8)
            self.setFillColor(colors.HexColor("#475569"))
            self.setStrokeColor(colors.HexColor("#cbd5e1"))
            self.setLineWidth(0.5)

            # Header rule and label, drawn above the body area (topMargin = 80).
            self.line(54, 745, 558, 745)
            self.drawRightString(558, 752, "EXECUTIVE ANALYSIS REPORT | CONFIDENTIAL")

            # Footer rule, credit and page counter, below the body (bottomMargin = 80).
            self.line(54, 65, 558, 65)
            self.setFont("Helvetica", 8)
            self.drawString(54, 50, "Generated by Crewlyze System")
            self.drawRightString(558, 50, f"Page {self._pageNumber} of {page_count}")

        self.restoreState()
90
+
91
+
92
+ # ---------------------------------------------------------------------------
93
+ # Markdown → ReportLab HTML
94
+ # ---------------------------------------------------------------------------
95
+
96
def _md_to_html(text: str) -> str:
    """Translate ``**bold**`` / ``*italic*`` Markdown into ReportLab inline tags.

    The ``&``, ``<`` and ``>`` characters are escaped, after which the four
    ``<b>``/``</b>``/``<i>``/``</i>`` tags are restored so they survive as markup.
    Returns an empty string for falsy input; the result is whitespace-stripped.
    """
    if not text:
        return ""

    # Markdown emphasis first: bold before italic so "**" is not eaten as "*".
    converted = re.sub(r"\*\*(.*?)\*\*", r"<b>\1</b>", text)
    converted = re.sub(r"\*(.*?)\*", r"<i>\1</i>", converted)

    # Escape XML-special characters ("&" must go first).
    for raw, entity in (("&", "&amp;"), ("<", "&lt;"), (">", "&gt;")):
        converted = converted.replace(raw, entity)

    # Un-escape the emphasis tags that the step above just neutralised.
    for escaped_tag, tag in (
        ("&lt;b&gt;", "<b>"),
        ("&lt;/b&gt;", "</b>"),
        ("&lt;i&gt;", "<i>"),
        ("&lt;/i&gt;", "</i>"),
    ):
        converted = converted.replace(escaped_tag, tag)

    return converted.strip()
111
+
112
+
113
def _clean_ai_artifacts(text: str) -> str:
    """Drop agent-log lines (``Thought:``, ``Action:``, ``Response:`` ...) from *text*.

    A line is removed when, after stripping whitespace, it starts with one of
    the known labels followed by a colon (case-insensitive). All other lines
    are kept verbatim; the joined result is whitespace-stripped.
    """
    if not text:
        return ""

    artifact_prefix = re.compile(
        r'^(thought|action|observation|route|call|api_key|response):\s*',
        re.IGNORECASE,
    )
    kept = [
        row for row in text.split("\n")
        if not artifact_prefix.match(row.strip())
    ]
    return "\n".join(kept).strip()
126
+
127
+
128
def _parse_insight_fields(text: str) -> dict:
    """Split one insight into its observation / implication / strategy parts.

    Each field is looked up first with its bold Markdown label
    (``**Observation**:``) and, if that yields nothing, with the plain label
    (``Observation:``). A field's text runs until the next known label or the
    end of the input.

    Returns:
        Dict with keys ``observation``, ``implication`` and ``strategy``.
        ``observation`` falls back to the whole cleaned text when no
        observation could be extracted; the other two default to ``""``.
    """
    text_clean = _clean_ai_artifacts(text)
    flags = re.DOTALL | re.IGNORECASE

    # field -> (strict bold-label pattern, loose plain-label pattern)
    patterns = {
        "observation": (
            r"\*\*Observation\*\*:\s*(.*?)(?=\*\*Business Implication\*\*|\*\*Actionable Strategy\*\*|$)",
            r"Observation:\s*(.*?)(?=Business Implication|Actionable Strategy|$)",
        ),
        "implication": (
            r"\*\*Business Implication\*\*:\s*(.*?)(?=\*\*Observation\*\*|\*\*Actionable Strategy\*\*|$)",
            r"Business Implication:\s*(.*?)(?=Observation|Actionable Strategy|$)",
        ),
        "strategy": (
            r"\*\*Actionable Strategy\*\*:\s*(.*?)(?=\*\*Observation\*\*|\*\*Business Implication\*\*|$)",
            r"Actionable Strategy:\s*(.*?)(?=Observation|Business Implication|$)",
        ),
    }

    extracted = {}
    for field, attempts in patterns.items():
        value = ""
        for pattern in attempts:
            hit = re.search(pattern, text_clean, flags)
            if hit:
                value = hit.group(1).strip()
            if value:
                break  # strict match succeeded; skip the loose fallback
        extracted[field] = value

    return {
        "observation": extracted["observation"] or text_clean,
        "implication": extracted["implication"],
        "strategy": extracted["strategy"],
    }
165
+
166
+
167
def _find_matching_chart(insight_text: str, png_files: list, placed_set: set):
    """Return the first unplaced chart whose filename keywords all occur in *insight_text*.

    The filename stem is lower-cased and split on underscores/whitespace;
    generic chart words ("vs", "plot", "scatter", ...) are ignored. A chart
    matches when at least one keyword remains and every remaining keyword is a
    substring of the lower-cased insight text. Returns ``None`` when nothing
    matches.
    """
    generic_words = ("vs", "plot", "chart", "scatter", "bar", "box", "line", "distribution", "correlation")
    haystack = insight_text.lower()

    candidates = (png for png in png_files if png not in placed_set)
    for candidate in candidates:
        tokens = [
            token
            for token in candidate.stem.lower().replace("_", " ").split()
            if token not in generic_words
        ]
        if not tokens:
            continue  # filename consists of generic words only
        if all(token in haystack for token in tokens):
            return candidate
    return None
180
+
181
+
182
def _parse_relation_line(line: str) -> dict:
    """Parse a ``X: a | Y: b | Type: t | Details: d`` relation line.

    Segments are separated by ``|``. A segment containing a colon is treated
    as ``key: value``; only the keys ``x``, ``y``, ``type`` and ``details``
    (case-insensitive) are kept. A non-empty segment without a colon becomes
    the ``details`` value, minus a leading ``"- "``. Missing fields stay "N/A".
    """
    parsed = {"x": "N/A", "y": "N/A", "type": "N/A", "details": "N/A"}

    for segment in line.split("|"):
        segment = segment.strip()

        if ":" not in segment:
            # Free text: strip a bullet marker and use it as the description.
            if segment.startswith("- "):
                segment = segment[2:]
            if segment:
                parsed["details"] = segment
            continue

        label, _, value = segment.partition(":")
        label = label.strip().lower()
        if label in parsed:
            parsed[label] = value.strip()

    return parsed
206
+
207
+
208
+ # ---------------------------------------------------------------------------
209
+ # Data Insights table builder
210
+ # ---------------------------------------------------------------------------
211
+
212
def _format_number(val) -> str:
    """Format a numeric value for display in the statistics tables.

    Rules, applied in order:
      * missing values (``pd.isna``) render as an em dash;
      * whole numbers render without decimals, with thousands separators;
      * non-zero magnitudes below 0.0001 use scientific notation (4 decimals);
      * everything else uses two decimals with trailing zeros stripped;
      * if two-decimal rounding would collapse a non-zero value to "0"/"-0",
        four significant digits are shown instead so the value is not hidden.

    Values that cannot be interpreted as a float are returned via ``str(val)``.
    """
    try:
        if pd.isna(val):
            return "—"
        val_float = float(val)

        # A float that is actually an integer (also covers exact zero).
        if val_float.is_integer():
            return f"{int(val_float):,}"

        # Very small magnitudes: scientific notation keeps them readable.
        if abs(val_float) < 0.0001 and val_float != 0:
            return f"{val_float:.4e}"

        formatted = f"{val_float:,.2f}"
        if "." in formatted:
            formatted = formatted.rstrip("0").rstrip(".")

        # Exact zero was handled above, so "0"/"-0" here means rounding wiped
        # out a small non-zero value (e.g. 0.004); show significant digits.
        if formatted in ("0", "-0"):
            formatted = f"{val_float:.4g}"
        return formatted
    except (TypeError, ValueError, OverflowError):
        # Non-numeric or otherwise unformattable input: show it as-is.
        return str(val)
231
+
232
+
233
def _build_insights_table(df: pd.DataFrame, body_style, header_style, primary_color, secondary_color) -> list:
    """Build the per-column statistics tables for the report appendix.

    Produces up to two tables:
      * numeric columns (first 20): min, max, mean, median, std dev, missing %;
      * object/category columns (first 10): top three values with counts,
        number of unique values, missing %.

    Args:
        df: Dataset to summarise.
        body_style: ParagraphStyle used for body cells (and as parent of the
            white header style created here).
        header_style: Accepted for interface compatibility; not used here.
        primary_color: Background colour of the header row.
        secondary_color: Colour of the rule below the header row.

    Returns:
        List of flowables; empty when the frame has neither numeric nor
        object/category columns.
    """
    # Paragraph text is parsed as XML-style markup, so data-derived strings
    # (column names, category values) must be escaped before being embedded.
    from xml.sax.saxutils import escape

    flowables = []

    numeric_cols = df.select_dtypes(include=["number"]).columns.tolist()
    cat_cols = df.select_dtypes(include=["object", "category"]).columns.tolist()

    table_header_style = ParagraphStyle(
        "TblHdrWht", parent=body_style,
        fontName="Helvetica-Bold", fontSize=9, leading=12,
        textColor=colors.white,
    )

    # Guard against division by zero on an empty frame.
    row_total = max(len(df), 1)

    def header_cells(labels):
        """Bold white header paragraphs for the given fixed labels."""
        return [Paragraph(f"<b>{label}</b>", table_header_style) for label in labels]

    def text_cell(value):
        """Body paragraph showing *value* literally (markup-escaped)."""
        return Paragraph(escape(str(value)), body_style)

    def missing_pct(series):
        """Share of null entries in *series*, in percent, one decimal."""
        return round(series.isnull().sum() / row_total * 100, 1)

    def grid_style():
        """Shared look for both statistics tables."""
        return TableStyle([
            ("BACKGROUND", (0, 0), (-1, 0), primary_color),
            ("LINEBELOW", (0, 0), (-1, 0), 1.2, secondary_color),
            ("BOX", (0, 0), (-1, -1), 0.5, colors.HexColor("#cbd5e1")),
            ("INNERGRID", (0, 0), (-1, -1), 0.3, colors.HexColor("#e2e8f0")),
            ("TOPPADDING", (0, 0), (-1, -1), 5),
            ("BOTTOMPADDING", (0, 0), (-1, -1), 5),
            ("LEFTPADDING", (0, 0), (-1, -1), 6),
            ("RIGHTPADDING", (0, 0), (-1, -1), 6),
            ("VALIGN", (0, 0), (-1, -1), "MIDDLE"),
            ("ROWBACKGROUNDS", (0, 1), (-1, -1), [colors.white, colors.HexColor("#f8fafc")]),
        ])

    # ── Numeric stats table ──────────────────────────────────────────────────
    if numeric_cols:
        rows = [header_cells(["Column", "Min", "Max", "Mean", "Median", "Std Dev", "Missing%"])]

        for col in numeric_cols[:20]:  # cap at 20 columns
            series = df[col]
            present = series.dropna()
            has_values = not present.empty
            rows.append([
                text_cell(col),
                text_cell(_format_number(present.min()) if has_values else "—"),
                text_cell(_format_number(present.max()) if has_values else "—"),
                text_cell(_format_number(present.mean()) if has_values else "—"),
                text_cell(_format_number(present.median()) if has_values else "—"),
                # Sample standard deviation needs at least two observations.
                text_cell(_format_number(present.std()) if len(present) > 1 else "—"),
                text_cell(f"{missing_pct(series)}%"),
            ])

        tbl = Table(rows, colWidths=[120, 55, 55, 55, 55, 55, 55])
        tbl.setStyle(grid_style())
        flowables.append(tbl)
        flowables.append(Spacer(1, 8))

    # ── Categorical top-values table ─────────────────────────────────────────
    if cat_cols:
        cat_rows = [header_cells(["Column", "Top Values (count)", "Unique", "Missing%"])]

        for col in cat_cols[:10]:  # cap at 10 columns
            series = df[col]
            top3 = series.value_counts().head(3)
            top_str = ", ".join(f"{v}({c})" for v, c in top3.items()) if not top3.empty else "—"
            cat_rows.append([
                text_cell(col),
                # Truncate before escaping so an entity is never cut in half.
                text_cell(top_str[:80]),
                text_cell(series.nunique()),
                text_cell(f"{missing_pct(series)}%"),
            ])

        cat_tbl = Table(cat_rows, colWidths=[100, 280, 55, 65])
        cat_tbl.setStyle(grid_style())
        flowables.append(cat_tbl)

    return flowables
330
+
331
+
332
+ # ---------------------------------------------------------------------------
333
+ # PDF Generation
334
+ # ---------------------------------------------------------------------------
335
+
336
def _escape_markup(text) -> str:
    """Return *text* with ``&``, ``<`` and ``>`` escaped for Paragraph markup."""
    from xml.sax.saxutils import escape

    return escape(str(text))


def _is_placeholder_warning(text: str) -> bool:
    """Return True when *text* merely states that there are no warnings.

    Matches any text containing "no warnings", or a short text that is just
    "none" (optionally bulleted / followed by a few words such as
    "none identified."). A real warning that happens to contain the letters
    "none" (e.g. "nonetheless") is not treated as a placeholder.
    """
    lowered = text.strip().lower()
    if "no warnings" in lowered:
        return True
    return re.fullmatch(r"[\s\-*•]*none\b[\w\s]{0,30}[.!]?\s*", lowered) is not None


def _scaled_image_table(png_path, max_w: float = 440, max_h: float = 240):
    """Return a bordered one-cell Table holding *png_path* scaled to fit.

    The image keeps its aspect ratio and is shrunk/grown to fit inside
    ``max_w`` x ``max_h`` points. Raises whatever PIL/ReportLab raise when the
    file cannot be read (callers report the failure inside the PDF).
    """
    with PILImage.open(png_path) as img:
        orig_w, orig_h = img.size

    aspect = orig_h / orig_w
    if aspect > (max_h / max_w):
        # Taller than the box: height is the limiting dimension.
        new_h = max_h
        new_w = new_h / aspect
    else:
        new_w = max_w
        new_h = new_w * aspect

    img_flow = Image(str(png_path), width=new_w, height=new_h)
    img_table = Table([[img_flow]], colWidths=[504])
    img_table.setStyle(TableStyle([
        ("ALIGN", (0, 0), (-1, -1), "CENTER"),
        ("VALIGN", (0, 0), (-1, -1), "MIDDLE"),
        ("BOX", (0, 0), (-1, -1), 0.5, colors.HexColor("#cbd5e1")),
        ("BACKGROUND", (0, 0), (-1, -1), colors.white),
        ("TOPPADDING", (0, 0), (-1, -1), 6),
        ("BOTTOMPADDING", (0, 0), (-1, -1), 6),
    ]))
    return img_table


def export_pdf(result: dict, filename: str = "") -> bytes:
    """Build and return a professional executive PDF report.

    Args:
        result: Analysis output. Keys read here: ``insights`` (Markdown with
            ``### `` section headers), ``cleaning_steps``, ``relations``,
            ``report_title``, ``goal``, ``dataframe`` (pandas DataFrame or
            None) and ``output_dir`` (folder scanned for ``*.png`` charts,
            default ``outputs``).
        filename: Name of the analysed dataset; shown on the cover page and
            used as the title when ``report_title`` is absent.

    Returns:
        The rendered PDF as bytes.

    All free text is escaped (or passed through ``_md_to_html``) before being
    embedded in Paragraph markup, so stray ``<`` / ``&`` characters in model
    output or file names cannot break the build.
    """
    buffer = BytesIO()
    doc = SimpleDocTemplate(
        buffer,
        pagesize=letter,
        rightMargin=54,
        leftMargin=54,
        topMargin=80,     # room for the canvas-drawn header
        bottomMargin=80,  # room for the canvas-drawn footer
    )
    story = []
    styles = getSampleStyleSheet()

    primary_color = colors.HexColor("#4a0404")
    secondary_color = colors.HexColor("#ff252a")
    text_color = colors.HexColor("#0f172a")
    muted_color = colors.HexColor("#475569")

    title_style = ParagraphStyle("DocTitle", parent=styles["Normal"],
                                 fontName="Helvetica-Bold", fontSize=24, leading=30,
                                 textColor=primary_color, spaceAfter=4)
    subtitle_style = ParagraphStyle("DocSubTitle", parent=styles["Normal"],
                                    fontName="Helvetica", fontSize=11, leading=15,
                                    textColor=secondary_color, spaceAfter=6)
    meta_style = ParagraphStyle("Meta", parent=styles["Normal"],
                                fontName="Helvetica", fontSize=9, leading=13,
                                textColor=muted_color, spaceAfter=12)
    h1_style = ParagraphStyle("H1", parent=styles["Normal"],
                              fontName="Helvetica-Bold", fontSize=15, leading=20,
                              textColor=primary_color, spaceBefore=16, spaceAfter=8, keepWithNext=True)
    h2_style = ParagraphStyle("H2", parent=styles["Normal"],
                              fontName="Helvetica-Bold", fontSize=11, leading=15,
                              textColor=secondary_color, spaceBefore=10, spaceAfter=4, keepWithNext=True)
    body_style = ParagraphStyle("Body", parent=styles["Normal"],
                                fontName="Helvetica", fontSize=10, leading=15,
                                textColor=text_color, spaceAfter=5)
    header_style = ParagraphStyle("TblHdr", parent=styles["Normal"],
                                  fontName="Helvetica-Bold", fontSize=9, leading=12,
                                  textColor=primary_color)
    bullet_style = ParagraphStyle("Bullet", parent=styles["Normal"],
                                  fontName="Helvetica", fontSize=9.5, leading=14,
                                  textColor=text_color, leftIndent=14, firstLineIndent=-10, spaceAfter=7)

    # ── Clean all inputs from AI reasoning artifacts ──────────────────────────
    raw_insights = _clean_ai_artifacts(result.get("insights", "")).strip()
    cleaning_text = _clean_ai_artifacts(result.get("cleaning_steps", "")).strip()
    relations_text = _clean_ai_artifacts(result.get("relations", "")).strip()
    report_title = _clean_ai_artifacts(result.get("report_title") or filename or "Executive Analysis Report").strip()
    report_goal = _clean_ai_artifacts(result.get("goal") or "").strip()
    timestamp = datetime.now().strftime("%B %d, %Y at %I:%M %p")

    # ── 1. TITLE PAGE (Page 1) ────────────────────────────────────────────────
    title_center_style = ParagraphStyle("TitleCenter", parent=title_style, alignment=1, fontSize=26, leading=32)
    subtitle_center_style = ParagraphStyle("SubCenter", parent=subtitle_style, alignment=1, fontSize=12, leading=16)
    meta_center_style = ParagraphStyle("MetaCenter", parent=meta_style, alignment=1, fontSize=10, leading=14)

    story.append(Spacer(1, 140))
    # Upper-case first, then escape, so entities are not upper-cased.
    story.append(Paragraph(f"<b>{_escape_markup(report_title.upper())}</b>", title_center_style))
    story.append(Spacer(1, 10))
    story.append(Paragraph("Autonomous Business Intelligence &amp; Executive Analysis Suite", subtitle_center_style))
    story.append(Spacer(1, 40))

    story.append(Paragraph(f"Dataset Analyzed: <b>{_escape_markup(filename or 'dataset.csv')}</b>", meta_center_style))
    story.append(Paragraph(f"Generated On: {timestamp}", meta_center_style))

    if report_goal:
        goal_style = ParagraphStyle("GoalStyleCenter", parent=body_style, fontName="Helvetica-Oblique",
                                    fontSize=9.5, textColor=muted_color, alignment=1)
        story.append(Spacer(1, 30))
        story.append(Paragraph(f"<b>Core Objective:</b> {_escape_markup(report_goal)}", goal_style))

    story.append(PageBreak())

    # ── Parse "### "-delimited sections from the structured insights ─────────
    objectives_text = ""
    stats_text = ""  # parsed so "stat" headers are not misfiled; not rendered below
    strategic_text = ""
    warnings_text = ""

    if raw_insights:
        for sec in re.split(r"###\s+", raw_insights):
            lines = sec.split("\n")
            if not lines or not lines[0].strip():
                continue
            header = lines[0].strip().lower()
            content = "\n".join(lines[1:]).strip()

            if "objective" in header or "goal" in header:
                objectives_text = content
            elif "stat" in header:
                stats_text = content
            elif "insight" in header:
                strategic_text = content
            elif "warning" in header or "alert" in header:
                warnings_text = content

        # Unstructured output: treat the whole text as the insights section.
        if not strategic_text and not objectives_text:
            strategic_text = raw_insights

    # ── 2. EXECUTIVE SUMMARY & DATASET OVERVIEW (Page 2) ──────────────────────
    story.append(Paragraph("Executive Summary &amp; Dataset Overview", h1_style))
    story.append(Paragraph(
        "This autonomous executive analysis report presents high-value strategic recommendations, "
        "data quality cleaning trails, and key visualizations derived from the uploaded dataset.",
        body_style,
    ))
    story.append(Spacer(1, 8))

    df = result.get("dataframe")
    has_df = isinstance(df, pd.DataFrame)
    if has_df:
        cols_preview = f"{', '.join(str(c) for c in df.columns[:6])}{'...' if len(df.columns) > 6 else ''}"
        numeric_count = len(df.select_dtypes(include=["number"]).columns)
        cat_count = len(df.select_dtypes(include=["object", "category"]).columns)

        box_data = [
            [Paragraph("<b>Dataset Summary Metrics</b>", h2_style), Paragraph("", body_style)],
            [Paragraph("Total Records Analyzed", body_style), Paragraph(f"<b>{df.shape[0]:,}</b>", body_style)],
            [Paragraph("Total Columns", body_style), Paragraph(f"<b>{df.shape[1]}</b>", body_style)],
            [Paragraph("Numeric Columns", body_style), Paragraph(f"<b>{numeric_count}</b>", body_style)],
            [Paragraph("Categorical Columns", body_style), Paragraph(f"<b>{cat_count}</b>", body_style)],
            [Paragraph("Columns Sampled", body_style), Paragraph(_escape_markup(cols_preview), body_style)],
        ]
        summary_box = Table(box_data, colWidths=[160, 344])
        summary_box.setStyle(TableStyle([
            ("BACKGROUND", (0, 0), (-1, -1), colors.HexColor("#f8fafc")),
            ("BOX", (0, 0), (-1, -1), 1, colors.HexColor("#cbd5e1")),
            ("LINEBELOW", (0, 0), (-1, 0), 1.5, secondary_color),
            ("TOPPADDING", (0, 0), (-1, -1), 6),
            ("BOTTOMPADDING", (0, 0), (-1, -1), 6),
            ("LEFTPADDING", (0, 0), (-1, -1), 12),
            ("RIGHTPADDING", (0, 0), (-1, -1), 12),
            ("VALIGN", (0, 0), (-1, -1), "MIDDLE"),
        ]))
        story.append(summary_box)
        story.append(Spacer(1, 15))

    story.append(Spacer(1, 15))

    # ── 3. STRATEGIC BUSINESS INSIGHTS & PAIRED CHARTS (Page 3+) ──────────────
    # Locate all saved charts; sorted so placement is deterministic across runs.
    output_dir = result.get("output_dir") or Path("outputs")
    png_files = sorted(Path(output_dir).glob("*.png"))
    placed_charts = set()

    if strategic_text:
        story.append(Paragraph("Strategic Business Insights", h1_style))
        story.append(Paragraph(
            "Below are the critical business insights identified from the dataset, paired directly with "
            "relevant charts indicating data correlations.",
            body_style,
        ))
        story.append(Spacer(1, 8))

        # NOTE(review): this also splits on "<digits>. " inside a sentence
        # (e.g. "... in 2023. The ..."); kept as-is because inline numbered
        # lists rely on it.
        insight_items = re.split(r"\d+\.\s+", strategic_text)
        insight_count = 0

        # Loop-invariant styles for the three-column insight card.
        obs_hdr_style = ParagraphStyle("ObsHdr", parent=header_style, textColor=colors.HexColor("#0c4a6e"))
        imp_hdr_style = ParagraphStyle("ImpHdr", parent=header_style, textColor=colors.HexColor("#581c87"))
        strat_hdr_style = ParagraphStyle("StratHdr", parent=header_style, textColor=colors.HexColor("#14532d"))
        insight_label_style = ParagraphStyle("InsLbl", parent=body_style, fontName="Helvetica-Bold", fontSize=11,
                                             textColor=primary_color, spaceBefore=10, spaceAfter=4,
                                             keepWithNext=True)
        paired_fig_style = ParagraphStyle("FigTitle", parent=body_style, fontName="Helvetica-Oblique",
                                          fontSize=8.5, textColor=muted_color, spaceBefore=4, spaceAfter=2)

        for item in insight_items:
            item = item.strip()
            if not item:
                continue
            insight_count += 1
            fields = _parse_insight_fields(item)

            # Use the first line as the insight title unless it is a field label.
            title = ""
            first_line = item.split("\n")[0].strip()
            if first_line and not any(k in first_line for k in ("Observation", "Implication", "Strategy")):
                title = first_line.replace("**", "").replace("<b>", "").replace("</b>", "").strip()

            if title:
                lbl_text = f"<b>Insight {insight_count}: {_escape_markup(title)}</b>"
            else:
                lbl_text = f"<b>Insight {insight_count}</b>"
            lbl_para = Paragraph(lbl_text, insight_label_style)

            # Three-column card: observation / implication / strategy.
            cell_obs = Paragraph(_md_to_html(fields["observation"]), body_style)
            cell_imp = Paragraph(_md_to_html(fields["implication"] or "N/A"), body_style)
            cell_strat = Paragraph(_md_to_html(fields["strategy"] or "N/A"), body_style)

            tbl_data = [
                [
                    Paragraph("<b>Observation</b>", obs_hdr_style),
                    Paragraph("<b>Business Implication</b>", imp_hdr_style),
                    Paragraph("<b>Actionable Strategy</b>", strat_hdr_style),
                ],
                [cell_obs, cell_imp, cell_strat],
            ]

            card_table = Table(tbl_data, colWidths=[160, 172, 172])
            card_table.setStyle(TableStyle([
                ("BACKGROUND", (0, 0), (0, 0), colors.HexColor("#f0f9ff")),  # sky blue
                ("BACKGROUND", (1, 0), (1, 0), colors.HexColor("#faf5ff")),  # purple
                ("BACKGROUND", (2, 0), (2, 0), colors.HexColor("#f0fdf4")),  # green
                ("BOX", (0, 0), (-1, -1), 0.5, colors.HexColor("#cbd5e1")),
                ("INNERGRID", (0, 0), (-1, -1), 0.3, colors.HexColor("#e2e8f0")),
                ("VALIGN", (0, 0), (-1, -1), "TOP"),
                ("TOPPADDING", (0, 0), (-1, -1), 6),
                ("BOTTOMPADDING", (0, 0), (-1, -1), 6),
                ("LEFTPADDING", (0, 0), (-1, -1), 8),
                ("RIGHTPADDING", (0, 0), (-1, -1), 8),
            ]))

            # Pair the insight with a chart whose filename keywords it mentions.
            matched_png = _find_matching_chart(item, png_files, placed_charts)
            fig_title = None
            img_table = None

            if matched_png:
                try:
                    img_table = _scaled_image_table(matched_png)
                    caption = _escape_markup(matched_png.stem.replace("_", " ").title())
                    fig_title = Paragraph(f"<i>Supporting Figure: {caption}</i>", paired_fig_style)
                    placed_charts.add(matched_png)
                except Exception as exc:
                    # Best effort: a broken chart must not abort the report.
                    img_table = Paragraph(
                        _escape_markup(f"Could not load matching image {matched_png.name}: {exc}"),
                        body_style,
                    )

            story.append(KeepTogether([lbl_para, card_table]))
            if matched_png:
                story.append(Spacer(1, 4))
                if fig_title is not None and img_table is not None:
                    story.append(KeepTogether([fig_title, img_table]))
                elif img_table is not None:
                    story.append(KeepTogether([img_table]))
            story.append(Spacer(1, 14))

        story.append(Spacer(1, 10))

    # ── Warnings & Alerts (if present) ────────────────────────────────────────
    if warnings_text and not _is_placeholder_warning(warnings_text):
        story.append(Paragraph("Business Risks &amp; Critical Alerts", h1_style))
        warn_style = ParagraphStyle("WarnStyle", parent=body_style, fontSize=9.5,
                                    textColor=colors.HexColor("#991b1b"))
        warning_content = [
            Paragraph(_md_to_html(warnings_text).replace("\n", "<br/>"), warn_style)
        ]
        warning_table = Table([[warning_content]], colWidths=[504])
        warning_table.setStyle(TableStyle([
            ("BACKGROUND", (0, 0), (-1, -1), colors.HexColor("#fef2f2")),
            ("LINELEFT", (0, 0), (0, -1), 4, colors.HexColor("#f43f5e")),
            ("LEFTPADDING", (0, 0), (-1, -1), 12),
            ("RIGHTPADDING", (0, 0), (-1, -1), 12),
            ("TOPPADDING", (0, 0), (-1, -1), 8),
            ("BOTTOMPADDING", (0, 0), (-1, -1), 8),
            ("BOX", (0, 0), (-1, -1), 0.5, colors.HexColor("#fee2e2")),
        ]))
        story.append(KeepTogether([warning_table, Spacer(1, 12)]))

    story.append(Spacer(1, 15))

    # ── 4. DATA SUMMARY & METHODOLOGY (Page 4+) ───────────────────────────────
    story.append(Paragraph("Data Summary &amp; Methodology", h1_style))

    # Project objectives
    if objectives_text:
        story.append(Paragraph("Project Objectives &amp; Scope", h2_style))
        story.append(Paragraph(_md_to_html(objectives_text).replace("\n", "<br/>"), body_style))
        story.append(Spacer(1, 8))

    # Cleaning steps, one bullet per non-empty line
    if cleaning_text:
        story.append(Paragraph("Data Cleaning Audit Trail", h2_style))
        story.append(Paragraph(
            "Automated type inference, value imputation, and formatting constraints applied to input records:",
            body_style,
        ))
        for raw in cleaning_text.split("\n"):
            line = raw.strip().lstrip("-*• ").strip()
            if line:
                story.append(Paragraph(f"• &nbsp; {_md_to_html(line)}", bullet_style))
        story.append(Spacer(1, 8))

    # Relations table
    if relations_text:
        story.append(Paragraph("Key Correlation &amp; Relationship Map", h2_style))
        story.append(Paragraph(
            "Direct associations identified across columns for target visualization selection:",
            body_style,
        ))
        story.append(Spacer(1, 4))

        tbl_data = [
            [
                Paragraph("<b>Variable X</b>", header_style),
                Paragraph("<b>Variable Y</b>", header_style),
                Paragraph("<b>Chart Type</b>", header_style),
                Paragraph("<b>Business Context &amp; Relevance</b>", header_style),
            ]
        ]

        for raw in relations_text.split("\n"):
            line = raw.strip().lstrip("-*• ").strip()
            if not line:
                continue
            parsed = _parse_relation_line(line)
            tbl_data.append([
                Paragraph(_md_to_html(parsed[key]), body_style)
                for key in ("x", "y", "type", "details")
            ])

        if len(tbl_data) > 1:
            rel_table = Table(tbl_data, colWidths=[95, 95, 95, 219])
            rel_table.setStyle(TableStyle([
                ("BACKGROUND", (0, 0), (-1, 0), colors.HexColor("#f1f5f9")),
                ("BOX", (0, 0), (-1, -1), 0.5, colors.HexColor("#cbd5e1")),
                ("INNERGRID", (0, 0), (-1, -1), 0.3, colors.HexColor("#cbd5e1")),
                ("VALIGN", (0, 0), (-1, -1), "TOP"),
                ("TOPPADDING", (0, 0), (-1, -1), 5),
                ("BOTTOMPADDING", (0, 0), (-1, -1), 5),
                ("LEFTPADDING", (0, 0), (-1, -1), 6),
                ("RIGHTPADDING", (0, 0), (-1, -1), 6),
            ]))
            story.append(rel_table)
            story.append(Spacer(1, 10))

    story.append(Spacer(1, 15))

    # ── 5. APPENDIX (Page 5+) ─────────────────────────────────────────────────
    story.append(Paragraph("Appendix", h1_style))

    # Per-column statistical summary
    if has_df:
        story.append(Paragraph("Per-Column Statistical Summary", h2_style))
        story.append(Paragraph(
            "Detailed metric breakdowns for numeric distributions and categorical frequencies:",
            body_style,
        ))
        story.append(Spacer(1, 4))
        insight_flowables = _build_insights_table(df, body_style, header_style, primary_color, secondary_color)
        if insight_flowables:
            story.extend(insight_flowables)
        story.append(Spacer(1, 10))

    # Charts that were not paired with any insight above
    unplaced_charts = [png for png in png_files if png not in placed_charts]
    if unplaced_charts:
        story.append(Paragraph("Additional Analytical Visualizations", h2_style))
        story.append(Paragraph(
            "Supplementary visual mappings of auxiliary data relationships:",
            body_style,
        ))
        story.append(Spacer(1, 6))

        appendix_fig_style = ParagraphStyle("FigTitle", parent=body_style, fontName="Helvetica-Bold",
                                            textColor=primary_color, spaceBefore=8, spaceAfter=4,
                                            keepWithNext=True)

        for png_file in unplaced_charts:
            try:
                img_table = _scaled_image_table(png_file)
                caption = _escape_markup(png_file.stem.replace("_", " ").title())
                fig_title = Paragraph(f"<b>Figure: {caption}</b>", appendix_fig_style)
                story.append(KeepTogether([fig_title, img_table, Spacer(1, 10)]))
            except Exception as exc:
                # Best effort: note the failure in the report and carry on.
                story.append(Paragraph(
                    _escape_markup(f"Could not load image {png_file.name}: {exc}"),
                    body_style,
                ))

    # Conclusion & next steps styled box
    conclusion_style = ParagraphStyle("ConclStyle", parent=body_style, fontSize=9.5,
                                      textColor=colors.HexColor("#1e293b"))
    conclusion_content = [
        Paragraph(
            "<b>Executive Conclusion &amp; Next Steps</b>",
            ParagraphStyle("ConclHdr", parent=h2_style, fontSize=11, textColor=primary_color, spaceAfter=6),
        ),
        Paragraph(
            "The automated data pipeline has successfully validated, cleaned, and evaluated the dataset "
            "under the specified project guidelines. To maximize return on these insights, management is advised "
            "to prioritize the Actionable Strategy recommendations outlined in the insights section, address the warnings "
            "disclosed, and leverage the visual intelligence charts for stakeholder presentations.",
            conclusion_style,
        ),
    ]
    conclusion_table = Table([[conclusion_content]], colWidths=[504])
    conclusion_table.setStyle(TableStyle([
        ("BACKGROUND", (0, 0), (-1, -1), colors.HexColor("#f8fafc")),  # light slate gray background
        ("BOX", (0, 0), (-1, -1), 1, colors.HexColor("#cbd5e1")),
        ("LINELEFT", (0, 0), (0, -1), 4, primary_color),  # accent left border
        ("LEFTPADDING", (0, 0), (-1, -1), 14),
        ("RIGHTPADDING", (0, 0), (-1, -1), 14),
        ("TOPPADDING", (0, 0), (-1, -1), 10),
        ("BOTTOMPADDING", (0, 0), (-1, -1), 10),
    ]))
    story.append(Spacer(1, 15))
    story.append(KeepTogether([conclusion_table]))

    try:
        # NumberedCanvas performs the second pass that adds "Page X of Y".
        doc.build(story, canvasmaker=NumberedCanvas)
        return buffer.getvalue()
    finally:
        buffer.close()
754
+
755
+
756
+ # ---------------------------------------------------------------------------
757
+ # Cached wrapper (used by app.py)
758
+ # ---------------------------------------------------------------------------
759
+
760
_pdf_cache: dict = {}

# Upper bound on retained PDFs. Each entry holds a full document in memory,
# so the oldest entry is evicted once this many are stored.
_PDF_CACHE_MAX_ENTRIES = 16


def export_pdf_cached(
    cache_key: str,
    filename: str = "",
    result_cleaning: str = "",
    result_relations: str = "",
    result_insights: str = "",
    result_code: str = "",
    output_dir_str: str = "outputs",
    df_csv: str = "",
) -> bytes:
    """Build (or return cached) PDF bytes from serialized result components.

    Args:
        cache_key: Key identifying this report in the in-process cache. The
            caller is expected to derive it from the report content
            (presumably a content hash; verify against the caller).
        filename: Dataset name forwarded to ``export_pdf``.
        result_cleaning: Cleaning-steps text.
        result_relations: Relations text.
        result_insights: Insights Markdown.
        result_code: Generated code; stored under ``"code"`` in the result
            dict passed to ``export_pdf``.
        output_dir_str: Folder holding the chart PNGs.
        df_csv: Dataset serialized as CSV; an empty or unparsable value
            produces a report without the dataframe-based sections.

    Returns:
        The PDF document as bytes.
    """
    cached = _pdf_cache.get(cache_key)
    if cached is not None:
        return cached

    # Reconstruct the DataFrame from its CSV string.
    df = None
    if df_csv:
        try:
            df = pd.read_csv(io.StringIO(df_csv))
        except Exception:
            # Deliberate best effort: the report is still useful without the
            # dataset summary and statistics tables.
            df = None

    result = {
        "dataframe": df,
        "cleaning_steps": result_cleaning,
        "relations": result_relations,
        "insights": result_insights,
        "code": result_code,
        "output_dir": output_dir_str,
    }

    pdf_bytes = export_pdf(result, filename=filename)

    # Dicts preserve insertion order, so the first key is the oldest entry.
    while len(_pdf_cache) >= _PDF_CACHE_MAX_ENTRIES:
        _pdf_cache.pop(next(iter(_pdf_cache)))
    _pdf_cache[cache_key] = pdf_bytes
    return pdf_bytes