crewlyze 3.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48) hide show
  1. package/.dockerignore +12 -0
  2. package/.gitattributes +2 -0
  3. package/CHANGELOG.md +86 -0
  4. package/Dockerfile +21 -0
  5. package/LICENSE +21 -0
  6. package/README.md +139 -0
  7. package/USAGE.md +106 -0
  8. package/agents/__init__.py +0 -0
  9. package/agents/cleaner.py +38 -0
  10. package/agents/insights.py +44 -0
  11. package/agents/relation.py +36 -0
  12. package/agents/visualizer.py +41 -0
  13. package/assets/badge_crewai.svg +4 -0
  14. package/assets/badge_matplotlib.svg +4 -0
  15. package/assets/badge_ollama.svg +4 -0
  16. package/assets/badge_pandas.svg +4 -0
  17. package/assets/badge_seaborn.svg +4 -0
  18. package/assets/branding_image.png +0 -0
  19. package/assets/complete_workflow.svg +216 -0
  20. package/assets/favicon.png +0 -0
  21. package/assets/logo.png +0 -0
  22. package/assets/stars.svg +12 -0
  23. package/bin/crewlyze.js +79 -0
  24. package/config/README.md +129 -0
  25. package/config/__init__.py +1 -0
  26. package/config/context.py +16 -0
  27. package/config/llm_config.py +300 -0
  28. package/config/metrics_tracker.py +70 -0
  29. package/crew.py +870 -0
  30. package/crewlyze-3.1.0.tgz +0 -0
  31. package/fix_syntax.py +54 -0
  32. package/main.py +1279 -0
  33. package/package.json +22 -0
  34. package/pyproject.toml +32 -0
  35. package/requirements.txt +33 -0
  36. package/tools/__init__.py +0 -0
  37. package/tools/dataset_tools.py +803 -0
  38. package/ui/__init__.py +3 -0
  39. package/ui/copilot.py +200 -0
  40. package/ui/export.py +800 -0
  41. package/update_appjs.py +54 -0
  42. package/update_llm.py +21 -0
  43. package/update_main.py +20 -0
  44. package/web/app.js +3142 -0
  45. package/web/index.html +1105 -0
  46. package/web/style.css +2561 -0
  47. package/workflows/__init__.py +0 -0
  48. package/workflows/pipeline.py +254 -0
package/main.py ADDED
@@ -0,0 +1,1279 @@
1
+ # Crewlyze
2
+ # Copyright (c) 2025 Sowmiyan S
3
+ # Licensed under the MIT License
4
+
5
+ """
6
+ FastAPI Server backend for the Crewlyze application.
7
+ Serves static HTML/JS/CSS assets and exposes REST APIs + Server-Sent Events (SSE)
8
+ for streaming real-time analysis logs.
9
+ """
10
+
11
+ import os
12
+ import sys
13
+
14
+ try:
15
+ sys.stdout.reconfigure(encoding='utf-8')
16
+ sys.stderr.reconfigure(encoding='utf-8')
17
+ except Exception:
18
+ pass
19
+
20
+ import json
21
+ import re
22
+ import uuid
23
+ import asyncio
24
+ import shutil
25
+ import threading
26
+ import time
27
+ import zipfile
28
+ from io import BytesIO
29
+ from pathlib import Path
30
+ from typing import Optional
31
+
32
+ import pandas as pd
33
+ from tools.dataset_tools import read_csv_robust
34
+
35
+ # Monkey patch crewai caching to avoid Nvidia NIM / LiteLLM validation errors
36
+ try:
37
+ import crewai.llms.cache as _crewai_cache
38
+ _crewai_cache.mark_cache_breakpoint = lambda msg: msg
39
+ except Exception:
40
+ pass
41
+
42
+ from fastapi import FastAPI, File, UploadFile, Form, BackgroundTasks, HTTPException
43
+ from fastapi.responses import StreamingResponse, FileResponse, HTMLResponse
44
+ from fastapi.staticfiles import StaticFiles
45
+ from fastapi.middleware.cors import CORSMiddleware
46
+
47
# Matches ANSI terminal escape sequences so they can be removed from log lines.
ANSI_ESCAPE = re.compile(r'\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])')

# Per-session streaming state, keyed by session id. Currently only tracks
# whether the stream is inside a prompt dump that should be hidden.
log_stream_states = {}


def clean_log_message(line: str, session_id: Optional[str] = None) -> Optional[str]:
    """Turn one raw log line into a display-ready message, or drop it.

    ANSI escapes are removed and the line is trimmed. Blank lines, lines
    containing a known noise keyword, prompt dumps (tracked per session) and
    raw ``[DEBUG]:`` / ``[INFO]:`` lines are suppressed. Thought / Action /
    Action Input / Response / Observation lines get a bracketed label, and
    long inputs and tool responses are cut to 150 characters.

    Args:
        line: Raw line read from the log.
        session_id: When given, enables the stateful prompt-block filter
            using ``log_stream_states[session_id]``.

    Returns:
        The formatted message, or ``None`` when the line should not be shown.
    """
    text = ANSI_ESCAPE.sub('', line).strip()
    if not text:
        return None

    lowered = text.lower()

    # Substrings that mark a line as system noise.
    noisy_fragments = (
        "scriptruncontext",
        "telemetry_opt_out",
        "otel_sdk_disabled",
        "opentelemetry",
        "urllib3",
        "connectionpool",
        "http/1.1",
        "httpx",
        "backoff",
        "requests.packages",
        "missing scriptruncontext",
        "openai-api-keyword",
        "http request",
        "cooldown",
        "rate limit",
        "max_tokens",
    )
    for fragment in noisy_fragments:
        if fragment in lowered:
            return None

    if session_id:
        stream_state = log_stream_states.setdefault(session_id, {"in_prompt": False})

        # A prompt dump begins: hide this line and everything after it until
        # one of the resume markers shows up.
        prompt_markers = ("prompt after formatting:", "use the following format:")
        if any(marker in lowered for marker in prompt_markers):
            stream_state["in_prompt"] = True
            return None

        if stream_state["in_prompt"]:
            resume_markers = (
                "thought:",
                "action:",
                "action input:",
                "response:",
                "observation:",
                "entering new",
                "finished chain",
            )
            if not any(marker in lowered for marker in resume_markers):
                return None  # still inside the prompt dump
            stream_state["in_prompt"] = False

    # Raw framework debug/info lines are dropped, except the agent announcement.
    if text.startswith(("[DEBUG]:", "[INFO]:")):
        if "working agent" not in lowered:
            return None
        agent = text.split(":", 2)[-1].strip()
        return f"[Agent] {agent} is active..."

    if "entering new crewagentexecutor chain" in lowered:
        return "[Task] Starting agent execution task..."
    if "finished chain" in lowered:
        return "[Task] Execution task completed."

    if text.startswith("Thought:"):
        return f"[Thought] {text[len('Thought:'):].strip()}"

    if text.startswith("Action:"):
        return f"[Calling Tool] {text[len('Action:'):].strip()}"

    if text.startswith("Action Input:"):
        payload = text[len("Action Input:"):].strip()
        if len(payload) > 150:
            payload = payload[:150] + "..."
        return f"[Input] {payload}"

    if text.startswith(("Response:", "Observation:")):
        reply = text.split(":", 1)[1].strip()
        if len(reply) > 150:
            reply = reply[:150] + "..."
        return f"[Tool Response] {reply}"

    # "error" wins over "warning" when both words appear.
    if "error" in lowered:
        return f"[Error] {text}"
    if "warning" in lowered:
        return f"[Warning] {text}"

    return text
146
+
147
# The analysis entry points are resolved on first use rather than at import
# time, so the server can start even when the crewai stack fails to import.
# Any ImportError is raised by _load_crew() when it is first called.
_run_crew = None
_apply_runtime_llm_settings = None
_validate_llm_connection = None
_run_copilot_query = None
_export_pdf = None


def _load_crew():
    """Import the analysis entry points once and cache them in module globals.

    Does nothing when ``_run_crew`` is already set. The globals are only
    assigned after every import has succeeded.
    """
    global _run_crew, _apply_runtime_llm_settings, _validate_llm_connection
    global _run_copilot_query, _export_pdf

    if _run_crew is not None:
        return

    from crew import run_crew
    from config.llm_config import apply_runtime_llm_settings, validate_llm_connection
    from ui.copilot import run_copilot_query
    from ui.export import export_pdf

    _run_crew = run_crew
    _apply_runtime_llm_settings = apply_runtime_llm_settings
    _validate_llm_connection = validate_llm_connection
    _run_copilot_query = run_copilot_query
    _export_pdf = export_pdf
169
+
170
# Telemetry opt-out flags, set before the application object is created.
# NOTE(review): presumably consumed by CrewAI / OpenTelemetry - confirm.
os.environ.update(
    CREWAI_TELEMETRY_OPT_OUT="true",
    OTEL_SDK_DISABLED="true",
)

app = FastAPI(
    title="Crewlyze API",
    description="Autonomous Multi-Agent Business Intelligence and Data Engineering Platform",
    version="3.1.0",
)

# Fully permissive CORS, intended for local development.
# NOTE(review): wildcard origins combined with allow_credentials=True is very
# broad; tighten the origin list if this server is ever exposed beyond localhost.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
188
+
189
+ # ---------------------------------------------------------------------------
190
+ # State & Directory Setup
191
+ # ---------------------------------------------------------------------------
192
+
193
# Storage locations. Everything lives under ~/.crewlyze by default; the data
# and outputs roots can be overridden through environment variables.
USER_HOME = Path.home() / ".crewlyze"
DATA_DIR = Path(os.environ.get("CREWLYZE_DATA_DIR", str(USER_HOME / "data")))
SESSIONS_DIR = DATA_DIR / "sessions"
OUTPUTS_DIR = Path(os.environ.get("CREWLYZE_OUTPUTS_DIR", str(USER_HOME / "outputs")))

# Create the directory tree at import time so later code can assume it exists.
for path in (DATA_DIR, SESSIONS_DIR, OUTPUTS_DIR):
    path.mkdir(parents=True, exist_ok=True)
200
+
201
def is_safe_id(id_str: str) -> bool:
    """Return True when the ID is non-empty and made only of ``[A-Za-z0-9_-]``.

    Used to keep IDs from carrying path-traversal characters. The whole
    string must match: ``re.fullmatch`` is used because ``$`` in a
    ``re.match`` pattern also matches just before a trailing newline, which
    would let ``"abc\\n"`` through.
    """
    if not id_str:
        return False
    return re.fullmatch(r"[a-zA-Z0-9_-]+", id_str) is not None
206
+
207
def is_safe_filename(filename: str) -> bool:
    """Return True when the filename is free of traversal and unsafe characters.

    Rejects empty names, any name containing ``..``, ``/``, a backslash or a
    NUL byte, and any name with characters outside the allow-list below
    (letters, digits, space, and a set of punctuation that can appear in
    generated chart names). The whole string must match: ``re.fullmatch`` is
    used because ``$`` in a ``re.match`` pattern would also accept a name
    ending in a newline.
    """
    if not filename:
        return False
    if ".." in filename or "/" in filename or "\\" in filename:
        return False
    if "\0" in filename:
        return False
    return re.fullmatch(r"[a-zA-Z0-9_\-. ()[\]$,%&+@=;\'~#]+", filename) is not None
217
+
218
def validate_project_id(project_id: str) -> str:
    """Return ``project_id`` unchanged when it passes ``is_safe_id``.

    Raises:
        HTTPException: 400 when the ID is rejected.
    """
    if is_safe_id(project_id):
        return project_id
    raise HTTPException(status_code=400, detail="Invalid project ID.")
223
+
224
def get_safe_session_dir(project_id: str) -> Path:
    """Resolve the session directory for ``project_id`` inside ``SESSIONS_DIR``.

    The ID is validated first; the resolved path must then still sit under
    the resolved sessions root. The directory is not created here.

    Raises:
        HTTPException: 400 for an invalid ID or a path that escapes the root.
    """
    safe_id = validate_project_id(project_id)
    sessions_root = SESSIONS_DIR.resolve()
    candidate = (sessions_root / safe_id).resolve()
    try:
        candidate.relative_to(sessions_root)
    except ValueError:
        raise HTTPException(status_code=400, detail="Path traversal detected.")
    else:
        return candidate
233
+
234
def get_safe_output_dir(project_id: str) -> Path:
    """Resolve the output directory for ``project_id`` inside ``OUTPUTS_DIR``.

    The ID is validated first; the resolved path must then still sit under
    the resolved outputs root. The directory is not created here.

    Raises:
        HTTPException: 400 for an invalid ID or a path that escapes the root.
    """
    safe_id = validate_project_id(project_id)
    outputs_root = OUTPUTS_DIR.resolve()
    candidate = (outputs_root / safe_id).resolve()
    try:
        candidate.relative_to(outputs_root)
    except ValueError:
        raise HTTPException(status_code=400, detail="Path traversal detected.")
    else:
        return candidate
243
+
244
# Serialises reads and writes of every session's metadata.json.
_metadata_lock = threading.Lock()


def save_project_metadata(project_id: str, meta: dict):
    """Write ``meta`` to ``metadata.json`` in the project's session directory.

    The session directory is created when missing. The file is written as
    indented JSON while holding ``_metadata_lock``.
    """
    target_dir = get_safe_session_dir(project_id)
    if not target_dir.exists():
        target_dir.mkdir(parents=True, exist_ok=True)

    destination = target_dir / "metadata.json"
    with _metadata_lock, open(destination, "w", encoding="utf-8") as handle:
        json.dump(meta, handle, indent=2)
254
+
255
def get_project_metadata(project_id: str) -> dict:
    """Load a project's metadata, rebuilding defaults and refreshing the thumbnail.

    Returns an empty dict when the session directory does not exist. When
    ``metadata.json`` is missing, unreadable, or does not hold a non-empty
    JSON object, default metadata is derived from the files in the session
    directory. The ``thumbnail`` field is then pointed at the most recently
    modified PNG in the project's output directory (``None`` when there is
    none); if that changes the stored value, the metadata is saved back.

    Raises:
        HTTPException: 400 when ``project_id`` is invalid.
    """
    session_dir = get_safe_session_dir(project_id)
    if not session_dir.exists():
        return {}

    metadata_path = session_dir / "metadata.json"

    meta = {}
    with _metadata_lock:
        if metadata_path.exists():
            try:
                with open(metadata_path, "r", encoding="utf-8") as f:
                    loaded = json.load(f)
            except (OSError, ValueError):
                # Unreadable or malformed file: fall through to the defaults.
                loaded = None
            # Valid JSON that is not an object (list, string, number...) would
            # break the .get() calls below, so treat it as corrupt as well.
            if isinstance(loaded, dict):
                meta = loaded

    # Default metadata if not present or corrupt (compatibility path).
    if not meta:
        upload_path = session_dir / "original_upload.csv"
        filename = "dataset.csv"
        size = upload_path.stat().st_size if upload_path.exists() else 0

        finished = (session_dir / "results.json").exists() or (session_dir / "done.txt").exists()

        meta = {
            "id": project_id,
            "name": f"Project {project_id}",
            "filename": filename,
            "report_title": f"{filename.rsplit('.', 1)[0].replace('_', ' ').title()} Executive Analysis",
            "size": size,
            "created_at": session_dir.stat().st_ctime * 1000,
            "status": "completed" if finished else "idle",
            "thumbnail": None,
        }

    # Point the thumbnail at the newest generated PNG, if any.
    output_dir = get_safe_output_dir(project_id)
    target_thumb = None
    if output_dir.is_dir():
        newest_png = max(
            output_dir.glob("*.png"),
            key=lambda p: p.stat().st_mtime,
            default=None,
        )
        if newest_png is not None:
            import urllib.parse
            target_thumb = f"/api/charts/{project_id}/{urllib.parse.quote(newest_png.name)}"

    if meta.get("thumbnail") != target_thumb:
        meta["thumbnail"] = target_thumb
        save_project_metadata(project_id, meta)

    return meta
318
+
319
+
320
def parse_bool(value: Optional[str]) -> bool:
    """Interpret a form/query value as a boolean.

    Falsy input is False. Otherwise the value is stringified, trimmed and
    lower-cased, and is False only for ``false``, ``0``, ``off``, ``no`` or
    the empty string.
    """
    if not value:
        return False
    normalized = str(value).strip().lower()
    return normalized not in {"false", "0", "off", "no", ""}
322
+
323
+
324
def optimize_goal_grammar(goal: str, provider: str, model: str, api_key: str, env_key_name: str) -> str:
    """Ask the runtime-configured LLM to polish the wording of a project goal.

    Returns an empty string for a blank goal. On any failure (import error,
    LLM error, ...) the problem is printed and the trimmed original goal is
    returned instead.
    """
    trimmed_goal = goal.strip()
    if not trimmed_goal:
        return ""

    try:
        from config.llm_config import apply_runtime_llm_settings, get_llm_params
        from crewai import LLM

        apply_runtime_llm_settings(provider, model, api_key or "", env_key_name)
        editor_llm = LLM(**get_llm_params())

        instructions = (
            "You are a professional editor. Improve the grammar, phrasing, and professional tone "
            "of the following data analysis goal. Keep it concise (1-2 sentences). "
            "Return ONLY the corrected goal text, without any introductory text, quotes, or metadata.\n\n"
            f"Goal: {trimmed_goal}"
        )
        reply = editor_llm.call([{"role": "user", "content": instructions}])
        if not isinstance(reply, str):
            reply = str(reply)
        # Drop surrounding whitespace and any quotes wrapped around the answer.
        return reply.strip().strip('"').strip("'")
    except Exception as e:
        print(f"Grammar optimization failed: {e}")
        return trimmed_goal
348
+
349
+
350
+ # ---------------------------------------------------------------------------
351
+ # Background Task Pipeline
352
+ # ---------------------------------------------------------------------------
353
+
354
# Upper bound on analyses running at once, the current count, and its lock.
MAX_CONCURRENT_ANALYSES = 2
active_analyses = 0
active_analyses_lock = threading.Lock()


def _resolve_task_options(selected_tasks, deep_analysis):
    """Choose the task list and deep flag: explicit arguments first, env vars as fallback."""
    tasks = [t.strip() for t in (selected_tasks or []) if t and t.strip()]
    if not tasks:
        env_tasks = os.getenv("SELECTED_TASKS", "")
        tasks = [t.strip() for t in env_tasks.split(",") if t.strip()]
    env_deep = os.getenv("DEEP_ANALYSIS", "false").lower() in {"true", "1", "yes", "on"}
    return (tasks or None), (bool(deep_analysis) or env_deep)


def _update_project_status(session_id: str, status: str, thumbnail: Optional[str] = None) -> None:
    """Best-effort status write; a metadata problem must never abort the pipeline."""
    try:
        meta = get_project_metadata(session_id)
        meta["status"] = status
        if thumbnail:
            meta["thumbnail"] = thumbnail
        save_project_metadata(session_id, meta)
    except Exception:
        pass


def _run_crew_pipeline(
    session_id: str,
    csv_path: str,
    provider: str,
    model: str,
    api_key: str,
    env_key_name: str,
    cooldown: int,
    selected_tasks: list[str],
    deep_analysis: bool,
    report_title: str,
):
    """Validate inputs, run the crew with stdout redirected to the session log, persist results."""
    if not is_safe_id(session_id):
        raise ValueError("Invalid session ID.")
    session_dir = (SESSIONS_DIR / session_id).resolve()
    resolved_csv = Path(csv_path).resolve()
    try:
        resolved_csv.relative_to(session_dir)
    except ValueError:
        raise ValueError("Path traversal detected in CSV path.")

    # 1. Publish the per-run settings through the context variables.
    from config.context import (
        current_session_id,
        current_session_csv,
        current_session_output_dir,
        current_llm_provider,
        current_llm_model,
        current_llm_api_key,
        current_llm_env_key_name,
        current_cooldown,
        current_deep_analysis,
    )
    current_session_id.set(session_id)
    current_session_csv.set(str(resolved_csv))
    current_session_output_dir.set(str((OUTPUTS_DIR / session_id).resolve()))
    current_llm_provider.set(provider)
    current_llm_model.set(model)
    current_llm_api_key.set(api_key or "")
    current_llm_env_key_name.set(env_key_name or "")
    current_cooldown.set(cooldown)
    current_deep_analysis.set(deep_analysis)

    # Save or update the report title and the optimized goal in project metadata.
    try:
        meta = get_project_metadata(session_id)
        if report_title:
            meta["report_title"] = report_title.strip()

        user_goal = meta.get("goal", "")
        if user_goal.strip():
            print("Optimizing goal grammar...")
            opt_goal = optimize_goal_grammar(user_goal, provider, model, api_key, env_key_name)
            meta["optimized_goal"] = opt_goal
            print(f"Optimized goal: {opt_goal}")
        else:
            meta["optimized_goal"] = ""

        save_project_metadata(session_id, meta)
    except Exception as e:
        print(f"Error handling metadata goal/title: {e}")

    log_path = session_dir / "stdout.log"
    done_path = session_dir / "done.txt"
    results_path = session_dir / "results.json"

    # Clean up state left by a previous run.
    done_path.unlink(missing_ok=True)
    results_path.unlink(missing_ok=True)

    _update_project_status(session_id, "running")

    # 2. Redirect stdout into the session log and kick off the crew.
    # NOTE(review): contextlib.redirect_stdout swaps sys.stdout for the whole
    # process, so two analyses running at once may write into each other's
    # log - confirm whether that matters with MAX_CONCURRENT_ANALYSES > 1.
    import contextlib
    with open(log_path, "w", encoding="utf-8", errors="replace") as log_file:
        with contextlib.redirect_stdout(log_file):
            try:
                print("Initializing multi-agent workflows...")
                _load_crew()

                # Honour the arguments sent with the request; the environment
                # variables only apply when the arguments are empty/false.
                tasks, deep_flag = _resolve_task_options(selected_tasks, deep_analysis)

                result = _run_crew(
                    csv_path,
                    session_id=session_id,
                    selected_tasks=tasks,
                    deep_analysis=deep_flag,
                )

                # Re-map Plotly figures into JSON-serializable dictionaries;
                # figures that cannot be converted are skipped.
                plotly_serializable = []
                for chart in result.get("plotly_charts", []):
                    try:
                        plotly_serializable.append({
                            "title": chart["title"],
                            "fig_json": json.loads(chart["fig"].to_json()),
                        })
                    except Exception:
                        pass

                # Gather static PNG charts.
                png_charts_list = [f.name for f in Path(result["output_dir"]).glob("*.png")]

                frame = result["dataframe"]
                serializable_result = {
                    "cleaning_steps": result["cleaning_steps"],
                    "relations": result["relations"],
                    "insights": result["insights"],
                    "code": result.get("code", ""),
                    "output_dir": result["output_dir"],
                    "plotly_charts": plotly_serializable,
                    "png_charts": png_charts_list,
                    "rows_count": int(frame.shape[0]),
                    "cols_count": int(frame.shape[1]),
                    "numeric_count": int(len(frame.select_dtypes(include=["number"]).columns)),
                    "cat_count": int(len(frame.select_dtypes(include=["object"]).columns)),
                }

                # Cache the first 100 rows as a JSON data preview; infinities
                # and NaN become empty strings so the payload is valid JSON.
                serializable_result["preview"] = (
                    frame.head(100)
                    .replace([float('inf'), float('-inf')], float('nan'))
                    .fillna("")
                    .to_dict(orient="records")
                )

                with open(results_path, "w", encoding="utf-8") as f:
                    json.dump(serializable_result, f, indent=2)

                print("\nAnalysis complete! Ready to render dashboard.")

                thumbnail = None
                if png_charts_list:
                    import urllib.parse
                    thumbnail = f"/api/charts/{session_id}/{urllib.parse.quote(png_charts_list[0])}"
                _update_project_status(session_id, "completed", thumbnail)

            except Exception as e:
                import traceback
                print(f"\nPipeline failed: {e}", file=sys.stderr)
                traceback.print_exc(file=log_file)

                with open(results_path, "w", encoding="utf-8") as f:
                    json.dump({"error": str(e)}, f, indent=2)

                _update_project_status(session_id, "failed")
            finally:
                # Sentinel that tells the SSE log stream to stop.
                with open(done_path, "w") as f:
                    f.write("done")


def run_crew_in_background(
    session_id: str,
    csv_path: str,
    provider: str,
    model: str,
    api_key: str,
    env_key_name: str,
    cooldown: int,
    selected_tasks: list[str],
    deep_analysis: bool,
    report_title: str,
):
    """
    Orchestrates the CrewAI pipeline as a background task, writing all
    stdout progress to a tail-able stdout.log file and serializing results
    to results.json (or ``{"error": ...}`` on failure).

    ``selected_tasks`` and ``deep_analysis`` are forwarded to the crew; the
    ``SELECTED_TASKS`` / ``DEEP_ANALYSIS`` environment variables are only
    consulted when those arguments are empty/false.

    The ``active_analyses`` slot is released on every exit path, including
    failures that happen before the log file is opened.

    Raises:
        ValueError: when ``session_id`` is unsafe or ``csv_path`` lies
            outside the session directory.
    """
    global active_analyses
    try:
        _run_crew_pipeline(
            session_id,
            csv_path,
            provider,
            model,
            api_key,
            env_key_name,
            cooldown,
            selected_tasks,
            deep_analysis,
            report_title,
        )
    finally:
        with active_analyses_lock:
            active_analyses = max(0, active_analyses - 1)
534
+
535
+
536
+ # ---------------------------------------------------------------------------
537
+ # API Endpoints
538
+ # ---------------------------------------------------------------------------
539
+
540
@app.post("/api/upload")
async def upload_file(file: UploadFile = File(...)):
    """Uploads the dataset and registers a unique user session ID.

    The upload is stored as ``original_upload.csv`` in a fresh session
    directory, a new ``stdout.log`` is started, and default project metadata
    is saved. An upload without a filename is recorded as ``dataset.csv``
    so the metadata step does not fail on a missing name.

    Returns:
        A dict with ``session_id``, ``filename`` and ``size`` (bytes).
    """
    session_id = uuid.uuid4().hex[:12]
    session_dir = get_safe_session_dir(session_id)
    session_dir.mkdir(parents=True, exist_ok=True)

    file_path = session_dir / "original_upload.csv"
    with open(file_path, "wb") as buffer:
        shutil.copyfileobj(file.file, buffer)

    # Start a fresh log for this session.
    log_path = session_dir / "stdout.log"
    with open(log_path, "w", encoding="utf-8") as f:
        f.write("Dataset uploaded successfully.\n")

    # The client may omit the filename; fall back to a generic one.
    original_name = file.filename or "dataset.csv"
    size = file_path.stat().st_size

    # Metadata is best-effort: the upload itself has already succeeded.
    try:
        proj_name = original_name.rsplit('.', 1)[0].replace('_', ' ').replace('-', ' ').title()
        save_project_metadata(session_id, {
            "id": session_id,
            "name": proj_name,
            "filename": original_name,
            "size": size,
            "created_at": time.time() * 1000,
            "status": "idle",
        })
    except Exception as exc:
        print(f"Failed to save metadata for upload {session_id}: {exc}", file=sys.stderr)

    return {
        "session_id": session_id,
        "filename": original_name,
        "size": size,
    }
576
+
577
+
578
@app.post("/api/validate-key")
async def validate_api_key(
    provider: str = Form(...),
    model: str = Form(...),
    api_key: Optional[str] = Form(""),
):
    """Validate LLM provider credentials before starting analysis.

    Raises:
        HTTPException: 503 when the analysis modules cannot be imported,
            400 when the validation result is not marked valid.
    """
    try:
        _load_crew()
    except ImportError as exc:
        raise HTTPException(status_code=503, detail=f"CrewAI not available: {exc}")

    outcome = _validate_llm_connection(provider, model, api_key or "")
    if outcome.get("valid"):
        return outcome
    raise HTTPException(status_code=400, detail=outcome.get("message", "Validation failed."))
593
+
594
+
595
@app.post("/api/analyze")
async def trigger_analysis(
    background_tasks: BackgroundTasks,
    session_id: str = Form(...),
    provider: str = Form(...),
    model: str = Form(...),
    api_key: Optional[str] = Form(""),
    cooldown: int = Form(5),
    selected_tasks: str = Form(""),
    deep_analysis: str = Form("false"),
    report_title: str = Form("")
):
    """Launches the CrewAI analysis process in the background.

    Raises:
        HTTPException: 400 when the session has no uploaded dataset, 429 when
            ``MAX_CONCURRENT_ANALYSES`` analyses are already active.
    """
    global active_analyses

    upload_path = get_safe_session_dir(session_id) / "original_upload.csv"
    if not upload_path.exists():
        raise HTTPException(status_code=400, detail="Session upload not found.")

    # Name of the environment variable that carries this provider's credential.
    special_env_keys = {
        "ollama": "OLLAMA_BASE_URL",
        "nvidia": "NVIDIA_API_KEY",
        "minimax": "NVIDIA_API_KEY",
    }
    env_key_name = special_env_keys.get(provider, f"{provider.upper()}_API_KEY")

    # Comma-separated task names; an empty selection means "run everything".
    task_names = [piece.strip() for piece in selected_tasks.split(",") if piece.strip()]
    if not task_names:
        task_names = ["cleaning", "relations", "insights", "visualization"]

    deep = deep_analysis.strip().lower() in {"true", "1", "yes", "on"}
    clean_title = report_title.strip()

    # Persist the report title if one was provided (best-effort).
    try:
        meta = get_project_metadata(session_id)
        if clean_title:
            meta["report_title"] = clean_title
        save_project_metadata(session_id, meta)
    except Exception:
        pass

    # Reserve a concurrency slot, or refuse when all of them are taken.
    with active_analyses_lock:
        if active_analyses >= MAX_CONCURRENT_ANALYSES:
            raise HTTPException(
                status_code=429,
                detail="Server is busy. Maximum concurrent analyses limit reached. Please try again later."
            )
        active_analyses += 1

    try:
        background_tasks.add_task(
            run_crew_in_background,
            session_id=session_id,
            csv_path=str(upload_path),
            provider=provider,
            model=model,
            api_key=api_key,
            env_key_name=env_key_name,
            cooldown=cooldown,
            selected_tasks=task_names,
            deep_analysis=deep,
            report_title=clean_title,
        )
    except Exception:
        # Scheduling failed: hand the reserved slot back before re-raising.
        with active_analyses_lock:
            active_analyses = max(0, active_analyses - 1)
        raise

    return {"status": "started", "session_id": session_id}
672
+
673
+
674
@app.get("/api/analyze/stream")
async def stream_analysis_logs(session_id: str):
    """Streams running stdout log lines using Server-Sent Events (SSE).

    Lines from the session's ``stdout.log`` are passed through
    ``clean_log_message`` and emitted as ``data:`` events. The stream ends
    with ``data: [EOF]`` once ``done.txt`` exists and the remaining lines
    have been sent. If the log file has not appeared after the initial wait,
    the stream keeps polling for it instead of failing on ``open()``.
    """
    session_dir = get_safe_session_dir(session_id)
    log_path = session_dir / "stdout.log"
    done_path = session_dir / "done.txt"

    # Reset streaming state left over from an earlier stream of this session.
    if session_id in log_stream_states:
        log_stream_states[session_id] = {"in_prompt": False}

    async def log_generator():
        try:
            # Give the log a moment to appear (up to ~5 seconds).
            for _ in range(50):
                if log_path.exists():
                    break
                await asyncio.sleep(0.1)

            if not log_path.exists():
                yield "data: [Initializing pipeline...]\n\n"
                # Keep waiting rather than crashing on a missing file; stop
                # cleanly if the run finishes without ever producing a log.
                while not log_path.exists():
                    if done_path.exists():
                        yield "data: [EOF]\n\n"
                        return
                    await asyncio.sleep(0.1)

            with open(log_path, "r", encoding="utf-8", errors="replace") as f:
                while True:
                    line = f.readline()
                    if line:
                        cleaned = clean_log_message(line, session_id=session_id)
                        if cleaned is not None:
                            yield f"data: {cleaned}\n\n"
                        continue

                    # No new data: finish if the run is done, otherwise poll.
                    if done_path.exists():
                        for trail_line in f.readlines():
                            cleaned_trail = clean_log_message(trail_line, session_id=session_id)
                            if cleaned_trail is not None:
                                yield f"data: {cleaned_trail}\n\n"
                        yield "data: [EOF]\n\n"
                        break
                    await asyncio.sleep(0.1)
        finally:
            # Drop the per-session filter state so the dict does not grow
            # with every session that has ever been streamed.
            log_stream_states.pop(session_id, None)

    return StreamingResponse(log_generator(), media_type="text/event-stream")
715
+
716
+
717
@app.get("/api/results")
async def get_results(session_id: str):
    """Retrieves cached JSON results containing stats, insights, and charts.

    Returns a pending marker while ``results.json`` does not exist. An error
    payload is returned as stored; a successful payload gets ``ready: True``.
    """
    results_path = get_safe_session_dir(session_id) / "results.json"
    if not results_path.exists():
        return {"ready": False, "status": "pending"}

    with open(results_path, "r", encoding="utf-8") as handle:
        payload = json.load(handle)

    if "error" not in payload:
        payload["ready"] = True
    return payload
731
+
732
+
733
@app.post("/api/copilot")
async def ask_copilot(
    session_id: str = Form(...),
    query: str = Form(...),
    provider: str = Form(...),
    model: str = Form(...),
    api_key: Optional[str] = Form("")
):
    """Runs a natural language query against the dataset using the Copilot agent.

    Uses ``cleaned.csv`` when present and falls back to the original upload.

    Raises:
        HTTPException: 400 when the session has no dataset at all.
    """
    # Name of the environment variable that carries this provider's credential.
    special_env_keys = {
        "ollama": "OLLAMA_BASE_URL",
        "nvidia": "NVIDIA_API_KEY",
        "minimax": "NVIDIA_API_KEY",
    }
    env_key_name = special_env_keys.get(provider, f"{provider.upper()}_API_KEY")

    _load_crew()
    _apply_runtime_llm_settings(provider, model, api_key or "", env_key_name)

    session_dir = get_safe_session_dir(session_id)
    output_dir = get_safe_output_dir(session_id)

    # Prefer the cleaned dataset; use the raw upload if cleaning has not produced one.
    candidates = (session_dir / "cleaned.csv", session_dir / "original_upload.csv")
    csv_path = next((path for path in candidates if path.exists()), None)
    if csv_path is None:
        raise HTTPException(status_code=400, detail="Dataset not uploaded.")

    # Bind the context variables for the current request.
    from config.context import current_session_csv, current_session_output_dir
    current_session_csv.set(str(csv_path))
    current_session_output_dir.set(str(output_dir))

    answer = _run_copilot_query(query, str(csv_path), str(output_dir))

    # Translate the plot's file path into the chart-serving URL.
    plot_url = None
    if answer.get("plot_path"):
        import urllib.parse
        plot_name = Path(answer["plot_path"]).name
        plot_url = f"/api/charts/{session_id}/{urllib.parse.quote(plot_name)}"

    return {
        "success": answer["success"],
        "text": answer["text"],
        "plot_url": plot_url
    }
782
+
783
+
784
@app.get("/api/export-pdf")
async def get_pdf_report(session_id: str, report_title: Optional[str] = None):
    """Generates and streams back the executive PDF report.

    The title comes from the ``report_title`` query parameter when it is
    non-blank, otherwise from the project metadata (``report_title``, then
    ``name``), otherwise ``"Analysis Report"``.

    Raises:
        HTTPException: 400 when results or the cleaned dataset are missing,
            or when the stored results describe a failed run; 500 when
            loading the data or building the PDF fails.
    """
    session_dir = get_safe_session_dir(session_id)
    results_path = session_dir / "results.json"
    cleaned_csv = session_dir / "cleaned.csv"

    if not results_path.exists() or not cleaned_csv.exists():
        raise HTTPException(status_code=400, detail="Data analysis results not available.")

    with open(results_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # A failed run stores {"error": ...}; it has none of the report sections,
    # so answer with a clear 400 instead of failing on a missing key below.
    if "error" in data:
        raise HTTPException(status_code=400, detail="Data analysis results not available.")

    meta = get_project_metadata(session_id)
    title = (
        (report_title or "").strip()
        or meta.get("report_title")
        or meta.get("name")
        or "Analysis Report"
    )
    goal = meta.get("optimized_goal") or meta.get("goal") or ""

    try:
        _load_crew()
        # Assemble the structure consumed by the PDF exporter.
        report_dict = {
            "dataframe": read_csv_robust(cleaned_csv),
            "cleaning_steps": data["cleaning_steps"],
            "relations": data["relations"],
            "insights": data["insights"],
            "code": data.get("code", ""),
            "output_dir": str(get_safe_output_dir(session_id)),
            "report_title": title,
            "goal": goal,
        }
        pdf_bytes = _export_pdf(report_dict)
        # Reduce the title to a header-safe file name.
        filename = re.sub(r"[^a-zA-Z0-9_-]", "_", title.lower())[:60] or f"report_{session_id}"
        return StreamingResponse(
            BytesIO_iterator(pdf_bytes),
            media_type="application/pdf",
            headers={"Content-Disposition": f"attachment; filename={filename}.pdf"}
        )
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"PDF generation failed: {e}")
825
+
826
+
827
@app.get("/api/charts/{session_id}/{filename}")
async def serve_chart(session_id: str, filename: str):
    """Serves the generated PNG visual charts.

    Raises:
        HTTPException: 400 for an unsafe filename or a path that leaves the
            session's output directory, 404 when the file does not exist.
    """
    if not is_safe_filename(filename):
        raise HTTPException(status_code=400, detail="Invalid filename.")

    charts_root = get_safe_output_dir(session_id)
    requested = (charts_root / filename).resolve()

    # The resolved path must still be inside the output directory.
    try:
        requested.relative_to(charts_root)
    except ValueError:
        raise HTTPException(status_code=400, detail="Path traversal detected.")

    if requested.exists():
        return FileResponse(requested)
    raise HTTPException(status_code=404, detail="Chart not found.")
841
+
842
+
843
+ # ---------------------------------------------------------------------------
844
+ # Utility Streams
845
+ # ---------------------------------------------------------------------------
846
+
847
def BytesIO_iterator(data_bytes: bytes):
    """Generator that emits the whole payload as one single chunk."""
    yield from (data_bytes,)
850
+
851
+
852
+ # ---------------------------------------------------------------------------
853
+ # Ollama Models Fetch
854
+ # ---------------------------------------------------------------------------
855
+
856
@app.get("/api/ollama-models")
async def list_ollama_models(base_url: str = "http://localhost:11434"):
    """Return the model names reported by an Ollama ``/api/tags`` endpoint.

    Names are returned with an ``ollama/`` prefix (added when missing). If the
    service cannot be reached, answers with a non-200 status, returns an
    unexpected payload, or lists no models, a fixed default list is returned.

    Args:
        base_url: Base URL of the Ollama service; ``/api/tags`` is appended.

    Returns:
        ``{"models": [...]}``
    """
    import asyncio
    import requests

    # NOTE(review): base_url is caller-supplied, so this endpoint makes the
    # server issue a GET to an arbitrary host — confirm that is acceptable
    # for the deployment this runs in.
    try:
        url = base_url.rstrip("/") + "/api/tags"
        # requests is blocking; run it on the default executor so a slow or
        # unreachable host cannot stall the event loop for the full timeout.
        loop = asyncio.get_running_loop()
        response = await loop.run_in_executor(
            None, lambda: requests.get(url, timeout=2.0)
        )
        if response.status_code == 200:
            data = response.json()
            models = [m["name"] for m in data.get("models", [])]
            if models:
                prefixed = [f"ollama/{m}" if not m.startswith("ollama/") else m for m in models]
                return {"models": prefixed}
    except Exception:
        # Deliberate best-effort probe: any failure falls through to defaults.
        pass
    # Fallback defaults if Ollama service is unreachable or empty
    return {"models": ["ollama/llama3", "ollama/mistral", "ollama/gemma2"]}
873
+
874
+
875
+ # ---------------------------------------------------------------------------
876
+ # Metrics & Configurations APIs
877
+ # ---------------------------------------------------------------------------
878
+
879
def get_local_config_path() -> Path:
    """Return the location of ``config.json`` directly under ``USER_HOME``."""
    return USER_HOME.joinpath("config.json")
881
+
882
@app.get("/api/metrics")
async def get_performance_metrics():
    """Return whatever ``config.metrics_tracker.get_metrics()`` reports."""
    # Imported lazily, at request time, exactly as the endpoint always did.
    from config import metrics_tracker

    return metrics_tracker.get_metrics()
886
+
887
@app.get("/api/config")
async def get_local_config():
    """Return the saved local configuration with secret-looking values masked.

    A non-empty value whose key contains "key", "secret" or "token"
    (case-insensitive) is replaced by ``first4...last4`` when longer than
    eight characters, otherwise by ``********``. Other entries are returned
    unchanged.

    Returns:
        The masked mapping, or ``{}`` when the config file is missing,
        unreadable, or does not contain a JSON object.
    """
    cfg_path = get_local_config_path()
    if not cfg_path.exists():
        return {}
    try:
        with open(cfg_path, "r", encoding="utf-8") as f:
            cfg = json.load(f)
    except Exception:
        # Unreadable or corrupt file: report "no configuration".
        return {}
    if not isinstance(cfg, dict):
        return {}

    masked = {}
    for k, v in cfg.items():
        if v and any(keyword in k.lower() for keyword in ("key", "secret", "token")):
            # Coerce first: a numeric or otherwise non-string value (e.g. a
            # "MAX_TOKENS" setting) used to raise in len()/slicing and blank
            # out the entire response.
            secret = v if isinstance(v, str) else str(v)
            masked[k] = secret[:4] + "..." + secret[-4:] if len(secret) > 8 else "********"
        else:
            masked[k] = v
    return masked
904
+
905
@app.post("/api/config")
async def save_local_config(
    provider: str = Form(...),
    api_key: Optional[str] = Form(""),
    base_url: Optional[str] = Form("")
):
    """Persist provider settings to the local config file and to ``os.environ``.

    * ``ollama`` stores ``base_url`` under ``OLLAMA_BASE_URL``.
    * ``nvidia`` / ``minimax`` store the key under ``NVIDIA_API_KEY``.
    * Any other provider stores it under ``<PROVIDER>_API_KEY``.
    * An empty ``api_key`` removes the stored key for non-ollama providers.
    * ``custom`` additionally stores a non-empty ``base_url`` under
      ``CUSTOM_BASE_URL``.

    Args:
        provider: Provider identifier (letters, digits, ``_`` or ``-``).
        api_key: New key, empty to clear, or the masked placeholder echoed
            back by the client to leave the stored key untouched.
        base_url: Service base URL (used by ``ollama`` and ``custom``).

    Raises:
        HTTPException: 400 for a malformed provider name, 500 when the config
            cannot be written.
    """
    # The provider becomes part of a config key / environment variable name,
    # so reject anything that is not a plain identifier-like token.
    if not re.fullmatch(r"[A-Za-z0-9_-]+", provider):
        raise HTTPException(status_code=400, detail="Invalid provider.")
    # Both fields are declared Optional; normalise None to "" before use.
    api_key = (api_key or "").strip()
    base_url = (base_url or "").strip()

    cfg_path = get_local_config_path()
    cfg_path.parent.mkdir(parents=True, exist_ok=True)
    cfg = {}
    if cfg_path.exists():
        try:
            with open(cfg_path, "r", encoding="utf-8") as f:
                loaded = json.load(f)
            if isinstance(loaded, dict):
                cfg = loaded
        except Exception:
            # Corrupt or unreadable config: start again from an empty one.
            pass

    if provider == "ollama":
        key_name = "OLLAMA_BASE_URL"
        cfg[key_name] = base_url
    elif provider in ("nvidia", "minimax"):
        key_name = "NVIDIA_API_KEY"
    else:
        key_name = f"{provider.upper()}_API_KEY"

    if provider != "ollama":
        if api_key:
            # GET /api/config masks secrets as "abcd...wxyz" or "********".
            # The previous guard only looked for a trailing "...", so a mask
            # sent back by the client replaced the real key. Compare against
            # the mask of the stored value instead (the old check is kept for
            # backward compatibility).
            stored = str(cfg.get(key_name) or "")
            stored_mask = stored[:4] + "..." + stored[-4:] if len(stored) > 8 else "********"
            is_placeholder = (
                api_key.endswith("...")
                or api_key == "********"
                or api_key == stored_mask
            )
            if not is_placeholder:
                cfg[key_name] = api_key
        else:
            cfg.pop(key_name, None)

    if base_url and provider == "custom":
        cfg["CUSTOM_BASE_URL"] = base_url

    try:
        with open(cfg_path, "w", encoding="utf-8") as f:
            json.dump(cfg, f, indent=2)
        # Mirror every stored setting into the process environment.
        for k, v in cfg.items():
            os.environ[k] = str(v)
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Failed to write config: {e}") from e
    return {"status": "success"}
947
+
948
+
949
+ # ---------------------------------------------------------------------------
950
+ # Project Management APIs
951
+ # ---------------------------------------------------------------------------
952
+
953
@app.get("/api/projects")
async def list_projects():
    """List the metadata of every project under ``SESSIONS_DIR``, newest first.

    Directories whose metadata cannot be loaded, is empty, or is not a JSON
    object are skipped.

    Returns:
        A list of metadata dicts sorted by ``created_at`` descending.
    """
    def _created_at(meta):
        # Imported metadata is stored without validation, so created_at may
        # be missing or non-numeric; treat such entries as oldest instead of
        # letting a mixed-type comparison break the whole listing.
        value = meta.get("created_at", 0)
        if isinstance(value, bool) or not isinstance(value, (int, float)):
            return 0
        return value

    projects = []
    if SESSIONS_DIR.exists():
        for p in SESSIONS_DIR.iterdir():
            if not p.is_dir():
                continue
            try:
                meta = get_project_metadata(p.name)
            except Exception:
                # Best effort: one unreadable project must not hide the rest.
                continue
            if meta and isinstance(meta, dict):
                projects.append(meta)
    # Sort projects: newest first
    projects.sort(key=_created_at, reverse=True)
    return projects
969
+
970
@app.post("/api/projects")
async def create_project(
    name: str = Form(...),
    report_title: str = Form(""),
    goal: str = Form(""),
    file: UploadFile = File(...)
):
    """Create a project directory, store the uploaded CSV and save its metadata.

    The upload is written to ``original_upload.csv`` inside a new session
    directory named after a fresh 12-character hex id, a ``stdout.log`` is
    seeded with a creation message, and the metadata dict is saved and
    returned.
    """
    project_id = uuid.uuid4().hex[:12]
    session_dir = get_safe_session_dir(project_id)
    session_dir.mkdir(parents=True, exist_ok=True)

    # Persist the raw upload.
    upload_target = session_dir / "original_upload.csv"
    with open(upload_target, "wb") as sink:
        shutil.copyfileobj(file.file, sink)

    # Start the project with a fresh log file.
    with open(session_dir / "stdout.log", "w") as log_file:
        log_file.write("Project created. Dataset uploaded successfully.\n")

    clean_name = name.strip()
    clean_title = report_title.strip()
    if not clean_title:
        # No explicit title given: derive one from the project name.
        clean_title = f"{clean_name} Executive Analysis"

    meta = {
        "id": project_id,
        "name": clean_name,
        "report_title": clean_title,
        "goal": goal.strip(),
        "optimized_goal": "",
        "filename": file.filename,
        "size": upload_target.stat().st_size,
        "created_at": time.time() * 1000,
        "status": "idle",
    }
    save_project_metadata(project_id, meta)
    return meta
1005
+
1006
@app.post("/api/projects/{project_id}/rename")
async def rename_project(project_id: str, name: str = Form(...)):
    """Store a new (whitespace-trimmed) name in the project's metadata.

    Responds with 404 when the project's session directory does not exist;
    otherwise returns the updated metadata.
    """
    if get_safe_session_dir(project_id).exists():
        meta = get_project_metadata(project_id)
        meta["name"] = name.strip()
        save_project_metadata(project_id, meta)
        return meta
    raise HTTPException(status_code=404, detail="Project not found")
1018
+
1019
+
1020
@app.post("/api/projects/{project_id}/tweak-relations")
async def tweak_relations(project_id: str, relations_text: str = Form(...)):
    """Write user-edited relations text into the project's ``results.json``.

    Responds with 404 when the session directory does not exist. An existing
    ``results.json`` is loaded and updated; if it is missing or cannot be
    parsed, a new document holding only the relations is written.
    """
    session_dir = get_safe_session_dir(project_id)
    if not session_dir.exists():
        raise HTTPException(status_code=404, detail="Project not found")

    results_path = session_dir / "results.json"

    # Start from the cached results when they can be read, else from scratch.
    cached = {}
    if results_path.exists():
        try:
            cached = json.loads(results_path.read_text(encoding="utf-8"))
        except Exception:
            pass

    new_relations = relations_text.strip()
    cached["relations"] = new_relations

    with open(results_path, "w", encoding="utf-8") as out:
        json.dump(cached, out, indent=2)

    return {"status": "success", "relations": new_relations}
1044
+
1045
@app.delete("/api/projects/{project_id}")
async def delete_project(project_id: str):
    """Remove a project's session directory and, if present, its output directory.

    Responds with 404 when the session directory does not exist. Removal
    errors are ignored.
    """
    session_dir = get_safe_session_dir(project_id)
    output_dir = get_safe_output_dir(project_id)

    if not session_dir.exists():
        raise HTTPException(status_code=404, detail="Project not found")

    for target in (session_dir, output_dir):
        if target.exists():
            shutil.rmtree(target, ignore_errors=True)

    return {"status": "deleted", "id": project_id}
1059
+
1060
+
1061
@app.get("/api/projects/{project_id}/export-zip")
async def export_project_zip(project_id: str):
    """Bundle a project's session and output files into a downloadable ZIP.

    Session files are stored under ``session/`` and output files (when the
    output directory exists) under ``outputs/``. Responds with 404 when the
    session directory does not exist.
    """
    session_dir = get_safe_session_dir(project_id)
    output_dir = get_safe_output_dir(project_id)
    if not session_dir.exists():
        raise HTTPException(status_code=404, detail="Project not found")

    def _add_tree(archive, base_dir, prefix):
        # Add every file below base_dir, keeping its relative path under prefix.
        for root, _subdirs, names in os.walk(base_dir):
            for entry in names:
                source = Path(root) / entry
                archive.write(
                    source,
                    arcname=Path(prefix) / source.relative_to(base_dir),
                )

    zip_buffer = BytesIO()
    with zipfile.ZipFile(zip_buffer, "w", zipfile.ZIP_DEFLATED) as archive:
        _add_tree(archive, session_dir, "session")
        if output_dir.exists():
            _add_tree(archive, output_dir, "outputs")

    zip_buffer.seek(0)
    meta = get_project_metadata(project_id)
    # Reduce the project name to characters that are safe in a filename.
    safe_name = re.sub(r"[^a-zA-Z0-9_-]", "_", meta.get("name", "project").lower())
    filename = f"{safe_name}_{project_id}.zip"
    payload = zip_buffer.getvalue()
    return StreamingResponse(
        BytesIO_iterator(payload),
        media_type="application/x-zip-compressed",
        headers={"Content-Disposition": f"attachment; filename={filename}"},
    )
1094
+
1095
+
1096
@app.post("/api/projects/import-zip")
async def import_project_zip(file: UploadFile = File(...)):
    """Import a project from an exported ZIP and register it.

    The archive is extracted into a temporary directory, validated
    (``session/metadata.json`` must exist and hold a JSON object), and its
    ``session/`` and ``outputs/`` files are copied into the project's
    directories. The original project id is reused when it is valid and not
    already taken; otherwise a new id is generated and " (Copy)" is appended
    to the name.

    Returns:
        The stored metadata of the imported project.

    Raises:
        HTTPException: on any failure, with the detail prefixed
            "Import failed: ". Unexpected errors are reported with status 400.
    """
    # NOTE(review): the upload is read fully into memory and extracted
    # without any size limit — consider bounding both for untrusted archives.
    zip_contents = await file.read()
    zip_buffer = BytesIO(zip_contents)

    project_id = uuid.uuid4().hex[:12]
    temp_dir = DATA_DIR / "temp_import" / project_id
    temp_dir.mkdir(parents=True, exist_ok=True)

    target_project_id = project_id
    # Directories this call creates. Only these are removed on failure, so an
    # error can never delete a directory that existed before the import.
    created_dirs = []

    def _rollback():
        for path in created_dirs:
            shutil.rmtree(path, ignore_errors=True)

    try:
        with zipfile.ZipFile(zip_buffer, "r") as zip_file:
            temp_root = temp_dir.resolve()
            # Zip Slip check: every entry must stay inside temp_dir.
            for member in zip_file.infolist():
                if ".." in member.filename or member.filename.startswith(("/", "\\")):
                    raise HTTPException(status_code=400, detail=f"Invalid zip entry: {member.filename}")
                target_path = (temp_dir / member.filename).resolve()
                try:
                    target_path.relative_to(temp_root)
                except ValueError:
                    raise HTTPException(status_code=400, detail=f"Zip Slip detected: {member.filename}") from None
            zip_file.extractall(temp_dir)

        # Verify metadata.json exists
        meta_file = temp_dir / "session" / "metadata.json"
        if not meta_file.exists():
            raise HTTPException(status_code=400, detail="Invalid zip format: missing metadata.json")

        with open(meta_file, "r", encoding="utf-8") as f:
            meta = json.load(f)
        if not isinstance(meta, dict):
            raise HTTPException(status_code=400, detail="Invalid zip format: metadata.json must contain a JSON object")

        orig_project_id = meta.get("id")
        if orig_project_id:
            if not is_safe_id(orig_project_id):
                raise HTTPException(status_code=400, detail="Invalid project ID in metadata.")
            target_project_id = orig_project_id

        # Check if project conflicts. If so, generate new ID
        session_dir = get_safe_session_dir(target_project_id)
        if session_dir.exists():
            target_project_id = uuid.uuid4().hex[:12]
            session_dir = get_safe_session_dir(target_project_id)
            meta["id"] = target_project_id
            meta["name"] = f"{meta.get('name', 'Imported')} (Copy)"

        output_dir = get_safe_output_dir(target_project_id)
        if not session_dir.exists():
            created_dirs.append(session_dir)
        session_dir.mkdir(parents=True, exist_ok=True)

        # Copy session files (top-level regular files with safe names only)
        for item in (temp_dir / "session").iterdir():
            if item.is_file() and is_safe_filename(item.name):
                shutil.copy2(item, session_dir / item.name)

        # Copy outputs
        if (temp_dir / "outputs").exists():
            if not output_dir.exists():
                created_dirs.append(output_dir)
            output_dir.mkdir(parents=True, exist_ok=True)
            for item in (temp_dir / "outputs").iterdir():
                if item.is_file() and is_safe_filename(item.name):
                    shutil.copy2(item, output_dir / item.name)

        # Update metadata.json
        meta["id"] = target_project_id
        if meta.get("thumbnail"):
            # Update thumbnail link with new project ID
            thumb_parts = meta["thumbnail"].split("/")
            if len(thumb_parts) >= 5:
                thumb_parts[3] = target_project_id
                meta["thumbnail"] = "/".join(thumb_parts)

        with open(session_dir / "metadata.json", "w", encoding="utf-8") as f:
            json.dump(meta, f, indent=2)

        return meta
    except HTTPException as exc:
        # Validation errors raised above: keep their status and message
        # instead of stringifying the exception object into the detail.
        _rollback()
        raise HTTPException(status_code=exc.status_code, detail=f"Import failed: {exc.detail}") from exc
    except Exception as e:
        _rollback()
        raise HTTPException(status_code=400, detail=f"Import failed: {str(e)}") from e
    finally:
        shutil.rmtree(temp_dir, ignore_errors=True)
1184
+
1185
+
1186
@app.get("/api/projects/{project_id}/preview")
async def get_dynamic_preview(project_id: str):
    """Return a preview of the project's current CSV.

    ``cleaned.csv`` is used when present, otherwise ``original_upload.csv``.
    The response holds the column names, their dtypes as strings, the row and
    column counts, and the first 100 rows with missing values replaced by
    empty strings. When ``results.json`` exists, its ``preview``,
    ``rows_count`` and ``cols_count`` entries are refreshed on a best-effort
    basis.

    Raises:
        HTTPException: 404 when the project or its CSV is missing, 500 when
            the CSV cannot be loaded.
    """
    session_dir = get_safe_session_dir(project_id)
    if not session_dir.exists():
        raise HTTPException(status_code=404, detail="Project not found")

    cleaned_csv = session_dir / "cleaned.csv"
    original_csv = session_dir / "original_upload.csv"
    csv_path = cleaned_csv if cleaned_csv.exists() else original_csv

    if not csv_path.exists():
        raise HTTPException(status_code=404, detail="CSV not found.")

    try:
        df = read_csv_robust(str(csv_path))
        rows_count, cols_count = df.shape
        preview = df.head(100).fillna("").to_dict(orient="records")
        col_types = {col: str(dtype) for col, dtype in df.dtypes.items()}
        columns = list(df.columns)

        # Update cache in results.json if it exists
        results_path = session_dir / "results.json"
        if results_path.exists():
            try:
                with open(results_path, "r", encoding="utf-8") as f:
                    res_data = json.load(f)
                res_data["preview"] = preview
                res_data["rows_count"] = rows_count
                res_data["cols_count"] = cols_count
                # Serialise BEFORE opening for writing: mode "w" truncates the
                # file, so a value json cannot encode would otherwise leave a
                # half-written results.json behind.
                payload = json.dumps(res_data, indent=2)
                with open(results_path, "w", encoding="utf-8") as f:
                    f.write(payload)
            except Exception:
                # Cache refresh is best-effort; the preview is still returned.
                pass

        return {
            "columns": columns,
            "col_types": col_types,
            "rows_count": rows_count,
            "cols_count": cols_count,
            "preview": preview
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Failed to load preview: {str(e)}") from e
1230
+
1231
+
1232
@app.get("/api/projects/{project_id}/download-csv")
async def download_project_csv(project_id: str):
    """Download the project's dataset CSV.

    ``cleaned.csv`` is served when present, otherwise ``original_upload.csv``.
    The download is always named ``<original base name>_cleaned.csv``, even
    when the original upload is what gets served.

    Raises:
        HTTPException: 404 when the project or its CSV is missing.
    """
    session_dir = get_safe_session_dir(project_id)
    if not session_dir.exists():
        raise HTTPException(status_code=404, detail="Project not found")

    cleaned_csv = session_dir / "cleaned.csv"
    original_csv = session_dir / "original_upload.csv"
    csv_path = cleaned_csv if cleaned_csv.exists() else original_csv

    if not csv_path.exists():
        raise HTTPException(status_code=404, detail="CSV not found.")

    try:
        meta = get_project_metadata(project_id)
        orig_name = meta.get("filename", "dataset.csv")
    except Exception:
        orig_name = "dataset.csv"
    # The stored filename is whatever the upload reported, which may be None
    # (or, for imported metadata, not a string at all); fall back instead of
    # failing on the string operations below.
    if not isinstance(orig_name, str) or not orig_name:
        orig_name = "dataset.csv"

    base_name = orig_name.rsplit(".", 1)[0] if "." in orig_name else orig_name
    download_filename = f"{base_name}_cleaned.csv"

    return FileResponse(csv_path, media_type="text/csv", filename=download_filename)
1256
+
1257
+
1258
+ # ---------------------------------------------------------------------------
1259
+ # Frontend Static Mounts
1260
+ # ---------------------------------------------------------------------------
1261
+
1262
# Directory containing this module; the bundled front-end lives beside it.
BASE_DIR = Path(__file__).resolve().parent
web_dir = BASE_DIR.joinpath("web")
assets_dir = BASE_DIR.joinpath("assets")

# "/assets" is registered before the "/" mount, which serves the web
# directory with html=True.
app.mount(
    "/assets",
    StaticFiles(directory=str(assets_dir)),
    name="assets",
)
app.mount(
    "/",
    StaticFiles(directory=str(web_dir), html=True),
    name="web",
)
1268
+
1269
+
1270
+ # ── Server Boot ─────────────────────────────────────────────────────────────
1271
+
1272
if __name__ == "__main__":
    import uvicorn

    # Startup banner: blank line, rule, two info lines, rule, blank line.
    rule = "=" * 50
    banner = (
        "\n" + rule,
        "Crewlyze Web Platform",
        "Local URL: http://localhost:8000",
        rule + "\n",
    )
    print("\n".join(banner))
    # Serve on port 8000, all interfaces, with auto-reload enabled.
    uvicorn.run("main:app", host="0.0.0.0", port=8000, reload=True)