delimit-cli 3.3.0 → 3.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,830 @@
1
+ """
2
+ Real implementations for cost, data, and intel tools.
3
+ All tools work WITHOUT external integrations by default, using file-based
4
+ analysis and local storage. Optional cloud API integration when keys are configured.
5
+ """
6
+
7
+ import csv
8
+ import hashlib
9
+ import io
10
+ import json
11
+ import logging
12
+ import os
13
+ import re
14
+ import shutil
15
+ import sqlite3
16
+ import uuid
17
+ from datetime import datetime, timezone
18
+ from pathlib import Path
19
+ from typing import Any, Dict, List, Optional
20
+
21
# Module-level logger for the data/cost/intel tool implementations.
logger = logging.getLogger("delimit.ai.tools_data")

# Root of local delimit state; override with the DELIMIT_HOME env var.
DELIMIT_HOME = Path(os.environ.get("DELIMIT_HOME", os.path.expanduser("~/.delimit")))
BACKUPS_DIR = DELIMIT_HOME / "backups"  # destination root for data_backup
INTEL_DIR = DELIMIT_HOME / "intel"  # storage root for the intel tools
COST_ALERTS_FILE = DELIMIT_HOME / "cost_alerts.json"  # cost_alert persistence
DATASETS_FILE = INTEL_DIR / "datasets.json"  # dataset registry (JSON list)
SNAPSHOTS_DIR = INTEL_DIR / "snapshots"  # one JSON file per ingested snapshot

# Typical VPS monthly pricing estimates (USD)
VPS_COST_ESTIMATES = {
    "small": 5.0,  # 1 vCPU, 1GB RAM
    "medium": 20.0,  # 2 vCPU, 4GB RAM
    "large": 40.0,  # 4 vCPU, 8GB RAM
    "xlarge": 80.0,  # 8 vCPU, 16GB RAM
}
37
+
38
+
39
+ def _ensure_dir(path: Path) -> None:
40
+ path.mkdir(parents=True, exist_ok=True)
41
+
42
+
43
+ # ═══════════════════════════════════════════════════════════════════════
44
+ # COST TOOLS
45
+ # ═══════════════════════════════════════════════════════════════════════
46
+
47
+
48
def cost_analyze(target: str = ".") -> Dict[str, Any]:
    """Analyze project costs by scanning infrastructure files.

    Scans *target* for Dockerfiles, docker-compose files, dependency
    manifests (package.json, requirements.txt, pyproject.toml), cloud CLI
    config directories and PaaS config files, then produces a rough
    monthly cost estimate with a per-item breakdown and recommendations.

    Args:
        target: File-system path to scan (default: current directory).

    Returns:
        Dict with estimated_monthly_cost, services, dependency_count,
        cloud_providers, cost_breakdown and recommendations, or an
        ``{"error": ...}`` dict when *target* does not exist.
    """
    target_path = Path(target).resolve()
    if not target_path.exists():
        return {"error": "target_not_found", "message": f"Path does not exist: {target}"}

    services = []
    cost_breakdown = []
    recommendations = []
    total_cost = 0.0

    # Scan Dockerfiles. Read each file exactly once and cache the lowercased
    # text -- it is reused for the slim-image recommendation below (previously
    # every Dockerfile was re-read there, and an unreadable file raised).
    dockerfiles = list(target_path.rglob("Dockerfile")) + list(target_path.rglob("Dockerfile.*"))
    dockerfile_texts = []
    for df in dockerfiles:
        try:
            content = df.read_text(errors="ignore").lower()
        except Exception:
            continue  # unreadable file: skip rather than abort the whole scan
        dockerfile_texts.append(content)
        rel = str(df.relative_to(target_path))
        # Estimate a VPS size bucket from base-image keywords.
        size = "medium"
        if any(kw in content for kw in ["alpine", "slim", "distroless"]):
            size = "small"
        elif any(kw in content for kw in ["gpu", "cuda", "nvidia"]):
            size = "xlarge"
        est = VPS_COST_ESTIMATES[size]
        services.append({"type": "container", "file": rel, "size_estimate": size})
        cost_breakdown.append({"item": f"Container ({rel})", "monthly_usd": est})
        total_cost += est

    # docker-compose: each detected service is billed as a "medium" VPS.
    compose_files = (list(target_path.rglob("docker-compose.yml"))
                     + list(target_path.rglob("docker-compose.yaml"))
                     + list(target_path.rglob("compose.yml"))
                     + list(target_path.rglob("compose.yaml")))
    for cf in compose_files:
        try:
            content = cf.read_text(errors="ignore")
            # Two-space-indented "name:" lines are treated as service blocks.
            svc_count = len(re.findall(r"^\s{2}\w[\w-]*:\s*$", content, re.MULTILINE))
            if svc_count == 0:
                # Fallback: count "image:" lines, assuming at least one service.
                svc_count = max(1, content.lower().count("image:"))
            rel = str(cf.relative_to(target_path))
            for i in range(svc_count):
                est = VPS_COST_ESTIMATES["medium"]
                services.append({"type": "compose_service", "file": rel, "index": i})
                cost_breakdown.append({"item": f"Compose service #{i+1} ({rel})", "monthly_usd": est})
                total_cost += est
        except Exception:
            pass

    # package.json: count npm dependencies (no direct cost; feeds recommendations).
    pkg_files = list(target_path.rglob("package.json"))
    dep_count = 0
    for pf in pkg_files:
        if "node_modules" in str(pf):
            continue
        try:
            data = json.loads(pf.read_text(errors="ignore"))
            deps = len(data.get("dependencies", {}))
            dev_deps = len(data.get("devDependencies", {}))
            dep_count += deps + dev_deps
            services.append({"type": "node_project", "file": str(pf.relative_to(target_path)), "dependencies": deps, "dev_dependencies": dev_deps})
        except Exception:
            pass

    # requirements.txt: count non-comment, non-flag lines as dependencies.
    req_files = list(target_path.rglob("requirements.txt")) + list(target_path.rglob("requirements/*.txt"))
    for rf in req_files:
        try:
            lines = [l.strip() for l in rf.read_text(errors="ignore").splitlines()
                     if l.strip() and not l.strip().startswith("#") and not l.strip().startswith("-")]
            dep_count += len(lines)
            services.append({"type": "python_project", "file": str(rf.relative_to(target_path)), "dependencies": len(lines)})
        except Exception:
            pass

    # pyproject.toml: heuristically count quoted, version-constrained strings.
    pyproject_files = list(target_path.rglob("pyproject.toml"))
    for pf in pyproject_files:
        if "node_modules" in str(pf):
            continue
        try:
            content = pf.read_text(errors="ignore")
            dep_lines = re.findall(r'^\s*"[^"]+[><=!]', content, re.MULTILINE)
            if dep_lines:
                dep_count += len(dep_lines)
                services.append({"type": "python_project", "file": str(pf.relative_to(target_path)), "dependencies": len(dep_lines)})
        except Exception:
            pass

    # Detect cloud providers from config dirs in the project or the home dir.
    cloud_providers = []
    aws_paths = [target_path / ".aws", Path.home() / ".aws"]
    for ap in aws_paths:
        if ap.exists():
            cloud_providers.append("aws")
            break

    gcloud_paths = [target_path / ".gcloud", Path.home() / ".config" / "gcloud"]
    for gp in gcloud_paths:
        if gp.exists():
            cloud_providers.append("gcp")
            break

    if (target_path / ".azure").exists() or (Path.home() / ".azure").exists():
        cloud_providers.append("azure")

    # PaaS configs (Vercel / Netlify / Railway / Fly): flat $10/month each.
    for conf, provider in [("vercel.json", "vercel"), ("netlify.toml", "netlify"), ("railway.json", "railway"), ("fly.toml", "fly.io")]:
        if (target_path / conf).exists():
            cloud_providers.append(provider)
            services.append({"type": "paas", "provider": provider, "file": conf})
            cost_breakdown.append({"item": f"PaaS ({provider})", "monthly_usd": 10.0})
            total_cost += 10.0

    # If no services found, report that
    if not services:
        recommendations.append("No infrastructure files detected. Add Dockerfiles or deployment configs for cost estimation.")

    if dep_count > 100:
        recommendations.append(f"High dependency count ({dep_count}). Consider auditing for unused packages to reduce build times and attack surface.")
    # Use the cached texts, and match the same slim-image keyword set used for
    # sizing above ("distroless" was previously missing here, so distroless
    # images were wrongly told to switch to Alpine/slim).
    if dockerfile_texts and not any(
        any(kw in text for kw in ("alpine", "slim", "distroless")) for text in dockerfile_texts
    ):
        recommendations.append("Consider using Alpine or slim base images to reduce container costs.")

    return {
        "tool": "cost.analyze",
        "target": str(target_path),
        "estimated_monthly_cost": round(total_cost, 2),
        "services_detected": len(services),
        "services": services,
        "dependency_count": dep_count,
        "cloud_providers": cloud_providers,
        "cost_breakdown": cost_breakdown,
        "recommendations": recommendations,
    }
178
+
179
+
180
def cost_optimize(target: str = ".") -> Dict[str, Any]:
    """Find cost optimization opportunities in a project.

    Checks Dockerfiles (multi-stage builds, slim base images, .dockerignore),
    package.json for potentially unused dependencies, and large uncompressed
    image assets and JS/CSS bundles.

    Args:
        target: File-system path to scan (default: current directory).

    Returns:
        Dict with optimization_opportunities, opportunity_count and
        estimated_savings (rough USD/month), or an error dict when *target*
        does not exist.
    """
    target_path = Path(target).resolve()
    if not target_path.exists():
        return {"error": "target_not_found", "message": f"Path does not exist: {target}"}

    opportunities = []
    estimated_savings = 0.0

    # --- Dockerfile checks -------------------------------------------------
    dockerfiles = list(target_path.rglob("Dockerfile")) + list(target_path.rglob("Dockerfile.*"))
    for df in dockerfiles:
        try:
            content = df.read_text(errors="ignore")
        except Exception:
            continue  # unreadable file: skip rather than abort the scan
        rel = str(df.relative_to(target_path))
        from_count = len(re.findall(r"^FROM\s+", content, re.MULTILINE | re.IGNORECASE))

        # A build is multi-stage when it has several FROMs or a named stage
        # ("FROM image AS name"). Match AS as a whole word: the previous
        # substring test ("AS" in ...) was fooled by words such as "base".
        has_named_stage = re.search(r"^FROM\s+\S+\s+AS\s+", content, re.MULTILINE | re.IGNORECASE)
        if from_count == 1 and not has_named_stage:
            opportunities.append({
                "type": "docker_multistage",
                "file": rel,
                "severity": "medium",
                "description": "Single-stage Docker build. Multi-stage builds can reduce image size by 50-80%.",
                "estimated_savings_usd": 5.0,
            })
            estimated_savings += 5.0

        if not any(kw in content.lower() for kw in ["alpine", "slim", "distroless", "scratch"]):
            opportunities.append({
                "type": "docker_base_image",
                "file": rel,
                "severity": "low",
                "description": "Using full base image. Alpine/slim variants reduce image size and pull costs.",
                "estimated_savings_usd": 2.0,
            })
            estimated_savings += 2.0

        # A missing .dockerignore bloats the build context.
        dockerignore = df.parent / ".dockerignore"
        if not dockerignore.exists():
            opportunities.append({
                "type": "missing_dockerignore",
                "file": rel,
                "severity": "low",
                "description": "No .dockerignore found. Build context may include unnecessary files.",
                "estimated_savings_usd": 1.0,
            })
            estimated_savings += 1.0

    # --- Potentially unused npm dependencies -------------------------------
    pkg_files = list(target_path.rglob("package.json"))
    for pf in pkg_files:
        if "node_modules" in str(pf):
            continue
        try:
            data = json.loads(pf.read_text(errors="ignore"))
            deps = data.get("dependencies", {})
            rel = str(pf.relative_to(target_path))

            # Prefer src/ when present; otherwise scan next to package.json.
            src_dir = pf.parent / "src"
            if not src_dir.exists():
                src_dir = pf.parent

            # Concatenate all source text once; dependency names are then
            # checked by substring match (a heuristic, hence "potentially").
            source_content = ""
            for ext in ["*.js", "*.ts", "*.jsx", "*.tsx", "*.mjs"]:
                for sf in src_dir.rglob(ext):
                    if "node_modules" in str(sf) or "dist" in str(sf) or ".next" in str(sf):
                        continue
                    try:
                        source_content += sf.read_text(errors="ignore") + "\n"
                    except Exception:
                        pass

            if source_content:
                potentially_unused = []
                for dep_name in deps:
                    # Check common import spellings of the package name.
                    patterns = [
                        dep_name,
                        dep_name.replace("-", "_"),
                        dep_name.split("/")[-1],  # scoped packages: bare name
                    ]
                    if not any(p in source_content for p in patterns):
                        potentially_unused.append(dep_name)

                if potentially_unused:
                    opportunities.append({
                        "type": "unused_npm_dependencies",
                        "file": rel,
                        "severity": "medium",
                        "description": f"Potentially unused dependencies: {', '.join(potentially_unused[:10])}",
                        "count": len(potentially_unused),
                        "packages": potentially_unused[:10],
                        "estimated_savings_usd": len(potentially_unused) * 0.5,
                    })
                    estimated_savings += len(potentially_unused) * 0.5
        except Exception:
            pass

    # --- Large uncompressed image assets -----------------------------------
    large_assets = []
    for ext in ["*.png", "*.jpg", "*.jpeg", "*.gif", "*.bmp", "*.svg"]:
        for af in target_path.rglob(ext):
            if "node_modules" in str(af) or ".git" in str(af):
                continue
            try:
                size = af.stat().st_size
                if size > 500_000:  # > 500KB
                    large_assets.append({"file": str(af.relative_to(target_path)), "size_bytes": size})
            except Exception:
                pass

    if large_assets:
        opportunities.append({
            "type": "uncompressed_assets",
            "severity": "low",
            "description": f"Found {len(large_assets)} large image files (>500KB). Consider compression or WebP conversion.",
            "files": large_assets[:10],
            "estimated_savings_usd": 2.0,
        })
        estimated_savings += 2.0

    # --- Large bundles without precompressed variants ----------------------
    bundle_dirs = {"dist", "build", "public", ".next"}
    for ext in ["*.js", "*.css"]:
        for bf in target_path.rglob(ext):
            if "node_modules" in str(bf) or ".git" in str(bf):
                continue
            # Compare path components instead of "/dist/" substrings so the
            # check also works with Windows path separators.
            ancestor_dirs = {part.lower() for part in bf.parts[:-1]}
            if ancestor_dirs & bundle_dirs:
                try:
                    size = bf.stat().st_size
                    if size > 1_000_000:  # > 1MB
                        gz_path = Path(str(bf) + ".gz")
                        br_path = Path(str(bf) + ".br")
                        if not gz_path.exists() and not br_path.exists():
                            opportunities.append({
                                "type": "uncompressed_bundle",
                                "file": str(bf.relative_to(target_path)),
                                "severity": "medium",
                                "size_bytes": size,
                                "description": f"Large uncompressed bundle ({size // 1024}KB). Enable gzip/brotli compression.",
                                "estimated_savings_usd": 1.0,
                            })
                            estimated_savings += 1.0
                except Exception:
                    pass

    return {
        "tool": "cost.optimize",
        "target": str(target_path),
        "optimization_opportunities": opportunities,
        "opportunity_count": len(opportunities),
        "estimated_savings": round(estimated_savings, 2),
    }
334
+
335
+
336
def cost_alert(action: str = "list", name: Optional[str] = None,
               threshold: Optional[float] = None, alert_id: Optional[str] = None) -> Dict[str, Any]:
    """Manage file-based cost alerts stored in COST_ALERTS_FILE.

    Args:
        action: One of "list", "create", "delete", "toggle".
        name: Alert name (required for "create").
        threshold: Monthly USD threshold (required for "create").
        alert_id: Alert identifier (required for "delete"/"toggle").

    Returns:
        Dict with the resulting alert list and active count, or an
        ``{"error": ...}`` dict for bad input / unknown ids.
    """
    _ensure_dir(DELIMIT_HOME)

    # Load existing alerts; a corrupt store starts fresh instead of crashing.
    alerts: List[Dict[str, Any]] = []
    if COST_ALERTS_FILE.exists():
        try:
            alerts = json.loads(COST_ALERTS_FILE.read_text())
        except Exception:
            alerts = []

    def _persist() -> None:
        # Write the full alert list back to disk.
        COST_ALERTS_FILE.write_text(json.dumps(alerts, indent=2))

    def _response(act: str, **extra: Any) -> Dict[str, Any]:
        # Common response envelope shared by every successful action.
        payload: Dict[str, Any] = {
            "tool": "cost.alert",
            "action": act,
            "alerts": alerts,
            "active_count": sum(1 for a in alerts if a.get("active", True)),
        }
        payload.update(extra)
        return payload

    if action == "list":
        return _response("list")

    if action == "create":
        if not name or threshold is None:
            return {"error": "missing_params", "message": "create requires 'name' and 'threshold'"}
        new_alert = {
            "id": str(uuid.uuid4())[:8],
            "name": name,
            "threshold_usd": threshold,
            "active": True,
            "created_at": datetime.now(timezone.utc).isoformat(),
        }
        alerts.append(new_alert)
        _persist()
        return _response("create", alert=new_alert)

    if action == "delete":
        if not alert_id:
            return {"error": "missing_params", "message": "delete requires 'alert_id'"}
        remaining = [a for a in alerts if a.get("id") != alert_id]
        if len(remaining) == len(alerts):
            return {"error": "not_found", "message": f"Alert {alert_id} not found"}
        alerts = remaining  # rebind; the closures above see the new list
        _persist()
        return _response("delete", deleted_id=alert_id)

    if action == "toggle":
        if not alert_id:
            return {"error": "missing_params", "message": "toggle requires 'alert_id'"}
        for entry in alerts:
            if entry.get("id") == alert_id:
                entry["active"] = not entry.get("active", True)
                _persist()
                return _response("toggle", alert_id=alert_id)
        return {"error": "not_found", "message": f"Alert {alert_id} not found"}

    return {"error": "invalid_action", "message": f"Unknown action: {action}. Use list/create/delete/toggle."}
415
+
416
+
417
+ # ═══════════════════════════════════════════════════════════════════════
418
+ # DATA TOOLS
419
+ # ═══════════════════════════════════════════════════════════════════════
420
+
421
+
422
def data_validate(target: str = ".") -> Dict[str, Any]:
    """Validate data files (JSON, CSV, SQLite) under *target*.

    JSON files must parse; CSV files must be non-empty with a consistent
    column count; SQLite databases must pass ``PRAGMA integrity_check``.

    Args:
        target: A file or directory to validate (default: current directory).

    Returns:
        Dict with files_checked, valid, invalid and a list of issues, or an
        ``{"error": ...}`` dict when *target* does not exist.
    """
    target_path = Path(target).resolve()
    if not target_path.exists():
        return {"error": "target_not_found", "message": f"Path does not exist: {target}"}

    files_checked = 0
    valid = 0
    invalid = 0
    issues = []

    # A single file is validated directly; a directory is scanned recursively.
    if target_path.is_file():
        file_list = [target_path]
    else:
        file_list = []
        for ext in ["*.json", "*.csv", "*.sqlite", "*.sqlite3", "*.db"]:
            for f in target_path.rglob(ext):
                if "node_modules" in str(f) or ".git" in str(f):
                    continue
                file_list.append(f)

    for fpath in file_list:
        files_checked += 1
        suffix = fpath.suffix.lower()
        rel = str(fpath.relative_to(target_path)) if target_path.is_dir() else fpath.name

        if suffix == ".json":
            try:
                content = fpath.read_text(errors="ignore")
                json.loads(content)
                valid += 1
            except json.JSONDecodeError as e:
                invalid += 1
                issues.append({"file": rel, "type": "json_parse_error", "message": str(e)})
            except Exception as e:
                invalid += 1
                issues.append({"file": rel, "type": "read_error", "message": str(e)})

        elif suffix == ".csv":
            try:
                content = fpath.read_text(errors="ignore")
                reader = csv.reader(io.StringIO(content))
                rows = list(reader)
                if not rows:
                    issues.append({"file": rel, "type": "empty_csv", "message": "CSV file is empty"})
                    invalid += 1
                else:
                    header_len = len(rows[0])
                    # Every data row must match the header's column count
                    # (rows are numbered from 2: row 1 is the header).
                    inconsistent_rows = [i for i, row in enumerate(rows[1:], start=2) if len(row) != header_len]
                    if inconsistent_rows:
                        invalid += 1
                        issues.append({
                            "file": rel,
                            "type": "csv_column_mismatch",
                            "message": f"Expected {header_len} columns, found mismatches on rows: {inconsistent_rows[:10]}",
                            "expected_columns": header_len,
                            "mismatched_rows": inconsistent_rows[:10],
                        })
                    else:
                        valid += 1
            except Exception as e:
                invalid += 1
                issues.append({"file": rel, "type": "csv_error", "message": str(e)})

        elif suffix in (".sqlite", ".sqlite3", ".db"):
            try:
                conn = sqlite3.connect(str(fpath))
                try:
                    cursor = conn.execute("PRAGMA integrity_check")
                    result = cursor.fetchone()
                    if result and result[0] == "ok":
                        # Also flag structurally valid but empty databases.
                        tables = conn.execute("SELECT name FROM sqlite_master WHERE type='table'").fetchall()
                        valid += 1
                        if not tables:
                            issues.append({"file": rel, "type": "empty_database", "message": "SQLite database has no tables", "severity": "info"})
                    else:
                        invalid += 1
                        issues.append({"file": rel, "type": "sqlite_integrity_failed", "message": str(result)})
                finally:
                    # Always release the handle; previously it leaked when the
                    # integrity check itself raised.
                    conn.close()
            except Exception as e:
                invalid += 1
                issues.append({"file": rel, "type": "sqlite_error", "message": str(e)})

    return {
        "tool": "data.validate",
        "target": str(target_path),
        "files_checked": files_checked,
        "valid": valid,
        "invalid": invalid,
        "issues": issues,
    }
517
+
518
+
519
def data_migrate(target: str = ".") -> Dict[str, Any]:
    """Check for migration files and report status.

    Detects Alembic, Django, Prisma and Knex/Sequelize migration layouts
    under *target* and lists the migration files found. Without a database
    connection it cannot distinguish applied from pending migrations, so
    ``pending_migrations`` is always 0.

    Args:
        target: Project root to scan (default: current directory).

    Returns:
        Dict with framework_detected, migrations, pending_migrations,
        last_migration and status, or an error dict if *target* is missing.
    """
    target_path = Path(target).resolve()
    if not target_path.exists():
        return {"error": "target_not_found", "message": f"Path does not exist: {target}"}

    framework_detected = None
    migrations_found = []
    pending = 0
    status = "no_migrations"

    # Check for Alembic (Python/SQLAlchemy): either ./alembic or the
    # conventional migrations/versions layout.
    alembic_dir = target_path / "alembic"
    if not alembic_dir.exists():
        alembic_dir = target_path / "migrations" / "versions"
    if alembic_dir.exists():
        framework_detected = "alembic"
        for mf in sorted(alembic_dir.glob("*.py")):
            # Skip package/bootstrap files that are not revisions.
            if mf.name == "__init__.py" or mf.name == "env.py":
                continue
            migrations_found.append({
                "file": str(mf.relative_to(target_path)),
                "name": mf.stem,
                "modified": datetime.fromtimestamp(mf.stat().st_mtime, tz=timezone.utc).isoformat(),
            })

    # Check for Django migrations: any "migrations" directory that is a
    # Python package (has __init__.py), excluding vendored/alembic paths.
    django_dirs = list(target_path.rglob("migrations"))
    for md in django_dirs:
        if "node_modules" in str(md) or ".git" in str(md) or "alembic" in str(md):
            continue
        init_file = md / "__init__.py"
        if init_file.exists():
            # Keep an earlier framework detection (e.g. alembic) if present.
            framework_detected = framework_detected or "django"
            for mf in sorted(md.glob("*.py")):
                if mf.name == "__init__.py":
                    continue
                migrations_found.append({
                    "file": str(mf.relative_to(target_path)),
                    "name": mf.stem,
                    "modified": datetime.fromtimestamp(mf.stat().st_mtime, tz=timezone.utc).isoformat(),
                })

    # Check for Prisma migrations: one subdirectory per migration.
    prisma_dir = target_path / "prisma" / "migrations"
    if prisma_dir.exists():
        framework_detected = framework_detected or "prisma"
        for mdir in sorted(prisma_dir.iterdir()):
            if mdir.is_dir() and mdir.name != "migration_lock.toml":
                sql_file = mdir / "migration.sql"
                migrations_found.append({
                    "file": str(mdir.relative_to(target_path)),
                    "name": mdir.name,
                    "has_sql": sql_file.exists(),
                    "modified": datetime.fromtimestamp(mdir.stat().st_mtime, tz=timezone.utc).isoformat(),
                })

    # Check for Knex/Sequelize migrations: JS/TS files in ./migrations,
    # only consulted when no other framework matched above.
    knex_dir = target_path / "migrations"
    if knex_dir.exists() and not framework_detected:
        js_migrations = list(knex_dir.glob("*.js")) + list(knex_dir.glob("*.ts"))
        if js_migrations:
            framework_detected = "knex/sequelize"
            for mf in sorted(js_migrations):
                migrations_found.append({
                    "file": str(mf.relative_to(target_path)),
                    "name": mf.stem,
                    "modified": datetime.fromtimestamp(mf.stat().st_mtime, tz=timezone.utc).isoformat(),
                })

    # NOTE(review): "last" is the last entry in scan order, not necessarily
    # the most recently applied migration.
    last_applied = migrations_found[-1]["name"] if migrations_found else None
    if migrations_found:
        status = "migrations_found"
        # Without a DB connection, we can only report found migrations, not applied vs pending
        pending = 0  # Would need DB connection to determine

    return {
        "tool": "data.migrate",
        "target": str(target_path),
        "framework_detected": framework_detected,
        "migrations_found": len(migrations_found),
        "migrations": migrations_found,
        "pending_migrations": pending,
        "last_migration": last_applied,
        "status": status,
        "note": "Connect a database to determine applied vs pending migrations." if migrations_found else None,
    }
606
+
607
+
608
def data_backup(target: str = ".") -> Dict[str, Any]:
    """Back up SQLite and JSON data files into a timestamped folder
    under ~/.delimit/backups/, preserving relative paths."""
    root = Path(target).resolve()
    if not root.exists():
        return {"error": "target_not_found", "message": f"Path does not exist: {target}"}

    stamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
    destination_root = BACKUPS_DIR / stamp
    _ensure_dir(destination_root)

    copied: List[Dict[str, Any]] = []
    bytes_total = 0

    # Collect candidates: either the single file given, or a recursive scan.
    if root.is_file():
        candidates = [root]
    else:
        # Well-known config manifests are not "data" even though they are JSON.
        config_names = ("package.json", "package-lock.json", "tsconfig.json", "jsconfig.json", "composer.json")
        candidates = []
        for pattern in ("*.sqlite", "*.sqlite3", "*.db", "*.json"):
            for candidate in root.rglob(pattern):
                path_str = str(candidate)
                if "node_modules" in path_str or ".git" in path_str or str(DELIMIT_HOME) in path_str:
                    continue
                if candidate.name in config_names:
                    continue
                candidates.append(candidate)

    for src in candidates:
        try:
            byte_count = src.stat().st_size
            rel = src.relative_to(root) if root.is_dir() else Path(src.name)
            dest = destination_root / rel
            _ensure_dir(dest.parent)
            shutil.copy2(str(src), str(dest))
            copied.append({"file": str(rel), "size_bytes": byte_count})
            bytes_total += byte_count
        except Exception as e:
            copied.append({"file": str(src.name), "error": str(e)})

    return {
        "tool": "data.backup",
        "target": str(root),
        "files_backed_up": sum(1 for entry in copied if "error" not in entry),
        "files": copied,
        "backup_path": str(destination_root),
        "total_size": bytes_total,
        "total_size_human": _human_size(bytes_total),
    }
659
+
660
+
661
+ def _human_size(size_bytes: int) -> str:
662
+ for unit in ["B", "KB", "MB", "GB"]:
663
+ if size_bytes < 1024:
664
+ return f"{size_bytes:.1f} {unit}"
665
+ size_bytes /= 1024
666
+ return f"{size_bytes:.1f} TB"
667
+
668
+
669
+ # ═══════════════════════════════════════════════════════════════════════
670
+ # INTEL TOOLS
671
+ # ═══════════════════════════════════════════════════════════════════════
672
+
673
+
674
def _load_datasets() -> List[Dict[str, Any]]:
    """Read the dataset registry from disk; missing or unreadable file yields []."""
    _ensure_dir(INTEL_DIR)
    if not DATASETS_FILE.exists():
        return []
    try:
        return json.loads(DATASETS_FILE.read_text())
    except Exception:
        return []
682
+
683
+
684
def _save_datasets(datasets: List[Dict[str, Any]]) -> None:
    """Persist the dataset registry as pretty-printed JSON."""
    _ensure_dir(INTEL_DIR)
    serialized = json.dumps(datasets, indent=2)
    DATASETS_FILE.write_text(serialized)
687
+
688
+
689
def intel_snapshot_ingest(data: Dict[str, Any], provenance: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
    """Save JSON data with provenance metadata as a snapshot file."""
    _ensure_dir(SNAPSHOTS_DIR)

    sid = str(uuid.uuid4())[:12]
    now = datetime.now(timezone.utc).isoformat()
    # Content checksum over the canonical (key-sorted) JSON form.
    digest = hashlib.sha256(json.dumps(data, sort_keys=True).encode()).hexdigest()[:16]

    record = {
        "id": sid,
        "data": data,
        "provenance": provenance or {},
        "ingested_at": now,
        "checksum": digest,
    }

    out_path = SNAPSHOTS_DIR / f"{sid}.json"
    out_path.write_text(json.dumps(record, indent=2))

    return {
        "tool": "intel.snapshot_ingest",
        "snapshot_id": sid,
        "ingested_at": now,
        "checksum": digest,
        "storage_path": str(out_path),
    }
714
+
715
+
716
def intel_query(dataset_id: Optional[str] = None, query: str = "", parameters: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
    """Search saved snapshots by keyword, dataset id and ingestion date.

    ``parameters`` may carry ``date_from``/``date_to`` (ISO strings compared
    lexicographically against ``ingested_at``) and ``limit`` (default 50).
    """
    _ensure_dir(SNAPSHOTS_DIR)

    opts = parameters or {}
    date_from = opts.get("date_from")
    date_to = opts.get("date_to")
    max_hits = opts.get("limit", 50)
    needle = query.lower() if query else ""

    hits: List[Dict[str, Any]] = []
    # Walk snapshot files in reverse filename order; unreadable files skipped.
    for path in sorted(SNAPSHOTS_DIR.glob("*.json"), reverse=True):
        try:
            snap = json.loads(path.read_text())
        except Exception:
            continue

        # Restrict to one dataset when requested.
        if dataset_id and snap.get("provenance", {}).get("dataset_id") != dataset_id:
            continue

        stamp = snap.get("ingested_at", "")
        if date_from and stamp < date_from:
            continue
        if date_to and stamp > date_to:
            continue

        # Keyword filter: case-insensitive substring over the JSON payload.
        if needle and needle not in json.dumps(snap.get("data", {})).lower():
            continue

        hits.append({
            "snapshot_id": snap.get("id"),
            "ingested_at": stamp,
            "provenance": snap.get("provenance", {}),
            "data_preview": _truncate_data(snap.get("data", {})),
        })
        if len(hits) >= max_hits:
            break

    return {
        "tool": "intel.query",
        "query": query,
        "dataset_id": dataset_id,
        "results": hits,
        "total_results": len(hits),
    }
767
+
768
+
769
+ def _truncate_data(data: Any, max_len: int = 200) -> Any:
770
+ """Truncate data for preview."""
771
+ s = json.dumps(data)
772
+ if len(s) <= max_len:
773
+ return data
774
+ return {"_preview": s[:max_len] + "...", "_truncated": True}
775
+
776
+
777
def intel_dataset_register(name: str, schema: Optional[Dict[str, Any]] = None,
                           description: Optional[str] = None) -> Dict[str, Any]:
    """Register a new dataset in the local registry.

    Args:
        name: Unique dataset name; duplicates are rejected.
        schema: Optional JSON-serializable schema description.
        description: Optional free-form description.

    Returns:
        ``{"tool": ..., "dataset": ...}`` on success, or an error dict when
        the name is already registered (including the existing dataset_id).
    """
    datasets = _load_datasets()

    # Reject duplicate names, pointing the caller at the existing record.
    for ds in datasets:
        if ds.get("name") == name:
            return {"error": "duplicate", "message": f"Dataset '{name}' already registered", "dataset_id": ds["id"]}

    # Take one timestamp so created_at and updated_at start out identical
    # (two separate now() calls could differ by microseconds).
    now = datetime.now(timezone.utc).isoformat()
    dataset_id = str(uuid.uuid4())[:12]
    new_dataset = {
        "id": dataset_id,
        "name": name,
        "schema": schema or {},
        "description": description or "",
        "frozen": False,
        "created_at": now,
        "updated_at": now,
    }
    datasets.append(new_dataset)
    _save_datasets(datasets)

    return {
        "tool": "intel.dataset_register",
        "dataset": new_dataset,
    }
804
+
805
+
806
def intel_dataset_list() -> Dict[str, Any]:
    """List all registered datasets with a total count."""
    registered = _load_datasets()
    return {
        "tool": "intel.dataset_list",
        "datasets": registered,
        "total": len(registered),
    }
814
+
815
+
816
def intel_dataset_freeze(dataset_id: str) -> Dict[str, Any]:
    """Mark a dataset as immutable; already-frozen datasets are reported as such."""
    datasets = _load_datasets()

    for record in datasets:
        if record.get("id") != dataset_id:
            continue
        if record.get("frozen"):
            return {"tool": "intel.dataset_freeze", "dataset_id": dataset_id, "status": "already_frozen"}
        stamp = datetime.now(timezone.utc).isoformat()
        record["frozen"] = True
        record["frozen_at"] = stamp
        record["updated_at"] = stamp
        _save_datasets(datasets)
        return {"tool": "intel.dataset_freeze", "dataset_id": dataset_id, "status": "frozen", "frozen_at": stamp}

    return {"error": "not_found", "message": f"Dataset {dataset_id} not found"}