data-collection-framework 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. data_collection_framework-0.1.0.dist-info/METADATA +19 -0
  2. data_collection_framework-0.1.0.dist-info/RECORD +44 -0
  3. data_collection_framework-0.1.0.dist-info/WHEEL +5 -0
  4. data_collection_framework-0.1.0.dist-info/entry_points.txt +2 -0
  5. data_collection_framework-0.1.0.dist-info/top_level.txt +1 -0
  6. dcf/__init__.py +4 -0
  7. dcf/cli.py +841 -0
  8. dcf/config/__init__.py +4 -0
  9. dcf/config/loader.py +77 -0
  10. dcf/config/models.py +240 -0
  11. dcf/engine/__init__.py +6 -0
  12. dcf/engine/fetcher.py +118 -0
  13. dcf/engine/iterator.py +96 -0
  14. dcf/engine/projector.py +56 -0
  15. dcf/engine/runner.py +90 -0
  16. dcf/engine/transforms.py +41 -0
  17. dcf/gcp/__init__.py +0 -0
  18. dcf/gcp/_collector_utils.py +87 -0
  19. dcf/gcp/auth.py +1 -0
  20. dcf/gcp/batch_deploy.py +548 -0
  21. dcf/gcp/bootstrap.py +131 -0
  22. dcf/gcp/gcloud.py +42 -0
  23. dcf/gcp/terraform.py +151 -0
  24. dcf/infra/modules/batch_collector/gcp/airflow/main.tf +194 -0
  25. dcf/infra/modules/batch_collector/gcp/airflow/outputs.tf +9 -0
  26. dcf/infra/modules/batch_collector/gcp/airflow/variables.tf +52 -0
  27. dcf/infra/modules/batch_collector/gcp/main.tf +70 -0
  28. dcf/infra/modules/batch_collector/gcp/outputs.tf +4 -0
  29. dcf/infra/modules/batch_collector/gcp/variables.tf +40 -0
  30. dcf/infra/modules/batch_collector/local/airflow/main.tf +64 -0
  31. dcf/infra/modules/batch_collector/local/airflow/outputs.tf +9 -0
  32. dcf/infra/modules/batch_collector/local/airflow/variables.tf +59 -0
  33. dcf/infra/modules/batch_collector/local/main.tf +32 -0
  34. dcf/infra/modules/batch_collector/local/outputs.tf +4 -0
  35. dcf/infra/modules/batch_collector/local/variables.tf +25 -0
  36. dcf/infra/templates/airflow.Dockerfile.tftpl +6 -0
  37. dcf/infra/templates/batch_collector.Dockerfile.tftpl +14 -0
  38. dcf/infra/templates/docker-compose.yml.tftpl +76 -0
  39. dcf/local_deploy.py +756 -0
  40. dcf/project.py +23 -0
  41. dcf/spark_session.py +66 -0
  42. dcf/warehouse_reader.py +323 -0
  43. dcf/writer/__init__.py +3 -0
  44. dcf/writer/iceberg.py +315 -0
dcf/cli.py ADDED
@@ -0,0 +1,841 @@
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ import re
5
+ from pathlib import Path
6
+
7
+ _ANSI_RE = re.compile(r'\x1b\[[0-9;]*[mGKHJF]')
8
+
9
+ import typer
10
+ import yaml
11
+
12
# Root command group for the `dcf` CLI, plus two nested command groups.
app = typer.Typer(help="dcf", no_args_is_help=True)
gcp_app = typer.Typer(help="GCP lake provisioning", no_args_is_help=True)
mcp_app = typer.Typer(help="MCP server for AI-driven collector development", no_args_is_help=True)
# Mount the nested groups so they are reachable as `dcf gcp ...` and `dcf mcp ...`.
app.add_typer(gcp_app, name="gcp")
app.add_typer(mcp_app, name="mcp")
17
+
18
def _project_root() -> Path:
    """Return the project root as reported by ``project.find_project_root``."""
    # Deferred import, in line with the other lazy imports in this module.
    from . import project

    return project.find_project_root()
21
+
22
+
23
def _collectors_dir() -> Path:
    """Return the ``collectors`` directory located directly under the project root."""
    root = _project_root()
    return root.joinpath("collectors")
25
+
26
+
27
def _load_config() -> dict:
    """Read ``project.yml`` from the project root.

    Returns an empty dict when the file is missing or when it parses to a
    falsy value (for example an empty file).
    """
    config_path = _project_root().joinpath("project.yml")
    if config_path.exists():
        parsed = yaml.safe_load(config_path.read_text())
        return parsed if parsed else {}
    return {}
32
+
33
+
34
def _save_config(config: dict) -> None:
    """Write *config* to ``project.yml`` as block-style YAML, keeping key order."""
    target = _project_root().joinpath("project.yml")
    serialized = yaml.dump(config, default_flow_style=False, sort_keys=False)
    target.write_text(serialized)
37
+
38
+
39
def _get_catalog() -> str:
    """Return the ``catalog`` entry of project.yml, or ``"local"`` when absent."""
    config = _load_config()
    return config.get("catalog", "local")
41
+
42
+
43
+ # ------------------------------------------------------------------ #
44
+ # init #
45
+ # ------------------------------------------------------------------ #
46
+
47
@app.command()
def init(
    catalog: str = typer.Option("local", prompt="Catalog type (local or gcp)", show_default=True),
):
    """Create or update project.yml configuration."""
    # Merge into whatever is already stored so unrelated keys survive.
    settings = _load_config()
    settings["catalog"] = catalog
    _save_config(settings)

    config_path = _project_root() / "project.yml"
    messages = (
        f"Config saved to {config_path}",
        "\nTo store API keys, add them to project.yml as plain keys:",
        " my_api_key: sk-xxxx",
        "Then reference them in collector YAML: value: \"{{ env.MY_API_KEY }}\"",
        "Or set them as environment variables before running.",
    )
    for line in messages:
        typer.echo(line)
60
+
61
+
62
+ # ------------------------------------------------------------------ #
63
+ # run #
64
+ # ------------------------------------------------------------------ #
65
+
66
@app.command()
def run(
    collector_name: str = typer.Argument(..., help="Collector name (without .yml) or 'all'"),
    start: str | None = typer.Option(None, help="Override backfill start date (YYYY-MM-DD)"),
    end: str | None = typer.Option(None, help="Override backfill end date (YYYY-MM-DD)"),
    limit: int | None = typer.Option(None, help="Run only the first N iterations"),
    param: list[str] = typer.Option([], help="Override a param value: key=value (repeatable)"),
):
    """Run one or all collectors."""
    from .config import load_all_collectors, load_collector
    from .engine import run_collector

    catalog = _get_catalog()
    param_overrides = _parse_params(param)
    collectors_dir = _collectors_dir()

    # Resolve the set of collectors to execute.
    if collector_name != "all":
        path = collectors_dir / f"{collector_name}.yml"
        if not path.exists():
            typer.echo(f"Collector not found: {path}", err=True)
            raise typer.Exit(1)
        selected = [load_collector(path)]
    else:
        selected = load_all_collectors(collectors_dir)

    # Date overrides are applied only when at least one bound was supplied.
    apply_dates = bool(start or end)
    for collector in selected:
        if apply_dates:
            _override_date_range(collector, start, end)
        run_collector(collector, catalog=catalog, limit=limit, param_overrides=param_overrides)
95
+
96
+
97
+ # ------------------------------------------------------------------ #
98
+ # validate #
99
+ # ------------------------------------------------------------------ #
100
+
101
def _check_unset_env_refs(path: Path) -> list[str]:
    """List the ``{{ env.VAR }}`` names in *path* that have no value available.

    A name counts as set when the environment variable is non-empty or when
    project.yml holds a truthy value under the lower-cased name. The result
    is de-duplicated and sorted.
    """
    referenced = set(re.findall(r"\{\{\s*env\.(\w+)\s*\}\}", path.read_text()))
    config = _load_config()
    missing = [
        name
        for name in referenced
        if not (os.environ.get(name) or config.get(name.lower()))
    ]
    return sorted(missing)
107
+
108
+
109
def _report_collector_load_error(label: str, exc: Exception) -> None:
    """Print a collector load failure to stderr.

    Pydantic validation errors are expanded to one line per failing field;
    any other exception is printed as a single line.
    """
    from pydantic import ValidationError

    if isinstance(exc, ValidationError):
        for detail in exc.errors():
            loc = ".".join(str(part) for part in detail["loc"])
            typer.echo(f"Validation error in '{label}': {loc} — {detail['msg']}", err=True)
    else:
        typer.echo(f"Error loading '{label}': {exc}", err=True)


@app.command()
def validate(
    collector_name: str = typer.Argument(..., help="Collector name (without .yml) or 'all'"),
):
    """Parse and validate collector YAML without running it."""
    from .config import load_collector, load_all_collectors

    collectors_dir = _collectors_dir()
    if collector_name == "all":
        # Report load failures the same way the single-collector path does,
        # instead of letting them surface as a raw traceback.
        try:
            collectors = load_all_collectors(collectors_dir, resolve_env=False)
        except Exception as e:
            _report_collector_load_error(collector_name, e)
            raise typer.Exit(1)
        names = [c.name for c in collectors]
        for path in sorted(collectors_dir.glob("*.yml")):
            unset = _check_unset_env_refs(path)
            if unset:
                typer.echo(
                    f" WARNING '{path.stem}': these env vars are not set: {', '.join(unset)}\n"
                    f" Set them as environment variables or add to project.yml before running.",
                    err=True,
                )
        typer.echo(f"OK — {len(collectors)} collector(s): {', '.join(names)}")
        return

    path = collectors_dir / f"{collector_name}.yml"
    if not path.exists():
        typer.echo(f"Collector not found: {path}", err=True)
        raise typer.Exit(1)
    try:
        collector = load_collector(path, resolve_env=False)
    except Exception as e:
        _report_collector_load_error(collector_name, e)
        raise typer.Exit(1)

    # Unset env references are only a warning: validation itself succeeded.
    unset = _check_unset_env_refs(path)
    if unset:
        typer.echo(
            f"WARNING: these env vars are referenced but not set: {', '.join(unset)}\n"
            f" Set them as environment variables or add to project.yml before running.",
            err=True,
        )

    from .config.models import PubSubSource
    if isinstance(collector.source, PubSubSource):
        typer.echo(
            f"OK — '{collector.name}' (streaming, "
            f"subscription: {collector.source.subscription}, "
            f"{len(collector.source.schema_.columns)} columns)"
        )
    else:
        typer.echo(
            f"OK — '{collector.name}' ({len(collector.source.params)} params, "
            f"{len(collector.cadence.iterate)} cadence axes, "
            f"{len(collector.source.schema_.columns)} columns)"
        )
163
+
164
+
165
+ # ------------------------------------------------------------------ #
166
+ # query #
167
+ # ------------------------------------------------------------------ #
168
+
169
@app.command()
def query(
    sql: str | None = typer.Argument(None, help="SQL query string"),
    file: Path | None = typer.Option(None, "--file", "-f", help="Path to a .sql file"),
):
    """Run a SQL query against the warehouse and print results."""
    from .warehouse_reader import query as run_query

    # Exactly one of the two input sources must be given.
    have_sql = sql is not None
    have_file = file is not None
    if not have_sql and not have_file:
        typer.echo("Error: provide a SQL string or --file <path>.sql", err=True)
        raise typer.Exit(1)
    if have_sql and have_file:
        typer.echo("Error: provide either a SQL string or --file, not both", err=True)
        raise typer.Exit(1)

    if have_file:
        if not file.exists():
            typer.echo(f"Error: file not found: {file}", err=True)
            raise typer.Exit(1)
        sql = file.read_text()

    try:
        rows = run_query(sql)
    except Exception as e:
        typer.echo(f"Query error: {e}", err=True)
        raise typer.Exit(1)

    if not rows:
        typer.echo("0 rows")
        return

    from rich.console import Console
    from rich.table import Table

    # Column headers come from the first row; None values render as empty cells.
    table = Table(show_header=True, header_style="bold")
    for column in rows[0].keys():
        table.add_column(column)
    for record in rows:
        cells = ["" if value is None else str(value) for value in record.values()]
        table.add_row(*cells)

    Console().print(table)
    total = len(rows)
    noun = "rows" if total != 1 else "row"
    typer.echo(f"{total} {noun}")
212
+
213
+
214
+ # ------------------------------------------------------------------ #
215
+ # gcp #
216
+ # ------------------------------------------------------------------ #
217
+
218
@gcp_app.command("setup")
def gcp_setup(
    project_id: str = typer.Option(..., "--project-id", "-p", help="GCP project ID"),
    region: str = typer.Option(..., "--region", "-r", help="GCP region (e.g. us-central1)"),
    force: bool = typer.Option(
        False,
        "--force",
        help="Re-run setup even if a previous run is recorded as running or complete",
    ),
):
    """Provision a GCP data lake."""
    from .gcp import bootstrap, terraform
    from .gcp.gcloud import get_credentials

    cfg = _load_config()
    gcp = cfg.get("gcp", {})

    # A recorded "running"/"complete" status blocks a re-run unless --force is
    # given. Without the flag, a run that was interrupted while "running"
    # could never be retried.
    # NOTE(review): whether the bootstrap/terraform steps tolerate resources
    # that already exist is not visible from this file — confirm before
    # relying on --force against a completed setup.
    if not force and gcp.get("setup_status") in ("running", "complete"):
        typer.echo(f"GCP setup already '{gcp['setup_status']}'. Use --force to re-run.", err=True)
        raise typer.Exit(1)

    typer.echo("Checking Google credentials...")
    try:
        credentials = get_credentials()
    except RuntimeError as e:
        typer.echo(str(e), err=True)
        raise typer.Exit(1)
    typer.echo("Credentials OK.")

    # Persist the in-progress marker before touching any cloud resources.
    gcp.update({"project_id": project_id, "region": region, "setup_status": "running"})
    cfg["gcp"] = gcp
    _save_config(cfg)

    try:
        typer.echo("Creating Terraform state bucket...")
        tf_state_bucket = bootstrap.create_state_bucket(project_id, region, credentials)

        typer.echo("Creating service account...")
        sa_email = bootstrap.create_service_account(project_id, credentials)

        typer.echo("Creating service account key...")
        key_data = bootstrap.create_service_account_key(project_id, sa_email, credentials)

        typer.echo("Storing key in Secret Manager...")
        secret_name = bootstrap.store_key_in_secret_manager(project_id, key_data, credentials)

        typer.echo("Provisioning lake infrastructure (terraform apply)...")
        warehouse_bucket = terraform.provision(
            project_id=project_id,
            region=region,
            sa_email=sa_email,
            tf_state_bucket=tf_state_bucket,
        )

        gcp.update({
            "sa_email": sa_email,
            "secret_name": secret_name,
            "tf_state_bucket": tf_state_bucket,
            "warehouse_bucket": warehouse_bucket,
            "setup_status": "complete",
            "setup_error": None,
        })
        cfg["gcp"] = gcp
        _save_config(cfg)

        typer.echo("\nGCP lake provisioned successfully!")
        typer.echo(f" Warehouse bucket: {warehouse_bucket}")
        typer.echo(f" Service account: {sa_email}")

    except Exception as e:
        # Record the failure, with ANSI escape sequences stripped, so that
        # `dcf gcp status` can show it later.
        gcp.update({"setup_status": "failed", "setup_error": _ANSI_RE.sub("", str(e))})
        cfg["gcp"] = gcp
        _save_config(cfg)
        typer.echo(f"\nSetup failed: {e}", err=True)
        raise typer.Exit(1)
288
+
289
+
290
@gcp_app.command("teardown")
def gcp_teardown(
    yes: bool = typer.Option(False, "--yes", "-y", help="Skip confirmation prompt"),
):
    """Destroy GCP lake resources (warehouse bucket, service account, Secret Manager secret)."""
    from .gcp import bootstrap, terraform
    from .gcp.gcloud import get_credentials

    config = _load_config()
    lake = config.get("gcp", {})

    # Only a setup that reached "complete" or "failed" is eligible for teardown.
    if not (lake and lake.get("setup_status") in ("complete", "failed")):
        typer.echo("No completed GCP setup found in project.yml. Nothing to tear down.")
        return

    project_id = lake.get("project_id", "")
    if not yes:
        typer.confirm(
            f"This will destroy all GCP resources for project '{project_id}'. Continue?",
            abort=True,
        )

    try:
        credentials = get_credentials()
    except RuntimeError as e:
        typer.echo(str(e), err=True)
        raise typer.Exit(1)

    # Each step below is best-effort: a failure is reported and the next step
    # still runs.
    removed: list[str] = []

    state_bucket = lake.get("tf_state_bucket", "")
    if state_bucket:
        typer.echo("Running terraform destroy (warehouse bucket)...")
        try:
            terraform.destroy(
                project_id=project_id,
                region=lake.get("region", ""),
                sa_email=lake.get("sa_email", ""),
                tf_state_bucket=state_bucket,
            )
        except Exception as e:
            typer.echo(f" terraform destroy failed (continuing): {e}", err=True)
        else:
            removed.append("warehouse bucket")

    secret = lake.get("secret_name", "")
    if secret:
        typer.echo("Deleting Secret Manager secret...")
        try:
            bootstrap.delete_secret(secret, credentials)
        except Exception as e:
            typer.echo(f" secret delete failed (continuing): {e}", err=True)
        else:
            removed.append("SA key secret")

    account = lake.get("sa_email", "")
    if account:
        typer.echo("Deleting service account...")
        try:
            bootstrap.delete_service_account(project_id, account, credentials)
        except Exception as e:
            typer.echo(f" service account delete failed (continuing): {e}", err=True)
        else:
            removed.append("service account")

    # The gcp section is dropped and the catalog reset regardless of how many
    # steps succeeded.
    config.pop("gcp", None)
    config["catalog"] = "local"
    _save_config(config)

    if not removed:
        typer.echo("\nNo GCP resources were found to destroy. project.yml reset to catalog: local.")
    else:
        typer.echo(f"\nDestroyed: {', '.join(removed)}. project.yml reset to catalog: local.")
360
+
361
+
362
@gcp_app.command("status")
def gcp_status():
    """Show GCP lake setup status."""
    gcp = _load_config().get("gcp")
    if not gcp:
        typer.echo("No GCP configuration found. Run: dcf gcp setup --project-id X --region Y")
        return

    summary = [
        f"Status: {gcp.get('setup_status', 'unknown')}",
        f"Project ID: {gcp.get('project_id', '-')}",
        f"Region: {gcp.get('region', '-')}",
        f"Warehouse bucket: {gcp.get('warehouse_bucket', '-')}",
        f"Service account: {gcp.get('sa_email', '-')}",
    ]
    for line in summary:
        typer.echo(line)

    # The last recorded failure, if any, goes to stderr.
    if gcp.get("setup_error"):
        typer.echo(f"\nLast error:\n{gcp['setup_error']}", err=True)
379
+
380
+
381
+ # ------------------------------------------------------------------ #
382
+ # deploy / undeploy #
383
+ # ------------------------------------------------------------------ #
384
+
385
def _require_gcp_config() -> tuple[dict, dict]:
    """Return ``(full_config, gcp_section)`` once GCP is confirmed ready.

    Exits with status 1 and a message on stderr when the catalog is not
    ``gcp``, when setup is not marked complete, or when a required gcp key
    is missing or empty.
    """
    config = _load_config()
    if config.get("catalog") != "gcp":
        typer.echo(
            "Error: catalog is not 'gcp'. Deployment requires a GCP data lake.\n"
            " Set catalog: gcp in project.yml or run: dcf init",
            err=True,
        )
        raise typer.Exit(1)

    section = config.get("gcp", {})
    if section.get("setup_status") != "complete":
        typer.echo(
            "Error: GCP setup is not complete. Run: dcf gcp setup --project-id X --region Y",
            err=True,
        )
        raise typer.Exit(1)

    # Report only the first missing key, in declaration order.
    required = ("project_id", "region", "warehouse_bucket", "sa_email")
    key = next((name for name in required if not section.get(name)), None)
    if key is not None:
        typer.echo(f"Error: gcp.{key} is missing from project.yml. Re-run: dcf gcp setup", err=True)
        raise typer.Exit(1)

    return config, section
407
+
408
+
409
@app.command()
def deploy(
    collector_name: str | None = typer.Argument(None, help="Collector name (without .yml), or omit to deploy all"),
):
    """Deploy a collector locally (Docker + Airflow) or to GCP based on catalog in project.yml."""
    # A named collector deploys just that one; no name means deploy them all.
    if collector_name is not None:
        _deploy_one(collector_name)
    else:
        _deploy_all()
419
+
420
+
421
def _deploy_all() -> None:
    """Deploy every collector YAML that has a deployment: block.

    Collectors that fail to load are skipped with a warning on stderr.
    Each remaining collector is deployed in turn; a failure of one does not
    stop the others. Exits with status 1 if any deployment failed, and with
    status 0 if there was nothing to deploy.
    """
    from .config import load_collector

    candidates = []
    for path in sorted(_collectors_dir().glob("*.yml")):
        try:
            collector = load_collector(path, resolve_env=False)
        except Exception as e:
            # Still best-effort, but no longer silent: tell the user why a
            # collector was left out.
            typer.echo(f"Skipping '{path.stem}': could not load collector: {e}", err=True)
            continue
        if collector.deployment is not None:
            candidates.append(path.stem)

    if not candidates:
        typer.echo("No collectors with a 'deployment:' block found in collectors/.")
        raise typer.Exit(0)

    typer.echo(f"Deploying {len(candidates)} collector(s): {', '.join(candidates)}")
    failures = []
    for name in candidates:
        typer.echo(f"\n--- {name} ---")
        try:
            _deploy_one(name)
        except (typer.Exit, SystemExit):
            # typer.Exit is a click exception (a RuntimeError subclass), not
            # SystemExit, so it must be named here explicitly. _deploy_one
            # has already printed its own error by the time it exits.
            failures.append(name)
        except Exception as e:
            typer.echo(f"Deploy failed for '{name}': {e}", err=True)
            failures.append(name)

    if failures:
        typer.echo(f"\nFailed: {', '.join(failures)}", err=True)
        raise typer.Exit(1)
454
+
455
+
456
def _deploy_one(collector_name: str) -> None:
    """Deploy a single collector and record the resulting state in project.yml.

    The target is chosen from the project catalog: ``local`` deploys through
    ``local_deploy``; anything else requires a completed GCP setup and uses
    the GCP streaming or batch deployer, depending on ``deployment.type``.

    Exits with status 1 (after printing an error to stderr) when the
    collector file is missing, fails to load, has no ``deployment:`` block,
    requests streaming without a pubsub source, or when deployment fails.
    """
    from .config import load_collector
    from .config.models import PubSubSource

    path = _collectors_dir() / f"{collector_name}.yml"
    if not path.exists():
        typer.echo(f"Collector not found: {path}", err=True)
        raise typer.Exit(1)

    try:
        collector = load_collector(path, resolve_env=False)
    except Exception as e:
        typer.echo(f"Error loading collector: {e}", err=True)
        raise typer.Exit(1)

    if collector.deployment is None:
        typer.echo(
            f"Error: '{collector_name}' has no 'deployment:' block in its collector YAML.\n"
            "For a batch collector, add a deployment block with a schedule:\n\n"
            "  deployment:\n"
            "    schedule: \"0 8 * * *\"\n\n"
            "For a streaming collector (source.type: pubsub), add:\n\n"
            "  deployment:\n"
            "    type: streaming\n"
            "    window_seconds: 60\n",
            err=True,
        )
        raise typer.Exit(1)

    catalog = _get_catalog()
    deploy_type = collector.deployment.type
    is_streaming = deploy_type == "streaming"

    # One explicit check for both targets. The GCP path used to rely on
    # `assert`, which is stripped under -O and yields an empty error message.
    if is_streaming and not isinstance(collector.source, PubSubSource):
        typer.echo(
            "Error: deployment.type: streaming requires source.type: pubsub", err=True
        )
        raise typer.Exit(1)

    try:
        if catalog == "local":
            from . import local_deploy

            subscription = None
            if is_streaming:
                subscription = collector.source.subscription
                typer.echo(f"Deploying '{collector_name}' (local streaming, Kafka)...")
            else:
                typer.echo(f"Deploying '{collector_name}' (local batch, Terraform + Airflow)...")

            cfg = _load_config()
            state = local_deploy.deploy(
                collector_name=collector_name,
                deployment=collector.deployment,
                project_root=_project_root(),
                subscription=subscription,
            )
            cfg.setdefault("deployments", {})[collector_name] = state
            _save_config(cfg)

            typer.echo(f"\nDeployed '{collector_name}' successfully.")
            if is_streaming:
                typer.echo(" Type: streaming (local Docker + Kafka)")
                typer.echo(f" Kafka: {state['kafka_container']} ({state['kafka_external_bootstrap']})")
                typer.echo(f" Runner: {state['runner_container']}")
                typer.echo(f" Warehouse: {state['warehouse_path']}")
                typer.echo(f" Window: {state['window_seconds']}s")
                typer.echo(f" To publish: dcf publish {collector_name} '{{\"field\": \"value\"}}'")
            else:
                typer.echo(" Type: batch (local Terraform)")
                typer.echo(f" Image: {state['image_tag']}")
                typer.echo(f" Warehouse: {state['warehouse_path']}")
                typer.echo(f" Airflow UI: {state.get('airflow_url', 'http://localhost:8080')}")
            return

        # Any non-local catalog goes through GCP; this exits if GCP is not ready.
        cfg, gcp = _require_gcp_config()
        if is_streaming:
            from .gcp import streaming_deploy

            typer.echo(
                f"Deploying '{collector_name}' (streaming, "
                f"subscription: {collector.source.subscription})..."
            )
            state = streaming_deploy.deploy(
                collector_name=collector_name,
                subscription=collector.source.subscription,
                window_seconds=collector.deployment.window_seconds,
                project_root=_project_root(),
                gcp_config=gcp,
            )
        else:
            from .gcp import batch_deploy

            typer.echo(f"Deploying '{collector_name}' (schedule: {collector.deployment.schedule})...")
            state = batch_deploy.deploy(
                collector_name=collector_name,
                schedule=collector.deployment.schedule,
                paused=collector.deployment.paused,
                project_root=_project_root(),
                gcp_config=gcp,
            )
    except (typer.Exit, SystemExit):
        # Deliberate exits have already printed their own message.
        raise
    except Exception as e:
        typer.echo(f"\nDeploy failed: {e}", err=True)
        raise typer.Exit(1)

    cfg.setdefault("deployments", {})[collector_name] = state
    _save_config(cfg)

    typer.echo(f"\nDeployed '{collector_name}' successfully.")
    if is_streaming:
        typer.echo(" Type: streaming (GCP Dataflow)")
        typer.echo(f" Dataflow job: {state['dataflow_job_name']}")
        typer.echo(f" Subscription: {state['subscription']}")
        typer.echo(f" Window: {state['window_seconds']}s")
    else:
        typer.echo(f" DAG: {state['dag_id']}")
        typer.echo(f" Cloud Run: {state['cloud_run_job']}")
        typer.echo(f" Schedule: {state['schedule']}")
        if state.get("airflow_url"):
            typer.echo(f" Airflow UI: {state['airflow_url']}")
574
+
575
+
576
@app.command()
def undeploy(
    collector_name: str | None = typer.Argument(None, help="Collector name (without .yml). Omit to undeploy everything."),
    yes: bool = typer.Option(False, "--yes", "-y", help="Skip confirmation prompt"),
):
    """Stop and remove deployed collector(s) (warehouse data is untouched).

    Omit COLLECTOR_NAME to destroy all deployments including the Airflow stack.
    """
    cfg = _load_config()
    deployments = cfg.get("deployments", {})

    if collector_name is None:
        # Undeploy everything
        if not deployments:
            typer.echo("Nothing to undeploy.")
            return
        if not yes:
            typer.confirm(
                f"Destroy all {len(deployments)} collector(s) and the Airflow stack? "
                "(warehouse data will NOT be deleted)",
                abort=True,
            )
        try:
            from . import local_deploy
            local_deploy.undeploy_all(deployments, _project_root())
        except (typer.Exit, SystemExit):
            raise
        except Exception as e:
            typer.echo(f"\nUndeploy failed: {e}", err=True)
            raise typer.Exit(1)
        cfg.pop("deployments", None)
        _save_config(cfg)
        typer.echo("All collectors undeployed. Warehouse data is untouched.")
        return

    if collector_name not in deployments:
        typer.echo(
            f"Error: '{collector_name}' is not in project.yml deployments. "
            "Nothing to undeploy.",
            err=True,
        )
        raise typer.Exit(1)

    deployment = deployments[collector_name]
    deploy_type = deployment.get("type", "batch")
    # Local deployments are recognised by the shape of the stored state: a
    # Kafka container entry, or an image tag with no DAG id.
    is_local = "kafka_container" in deployment or (
        "image_tag" in deployment and "dag_id" not in deployment
    )

    if not yes:
        if is_local:
            if deploy_type == "streaming":
                typer.confirm(
                    f"Stop and remove local Docker containers for '{collector_name}'? "
                    "(warehouse data will NOT be deleted)",
                    abort=True,
                )
            else:
                typer.confirm(
                    f"Remove local Docker image for '{collector_name}'? "
                    "(warehouse data will NOT be deleted)",
                    abort=True,
                )
        elif deploy_type == "streaming":
            typer.confirm(
                f"Drain and remove Dataflow job '{deployment.get('dataflow_job_name', collector_name)}'? "
                "(warehouse data will NOT be deleted)",
                abort=True,
            )
        else:
            typer.confirm(
                f"Remove collector '{collector_name}' deployment and stop its scheduling? "
                "(warehouse data will NOT be deleted)",
                abort=True,
            )

    typer.echo(f"Undeploying '{collector_name}'...")
    try:
        if is_local:
            from . import local_deploy
            local_deploy.undeploy(collector_name, deployment, _project_root())
        elif deploy_type == "streaming":
            _, gcp = _require_gcp_config()
            from .gcp import streaming_deploy
            streaming_deploy.undeploy(
                collector_name=collector_name,
                deployment=deployment,
                gcp_config=gcp,
            )
        else:
            _, gcp = _require_gcp_config()
            from .gcp import batch_deploy
            batch_deploy.undeploy(
                collector_name=collector_name,
                deployment=deployment,
                gcp_config=gcp,
                project_root=_project_root(),
            )
    except (typer.Exit, SystemExit):
        # _require_gcp_config() has already printed its error. typer.Exit is
        # a RuntimeError subclass, so without this clause the generic handler
        # below would add a misleading "Undeploy failed: 1".
        raise
    except Exception as e:
        typer.echo(f"\nUndeploy failed: {e}", err=True)
        raise typer.Exit(1)

    # Drop the entry, and the whole section once it is empty.
    del cfg["deployments"][collector_name]
    if not cfg["deployments"]:
        del cfg["deployments"]
    _save_config(cfg)

    typer.echo(f"'{collector_name}' undeployed. Warehouse data is untouched.")
683
+
684
+
685
@app.command(name="deploy-status")
def deploy_status(
    collector_name: str | None = typer.Argument(None, help="Collector name, or omit for all"),
):
    """Show deployment state for one or all collectors."""
    cfg = _load_config()
    deployments = cfg.get("deployments", {})

    if not deployments:
        typer.echo("No collectors are currently deployed.")
        return

    # Check membership BEFORE indexing: looking the name up first raised
    # KeyError for an unknown collector and made the message unreachable.
    if collector_name:
        if collector_name not in deployments:
            typer.echo(f"'{collector_name}' is not deployed.", err=True)
            raise typer.Exit(1)
        targets = {collector_name: deployments[collector_name]}
    else:
        targets = deployments

    for name, state in targets.items():
        typer.echo(f"\n{name}")
        # The kind of deployment is inferred from which keys the stored state has.
        if "kafka_container" in state:
            typer.echo(" Type: streaming (local Docker + Kafka)")
            typer.echo(f" Kafka: {state.get('kafka_container', '-')} ({state.get('kafka_external_bootstrap', '-')})")
            typer.echo(f" Runner: {state.get('runner_container', '-')}")
            typer.echo(f" Topic: {state.get('kafka_topic', '-')}")
            typer.echo(f" Window: {state.get('window_seconds', '-')}s")
        elif state.get("type") == "batch" and "image_tag" in state and "dag_id" not in state:
            typer.echo(" Type: batch (local Docker)")
            typer.echo(f" Image: {state.get('image_tag', '-')}")
        elif state.get("type") == "streaming":
            typer.echo(" Type: streaming (GCP Dataflow)")
            typer.echo(f" Dataflow job: {state.get('dataflow_job_name', '-')}")
            typer.echo(f" Subscription: {state.get('subscription', '-')}")
            typer.echo(f" Window: {state.get('window_seconds', '-')}s")
        else:
            typer.echo(" Type: batch (GCP)")
            typer.echo(f" Schedule: {state.get('schedule', '-')}")
            typer.echo(f" DAG: {state.get('dag_id', '-')}")
            typer.echo(f" Cloud Run: {state.get('cloud_run_job', '-')}")
            if state.get("airflow_url"):
                typer.echo(f" Airflow UI: {state.get('airflow_url', '-')}")
        typer.echo(f" Deployed at: {state.get('deployed_at', '-')}")
727
+
728
+
729
@app.command()
def publish(
    collector_name: str = typer.Argument(..., help="Collector name"),
    message: str = typer.Argument(..., help="JSON message body to publish"),
    count: int = typer.Option(1, "--count", "-n", help="Number of times to publish the message"),
):
    """Publish a JSON message to the local Kafka topic for a deployed streaming collector."""
    import json

    state = _load_config().get("deployments", {}).get(collector_name)

    # Guard clauses: deployed at all, streaming, and deployed locally.
    if not state:
        typer.echo(
            f"Error: '{collector_name}' is not deployed. Run: dcf deploy {collector_name}",
            err=True,
        )
        raise typer.Exit(1)

    if state.get("type") != "streaming":
        typer.echo(
            f"Error: '{collector_name}' is a batch deployment. dcf publish only works for streaming.",
            err=True,
        )
        raise typer.Exit(1)

    if "kafka_topic" not in state:
        typer.echo(
            f"Error: '{collector_name}' is deployed on GCP, not locally. "
            "Use gcloud pubsub topics publish to inject messages.",
            err=True,
        )
        raise typer.Exit(1)

    # The payload is parsed only to validate it; the raw string is what gets sent.
    try:
        json.loads(message)
    except json.JSONDecodeError as e:
        typer.echo(f"Error: message is not valid JSON: {e}", err=True)
        raise typer.Exit(1)

    from . import local_deploy
    local_deploy.publish(collector_name, state, message, count)

    noun = "messages" if count != 1 else "message"
    typer.echo(f"Published {count} {noun} to topic '{state['kafka_topic']}'.")
    typer.echo(f"Data will appear in warehouse after the {state['window_seconds']}s window closes.")
774
+
775
+
776
+ # ------------------------------------------------------------------ #
777
+ # mcp #
778
+ # ------------------------------------------------------------------ #
779
+
780
@mcp_app.command("serve")
def mcp_serve():
    """Start the dcf MCP server (stdio transport for Claude Desktop)."""
    # The server module (or one of its dependencies) may be absent from an
    # installation; report that as a clean CLI error rather than a traceback.
    try:
        from .mcp_server import serve
    except ImportError as e:
        typer.echo(
            f"Error: the MCP server module is not available in this installation: {e}",
            err=True,
        )
        raise typer.Exit(1)
    serve()
785
+
786
+
787
@mcp_app.command("setup-desktop")
def mcp_setup_desktop():
    """Register dcf as an MCP server in Claude Desktop's config."""
    import json
    import shutil

    claude_config = Path.home().joinpath(
        "Library", "Application Support", "Claude", "claude_desktop_config.json"
    )
    if not claude_config.exists():
        typer.echo(f"Claude Desktop config not found at {claude_config}", err=True)
        typer.echo("Is Claude Desktop installed?", err=True)
        raise typer.Exit(1)

    project_dir = str(_project_root())
    # Use the resolved `uv` executable when found on PATH, else the bare name.
    uv_path = shutil.which("uv") or "uv"

    # A zero-byte config file is treated as an empty JSON object.
    if claude_config.stat().st_size:
        settings = json.loads(claude_config.read_text())
    else:
        settings = {}

    servers = settings.setdefault("mcpServers", {})
    servers["dcf"] = {
        "command": uv_path,
        "args": ["--directory", project_dir, "run", "dcf", "mcp", "serve"],
    }
    claude_config.write_text(json.dumps(settings, indent=2))

    typer.echo(f"Registered dcf MCP server in {claude_config}")
    typer.echo("Restart Claude Desktop to pick up the change.")
811
+
812
+
813
+ # ------------------------------------------------------------------ #
814
+ # Helpers #
815
+ # ------------------------------------------------------------------ #
816
+
817
+ def _parse_params(raw: list[str]) -> dict:
818
+ result = {}
819
+ for item in raw:
820
+ if "=" not in item:
821
+ typer.echo(f"Invalid --param format (expected key=value): '{item}'", err=True)
822
+ raise typer.Exit(1)
823
+ k, v = item.split("=", 1)
824
+ for cast in (int, float):
825
+ try:
826
+ v = cast(v)
827
+ break
828
+ except ValueError:
829
+ pass
830
+ result[k.strip()] = v
831
+ return result
832
+
833
+
834
def _override_date_range(collector, start: str | None, end: str | None) -> None:
    """Overwrite ``start``/``end`` on every date-range axis of *collector*.

    Only entries of ``collector.cadence.iterate`` that are
    ``DateRangeIterate`` instances are changed, in place. A falsy *start*
    or *end* leaves the corresponding attribute as it was.
    """
    from .config.models import DateRangeIterate

    date_axes = (
        axis for axis in collector.cadence.iterate if isinstance(axis, DateRangeIterate)
    )
    for axis in date_axes:
        if start:
            axis.start = start
        if end:
            axis.end = end