data-collection-framework 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. data_collection_framework-0.1.0.dist-info/METADATA +19 -0
  2. data_collection_framework-0.1.0.dist-info/RECORD +44 -0
  3. data_collection_framework-0.1.0.dist-info/WHEEL +5 -0
  4. data_collection_framework-0.1.0.dist-info/entry_points.txt +2 -0
  5. data_collection_framework-0.1.0.dist-info/top_level.txt +1 -0
  6. dcf/__init__.py +4 -0
  7. dcf/cli.py +841 -0
  8. dcf/config/__init__.py +4 -0
  9. dcf/config/loader.py +77 -0
  10. dcf/config/models.py +240 -0
  11. dcf/engine/__init__.py +6 -0
  12. dcf/engine/fetcher.py +118 -0
  13. dcf/engine/iterator.py +96 -0
  14. dcf/engine/projector.py +56 -0
  15. dcf/engine/runner.py +90 -0
  16. dcf/engine/transforms.py +41 -0
  17. dcf/gcp/__init__.py +0 -0
  18. dcf/gcp/_collector_utils.py +87 -0
  19. dcf/gcp/auth.py +1 -0
  20. dcf/gcp/batch_deploy.py +548 -0
  21. dcf/gcp/bootstrap.py +131 -0
  22. dcf/gcp/gcloud.py +42 -0
  23. dcf/gcp/terraform.py +151 -0
  24. dcf/infra/modules/batch_collector/gcp/airflow/main.tf +194 -0
  25. dcf/infra/modules/batch_collector/gcp/airflow/outputs.tf +9 -0
  26. dcf/infra/modules/batch_collector/gcp/airflow/variables.tf +52 -0
  27. dcf/infra/modules/batch_collector/gcp/main.tf +70 -0
  28. dcf/infra/modules/batch_collector/gcp/outputs.tf +4 -0
  29. dcf/infra/modules/batch_collector/gcp/variables.tf +40 -0
  30. dcf/infra/modules/batch_collector/local/airflow/main.tf +64 -0
  31. dcf/infra/modules/batch_collector/local/airflow/outputs.tf +9 -0
  32. dcf/infra/modules/batch_collector/local/airflow/variables.tf +59 -0
  33. dcf/infra/modules/batch_collector/local/main.tf +32 -0
  34. dcf/infra/modules/batch_collector/local/outputs.tf +4 -0
  35. dcf/infra/modules/batch_collector/local/variables.tf +25 -0
  36. dcf/infra/templates/airflow.Dockerfile.tftpl +6 -0
  37. dcf/infra/templates/batch_collector.Dockerfile.tftpl +14 -0
  38. dcf/infra/templates/docker-compose.yml.tftpl +76 -0
  39. dcf/local_deploy.py +756 -0
  40. dcf/project.py +23 -0
  41. dcf/spark_session.py +66 -0
  42. dcf/warehouse_reader.py +323 -0
  43. dcf/writer/__init__.py +3 -0
  44. dcf/writer/iceberg.py +315 -0
dcf/local_deploy.py ADDED
@@ -0,0 +1,756 @@
1
+ """Local Docker-based deployment for batch and streaming collectors.
2
+
3
+ No GCP account required. Batch collectors are built and scheduled via local
4
+ Terraform modules (batch_collector_local + airflow_local). Streaming collectors
5
+ run a Kafka broker + local stream runner container.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import hashlib
11
+ import json
12
+ import logging
13
+ import os
14
+ import re
15
+ import secrets
16
+ import shutil
17
+ import subprocess
18
+ import time
19
+ from datetime import datetime, timezone
20
+ from pathlib import Path
21
+
22
+ import yaml
23
+
24
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Directory containing this module (the ``dcf`` package), and its parent --
# the repository root when running from a development checkout.
_DCF_PKG_DIR = Path(__file__).parent
_DCF_REPO_ROOT = _DCF_PKG_DIR.parent

# Root of the bundled Terraform modules for batch collectors.
_BATCH_COLLECTOR_MODULE = _DCF_PKG_DIR.joinpath("infra", "modules", "batch_collector")
30
+
31
+
32
def _write_pyproject_toml(dest: Path) -> None:
    """Write dcf's pyproject.toml to ``dest/pyproject.toml``.

    Works whether dcf is running from a development checkout (the real
    pyproject.toml is copied) or from an installed package, where the repo
    root is not on disk and a minimal pyproject.toml is synthesised from the
    installed distribution's metadata.

    Args:
        dest: Existing directory that receives ``pyproject.toml``.

    Raises:
        importlib.metadata.PackageNotFoundError: If no installed distribution
            providing the ``dcf`` import package can be found.
    """
    repo_pyproject = _DCF_REPO_ROOT / "pyproject.toml"
    if repo_pyproject.exists():
        shutil.copy2(repo_pyproject, dest / "pyproject.toml")
        return

    import importlib.metadata

    # "dcf" is the *import* package name; the distribution that ships it may
    # be published under a different name, so also ask the metadata which
    # distribution(s) provide the top-level ``dcf`` package.
    candidates = ["dcf"]
    for provider in importlib.metadata.packages_distributions().get("dcf", []):
        if provider not in candidates:
            candidates.append(provider)

    dist = None
    for dist_name in candidates:
        try:
            dist = importlib.metadata.distribution(dist_name)
        except importlib.metadata.PackageNotFoundError:
            continue
        break
    if dist is None:
        raise importlib.metadata.PackageNotFoundError("dcf")

    version = dist.metadata["Version"]
    reqs = dist.requires or []
    # Requirements guarded by an ``extra == ...`` marker are optional extras.
    direct_deps = [r for r in reqs if "extra ==" not in r]
    deps_str = "\n".join(f' "{r}",' for r in direct_deps)
    (dest / "pyproject.toml").write_text(
        '[project]\n'
        'name = "dcf"\n'
        f'version = "{version}"\n'
        'requires-python = ">=3.12"\n'
        f'dependencies = [\n{deps_str}\n]\n\n'
        '[project.scripts]\n'
        'dcf = "dcf.cli:app"\n\n'
        '[tool.setuptools.packages.find]\n'
        'include = ["dcf*"]\n'
    )
62
+
63
# Per-user working locations, all rooted at ~/.dcf.
_BUILD_DIR = Path.home().joinpath(".dcf", "build")
_TF_PLUGIN_CACHE = Path.home().joinpath(".dcf", ".plugin-cache")
_AIRFLOW_DAGS_DIR = Path.home().joinpath(".dcf", "airflow", "dags")
_AIRFLOW_COMPOSE_FILE = Path.home().joinpath(".dcf", "airflow", "docker-compose.yml")
67
+
68
+
69
def _tf_state_dir(project_root: Path) -> Path:
    """Locate the directory holding this project's Terraform state.

    The location is ``<project_root>/.dcf/terraform`` unless project.yml
    sets a truthy ``terraform_state_dir``, in which case that path (with
    ``~`` expanded) is returned instead.
    """
    default_dir = project_root / ".dcf" / "terraform"
    project_file = project_root / "project.yml"
    if not project_file.exists():
        return default_dir

    settings = yaml.safe_load(project_file.read_text()) or {}
    override = settings.get("terraform_state_dir")
    return Path(override).expanduser() if override else default_dir
82
+
83
+
84
def _collect_env_vars(project_root: Path, collector_name: str) -> dict[str, str]:
    """Resolve every ``{{ env.VAR }}`` placeholder used by a collector.

    The collector YAML is scanned as raw text. Each referenced variable is
    looked up in the host environment first and, failing that, under its
    lower-cased name in project.yml. Returns ``{VAR: value}`` in order of
    first appearance, or ``{}`` when the collector file is missing or has
    no placeholders.

    Raises:
        EnvironmentError: If a referenced variable has no (truthy) value in
            either source.
    """
    collector_file = project_root / "collectors" / f"{collector_name}.yml"
    if not collector_file.exists():
        return {}

    referenced = re.findall(r"\{\{\s*env\.(\w+)\s*\}\}", collector_file.read_text())
    if not referenced:
        return {}

    project_file = project_root / "project.yml"
    fallbacks: dict = {}
    if project_file.exists():
        fallbacks = yaml.safe_load(project_file.read_text()) or {}

    resolved: dict[str, str] = {}
    # dict.fromkeys de-duplicates while keeping first-seen order.
    for name in dict.fromkeys(referenced):
        found = os.environ.get(name) or fallbacks.get(name.lower())
        if not found:
            raise EnvironmentError(
                f"Collector references '{{{{ env.{name} }}}}' but '{name}' is not set "
                f"in the host environment and '{name.lower()}' is not in project.yml"
            )
        resolved[name] = found
    return resolved
110
+
111
+
112
+ # ------------------------------------------------------------------ #
113
+ # Public API #
114
+ # ------------------------------------------------------------------ #
115
+
116
def deploy(
    collector_name: str,
    deployment,
    project_root: Path,
    subscription: str | None = None,
) -> dict:
    """Build and start the local Docker resources for one collector.

    Dispatches on ``deployment.type``: ``"streaming"`` goes to the
    Kafka/runner path (and requires ``subscription``); anything else is
    treated as a batch deployment. Returns the deployment-state dict
    produced by the chosen path.

    Raises:
        ValueError: For a streaming deployment without ``subscription``.
    """
    _check_docker()

    if deployment.type != "streaming":
        return _deploy_batch(collector_name, deployment, project_root)

    if subscription is None:
        raise ValueError("subscription is required for streaming local deploy")
    return _deploy_streaming(
        collector_name, subscription, deployment.window_seconds, project_root
    )
130
+
131
+
132
def undeploy(collector_name: str, deployment_state: dict, project_root: Path) -> None:
    """Tear down every local resource belonging to one collector.

    The ``"type"`` entry of ``deployment_state`` selects the teardown path;
    anything other than ``"streaming"`` is handled as batch.
    """
    is_streaming = deployment_state.get("type") == "streaming"
    if is_streaming:
        _undeploy_streaming(collector_name, deployment_state)
        return
    _undeploy_batch(collector_name, deployment_state, project_root)
138
+
139
+
140
def undeploy_all(deployments: dict, project_root: Path) -> None:
    """Undeploy each collector in turn, then destroy the shared Airflow stack.

    ``deployments`` maps collector name to its stored deployment state.
    """
    for collector, collector_state in deployments.items():
        print(f" Undeploying '{collector}'...", flush=True)
        undeploy(collector, collector_state, project_root)

    # The Airflow stack is shared by all batch collectors, so it goes last.
    _tf_destroy_airflow_local(project_root)
146
+
147
+
148
def publish(collector_name: str, deployment_state: dict, message_json: str, count: int = 1) -> None:
    """Publish a JSON message to the collector's local Kafka topic.

    Args:
        collector_name: Collector whose topic receives the message; used to
            derive the default topic name ``dcf-<collector_name>``.
        deployment_state: Stored deployment state; ``kafka_external_bootstrap``
            and ``kafka_topic`` override the defaults when present.
        message_json: Message payload, sent UTF-8 encoded as-is.
        count: Number of times the same payload is sent.
    """
    from kafka import KafkaProducer

    bootstrap = deployment_state.get("kafka_external_bootstrap", "localhost:29092")
    topic = deployment_state.get("kafka_topic", f"dcf-{collector_name}")

    producer = KafkaProducer(
        bootstrap_servers=bootstrap,
        value_serializer=lambda v: v.encode("utf-8"),
    )
    try:
        for _ in range(count):
            producer.send(topic, value=message_json)
        producer.flush()
    finally:
        # Always release the producer's connections, even if send/flush raised.
        producer.close()
163
+
164
+
165
+ # ------------------------------------------------------------------ #
166
+ # Batch — Terraform path #
167
+ # ------------------------------------------------------------------ #
168
+
169
def _deploy_batch(collector_name: str, deployment, project_root: Path) -> dict:
    """Build a batch collector image, write its DAG, and bring up Airflow.

    Steps: sync the Docker build context, apply the collector Terraform
    module (image build), write the collector's DAG file into the shared
    DAG directory, then apply the Airflow Terraform module.

    Args:
        collector_name: Name of the collector; also used as DAG id and file name.
        deployment: Deployment config; ``schedule`` is read, and ``paused`` if present.
        project_root: Project directory containing collectors/connectors.

    Returns:
        The deployment-state dict for this collector (type, image tag,
        warehouse path, Airflow URL, schedule, UTC deployment timestamp).
    """
    image_tag = f"dcf-local/{collector_name}:latest"
    # Docker bind mounts need an absolute host path, so resolve it in case
    # project_root was given relative to the current directory.
    warehouse_path = (project_root / "warehouse").resolve()
    warehouse_path.mkdir(exist_ok=True)

    print(f" Syncing build context for '{collector_name}'...", flush=True)
    build_context = _sync_build_context(project_root, collector_name)

    content_hash = _content_hash(build_context)

    print(" Applying Terraform (collector image)...", flush=True)
    _tf_apply_local_collector(collector_name, build_context, image_tag, content_hash, project_root)

    print(" Writing DAG file...", flush=True)
    _AIRFLOW_DAGS_DIR.mkdir(parents=True, exist_ok=True)
    dag_content = _local_dag_content(
        collector_name=collector_name,
        schedule=deployment.schedule,
        paused=getattr(deployment, "paused", False),
        image_tag=image_tag,
        warehouse_path=str(warehouse_path),
        env_vars=_collect_env_vars(project_root, collector_name),
    )
    _write_local_dag(collector_name, dag_content)

    print(" Applying Terraform (Airflow stack)...", flush=True)
    credentials = _generate_airflow_credentials(project_root)
    airflow_outputs = _tf_apply_airflow_local(
        dag_dir=str(_AIRFLOW_DAGS_DIR),
        warehouse_path=str(warehouse_path),
        credentials=credentials,
        project_root=project_root,
    )

    # Fallback matches the webserver_port (8090) that the Airflow stack is
    # configured with, for the case where Terraform reports no webserver_url.
    airflow_url = airflow_outputs.get("webserver_url", {}).get("value", "http://localhost:8090")
    print(f" Airflow UI: {airflow_url}", flush=True)

    return {
        "type": "batch",
        "image_tag": image_tag,
        "warehouse_path": str(warehouse_path),
        "airflow_url": airflow_url,
        "schedule": deployment.schedule,
        "deployed_at": datetime.now(tz=timezone.utc).isoformat(timespec="seconds"),
    }
214
+
215
+
216
def _undeploy_batch(collector_name: str, state: dict, project_root: Path) -> None:
    """Destroy a batch collector's Terraform resources and delete its DAG file.

    ``state`` is accepted for symmetry with the streaming teardown; it is
    not read here.
    """
    print(" Destroying collector Terraform resources...", flush=True)
    _tf_destroy_local_collector(collector_name, project_root)

    dag_path = _AIRFLOW_DAGS_DIR / f"{collector_name}.py"
    if not dag_path.exists():
        return
    dag_path.unlink()
    print(f" Removed DAG file: {dag_path}", flush=True)
224
+
225
+
226
+ # ------------------------------------------------------------------ #
227
+ # Build context helpers #
228
+ # ------------------------------------------------------------------ #
229
+
230
def _sync_build_context(project_root: Path, collector_name: str) -> Path:
    """Create a stable build context dir at ~/.dcf/build/local/<name>/.

    The directory is recreated from scratch on every call and populated
    with the dcf package, a pyproject.toml, the project's ``collectors`` and
    ``connectors`` directories (empty ones are created when missing), and a
    minimal project.yml.

    Returns:
        Path to the freshly populated build context.
    """
    build_context = _BUILD_DIR / "local" / collector_name
    shutil.rmtree(build_context, ignore_errors=True)
    build_context.mkdir(parents=True)

    # Bytecode caches appear and change as a side effect of merely importing
    # modules; leaving them out keeps the context (and any hash computed
    # over it) stable between runs and the image free of stale .pyc files.
    skip_bytecode = shutil.ignore_patterns("__pycache__", "*.pyc")

    shutil.copytree(_DCF_PKG_DIR, build_context / "dcf", ignore=skip_bytecode)
    _write_pyproject_toml(build_context)

    for subdir in ("collectors", "connectors"):
        src = project_root / subdir
        dst = build_context / subdir
        if src.exists():
            shutil.copytree(src, dst, ignore=skip_bytecode)
        else:
            # The image build expects both directories to exist.
            dst.mkdir()

    (build_context / "project.yml").write_text("catalog: local\n")

    return build_context
250
+
251
+
252
def _content_hash(build_context: Path) -> str:
    """Return a SHA-256 hex digest identifying the build context's contents.

    Every regular file under ``build_context`` contributes its relative
    path, its size, and its bytes, so that renaming or moving a file, or
    shifting bytes between adjacent files, changes the digest. Files named
    ``Dockerfile`` (written by Terraform) and Python bytecode caches are
    excluded.
    """
    digest = hashlib.sha256()
    for path in sorted(build_context.rglob("*")):
        if not path.is_file() or path.name == "Dockerfile":
            continue
        relative = path.relative_to(build_context)
        # Bytecode is a by-product of importing modules, not real content.
        if "__pycache__" in relative.parts or path.suffix == ".pyc":
            continue
        payload = path.read_bytes()
        # NUL-separated "path, length, bytes" framing keeps file boundaries
        # unambiguous in the hashed stream.
        digest.update(relative.as_posix().encode("utf-8"))
        digest.update(b"\0")
        digest.update(str(len(payload)).encode("ascii"))
        digest.update(b"\0")
        digest.update(payload)
    return digest.hexdigest()
259
+
260
+
261
+ # ------------------------------------------------------------------ #
262
+ # Terraform helpers — collector #
263
+ # ------------------------------------------------------------------ #
264
+
265
def _tf_env() -> dict:
    """Return the environment for Terraform subprocesses.

    A copy of the current process environment with interactive input
    disabled (``TF_INPUT=0``) and the shared plugin cache directory set.
    """
    environment = dict(os.environ)
    environment["TF_INPUT"] = "0"
    environment["TF_PLUGIN_CACHE_DIR"] = str(_TF_PLUGIN_CACHE)
    return environment
271
+
272
+
273
def _tf_run(cmd: list[str], work_dir: Path, env: dict) -> None:
    """Run one Terraform command in ``work_dir`` with captured output.

    Raises:
        RuntimeError: When the command exits non-zero; the message carries
            the exit code and the last 2000 characters of stderr.
    """
    completed = subprocess.run(
        cmd,
        cwd=str(work_dir),
        env=env,
        capture_output=True,
        text=True,
    )
    subcommand = cmd[1]
    if completed.returncode == 0:
        logger.info("terraform %s OK", subcommand)
        return
    stderr_tail = completed.stderr[-2000:]
    raise RuntimeError(
        f"terraform {subcommand} failed (exit {completed.returncode}):\n{stderr_tail}"
    )
280
+
281
+
282
def _copy_module_to_work_dir(module_dir: Path, work_dir: Path) -> None:
    """Stage a leaf Terraform module into ``work_dir``.

    Copies the module's top-level ``*.tf`` files (Terraform's own
    ``.terraform`` / lock-file entries are never copied) and replaces
    ``work_dir/templates`` with a fresh copy of the shared templates.
    """
    terraform_internal = (".terraform", ".terraform.lock.hcl")
    tf_sources = [
        entry
        for entry in module_dir.iterdir()
        if entry.name not in terraform_internal
        and entry.is_file()
        and entry.suffix == ".tf"
    ]
    for tf_source in tf_sources:
        shutil.copy2(tf_source, work_dir / tf_source.name)

    shared_templates = _DCF_PKG_DIR / "infra" / "templates"
    staged_templates = work_dir / "templates"
    # copytree needs a non-existent target, so clear any earlier copy first.
    if staged_templates.exists():
        shutil.rmtree(staged_templates)
    shutil.copytree(shared_templates, staged_templates)
294
+
295
+
296
def _tf_apply_local_collector(
    collector_name: str,
    build_context: Path,
    image_tag: str,
    content_hash: str,
    project_root: Path,
) -> None:
    """Apply the local batch-collector Terraform module for one collector.

    Stages the module under the project's Terraform state directory,
    writes ``terraform.tfvars.json``, then runs ``terraform init
    -reconfigure`` followed by ``terraform apply -auto-approve``.
    """
    state_root = _tf_state_dir(project_root)
    work_dir = state_root / "collectors" / collector_name / "local"
    for required_dir in (work_dir, _TF_PLUGIN_CACHE):
        required_dir.mkdir(parents=True, exist_ok=True)

    _copy_module_to_work_dir(_BATCH_COLLECTOR_MODULE / "local", work_dir)

    variables = dict(
        collector_name=collector_name,
        build_context=str(build_context),
        image_tag=image_tag,
        content_hash=content_hash,
        java_enabled=True,
    )
    tfvars_file = work_dir / "terraform.tfvars.json"
    tfvars_file.write_text(json.dumps(variables, indent=2))

    tf_environment = _tf_env()
    for terraform_args in (("init", "-reconfigure"), ("apply", "-auto-approve")):
        _tf_run(["terraform", *terraform_args], work_dir, tf_environment)
321
+
322
+
323
def _tf_destroy_local_collector(collector_name: str, project_root: Path) -> None:
    """Run ``terraform destroy`` for a collector and delete its work dir.

    If the work directory does not exist, a warning is logged and nothing
    else happens.
    """
    work_dir = _tf_state_dir(project_root) / "collectors" / collector_name / "local"
    if work_dir.exists():
        _tf_run(["terraform", "destroy", "-auto-approve"], work_dir, _tf_env())
        shutil.rmtree(work_dir)
    else:
        logger.warning("No Terraform state found at %s — skipping destroy", work_dir)
332
+
333
+
334
def _tf_destroy_airflow_local(project_root: Path) -> None:
    """Tear down the shared local Airflow stack.

    When the Airflow work directory exists: bring the compose stack down
    (including volumes, failure tolerated) if the compose file is present,
    then ``terraform destroy`` and remove the work directory. Otherwise a
    warning is logged and nothing is done.
    """
    work_dir = _tf_state_dir(project_root) / "airflow" / "local"
    if not work_dir.exists():
        logger.warning("No Airflow Terraform state found at %s — skipping destroy", work_dir)
        return

    print(" Destroying Airflow stack...", flush=True)
    if _AIRFLOW_COMPOSE_FILE.exists():
        compose_down = [
            "docker", "compose",
            "-f", str(_AIRFLOW_COMPOSE_FILE),
            "down", "--volumes",
        ]
        # Best effort: a failing "compose down" must not block the destroy.
        subprocess.run(compose_down, check=False)

    _tf_run(["terraform", "destroy", "-auto-approve"], work_dir, _tf_env())
    shutil.rmtree(work_dir)
350
+
351
+
352
+ # ------------------------------------------------------------------ #
353
+ # DAG content #
354
+ # ------------------------------------------------------------------ #
355
+
356
+ def _local_dag_content(
357
+ collector_name: str,
358
+ schedule: str,
359
+ paused: bool,
360
+ image_tag: str,
361
+ warehouse_path: str,
362
+ env_vars: dict[str, str] | None = None,
363
+ ) -> str:
364
+ paused_str = "True" if paused else "False"
365
+ environment = {"COLLECTOR_NAME": collector_name, **(env_vars or {})}
366
+ return f"""\
367
+ # Generated by dcf — do not edit manually
368
+ from datetime import datetime
369
+ from docker.types import Mount
370
+ from airflow import DAG
371
+ from airflow.providers.docker.operators.docker import DockerOperator
372
+
373
+ with DAG(
374
+ dag_id="{collector_name}",
375
+ schedule_interval="{schedule}",
376
+ start_date=datetime(2024, 1, 1),
377
+ catchup=False,
378
+ is_paused_upon_creation={paused_str},
379
+ tags=["dcf"],
380
+ ) as dag:
381
+ run_collector = DockerOperator(
382
+ task_id="run_{collector_name}",
383
+ image="{image_tag}",
384
+ environment={environment!r},
385
+ mounts=[Mount(target="/app/warehouse", source="{warehouse_path}", type="bind")],
386
+ docker_url="unix:///var/run/docker.sock",
387
+ auto_remove="success",
388
+ )
389
+ """
390
+
391
+
392
def _write_local_dag(collector_name: str, dag_content: str) -> None:
    """Write ``dag_content`` to ``<dags dir>/<collector_name>.py``.

    The shared DAG directory is created first if it does not exist.
    """
    _AIRFLOW_DAGS_DIR.mkdir(parents=True, exist_ok=True)
    dag_path = _AIRFLOW_DAGS_DIR.joinpath(f"{collector_name}.py")
    dag_path.write_text(dag_content)
395
+
396
+
397
+ # ------------------------------------------------------------------ #
398
+ # Terraform helpers — Airflow #
399
+ # ------------------------------------------------------------------ #
400
+
401
def _airflow_build_context() -> Path:
    """Ensure the local Airflow image's build context dir exists and return it."""
    context_dir = _BUILD_DIR.joinpath("airflow-local")
    context_dir.mkdir(parents=True, exist_ok=True)
    return context_dir
406
+
407
+
408
def _airflow_content_hash() -> str:
    """Return the SHA-256 hex digest of the Airflow Dockerfile template.

    A change in this value signals that the Airflow image must be rebuilt.
    """
    template_path = _DCF_PKG_DIR.joinpath("infra", "templates", "airflow.Dockerfile.tftpl")
    digest = hashlib.sha256()
    digest.update(template_path.read_bytes())
    return digest.hexdigest()
412
+
413
+
414
def _generate_airflow_credentials(project_root: Path) -> dict:
    """Load the Airflow credentials from project.yml, creating missing ones.

    A missing admin password is prompted for interactively; a missing
    Fernet key is generated. Each newly obtained value is written back to
    project.yml (as plain text) right away. The database password is the
    fixed value ``"airflow"``.

    Returns:
        Dict with ``db_password``, ``admin_password`` and ``fernet_key``.

    Raises:
        RuntimeError: If the prompted admin password is empty.
    """
    project_file = project_root / "project.yml"
    if project_file.exists():
        settings: dict = yaml.safe_load(project_file.read_text()) or {}
    else:
        settings = {}

    def _persist() -> None:
        # Re-serialise the whole settings mapping, keeping key order.
        project_file.write_text(yaml.dump(settings, default_flow_style=False, sort_keys=False))

    admin_password = settings.get("airflow_admin_password")
    if not admin_password:
        import getpass

        admin_password = getpass.getpass("Enter Airflow admin password: ").strip()
        if not admin_password:
            raise RuntimeError("Airflow admin password cannot be empty.")
        settings["airflow_admin_password"] = admin_password
        _persist()
        logger.info("Saved airflow_admin_password to project.yml")

    fernet_key = settings.get("airflow_fernet_key")
    if not fernet_key:
        from cryptography.fernet import Fernet

        fernet_key = Fernet.generate_key().decode()
        settings["airflow_fernet_key"] = fernet_key
        _persist()

    return dict(
        db_password="airflow",
        admin_password=admin_password,
        fernet_key=fernet_key,
    )
446
+
447
+
448
def _tf_apply_airflow_local(dag_dir: str, warehouse_path: str, credentials: dict, project_root: Path) -> dict:
    """Apply the local Airflow Terraform module and return its outputs.

    Stages the module under the project's Terraform state directory, writes
    ``terraform.tfvars.json`` (image, paths, credentials, webserver port
    8090), then runs ``terraform init -reconfigure`` and ``terraform apply
    -auto-approve``.

    Args:
        dag_dir: Host directory holding the generated DAG files.
        warehouse_path: Host warehouse directory.
        credentials: Dict with ``db_password``, ``admin_password``, ``fernet_key``.
        project_root: Project directory used to locate the Terraform state.

    Returns:
        The parsed ``terraform output -json`` mapping, or ``{}`` when the
        outputs could not be read (a warning is logged in that case).
    """
    work_dir = _tf_state_dir(project_root) / "airflow" / "local"
    work_dir.mkdir(parents=True, exist_ok=True)
    _TF_PLUGIN_CACHE.mkdir(parents=True, exist_ok=True)

    _copy_module_to_work_dir(_BATCH_COLLECTOR_MODULE / "local" / "airflow", work_dir)

    build_context = _airflow_build_context()
    content_hash = _airflow_content_hash()

    tfvars = {
        "image_tag": "dcf-airflow-local:latest",
        "build_context": str(build_context),
        "content_hash": content_hash,
        "dag_dir": dag_dir,
        "warehouse_path": warehouse_path,
        "docker_socket": "/var/run/docker.sock",
        "db_password": credentials["db_password"],
        "admin_password": credentials["admin_password"],
        "fernet_key": credentials["fernet_key"],
        "compose_file_path": str(_AIRFLOW_COMPOSE_FILE),
        "webserver_port": 8090,
    }
    (work_dir / "terraform.tfvars.json").write_text(json.dumps(tfvars, indent=2))

    _AIRFLOW_COMPOSE_FILE.parent.mkdir(parents=True, exist_ok=True)

    env = _tf_env()
    _tf_run(["terraform", "init", "-reconfigure"], work_dir, env)
    _tf_run(["terraform", "apply", "-auto-approve"], work_dir, env)

    result = subprocess.run(
        ["terraform", "output", "-json"],
        cwd=str(work_dir), env=env, capture_output=True, text=True,
    )
    if result.returncode != 0:
        # The stack itself was applied successfully; only reading outputs
        # failed. Surface it instead of silently returning nothing.
        logger.warning(
            "terraform output failed (exit %s): %s",
            result.returncode,
            result.stderr[-2000:],
        )
        return {}
    raw = result.stdout
    return json.loads(raw) if raw.strip() else {}
484
+
485
+
486
+ # ------------------------------------------------------------------ #
487
+ # Streaming #
488
+ # ------------------------------------------------------------------ #
489
+
490
def _kafka_container(name: str) -> str:
    """Return the Docker container name of a collector's Kafka broker."""
    return "dcf-kafka-{}".format(name)
492
+
493
+
494
def _runner_container(name: str) -> str:
    """Return the Docker container name of a collector's stream runner."""
    return "dcf-runner-{}".format(name)
496
+
497
+
498
def _network_name(name: str) -> str:
    """Return the Docker network name used by a collector's containers."""
    return "dcf-{}".format(name)
500
+
501
+
502
def _deploy_streaming(
    collector_name: str,
    subscription: str,
    window_seconds: int,
    project_root: Path,
) -> dict:
    """Start a Kafka broker and a stream runner container for a collector.

    Any previous runner, broker and network for this collector are removed
    first. Then: create the network, start Kafka, wait for it, create the
    topic, build the runner image, start the runner, and after a short
    pause verify the runner is still running.

    ``subscription`` is accepted but not read in this function.

    Returns:
        The streaming deployment-state dict.

    Raises:
        RuntimeError: If the runner container is not in the ``running``
            state after start-up; the message includes its recent logs.
    """
    host_bootstrap = "localhost:29092"

    net = _network_name(collector_name)
    broker = _kafka_container(collector_name)
    runner = _runner_container(collector_name)
    image_tag = f"dcf-local/{collector_name}-stream:latest"
    topic = f"dcf-{collector_name}"
    warehouse_path = project_root / "warehouse"
    warehouse_path.mkdir(exist_ok=True)

    # Start from a clean slate: containers first, then the network they use.
    for stale_container in (runner, broker):
        _stop_remove(stale_container)
    _remove_network(net)

    print(f" Creating Docker network '{net}'...", flush=True)
    subprocess.run(["docker", "network", "create", net], check=True, capture_output=True)

    print(" Starting Kafka broker (apache/kafka, KRaft)...", flush=True)
    _start_kafka(broker, net, collector_name)

    print(" Waiting for Kafka to be ready...", flush=True)
    _wait_for_kafka(host_bootstrap, timeout=30)

    print(f" Creating topic '{topic}'...", flush=True)
    _create_kafka_topic(host_bootstrap, topic)

    print(f" Building local runner image '{image_tag}'...", flush=True)
    print(" (First build downloads python:3.12-slim + kafka-python, ~1 minute)", flush=True)
    _build_stream_image(project_root, image_tag)

    print(" Starting stream runner...", flush=True)
    _start_runner(
        runner, image_tag, net, collector_name,
        broker, topic, window_seconds, warehouse_path,
        project_root=project_root,
    )

    # Give the runner a moment to fail fast before checking on it.
    time.sleep(4)
    inspect = subprocess.run(
        ["docker", "inspect", "--format", "{{.State.Status}}", runner],
        capture_output=True, text=True,
    )
    status = inspect.stdout.strip()
    if status != "running":
        logs = subprocess.run(
            ["docker", "logs", "--tail", "40", runner],
            capture_output=True, text=True,
        )
        raise RuntimeError(
            f"Stream runner container stopped unexpectedly (status: {status}).\n"
            f"Logs:\n{logs.stdout}{logs.stderr}"
        )

    deployed_at = datetime.now(tz=timezone.utc).isoformat(timespec="seconds")
    return {
        "type": "streaming",
        "window_seconds": window_seconds,
        "docker_network": net,
        "kafka_container": broker,
        "runner_container": runner,
        "kafka_topic": topic,
        "kafka_external_bootstrap": host_bootstrap,
        "image_tag": image_tag,
        "warehouse_path": str(warehouse_path),
        "deployed_at": deployed_at,
    }
570
+
571
+
572
def _start_kafka(container_name: str, network: str, collector_name: str) -> None:
    """Start a single-node KRaft Kafka broker container in the background.

    The broker joins ``network`` and publishes port 29092 to the host. It
    advertises ``<container_name>:9092`` for clients on the Docker network
    and ``localhost:29092`` for clients on the host. Topic auto-creation is
    disabled. ``collector_name`` is accepted but not read here.

    Raises:
        subprocess.CalledProcessError: If ``docker run`` exits non-zero.
    """
    listeners = ",".join(
        [
            "INTERNAL://0.0.0.0:9092",
            "EXTERNAL://0.0.0.0:29092",
            "CONTROLLER://0.0.0.0:9093",
        ]
    )
    advertised = ",".join(
        [
            f"INTERNAL://{container_name}:9092",
            "EXTERNAL://localhost:29092",
        ]
    )
    protocol_map = ",".join(
        [
            "INTERNAL:PLAINTEXT",
            "EXTERNAL:PLAINTEXT",
            "CONTROLLER:PLAINTEXT",
        ]
    )
    broker_settings = [
        "KAFKA_NODE_ID=1",
        "KAFKA_PROCESS_ROLES=broker,controller",
        "KAFKA_CONTROLLER_LISTENER_NAMES=CONTROLLER",
        "KAFKA_LISTENERS=" + listeners,
        "KAFKA_ADVERTISED_LISTENERS=" + advertised,
        "KAFKA_LISTENER_SECURITY_PROTOCOL_MAP=" + protocol_map,
        "KAFKA_INTER_BROKER_LISTENER_NAME=INTERNAL",
        "KAFKA_CONTROLLER_QUORUM_VOTERS=1@localhost:9093",
        "KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR=1",
        "KAFKA_TRANSACTION_STATE_LOG_REPLICATION_FACTOR=1",
        "KAFKA_TRANSACTION_STATE_LOG_MIN_ISR=1",
        "KAFKA_AUTO_CREATE_TOPICS_ENABLE=false",
    ]

    command = [
        "docker", "run", "-d",
        "--name", container_name,
        "--network", network,
        "-p", "29092:29092",
    ]
    for setting in broker_settings:
        command.extend(["-e", setting])
    command.append("apache/kafka:latest")

    subprocess.run(command, check=True, capture_output=True, text=True)
607
+
608
+
609
def _wait_for_kafka(bootstrap: str, timeout: int = 30) -> None:
    """Block until a Kafka admin connection to ``bootstrap`` succeeds.

    Connection attempts are retried every 2 seconds until ``timeout``
    seconds have elapsed.

    Args:
        bootstrap: ``host:port`` of the broker to probe.
        timeout: Maximum time to wait, in seconds.

    Raises:
        RuntimeError: If no attempt succeeded in time. The last connection
            error is attached as the cause and included in the message.
    """
    from kafka import KafkaAdminClient

    # Monotonic clock: the deadline must not move if the wall clock is adjusted.
    deadline = time.monotonic() + timeout
    last_error: Exception | None = None
    while time.monotonic() < deadline:
        try:
            admin = KafkaAdminClient(
                bootstrap_servers=bootstrap,
                request_timeout_ms=3000,
                connections_max_idle_ms=5000,
            )
            admin.close()
            return
        except Exception as exc:
            # Deliberately broad: any failure while the broker is still
            # starting up is retried. Keep the latest one for diagnosis
            # instead of discarding it.
            last_error = exc
            time.sleep(2)
    raise RuntimeError(
        f"Kafka did not become available at {bootstrap} within {timeout}s.\n"
        "Check: docker logs dcf-kafka-<collector>\n"
        f"Last error: {last_error!r}"
    ) from last_error
629
+
630
+
631
def _create_kafka_topic(bootstrap: str, topic_name: str) -> None:
    """Create a 1-partition, replication-factor-1 topic if it is missing.

    An already existing topic is not an error. The admin client is closed
    in all cases.
    """
    from kafka.admin import KafkaAdminClient, NewTopic
    from kafka.errors import TopicAlreadyExistsError

    client = KafkaAdminClient(bootstrap_servers=bootstrap, request_timeout_ms=5000)
    try:
        topic_spec = NewTopic(topic_name, num_partitions=1, replication_factor=1)
        client.create_topics([topic_spec])
    except TopicAlreadyExistsError:
        # Nothing to do: the topic is already there.
        pass
    finally:
        client.close()
642
+
643
+
644
def _build_stream_image(project_root: Path, image_tag: str) -> None:
    """Build the stream-runner Docker image for a project.

    A temporary build context is assembled from the dcf package, a
    pyproject.toml, the project's ``collectors`` / ``connectors``
    directories (created empty when missing), a minimal project.yml and a
    generated Dockerfile; ``docker build`` then runs in it with its output
    streamed to the terminal.

    Raises:
        RuntimeError: If ``docker build`` exits non-zero.
    """
    import tempfile

    # NOTE(review): the image's entry point is the module
    # dcf.local_stream_runner, which is not defined in this file -- confirm
    # that it ships with the package.
    dockerfile_lines = [
        "FROM python:3.12-slim",
        "WORKDIR /app",
        "COPY pyproject.toml .",
        "COPY dcf/ ./dcf/",
        "RUN pip install --no-cache-dir -e . 'kafka-python>=2.0'",
        "COPY collectors/ ./collectors/",
        "COPY connectors/ ./connectors/",
        "COPY project.yml .",
        'ENTRYPOINT ["python", "-m", "dcf.local_stream_runner"]',
    ]

    with tempfile.TemporaryDirectory(prefix="dcf-local-stream-") as tmp:
        context = Path(tmp)
        shutil.copytree(_DCF_PKG_DIR, context / "dcf")
        _write_pyproject_toml(context)

        for subdir in ("collectors", "connectors"):
            source_dir = project_root / subdir
            target_dir = context / subdir
            if not source_dir.exists():
                target_dir.mkdir()
                continue
            shutil.copytree(source_dir, target_dir)

        (context / "project.yml").write_text("catalog: local\n")
        (context / "Dockerfile").write_text("\n".join(dockerfile_lines) + "\n")

        build = subprocess.run(["docker", "build", "-t", image_tag, "."], cwd=tmp)
        if build.returncode != 0:
            raise RuntimeError(f"docker build failed for '{image_tag}'")
677
+
678
+
679
def _start_runner(
    container_name: str,
    image_tag: str,
    network: str,
    collector_name: str,
    kafka_cname: str,
    kafka_topic: str,
    window_seconds: int,
    warehouse_path: Path,
    project_root: Path | None = None,
) -> None:
    """Start the stream runner container in the background.

    The container joins ``network``, gets the host warehouse directory
    mounted at ``/warehouse`` and is pointed at the broker's in-network
    listener ``<kafka_cname>:9092``.

    Args:
        container_name: Name for the runner container.
        image_tag: Runner image to start.
        network: Docker network shared with the Kafka broker.
        collector_name: Collector to run; also determines the output path.
        kafka_cname: Kafka broker container name (its in-network hostname).
        kafka_topic: Topic the runner is told to consume.
        window_seconds: Window length passed to the runner.
        warehouse_path: Host directory mounted as the warehouse.
        project_root: When given, ``{{ env.VAR }}`` references in the
            collector are resolved and passed into the container.

    Raises:
        subprocess.CalledProcessError: If ``docker run`` exits non-zero.
    """
    env_dict = _collect_env_vars(project_root, collector_name) if project_root else {}

    # Pass variables by name only ("-e VAR") and supply their values through
    # the docker CLI's own environment, so that secret values show up
    # neither in the host process list nor in a CalledProcessError message.
    env_args = [arg for var in env_dict for arg in ("-e", var)]
    # Values may come from project.yml as non-strings; the process
    # environment requires strings.
    docker_env = {**os.environ, **{var: str(val) for var, val in env_dict.items()}}

    # "docker run -v" treats a non-absolute source as a named volume, so make
    # sure the host path is absolute.
    host_warehouse = Path(warehouse_path).resolve()

    subprocess.run(
        [
            "docker", "run", "-d",
            "--name", container_name,
            "--network", network,
            *env_args,
            "-v", f"{host_warehouse}:/warehouse",
            image_tag,
            "--collector_name", collector_name,
            "--bootstrap_servers", f"{kafka_cname}:9092",
            "--topic", kafka_topic,
            "--output_path", f"/warehouse/{collector_name}/{collector_name}/data/",
            "--window_seconds", str(window_seconds),
        ],
        check=True, capture_output=True, text=True, env=docker_env,
    )
708
+
709
+
710
def _undeploy_streaming(collector_name: str, state: dict) -> None:
    """Remove a streaming collector's runner, broker, network and image.

    Resource names come from ``state`` and fall back to the names derived
    from ``collector_name``. The warehouse directory is left in place.
    """
    runner = state.get("runner_container", _runner_container(collector_name))
    kafka = state.get("kafka_container", _kafka_container(collector_name))
    network = state.get("docker_network", _network_name(collector_name))
    image_tag = state.get("image_tag", f"dcf-local/{collector_name}-stream:latest")

    # Containers go first: the network cannot be removed while they use it.
    containers = (
        (f" Stopping stream runner '{runner}'...", runner),
        (f" Stopping Kafka broker '{kafka}'...", kafka),
    )
    for announcement, container in containers:
        print(announcement, flush=True)
        _stop_remove(container)

    print(f" Removing Docker network '{network}'...", flush=True)
    _remove_network(network)

    print(f" Removing local image '{image_tag}'...", flush=True)
    subprocess.run(["docker", "rmi", "-f", image_tag], capture_output=True)

    warehouse = state.get("warehouse_path", "warehouse/")
    print(f" Warehouse data at {warehouse} is untouched.", flush=True)
730
+
731
+
732
+ # ------------------------------------------------------------------ #
733
+ # Docker helpers #
734
+ # ------------------------------------------------------------------ #
735
+
736
def _check_docker() -> None:
    """Verify that the Docker CLI is installed and the daemon is reachable.

    Raises:
        RuntimeError: If the ``docker`` executable cannot be found, or if
            ``docker info`` exits non-zero.
    """
    try:
        result = subprocess.run(["docker", "info"], capture_output=True)
    except FileNotFoundError as exc:
        # No docker executable at all: report it the same way as a stopped
        # daemon instead of leaking a raw FileNotFoundError.
        raise RuntimeError(
            "Docker CLI not found on PATH. Install Docker and retry."
        ) from exc
    if result.returncode != 0:
        raise RuntimeError("Docker is not running. Start Docker Desktop and retry.")
740
+
741
+
742
def _stop_remove(container_name: str) -> None:
    """Stop and remove a container if ``docker inspect`` finds it.

    Failures of the stop/remove commands themselves are ignored.
    """
    probe = subprocess.run(["docker", "inspect", container_name], capture_output=True)
    if probe.returncode != 0:
        return
    for action in ("stop", "rm"):
        subprocess.run(["docker", action, container_name], capture_output=True)
749
+
750
+
751
def _remove_network(network: str) -> None:
    """Remove a Docker network if ``docker network inspect`` finds it.

    A failure of the removal command itself is ignored.
    """
    probe = subprocess.run(["docker", "network", "inspect", network], capture_output=True)
    if probe.returncode != 0:
        return
    subprocess.run(["docker", "network", "rm", network], capture_output=True)