data-collection-framework 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data_collection_framework-0.1.0.dist-info/METADATA +19 -0
- data_collection_framework-0.1.0.dist-info/RECORD +44 -0
- data_collection_framework-0.1.0.dist-info/WHEEL +5 -0
- data_collection_framework-0.1.0.dist-info/entry_points.txt +2 -0
- data_collection_framework-0.1.0.dist-info/top_level.txt +1 -0
- dcf/__init__.py +4 -0
- dcf/cli.py +841 -0
- dcf/config/__init__.py +4 -0
- dcf/config/loader.py +77 -0
- dcf/config/models.py +240 -0
- dcf/engine/__init__.py +6 -0
- dcf/engine/fetcher.py +118 -0
- dcf/engine/iterator.py +96 -0
- dcf/engine/projector.py +56 -0
- dcf/engine/runner.py +90 -0
- dcf/engine/transforms.py +41 -0
- dcf/gcp/__init__.py +0 -0
- dcf/gcp/_collector_utils.py +87 -0
- dcf/gcp/auth.py +1 -0
- dcf/gcp/batch_deploy.py +548 -0
- dcf/gcp/bootstrap.py +131 -0
- dcf/gcp/gcloud.py +42 -0
- dcf/gcp/terraform.py +151 -0
- dcf/infra/modules/batch_collector/gcp/airflow/main.tf +194 -0
- dcf/infra/modules/batch_collector/gcp/airflow/outputs.tf +9 -0
- dcf/infra/modules/batch_collector/gcp/airflow/variables.tf +52 -0
- dcf/infra/modules/batch_collector/gcp/main.tf +70 -0
- dcf/infra/modules/batch_collector/gcp/outputs.tf +4 -0
- dcf/infra/modules/batch_collector/gcp/variables.tf +40 -0
- dcf/infra/modules/batch_collector/local/airflow/main.tf +64 -0
- dcf/infra/modules/batch_collector/local/airflow/outputs.tf +9 -0
- dcf/infra/modules/batch_collector/local/airflow/variables.tf +59 -0
- dcf/infra/modules/batch_collector/local/main.tf +32 -0
- dcf/infra/modules/batch_collector/local/outputs.tf +4 -0
- dcf/infra/modules/batch_collector/local/variables.tf +25 -0
- dcf/infra/templates/airflow.Dockerfile.tftpl +6 -0
- dcf/infra/templates/batch_collector.Dockerfile.tftpl +14 -0
- dcf/infra/templates/docker-compose.yml.tftpl +76 -0
- dcf/local_deploy.py +756 -0
- dcf/project.py +23 -0
- dcf/spark_session.py +66 -0
- dcf/warehouse_reader.py +323 -0
- dcf/writer/__init__.py +3 -0
- dcf/writer/iceberg.py +315 -0
dcf/local_deploy.py
ADDED
|
@@ -0,0 +1,756 @@
|
|
|
1
|
+
"""Local Docker-based deployment for batch and streaming collectors.
|
|
2
|
+
|
|
3
|
+
No GCP account required. Batch collectors are built and scheduled via local
|
|
4
|
+
Terraform modules (batch_collector_local + airflow_local). Streaming collectors
|
|
5
|
+
run a Kafka broker + local stream runner container.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import hashlib
|
|
11
|
+
import json
|
|
12
|
+
import logging
|
|
13
|
+
import os
|
|
14
|
+
import re
|
|
15
|
+
import secrets
|
|
16
|
+
import shutil
|
|
17
|
+
import subprocess
|
|
18
|
+
import time
|
|
19
|
+
from datetime import datetime, timezone
|
|
20
|
+
from pathlib import Path
|
|
21
|
+
|
|
22
|
+
import yaml
|
|
23
|
+
|
|
24
|
+
logger = logging.getLogger(__name__)
|
|
25
|
+
|
|
26
|
+
_DCF_PKG_DIR = Path(__file__).parent
|
|
27
|
+
_DCF_REPO_ROOT = _DCF_PKG_DIR.parent
|
|
28
|
+
|
|
29
|
+
_BATCH_COLLECTOR_MODULE = _DCF_PKG_DIR / "infra" / "modules" / "batch_collector"
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def _write_pyproject_toml(dest: Path) -> None:
    """Write dcf's pyproject.toml to dest/pyproject.toml.

    Works whether dcf is running from a development checkout or an installed
    package (where the repo root is not on disk and pyproject.toml lives only
    in package metadata).

    Args:
        dest: Existing directory that receives ``pyproject.toml``.

    Raises:
        importlib.metadata.PackageNotFoundError: If no installed distribution
            providing the ``dcf`` import package can be found.
    """
    repo_pyproject = _DCF_REPO_ROOT / "pyproject.toml"
    if repo_pyproject.exists():
        shutil.copy2(repo_pyproject, dest / "pyproject.toml")
        return

    import importlib.metadata

    # The import package is "dcf", but the distribution that ships it can be
    # published under a different name. Resolve the distribution from the
    # import package instead of assuming the two names match.
    providers = importlib.metadata.packages_distributions().get("dcf") or ["dcf"]
    dist_name = providers[0]

    version = importlib.metadata.version(dist_name)
    reqs = importlib.metadata.requires(dist_name) or []
    # Requirements gated on an extra are optional; keep only the direct ones.
    direct_deps = [r for r in reqs if "extra ==" not in r]
    # json.dumps produces a double-quoted, backslash-escaped string, which is
    # a valid TOML basic string even when the requirement has a quoted marker
    # such as: python_version < "3.13".
    deps_str = "\n".join(f" {json.dumps(r)}," for r in direct_deps)
    (dest / "pyproject.toml").write_text(
        "[project]\n"
        'name = "dcf"\n'
        f'version = "{version}"\n'
        'requires-python = ">=3.12"\n'
        f"dependencies = [\n{deps_str}\n]\n\n"
        "[project.scripts]\n"
        'dcf = "dcf.cli:app"\n\n'
        "[tool.setuptools.packages.find]\n"
        'include = ["dcf*"]\n'
    )
|
|
62
|
+
|
|
63
|
+
_BUILD_DIR = Path.home() / ".dcf" / "build"
|
|
64
|
+
_TF_PLUGIN_CACHE = Path.home() / ".dcf" / ".plugin-cache"
|
|
65
|
+
_AIRFLOW_DAGS_DIR = Path.home() / ".dcf" / "airflow" / "dags"
|
|
66
|
+
_AIRFLOW_COMPOSE_FILE = Path.home() / ".dcf" / "airflow" / "docker-compose.yml"
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def _tf_state_dir(project_root: Path) -> Path:
    """Locate the directory that holds this project's Terraform state.

    The location is <project_root>/.dcf/terraform unless project.yml sets a
    truthy `terraform_state_dir`, in which case that path (with `~` expanded)
    is returned instead.
    """
    default_dir = project_root / ".dcf" / "terraform"
    project_file = project_root / "project.yml"
    if not project_file.exists():
        return default_dir

    settings = yaml.safe_load(project_file.read_text()) or {}
    override = settings.get("terraform_state_dir")
    return Path(override).expanduser() if override else default_dir
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def _collect_env_vars(project_root: Path, collector_name: str) -> dict[str, str]:
    """Resolve every {{ env.VAR }} reference found in a collector's YAML.

    Each referenced name is looked up in the host environment first and then,
    lower-cased, in project.yml. Names keep their first-seen order.

    Returns:
        A {VAR: value} mapping; empty when the collector file is missing or
        contains no references.

    Raises:
        EnvironmentError: If a referenced variable has no truthy value in
            either place.
    """
    collector_file = project_root / "collectors" / f"{collector_name}.yml"
    if not collector_file.exists():
        return {}

    referenced = re.findall(r"\{\{\s*env\.(\w+)\s*\}\}", collector_file.read_text())
    if not referenced:
        return {}

    fallbacks: dict = {}
    project_file = project_root / "project.yml"
    if project_file.exists():
        fallbacks = yaml.safe_load(project_file.read_text()) or {}

    resolved: dict[str, str] = {}
    for name in referenced:
        if name in resolved:
            # Repeated reference to a variable that is already resolved.
            continue
        value = os.environ.get(name) or fallbacks.get(name.lower())
        if not value:
            raise EnvironmentError(
                f"Collector references '{{{{ env.{name} }}}}' but '{name}' is not set "
                f"in the host environment and '{name.lower()}' is not in project.yml"
            )
        resolved[name] = value
    return resolved
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
# ------------------------------------------------------------------ #
|
|
113
|
+
# Public API #
|
|
114
|
+
# ------------------------------------------------------------------ #
|
|
115
|
+
|
|
116
|
+
def deploy(
    collector_name: str,
    deployment,
    project_root: Path,
    subscription: str | None = None,
) -> dict:
    """Deploy a collector locally and return its deployment state.

    Docker availability is checked first. A deployment whose `type` is
    "streaming" goes through the streaming path and requires `subscription`;
    every other type is handled as a batch deployment.

    Raises:
        ValueError: If a streaming deployment is requested without a
            subscription.
    """
    _check_docker()

    if deployment.type != "streaming":
        return _deploy_batch(collector_name, deployment, project_root)

    if subscription is None:
        raise ValueError("subscription is required for streaming local deploy")
    return _deploy_streaming(
        collector_name, subscription, deployment.window_seconds, project_root
    )
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
def undeploy(collector_name: str, deployment_state: dict, project_root: Path) -> None:
    """Tear down the local resources recorded for one collector.

    The "type" entry of `deployment_state` selects the teardown path:
    "streaming" uses the streaming teardown, anything else the batch one.
    """
    is_streaming = deployment_state.get("type") == "streaming"
    if not is_streaming:
        _undeploy_batch(collector_name, deployment_state, project_root)
        return
    _undeploy_streaming(collector_name, deployment_state)
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def undeploy_all(deployments: dict, project_root: Path) -> None:
    """Undeploy every collector in `deployments`, then destroy the Airflow stack.

    `deployments` maps collector name to its saved deployment state.
    """
    for collector, saved_state in deployments.items():
        print(f" Undeploying '{collector}'...", flush=True)
        undeploy(collector, saved_state, project_root)

    # The Airflow stack is shared, so it goes last, after all collectors.
    _tf_destroy_airflow_local(project_root)
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def publish(collector_name: str, deployment_state: dict, message_json: str, count: int = 1) -> None:
    """Publish a JSON message to the collector's local Kafka topic.

    Args:
        collector_name: Collector whose topic receives the message; used to
            derive the default topic name ``dcf-<collector_name>``.
        deployment_state: Saved deployment state. ``kafka_external_bootstrap``
            and ``kafka_topic`` are read from it when present.
        message_json: Message payload, sent UTF-8 encoded.
        count: Number of times the same message is sent.
    """
    from kafka import KafkaProducer

    bootstrap = deployment_state.get("kafka_external_bootstrap", "localhost:29092")
    topic = deployment_state.get("kafka_topic", f"dcf-{collector_name}")

    producer = KafkaProducer(
        bootstrap_servers=bootstrap,
        value_serializer=lambda v: v.encode("utf-8"),
    )
    try:
        for _ in range(count):
            producer.send(topic, value=message_json)
        producer.flush()
    finally:
        # Close the producer even when send/flush fails.
        producer.close()
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
# ------------------------------------------------------------------ #
|
|
166
|
+
# Batch — Terraform path #
|
|
167
|
+
# ------------------------------------------------------------------ #
|
|
168
|
+
|
|
169
|
+
def _deploy_batch(collector_name: str, deployment, project_root: Path) -> dict:
    """Build a batch collector image and schedule it in the local Airflow stack.

    Steps: sync the build context, apply the collector Terraform module,
    write the collector's DAG file, then apply the Airflow Terraform module.

    Args:
        collector_name: Collector to deploy; also used in the image tag.
        deployment: Deployment config; ``schedule`` is read, and ``paused``
            when present (defaults to False).
        project_root: Project directory; the warehouse lives at
            ``<project_root>/warehouse``.

    Returns:
        The deployment state dict recorded for this collector.

    Raises:
        EnvironmentError: If the collector references an env variable that
            cannot be resolved.
        RuntimeError: If a Terraform command fails.
    """
    image_tag = f"dcf-local/{collector_name}:latest"
    warehouse_path = project_root / "warehouse"
    warehouse_path.mkdir(exist_ok=True)

    # Resolve env references up front so a missing variable fails before the
    # image build and Terraform apply rather than after them.
    env_vars = _collect_env_vars(project_root, collector_name)

    print(f" Syncing build context for '{collector_name}'...", flush=True)
    build_context = _sync_build_context(project_root, collector_name)

    content_hash = _content_hash(build_context)

    print(" Applying Terraform (collector image)...", flush=True)
    _tf_apply_local_collector(collector_name, build_context, image_tag, content_hash, project_root)

    print(" Writing DAG file...", flush=True)
    _AIRFLOW_DAGS_DIR.mkdir(parents=True, exist_ok=True)
    dag_content = _local_dag_content(
        collector_name=collector_name,
        schedule=deployment.schedule,
        paused=getattr(deployment, "paused", False),
        image_tag=image_tag,
        warehouse_path=str(warehouse_path),
        env_vars=env_vars,
    )
    _write_local_dag(collector_name, dag_content)

    print(" Applying Terraform (Airflow stack)...", flush=True)
    credentials = _generate_airflow_credentials(project_root)
    airflow_outputs = _tf_apply_airflow_local(
        dag_dir=str(_AIRFLOW_DAGS_DIR),
        warehouse_path=str(warehouse_path),
        credentials=credentials,
        project_root=project_root,
    )

    # The Airflow stack is applied with webserver_port 8090, so the fallback
    # used when Terraform reports no URL must point at that port.
    airflow_url = airflow_outputs.get("webserver_url", {}).get("value", "http://localhost:8090")
    print(f" Airflow UI: {airflow_url}", flush=True)

    return {
        "type": "batch",
        "image_tag": image_tag,
        "warehouse_path": str(warehouse_path),
        "airflow_url": airflow_url,
        "schedule": deployment.schedule,
        "deployed_at": datetime.now(tz=timezone.utc).isoformat(timespec="seconds"),
    }
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
def _undeploy_batch(collector_name: str, state: dict, project_root: Path) -> None:
    """Destroy a batch collector's Terraform resources and delete its DAG file.

    `state` is accepted for symmetry with the streaming teardown; it is not
    read here.
    """
    print(" Destroying collector Terraform resources...", flush=True)
    _tf_destroy_local_collector(collector_name, project_root)

    dag_file = _AIRFLOW_DAGS_DIR / f"{collector_name}.py"
    if not dag_file.exists():
        return
    dag_file.unlink()
    print(f" Removed DAG file: {dag_file}", flush=True)
|
|
224
|
+
|
|
225
|
+
|
|
226
|
+
# ------------------------------------------------------------------ #
|
|
227
|
+
# Build context helpers #
|
|
228
|
+
# ------------------------------------------------------------------ #
|
|
229
|
+
|
|
230
|
+
def _sync_build_context(project_root: Path, collector_name: str) -> Path:
    """Create a stable build context dir at ~/.dcf/build/local/<name>/.

    The directory is recreated from scratch on every call and filled with the
    dcf package, a pyproject.toml, the project's ``collectors`` and
    ``connectors`` directories (empty ones when absent) and a minimal
    project.yml.

    Args:
        project_root: Project directory to copy collectors/connectors from.
        collector_name: Name of the per-collector build context directory.

    Returns:
        Path of the populated build context.
    """
    build_context = _BUILD_DIR / "local" / collector_name
    shutil.rmtree(build_context, ignore_errors=True)
    build_context.mkdir(parents=True)

    # Leave interpreter bytecode caches out: they are not source, and their
    # presence would otherwise change the hash computed over this directory
    # and end up inside the image.
    skip_bytecode = shutil.ignore_patterns("__pycache__", "*.pyc", "*.pyo")

    shutil.copytree(_DCF_PKG_DIR, build_context / "dcf", ignore=skip_bytecode)
    _write_pyproject_toml(build_context)

    for subdir in ("collectors", "connectors"):
        src = project_root / subdir
        dst = build_context / subdir
        if src.exists():
            shutil.copytree(src, dst, ignore=skip_bytecode)
        else:
            # Create the directory anyway so the context always has both.
            dst.mkdir()

    (build_context / "project.yml").write_text("catalog: local\n")

    return build_context
|
|
250
|
+
|
|
251
|
+
|
|
252
|
+
def _content_hash(build_context: Path) -> str:
    """SHA256 of all files in build_context, excluding Dockerfile (written by Terraform).

    Both the relative path and the bytes of every file feed the digest, each
    prefixed with its length, so renaming or moving a file, or shifting bytes
    from one file to the next, changes the result.

    Args:
        build_context: Directory to hash recursively.

    Returns:
        Hex digest string; identical trees give identical digests.
    """
    digest = hashlib.sha256()
    for path in sorted(build_context.rglob("*")):
        if not path.is_file() or path.name == "Dockerfile":
            continue
        # POSIX-style relative path keeps the digest independent of where the
        # build context lives and of the platform's path separator.
        rel = path.relative_to(build_context).as_posix().encode("utf-8")
        data = path.read_bytes()
        for chunk in (rel, data):
            # Length prefix makes the field boundaries unambiguous.
            digest.update(len(chunk).to_bytes(8, "big"))
            digest.update(chunk)
    return digest.hexdigest()
|
|
259
|
+
|
|
260
|
+
|
|
261
|
+
# ------------------------------------------------------------------ #
|
|
262
|
+
# Terraform helpers — collector #
|
|
263
|
+
# ------------------------------------------------------------------ #
|
|
264
|
+
|
|
265
|
+
def _tf_env() -> dict:
    """Return a copy of the process environment prepared for Terraform.

    TF_INPUT is set to "0" and TF_PLUGIN_CACHE_DIR to the dcf plugin cache
    directory; both override any value inherited from the host environment.
    """
    env = dict(os.environ)
    env["TF_INPUT"] = "0"
    env["TF_PLUGIN_CACHE_DIR"] = str(_TF_PLUGIN_CACHE)
    return env
|
|
271
|
+
|
|
272
|
+
|
|
273
|
+
def _tf_run(cmd: list[str], work_dir: Path, env: dict) -> None:
    """Run one Terraform command in `work_dir` with captured output.

    `cmd[1]` is treated as the subcommand name for log and error messages.

    Raises:
        RuntimeError: If the command exits non-zero; the message carries the
            exit code and the last 2000 characters of stderr.
    """
    proc = subprocess.run(cmd, cwd=str(work_dir), env=env, capture_output=True, text=True)
    subcommand = cmd[1]
    if proc.returncode == 0:
        logger.info("terraform %s OK", subcommand)
        return

    stderr_tail = proc.stderr[-2000:]
    raise RuntimeError(
        f"terraform {subcommand} failed (exit {proc.returncode}):\n{stderr_tail}"
    )
|
|
280
|
+
|
|
281
|
+
|
|
282
|
+
def _copy_module_to_work_dir(module_dir: Path, work_dir: Path) -> None:
    """Stage a leaf Terraform module in `work_dir`.

    Copies the module's top-level .tf files and replaces work_dir/templates
    with a fresh copy of the package's shared infra templates.
    """
    terraform_internal = (".terraform", ".terraform.lock.hcl")
    for entry in module_dir.iterdir():
        wanted = (
            entry.name not in terraform_internal
            and entry.is_file()
            and entry.suffix == ".tf"
        )
        if wanted:
            shutil.copy2(entry, work_dir / entry.name)

    shared_templates = _DCF_PKG_DIR / "infra" / "templates"
    staged_templates = work_dir / "templates"
    # copytree needs a non-existent target, so drop any previous copy first.
    if staged_templates.exists():
        shutil.rmtree(staged_templates)
    shutil.copytree(shared_templates, staged_templates)
|
|
294
|
+
|
|
295
|
+
|
|
296
|
+
def _tf_apply_local_collector(
    collector_name: str,
    build_context: Path,
    image_tag: str,
    content_hash: str,
    project_root: Path,
) -> None:
    """Apply the local batch-collector Terraform module for one collector.

    The module is staged under the project's Terraform state directory, its
    variables are written to terraform.tfvars.json, and `terraform init` and
    `terraform apply` are run there.
    """
    work_dir = _tf_state_dir(project_root) / "collectors" / collector_name / "local"
    for required_dir in (work_dir, _TF_PLUGIN_CACHE):
        required_dir.mkdir(parents=True, exist_ok=True)

    _copy_module_to_work_dir(_BATCH_COLLECTOR_MODULE / "local", work_dir)

    variables = {
        "collector_name": collector_name,
        "build_context": str(build_context),
        "image_tag": image_tag,
        "content_hash": content_hash,
        "java_enabled": True,
    }
    tfvars_file = work_dir / "terraform.tfvars.json"
    tfvars_file.write_text(json.dumps(variables, indent=2))

    env = _tf_env()
    for terraform_args in (["init", "-reconfigure"], ["apply", "-auto-approve"]):
        _tf_run(["terraform", *terraform_args], work_dir, env)
|
|
321
|
+
|
|
322
|
+
|
|
323
|
+
def _tf_destroy_local_collector(collector_name: str, project_root: Path) -> None:
    """Run `terraform destroy` for a collector and delete its working directory.

    Logs a warning and does nothing when the working directory is missing.
    """
    work_dir = _tf_state_dir(project_root) / "collectors" / collector_name / "local"
    if work_dir.exists():
        _tf_run(["terraform", "destroy", "-auto-approve"], work_dir, _tf_env())
        shutil.rmtree(work_dir)
    else:
        logger.warning("No Terraform state found at %s — skipping destroy", work_dir)
|
|
332
|
+
|
|
333
|
+
|
|
334
|
+
def _tf_destroy_airflow_local(project_root: Path) -> None:
    """Tear down the local Airflow stack and delete its Terraform working dir.

    When the compose file exists, `docker compose down --volumes` runs first
    and its exit status is ignored; `terraform destroy` follows. Logs a
    warning and does nothing when the working directory is missing.
    """
    work_dir = _tf_state_dir(project_root) / "airflow" / "local"
    if not work_dir.exists():
        logger.warning("No Airflow Terraform state found at %s — skipping destroy", work_dir)
        return

    print(" Destroying Airflow stack...", flush=True)
    if _AIRFLOW_COMPOSE_FILE.exists():
        compose_down = [
            "docker", "compose", "-f", str(_AIRFLOW_COMPOSE_FILE), "down", "--volumes",
        ]
        subprocess.run(compose_down, check=False)

    _tf_run(["terraform", "destroy", "-auto-approve"], work_dir, _tf_env())
    shutil.rmtree(work_dir)
|
|
350
|
+
|
|
351
|
+
|
|
352
|
+
# ------------------------------------------------------------------ #
|
|
353
|
+
# DAG content #
|
|
354
|
+
# ------------------------------------------------------------------ #
|
|
355
|
+
|
|
356
|
+
def _local_dag_content(
    collector_name: str,
    schedule: str,
    paused: bool,
    image_tag: str,
    warehouse_path: str,
    env_vars: dict[str, str] | None = None,
) -> str:
    """Render the Python source of the Airflow DAG that runs a collector image.

    Every caller-supplied value is embedded as a Python literal via ``repr``,
    so quotes and backslashes (for example in a Windows warehouse path) cannot
    break or alter the generated module.

    Args:
        collector_name: Used as the DAG id and in the task id
            ``run_<collector_name>``.
        schedule: Value passed to ``schedule_interval``; ``None`` is emitted
            as the Python ``None`` literal.
        paused: Whether the DAG is created paused.
        image_tag: Docker image the task runs.
        warehouse_path: Host path bind-mounted at ``/app/warehouse``.
        env_vars: Extra container environment, merged over ``COLLECTOR_NAME``.

    Returns:
        The DAG module source as a string.
    """
    environment = {"COLLECTOR_NAME": collector_name, **(env_vars or {})}
    task_id = f"run_{collector_name}"
    paused_literal = bool(paused)
    image_literal = str(image_tag)
    warehouse_literal = str(warehouse_path)
    return f"""\
# Generated by dcf — do not edit manually
from datetime import datetime
from docker.types import Mount
from airflow import DAG
from airflow.providers.docker.operators.docker import DockerOperator

with DAG(
    dag_id={collector_name!r},
    schedule_interval={schedule!r},
    start_date=datetime(2024, 1, 1),
    catchup=False,
    is_paused_upon_creation={paused_literal!r},
    tags=["dcf"],
) as dag:
    run_collector = DockerOperator(
        task_id={task_id!r},
        image={image_literal!r},
        environment={environment!r},
        mounts=[Mount(target="/app/warehouse", source={warehouse_literal!r}, type="bind")],
        docker_url="unix:///var/run/docker.sock",
        auto_remove="success",
    )
"""
|
|
390
|
+
|
|
391
|
+
|
|
392
|
+
def _write_local_dag(collector_name: str, dag_content: str) -> None:
    """Write `dag_content` to <dags dir>/<collector_name>.py, creating the dir if needed."""
    dags_dir = _AIRFLOW_DAGS_DIR
    dags_dir.mkdir(parents=True, exist_ok=True)
    dag_file = dags_dir / f"{collector_name}.py"
    dag_file.write_text(dag_content)
|
|
395
|
+
|
|
396
|
+
|
|
397
|
+
# ------------------------------------------------------------------ #
|
|
398
|
+
# Terraform helpers — Airflow #
|
|
399
|
+
# ------------------------------------------------------------------ #
|
|
400
|
+
|
|
401
|
+
def _airflow_build_context() -> Path:
    """Ensure the build context directory for the local Airflow image exists and return it."""
    context_dir = _BUILD_DIR / "airflow-local"
    context_dir.mkdir(parents=True, exist_ok=True)
    return context_dir
|
|
406
|
+
|
|
407
|
+
|
|
408
|
+
def _airflow_content_hash() -> str:
    """Return the SHA256 hex digest of the Airflow Dockerfile template.

    The digest changes whenever the template changes, which signals that the
    Airflow image needs a rebuild.
    """
    template_bytes = (
        _DCF_PKG_DIR / "infra" / "templates" / "airflow.Dockerfile.tftpl"
    ).read_bytes()
    digest = hashlib.sha256()
    digest.update(template_bytes)
    return digest.hexdigest()
|
|
412
|
+
|
|
413
|
+
|
|
414
|
+
def _generate_airflow_credentials(project_root: Path) -> dict:
    """Load the Airflow credentials from project.yml, creating missing ones.

    A missing admin password is prompted for interactively and a missing
    Fernet key is generated; each new value is saved back to project.yml.

    Returns:
        Dict with "db_password" (fixed to "airflow"), "admin_password" and
        "fernet_key".

    Raises:
        RuntimeError: If the prompted admin password is empty.
    """
    cfg_path = project_root / "project.yml"
    cfg: dict = {}
    if cfg_path.exists():
        cfg = yaml.safe_load(cfg_path.read_text()) or {}

    def _persist() -> None:
        # Rewrite project.yml with the current settings, keeping key order.
        cfg_path.write_text(yaml.dump(cfg, default_flow_style=False, sort_keys=False))

    admin_password = cfg.get("airflow_admin_password")
    if not admin_password:
        import getpass

        admin_password = getpass.getpass("Enter Airflow admin password: ").strip()
        if not admin_password:
            raise RuntimeError("Airflow admin password cannot be empty.")
        cfg["airflow_admin_password"] = admin_password
        _persist()
        logger.info("Saved airflow_admin_password to project.yml")

    fernet_key = cfg.get("airflow_fernet_key")
    if not fernet_key:
        from cryptography.fernet import Fernet

        fernet_key = Fernet.generate_key().decode()
        cfg["airflow_fernet_key"] = fernet_key
        _persist()

    return {
        "db_password": "airflow",
        "admin_password": admin_password,
        "fernet_key": fernet_key,
    }
|
|
446
|
+
|
|
447
|
+
|
|
448
|
+
def _tf_apply_airflow_local(dag_dir: str, warehouse_path: str, credentials: dict, project_root: Path) -> dict:
    """Apply the local Airflow Terraform module and return its outputs.

    The module is staged under the project's Terraform state directory, its
    variables are written to terraform.tfvars.json, and `terraform init` and
    `terraform apply` are run there.

    Returns:
        The parsed result of `terraform output -json`, or an empty dict when
        that command prints nothing.
    """
    work_dir = _tf_state_dir(project_root) / "airflow" / "local"
    for required_dir in (work_dir, _TF_PLUGIN_CACHE):
        required_dir.mkdir(parents=True, exist_ok=True)

    _copy_module_to_work_dir(_BATCH_COLLECTOR_MODULE / "local" / "airflow", work_dir)

    variables = {
        "image_tag": "dcf-airflow-local:latest",
        "build_context": str(_airflow_build_context()),
        "content_hash": _airflow_content_hash(),
        "dag_dir": dag_dir,
        "warehouse_path": warehouse_path,
        "docker_socket": "/var/run/docker.sock",
        "db_password": credentials["db_password"],
        "admin_password": credentials["admin_password"],
        "fernet_key": credentials["fernet_key"],
        "compose_file_path": str(_AIRFLOW_COMPOSE_FILE),
        "webserver_port": 8090,
    }
    tfvars_file = work_dir / "terraform.tfvars.json"
    tfvars_file.write_text(json.dumps(variables, indent=2))

    # Make sure the directory that receives the compose file exists.
    _AIRFLOW_COMPOSE_FILE.parent.mkdir(parents=True, exist_ok=True)

    env = _tf_env()
    for terraform_args in (["init", "-reconfigure"], ["apply", "-auto-approve"]):
        _tf_run(["terraform", *terraform_args], work_dir, env)

    output = subprocess.run(
        ["terraform", "output", "-json"],
        cwd=str(work_dir),
        env=env,
        capture_output=True,
        text=True,
    )
    raw = output.stdout
    if not raw.strip():
        return {}
    return json.loads(raw)
|
|
484
|
+
|
|
485
|
+
|
|
486
|
+
# ------------------------------------------------------------------ #
|
|
487
|
+
# Streaming #
|
|
488
|
+
# ------------------------------------------------------------------ #
|
|
489
|
+
|
|
490
|
+
def _kafka_container(name: str) -> str:
    """Return the Kafka broker container name for collector `name`."""
    return "dcf-kafka-{}".format(name)
|
|
492
|
+
|
|
493
|
+
|
|
494
|
+
def _runner_container(name: str) -> str:
    """Return the stream runner container name for collector `name`."""
    return "dcf-runner-{}".format(name)
|
|
496
|
+
|
|
497
|
+
|
|
498
|
+
def _network_name(name: str) -> str:
    """Return the Docker network name for collector `name`."""
    return "dcf-{}".format(name)
|
|
500
|
+
|
|
501
|
+
|
|
502
|
+
def _deploy_streaming(
    collector_name: str,
    subscription: str,
    window_seconds: int,
    project_root: Path,
) -> dict:
    """Start a Kafka broker and a stream runner container for a collector.

    Leftover containers and network from an earlier deploy are removed first.
    Then the network, broker, topic, runner image and runner container are
    created in that order. `subscription` is not read in this function.

    Returns:
        The deployment state dict recorded for this collector.

    Raises:
        RuntimeError: If the runner container is not running a few seconds
            after start; the message includes its recent logs.
    """
    external_bootstrap = "localhost:29092"
    network = _network_name(collector_name)
    kafka_cname = _kafka_container(collector_name)
    runner_cname = _runner_container(collector_name)
    image_tag = f"dcf-local/{collector_name}-stream:latest"
    kafka_topic = f"dcf-{collector_name}"
    warehouse_path = project_root / "warehouse"
    warehouse_path.mkdir(exist_ok=True)

    # Clear leftovers: containers first, then the network they were on.
    for stale_container in (runner_cname, kafka_cname):
        _stop_remove(stale_container)
    _remove_network(network)

    print(f" Creating Docker network '{network}'...", flush=True)
    subprocess.run(["docker", "network", "create", network], check=True, capture_output=True)

    print(" Starting Kafka broker (apache/kafka, KRaft)...", flush=True)
    _start_kafka(kafka_cname, network, collector_name)

    print(" Waiting for Kafka to be ready...", flush=True)
    _wait_for_kafka(external_bootstrap, timeout=30)

    print(f" Creating topic '{kafka_topic}'...", flush=True)
    _create_kafka_topic(external_bootstrap, kafka_topic)

    print(f" Building local runner image '{image_tag}'...", flush=True)
    print(" (First build downloads python:3.12-slim + kafka-python, ~1 minute)", flush=True)
    _build_stream_image(project_root, image_tag)

    print(" Starting stream runner...", flush=True)
    _start_runner(
        runner_cname,
        image_tag,
        network,
        collector_name,
        kafka_cname,
        kafka_topic,
        window_seconds,
        warehouse_path,
        project_root=project_root,
    )

    # Give the runner a moment to start (or crash) before checking on it.
    time.sleep(4)
    inspect = subprocess.run(
        ["docker", "inspect", "--format", "{{.State.Status}}", runner_cname],
        capture_output=True,
        text=True,
    )
    status = inspect.stdout.strip()
    if status == "running":
        return {
            "type": "streaming",
            "window_seconds": window_seconds,
            "docker_network": network,
            "kafka_container": kafka_cname,
            "runner_container": runner_cname,
            "kafka_topic": kafka_topic,
            "kafka_external_bootstrap": "localhost:29092",
            "image_tag": image_tag,
            "warehouse_path": str(warehouse_path),
            "deployed_at": datetime.now(tz=timezone.utc).isoformat(timespec="seconds"),
        }

    logs = subprocess.run(
        ["docker", "logs", "--tail", "40", runner_cname],
        capture_output=True,
        text=True,
    )
    raise RuntimeError(
        f"Stream runner container stopped unexpectedly (status: {status}).\n"
        f"Logs:\n{logs.stdout}{logs.stderr}"
    )
|
|
570
|
+
|
|
571
|
+
|
|
572
|
+
def _start_kafka(container_name: str, network: str, collector_name: str) -> None:
    """Start a detached single-node apache/kafka container.

    The broker listens on 9092 for containers on `network` and on 29092,
    published to the host, for local clients. `collector_name` is not read
    in this function.

    Raises:
        subprocess.CalledProcessError: If `docker run` exits non-zero.
    """
    listeners = ",".join(
        [
            "INTERNAL://0.0.0.0:9092",
            "EXTERNAL://0.0.0.0:29092",
            "CONTROLLER://0.0.0.0:9093",
        ]
    )
    advertised = ",".join(
        [
            f"INTERNAL://{container_name}:9092",
            "EXTERNAL://localhost:29092",
        ]
    )
    broker_settings = [
        "KAFKA_NODE_ID=1",
        "KAFKA_PROCESS_ROLES=broker,controller",
        "KAFKA_CONTROLLER_LISTENER_NAMES=CONTROLLER",
        f"KAFKA_LISTENERS={listeners}",
        f"KAFKA_ADVERTISED_LISTENERS={advertised}",
        "KAFKA_LISTENER_SECURITY_PROTOCOL_MAP="
        "INTERNAL:PLAINTEXT,EXTERNAL:PLAINTEXT,CONTROLLER:PLAINTEXT",
        "KAFKA_INTER_BROKER_LISTENER_NAME=INTERNAL",
        "KAFKA_CONTROLLER_QUORUM_VOTERS=1@localhost:9093",
        "KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR=1",
        "KAFKA_TRANSACTION_STATE_LOG_REPLICATION_FACTOR=1",
        "KAFKA_TRANSACTION_STATE_LOG_MIN_ISR=1",
        "KAFKA_AUTO_CREATE_TOPICS_ENABLE=false",
    ]
    # Each setting becomes its own "-e NAME=value" pair, in declaration order.
    env_flags = [token for setting in broker_settings for token in ("-e", setting)]

    command = [
        "docker", "run", "-d",
        "--name", container_name,
        "--network", network,
        "-p", "29092:29092",
        *env_flags,
        "apache/kafka:latest",
    ]
    subprocess.run(command, check=True, capture_output=True, text=True)
|
|
607
|
+
|
|
608
|
+
|
|
609
|
+
def _wait_for_kafka(bootstrap: str, timeout: int = 30) -> None:
    """Block until a Kafka admin connection to `bootstrap` succeeds.

    A connection is attempted repeatedly, pausing two seconds after each
    failure, until one succeeds or `timeout` seconds have elapsed.

    Args:
        bootstrap: Kafka bootstrap address, e.g. ``localhost:29092``.
        timeout: Maximum number of seconds to keep retrying.

    Raises:
        RuntimeError: If no attempt succeeds in time. The last connection
            error, if any, is attached as the cause.
    """
    from kafka import KafkaAdminClient

    deadline = time.time() + timeout
    last_error: Exception | None = None
    while time.time() < deadline:
        try:
            admin = KafkaAdminClient(
                bootstrap_servers=bootstrap,
                request_timeout_ms=3000,
                connections_max_idle_ms=5000,
            )
            admin.close()
            return
        except Exception as exc:
            # Any failure here is treated as "broker not ready yet"; remember
            # it so the timeout error can explain what went wrong.
            last_error = exc
            logger.debug("Kafka not ready at %s: %s", bootstrap, exc)
            time.sleep(2)
    raise RuntimeError(
        f"Kafka did not become available at {bootstrap} within {timeout}s.\n"
        "Check: docker logs dcf-kafka-<collector>"
    ) from last_error
|
|
629
|
+
|
|
630
|
+
|
|
631
|
+
def _create_kafka_topic(bootstrap: str, topic_name: str) -> None:
    """Create a single-partition topic with replication factor 1.

    An already existing topic is left as is. The admin client is always
    closed.
    """
    from kafka.admin import KafkaAdminClient, NewTopic
    from kafka.errors import TopicAlreadyExistsError

    client = KafkaAdminClient(bootstrap_servers=bootstrap, request_timeout_ms=5000)
    try:
        topic = NewTopic(topic_name, num_partitions=1, replication_factor=1)
        client.create_topics([topic])
    except TopicAlreadyExistsError:
        # The topic survived a previous deploy; nothing to create.
        pass
    finally:
        client.close()
|
|
642
|
+
|
|
643
|
+
|
|
644
|
+
def _build_stream_image(project_root: Path, image_tag: str) -> None:
    """Build the stream runner Docker image from a temporary build context.

    The context holds the dcf package, a pyproject.toml, the project's
    collectors and connectors directories (empty ones when absent), a minimal
    project.yml and a generated Dockerfile.

    Raises:
        RuntimeError: If `docker build` exits non-zero.
    """
    import tempfile

    # NOTE(review): the image entrypoint runs `dcf.local_stream_runner`, which
    # must be part of the copied package — confirm it ships in the published
    # wheel.
    dockerfile_lines = [
        "FROM python:3.12-slim",
        "WORKDIR /app",
        "COPY pyproject.toml .",
        "COPY dcf/ ./dcf/",
        "RUN pip install --no-cache-dir -e . 'kafka-python>=2.0'",
        "COPY collectors/ ./collectors/",
        "COPY connectors/ ./connectors/",
        "COPY project.yml .",
        'ENTRYPOINT ["python", "-m", "dcf.local_stream_runner"]',
    ]

    with tempfile.TemporaryDirectory(prefix="dcf-local-stream-") as tmp:
        context = Path(tmp)
        shutil.copytree(_DCF_PKG_DIR, context / "dcf")
        _write_pyproject_toml(context)

        for subdir in ("collectors", "connectors"):
            source_dir = project_root / subdir
            target_dir = context / subdir
            if source_dir.exists():
                shutil.copytree(source_dir, target_dir)
            else:
                target_dir.mkdir()

        (context / "project.yml").write_text("catalog: local\n")
        (context / "Dockerfile").write_text("\n".join(dockerfile_lines) + "\n")

        # Output is not captured, so the build log goes to the terminal.
        build = subprocess.run(["docker", "build", "-t", image_tag, "."], cwd=tmp)
        if build.returncode != 0:
            raise RuntimeError(f"docker build failed for '{image_tag}'")
|
|
677
|
+
|
|
678
|
+
|
|
679
|
+
def _start_runner(
    container_name: str,
    image_tag: str,
    network: str,
    collector_name: str,
    kafka_cname: str,
    kafka_topic: str,
    window_seconds: int,
    warehouse_path: Path,
    project_root: Path | None = None,
) -> None:
    """Start the detached stream runner container for a collector.

    The warehouse is mounted at /warehouse. When `project_root` is given, the
    env variables referenced by the collector are passed into the container.

    Raises:
        subprocess.CalledProcessError: If `docker run` exits non-zero.
    """
    env_flags: list[str] = []
    if project_root:
        for var, val in _collect_env_vars(project_root, collector_name).items():
            env_flags += ["-e", f"{var}={val}"]

    # Arguments after the image tag are handed to the runner itself.
    runner_args = [
        "--collector_name", collector_name,
        "--bootstrap_servers", f"{kafka_cname}:9092",
        "--topic", kafka_topic,
        "--output_path", f"/warehouse/{collector_name}/{collector_name}/data/",
        "--window_seconds", str(window_seconds),
    ]
    command = [
        "docker", "run", "-d",
        "--name", container_name,
        "--network", network,
        *env_flags,
        "-v", f"{warehouse_path}:/warehouse",
        image_tag,
        *runner_args,
    ]
    subprocess.run(command, check=True, capture_output=True, text=True)
|
|
708
|
+
|
|
709
|
+
|
|
710
|
+
def _undeploy_streaming(collector_name: str, state: dict) -> None:
    """Remove the containers, network and image of a streaming collector.

    Resource names come from `state` when recorded there, otherwise from the
    naming helpers. Warehouse data is not touched.
    """
    runner = state.get("runner_container", _runner_container(collector_name))
    kafka = state.get("kafka_container", _kafka_container(collector_name))
    network = state.get("docker_network", _network_name(collector_name))
    image_tag = state.get("image_tag", f"dcf-local/{collector_name}-stream:latest")

    print(f" Stopping stream runner '{runner}'...", flush=True)
    _stop_remove(runner)

    print(f" Stopping Kafka broker '{kafka}'...", flush=True)
    _stop_remove(kafka)

    print(f" Removing Docker network '{network}'...", flush=True)
    _remove_network(network)

    print(f" Removing local image '{image_tag}'...", flush=True)
    # The result of the image removal is ignored.
    subprocess.run(["docker", "rmi", "-f", image_tag], capture_output=True)

    warehouse = state.get("warehouse_path", "warehouse/")
    print(f" Warehouse data at {warehouse} is untouched.", flush=True)
|
|
730
|
+
|
|
731
|
+
|
|
732
|
+
# ------------------------------------------------------------------ #
|
|
733
|
+
# Docker helpers #
|
|
734
|
+
# ------------------------------------------------------------------ #
|
|
735
|
+
|
|
736
|
+
def _check_docker() -> None:
    """Verify that the Docker CLI is installed and the daemon is reachable.

    Raises:
        RuntimeError: If the ``docker`` executable cannot be found, or if
            ``docker info`` exits non-zero.
    """
    try:
        result = subprocess.run(["docker", "info"], capture_output=True)
    except FileNotFoundError as exc:
        # Without this, a missing binary surfaces as a raw FileNotFoundError
        # instead of the RuntimeError callers get for every other failure.
        raise RuntimeError(
            "Docker is not installed or not on PATH. Install Docker and retry."
        ) from exc
    if result.returncode != 0:
        raise RuntimeError("Docker is not running. Start Docker Desktop and retry.")
|
|
740
|
+
|
|
741
|
+
|
|
742
|
+
def _stop_remove(container_name: str) -> None:
    """Stop and remove a container if `docker inspect` finds it.

    The results of the stop and rm commands are ignored.
    """
    probe = subprocess.run(["docker", "inspect", container_name], capture_output=True)
    if probe.returncode != 0:
        # Nothing by that name; nothing to clean up.
        return
    for action in ("stop", "rm"):
        subprocess.run(["docker", action, container_name], capture_output=True)
|
|
749
|
+
|
|
750
|
+
|
|
751
|
+
def _remove_network(network: str) -> None:
    """Remove a Docker network if `docker network inspect` finds it.

    The result of the rm command is ignored.
    """
    probe = subprocess.run(
        ["docker", "network", "inspect", network],
        capture_output=True,
    )
    if probe.returncode != 0:
        return
    subprocess.run(["docker", "network", "rm", network], capture_output=True)
|