data-collection-framework 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. data_collection_framework-0.1.0.dist-info/METADATA +19 -0
  2. data_collection_framework-0.1.0.dist-info/RECORD +44 -0
  3. data_collection_framework-0.1.0.dist-info/WHEEL +5 -0
  4. data_collection_framework-0.1.0.dist-info/entry_points.txt +2 -0
  5. data_collection_framework-0.1.0.dist-info/top_level.txt +1 -0
  6. dcf/__init__.py +4 -0
  7. dcf/cli.py +841 -0
  8. dcf/config/__init__.py +4 -0
  9. dcf/config/loader.py +77 -0
  10. dcf/config/models.py +240 -0
  11. dcf/engine/__init__.py +6 -0
  12. dcf/engine/fetcher.py +118 -0
  13. dcf/engine/iterator.py +96 -0
  14. dcf/engine/projector.py +56 -0
  15. dcf/engine/runner.py +90 -0
  16. dcf/engine/transforms.py +41 -0
  17. dcf/gcp/__init__.py +0 -0
  18. dcf/gcp/_collector_utils.py +87 -0
  19. dcf/gcp/auth.py +1 -0
  20. dcf/gcp/batch_deploy.py +548 -0
  21. dcf/gcp/bootstrap.py +131 -0
  22. dcf/gcp/gcloud.py +42 -0
  23. dcf/gcp/terraform.py +151 -0
  24. dcf/infra/modules/batch_collector/gcp/airflow/main.tf +194 -0
  25. dcf/infra/modules/batch_collector/gcp/airflow/outputs.tf +9 -0
  26. dcf/infra/modules/batch_collector/gcp/airflow/variables.tf +52 -0
  27. dcf/infra/modules/batch_collector/gcp/main.tf +70 -0
  28. dcf/infra/modules/batch_collector/gcp/outputs.tf +4 -0
  29. dcf/infra/modules/batch_collector/gcp/variables.tf +40 -0
  30. dcf/infra/modules/batch_collector/local/airflow/main.tf +64 -0
  31. dcf/infra/modules/batch_collector/local/airflow/outputs.tf +9 -0
  32. dcf/infra/modules/batch_collector/local/airflow/variables.tf +59 -0
  33. dcf/infra/modules/batch_collector/local/main.tf +32 -0
  34. dcf/infra/modules/batch_collector/local/outputs.tf +4 -0
  35. dcf/infra/modules/batch_collector/local/variables.tf +25 -0
  36. dcf/infra/templates/airflow.Dockerfile.tftpl +6 -0
  37. dcf/infra/templates/batch_collector.Dockerfile.tftpl +14 -0
  38. dcf/infra/templates/docker-compose.yml.tftpl +76 -0
  39. dcf/local_deploy.py +756 -0
  40. dcf/project.py +23 -0
  41. dcf/spark_session.py +66 -0
  42. dcf/warehouse_reader.py +323 -0
  43. dcf/writer/__init__.py +3 -0
  44. dcf/writer/iceberg.py +315 -0
@@ -0,0 +1,41 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Any
4
+
5
+ from ..config.models import ArrayJoinTransform, CrsReprojectTransform, Transform
6
+ from .fetcher import _get_nested
7
+
8
+
9
def apply_transform(transform: Transform, record: dict) -> Any:
    """Run *transform* against *record* and return the computed value.

    Raises:
        ValueError: if *transform* is neither a ``CrsReprojectTransform``
            nor an ``ArrayJoinTransform``.
    """
    # Handlers are tried in order; the first matching transform class wins.
    handlers = (
        (CrsReprojectTransform, _crs_reproject),
        (ArrayJoinTransform, _array_join),
    )
    for transform_cls, handler in handlers:
        if isinstance(transform, transform_cls):
            return handler(transform, record)
    raise ValueError(f"Unknown transform type: {type(transform)}")
15
+
16
+
17
def _array_join(t: ArrayJoinTransform, record: dict) -> str | None:
    """Flatten the value found at ``t.path`` in *record* into a single string.

    Returns ``None`` when the path resolves to ``None``. A list is rendered
    by stringifying each item and joining with ``t.separator``; any other
    value is returned as its ``str()`` form.
    """
    found = _get_nested(record, t.path)
    if found is None:
        return None
    if isinstance(found, list):
        return t.separator.join(map(str, found))
    return str(found)
24
+
25
+
26
# Transformers keyed by (from_crs, to_crs). Constructing a pyproj Transformer
# is far more expensive than a single transform() call, and this function runs
# once per record, so each CRS pair is built only once per process.
_TRANSFORMER_CACHE: dict = {}


def _get_transformer(from_crs: Any, to_crs: Any) -> Any:
    """Return a (cached) ``always_xy`` pyproj ``Transformer`` for the CRS pair."""
    # Imported lazily so pyproj is only required when a reprojection runs.
    from pyproj import Transformer as ProjTransformer

    key = (from_crs, to_crs)
    try:
        cached = _TRANSFORMER_CACHE.get(key)
    except TypeError:
        # Unhashable CRS definition: cannot be cached, build a one-off instance.
        return ProjTransformer.from_crs(from_crs, to_crs, always_xy=True)
    if cached is None:
        cached = ProjTransformer.from_crs(from_crs, to_crs, always_xy=True)
        _TRANSFORMER_CACHE[key] = cached
    return cached


def _crs_reproject(t: CrsReprojectTransform, record: dict) -> float | None:
    """Reproject a coordinate pair from *record* and return one component.

    The source coordinates are read from the two columns named in
    ``t.from_columns`` (first entry is passed as x, second as y) and
    transformed from ``t.from_crs`` to ``t.to_crs``.

    Returns:
        The first transformed component when ``t.component == "x"``,
        otherwise the second. ``None`` when either input is missing or
        cannot be converted to ``float``.
    """
    try:
        raw_x = record.get(t.from_columns[0])
        raw_y = record.get(t.from_columns[1])
        if raw_x is None or raw_y is None:
            return None
        x = float(raw_x)
        y = float(raw_y)
    except (TypeError, ValueError):
        return None

    lon, lat = _get_transformer(t.from_crs, t.to_crs).transform(x, y)
    return lon if t.component == "x" else lat
dcf/gcp/__init__.py ADDED
File without changes
@@ -0,0 +1,87 @@
1
+ """Shared schema projection utilities used by both beam_runner (GCP/Dataflow)
2
+ and local_stream_runner (local Kafka)."""
3
+
4
+ from __future__ import annotations
5
+
6
+ import json
7
+ import logging
8
+ from datetime import datetime, timezone
9
+ from pathlib import Path
10
+
11
+ import pyarrow as pa
12
+ import yaml
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
# Maps the column "type" names used in collector schema definitions to the
# PyArrow data type used for that column. Names missing from this table are
# handled by the caller (to_pyarrow_schema falls back to pa.string()).
TYPE_MAP: dict[str, pa.DataType] = {
    "string": pa.string(),
    "integer": pa.int64(),
    "float": pa.float64(),
    "boolean": pa.bool_(),
    # Microsecond precision, timezone-aware (UTC).
    "timestamp": pa.timestamp("us", tz="UTC"),
    "date": pa.date32(),
}
24
+
25
+
26
def load_columns(collector_name: str) -> list[dict]:
    """Return the ``schema.columns`` list from ``collectors/<collector_name>.yml``.

    The file is looked up relative to the current working directory.
    """
    config_file = Path("collectors") / f"{collector_name}.yml"
    config = yaml.safe_load(config_file.read_text())
    schema = config["schema"]
    return schema["columns"]
30
+
31
+
32
def to_pyarrow_schema(columns: list[dict]) -> pa.Schema:
    """Build a PyArrow schema with one field per column definition.

    A column without a ``type`` key, or with a type name that is not in
    ``TYPE_MAP``, becomes a string field.
    """
    fields = []
    for column in columns:
        field_name = column["name"]
        type_name = column.get("type", "string")
        arrow_type = TYPE_MAP.get(type_name, pa.string())
        fields.append(pa.field(field_name, arrow_type))
    return pa.schema(fields)
38
+
39
+
40
+ def cast_value(value, col_type: str | None):
41
+ if value is None:
42
+ return None
43
+ if col_type == "integer":
44
+ return int(value)
45
+ if col_type == "float":
46
+ return float(value)
47
+ if col_type == "boolean":
48
+ return bool(value)
49
+ if col_type == "timestamp":
50
+ if isinstance(value, str):
51
+ for fmt in ("%Y-%m-%dT%H:%M:%SZ", "%Y-%m-%dT%H:%M:%S.%fZ", "%Y-%m-%dT%H:%M:%S%z"):
52
+ try:
53
+ dt = datetime.strptime(value.rstrip("Z") + "+00:00", fmt.replace("Z", "%z"))
54
+ return dt.astimezone(timezone.utc)
55
+ except ValueError:
56
+ continue
57
+ return value
58
+ if col_type == "date":
59
+ if isinstance(value, str):
60
+ try:
61
+ return datetime.strptime(value, "%Y-%m-%d").date()
62
+ except ValueError:
63
+ return value
64
+ return value
65
+ return str(value) if value is not None else None
66
+
67
+
68
+ def project_message(msg_bytes: bytes, columns: list[dict]) -> dict | None:
69
+ try:
70
+ record = json.loads(msg_bytes.decode("utf-8"))
71
+ except (json.JSONDecodeError, UnicodeDecodeError):
72
+ logger.warning("Skipping unparseable message")
73
+ return None
74
+
75
+ row: dict = {}
76
+ for col in columns:
77
+ path = col.get("path") or col["name"]
78
+ parts = path.split(".")
79
+ val = record
80
+ for part in parts:
81
+ if isinstance(val, dict):
82
+ val = val.get(part)
83
+ else:
84
+ val = None
85
+ break
86
+ row[col["name"]] = cast_value(val, col.get("type"))
87
+ return row
dcf/gcp/auth.py ADDED
@@ -0,0 +1 @@
1
+ # Replaced by dcf/gcp/gcloud.py
@@ -0,0 +1,548 @@
1
+ """Batch collector deployment: builds a container image via Cloud Build, then uses
2
+ Terraform to provision a Cloud Run job. DAG is written directly to GCS for the
3
+ custom Airflow stack (no Cloud Composer)."""
4
+
5
+ from __future__ import annotations
6
+
7
+ import hashlib
8
+ import json
9
+ import logging
10
+ import os
11
+ import secrets
12
+ import shutil
13
+ import subprocess
14
+ from datetime import datetime, timezone
15
+ from pathlib import Path
16
+
17
+ import yaml
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
# Filesystem locations used throughout this module.
_DCF_PKG_DIR = Path(__file__).parent.parent  # dcf/ package
# Parent of the package directory; _write_pyproject_toml looks for a
# pyproject.toml here and falls back to installed metadata when it is absent.
_DCF_REPO_ROOT = _DCF_PKG_DIR.parent
# Terraform modules for batch collectors that ship inside the package.
_BATCH_MODULE_DIR = _DCF_PKG_DIR / "infra" / "modules" / "batch_collector"
# Per-user directory under which build contexts are assembled.
_BUILD_DIR = Path.home() / ".dcf" / "build"
# Exported to Terraform as TF_PLUGIN_CACHE_DIR (see _tf_env).
_TF_PLUGIN_CACHE = Path.home() / ".dcf" / ".plugin-cache"
26
+
27
+
28
def _tf_state_dir(project_root: Path) -> Path:
    """Locate the directory that holds this project's Terraform state.

    The default is <project_root>/.dcf/terraform. A non-empty
    `terraform_state_dir` entry in project.yml takes precedence, with a
    leading `~` expanded to the user's home directory.
    """
    default_dir = project_root / ".dcf" / "terraform"
    project_file = project_root / "project.yml"
    if not project_file.exists():
        return default_dir

    settings = yaml.safe_load(project_file.read_text()) or {}
    override = settings.get("terraform_state_dir")
    return Path(override).expanduser() if override else default_dir
41
+
42
+
43
def _write_pyproject_toml(dest: Path) -> None:
    """Write a ``pyproject.toml`` for the ``dcf`` package into *dest*.

    When running from a source checkout the repository's own
    ``pyproject.toml`` is copied verbatim. Otherwise a minimal one is
    synthesised from the installed distribution's metadata (version and
    non-extra requirements).

    Raises:
        importlib.metadata.PackageNotFoundError: if no installed distribution
            providing the ``dcf`` package can be found.
    """
    repo_pyproject = _DCF_REPO_ROOT / "pyproject.toml"
    if repo_pyproject.exists():
        shutil.copy2(repo_pyproject, dest / "pyproject.toml")
        return

    import importlib.metadata

    # The import package is "dcf", but the distribution that ships it may be
    # published under a different name. Try "dcf" first, then whichever
    # installed distributions declare that they provide the "dcf" package.
    candidates = ["dcf", *importlib.metadata.packages_distributions().get("dcf", [])]
    dist = None
    for dist_name in candidates:
        try:
            dist = importlib.metadata.distribution(dist_name)
            break
        except importlib.metadata.PackageNotFoundError:
            continue
    if dist is None:
        raise importlib.metadata.PackageNotFoundError("dcf")

    version = dist.metadata["Version"]
    reqs = dist.requires or []
    # Requirements gated on an extra are optional and left out of the image.
    direct_deps = [r for r in reqs if "extra ==" not in r]
    # json.dumps yields a double-quoted string with \" and \\ escapes, which is
    # also a valid TOML basic string; requirement markers may contain quotes.
    deps_str = "\n".join(f"    {json.dumps(r)}," for r in direct_deps)
    (dest / "pyproject.toml").write_text(
        f'[project]\n'
        f'name = "dcf"\n'
        f'version = "{version}"\n'
        f'requires-python = ">=3.12"\n'
        f'dependencies = [\n{deps_str}\n]\n\n'
        f'[project.scripts]\n'
        f'dcf = "dcf.cli:app"\n\n'
        f'[tool.setuptools.packages.find]\n'
        f'include = ["dcf*"]\n',
        encoding="utf-8",
    )
67
+
68
+
69
+ # ------------------------------------------------------------------ #
70
+ # Public API #
71
+ # ------------------------------------------------------------------ #
72
+
73
def deploy(
    collector_name: str,
    schedule: str,
    paused: bool,
    project_root: Path,
    gcp_config: dict,
) -> dict:
    """Deploy a batch collector to GCP and return its deployment record.

    Steps, in order: assemble the build context, make sure the Artifact
    Registry repository exists, apply the per-collector Terraform (Cloud Run
    job + Cloud Build), upload the generated DAG to GCS, and apply the GCP
    Airflow stack Terraform.

    Returns the deployment state dict to write into project.yml.
    """
    project_id = gcp_config["project_id"]
    region = gcp_config["region"]
    warehouse_bucket = gcp_config["warehouse_bucket"]
    sa_email = gcp_config["sa_email"]

    image_uri = _image_uri(project_id, region, collector_name)

    # --- Build context ------------------------------------------------
    print(f" Syncing build context for '{collector_name}'...", flush=True)
    build_context = _sync_build_context(project_root, collector_name, gcp_config)
    content_hash = _content_hash(build_context)

    # --- Image repository ---------------------------------------------
    print(" Ensuring Artifact Registry repository exists...", flush=True)
    _ensure_artifact_registry_repo(project_id, region)

    # --- Per-collector infrastructure ---------------------------------
    print(" Applying Terraform (Cloud Run job + Cloud Build)...", flush=True)
    print(" (First build may take a few minutes)", flush=True)
    job_name = _terraform_apply_collector(
        collector_name=collector_name,
        image_uri=image_uri,
        sa_email=sa_email,
        build_context=build_context,
        content_hash=content_hash,
        project_id=project_id,
        region=region,
        project_root=project_root,
    )

    # --- DAG ----------------------------------------------------------
    print(" Writing DAG to GCS...", flush=True)
    dag_source = _gcp_dag_content(
        collector_name=collector_name,
        schedule=schedule,
        paused=paused,
        project_id=project_id,
        region=region,
        job_name=job_name,
    )
    _write_dag_gcs(dag_source, collector_name, warehouse_bucket)

    # --- Shared Airflow stack -----------------------------------------
    print(" Provisioning GCP Airflow stack...", flush=True)
    credentials = _generate_airflow_credentials(project_root)
    airflow_context = _airflow_build_context()
    airflow_image = _airflow_image_uri(project_id, region)
    airflow_hash = _airflow_content_hash()
    airflow_outputs = _tf_apply_airflow_gcp(
        build_context=airflow_context,
        image_uri=airflow_image,
        content_hash=airflow_hash,
        gcp_config=gcp_config,
        credentials=credentials,
        project_root=project_root,
    )

    airflow_url = airflow_outputs.get("webserver_url", {}).get("value", "")
    if airflow_url:
        print(f" Airflow UI: {airflow_url}", flush=True)

    deployed_at = datetime.now(tz=timezone.utc).isoformat(timespec="seconds")
    return {
        "schedule": schedule,
        "dag_id": collector_name,
        "cloud_run_job": job_name,
        "airflow_url": airflow_url,
        "image_uri": image_uri,
        "deployed_at": deployed_at,
    }
146
+
147
+
148
def undeploy(collector_name: str, deployment: dict, gcp_config: dict, project_root: Path) -> None:
    """Tear down a deployed collector.

    Destroys the collector's Terraform resources, deletes its DAG from GCS
    and, when no objects remain under the DAG prefix, destroys the Airflow
    stack as well. ``deployment`` is accepted but not read here.
    """
    project_id, region, warehouse_bucket = (
        gcp_config[key] for key in ("project_id", "region", "warehouse_bucket")
    )

    print(f" Destroying Terraform resources for '{collector_name}'...", flush=True)
    _terraform_destroy_collector(collector_name, project_id, region, project_root)

    print(" Deleting DAG from GCS...", flush=True)
    _delete_dag_gcs(collector_name, warehouse_bucket)

    if _gcs_dag_files_exist(warehouse_bucket):
        # Other DAGs still rely on the Airflow stack; leave it running.
        return
    print(" No remaining DAGs — tearing down Airflow stack...", flush=True)
    _tf_destroy_airflow_gcp(project_root)
163
+
164
+
165
+ # ------------------------------------------------------------------ #
166
+ # Build context #
167
+ # ------------------------------------------------------------------ #
168
+
169
def _image_uri(project_id: str, region: str, collector_name: str) -> str:
    """Return the ``:latest`` image URI for a collector in the ``dcf-runner`` repository."""
    registry_host = f"{region}-docker.pkg.dev"
    repository = f"{registry_host}/{project_id}/dcf-runner"
    return f"{repository}/{collector_name}:latest"
171
+
172
+
173
def _sync_build_context(
    project_root: Path, collector_name: str, gcp_config: dict
) -> Path:
    """Rebuild the build context directory at ~/.dcf/build/gcp/<name>/ and return it.

    The directory is recreated from scratch and populated with a copy of the
    dcf package, a pyproject.toml, the project's ``collectors`` and
    ``connectors`` directories (empty ones when absent) and a minimal
    project.yml carrying only the GCP settings.
    """
    context_dir = _BUILD_DIR / "gcp" / collector_name
    shutil.rmtree(context_dir, ignore_errors=True)
    context_dir.mkdir(parents=True)

    shutil.copytree(_DCF_PKG_DIR, context_dir / "dcf")
    _write_pyproject_toml(context_dir)

    for name in ("collectors", "connectors"):
        source_dir = project_root / name
        target_dir = context_dir / name
        if not source_dir.exists():
            # Keep the directory layout stable even when the project has none.
            target_dir.mkdir()
            continue
        shutil.copytree(source_dir, target_dir)

    gcp_keys = ("project_id", "region", "warehouse_bucket")
    container_config = {
        "catalog": "gcp",
        "gcp": {key: gcp_config[key] for key in gcp_keys},
    }
    rendered = yaml.dump(container_config, default_flow_style=False)
    (context_dir / "project.yml").write_text(rendered)

    return context_dir
205
+
206
+
207
def _content_hash(build_context: Path) -> str:
    """Return a SHA256 hex digest over every file in *build_context*.

    Both the path (relative to *build_context*) and the bytes of each file
    are hashed, so renaming or moving a file changes the digest as well as
    editing it. Files named ``Dockerfile`` are excluded (written by Terraform).
    """
    digest = hashlib.sha256()
    for path in sorted(build_context.rglob("*")):
        if not path.is_file() or path.name == "Dockerfile":
            continue
        rel = path.relative_to(build_context).as_posix().encode("utf-8", "surrogateescape")
        data = path.read_bytes()
        # Length-prefix each field so the byte stream is unambiguous: content
        # cannot shift between neighbouring files without changing the digest.
        digest.update(len(rel).to_bytes(8, "big"))
        digest.update(rel)
        digest.update(len(data).to_bytes(8, "big"))
        digest.update(data)
    return digest.hexdigest()
214
+
215
+
216
+ # ------------------------------------------------------------------ #
217
+ # Terraform: per-collector resources #
218
+ # ------------------------------------------------------------------ #
219
+
220
def _expected_job_name(collector_name: str) -> str:
    """Return the Cloud Run job name for a collector: ``dcf-job-`` plus the name with ``_`` turned into ``-``."""
    hyphenated = "-".join(collector_name.split("_"))
    return f"dcf-job-{hyphenated}"
222
+
223
+
224
def _tf_work_dir(collector_name: str, project_root: Path) -> Path:
    """Return the Terraform working directory for a collector's GCP resources."""
    state_root = _tf_state_dir(project_root)
    return state_root.joinpath("collectors", collector_name, "gcp")
226
+
227
+
228
def _copy_module_to_work_dir(module_dir: Path, work_dir: Path) -> None:
    """Stage a leaf Terraform module in *work_dir*.

    Copies the ``.tf`` files found directly in *module_dir* and replaces
    ``work_dir/templates`` with a fresh copy of the packaged templates.
    """
    terraform_internal = {".terraform", ".terraform.lock.hcl"}
    for entry in module_dir.iterdir():
        if entry.name in terraform_internal:
            continue
        if entry.is_file() and entry.suffix == ".tf":
            shutil.copy2(entry, work_dir / entry.name)

    bundled_templates = _DCF_PKG_DIR / "infra" / "templates"
    staged_templates = work_dir / "templates"
    # copytree needs a non-existent target, so drop any previous copy first.
    if staged_templates.exists():
        shutil.rmtree(staged_templates)
    shutil.copytree(bundled_templates, staged_templates)
240
+
241
+
242
def _tf_env() -> dict:
    """Return a copy of the process environment prepared for Terraform.

    Sets ``TF_INPUT=0`` and points ``TF_PLUGIN_CACHE_DIR`` at the plugin
    cache directory.
    """
    env = dict(os.environ)
    env["TF_INPUT"] = "0"
    env["TF_PLUGIN_CACHE_DIR"] = str(_TF_PLUGIN_CACHE)
    return env
248
+
249
+
250
def _tf_run(cmd: list[str], work_dir: Path, env: dict) -> None:
    """Run a Terraform command in *work_dir* with its output captured.

    Raises:
        RuntimeError: when the command exits non-zero. The full output is
            logged; the message carries the last 2000 characters of stderr.
    """
    proc = subprocess.run(cmd, cwd=str(work_dir), env=env, capture_output=True, text=True)
    if proc.returncode == 0:
        logger.info("terraform %s OK", cmd[1])
        return

    logger.error(
        "Terraform command failed: %s\nSTDOUT: %s\nSTDERR: %s",
        " ".join(cmd), proc.stdout, proc.stderr,
    )
    stderr_tail = proc.stderr[-2000:]
    raise RuntimeError(
        f"terraform {cmd[1]} failed (exit {proc.returncode}): {stderr_tail}"
    )
261
+
262
+
263
def _terraform_apply_collector(
    collector_name: str,
    image_uri: str,
    sa_email: str,
    build_context: Path,
    content_hash: str,
    project_id: str,
    region: str,
    project_root: Path,
) -> str:
    """Provision Cloud Run job via Terraform + Cloud Build. Returns the job name.

    Stages the GCP batch-collector module in the collector's working
    directory, writes ``terraform.tfvars.json``, then runs ``init``, an
    import of any pre-existing job, and ``apply``. The job name is read from
    the ``job_name`` Terraform output.

    Raises:
        RuntimeError: if any Terraform command, including the final
            ``terraform output``, exits with a non-zero status.
    """
    work_dir = _tf_work_dir(collector_name, project_root)
    work_dir.mkdir(parents=True, exist_ok=True)
    _TF_PLUGIN_CACHE.mkdir(parents=True, exist_ok=True)

    _copy_module_to_work_dir(_BATCH_MODULE_DIR / "gcp", work_dir)

    tfvars = {
        "project_id": project_id,
        "region": region,
        "collector_name": collector_name,
        "image_uri": image_uri,
        "sa_email": sa_email,
        "build_context": str(build_context),
        "content_hash": content_hash,
        "java_enabled": False,
    }
    (work_dir / "terraform.tfvars.json").write_text(json.dumps(tfvars, indent=2))

    env = _tf_env()
    _tf_run(["terraform", "init", "-reconfigure"], work_dir, env)
    # Adopt a job that already exists so apply does not fail with a conflict.
    _import_existing_cloud_run_job(collector_name, project_id, region, work_dir, env)
    _tf_run(["terraform", "apply", "-auto-approve"], work_dir, env)

    output_proc = subprocess.run(
        ["terraform", "output", "-json"],
        cwd=str(work_dir), env=env, capture_output=True, text=True,
    )
    # Without this check a failed command surfaces as an opaque JSON decode
    # error on empty stdout; report it the same way _tf_run reports failures.
    if output_proc.returncode != 0:
        raise RuntimeError(
            f"terraform output failed (exit {output_proc.returncode}): "
            f"{output_proc.stderr[-2000:]}"
        )
    outputs = json.loads(output_proc.stdout)
    return outputs["job_name"]["value"]
304
+
305
+
306
def _terraform_destroy_collector(
    collector_name: str, project_id: str, region: str, project_root: Path,
) -> None:
    """Run ``terraform destroy`` for a collector and delete its working directory.

    Raises:
        RuntimeError: if the collector has no Terraform working directory
            (the message includes the manual ``gcloud`` cleanup command), or
            if ``terraform destroy`` fails.
    """
    work_dir = _tf_work_dir(collector_name, project_root)
    if work_dir.exists():
        _tf_run(["terraform", "destroy", "-auto-approve"], work_dir, _tf_env())
        shutil.rmtree(work_dir)
        return

    job_name = _expected_job_name(collector_name)
    raise RuntimeError(
        f"No Terraform state found for collector '{collector_name}' at {work_dir}.\n"
        "If you deployed from a different machine, delete the Cloud Run job manually:\n"
        f" gcloud run jobs delete {job_name} "
        f"--region {region} --project {project_id} --quiet"
    )
322
+
323
+
324
def _import_existing_cloud_run_job(
    collector_name: str, project_id: str, region: str, work_dir: Path, env: dict,
) -> None:
    """Bring an already-existing Cloud Run job under Terraform management.

    Does nothing when ``gcloud run jobs describe`` reports no such job.
    A failed ``terraform import`` is logged, never raised.
    """
    job_name = _expected_job_name(collector_name)
    describe_cmd = [
        "gcloud", "run", "jobs", "describe", job_name,
        "--region", region, "--project", project_id,
    ]
    if subprocess.run(describe_cmd, capture_output=True).returncode != 0:
        return

    resource_id = f"projects/{project_id}/locations/{region}/jobs/{job_name}"
    import_proc = subprocess.run(
        ["terraform", "import", "google_cloud_run_v2_job.collector", resource_id],
        cwd=str(work_dir), env=env, capture_output=True, text=True,
    )
    if import_proc.returncode == 0:
        logger.info("Imported existing Cloud Run job '%s' into Terraform state", job_name)
        return

    combined_output = import_proc.stdout + import_proc.stderr
    if "already managed by Terraform" in combined_output:
        logger.info("Cloud Run job '%s' already in Terraform state", job_name)
    else:
        logger.warning("terraform import returned non-zero: %s", import_proc.stderr[-500:])
348
+
349
+
350
+ # ------------------------------------------------------------------ #
351
+ # GCS DAG management #
352
+ # ------------------------------------------------------------------ #
353
+
354
def _dag_gcs_path(collector_name: str) -> str:
    """Return the object name, within the bucket, of a collector's DAG file."""
    file_name = f"{collector_name}.py"
    return "airflow/dags/" + file_name
356
+
357
+
358
def _gcp_dag_content(
    collector_name: str, schedule: str, paused: bool,
    project_id: str, region: str, job_name: str,
) -> str:
    """Render the Python source of the Airflow DAG that runs a collector's job.

    The DAG has a single ``CloudRunExecuteJobOperator`` task named
    ``run_<collector_name>``. Every interpolated value is emitted as a Python
    string literal via ``repr()`` so that quotes, backslashes or newlines in
    a value can neither break the generated file nor alter its code.
    """
    task_id = f"run_{collector_name}"
    return f"""\
# Generated by dcf — do not edit manually
from datetime import datetime
from airflow import DAG
from airflow.providers.google.cloud.operators.cloud_run import CloudRunExecuteJobOperator

with DAG(
    dag_id={str(collector_name)!r},
    schedule_interval={str(schedule)!r},
    start_date=datetime(2024, 1, 1),
    catchup=False,
    is_paused_upon_creation={bool(paused)!r},
    tags=["dcf"],
) as dag:
    run_job = CloudRunExecuteJobOperator(
        task_id={task_id!r},
        project_id={str(project_id)!r},
        region={str(region)!r},
        job_name={str(job_name)!r},
    )
"""
384
+
385
+
386
def _write_dag_gcs(dag_content: str, collector_name: str, warehouse_bucket: str) -> None:
    """Upload the DAG source for a collector to the warehouse bucket as ``text/plain``."""
    from google.cloud import storage

    object_name = _dag_gcs_path(collector_name)
    blob = storage.Client().bucket(warehouse_bucket).blob(object_name)
    blob.upload_from_string(dag_content, content_type="text/plain")
    logger.info("Uploaded DAG to gs://%s/%s", warehouse_bucket, object_name)
393
+
394
+
395
def _delete_dag_gcs(collector_name: str, warehouse_bucket: str) -> None:
    """Delete a collector's DAG object from the warehouse bucket if it exists."""
    from google.cloud import storage

    object_name = _dag_gcs_path(collector_name)
    blob = storage.Client().bucket(warehouse_bucket).blob(object_name)
    if not blob.exists():
        return
    blob.delete()
    logger.info("Deleted DAG gs://%s/%s", warehouse_bucket, object_name)
403
+
404
+
405
def _gcs_dag_files_exist(warehouse_bucket: str) -> bool:
    """Tell whether at least one object is left under gs://<bucket>/airflow/dags/."""
    from google.cloud import storage

    remaining = storage.Client().list_blobs(
        warehouse_bucket, prefix="airflow/dags/", max_results=1
    )
    # At most one result is requested; any hit at all means DAGs remain.
    return any(True for _ in remaining)
411
+
412
+
413
+ # ------------------------------------------------------------------ #
414
+ # Artifact Registry #
415
+ # ------------------------------------------------------------------ #
416
+
417
def _ensure_artifact_registry_repo(project_id: str, region: str) -> None:
    """Create the ``dcf-runner`` Docker repository unless ``describe`` already finds it.

    Raises:
        RuntimeError: if the repository cannot be created.
    """
    describe_cmd = [
        "gcloud", "artifacts", "repositories", "describe", "dcf-runner",
        "--location", region, "--project", project_id,
    ]
    if subprocess.run(describe_cmd, capture_output=True).returncode == 0:
        return

    create_cmd = [
        "gcloud", "artifacts", "repositories", "create", "dcf-runner",
        "--repository-format=docker",
        "--location", region,
        "--project", project_id,
    ]
    created = subprocess.run(create_cmd, capture_output=True, text=True)
    if created.returncode == 0:
        return

    raise RuntimeError(
        f"Failed to create Artifact Registry repository: {created.stderr}\n"
        "Ensure the API is enabled:\n"
        " gcloud services enable artifactregistry.googleapis.com"
    )
441
+
442
+
443
+ # ------------------------------------------------------------------ #
444
+ # GCP Airflow stack #
445
+ # ------------------------------------------------------------------ #
446
+
447
def _airflow_image_uri(project_id: str, region: str) -> str:
    """Return the ``:latest`` URI of the Airflow image in the ``dcf-runner`` repository."""
    registry_host = f"{region}-docker.pkg.dev"
    return f"{registry_host}/{project_id}/dcf-runner/dcf-airflow:latest"
449
+
450
+
451
def _airflow_build_context() -> Path:
    """Return the Airflow build context directory, creating it when missing."""
    context_dir = _BUILD_DIR.joinpath("airflow-gcp")
    context_dir.mkdir(parents=True, exist_ok=True)
    return context_dir
455
+
456
+
457
def _airflow_content_hash() -> str:
    """Return the SHA256 hex digest of the packaged Airflow Dockerfile template."""
    # Templates live in dcf/infra/templates/ (the same directory that
    # _copy_module_to_work_dir copies from), not under infra/modules/.
    template = _DCF_PKG_DIR / "infra" / "templates" / "airflow.Dockerfile.tftpl"
    return hashlib.sha256(template.read_bytes()).hexdigest()
460
+
461
+
462
def _generate_airflow_credentials(project_root: Path) -> dict:
    """Load the Airflow credentials from project.yml, creating missing ones.

    The admin password is prompted for interactively when absent; the Fernet
    key and database password are generated. Anything newly obtained is
    written back to project.yml.

    Returns:
        Dict with ``db_password``, ``admin_password`` and ``fernet_key``.

    Raises:
        RuntimeError: if an empty admin password is entered at the prompt.
    """
    cfg_path = project_root / "project.yml"
    cfg: dict = {}
    if cfg_path.exists():
        cfg = yaml.safe_load(cfg_path.read_text()) or {}

    def _persist() -> None:
        # Rewrite project.yml with the current settings, keeping key order.
        cfg_path.write_text(yaml.dump(cfg, default_flow_style=False, sort_keys=False))

    admin_password = cfg.get("airflow_admin_password")
    if not admin_password:
        import getpass

        admin_password = getpass.getpass("Enter Airflow admin password: ").strip()
        if not admin_password:
            raise RuntimeError("Airflow admin password cannot be empty.")
        cfg["airflow_admin_password"] = admin_password
        # Saved immediately so the prompt is not repeated if a later step fails.
        _persist()
        logger.info("Saved airflow_admin_password to project.yml")

    generated_any = False

    fernet_key = cfg.get("airflow_fernet_key")
    if not fernet_key:
        from cryptography.fernet import Fernet

        fernet_key = Fernet.generate_key().decode()
        cfg["airflow_fernet_key"] = fernet_key
        generated_any = True

    db_password = cfg.get("airflow_db_password")
    if not db_password:
        db_password = secrets.token_urlsafe(16)
        cfg["airflow_db_password"] = db_password
        generated_any = True

    if generated_any:
        _persist()

    return {
        "db_password": db_password,
        "admin_password": admin_password,
        "fernet_key": fernet_key,
    }
500
+
501
+
502
def _tf_apply_airflow_gcp(
    build_context: Path,
    image_uri: str,
    content_hash: str,
    gcp_config: dict,
    credentials: dict,
    project_root: Path,
) -> dict:
    """Apply the GCP Airflow Terraform module and return its outputs.

    Stages the module under ``<state dir>/airflow/gcp``, writes
    ``terraform.tfvars.json`` (which includes the credentials), runs ``init``
    and ``apply``, and returns the parsed ``terraform output -json`` result,
    or an empty dict when that command prints nothing.
    """
    work_dir = _tf_state_dir(project_root).joinpath("airflow", "gcp")
    work_dir.mkdir(parents=True, exist_ok=True)
    _TF_PLUGIN_CACHE.mkdir(parents=True, exist_ok=True)

    _copy_module_to_work_dir(_BATCH_MODULE_DIR / "gcp" / "airflow", work_dir)

    tfvars = {
        "image_uri": image_uri,
        "build_context": str(build_context),
        "content_hash": content_hash,
    }
    tfvars.update(
        (key, gcp_config[key])
        for key in ("project_id", "region", "sa_email", "warehouse_bucket")
    )
    tfvars.update(
        (key, credentials[key])
        for key in ("db_password", "admin_password", "fernet_key")
    )
    (work_dir / "terraform.tfvars.json").write_text(json.dumps(tfvars, indent=2))

    env = _tf_env()
    for subcommand in (["init", "-reconfigure"], ["apply", "-auto-approve"]):
        _tf_run(["terraform", *subcommand], work_dir, env)

    output_proc = subprocess.run(
        ["terraform", "output", "-json"],
        cwd=str(work_dir), env=env, capture_output=True, text=True,
    )
    raw = output_proc.stdout
    if not raw.strip():
        return {}
    return json.loads(raw)
539
+
540
+
541
def _tf_destroy_airflow_gcp(project_root: Path) -> None:
    """Destroy the GCP Airflow stack and remove its Terraform working directory.

    Does nothing when the working directory does not exist.
    """
    work_dir = _tf_state_dir(project_root).joinpath("airflow", "gcp")
    if work_dir.exists():
        _tf_run(["terraform", "destroy", "-auto-approve"], work_dir, _tf_env())
        shutil.rmtree(work_dir)