data-collection-framework 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data_collection_framework-0.1.0.dist-info/METADATA +19 -0
- data_collection_framework-0.1.0.dist-info/RECORD +44 -0
- data_collection_framework-0.1.0.dist-info/WHEEL +5 -0
- data_collection_framework-0.1.0.dist-info/entry_points.txt +2 -0
- data_collection_framework-0.1.0.dist-info/top_level.txt +1 -0
- dcf/__init__.py +4 -0
- dcf/cli.py +841 -0
- dcf/config/__init__.py +4 -0
- dcf/config/loader.py +77 -0
- dcf/config/models.py +240 -0
- dcf/engine/__init__.py +6 -0
- dcf/engine/fetcher.py +118 -0
- dcf/engine/iterator.py +96 -0
- dcf/engine/projector.py +56 -0
- dcf/engine/runner.py +90 -0
- dcf/engine/transforms.py +41 -0
- dcf/gcp/__init__.py +0 -0
- dcf/gcp/_collector_utils.py +87 -0
- dcf/gcp/auth.py +1 -0
- dcf/gcp/batch_deploy.py +548 -0
- dcf/gcp/bootstrap.py +131 -0
- dcf/gcp/gcloud.py +42 -0
- dcf/gcp/terraform.py +151 -0
- dcf/infra/modules/batch_collector/gcp/airflow/main.tf +194 -0
- dcf/infra/modules/batch_collector/gcp/airflow/outputs.tf +9 -0
- dcf/infra/modules/batch_collector/gcp/airflow/variables.tf +52 -0
- dcf/infra/modules/batch_collector/gcp/main.tf +70 -0
- dcf/infra/modules/batch_collector/gcp/outputs.tf +4 -0
- dcf/infra/modules/batch_collector/gcp/variables.tf +40 -0
- dcf/infra/modules/batch_collector/local/airflow/main.tf +64 -0
- dcf/infra/modules/batch_collector/local/airflow/outputs.tf +9 -0
- dcf/infra/modules/batch_collector/local/airflow/variables.tf +59 -0
- dcf/infra/modules/batch_collector/local/main.tf +32 -0
- dcf/infra/modules/batch_collector/local/outputs.tf +4 -0
- dcf/infra/modules/batch_collector/local/variables.tf +25 -0
- dcf/infra/templates/airflow.Dockerfile.tftpl +6 -0
- dcf/infra/templates/batch_collector.Dockerfile.tftpl +14 -0
- dcf/infra/templates/docker-compose.yml.tftpl +76 -0
- dcf/local_deploy.py +756 -0
- dcf/project.py +23 -0
- dcf/spark_session.py +66 -0
- dcf/warehouse_reader.py +323 -0
- dcf/writer/__init__.py +3 -0
- dcf/writer/iceberg.py +315 -0
dcf/engine/transforms.py
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Any
|
|
4
|
+
|
|
5
|
+
from ..config.models import ArrayJoinTransform, CrsReprojectTransform, Transform
|
|
6
|
+
from .fetcher import _get_nested
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def apply_transform(transform: Transform, record: dict) -> Any:
    """Apply a configured transform to one record and return the derived value.

    The concrete transform class selects the handler. Handlers are tried in
    a fixed order: CRS reprojection first, then array join.

    Raises:
        ValueError: if ``transform`` is not an instance of a supported class.
    """
    dispatch = (
        (CrsReprojectTransform, _crs_reproject),
        (ArrayJoinTransform, _array_join),
    )
    for transform_cls, handler in dispatch:
        if isinstance(transform, transform_cls):
            return handler(transform, record)
    raise ValueError(f"Unknown transform type: {type(transform)}")
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _array_join(t: ArrayJoinTransform, record: dict) -> str | None:
    """Join the list found at ``t.path`` in ``record`` into a single string.

    Returns ``None`` when the lookup yields ``None``. A value that is not a
    list is returned as ``str(value)``; list items are each passed through
    ``str`` and joined with ``t.separator``.
    """
    found = _get_nested(record, t.path)
    if found is None:
        return None
    if isinstance(found, list):
        return t.separator.join(map(str, found))
    return str(found)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _crs_reproject(t: CrsReprojectTransform, record: dict) -> float | None:
|
|
27
|
+
from pyproj import Transformer as ProjTransformer
|
|
28
|
+
|
|
29
|
+
try:
|
|
30
|
+
raw_x = record.get(t.from_columns[0])
|
|
31
|
+
raw_y = record.get(t.from_columns[1])
|
|
32
|
+
if raw_x is None or raw_y is None:
|
|
33
|
+
return None
|
|
34
|
+
x = float(raw_x)
|
|
35
|
+
y = float(raw_y)
|
|
36
|
+
except (TypeError, ValueError):
|
|
37
|
+
return None
|
|
38
|
+
|
|
39
|
+
proj = ProjTransformer.from_crs(t.from_crs, t.to_crs, always_xy=True)
|
|
40
|
+
lon, lat = proj.transform(x, y)
|
|
41
|
+
return lon if t.component == "x" else lat
|
dcf/gcp/__init__.py
ADDED
|
File without changes
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
"""Shared schema projection utilities used by both beam_runner (GCP/Dataflow)
|
|
2
|
+
and local_stream_runner (local Kafka)."""
|
|
3
|
+
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
6
|
+
import json
|
|
7
|
+
import logging
|
|
8
|
+
from datetime import datetime, timezone
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
|
|
11
|
+
import pyarrow as pa
|
|
12
|
+
import yaml
|
|
13
|
+
|
|
14
|
+
logger = logging.getLogger(__name__)
|
|
15
|
+
|
|
16
|
+
# Maps the type names used in a collector's schema columns to the PyArrow
# data type for that column. `to_pyarrow_schema` falls back to `pa.string()`
# for any type name that is not listed here.
TYPE_MAP: dict[str, pa.DataType] = {
    "string": pa.string(),
    "integer": pa.int64(),
    "float": pa.float64(),
    "boolean": pa.bool_(),
    # Microsecond precision, stored as UTC.
    "timestamp": pa.timestamp("us", tz="UTC"),
    "date": pa.date32(),
}
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def load_columns(collector_name: str) -> list[dict]:
    """Return the ``schema.columns`` list from ``collectors/<collector_name>.yml``.

    The path is relative to the current working directory.
    """
    config_file = Path("collectors", f"{collector_name}.yml")
    parsed = yaml.safe_load(config_file.read_text())
    schema = parsed["schema"]
    return schema["columns"]
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def to_pyarrow_schema(columns: list[dict]) -> pa.Schema:
    """Build a PyArrow schema with one field per column definition.

    Each column dict must have a ``name``. Its optional ``type`` defaults to
    ``"string"``, and type names missing from ``TYPE_MAP`` also map to
    ``pa.string()``.
    """
    fields = []
    for column in columns:
        field_name = column["name"]
        type_name = column.get("type", "string")
        arrow_type = TYPE_MAP.get(type_name, pa.string())
        fields.append(pa.field(field_name, arrow_type))
    return pa.schema(fields)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def _parse_utc_timestamp(text: str):
    """Parse an ISO-like ``YYYY-MM-DDTHH:MM:SS[.ffffff]`` string into UTC.

    A trailing ``Z`` or an explicit ``+HH:MM`` offset is honoured. Strings
    without any offset are assumed to already be UTC. Returns the input
    string unchanged when it matches none of the supported layouts.
    """
    normalized = text[:-1] + "+00:00" if text.endswith("Z") else text
    # First try layouts that carry an offset, then the naive ones.
    for fmt in ("%Y-%m-%dT%H:%M:%S%z", "%Y-%m-%dT%H:%M:%S.%f%z"):
        try:
            parsed = datetime.strptime(normalized, fmt)
        except ValueError:
            continue
        return parsed.astimezone(timezone.utc)
    for fmt in ("%Y-%m-%dT%H:%M:%S", "%Y-%m-%dT%H:%M:%S.%f"):
        try:
            parsed = datetime.strptime(normalized, fmt)
        except ValueError:
            continue
        return parsed.replace(tzinfo=timezone.utc)
    return text


def cast_value(value, col_type: str | None):
    """Convert a raw JSON value to the Python type for ``col_type``.

    Args:
        value: the raw value; ``None`` is always returned as ``None``.
        col_type: one of ``"integer"``, ``"float"``, ``"boolean"``,
            ``"timestamp"``, ``"date"``; anything else (including ``None``)
            is treated as a string column.

    Returns:
        * integer / float: ``int(value)`` / ``float(value)``.
        * boolean: strings are interpreted textually, so ``"false"``, ``"f"``,
          ``"0"``, ``"no"``, ``"n"``, ``"off"`` and the empty string are
          ``False`` and any other string is ``True``. Non-strings use
          ``bool(value)``.
        * timestamp: strings are parsed to a UTC-aware ``datetime``. Strings
          that cannot be parsed, and non-string values, are returned
          unchanged.
        * date: ``YYYY-MM-DD`` strings become ``date`` objects. Anything else
          is returned unchanged.
        * otherwise: ``str(value)``.

    Raises:
        ValueError / TypeError: from ``int()`` / ``float()`` on bad input.
    """
    if value is None:
        return None
    if col_type == "integer":
        return int(value)
    if col_type == "float":
        return float(value)
    if col_type == "boolean":
        if isinstance(value, str):
            # bool("false") is True, so textual booleans need explicit parsing.
            falsy_tokens = ("", "false", "f", "0", "no", "n", "off")
            return value.strip().lower() not in falsy_tokens
        return bool(value)
    if col_type == "timestamp":
        if isinstance(value, str):
            return _parse_utc_timestamp(value)
        return value
    if col_type == "date":
        if isinstance(value, str):
            try:
                return datetime.strptime(value, "%Y-%m-%d").date()
            except ValueError:
                return value
        return value
    return str(value)
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def project_message(msg_bytes: bytes, columns: list[dict]) -> dict | None:
    """Decode a JSON message and project it onto the configured columns.

    For every column the value is looked up by following the dot-separated
    ``path`` (or the column ``name`` when no path is set) through nested
    dicts, then converted with ``cast_value`` using the column ``type``.
    A path that cannot be followed yields ``None`` for that column.

    Returns:
        A dict keyed by column name, or ``None`` (after logging a warning)
        when the message is not valid UTF-8 JSON.
    """
    try:
        payload = json.loads(msg_bytes.decode("utf-8"))
    except (json.JSONDecodeError, UnicodeDecodeError):
        logger.warning("Skipping unparseable message")
        return None

    projected: dict = {}
    for column in columns:
        dotted_path = column.get("path") or column["name"]
        current = payload
        for key in dotted_path.split("."):
            if not isinstance(current, dict):
                # Ran into a scalar/list before the path was exhausted.
                current = None
                break
            current = current.get(key)
        projected[column["name"]] = cast_value(current, column.get("type"))
    return projected
|
dcf/gcp/auth.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# Replaced by dcf/gcp/gcloud.py
|
dcf/gcp/batch_deploy.py
ADDED
|
@@ -0,0 +1,548 @@
|
|
|
1
|
+
"""Batch collector deployment: builds a container image via Cloud Build, then uses
|
|
2
|
+
Terraform to provision a Cloud Run job. DAG is written directly to GCS for the
|
|
3
|
+
custom Airflow stack (no Cloud Composer)."""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
import hashlib
|
|
8
|
+
import json
|
|
9
|
+
import logging
|
|
10
|
+
import os
|
|
11
|
+
import secrets
|
|
12
|
+
import shutil
|
|
13
|
+
import subprocess
|
|
14
|
+
from datetime import datetime, timezone
|
|
15
|
+
from pathlib import Path
|
|
16
|
+
|
|
17
|
+
import yaml
|
|
18
|
+
|
|
19
|
+
logger = logging.getLogger(__name__)
|
|
20
|
+
|
|
21
|
+
# Directory of the `dcf` package itself (two levels up from this file).
_DCF_PKG_DIR = Path(__file__).parent.parent  # dcf/ package
# Directory that contains the `dcf` package directory; `_write_pyproject_toml`
# looks for a pyproject.toml here.
_DCF_REPO_ROOT = _DCF_PKG_DIR.parent
# Terraform modules for batch collectors shipped inside the package.
_BATCH_MODULE_DIR = _DCF_PKG_DIR / "infra" / "modules" / "batch_collector"
# Per-user scratch area where build contexts are assembled.
_BUILD_DIR = Path.home() / ".dcf" / "build"
# Passed to Terraform as TF_PLUGIN_CACHE_DIR (see `_tf_env`).
_TF_PLUGIN_CACHE = Path.home() / ".dcf" / ".plugin-cache"
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _tf_state_dir(project_root: Path) -> Path:
|
|
29
|
+
"""Return the Terraform state directory for this project.
|
|
30
|
+
|
|
31
|
+
Defaults to <project_root>/.dcf/terraform; can be overridden with
|
|
32
|
+
`terraform_state_dir` in project.yml.
|
|
33
|
+
"""
|
|
34
|
+
cfg_path = project_root / "project.yml"
|
|
35
|
+
if cfg_path.exists():
|
|
36
|
+
cfg = yaml.safe_load(cfg_path.read_text()) or {}
|
|
37
|
+
custom = cfg.get("terraform_state_dir")
|
|
38
|
+
if custom:
|
|
39
|
+
return Path(custom).expanduser()
|
|
40
|
+
return project_root / ".dcf" / "terraform"
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def _write_pyproject_toml(dest: Path) -> None:
    """Place a ``pyproject.toml`` for the dcf package into ``dest``.

    When a ``pyproject.toml`` exists next to the package directory (source
    checkout) it is copied verbatim. Otherwise one is generated from the
    installed distribution's metadata, keeping only requirements that are
    not tied to an extra.

    Raises:
        importlib.metadata.PackageNotFoundError: if no installed distribution
            metadata can be found for the package.
    """
    repo_pyproject = _DCF_REPO_ROOT / "pyproject.toml"
    if repo_pyproject.exists():
        shutil.copy2(repo_pyproject, dest / "pyproject.toml")
        return

    import importlib.metadata

    # The wheel is published as "data-collection-framework". "dcf" is kept as
    # a fallback because that is the name written into the file generated below.
    for dist_name in ("data-collection-framework", "dcf"):
        try:
            version = importlib.metadata.version(dist_name)
        except importlib.metadata.PackageNotFoundError:
            continue
        reqs = importlib.metadata.requires(dist_name) or []
        break
    else:
        raise importlib.metadata.PackageNotFoundError("data-collection-framework")

    direct_deps = [r for r in reqs if "extra ==" not in r]
    # json.dumps yields a correctly escaped double-quoted string, so
    # requirements with quoted environment markers stay valid TOML.
    deps_str = "\n".join(f" {json.dumps(r)}," for r in direct_deps)
    (dest / "pyproject.toml").write_text(
        '[project]\n'
        'name = "dcf"\n'
        f'version = "{version}"\n'
        'requires-python = ">=3.12"\n'
        f'dependencies = [\n{deps_str}\n]\n\n'
        '[project.scripts]\n'
        'dcf = "dcf.cli:app"\n\n'
        '[tool.setuptools.packages.find]\n'
        'include = ["dcf*"]\n'
    )
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
# ------------------------------------------------------------------ #
|
|
70
|
+
# Public API #
|
|
71
|
+
# ------------------------------------------------------------------ #
|
|
72
|
+
|
|
73
|
+
def deploy(
    collector_name: str,
    schedule: str,
    paused: bool,
    project_root: Path,
    gcp_config: dict,
) -> dict:
    """Deploy one collector to GCP and return its deployment record.

    Steps, in order: sync the build context, make sure the Artifact Registry
    repository exists, apply the per-collector Terraform module, upload the
    generated DAG to the warehouse bucket, then apply the Airflow Terraform
    module.

    Args:
        collector_name: collector to deploy; also used as the DAG id.
        schedule: schedule string placed into the generated DAG.
        paused: whether the DAG is created paused.
        project_root: project directory (holds ``project.yml``).
        gcp_config: must contain ``project_id``, ``region``,
            ``warehouse_bucket`` and ``sa_email``.

    Returns:
        The deployment state dict to write into project.yml.
    """
    gcp_project = gcp_config["project_id"]
    gcp_region = gcp_config["region"]
    bucket_name = gcp_config["warehouse_bucket"]
    service_account = gcp_config["sa_email"]

    collector_image = _image_uri(gcp_project, gcp_region, collector_name)

    print(f" Syncing build context for '{collector_name}'...", flush=True)
    context_dir = _sync_build_context(project_root, collector_name, gcp_config)
    context_digest = _content_hash(context_dir)

    print(" Ensuring Artifact Registry repository exists...", flush=True)
    _ensure_artifact_registry_repo(gcp_project, gcp_region)

    print(" Applying Terraform (Cloud Run job + Cloud Build)...", flush=True)
    print(" (First build may take a few minutes)", flush=True)
    cloud_run_job = _terraform_apply_collector(
        collector_name=collector_name,
        image_uri=collector_image,
        sa_email=service_account,
        build_context=context_dir,
        content_hash=context_digest,
        project_id=gcp_project,
        region=gcp_region,
        project_root=project_root,
    )

    print(" Writing DAG to GCS...", flush=True)
    dag_source = _gcp_dag_content(
        collector_name=collector_name,
        schedule=schedule,
        paused=paused,
        project_id=gcp_project,
        region=gcp_region,
        job_name=cloud_run_job,
    )
    _write_dag_gcs(dag_source, collector_name, bucket_name)

    print(" Provisioning GCP Airflow stack...", flush=True)
    airflow_credentials = _generate_airflow_credentials(project_root)
    stack_outputs = _tf_apply_airflow_gcp(
        build_context=_airflow_build_context(),
        image_uri=_airflow_image_uri(gcp_project, gcp_region),
        content_hash=_airflow_content_hash(),
        gcp_config=gcp_config,
        credentials=airflow_credentials,
        project_root=project_root,
    )

    webserver = stack_outputs.get("webserver_url", {})
    airflow_url = webserver.get("value", "")
    if airflow_url:
        print(f" Airflow UI: {airflow_url}", flush=True)

    deployed_at = datetime.now(tz=timezone.utc).isoformat(timespec="seconds")
    return {
        "schedule": schedule,
        "dag_id": collector_name,
        "cloud_run_job": cloud_run_job,
        "airflow_url": airflow_url,
        "image_uri": collector_image,
        "deployed_at": deployed_at,
    }
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def undeploy(collector_name: str, deployment: dict, gcp_config: dict, project_root: Path) -> None:
    """Tear down a collector: destroy its Terraform resources and delete its DAG.

    If no objects remain under the bucket's DAG prefix afterwards, the
    Airflow stack is destroyed as well. ``deployment`` is accepted but not
    read by this function.
    """
    gcp_project = gcp_config["project_id"]
    gcp_region = gcp_config["region"]
    bucket_name = gcp_config["warehouse_bucket"]

    print(f" Destroying Terraform resources for '{collector_name}'...", flush=True)
    _terraform_destroy_collector(collector_name, gcp_project, gcp_region, project_root)

    print(" Deleting DAG from GCS...", flush=True)
    _delete_dag_gcs(collector_name, bucket_name)

    if _gcs_dag_files_exist(bucket_name):
        return
    print(" No remaining DAGs — tearing down Airflow stack...", flush=True)
    _tf_destroy_airflow_gcp(project_root)
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
# ------------------------------------------------------------------ #
|
|
166
|
+
# Build context #
|
|
167
|
+
# ------------------------------------------------------------------ #
|
|
168
|
+
|
|
169
|
+
def _image_uri(project_id: str, region: str, collector_name: str) -> str:
|
|
170
|
+
return f"{region}-docker.pkg.dev/{project_id}/dcf-runner/{collector_name}:latest"
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
def _sync_build_context(
    project_root: Path, collector_name: str, gcp_config: dict
) -> Path:
    """Rebuild the build context at ``~/.dcf/build/gcp/<name>/`` and return it.

    The directory is recreated from scratch and filled with a copy of the
    ``dcf`` package, a ``pyproject.toml``, the project's ``collectors`` and
    ``connectors`` directories (created empty when absent) and a minimal
    ``project.yml`` holding only the GCP catalog settings.
    """
    context_dir = _BUILD_DIR / "gcp" / collector_name
    # Always start clean so files from an earlier sync are not left behind.
    shutil.rmtree(context_dir, ignore_errors=True)
    context_dir.mkdir(parents=True)

    shutil.copytree(_DCF_PKG_DIR, context_dir / "dcf")
    _write_pyproject_toml(context_dir)

    for folder in ("collectors", "connectors"):
        source_dir = project_root / folder
        target_dir = context_dir / folder
        if not source_dir.exists():
            target_dir.mkdir()
            continue
        shutil.copytree(source_dir, target_dir)

    gcp_section = {
        key: gcp_config[key]
        for key in ("project_id", "region", "warehouse_bucket")
    }
    container_config = {"catalog": "gcp", "gcp": gcp_section}
    config_text = yaml.dump(container_config, default_flow_style=False)
    (context_dir / "project.yml").write_text(config_text)

    return context_dir
|
|
205
|
+
|
|
206
|
+
|
|
207
|
+
def _content_hash(build_context: Path) -> str:
|
|
208
|
+
"""SHA256 of all files in build_context, excluding Dockerfile (written by Terraform)."""
|
|
209
|
+
h = hashlib.sha256()
|
|
210
|
+
for path in sorted(build_context.rglob("*")):
|
|
211
|
+
if path.is_file() and path.name != "Dockerfile":
|
|
212
|
+
h.update(path.read_bytes())
|
|
213
|
+
return h.hexdigest()
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
# ------------------------------------------------------------------ #
|
|
217
|
+
# Terraform: per-collector resources #
|
|
218
|
+
# ------------------------------------------------------------------ #
|
|
219
|
+
|
|
220
|
+
def _expected_job_name(collector_name: str) -> str:
|
|
221
|
+
return f"dcf-job-{collector_name.replace('_', '-')}"
|
|
222
|
+
|
|
223
|
+
|
|
224
|
+
def _tf_work_dir(collector_name: str, project_root: Path) -> Path:
    """Return the Terraform working directory for a collector's GCP resources."""
    state_root = _tf_state_dir(project_root)
    return Path(state_root, "collectors", collector_name, "gcp")
|
|
226
|
+
|
|
227
|
+
|
|
228
|
+
def _copy_module_to_work_dir(module_dir: Path, work_dir: Path) -> None:
    """Stage a leaf Terraform module in ``work_dir``.

    Copies the ``*.tf`` files found directly in ``module_dir`` and replaces
    ``work_dir/templates`` with a fresh copy of the packaged shared templates.
    """
    # Only regular files ending in ".tf" are copied, so Terraform's own
    # ".terraform" directory and ".terraform.lock.hcl" are never picked up.
    tf_files = (
        entry
        for entry in module_dir.iterdir()
        if entry.is_file() and entry.suffix == ".tf"
    )
    for tf_file in tf_files:
        shutil.copy2(tf_file, work_dir / tf_file.name)

    shared_templates = _DCF_PKG_DIR / "infra" / "templates"
    staged_templates = work_dir / "templates"
    if staged_templates.exists():
        shutil.rmtree(staged_templates)
    shutil.copytree(shared_templates, staged_templates)
|
|
240
|
+
|
|
241
|
+
|
|
242
|
+
def _tf_env() -> dict:
    """Return a copy of the process environment prepared for Terraform.

    ``TF_INPUT`` is set to ``"0"`` and ``TF_PLUGIN_CACHE_DIR`` to the shared
    plugin cache directory.
    """
    env = dict(os.environ)
    env["TF_INPUT"] = "0"
    env["TF_PLUGIN_CACHE_DIR"] = str(_TF_PLUGIN_CACHE)
    return env
|
|
248
|
+
|
|
249
|
+
|
|
250
|
+
def _tf_run(cmd: list[str], work_dir: Path, env: dict) -> None:
    """Run a Terraform command in ``work_dir`` and fail loudly on error.

    Output is captured rather than streamed. On a non-zero exit the full
    stdout/stderr are logged and a ``RuntimeError`` carrying the last 2000
    characters of stderr is raised.
    """
    completed = subprocess.run(
        cmd,
        cwd=str(work_dir),
        env=env,
        capture_output=True,
        text=True,
    )
    if completed.returncode == 0:
        logger.info("terraform %s OK", cmd[1])
        return

    logger.error(
        "Terraform command failed: %s\nSTDOUT: %s\nSTDERR: %s",
        " ".join(cmd), completed.stdout, completed.stderr,
    )
    stderr_tail = completed.stderr[-2000:]
    raise RuntimeError(
        f"terraform {cmd[1]} failed (exit {completed.returncode}): {stderr_tail}"
    )
|
|
261
|
+
|
|
262
|
+
|
|
263
|
+
def _terraform_apply_collector(
    collector_name: str,
    image_uri: str,
    sa_email: str,
    build_context: Path,
    content_hash: str,
    project_id: str,
    region: str,
    project_root: Path,
) -> str:
    """Provision Cloud Run job via Terraform + Cloud Build. Returns the job name.

    Stages the GCP batch-collector module in the collector's working
    directory, writes ``terraform.tfvars.json``, runs ``init``, tries to
    import an already existing job, runs ``apply`` and reads the ``job_name``
    output.

    Raises:
        RuntimeError: if a Terraform command fails or the ``job_name``
            output cannot be read.
    """
    work_dir = _tf_work_dir(collector_name, project_root)
    work_dir.mkdir(parents=True, exist_ok=True)
    _TF_PLUGIN_CACHE.mkdir(parents=True, exist_ok=True)

    _copy_module_to_work_dir(_BATCH_MODULE_DIR / "gcp", work_dir)

    tfvars = {
        "project_id": project_id,
        "region": region,
        "collector_name": collector_name,
        "image_uri": image_uri,
        "sa_email": sa_email,
        "build_context": str(build_context),
        "content_hash": content_hash,
        "java_enabled": False,
    }
    (work_dir / "terraform.tfvars.json").write_text(json.dumps(tfvars, indent=2))

    env = _tf_env()
    _tf_run(["terraform", "init", "-reconfigure"], work_dir, env)
    _import_existing_cloud_run_job(collector_name, project_id, region, work_dir, env)
    _tf_run(["terraform", "apply", "-auto-approve"], work_dir, env)

    output_proc = subprocess.run(
        ["terraform", "output", "-json"],
        cwd=str(work_dir), env=env, capture_output=True, text=True,
    )
    # Report a failed `terraform output` explicitly instead of letting it
    # surface as a JSON decoding error on empty stdout.
    if output_proc.returncode != 0:
        raise RuntimeError(
            f"terraform output failed (exit {output_proc.returncode}): "
            f"{output_proc.stderr[-2000:]}"
        )
    try:
        outputs = json.loads(output_proc.stdout)
        return outputs["job_name"]["value"]
    except (json.JSONDecodeError, KeyError, TypeError) as err:
        raise RuntimeError(
            f"terraform output for collector '{collector_name}' "
            "did not contain a 'job_name' value"
        ) from err
|
|
304
|
+
|
|
305
|
+
|
|
306
|
+
def _terraform_destroy_collector(
    collector_name: str, project_id: str, region: str, project_root: Path,
) -> None:
    """Run ``terraform destroy`` for a collector and delete its working directory.

    Raises:
        RuntimeError: if the collector has no Terraform working directory
            (the message includes the ``gcloud`` command for manual cleanup),
            or if Terraform itself fails.
    """
    work_dir = _tf_work_dir(collector_name, project_root)
    if work_dir.exists():
        _tf_run(["terraform", "destroy", "-auto-approve"], work_dir, _tf_env())
        shutil.rmtree(work_dir)
        return

    job_name = _expected_job_name(collector_name)
    message = (
        f"No Terraform state found for collector '{collector_name}' at {work_dir}.\n"
        "If you deployed from a different machine, delete the Cloud Run job manually:\n"
        f" gcloud run jobs delete {job_name} "
        f"--region {region} --project {project_id} --quiet"
    )
    raise RuntimeError(message)
|
|
322
|
+
|
|
323
|
+
|
|
324
|
+
def _import_existing_cloud_run_job(
    collector_name: str, project_id: str, region: str, work_dir: Path, env: dict,
) -> None:
    """Adopt an already existing Cloud Run job into Terraform state.

    Does nothing when ``gcloud run jobs describe`` exits non-zero for the
    expected job name. Otherwise runs ``terraform import``; a failed import
    is only logged, never raised, so the following apply can still proceed.
    """
    job_name = _expected_job_name(collector_name)
    describe_cmd = [
        "gcloud", "run", "jobs", "describe", job_name,
        "--region", region, "--project", project_id,
    ]
    if subprocess.run(describe_cmd, capture_output=True).returncode != 0:
        return

    resource_id = f"projects/{project_id}/locations/{region}/jobs/{job_name}"
    imported = subprocess.run(
        ["terraform", "import", "google_cloud_run_v2_job.collector", resource_id],
        cwd=str(work_dir), env=env, capture_output=True, text=True,
    )
    if imported.returncode == 0:
        logger.info("Imported existing Cloud Run job '%s' into Terraform state", job_name)
        return

    combined_output = imported.stdout + imported.stderr
    if "already managed by Terraform" in combined_output:
        logger.info("Cloud Run job '%s' already in Terraform state", job_name)
    else:
        logger.warning("terraform import returned non-zero: %s", imported.stderr[-500:])
|
|
348
|
+
|
|
349
|
+
|
|
350
|
+
# ------------------------------------------------------------------ #
|
|
351
|
+
# GCS DAG management #
|
|
352
|
+
# ------------------------------------------------------------------ #
|
|
353
|
+
|
|
354
|
+
def _dag_gcs_path(collector_name: str) -> str:
|
|
355
|
+
return f"airflow/dags/{collector_name}.py"
|
|
356
|
+
|
|
357
|
+
|
|
358
|
+
def _gcp_dag_content(
|
|
359
|
+
collector_name: str, schedule: str, paused: bool,
|
|
360
|
+
project_id: str, region: str, job_name: str,
|
|
361
|
+
) -> str:
|
|
362
|
+
paused_str = "True" if paused else "False"
|
|
363
|
+
return f"""\
|
|
364
|
+
# Generated by dcf — do not edit manually
|
|
365
|
+
from datetime import datetime
|
|
366
|
+
from airflow import DAG
|
|
367
|
+
from airflow.providers.google.cloud.operators.cloud_run import CloudRunExecuteJobOperator
|
|
368
|
+
|
|
369
|
+
with DAG(
|
|
370
|
+
dag_id="{collector_name}",
|
|
371
|
+
schedule_interval="{schedule}",
|
|
372
|
+
start_date=datetime(2024, 1, 1),
|
|
373
|
+
catchup=False,
|
|
374
|
+
is_paused_upon_creation={paused_str},
|
|
375
|
+
tags=["dcf"],
|
|
376
|
+
) as dag:
|
|
377
|
+
run_job = CloudRunExecuteJobOperator(
|
|
378
|
+
task_id="run_{collector_name}",
|
|
379
|
+
project_id="{project_id}",
|
|
380
|
+
region="{region}",
|
|
381
|
+
job_name="{job_name}",
|
|
382
|
+
)
|
|
383
|
+
"""
|
|
384
|
+
|
|
385
|
+
|
|
386
|
+
def _write_dag_gcs(dag_content: str, collector_name: str, warehouse_bucket: str) -> None:
    """Upload the DAG source to the collector's DAG object in ``warehouse_bucket``."""
    from google.cloud import storage

    object_name = _dag_gcs_path(collector_name)
    target = storage.Client().bucket(warehouse_bucket).blob(object_name)
    target.upload_from_string(dag_content, content_type="text/plain")
    logger.info("Uploaded DAG to gs://%s/%s", warehouse_bucket, object_name)
|
|
393
|
+
|
|
394
|
+
|
|
395
|
+
def _delete_dag_gcs(collector_name: str, warehouse_bucket: str) -> None:
    """Delete the collector's DAG object from ``warehouse_bucket`` if it exists."""
    from google.cloud import storage

    object_name = _dag_gcs_path(collector_name)
    target = storage.Client().bucket(warehouse_bucket).blob(object_name)
    if not target.exists():
        return
    target.delete()
    logger.info("Deleted DAG gs://%s/%s", warehouse_bucket, object_name)
|
|
403
|
+
|
|
404
|
+
|
|
405
|
+
def _gcs_dag_files_exist(warehouse_bucket: str) -> bool:
    """Tell whether at least one object exists under ``airflow/dags/`` in the bucket."""
    from google.cloud import storage

    listing = storage.Client().list_blobs(
        warehouse_bucket, prefix="airflow/dags/", max_results=1
    )
    # At most one result is requested; seeing any entry is enough.
    for _ in listing:
        return True
    return False
|
|
411
|
+
|
|
412
|
+
|
|
413
|
+
# ------------------------------------------------------------------ #
|
|
414
|
+
# Artifact Registry #
|
|
415
|
+
# ------------------------------------------------------------------ #
|
|
416
|
+
|
|
417
|
+
def _ensure_artifact_registry_repo(project_id: str, region: str) -> None:
    """Create the ``dcf-runner`` Docker repository unless ``describe`` already finds it.

    Raises:
        RuntimeError: if the repository has to be created and creation fails.
    """
    location_args = ["--location", region, "--project", project_id]

    described = subprocess.run(
        ["gcloud", "artifacts", "repositories", "describe", "dcf-runner", *location_args],
        capture_output=True,
    )
    if described.returncode == 0:
        return

    created = subprocess.run(
        [
            "gcloud", "artifacts", "repositories", "create", "dcf-runner",
            "--repository-format=docker",
            *location_args,
        ],
        capture_output=True, text=True,
    )
    if created.returncode == 0:
        return
    raise RuntimeError(
        f"Failed to create Artifact Registry repository: {created.stderr}\n"
        "Ensure the API is enabled:\n"
        " gcloud services enable artifactregistry.googleapis.com"
    )
|
|
441
|
+
|
|
442
|
+
|
|
443
|
+
# ------------------------------------------------------------------ #
|
|
444
|
+
# GCP Airflow stack #
|
|
445
|
+
# ------------------------------------------------------------------ #
|
|
446
|
+
|
|
447
|
+
def _airflow_image_uri(project_id: str, region: str) -> str:
|
|
448
|
+
return f"{region}-docker.pkg.dev/{project_id}/dcf-runner/dcf-airflow:latest"
|
|
449
|
+
|
|
450
|
+
|
|
451
|
+
def _airflow_build_context() -> Path:
    """Return the Airflow image build directory, creating it when missing."""
    context_dir = _BUILD_DIR.joinpath("airflow-gcp")
    context_dir.mkdir(parents=True, exist_ok=True)
    return context_dir
|
|
455
|
+
|
|
456
|
+
|
|
457
|
+
def _airflow_content_hash() -> str:
    """Return the SHA-256 hex digest of the packaged Airflow Dockerfile template.

    The template is read from ``dcf/infra/templates``, the same directory
    that ``_copy_module_to_work_dir`` copies the shared templates from.
    """
    template = _DCF_PKG_DIR / "infra" / "templates" / "airflow.Dockerfile.tftpl"
    return hashlib.sha256(template.read_bytes()).hexdigest()
|
|
460
|
+
|
|
461
|
+
|
|
462
|
+
def _generate_airflow_credentials(project_root: Path) -> dict:
    """Load the Airflow credentials from project.yml, creating missing ones.

    A missing admin password is prompted for and saved immediately. A
    missing Fernet key or database password is generated and saved
    afterwards.

    Returns:
        Dict with ``db_password``, ``admin_password`` and ``fernet_key``.

    Raises:
        RuntimeError: if the prompted admin password is empty.
    """
    cfg_path = project_root / "project.yml"
    cfg: dict = {}
    if cfg_path.exists():
        cfg = yaml.safe_load(cfg_path.read_text()) or {}

    def _save() -> None:
        cfg_path.write_text(yaml.dump(cfg, default_flow_style=False, sort_keys=False))

    admin_password = cfg.get("airflow_admin_password")
    if not admin_password:
        import getpass

        admin_password = getpass.getpass("Enter Airflow admin password: ").strip()
        if not admin_password:
            raise RuntimeError("Airflow admin password cannot be empty.")
        cfg["airflow_admin_password"] = admin_password
        _save()
        logger.info("Saved airflow_admin_password to project.yml")

    needs_save = False

    fernet_key = cfg.get("airflow_fernet_key")
    if not fernet_key:
        from cryptography.fernet import Fernet

        fernet_key = Fernet.generate_key().decode()
        cfg["airflow_fernet_key"] = fernet_key
        needs_save = True

    db_password = cfg.get("airflow_db_password")
    if not db_password:
        db_password = secrets.token_urlsafe(16)
        cfg["airflow_db_password"] = db_password
        needs_save = True

    if needs_save:
        _save()

    return dict(
        db_password=db_password,
        admin_password=admin_password,
        fernet_key=fernet_key,
    )
|
|
500
|
+
|
|
501
|
+
|
|
502
|
+
def _tf_apply_airflow_gcp(
    build_context: Path,
    image_uri: str,
    content_hash: str,
    gcp_config: dict,
    credentials: dict,
    project_root: Path,
) -> dict:
    """Apply the GCP Airflow Terraform module and return its outputs.

    Stages the module under ``<state dir>/airflow/gcp``, writes
    ``terraform.tfvars.json`` (including the credentials), then runs ``init``
    and ``apply``.

    Returns:
        The parsed ``terraform output -json`` result, or ``{}`` when that
        command prints nothing.
    """
    work_dir = _tf_state_dir(project_root) / "airflow" / "gcp"
    work_dir.mkdir(parents=True, exist_ok=True)
    _TF_PLUGIN_CACHE.mkdir(parents=True, exist_ok=True)

    _copy_module_to_work_dir(_BATCH_MODULE_DIR / "gcp" / "airflow", work_dir)

    # Key order is kept stable because it determines the written JSON layout.
    tfvars = {
        "image_uri": image_uri,
        "build_context": str(build_context),
        "content_hash": content_hash,
    }
    tfvars.update(
        (key, gcp_config[key])
        for key in ("project_id", "region", "sa_email", "warehouse_bucket")
    )
    tfvars.update(
        (key, credentials[key])
        for key in ("db_password", "admin_password", "fernet_key")
    )
    (work_dir / "terraform.tfvars.json").write_text(json.dumps(tfvars, indent=2))

    env = _tf_env()
    for tf_args in (["init", "-reconfigure"], ["apply", "-auto-approve"]):
        _tf_run(["terraform", *tf_args], work_dir, env)

    output_proc = subprocess.run(
        ["terraform", "output", "-json"],
        cwd=str(work_dir), env=env, capture_output=True, text=True,
    )
    raw = output_proc.stdout
    if not raw.strip():
        return {}
    return json.loads(raw)
|
|
539
|
+
|
|
540
|
+
|
|
541
|
+
def _tf_destroy_airflow_gcp(project_root: Path) -> None:
    """Destroy the GCP Airflow stack and remove its Terraform working directory.

    Does nothing when the working directory does not exist.
    """
    work_dir = _tf_state_dir(project_root) / "airflow" / "gcp"
    if work_dir.exists():
        _tf_run(["terraform", "destroy", "-auto-approve"], work_dir, _tf_env())
        shutil.rmtree(work_dir)
|