data-collection-framework 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data_collection_framework-0.1.0.dist-info/METADATA +19 -0
- data_collection_framework-0.1.0.dist-info/RECORD +44 -0
- data_collection_framework-0.1.0.dist-info/WHEEL +5 -0
- data_collection_framework-0.1.0.dist-info/entry_points.txt +2 -0
- data_collection_framework-0.1.0.dist-info/top_level.txt +1 -0
- dcf/__init__.py +4 -0
- dcf/cli.py +841 -0
- dcf/config/__init__.py +4 -0
- dcf/config/loader.py +77 -0
- dcf/config/models.py +240 -0
- dcf/engine/__init__.py +6 -0
- dcf/engine/fetcher.py +118 -0
- dcf/engine/iterator.py +96 -0
- dcf/engine/projector.py +56 -0
- dcf/engine/runner.py +90 -0
- dcf/engine/transforms.py +41 -0
- dcf/gcp/__init__.py +0 -0
- dcf/gcp/_collector_utils.py +87 -0
- dcf/gcp/auth.py +1 -0
- dcf/gcp/batch_deploy.py +548 -0
- dcf/gcp/bootstrap.py +131 -0
- dcf/gcp/gcloud.py +42 -0
- dcf/gcp/terraform.py +151 -0
- dcf/infra/modules/batch_collector/gcp/airflow/main.tf +194 -0
- dcf/infra/modules/batch_collector/gcp/airflow/outputs.tf +9 -0
- dcf/infra/modules/batch_collector/gcp/airflow/variables.tf +52 -0
- dcf/infra/modules/batch_collector/gcp/main.tf +70 -0
- dcf/infra/modules/batch_collector/gcp/outputs.tf +4 -0
- dcf/infra/modules/batch_collector/gcp/variables.tf +40 -0
- dcf/infra/modules/batch_collector/local/airflow/main.tf +64 -0
- dcf/infra/modules/batch_collector/local/airflow/outputs.tf +9 -0
- dcf/infra/modules/batch_collector/local/airflow/variables.tf +59 -0
- dcf/infra/modules/batch_collector/local/main.tf +32 -0
- dcf/infra/modules/batch_collector/local/outputs.tf +4 -0
- dcf/infra/modules/batch_collector/local/variables.tf +25 -0
- dcf/infra/templates/airflow.Dockerfile.tftpl +6 -0
- dcf/infra/templates/batch_collector.Dockerfile.tftpl +14 -0
- dcf/infra/templates/docker-compose.yml.tftpl +76 -0
- dcf/local_deploy.py +756 -0
- dcf/project.py +23 -0
- dcf/spark_session.py +66 -0
- dcf/warehouse_reader.py +323 -0
- dcf/writer/__init__.py +3 -0
- dcf/writer/iceberg.py +315 -0
dcf/writer/iceberg.py
ADDED
|
@@ -0,0 +1,315 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import datetime
|
|
4
|
+
import io
|
|
5
|
+
import uuid
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
import pandas as pd
|
|
9
|
+
import pytz
|
|
10
|
+
|
|
11
|
+
from ..config.models import Collector, StagingConfig, MergeConfig
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def _gcs_warehouse_bucket() -> str:
    """Return the GCS warehouse bucket name configured in ``project.yml``.

    The value is read from ``gcp.warehouse_bucket`` in the ``project.yml``
    file located in the directory returned by ``find_project_root()``.

    Raises:
        RuntimeError: If ``project.yml`` is missing, is not a mapping, or does
            not define a non-empty ``gcp.warehouse_bucket``.
    """
    import yaml
    from ..project import find_project_root

    cfg_file = find_project_root() / "project.yml"
    loaded = yaml.safe_load(cfg_file.read_text()) if cfg_file.exists() else None
    # An empty YAML document parses to None (and a malformed one may parse to a
    # list or scalar); treat anything that is not a mapping as "no config".
    cfg = loaded if isinstance(loaded, dict) else {}
    # A bare ``gcp:`` key also parses to None, so guard before the nested lookup.
    gcp_cfg = cfg.get("gcp")
    bucket = gcp_cfg.get("warehouse_bucket") if isinstance(gcp_cfg, dict) else None
    if not bucket:
        raise RuntimeError(
            "GCP warehouse bucket not configured. Run: dcf gcp setup --project-id ... --region ..."
        )
    return bucket
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _pst_now() -> str:
    """Return the current time in ``America/Los_Angeles`` as an ISO-8601 string.

    The result carries the UTC offset in effect for that zone at the moment of
    the call, so despite the function name it may be PST or PDT.
    """
    # datetime.utcnow() is deprecated since Python 3.12; create the aware UTC
    # timestamp directly instead of localizing a naive one.
    utc_now = datetime.datetime.now(datetime.timezone.utc)
    return utc_now.astimezone(pytz.timezone("America/Los_Angeles")).isoformat()
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def _spark_df(spark, df: pd.DataFrame):
    """Build a Spark DataFrame from ``df`` with every column typed as string.

    All values are cast to ``str`` on the pandas side first, and the Spark
    schema declares each column as a nullable ``StringType``.
    """
    from pyspark.sql.types import StringType, StructField, StructType

    as_text = df.astype(str)
    fields = [StructField(name, StringType(), True) for name in as_text.columns]
    return spark.createDataFrame(as_text, schema=StructType(fields))
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def _ensure_namespace(spark, catalog: str, namespace: str) -> None:
    """Issue ``CREATE NAMESPACE IF NOT EXISTS`` for ``catalog.namespace`` via Spark SQL."""
    qualified = f"{catalog}.{namespace}"
    spark.sql(f"CREATE NAMESPACE IF NOT EXISTS {qualified}")
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def write(
    spark,
    collector: Collector,
    df: pd.DataFrame,
    catalog: str = "local",
    dynamic_params: dict | None = None,
) -> None:
    """Persist a projected DataFrame according to the collector's build strategy.

    An empty frame is ignored. Otherwise a copy of ``df`` is stamped with a
    ``dcf_updated_at`` column and routed as follows:

    * ``catalog == "gcp"``: written straight to the configured GCS bucket
      (upsert / append / overwrite), without touching the Spark catalog.
    * any other catalog: the namespace is created if needed, then the data is
      written through the staging path when ``cadence.staging`` is set, or via
      the strategy-specific writer otherwise.

    A strategy other than ``incremental``, ``append`` or ``full_refresh``
    results in no write.
    """
    if df.empty:
        return

    stamped = df.copy()
    stamped["dcf_updated_at"] = _pst_now()

    # The Spark catalog always needs a namespace, so fall back to the collector name.
    catalog_namespace = collector.namespace or collector.name
    build = collector.cadence

    if catalog == "gcp":
        # GCS targets are handled with google-cloud-storage + PyArrow for every
        # strategy, so no Spark catalog is involved.
        warehouse_bucket = _gcs_warehouse_bucket()
        strategy = build.strategy
        if strategy == "incremental":
            _upsert_gcs(stamped, warehouse_bucket, collector.namespace, collector.name, build.primary_key)
        elif strategy == "append":
            _append_gcs(stamped, warehouse_bucket, collector.namespace, collector.name)
        elif strategy == "full_refresh":
            _overwrite_gcs(stamped, warehouse_bucket, collector.namespace, collector.name)
        return

    _ensure_namespace(spark, catalog, catalog_namespace)

    if build.staging:
        _write_staged(
            spark,
            collector,
            stamped,
            catalog,
            catalog_namespace,
            build.staging,
            build.merge,
            dynamic_params or {},
        )
        return

    strategy = build.strategy
    if strategy == "incremental":
        warehouse_setting = spark.conf.get(f"spark.sql.catalog.{catalog}.warehouse")
        _upsert(stamped, Path(warehouse_setting), collector.namespace, collector.name, build.primary_key)
    elif strategy == "append":
        _append(spark, stamped, f"{catalog}.{catalog_namespace}.{collector.name}")
    elif strategy == "full_refresh":
        _overwrite(spark, stamped, f"{catalog}.{catalog_namespace}.{collector.name}")
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def _write_staged(
    spark,
    collector: Collector,
    df: pd.DataFrame,
    catalog: str,
    namespace: str,
    staging: StagingConfig,
    merge_cfg: MergeConfig | None,
    dynamic_params: dict,
) -> None:
    """Upsert ``df`` into its staging table and optionally rebuild the merged table.

    The staging table name comes from ``staging.table_pattern`` formatted with
    the value of ``staging.partition_param`` taken from ``dynamic_params``
    (``"default"`` when the parameter is absent). When ``merge_cfg`` is
    provided, the merged table is rebuilt afterwards.
    """
    partition_key = staging.partition_param
    partition_value = dynamic_params.get(partition_key, "default")
    staging_table = staging.table_pattern.format(**{partition_key: partition_value})

    warehouse_setting = spark.conf.get(f"spark.sql.catalog.{catalog}.warehouse")
    _upsert(df, Path(warehouse_setting), namespace, staging_table, collector.cadence.primary_key)

    if not merge_cfg:
        return
    _rebuild_merged(spark, catalog, namespace, staging, merge_cfg, collector.cadence.primary_key)
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def _upsert(df: pd.DataFrame, warehouse_root: Path, namespace: str | None, table_name: str, primary_key: str | None) -> None:
    """Upsert ``df`` into ``warehouse_root/[namespace/]table_name/data`` with pyarrow.

    The parquet files are managed directly (no Iceberg metadata) so that the
    data directory always holds exactly the current rows; this lets DuckDB
    glob reads (warehouse_reader.py) see correct results without parsing
    Iceberg snapshot metadata.

    With a ``primary_key``, incoming rows are de-duplicated on that column and
    replace existing rows that share a key; without one, incoming rows are
    simply added to the existing ones. The result is written as one new
    parquet file, after which the previously existing files are removed.
    """
    import pyarrow as pa
    import pyarrow.parquet as pq

    incoming = df.copy()
    if primary_key:
        incoming = incoming.drop_duplicates(subset=[primary_key])

    base_dir = warehouse_root / namespace if namespace else warehouse_root
    data_dir = base_dir / table_name / "data"
    data_dir.mkdir(parents=True, exist_ok=True)

    stale_files = sorted(data_dir.glob("*.parquet"))
    merged = incoming
    if stale_files:
        frames = [pq.read_table(path).to_pandas() for path in stale_files]
        current = pd.concat(frames, ignore_index=True)
        if primary_key:
            # Drop stored rows whose key is being replaced by an incoming row.
            superseded = current[primary_key].isin(incoming[primary_key].values)
            current = current[~superseded]
        merged = pd.concat([current, incoming], ignore_index=True)

    # Write the replacement file first; only then remove what it supersedes.
    target = data_dir / f"{uuid.uuid4()}.parquet"
    pq.write_table(pa.Table.from_pandas(merged, preserve_index=False), target)

    for path in stale_files:
        path.unlink()
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def _upsert_gcs(
    df: pd.DataFrame,
    bucket_name: str,
    namespace: str | None,
    table_name: str,
    primary_key: str | None,
) -> None:
    """Upsert ``df`` into a GCS bucket with google-cloud-storage + PyArrow (no Spark).

    Existing ``.parquet`` blobs under ``[namespace/]table_name/data/`` are
    downloaded and combined with the incoming rows. With a ``primary_key``,
    incoming rows are de-duplicated on it and replace stored rows sharing a
    key. The result is uploaded as one new blob, after which the previously
    listed blobs are deleted.
    """
    import pyarrow as pa
    import pyarrow.parquet as pq
    from google.cloud import storage

    bucket = storage.Client().bucket(bucket_name)
    table_path = f"{namespace}/{table_name}" if namespace else f"{table_name}"
    prefix = f"{table_path}/data"

    stale_blobs = []
    for candidate in bucket.list_blobs(prefix=f"{prefix}/"):
        if candidate.name.endswith(".parquet"):
            stale_blobs.append(candidate)

    incoming = df.copy()
    if primary_key:
        incoming = incoming.drop_duplicates(subset=[primary_key])

    merged = incoming
    if stale_blobs:
        frames = []
        for stale in stale_blobs:
            raw = io.BytesIO(stale.download_as_bytes())
            frames.append(pq.read_table(raw).to_pandas())
        current = pd.concat(frames, ignore_index=True)
        if primary_key:
            # Drop stored rows whose key is being replaced by an incoming row.
            superseded = current[primary_key].isin(incoming[primary_key].values)
            current = current[~superseded]
        merged = pd.concat([current, incoming], ignore_index=True)

    payload = io.BytesIO()
    pq.write_table(pa.Table.from_pandas(merged, preserve_index=False), payload)
    payload.seek(0)

    # Upload the replacement first; only then delete the blobs it supersedes.
    destination = bucket.blob(f"{prefix}/{uuid.uuid4()}.parquet")
    destination.upload_from_file(payload, content_type="application/octet-stream")

    for stale in stale_blobs:
        stale.delete()
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
def _append_gcs(
    df: pd.DataFrame,
    bucket_name: str,
    namespace: str | None,
    table_name: str,
) -> None:
    """Append ``df`` to GCS as a new Parquet blob next to the existing ones.

    The blob is uploaded under ``[namespace/]table_name/data/`` with a random
    UUID file name; existing blobs are neither read nor modified.
    """
    import pyarrow as pa
    import pyarrow.parquet as pq
    from google.cloud import storage

    bucket = storage.Client().bucket(bucket_name)
    table_path = f"{namespace}/{table_name}" if namespace else f"{table_name}"
    prefix = f"{table_path}/data"

    arrow_table = pa.Table.from_pandas(df.copy(), preserve_index=False)
    payload = io.BytesIO()
    pq.write_table(arrow_table, payload)
    payload.seek(0)  # rewind so the upload starts at the first byte

    destination = bucket.blob(f"{prefix}/{uuid.uuid4()}.parquet")
    destination.upload_from_file(payload, content_type="application/octet-stream")
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
def _overwrite_gcs(
    df: pd.DataFrame,
    bucket_name: str,
    namespace: str | None,
    table_name: str,
) -> None:
    """Full-refresh: replace all existing Parquet blobs with one fresh file.

    The replacement file is serialized and uploaded under
    ``[namespace/]table_name/data/`` *before* the previously existing
    ``.parquet`` blobs are deleted. If serialization or upload fails, the old
    data is therefore left in place instead of being lost. This matches the
    upload-then-delete ordering used by ``_upsert_gcs``.
    """
    import pyarrow as pa
    import pyarrow.parquet as pq
    from google.cloud import storage

    client = storage.Client()
    bucket = client.bucket(bucket_name)
    prefix = f"{namespace}/{table_name}/data" if namespace else f"{table_name}/data"

    # Snapshot the current blobs up front: the listing must be taken before the
    # upload below so the freshly written file is never scheduled for deletion.
    stale_blobs = [
        b for b in bucket.list_blobs(prefix=f"{prefix}/")
        if b.name.endswith(".parquet")
    ]

    buf = io.BytesIO()
    pq.write_table(pa.Table.from_pandas(df, preserve_index=False), buf)
    buf.seek(0)

    new_blob = bucket.blob(f"{prefix}/{uuid.uuid4()}.parquet")
    new_blob.upload_from_file(buf, content_type="application/octet-stream")

    # Only remove the old data once the replacement is safely stored.
    for blob in stale_blobs:
        blob.delete()
|
|
239
|
+
|
|
240
|
+
|
|
241
|
+
def _append(spark, df: pd.DataFrame, table_id: str) -> None:
    """Append ``df`` to the table ``table_id``, creating it on first write.

    When the table does not exist yet it is created as an Iceberg table with
    ``format-version`` 2.
    """
    spark_frame = _spark_df(spark, df)
    if not spark.catalog.tableExists(table_id):
        creator = spark_frame.writeTo(table_id).using("iceberg")
        creator.tableProperty("format-version", "2").create()
        return
    spark_frame.writeTo(table_id).append()
|
|
247
|
+
|
|
248
|
+
|
|
249
|
+
def _overwrite(spark, df: pd.DataFrame, table_id: str) -> None:
    """Create or replace ``table_id`` as an Iceberg (format-version 2) table holding ``df``."""
    spark_frame = _spark_df(spark, df)
    writer = spark_frame.writeTo(table_id).using("iceberg")
    writer.tableProperty("format-version", "2").createOrReplace()
|
|
252
|
+
|
|
253
|
+
|
|
254
|
+
def _rebuild_merged(
    spark,
    catalog: str,
    namespace: str,
    staging: StagingConfig,
    merge_cfg: MergeConfig,
    primary_key: str | None,
) -> None:
    """Rebuild the merged table from all staging tables in the namespace.

    Staging tables are the tables listed in ``catalog.namespace`` whose name
    starts with the literal text of ``staging.table_pattern`` before its first
    ``{`` and ends with ``_loader_staging``. They are unioned and, when
    ``merge_cfg.dedup`` has type ``"latest_non_null"`` and a ``primary_key`` is
    given, reduced to one row per key. The result replaces the Iceberg table
    ``catalog.namespace.<merge_cfg.table>``. Nothing is written when no
    staging table matches.

    Row ranking for ``latest_non_null``: the row with the greatest timestamp
    across the dedup columns wins (rows with no parsable timestamp sort last);
    ties are broken by the number of dedup columns that are not ``"nan"``.

    Raises:
        ValueError: If ``latest_non_null`` dedup is requested but no dedup
            columns are configured.
    """
    from pyspark.sql import functions as F
    from pyspark.sql.window import Window

    # Collect all staging tables that match the pattern by listing the namespace
    tables = spark.sql(f"SHOW TABLES IN {catalog}.{namespace}").collect()
    prefix = staging.table_pattern.split("{")[0]  # e.g. "permits_"
    staging_ids = [
        f"{catalog}.{namespace}.{t['tableName']}"
        for t in tables
        if t["tableName"].startswith(prefix) and t["tableName"].endswith("_loader_staging")
    ]

    if not staging_ids:
        return

    combined = spark.table(staging_ids[0])
    for tid in staging_ids[1:]:
        # NOTE(review): union() matches columns by position, so this assumes
        # every staging table has the same column order — confirm.
        combined = combined.union(spark.table(tid))

    if merge_cfg.dedup and merge_cfg.dedup.type == "latest_non_null" and primary_key:
        from functools import reduce
        import operator

        dedup_cols = list(merge_cfg.dedup.columns or [])
        if not dedup_cols:
            # Previously this surfaced as an opaque reduce()/iteration TypeError.
            raise ValueError("latest_non_null dedup requires at least one dedup column")

        def safe_unix_ts(col_name):
            # Cast to timestamp without a strict format so that both
            # 'M/d/yyyy' and 'yyyy-MM-dd HH:mm:ss' values are handled.
            # ANSI mode is disabled in the session so invalid strings
            # return null rather than throwing.
            return F.when(
                F.upper(F.col(col_name)) != "NAN",
                F.col(col_name).cast("timestamp").cast("long"),
            ).otherwise(F.lit(None).cast("long"))

        def non_nan_flag(col_name):
            # 1 when the value is present and not the string "nan" (any case), else 0.
            return F.when(F.upper(F.col(col_name)) != "NAN", F.lit(1)).otherwise(F.lit(0))

        flag_sum = reduce(operator.add, [non_nan_flag(c) for c in dedup_cols])

        ts_exprs = [safe_unix_ts(c) for c in dedup_cols]
        # F.greatest() rejects fewer than two columns, so with a single dedup
        # column rank by that column's timestamp directly.
        latest_ts = ts_exprs[0] if len(ts_exprs) == 1 else F.greatest(*ts_exprs)

        w = Window.partitionBy(primary_key).orderBy(
            latest_ts.desc_nulls_last(),
            flag_sum.desc(),
        )
        combined = (
            combined
            .withColumn("_rn", F.row_number().over(w))
            .filter(F.col("_rn") == 1)
            .drop("_rn")
        )

    merged_id = f"{catalog}.{namespace}.{merge_cfg.table}"
    combined.writeTo(merged_id).using("iceberg").tableProperty("format-version", "2").createOrReplace()
    print(f" Rebuilt merged table → {merged_id} ({combined.count()} rows)")
|