data-collection-framework 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44):
  1. data_collection_framework-0.1.0.dist-info/METADATA +19 -0
  2. data_collection_framework-0.1.0.dist-info/RECORD +44 -0
  3. data_collection_framework-0.1.0.dist-info/WHEEL +5 -0
  4. data_collection_framework-0.1.0.dist-info/entry_points.txt +2 -0
  5. data_collection_framework-0.1.0.dist-info/top_level.txt +1 -0
  6. dcf/__init__.py +4 -0
  7. dcf/cli.py +841 -0
  8. dcf/config/__init__.py +4 -0
  9. dcf/config/loader.py +77 -0
  10. dcf/config/models.py +240 -0
  11. dcf/engine/__init__.py +6 -0
  12. dcf/engine/fetcher.py +118 -0
  13. dcf/engine/iterator.py +96 -0
  14. dcf/engine/projector.py +56 -0
  15. dcf/engine/runner.py +90 -0
  16. dcf/engine/transforms.py +41 -0
  17. dcf/gcp/__init__.py +0 -0
  18. dcf/gcp/_collector_utils.py +87 -0
  19. dcf/gcp/auth.py +1 -0
  20. dcf/gcp/batch_deploy.py +548 -0
  21. dcf/gcp/bootstrap.py +131 -0
  22. dcf/gcp/gcloud.py +42 -0
  23. dcf/gcp/terraform.py +151 -0
  24. dcf/infra/modules/batch_collector/gcp/airflow/main.tf +194 -0
  25. dcf/infra/modules/batch_collector/gcp/airflow/outputs.tf +9 -0
  26. dcf/infra/modules/batch_collector/gcp/airflow/variables.tf +52 -0
  27. dcf/infra/modules/batch_collector/gcp/main.tf +70 -0
  28. dcf/infra/modules/batch_collector/gcp/outputs.tf +4 -0
  29. dcf/infra/modules/batch_collector/gcp/variables.tf +40 -0
  30. dcf/infra/modules/batch_collector/local/airflow/main.tf +64 -0
  31. dcf/infra/modules/batch_collector/local/airflow/outputs.tf +9 -0
  32. dcf/infra/modules/batch_collector/local/airflow/variables.tf +59 -0
  33. dcf/infra/modules/batch_collector/local/main.tf +32 -0
  34. dcf/infra/modules/batch_collector/local/outputs.tf +4 -0
  35. dcf/infra/modules/batch_collector/local/variables.tf +25 -0
  36. dcf/infra/templates/airflow.Dockerfile.tftpl +6 -0
  37. dcf/infra/templates/batch_collector.Dockerfile.tftpl +14 -0
  38. dcf/infra/templates/docker-compose.yml.tftpl +76 -0
  39. dcf/local_deploy.py +756 -0
  40. dcf/project.py +23 -0
  41. dcf/spark_session.py +66 -0
  42. dcf/warehouse_reader.py +323 -0
  43. dcf/writer/__init__.py +3 -0
  44. dcf/writer/iceberg.py +315 -0
dcf/writer/iceberg.py ADDED
@@ -0,0 +1,315 @@
1
+ from __future__ import annotations
2
+
3
+ import datetime
4
+ import io
5
+ import uuid
6
+ from pathlib import Path
7
+
8
+ import pandas as pd
9
+ import pytz
10
+
11
+ from ..config.models import Collector, StagingConfig, MergeConfig
12
+
13
+
14
def _gcs_warehouse_bucket() -> str:
    """Return the GCS warehouse bucket name configured in ``project.yml``.

    Reads the ``gcp.warehouse_bucket`` key from the ``project.yml`` file located
    in the directory returned by ``find_project_root()``.

    Returns:
        The configured bucket name.

    Raises:
        RuntimeError: if ``project.yml`` is missing, empty, or does not define
            a non-empty ``gcp.warehouse_bucket`` value.
    """
    import yaml
    from ..project import find_project_root

    cfg_file = find_project_root() / "project.yml"
    cfg = yaml.safe_load(cfg_file.read_text()) if cfg_file.exists() else {}
    # safe_load yields None for an empty document (and may yield a non-mapping);
    # treat both as "nothing configured" so the RuntimeError below is raised
    # instead of an AttributeError on .get().
    if not isinstance(cfg, dict):
        cfg = {}
    gcp_cfg = cfg.get("gcp")
    if not isinstance(gcp_cfg, dict):
        # covers a bare ``gcp:`` key whose value is null
        gcp_cfg = {}

    bucket = gcp_cfg.get("warehouse_bucket")
    if not bucket:
        raise RuntimeError(
            "GCP warehouse bucket not configured. Run: dcf gcp setup --project-id ... --region ..."
        )
    return bucket
25
+
26
+
27
def _pst_now() -> str:
    """Return the current time in the America/Los_Angeles zone as an ISO-8601 string.

    Despite the function name, the UTC offset in the result follows the zone's
    daylight-saving rules, so it is PST or PDT depending on the date.
    """
    # datetime.utcnow() is deprecated; build an aware UTC timestamp directly.
    utc_now = datetime.datetime.now(datetime.timezone.utc)
    return utc_now.astimezone(pytz.timezone("America/Los_Angeles")).isoformat()
30
+
31
+
32
def _spark_df(spark, df: pd.DataFrame):
    """Convert a pandas DataFrame into a Spark DataFrame of nullable string columns.

    Every value is cast to ``str`` first, so the Spark schema is one nullable
    ``StringType`` field per pandas column, in the same column order.
    """
    from pyspark.sql.types import StringType, StructField, StructType

    as_text = df.astype(str)
    fields = []
    for column_name in as_text.columns:
        fields.append(StructField(column_name, StringType(), True))
    return spark.createDataFrame(as_text, schema=StructType(fields))
37
+
38
+
39
def _ensure_namespace(spark, catalog: str, namespace: str) -> None:
    """Create ``catalog.namespace`` through Spark SQL unless it already exists."""
    qualified_name = f"{catalog}.{namespace}"
    spark.sql("CREATE NAMESPACE IF NOT EXISTS " + qualified_name)
41
+
42
+
43
def write(
    spark,
    collector: Collector,
    df: pd.DataFrame,
    catalog: str = "local",
    dynamic_params: dict | None = None,
) -> None:
    """Persist a projected DataFrame using the collector's cadence settings.

    An empty frame is a no-op. Otherwise a copy of ``df`` is stamped with a
    ``dcf_updated_at`` column and routed as follows:

    * ``catalog == "gcp"``: written straight to the configured GCS bucket
      (upsert / append / overwrite by strategy); Spark is not touched.
    * any other catalog: the namespace is ensured in Spark, then the frame is
      written via the staging path when staging is configured, otherwise by
      strategy (``incremental``, ``append``, ``full_refresh``).

    A strategy that matches none of the three names results in no write.
    """
    if df.empty:
        return

    stamped = df.copy()
    stamped["dcf_updated_at"] = _pst_now()

    cadence = collector.cadence

    # GCS path: plain Parquet objects via google-cloud-storage + PyArrow.
    if catalog == "gcp":
        bucket_name = _gcs_warehouse_bucket()
        strategy = cadence.strategy
        if strategy == "incremental":
            _upsert_gcs(stamped, bucket_name, collector.namespace, collector.name, cadence.primary_key)
        elif strategy == "append":
            _append_gcs(stamped, bucket_name, collector.namespace, collector.name)
        elif strategy == "full_refresh":
            _overwrite_gcs(stamped, bucket_name, collector.namespace, collector.name)
        return

    # Spark catalog always needs a namespace, so fall back to the collector name.
    spark_namespace = collector.namespace or collector.name
    _ensure_namespace(spark, catalog, spark_namespace)

    if cadence.staging:
        _write_staged(
            spark,
            collector,
            stamped,
            catalog,
            spark_namespace,
            cadence.staging,
            cadence.merge,
            dynamic_params or {},
        )
        return

    strategy = cadence.strategy
    if strategy == "incremental":
        warehouse_root = Path(spark.conf.get(f"spark.sql.catalog.{catalog}.warehouse"))
        _upsert(stamped, warehouse_root, collector.namespace, collector.name, cadence.primary_key)
        return

    table_id = f"{catalog}.{spark_namespace}.{collector.name}"
    if strategy == "append":
        _append(spark, stamped, table_id)
    elif strategy == "full_refresh":
        _overwrite(spark, stamped, table_id)
85
+
86
+
87
def _write_staged(
    spark,
    collector: Collector,
    df: pd.DataFrame,
    catalog: str,
    namespace: str,
    staging: StagingConfig,
    merge_cfg: MergeConfig | None,
    dynamic_params: dict,
) -> None:
    """Upsert df into its staging table, then rebuild the merged table if configured.

    The staging table name is produced by formatting ``staging.table_pattern``
    with the value of ``staging.partition_param`` taken from ``dynamic_params``
    (``"default"`` when the parameter is absent). The warehouse root is read
    from the Spark conf key ``spark.sql.catalog.<catalog>.warehouse``.
    """
    partition_key = staging.partition_param
    partition_value = dynamic_params.get(partition_key, "default")
    staging_table = staging.table_pattern.format(**{partition_key: partition_value})

    warehouse_conf_key = f"spark.sql.catalog.{catalog}.warehouse"
    root = Path(spark.conf.get(warehouse_conf_key))
    _upsert(df, root, namespace, staging_table, collector.cadence.primary_key)

    if not merge_cfg:
        return
    _rebuild_merged(spark, catalog, namespace, staging, merge_cfg, collector.cadence.primary_key)
105
+
106
+
107
def _upsert(df: pd.DataFrame, warehouse_root: Path, namespace: str | None, table_name: str, primary_key: str | None) -> None:
    """Upsert df into warehouse_root/namespace/table_name using pyarrow directly.

    Manages parquet files without Iceberg so the data directory always contains
    exactly the current data. This lets DuckDB glob reads (warehouse_reader.py)
    see correct results without needing to parse Iceberg snapshot metadata.

    Args:
        df: rows to insert or update; the caller's frame is not modified.
        warehouse_root: directory under which table directories live.
        namespace: optional sub-directory; omitted from the path when falsy.
        table_name: table directory name; files go in its ``data/`` child.
        primary_key: column identifying a row. When set, incoming rows are
            de-duplicated on it (first occurrence kept) and replace existing
            rows with the same key. When falsy, rows are simply added.
    """
    import pyarrow as pa
    import pyarrow.parquet as pq

    df = df.copy()
    if primary_key:
        df = df.drop_duplicates(subset=[primary_key])

    table_root = warehouse_root / namespace / table_name if namespace else warehouse_root / table_name
    data_dir = table_root / "data"
    data_dir.mkdir(parents=True, exist_ok=True)

    existing_files = sorted(data_dir.glob("*.parquet"))
    if existing_files:
        existing = pd.concat(
            [pq.read_table(f).to_pandas() for f in existing_files],
            ignore_index=True,
        )
        if primary_key:
            # Drop stored rows that the incoming batch supersedes.
            existing = existing[~existing[primary_key].isin(df[primary_key].values)]
        merged = pd.concat([existing, df], ignore_index=True)
    else:
        merged = df

    # Write under a name the "*.parquet" glob does not match, then rename into
    # place. An interrupted write therefore never leaves a truncated file that
    # later upserts or glob readers would try to parse.
    file_id = uuid.uuid4()
    new_file = data_dir / f"{file_id}.parquet"
    tmp_file = data_dir / f"{file_id}.parquet.tmp"
    try:
        pq.write_table(pa.Table.from_pandas(merged, preserve_index=False), tmp_file)
        tmp_file.replace(new_file)
    finally:
        # Only present here if the write or rename failed.
        if tmp_file.exists():
            tmp_file.unlink()

    # The new file holds the full merged data, so the previous files are stale.
    for f in existing_files:
        f.unlink()
142
+
143
+
144
def _upsert_gcs(
    df: pd.DataFrame,
    bucket_name: str,
    namespace: str | None,
    table_name: str,
    primary_key: str | None,
) -> None:
    """Upsert df into a GCS-hosted table using google-cloud-storage + PyArrow.

    All ``.parquet`` objects under the table's ``data/`` prefix are read and
    combined with the incoming rows (incoming rows win on ``primary_key``
    collisions), the result is uploaded as one new object, and the previously
    listed objects are then deleted. No Spark session is involved.
    """
    import pyarrow as pa
    import pyarrow.parquet as pq
    from google.cloud import storage

    gcs_bucket = storage.Client().bucket(bucket_name)
    data_prefix = f"{table_name}/data"
    if namespace:
        data_prefix = f"{namespace}/{data_prefix}"

    old_blobs = []
    for candidate in gcs_bucket.list_blobs(prefix=f"{data_prefix}/"):
        if candidate.name.endswith(".parquet"):
            old_blobs.append(candidate)

    incoming = df.copy()
    if primary_key:
        incoming = incoming.drop_duplicates(subset=[primary_key])

    if not old_blobs:
        merged = incoming
    else:
        frames = [
            pq.read_table(io.BytesIO(blob.download_as_bytes())).to_pandas()
            for blob in old_blobs
        ]
        current = pd.concat(frames, ignore_index=True)
        if primary_key:
            superseded = current[primary_key].isin(incoming[primary_key].values)
            current = current[~superseded]
        merged = pd.concat([current, incoming], ignore_index=True)

    payload = io.BytesIO()
    pq.write_table(pa.Table.from_pandas(merged, preserve_index=False), payload)
    payload.seek(0)

    target = gcs_bucket.blob(f"{data_prefix}/{uuid.uuid4()}.parquet")
    target.upload_from_file(payload, content_type="application/octet-stream")

    # Remove the superseded objects only after the replacement is uploaded.
    for stale in old_blobs:
        stale.delete()
189
+
190
+
191
def _append_gcs(
    df: pd.DataFrame,
    bucket_name: str,
    namespace: str | None,
    table_name: str,
) -> None:
    """Append df to a GCS-hosted table by uploading one additional Parquet object.

    Objects already present under the table's ``data/`` prefix are not read or
    modified; the new object gets a random UUID file name.
    """
    import pyarrow as pa
    import pyarrow.parquet as pq
    from google.cloud import storage

    gcs_bucket = storage.Client().bucket(bucket_name)
    data_prefix = f"{table_name}/data"
    if namespace:
        data_prefix = f"{namespace}/{data_prefix}"

    arrow_table = pa.Table.from_pandas(df.copy(), preserve_index=False)
    payload = io.BytesIO()
    pq.write_table(arrow_table, payload)
    payload.seek(0)

    object_name = f"{data_prefix}/{uuid.uuid4()}.parquet"
    gcs_bucket.blob(object_name).upload_from_file(payload, content_type="application/octet-stream")
212
+
213
+
214
def _overwrite_gcs(
    df: pd.DataFrame,
    bucket_name: str,
    namespace: str | None,
    table_name: str,
) -> None:
    """Full-refresh: replace all existing Parquet blobs with a fresh single file.

    The replacement object is serialized and uploaded before any existing
    object is deleted, so a failure while building or uploading the new data
    leaves the previous table contents in place.

    Args:
        df: the complete new table contents.
        bucket_name: GCS bucket holding the warehouse.
        namespace: optional path component placed before ``table_name``.
        table_name: table path component; objects live under ``<table>/data/``.
    """
    import pyarrow as pa
    import pyarrow.parquet as pq
    from google.cloud import storage

    client = storage.Client()
    bucket = client.bucket(bucket_name)
    prefix = f"{namespace}/{table_name}/data" if namespace else f"{table_name}/data"

    # Snapshot the current objects first so the object uploaded below is not
    # part of the set that gets deleted.
    stale_blobs = [
        b for b in bucket.list_blobs(prefix=f"{prefix}/")
        if b.name.endswith(".parquet")
    ]

    buf = io.BytesIO()
    pq.write_table(pa.Table.from_pandas(df.copy(), preserve_index=False), buf)
    buf.seek(0)

    new_blob = bucket.blob(f"{prefix}/{uuid.uuid4()}.parquet")
    new_blob.upload_from_file(buf, content_type="application/octet-stream")

    # Same upload-then-delete order as _upsert_gcs.
    for blob in stale_blobs:
        blob.delete()
239
+
240
+
241
def _append(spark, df: pd.DataFrame, table_id: str) -> None:
    """Append df to the Iceberg table ``table_id``, creating it on first write.

    A missing table is created as an Iceberg table with ``format-version`` 2.
    """
    sdf = _spark_df(spark, df)
    if not spark.catalog.tableExists(table_id):
        sdf.writeTo(table_id).using("iceberg").tableProperty("format-version", "2").create()
        return
    sdf.writeTo(table_id).append()
247
+
248
+
249
def _overwrite(spark, df: pd.DataFrame, table_id: str) -> None:
    """Create or replace the Iceberg table ``table_id`` (format-version 2) with df."""
    writer = _spark_df(spark, df).writeTo(table_id)
    writer = writer.using("iceberg").tableProperty("format-version", "2")
    writer.createOrReplace()
252
+
253
+
254
def _rebuild_merged(
    spark,
    catalog: str,
    namespace: str,
    staging: StagingConfig,
    merge_cfg: MergeConfig,
    primary_key: str | None,
) -> None:
    """Rebuild the merged Iceberg table from every matching staging table.

    Tables in ``catalog.namespace`` are selected when their name starts with the
    text preceding the first ``{`` in ``staging.table_pattern`` and ends with
    ``_loader_staging``. They are unioned and, when ``merge_cfg.dedup`` has
    type ``latest_non_null`` and a primary key is given, reduced to one row per
    key. The result replaces ``catalog.namespace.<merge_cfg.table>``.

    Dedup ordering within each primary key:
      1. the largest timestamp across ``merge_cfg.dedup.columns``, newest first,
         with rows lacking any parseable timestamp last;
      2. the number of dedup columns whose value is not the text "NAN"
         (case-insensitive), highest first.

    Does nothing when no staging table matches.
    """
    from functools import reduce
    import operator

    from pyspark.sql import functions as F
    from pyspark.sql.window import Window

    # Collect all staging tables that match the pattern by listing the namespace
    tables = spark.sql(f"SHOW TABLES IN {catalog}.{namespace}").collect()
    prefix = staging.table_pattern.split("{")[0]  # e.g. "permits_"
    staging_ids = [
        f"{catalog}.{namespace}.{t['tableName']}"
        for t in tables
        if t["tableName"].startswith(prefix) and t["tableName"].endswith("_loader_staging")
    ]

    if not staging_ids:
        return

    combined = spark.table(staging_ids[0])
    for tid in staging_ids[1:]:
        combined = combined.union(spark.table(tid))

    if merge_cfg.dedup and merge_cfg.dedup.type == "latest_non_null" and primary_key:
        dedup_cols = merge_cfg.dedup.columns

        def safe_unix_ts(col_name):
            # Cast to timestamp without a strict format so that both
            # 'M/d/yyyy' and 'yyyy-MM-dd HH:mm:ss' values are handled.
            # Relies on ANSI mode being disabled in the session (configured
            # outside this function) so invalid strings become null rather
            # than raising.
            return F.when(
                F.upper(F.col(col_name)) != "NAN",
                F.col(col_name).cast("timestamp").cast("long"),
            ).otherwise(F.lit(None).cast("long"))

        def non_nan_flag(col_name):
            # 1 when the column holds something other than the text "NAN".
            return F.when(F.upper(F.col(col_name)) != "NAN", F.lit(1)).otherwise(F.lit(0))

        flag_sum = reduce(operator.add, [non_nan_flag(c) for c in dedup_cols])

        ts_exprs = [safe_unix_ts(c) for c in dedup_cols]
        # F.greatest() rejects fewer than two columns, so with a single dedup
        # column its timestamp expression is used as the sort key directly.
        latest_ts = ts_exprs[0] if len(ts_exprs) == 1 else F.greatest(*ts_exprs)

        w = Window.partitionBy(primary_key).orderBy(
            latest_ts.desc_nulls_last(),
            flag_sum.desc(),
        )
        combined = (
            combined
            .withColumn("_rn", F.row_number().over(w))
            .filter(F.col("_rn") == 1)
            .drop("_rn")
        )

    merged_id = f"{catalog}.{namespace}.{merge_cfg.table}"
    combined.writeTo(merged_id).using("iceberg").tableProperty("format-version", "2").createOrReplace()
    print(f" Rebuilt merged table → {merged_id} ({combined.count()} rows)")