data-collection-framework 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. data_collection_framework-0.1.0.dist-info/METADATA +19 -0
  2. data_collection_framework-0.1.0.dist-info/RECORD +44 -0
  3. data_collection_framework-0.1.0.dist-info/WHEEL +5 -0
  4. data_collection_framework-0.1.0.dist-info/entry_points.txt +2 -0
  5. data_collection_framework-0.1.0.dist-info/top_level.txt +1 -0
  6. dcf/__init__.py +4 -0
  7. dcf/cli.py +841 -0
  8. dcf/config/__init__.py +4 -0
  9. dcf/config/loader.py +77 -0
  10. dcf/config/models.py +240 -0
  11. dcf/engine/__init__.py +6 -0
  12. dcf/engine/fetcher.py +118 -0
  13. dcf/engine/iterator.py +96 -0
  14. dcf/engine/projector.py +56 -0
  15. dcf/engine/runner.py +90 -0
  16. dcf/engine/transforms.py +41 -0
  17. dcf/gcp/__init__.py +0 -0
  18. dcf/gcp/_collector_utils.py +87 -0
  19. dcf/gcp/auth.py +1 -0
  20. dcf/gcp/batch_deploy.py +548 -0
  21. dcf/gcp/bootstrap.py +131 -0
  22. dcf/gcp/gcloud.py +42 -0
  23. dcf/gcp/terraform.py +151 -0
  24. dcf/infra/modules/batch_collector/gcp/airflow/main.tf +194 -0
  25. dcf/infra/modules/batch_collector/gcp/airflow/outputs.tf +9 -0
  26. dcf/infra/modules/batch_collector/gcp/airflow/variables.tf +52 -0
  27. dcf/infra/modules/batch_collector/gcp/main.tf +70 -0
  28. dcf/infra/modules/batch_collector/gcp/outputs.tf +4 -0
  29. dcf/infra/modules/batch_collector/gcp/variables.tf +40 -0
  30. dcf/infra/modules/batch_collector/local/airflow/main.tf +64 -0
  31. dcf/infra/modules/batch_collector/local/airflow/outputs.tf +9 -0
  32. dcf/infra/modules/batch_collector/local/airflow/variables.tf +59 -0
  33. dcf/infra/modules/batch_collector/local/main.tf +32 -0
  34. dcf/infra/modules/batch_collector/local/outputs.tf +4 -0
  35. dcf/infra/modules/batch_collector/local/variables.tf +25 -0
  36. dcf/infra/templates/airflow.Dockerfile.tftpl +6 -0
  37. dcf/infra/templates/batch_collector.Dockerfile.tftpl +14 -0
  38. dcf/infra/templates/docker-compose.yml.tftpl +76 -0
  39. dcf/local_deploy.py +756 -0
  40. dcf/project.py +23 -0
  41. dcf/spark_session.py +66 -0
  42. dcf/warehouse_reader.py +323 -0
  43. dcf/writer/__init__.py +3 -0
  44. dcf/writer/iceberg.py +315 -0
dcf/project.py ADDED
@@ -0,0 +1,23 @@
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ from pathlib import Path
5
+
6
+
7
+ def find_project_root(start: Path | None = None) -> Path:
8
+ """Return the dcf project root directory.
9
+
10
+ Resolution order:
11
+ 1. DCF_PROJECT_DIR environment variable (absolute path)
12
+ 2. Walk up from `start` (default: cwd) looking for project.yml
13
+ """
14
+ if env := os.environ.get("DCF_PROJECT_DIR"):
15
+ return Path(env).resolve()
16
+ start = (start or Path.cwd()).resolve()
17
+ for p in [start, *start.parents]:
18
+ if (p / "project.yml").exists():
19
+ return p
20
+ raise RuntimeError(
21
+ "No project.yml found in current directory or any parent. "
22
+ "Run 'dcf init' to create one, or set DCF_PROJECT_DIR."
23
+ )
dcf/spark_session.py ADDED
@@ -0,0 +1,66 @@
1
+ import contextlib
2
+ import io
3
+ import os
4
+ from pathlib import Path
5
+
6
+ import pyspark as _pyspark
7
+ from pyspark.sql import SparkSession
8
+
9
# Runs at import time: point SPARK_HOME at the directory of the installed
# pyspark package so PySpark uses its own bundled Spark JARs instead of any
# system-wide SPARK_HOME.
os.environ["SPARK_HOME"] = os.fspath(Path(_pyspark.__file__).parent)
11
+
12
+
13
@contextlib.contextmanager
def _suppress_spark_startup_noise():
    """Silence stderr while the wrapped block runs.

    Two layers are redirected:
      * file descriptor 2 is pointed at the null device, because (per the
        original author's note) Ivy and JVM warnings are written to the raw
        fd before log4j is configured and before ``setLogLevel`` can take
        effect;
      * ``sys.stderr`` is swapped for a throwaway ``io.StringIO`` so
        Python-level writes are discarded as well.

    The original fd 2 is always restored on exit, and every descriptor
    opened here is closed even if one of the setup calls fails.
    """
    saved_fd = os.dup(2)
    try:
        null_fd = os.open(os.devnull, os.O_WRONLY)
        try:
            os.dup2(null_fd, 2)
        finally:
            # fd 2 now refers to the null device (or dup2 failed); either
            # way the temporary descriptor is no longer needed.
            os.close(null_fd)
        with contextlib.redirect_stderr(io.StringIO()):
            yield
    finally:
        try:
            os.dup2(saved_fd, 2)
        finally:
            os.close(saved_fd)
30
+
31
+
32
def get_spark(app_name="dcf"):
    """Return a local SparkSession configured for the project's Iceberg warehouse.

    The session is obtained with ``getOrCreate()`` on a ``local[*]`` master
    and registers a Hadoop-type Iceberg catalog named ``local`` whose
    warehouse is ``<project root>/warehouse``. stderr is silenced while the
    session starts, and the Spark log level is set to ERROR afterwards.

    Args:
        app_name: Spark application name.
    """
    from .project import find_project_root

    warehouse_dir = find_project_root() / "warehouse"

    settings = (
        ("spark.driver.memory", "4g"),
        ("spark.driver.host", "127.0.0.1"),
        # Downloads Iceberg runtime JAR from Maven on first run; cached in ~/.ivy2
        ("spark.jars.packages", "org.apache.iceberg:iceberg-spark-runtime-4.0_2.13:1.10.1"),
        ("spark.sql.extensions",
         "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions"),
        ("spark.sql.catalog.local", "org.apache.iceberg.spark.SparkCatalog"),
        ("spark.sql.catalog.local.type", "hadoop"),
        ("spark.sql.catalog.local.warehouse", str(warehouse_dir)),
        ("spark.sql.ansi.enabled", "false"),
    )

    with _suppress_spark_startup_noise():
        builder = SparkSession.builder.appName(app_name).master("local[*]")
        for option, value in settings:
            builder = builder.config(option, value)
        session = builder.getOrCreate()
        session.sparkContext.setLogLevel("ERROR")
    return session
55
+
56
+
57
def drop_namespace(spark, catalog, namespace):
    """Drop all tables in a namespace, then drop the namespace itself.

    Iceberg's Hadoop catalog doesn't support CASCADE on DROP NAMESPACE, so
    the tables are removed one by one first.

    Listing/dropping the tables is best effort: any failure there (for
    example the namespace not existing) is logged at DEBUG level and does
    not stop the final ``DROP NAMESPACE IF EXISTS``. Errors raised by that
    final statement propagate to the caller.

    Args:
        spark: object exposing ``sql(str)``; the SHOW TABLES result must
            provide ``collect()`` returning rows with a ``tableName``
            attribute.
        catalog: catalog name, interpolated into the SQL as-is.
        namespace: namespace name, interpolated into the SQL as-is.
    """
    import logging

    qualified = f"{catalog}.{namespace}"
    try:
        rows = spark.sql(f"SHOW TABLES IN {qualified}").collect()
        for row in rows:
            spark.sql(f"DROP TABLE IF EXISTS {qualified}.{row.tableName}")
    except Exception:
        # Deliberately non-fatal, but keep the cause discoverable: if the
        # namespace drop below fails, this is usually the reason.
        logging.getLogger(__name__).debug(
            "Could not drop all tables in %s before dropping the namespace",
            qualified,
            exc_info=True,
        )
    spark.sql(f"DROP NAMESPACE IF EXISTS {qualified}")
dcf/warehouse_reader.py ADDED
@@ -0,0 +1,323 @@
1
+ """
2
+ Fast warehouse querying via DuckDB.
3
+
4
+ For catalog: local — reads Parquet files from warehouse/{namespace}/{table}/data/*.parquet
5
+ For catalog: gcp — downloads Parquet blobs from GCS via google-cloud-storage,
6
+ registers them as Arrow tables in DuckDB, then rewrites
7
+ namespace.table references to the registered names.
8
+
9
+ list_tables() returns BOTH GCS and local-only tables when catalog: gcp,
10
+ with a `location` field ("gcs" | "local") on each row.
11
+
12
+ Returns at most 500 rows per query.
13
+ """
14
+ from __future__ import annotations
15
+
16
+ from pathlib import Path
17
+ from typing import Any
18
+
19
# Row cap applied to SELECT queries that get wrapped in an automatic LIMIT.
_MAX_ROWS = 500

# Leading SQL keywords that identify write/DDL statements; such statements
# must NOT be wrapped in SELECT … LIMIT.
_WRITE_PREFIXES = {
    "copy",
    "create",
    "insert",
    "drop",
    "delete",
    "update",
    "alter",
}
23
+
24
+
25
def _project_config() -> dict:
    """Load ``project.yml`` from the project root as a dict.

    Returns an empty dict when the file is missing, is empty, or does not
    contain a mapping at the top level, so callers can always chain
    ``.get(...)`` on the result.

    Raises:
        RuntimeError: propagated from ``find_project_root`` when no project
            root can be located.
    """
    import yaml
    from .project import find_project_root

    cfg_file = find_project_root() / "project.yml"
    if not cfg_file.exists():
        return {}
    loaded = yaml.safe_load(cfg_file.read_text())
    # safe_load yields None for an empty document and may yield a list or
    # scalar for unexpected content; neither supports the .get() callers use.
    return loaded if isinstance(loaded, dict) else {}
30
+
31
+
32
def _catalog() -> str:
    """Return the ``catalog`` value from the project config ("local" if absent)."""
    settings = _project_config()
    return settings.get("catalog", "local")
34
+
35
+
36
def _warehouse() -> Path:
    """Return the ``warehouse`` directory path under the project root.

    The directory is not created or checked for existence here.
    """
    from .project import find_project_root

    project_root = find_project_root()
    return project_root / "warehouse"
39
+
40
+
41
def _gcs_bucket() -> str:
    """Return ``gcp.warehouse_bucket`` from the project config, or "" if unset.

    A missing or null ``gcp`` section, a ``gcp`` value that is not a
    mapping, and a missing or null ``warehouse_bucket`` all yield the empty
    string, which callers treat as "no bucket configured".
    """
    gcp_section = (_project_config() or {}).get("gcp")
    # `gcp:` with nothing under it parses as None; guard before .get().
    if not isinstance(gcp_section, dict):
        return ""
    return gcp_section.get("warehouse_bucket") or ""
43
+
44
+
45
def _iter_gcs_tables(bucket_name: str) -> list[tuple[str, str]]:
    """Return the sorted (namespace, table) pairs that have Parquet data in the bucket.

    Every blob in the bucket is listed. A blob counts when its name has at
    least four ``/``-separated segments, the third segment is ``data`` and
    the fourth ends with ``.parquet``; the first two segments are then taken
    as namespace and table.
    """
    from google.cloud import storage as gcs

    found: set[tuple[str, str]] = set()
    for blob in gcs.Client().list_blobs(bucket_name):
        segments = blob.name.split("/")
        if len(segments) < 4:
            continue
        namespace, table, folder, filename = segments[:4]
        if folder == "data" and filename.endswith(".parquet"):
            found.add((namespace, table))
    return sorted(found)
56
+
57
+
58
def _load_gcs_table(bucket_name: str, namespace: str, table: str):
    """Fetch a GCS table's Parquet blobs and combine them into one PyArrow table.

    Blobs are looked up under ``<namespace>/<table>/data/`` and only names
    ending in ``.parquet`` are used. Each blob is downloaded fully into
    memory before being parsed.

    Returns:
        The single table when there is one blob, the concatenation when
        there are several, or ``None`` when no matching blob exists.
    """
    import io
    import pyarrow as pa
    import pyarrow.parquet as pq
    from google.cloud import storage as gcs

    bucket = gcs.Client().bucket(bucket_name)
    parquet_blobs = []
    for blob in bucket.list_blobs(prefix=f"{namespace}/{table}/data/"):
        if blob.name.endswith(".parquet"):
            parquet_blobs.append(blob)
    if not parquet_blobs:
        return None

    pieces = [
        pq.read_table(io.BytesIO(blob.download_as_bytes()))
        for blob in parquet_blobs
    ]
    if len(pieces) == 1:
        return pieces[0]
    return pa.concat_tables(pieces)
73
+
74
+
75
def _gcs_table_key(namespace: str, table: str) -> str:
    """Return a DuckDB-safe registered name for a GCS table.

    The name is ``_gcs_<namespace>_<table>`` with every character outside
    ``[0-9A-Za-z_]`` replaced by ``_``, so the result is always usable as an
    unquoted SQL identifier. Names that are already identifier-safe are
    returned exactly as before.

    NOTE(review): distinct pairs can map to the same key (e.g. ``a-b`` vs
    ``a_b``, or ``("a_b", "c")`` vs ``("a", "b_c")``); callers that register
    several tables on one connection should be aware of this.
    """
    import re

    return re.sub(r"[^0-9A-Za-z_]", "_", f"_gcs_{namespace}_{table}")
78
+
79
+
80
def _is_write_statement(sql: str) -> bool:
    """Return True if sql is a write/DDL statement that must not be wrapped in SELECT … LIMIT.

    The decision is based on the statement's leading keyword (case
    insensitive, leading whitespace ignored) being one of
    ``_WRITE_PREFIXES``. The keyword is read as an identifier token, so
    punctuation attached directly to it — e.g. ``COPY(SELECT …) TO …`` — no
    longer hides it. Empty or whitespace-only input returns False.

    Leading SQL comments and ``WITH … INSERT`` forms are not recognised.
    """
    import re

    leading = re.match(r"\s*([A-Za-z_][A-Za-z0-9_]*)", sql)
    if leading is None:
        return False
    return leading.group(1).lower() in _WRITE_PREFIXES
84
+
85
+
86
def _iter_local_tables() -> list[tuple[str, str, Path]]:
    """Return (namespace, table, data_dir) for each local table that has Parquet data.

    The warehouse is scanned two levels deep (namespace directory, then
    table directory), both in sorted order. A table is included only when
    its ``data`` sub-directory exists and contains at least one
    ``*.parquet`` file. A missing warehouse directory yields an empty list.
    """
    root = _warehouse()
    if not root.exists():
        return []

    found = []
    for namespace_dir in sorted(root.iterdir()):
        if not namespace_dir.is_dir():
            continue
        table_dirs = (d for d in sorted(namespace_dir.iterdir()) if d.is_dir())
        for table_dir in table_dirs:
            data_dir = table_dir / "data"
            if data_dir.exists() and any(data_dir.glob("*.parquet")):
                found.append((namespace_dir.name, table_dir.name, data_dir))
    return found
102
+
103
+
104
def _resolve_table_refs(sql: str, conn, catalog: str) -> str:
    """
    Rewrite namespace.table references in sql to DuckDB-readable form.

    GCS tables (catalog=gcp) → downloaded, registered as Arrow tables in
    conn, and referenced by their (double-quoted) registered name. These
    take priority over local tables of the same name.
    Local tables → rewritten to read_parquet('<data_dir>/*.parquet'). In GCP
    mode this acts as a fallback so that local-only tables work
    transparently without an error (F-021).

    Matching is a plain text search for ``namespace.table`` on word
    boundaries; it is not SQL-aware, so an identical token inside a string
    literal or comment is rewritten too.

    Args:
        sql: the caller's SQL text.
        conn: DuckDB connection on which GCS tables are registered.
        catalog: "gcp" enables the GCS lookup; anything else is local only.

    Returns:
        The SQL text with every recognised table reference replaced.
    """
    import re

    def _ref_pattern(namespace: str, table: str):
        return re.compile(rf"\b{re.escape(namespace)}\.{re.escape(table)}\b")

    resolved = sql
    gcs_pairs: set[tuple[str, str]] = set()

    if catalog == "gcp":
        bucket = _gcs_bucket()
        if bucket:
            for namespace, table in _iter_gcs_tables(bucket):
                pattern = _ref_pattern(namespace, table)
                # Only download tables the query actually mentions.
                if not pattern.search(resolved):
                    continue
                arrow_table = _load_gcs_table(bucket, namespace, table)
                if arrow_table is None:
                    continue
                key = _gcs_table_key(namespace, table)
                conn.register(key, arrow_table)
                # Quote the identifier so keys containing characters that
                # are not valid in a bare identifier still parse.
                quoted_key = '"' + key.replace('"', '""') + '"'
                # Callable replacement: the text is inserted literally,
                # with no backslash/group-reference processing.
                resolved = pattern.sub(lambda _m, _r=quoted_key: _r, resolved)
                gcs_pairs.add((namespace, table))

    # Resolve local tables — for local catalog, or as GCP fallback for local-only tables
    for namespace, table, data_dir in _iter_local_tables():
        if (namespace, table) in gcs_pairs:
            continue
        # Double any single quote so the path stays one SQL string literal.
        glob = str(data_dir / "*.parquet").replace("'", "''")
        replacement = f"read_parquet('{glob}')"
        # A string replacement would have its backslashes interpreted by
        # re.sub (breaking Windows paths); a callable is inserted verbatim.
        resolved = _ref_pattern(namespace, table).sub(
            lambda _m, _r=replacement: _r, resolved
        )

    return resolved
139
+
140
+
141
def _table_entry(namespace, table, row_count, columns, location) -> dict[str, Any]:
    """Build one list_tables() result row."""
    return {
        "namespace": namespace,
        "table": table,
        "full_name": f"{namespace}.{table}",
        "row_count": row_count,
        "columns": columns,
        "location": location,
    }


def _summarize_relation(conn, relation: str):
    """Return (row_count, columns) for a DuckDB relation expression.

    On any failure returns ``(-1, [{"error": <message>}])`` instead of raising.
    """
    try:
        counted = conn.execute(f"SELECT COUNT(*) as n FROM {relation}").fetchone()
        row_count = counted[0] if counted else 0
        described = conn.execute(
            f"DESCRIBE SELECT * FROM {relation} LIMIT 0"
        ).fetchall()
        columns = [{"name": c[0], "type": c[1]} for c in described]
    except Exception as exc:
        return -1, [{"error": str(exc)}]
    return row_count, columns


def _local_table_entry(namespace, table, data_dir) -> dict[str, Any]:
    """Summarize one local Parquet table on its own short-lived DuckDB connection."""
    import duckdb

    # Double any single quote so the path stays one SQL string literal.
    glob = str(data_dir / "*.parquet").replace("'", "''")
    try:
        conn = duckdb.connect()
        try:
            row_count, columns = _summarize_relation(conn, f"read_parquet('{glob}')")
        finally:
            # Close on the error path too.
            conn.close()
    except Exception as exc:
        row_count, columns = -1, [{"error": str(exc)}]
    return _table_entry(namespace, table, row_count, columns, "local")


def list_tables() -> list[dict[str, Any]]:
    """
    Return all tables in the warehouse with column schemas and row counts.

    Each row is a dict with ``namespace``, ``table``, ``full_name``,
    ``row_count``, ``columns`` (list of ``{"name", "type"}``) and
    ``location``. When a table cannot be inspected, ``row_count`` is -1 and
    ``columns`` is ``[{"error": <message>}]``.

    When catalog=gcp, returns BOTH GCS tables (location='gcs') and local-only
    tables that have not been synced to GCS (location='local'). Every GCS
    table is downloaded in full in order to be counted and described.
    Otherwise only local tables are returned.
    """
    import duckdb

    if _catalog() != "gcp":
        return [
            _local_table_entry(namespace, table, data_dir)
            for namespace, table, data_dir in _iter_local_tables()
        ]

    results: list[dict[str, Any]] = []
    gcs_pairs: set[tuple[str, str]] = set()

    bucket = _gcs_bucket()
    if bucket:
        conn = duckdb.connect()
        try:
            for namespace, table in _iter_gcs_tables(bucket):
                arrow_table = _load_gcs_table(bucket, namespace, table)
                if arrow_table is None:
                    continue
                key = _gcs_table_key(namespace, table)
                try:
                    conn.register(key, arrow_table)
                except Exception as exc:
                    row_count, columns = -1, [{"error": str(exc)}]
                else:
                    # Quote the identifier so any key parses as one name.
                    quoted_key = '"' + key.replace('"', '""') + '"'
                    row_count, columns = _summarize_relation(conn, quoted_key)
                results.append(_table_entry(namespace, table, row_count, columns, "gcs"))
                gcs_pairs.add((namespace, table))
        finally:
            # Also release the connection when a download raises.
            conn.close()

    # Also list local-only tables not yet in GCS (F-018)
    for namespace, table, data_dir in _iter_local_tables():
        if (namespace, table) in gcs_pairs:
            continue
        results.append(_local_table_entry(namespace, table, data_dir))

    return results
236
+
237
+
238
def query(sql: str) -> list[dict[str, Any]]:
    """
    Run a SQL query against the warehouse.

    Table references use the form namespace.table — e.g.
        SELECT neighborhood, AVG(CAST(price AS DOUBLE)) as avg_price
        FROM craigslist_apts.craigslist_apts
        GROUP BY 1
        ORDER BY 2 DESC

    Write statements (COPY, CREATE, INSERT, etc.) are executed as-is without
    being wrapped in SELECT … LIMIT. Any other statement that does not
    itself contain the keyword LIMIT is wrapped so that at most 500 rows
    come back; a caller-supplied LIMIT is left untouched, so it decides the
    row count in that case. A trailing ``;`` is removed before wrapping.

    The LIMIT detection is a whole-word text search on the caller's SQL, not
    a parse: the word inside a string literal or comment also disables the
    automatic cap.

    Returns:
        One dict per result row, keyed by column name.

    Raises:
        Whatever DuckDB raises for invalid SQL; the connection is closed
        before the error propagates.
    """
    import re

    import duckdb

    catalog = _catalog()
    conn = duckdb.connect()
    try:
        resolved = _resolve_table_refs(sql, conn, catalog)

        # F-019: skip auto-LIMIT for write/DDL statements.
        # Whole-word match on the caller's text, so identifiers such as
        # `rate_limit` or a warehouse path containing "limit" do not
        # switch the cap off.
        has_limit = re.search(r"\blimit\b", sql, re.IGNORECASE) is not None
        if not _is_write_statement(resolved) and not has_limit:
            # A trailing ';' would end up inside the sub-select and fail to parse.
            inner = resolved.rstrip("; \t\r\n")
            resolved = f"SELECT * FROM ({inner}) _q LIMIT {_MAX_ROWS}"

        rows = conn.execute(resolved).fetchall()
        # description can be None for statements that produce no result set.
        cols = [d[0] for d in (conn.description or [])]
    finally:
        conn.close()
    return [dict(zip(cols, row)) for row in rows]
273
+
274
+
275
def materialize_model(sql: str, namespace: str, table: str) -> dict[str, Any]:
    """
    Run sql and write the result as a new warehouse table at namespace/table.

    Writes locally to warehouse/<namespace>/<table>/data/part-001.parquet,
    creating the directory if needed and overwriting an existing file of
    that name. If catalog=gcp and a warehouse bucket is configured, the same
    file is also uploaded to <namespace>/<table>/data/part-001.parquet in
    that bucket, so the model is immediately queryable via query() and
    visible in list_tables().

    NOTE(review): other *.parquet files already present in the data
    directory are left in place and will be read together with the new one.

    Returns:
        A dict with ``ok`` (always True), ``namespace``, ``table``,
        ``row_count`` and ``location`` — the ``gs://`` URI when uploaded,
        otherwise the local file path.

    Raises:
        Any error from DuckDB, PyArrow or the GCS client; the DuckDB
        connection is closed before the error propagates.
    """
    import duckdb
    import pyarrow.parquet as pq

    catalog = _catalog()
    conn = duckdb.connect()
    try:
        resolved = _resolve_table_refs(sql, conn, catalog)
        arrow_result = conn.execute(resolved).arrow()
        if hasattr(arrow_result, "read_all"):
            # RecordBatchReader → Table; must be drained while conn is open.
            arrow_result = arrow_result.read_all()
    finally:
        # Close on failure as well, not only on the success path.
        conn.close()
    row_count = arrow_result.num_rows

    out_dir = _warehouse() / namespace / table / "data"
    out_dir.mkdir(parents=True, exist_ok=True)
    out_path = out_dir / "part-001.parquet"
    pq.write_table(arrow_result, out_path)

    location = str(out_path)

    if catalog == "gcp":
        bucket_name = _gcs_bucket()
        if bucket_name:
            from google.cloud import storage as gcs_storage

            blob_name = f"{namespace}/{table}/data/part-001.parquet"
            blob = gcs_storage.Client().bucket(bucket_name).blob(blob_name)
            blob.upload_from_filename(str(out_path))
            location = f"gs://{bucket_name}/{blob_name}"

    return {
        "ok": True,
        "namespace": namespace,
        "table": table,
        "row_count": row_count,
        "location": location,
    }
dcf/writer/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ from .iceberg import write
2
+
3
+ __all__ = ["write"]