PyPI - data-collection-framework - Versions diffs - 0.1.0__py3-none-any.whl - Mend

data-collection-framework 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (44) hide show

data_collection_framework-0.1.0.dist-info/METADATA +19 -0
data_collection_framework-0.1.0.dist-info/RECORD +44 -0
data_collection_framework-0.1.0.dist-info/WHEEL +5 -0
data_collection_framework-0.1.0.dist-info/entry_points.txt +2 -0
data_collection_framework-0.1.0.dist-info/top_level.txt +1 -0
dcf/__init__.py +4 -0
dcf/cli.py +841 -0
dcf/config/__init__.py +4 -0
dcf/config/loader.py +77 -0
dcf/config/models.py +240 -0
dcf/engine/__init__.py +6 -0
dcf/engine/fetcher.py +118 -0
dcf/engine/iterator.py +96 -0
dcf/engine/projector.py +56 -0
dcf/engine/runner.py +90 -0
dcf/engine/transforms.py +41 -0
dcf/gcp/__init__.py +0 -0
dcf/gcp/_collector_utils.py +87 -0
dcf/gcp/auth.py +1 -0
dcf/gcp/batch_deploy.py +548 -0
dcf/gcp/bootstrap.py +131 -0
dcf/gcp/gcloud.py +42 -0
dcf/gcp/terraform.py +151 -0
dcf/infra/modules/batch_collector/gcp/airflow/main.tf +194 -0
dcf/infra/modules/batch_collector/gcp/airflow/outputs.tf +9 -0
dcf/infra/modules/batch_collector/gcp/airflow/variables.tf +52 -0
dcf/infra/modules/batch_collector/gcp/main.tf +70 -0
dcf/infra/modules/batch_collector/gcp/outputs.tf +4 -0
dcf/infra/modules/batch_collector/gcp/variables.tf +40 -0
dcf/infra/modules/batch_collector/local/airflow/main.tf +64 -0
dcf/infra/modules/batch_collector/local/airflow/outputs.tf +9 -0
dcf/infra/modules/batch_collector/local/airflow/variables.tf +59 -0
dcf/infra/modules/batch_collector/local/main.tf +32 -0
dcf/infra/modules/batch_collector/local/outputs.tf +4 -0
dcf/infra/modules/batch_collector/local/variables.tf +25 -0
dcf/infra/templates/airflow.Dockerfile.tftpl +6 -0
dcf/infra/templates/batch_collector.Dockerfile.tftpl +14 -0
dcf/infra/templates/docker-compose.yml.tftpl +76 -0
dcf/local_deploy.py +756 -0
dcf/project.py +23 -0
dcf/spark_session.py +66 -0
dcf/warehouse_reader.py +323 -0
dcf/writer/__init__.py +3 -0
dcf/writer/iceberg.py +315 -0

dcf/project.py ADDED Viewed

@@ -0,0 +1,23 @@
+from __future__ import annotations
+import os
+from pathlib import Path
+def find_project_root(start: Path | None = None) -> Path:
+    """Locate the directory that holds the dcf project.
+    Lookup order:
+      1. If the DCF_PROJECT_DIR environment variable is set (non-empty), its
+         resolved path is returned as-is, without checking for project.yml.
+      2. Otherwise `start` (default: the current working directory) and each of
+         its parents are checked, nearest first, for a project.yml entry.
+    Raises RuntimeError when neither source yields a project root.
+    """
+    override = os.environ.get("DCF_PROJECT_DIR")
+    if override:
+        return Path(override).resolve()
+    origin = (start if start else Path.cwd()).resolve()
+    candidates = (origin, *origin.parents)
+    root = next((d for d in candidates if (d / "project.yml").exists()), None)
+    if root is None:
+        raise RuntimeError(
+            "No project.yml found in current directory or any parent. "
+            "Run 'dcf init' to create one, or set DCF_PROJECT_DIR."
+        )
+    return root

dcf/spark_session.py ADDED Viewed

@@ -0,0 +1,66 @@
+import contextlib
+import io
+import os
+from pathlib import Path
+import pyspark as _pyspark
+from pyspark.sql import SparkSession
+# Force PySpark to use its own bundled Spark JARs instead of any system SPARK_HOME.
+# NOTE: this runs at import time and unconditionally overwrites any SPARK_HOME
+# already present in the process environment with the installed pyspark package directory.
+os.environ['SPARK_HOME'] = str(Path(_pyspark.__file__).parent)
+@contextlib.contextmanager
+def _suppress_spark_startup_noise():
+    """Redirect fd 2 during Spark/Ivy/JVM initialisation to suppress startup noise.
+    setLogLevel only takes effect after getOrCreate() returns; Ivy and JVM warnings
+    are written to the raw fd before log4j is configured, so we redirect at the OS level.
+    Python-level writes to sys.stderr are also captured (into a throwaway buffer) for
+    the duration of the block. The original fd 2 is always restored on exit, including
+    when the body raises or when setting up the redirection itself fails.
+    """
+    saved_fd = os.dup(2)
+    try:
+        # Everything after the dup lives inside the try so saved_fd can never leak.
+        null_fd = os.open(os.devnull, os.O_WRONLY)
+        try:
+            os.dup2(null_fd, 2)
+        finally:
+            # fd 2 now refers to the null device (or dup2 failed); either way the
+            # temporary descriptor is no longer needed.
+            os.close(null_fd)
+        with contextlib.redirect_stderr(io.StringIO()):
+            yield
+    finally:
+        try:
+            os.dup2(saved_fd, 2)
+        finally:
+            os.close(saved_fd)
+def get_spark(app_name="dcf"):
+    """Return a local-mode SparkSession configured for the project's Iceberg warehouse.
+    The session runs on master local[*] and registers an Iceberg catalog named
+    `local` (Hadoop type) whose warehouse is <project root>/warehouse. Session
+    creation happens with stderr suppressed; afterwards the Spark log level is
+    set to ERROR.
+    """
+    from .project import find_project_root
+    warehouse_path = find_project_root() / "warehouse"
+    options = {
+        "spark.driver.memory": "4g",
+        "spark.driver.host": "127.0.0.1",
+        # Downloads Iceberg runtime JAR from Maven on first run; cached in ~/.ivy2
+        "spark.jars.packages": "org.apache.iceberg:iceberg-spark-runtime-4.0_2.13:1.10.1",
+        "spark.sql.extensions":
+            "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions",
+        "spark.sql.catalog.local": "org.apache.iceberg.spark.SparkCatalog",
+        "spark.sql.catalog.local.type": "hadoop",
+        "spark.sql.catalog.local.warehouse": str(warehouse_path),
+        "spark.sql.ansi.enabled": "false",
+    }
+    with _suppress_spark_startup_noise():
+        builder = SparkSession.builder.appName(app_name).master("local[*]")
+        for option, value in options.items():
+            builder = builder.config(option, value)
+        session = builder.getOrCreate()
+    session.sparkContext.setLogLevel("ERROR")
+    return session
+def drop_namespace(spark, catalog, namespace):
+    """Drop all tables in a namespace then drop the namespace itself.
+    Iceberg's Hadoop catalog doesn't support CASCADE on DROP NAMESPACE.
+    Table cleanup is best effort: a failure to list the tables (e.g. the namespace
+    does not exist) or to drop one table is logged at DEBUG level and does not stop
+    the remaining tables from being dropped. The final DROP NAMESPACE is not
+    guarded, so any error it raises propagates to the caller.
+    """
+    import logging
+    log = logging.getLogger(__name__)
+    qualified = f"{catalog}.{namespace}"
+    try:
+        tables = spark.sql(f"SHOW TABLES IN {qualified}").collect()
+    except Exception as exc:
+        log.debug("Could not list tables in %s: %s", qualified, exc)
+        tables = []
+    for row in tables:
+        try:
+            spark.sql(f"DROP TABLE IF EXISTS {qualified}.{row.tableName}")
+        except Exception as exc:
+            # Keep going so one bad table does not leave all the others behind.
+            log.debug("Could not drop table %s.%s: %s", qualified, row.tableName, exc)
+    spark.sql(f"DROP NAMESPACE IF EXISTS {qualified}")

dcf/warehouse_reader.py ADDED Viewed

@@ -0,0 +1,323 @@
+"""
+Fast warehouse querying via DuckDB.
+For catalog: local  — reads Parquet files from warehouse/{namespace}/{table}/data/*.parquet
+For catalog: gcp    — downloads Parquet blobs from GCS via google-cloud-storage,
+                      registers them as Arrow tables in DuckDB, then rewrites
+                      namespace.table references to the registered names.
+list_tables()       returns BOTH GCS and local-only tables when catalog: gcp,
+                    with a `location` field ("gcs" | "local") on each row.
+Returns at most 500 rows per query.
+"""
+from __future__ import annotations
+from pathlib import Path
+from typing import Any
+# Row cap applied to SELECT queries that get auto-wrapped in SELECT … LIMIT.
+_MAX_ROWS = 500
+# Leading keywords of statements that must NOT be wrapped in SELECT … LIMIT
+# (compared against the lower-cased first word of the statement).
+_WRITE_PREFIXES = {
+    "alter",
+    "copy",
+    "create",
+    "delete",
+    "drop",
+    "insert",
+    "update",
+}
+def _project_config() -> dict:
+    """Load project.yml from the project root and return its parsed contents.
+    Returns an empty dict when the file does not exist or contains no YAML
+    document, so callers can always use `.get(...)` on the result.
+    """
+    import yaml
+    from .project import find_project_root
+    cfg_file = find_project_root() / "project.yml"
+    if not cfg_file.exists():
+        return {}
+    # safe_load yields None for an empty (or comment-only) file; normalise to {}.
+    return yaml.safe_load(cfg_file.read_text()) or {}
+def _catalog() -> str:
+    """Return the `catalog` setting from project.yml, defaulting to "local" when the key is absent."""
+    settings = _project_config()
+    return settings.get("catalog", "local")
+def _warehouse() -> Path:
+    """Return the path of the `warehouse` directory under the project root (it may not exist yet)."""
+    from .project import find_project_root
+    project_root = find_project_root()
+    return project_root / "warehouse"
+def _gcs_bucket() -> str:
+    """Return the `gcp.warehouse_bucket` setting from project.yml, or "" when unset.
+    A missing or null `gcp` section and a missing or null `warehouse_bucket`
+    value all yield the empty string, which callers treat as "no bucket".
+    """
+    # `gcp:` with no children parses to None, so .get("gcp", {}) alone is not enough.
+    gcp_section = _project_config().get("gcp") or {}
+    return gcp_section.get("warehouse_bucket") or ""
+def _iter_gcs_tables(bucket_name: str) -> list[tuple[str, str]]:
+    """Return the sorted, de-duplicated (namespace, table) pairs found in the bucket.
+    A pair is reported when some blob is named <namespace>/<table>/data/<file>.parquet,
+    i.e. the third path segment is "data" and the fourth ends in ".parquet".
+    Every blob in the bucket is listed to build the result.
+    """
+    from google.cloud import storage as gcs
+    client = gcs.Client()
+    split_names = (blob.name.split("/") for blob in client.list_blobs(bucket_name))
+    pairs = {
+        (segments[0], segments[1])
+        for segments in split_names
+        if len(segments) >= 4 and segments[2] == "data" and segments[3].endswith(".parquet")
+    }
+    return sorted(pairs)
+def _load_gcs_table(bucket_name: str, namespace: str, table: str):
+    """Fetch every .parquet blob under <namespace>/<table>/data/ into one PyArrow table.
+    Each blob is downloaded fully into memory and parsed; multiple files are
+    concatenated. Returns None when the prefix holds no .parquet blobs.
+    """
+    import io
+    import pyarrow as pa
+    import pyarrow.parquet as pq
+    from google.cloud import storage as gcs
+    client = gcs.Client()
+    listing = client.bucket(bucket_name).list_blobs(prefix=f"{namespace}/{table}/data/")
+    parquet_blobs = []
+    for blob in listing:
+        if blob.name.endswith(".parquet"):
+            parquet_blobs.append(blob)
+    if not parquet_blobs:
+        return None
+    pieces = [pq.read_table(io.BytesIO(blob.download_as_bytes())) for blob in parquet_blobs]
+    if len(pieces) == 1:
+        return pieces[0]
+    return pa.concat_tables(pieces)
+def _gcs_table_key(namespace: str, table: str) -> str:
+    """DuckDB-safe registered name for a GCS table.
+    The key is "_gcs_<namespace>_<table>" with every character outside
+    [0-9A-Za-z_] replaced by "_", so it can be used as an unquoted identifier
+    in generated SQL. Names that are already plain identifiers are unchanged.
+    """
+    import re
+    # Without this, a name such as "my-ns" would produce an identifier that
+    # DuckDB parses as an expression (my - ns) rather than a table name.
+    return "_gcs_" + re.sub(r"[^0-9A-Za-z_]", "_", f"{namespace}_{table}")
+def _is_write_statement(sql: str) -> bool:
+    """Tell whether sql starts with a write/DDL keyword listed in _WRITE_PREFIXES.
+    Only the first whitespace-delimited word is inspected (case-insensitively);
+    an empty or whitespace-only string is not a write statement.
+    """
+    words = sql.split()
+    if not words:
+        return False
+    return words[0].lower() in _WRITE_PREFIXES
+def _iter_local_tables() -> list[tuple[str, str, Path]]:
+    """Return (namespace, table, data_dir) for each local table that has parquet data.
+    The warehouse is scanned as <warehouse>/<namespace>/<table>/data/; a table is
+    included only when its data directory directly contains at least one *.parquet
+    file. Entries are ordered by namespace directory, then table directory.
+    Returns an empty list when the warehouse directory does not exist.
+    """
+    root = _warehouse()
+    if not root.exists():
+        return []
+    found = []
+    namespace_dirs = [entry for entry in sorted(root.iterdir()) if entry.is_dir()]
+    for ns_dir in namespace_dirs:
+        table_dirs = [entry for entry in sorted(ns_dir.iterdir()) if entry.is_dir()]
+        for table_dir in table_dirs:
+            data_dir = table_dir / "data"
+            has_parquet = data_dir.exists() and any(data_dir.glob("*.parquet"))
+            if has_parquet:
+                found.append((ns_dir.name, table_dir.name, data_dir))
+    return found
+def _resolve_table_refs(sql: str, conn, catalog: str) -> str:
+    """
+    Rewrite namespace.table references in sql to DuckDB-readable form.
+    GCS tables (catalog=gcp) → loaded and registered as Arrow tables in conn, and
+    the reference is replaced by the registered name (priority over local).
+    Local tables → rewritten to read_parquet('<data_dir>/*.parquet').  In GCP mode
+    this acts as a fallback so that local-only tables work transparently without
+    an error (F-021).
+    Only tables actually referenced in sql are downloaded from GCS.  Returns the
+    rewritten SQL; conn is mutated by the registrations.
+    """
+    import re
+    def _ref_pattern(namespace: str, table: str):
+        # Whole-word match on the literal "namespace.table" text.
+        return re.compile(rf"\b{re.escape(namespace)}\.{re.escape(table)}\b")
+    resolved = sql
+    gcs_pairs: set[tuple[str, str]] = set()
+    if catalog == "gcp":
+        bucket = _gcs_bucket()
+        if bucket:
+            for namespace, table in _iter_gcs_tables(bucket):
+                gcs_pairs.add((namespace, table))
+                pattern = _ref_pattern(namespace, table)
+                if not pattern.search(resolved):
+                    continue
+                arrow_table = _load_gcs_table(bucket, namespace, table)
+                if arrow_table is None:
+                    continue
+                key = _gcs_table_key(namespace, table)
+                conn.register(key, arrow_table)
+                # Callable replacement: the text is inserted literally, with no
+                # backslash/group-reference processing by re.
+                resolved = pattern.sub(lambda _m, key=key: key, resolved)
+    # Resolve local tables — for local catalog, or as GCP fallback for local-only tables
+    for namespace, table, data_dir in _iter_local_tables():
+        if (namespace, table) in gcs_pairs:
+            continue
+        # Double any single quote so the path stays a valid SQL string literal.
+        glob = str(data_dir / "*.parquet").replace("'", "''")
+        relation = f"read_parquet('{glob}')"
+        # A plain replacement string would have its backslashes interpreted as
+        # escapes, which breaks Windows paths (e.g. "C:\Users\..." → bad escape \U).
+        resolved = _ref_pattern(namespace, table).sub(
+            lambda _m, relation=relation: relation, resolved
+        )
+    return resolved
+def _relation_stats(conn, relation: str) -> tuple[int, list[dict[str, Any]]]:
+    """Return (row_count, columns) for a DuckDB relation expression.
+    On any failure returns (-1, [{"error": <message>}]) instead of raising.
+    """
+    try:
+        counted = conn.execute(f"SELECT COUNT(*) as n FROM {relation}").fetchone()
+        described = conn.execute(f"DESCRIBE SELECT * FROM {relation} LIMIT 0").fetchall()
+    except Exception as e:
+        return -1, [{"error": str(e)}]
+    row_count = counted[0] if counted else 0
+    return row_count, [{"name": c[0], "type": c[1]} for c in described]
+def _table_entry(namespace: str, table: str, row_count: int,
+                 columns: list[dict[str, Any]], location: str) -> dict[str, Any]:
+    """Build one list_tables() result row."""
+    return {
+        "namespace": namespace,
+        "table": table,
+        "full_name": f"{namespace}.{table}",
+        "row_count": row_count,
+        "columns": columns,
+        "location": location,
+    }
+def list_tables() -> list[dict[str, Any]]:
+    """
+    Return all tables in the warehouse with column schemas and row counts.
+    When catalog=gcp, returns BOTH GCS tables (location='gcs') and local-only
+    tables that have not been synced to GCS (location='local'); GCS tables come
+    first.  For any other catalog only local tables are returned.
+    Each row has: namespace, table, full_name, row_count, columns, location.
+    A table that cannot be inspected is still listed, with row_count=-1 and
+    columns=[{"error": <message>}].
+    """
+    import duckdb
+    catalog = _catalog()
+    results: list[dict[str, Any]] = []
+    gcs_pairs: set[tuple[str, str]] = set()
+    conn = duckdb.connect()
+    try:
+        if catalog == "gcp":
+            bucket = _gcs_bucket()
+            if bucket:
+                for namespace, table in _iter_gcs_tables(bucket):
+                    arrow_table = _load_gcs_table(bucket, namespace, table)
+                    if arrow_table is None:
+                        continue
+                    key = _gcs_table_key(namespace, table)
+                    try:
+                        conn.register(key, arrow_table)
+                    except Exception as e:
+                        row_count, columns = -1, [{"error": str(e)}]
+                    else:
+                        row_count, columns = _relation_stats(conn, key)
+                    results.append(_table_entry(namespace, table, row_count, columns, "gcs"))
+                    gcs_pairs.add((namespace, table))
+        # Local tables: everything for the local catalog; for gcp, only the
+        # tables not already listed from GCS (F-018).
+        for namespace, table, data_dir in _iter_local_tables():
+            if (namespace, table) in gcs_pairs:
+                continue
+            # Double any single quote so the path stays a valid SQL string literal.
+            glob = str(data_dir / "*.parquet").replace("'", "''")
+            row_count, columns = _relation_stats(conn, f"read_parquet('{glob}')")
+            results.append(_table_entry(namespace, table, row_count, columns, "local"))
+    finally:
+        # Always release the connection, even if GCS listing/download raises.
+        conn.close()
+    return results
+def query(sql: str) -> list[dict[str, Any]]:
+    """
+    Run a SQL query against the warehouse.
+    Table references use the form  namespace.table  — e.g.
+        SELECT neighborhood, AVG(CAST(price AS DOUBLE)) as avg_price
+        FROM craigslist_apts.craigslist_apts
+        GROUP BY 1
+        ORDER BY 2 DESC
+    Write statements (COPY, CREATE, INSERT, etc.) are executed as-is without
+    being wrapped in SELECT … LIMIT.  Other statements are automatically capped
+    at 500 rows unless the caller's SQL contains the LIMIT keyword, in which
+    case the caller's own limit applies.  A trailing semicolon is tolerated.
+    Returns one dict per row, keyed by column name.  Errors from DuckDB
+    propagate to the caller; the connection is always closed.
+    """
+    import re
+    import duckdb
+    catalog = _catalog()
+    conn = duckdb.connect()
+    try:
+        resolved = _resolve_table_refs(sql, conn, catalog)
+        # Look for LIMIT as a whole word in the caller's SQL: a substring test on
+        # the rewritten text is fooled by names like credit_limit or by file paths.
+        has_limit = re.search(r"\blimit\b", sql, re.IGNORECASE) is not None
+        # F-019: skip auto-LIMIT for write/DDL statements
+        if not _is_write_statement(resolved) and not has_limit:
+            # Drop a trailing ";" — it is not valid inside the subquery parentheses.
+            inner = resolved.rstrip().rstrip(";").rstrip()
+            # Newline before ")" so a trailing "-- comment" cannot swallow it.
+            resolved = f"SELECT * FROM ({inner}\n) _q LIMIT {_MAX_ROWS}"
+        rows = conn.execute(resolved).fetchall()
+        # description can be None for statements that produce no result set.
+        cols = [d[0] for d in (conn.description or [])]
+    finally:
+        conn.close()
+    return [dict(zip(cols, row)) for row in rows]
+def materialize_model(sql: str, namespace: str, table: str) -> dict[str, Any]:
+    """
+    Run sql and write the result as a new warehouse table at namespace/table.
+    Writes locally to warehouse/<namespace>/<table>/data/part-001.parquet,
+    creating the directory if needed and overwriting an existing part-001 file.
+    If catalog=gcp and a warehouse bucket is configured, also uploads the Parquet
+    file to <namespace>/<table>/data/part-001.parquet in that bucket, which is the
+    layout this module's GCS table discovery reads.
+    Returns a dict with ok, namespace, table, row_count, and location (the gs://
+    URI when uploaded, otherwise the local file path).
+    Errors from DuckDB, the filesystem, or GCS propagate to the caller.
+    """
+    import duckdb
+    import pyarrow.parquet as pq
+    catalog = _catalog()
+    conn = duckdb.connect()
+    try:
+        resolved = _resolve_table_refs(sql, conn, catalog)
+        arrow_result = conn.execute(resolved).arrow()
+        if hasattr(arrow_result, "read_all"):
+            arrow_result = arrow_result.read_all()  # RecordBatchReader → Table
+    finally:
+        # Close even when resolution or execution fails, so the connection
+        # (and any Arrow tables registered on it) is not leaked.
+        conn.close()
+    row_count = arrow_result.num_rows
+    out_dir = _warehouse() / namespace / table / "data"
+    out_dir.mkdir(parents=True, exist_ok=True)
+    out_path = out_dir / "part-001.parquet"
+    pq.write_table(arrow_result, out_path)
+    location = str(out_path)
+    if catalog == "gcp":
+        bucket_name = _gcs_bucket()
+        if bucket_name:
+            from google.cloud import storage as gcs_storage
+            client = gcs_storage.Client()
+            gcs_bucket = client.bucket(bucket_name)
+            blob_name = f"{namespace}/{table}/data/part-001.parquet"
+            blob = gcs_bucket.blob(blob_name)
+            blob.upload_from_filename(str(out_path))
+            location = f"gs://{bucket_name}/{blob_name}"
+    return {
+        "ok": True,
+        "namespace": namespace,
+        "table": table,
+        "row_count": row_count,
+        "location": location,
+    }

dcf/writer/__init__.py ADDED Viewed

@@ -0,0 +1,3 @@
+from .iceberg import write
+# Public API of the writer package: only `write` (re-exported from .iceberg).
+__all__ = ["write"]