npm - opencode-skills-collection - Versions diffs - 2.0.0 → 2.0.2 - Mend

opencode-skills-collection 2.0.0 → 2.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (90) hide show

package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_lineage.py ADDED Viewed

@@ -0,0 +1,214 @@
+"""
+BigQuery — Lineage Collection (collect only)
+=============================================
+Collects table-level lineage from two sources:
+  1. INFORMATION_SCHEMA.SCHEMATA_LINKS — cross-project dataset shares (per region)
+  2. Job query history — SQL parsing for CREATE TABLE AS SELECT and INSERT INTO
+     SELECT patterns to derive source->destination relationships.
+Writes the collected lineage edges to a JSON manifest file.
+Can be run standalone via CLI or imported (use the ``collect()`` function).
+Substitution points (search for "← SUBSTITUTE"):
+  - BIGQUERY_PROJECT_ID   : GCP project ID to collect from
+  - BIGQUERY_REGION       : BigQuery region for INFORMATION_SCHEMA queries (e.g. "us", "eu")
+  - LOOKBACK_HOURS        : how far back to scan job history (default 24 h)
+Prerequisites:
+  pip install google-cloud-bigquery
+"""
+from __future__ import annotations
+import argparse
+import json
+import logging
+import os
+import re
+from datetime import datetime, timedelta, timezone
+from google.cloud import bigquery
+logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
+log = logging.getLogger(__name__)
+# Value written to the manifest's "resource_type" field by collect().
+RESOURCE_TYPE = "bigquery"
+LOOKBACK_HOURS = int(os.getenv("LOOKBACK_HOURS", "24"))  # ← SUBSTITUTE: adjust lookback window
+# Regex patterns to detect CTAS and INSERT INTO SELECT in BigQuery SQL.
+# The optional IF NOT EXISTS clause is consumed explicitly; without that the
+# keyword "IF" itself would be captured as the destination name.
+# MATERIALIZED VIEW is accepted alongside TABLE and VIEW.
+_CTAS_PATTERN = re.compile(
+    r"CREATE\s+(?:OR\s+REPLACE\s+)?(?:TABLE|(?:MATERIALIZED\s+)?VIEW)\s+"
+    r"(?:IF\s+NOT\s+EXISTS\s+)?`?(?P<dest>[\w.\-]+)`?"
+    r".*?(?:AS\s+)?SELECT\b",
+    re.IGNORECASE | re.DOTALL,
+)
+# INSERT [INTO] <dest> ... SELECT; the destination is captured as "dest".
+_INSERT_PATTERN = re.compile(
+    r"INSERT\s+(?:INTO\s+)?`?(?P<dest>[\w.\-]+)`?.*?SELECT\b",
+    re.IGNORECASE | re.DOTALL,
+)
+# Any three-part dotted identifier (project.dataset.table), optionally wrapped
+# in a single pair of backticks. Purely textual, so it can also match dotted
+# expressions that are not table names.
+_TABLE_REF_PATTERN = re.compile(r"`?([\w\-]+\.[\w\-]+\.[\w\-]+)`?", re.IGNORECASE)
+def _parse_full_name(full_name: str) -> tuple[str, str, str]:
+    """Break a dotted table name into ``(project, dataset, table)``.
+    Backticks are removed before splitting on ".". A two-part name yields an
+    empty project. Any other number of parts (one, or more than three) yields
+    only the first part as the table, with empty project and dataset.
+    """
+    segments = full_name.replace("`", "").split(".")
+    count = len(segments)
+    if count == 3:
+        project, dataset, table = segments
+        return project, dataset, table
+    if count == 2:
+        dataset, table = segments
+        return "", dataset, table
+    # One part, or an unexpected number of parts: keep only the leading one.
+    return "", "", segments[0]
+def _collect_schema_link_lineage(
+    bq_client: bigquery.Client,
+    project_id: str,
+    region: str,
+) -> list[dict]:
+    """Collect cross-project lineage from INFORMATION_SCHEMA.SCHEMATA_LINKS.
+    Args:
+        bq_client: Client used to run the INFORMATION_SCHEMA query.
+        project_id: Project whose SCHEMATA_LINKS view is queried.
+        region: BigQuery location, given either bare ("us", "eu") or already
+            in qualifier form ("region-us").
+    Returns:
+        One dataset-level edge per row, with "*" as the table on both sides.
+        If the query fails, a warning is logged and the edges gathered so far
+        (possibly none) are returned; the failure is not raised.
+    """
+    # BigQuery addresses regional INFORMATION_SCHEMA views through a
+    # `region-<location>` qualifier. A bare location such as "us" would be
+    # read as a dataset name and the query would fail, so add the prefix
+    # unless the caller already supplied it.
+    region_qualifier = region.strip().lower()
+    if not region_qualifier.startswith("region-"):
+        region_qualifier = f"region-{region_qualifier}"
+    query = f"""
+        SELECT
+            CATALOG_NAME            AS source_project,
+            SCHEMA_NAME             AS source_dataset,
+            LINKED_SCHEMA_CATALOG_NAME AS destination_project,
+            LINKED_SCHEMA_NAME      AS destination_dataset
+        FROM `{project_id}`.`{region_qualifier}`.INFORMATION_SCHEMA.SCHEMATA_LINKS
+    """  # ← SUBSTITUTE: update project_id and region as needed
+    edges: list[dict] = []
+    try:
+        for row in bq_client.query(query).result():
+            edges.append(
+                {
+                    "destination": {
+                        "database": row.destination_project,
+                        "schema": row.destination_dataset,
+                        "table": "*",
+                    },
+                    "sources": [
+                        {
+                            "database": row.source_project,
+                            "schema": row.source_dataset,
+                            "table": "*",
+                        }
+                    ],
+                }
+            )
+    except Exception:
+        # Deliberately best-effort: dataset-share lineage is optional, so a
+        # failure here must not abort the query-derived collection.
+        log.warning("SCHEMATA_LINKS query failed — skipping dataset-share lineage", exc_info=True)
+    return edges
+def _collect_query_lineage(
+    bq_client: bigquery.Client,
+    project_id: str,
+    lookback_hours: int,
+) -> list[dict]:
+    """Derive lineage by parsing CTAS/INSERT patterns in job query history.
+    Args:
+        bq_client: Client used to list jobs.
+        project_id: Project whose jobs are listed; also used as the
+            destination project when the SQL names the destination without one.
+        lookback_hours: Size of the job-creation window ending now (UTC).
+    Returns:
+        One edge per matching job, each with a "destination" and a
+        de-duplicated, order-preserving list of "sources". Jobs without SQL,
+        failed jobs, jobs without a CTAS/INSERT match, and jobs with no
+        three-part source references are skipped.
+    """
+    end_dt = datetime.now(timezone.utc)
+    start_dt = end_dt - timedelta(hours=lookback_hours)
+    edges: list[dict] = []
+    for job in bq_client.list_jobs(
+        project=project_id,
+        all_users=True,
+        min_creation_time=start_dt,
+        max_creation_time=end_dt,
+    ):
+        sql: str = getattr(job, "query", None) or ""
+        if not sql.strip():
+            continue
+        # A job that ended in error wrote nothing, so it carries no lineage.
+        if getattr(job, "error_result", None):
+            continue
+        dest_match = _CTAS_PATTERN.search(sql) or _INSERT_PATTERN.search(sql)
+        if not dest_match:
+            continue
+        dest_project, dest_dataset, dest_table = _parse_full_name(dest_match.group("dest"))
+        if not dest_table:
+            continue
+        # Compare on resolved (project, dataset, table) so that a destination
+        # written as "dataset.table" is still recognised when the same table
+        # appears fully qualified among the references.
+        dest_key = (dest_project or project_id, dest_dataset, dest_table)
+        # dict keys give order-preserving de-duplication of the references.
+        source_keys = dict.fromkeys(
+            _parse_full_name(m.group(1)) for m in _TABLE_REF_PATTERN.finditer(sql)
+        )
+        source_keys.pop(dest_key, None)
+        if not source_keys:
+            continue
+        edges.append(
+            {
+                "destination": {
+                    "database": dest_key[0],
+                    "schema": dest_dataset,
+                    "table": dest_table,
+                },
+                "sources": [
+                    {"database": p, "schema": d, "table": t} for p, d, t in source_keys
+                ],
+            }
+        )
+    return edges
+def collect(
+    project_id: str,
+    region: str = "us",
+    lookback_hours: int = LOOKBACK_HOURS,
+    output_file: str = "lineage_output.json",
+) -> dict:
+    """
+    Gather schema-link and query-derived lineage edges for ``project_id``,
+    dump them as a JSON manifest to ``output_file``, and return that manifest.
+    """
+    client = bigquery.Client(project=project_id)
+    log.info("Collecting lineage from project %s ...", project_id)
+    link_edges = _collect_schema_link_lineage(client, project_id, region)
+    sql_edges = _collect_query_lineage(client, project_id, lookback_hours)
+    link_count = len(link_edges)
+    sql_count = len(sql_edges)
+    combined = [*link_edges, *sql_edges]
+    log.info(
+        "Collected %d lineage edges (%d schema-link, %d query-derived)",
+        len(combined), link_count, sql_count,
+    )
+    # Key order is kept as-is because it determines the layout of the JSON file.
+    manifest = {
+        "resource_type": RESOURCE_TYPE,
+        "collected_at": datetime.now(timezone.utc).isoformat(),
+        "schema_link_edges": link_count,
+        "query_derived_edges": sql_count,
+        "edges": combined,
+    }
+    with open(output_file, "w") as out:
+        json.dump(manifest, out, indent=2)
+    log.info("Lineage manifest written to %s", output_file)
+    return manifest
+def main() -> None:
+    """CLI entry point: read options (with env-var defaults) and run ``collect``."""
+    cli = argparse.ArgumentParser(
+        description="Collect BigQuery lineage and write to a manifest file",
+    )
+    cli.add_argument("--project-id", default=os.getenv("BIGQUERY_PROJECT_ID"))  # ← SUBSTITUTE
+    cli.add_argument("--region", default=os.getenv("BIGQUERY_REGION", "us"))    # ← SUBSTITUTE
+    cli.add_argument("--lookback-hours", type=int, default=LOOKBACK_HOURS)
+    cli.add_argument("--output-file", default="lineage_output.json")
+    opts = cli.parse_args()
+    # Only the project ID has no usable fallback; everything else has a default.
+    missing = [name for name in ["project_id"] if getattr(opts, name) is None]
+    if missing:
+        cli.error(f"Missing required arguments/env vars: {missing}")
+    collect(
+        project_id=opts.project_id,
+        region=opts.region,
+        lookback_hours=opts.lookback_hours,
+        output_file=opts.output_file,
+    )
+if __name__ == "__main__":
+    main()

package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_metadata.py ADDED Viewed

@@ -0,0 +1,160 @@
+"""
+BigQuery — Metadata Collection (collect only)
+==============================================
+Collects table schemas, row counts, byte sizes, and descriptions from all
+datasets in a BigQuery project and writes them to a JSON manifest file.
+Can be run standalone via CLI or imported (use the ``collect()`` function).
+Substitution points (search for "← SUBSTITUTE"):
+  - BIGQUERY_PROJECT_ID   : GCP project ID to collect from
+  - GOOGLE_APPLICATION_CREDENTIALS : path to service-account JSON key file
+  - DATASET_EXCLUSIONS    : datasets to skip (informational / system datasets)
+Prerequisites:
+  pip install google-cloud-bigquery
+"""
+from __future__ import annotations
+import argparse
+import json
+import logging
+import os
+from datetime import datetime, timezone
+from google.cloud import bigquery
+# Configure root logging at import time so both CLI and imported use emit INFO logs.
+logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
+log = logging.getLogger(__name__)
+# Value written to the manifest's "resource_type" field by collect().
+RESOURCE_TYPE = "bigquery"
+# Datasets to skip — add any internal / system datasets here.
+# Entries are matched as substrings of the dataset ID (see _collect_assets),
+# so a short entry excludes every dataset whose ID contains it.
+DATASET_EXCLUSIONS = {  # ← SUBSTITUTE: add datasets to exclude
+    "_bqc_",
+    "INFORMATION_SCHEMA",
+}
+# BigQuery type → Monte Carlo canonical type.
+# Looked up by map_bq_type() after upper-casing; names missing from this map
+# are passed through upper-cased rather than rejected.
+BQ_TYPE_MAP: dict[str, str] = {
+    "INT64": "INTEGER",
+    "INTEGER": "INTEGER",
+    "FLOAT64": "FLOAT",
+    "FLOAT": "FLOAT",
+    "BOOL": "BOOLEAN",
+    "BOOLEAN": "BOOLEAN",
+    "STRING": "VARCHAR",
+    "BYTES": "BINARY",
+    "DATE": "DATE",
+    "DATETIME": "DATETIME",
+    "TIMESTAMP": "TIMESTAMP",
+    "TIME": "TIME",
+    "NUMERIC": "DECIMAL",
+    "BIGNUMERIC": "DECIMAL",
+    "RECORD": "STRUCT",
+    "STRUCT": "STRUCT",
+    # NOTE(review): REPEATED looks like a field *mode* rather than a field
+    # type; _collect_assets only passes field.field_type, so confirm whether
+    # this entry is ever hit.
+    "REPEATED": "ARRAY",
+    "JSON": "JSON",
+    "GEOGRAPHY": "GEOGRAPHY",
+}
+def map_bq_type(bq_type: str) -> str:
+    """Translate a BigQuery type name to its canonical form, ignoring case.
+    Names without an entry in ``BQ_TYPE_MAP`` are returned upper-cased.
+    """
+    normalized = bq_type.upper()
+    if normalized in BQ_TYPE_MAP:
+        return BQ_TYPE_MAP[normalized]
+    return normalized
+def _collect_assets(bq_client: bigquery.Client, project_id: str) -> list[dict]:
+    """Collect table metadata from BigQuery and return as a list of dicts.
+    Args:
+        bq_client: Client used to list datasets and tables and fetch each table.
+        project_id: Project whose datasets are listed; also recorded as each
+            asset's "database".
+    Returns:
+        One dict per table with name, database, schema, type, description,
+        fields, volume (row/byte counts) and freshness (last modified time).
+        Datasets whose ID contains any DATASET_EXCLUSIONS entry are skipped.
+    """
+    assets: list[dict] = []
+    # List the requested project explicitly so the "database" label below
+    # always names the project the tables were actually read from.
+    for dataset_item in bq_client.list_datasets(project=project_id):
+        dataset_id = dataset_item.dataset_id
+        if any(exc in dataset_id for exc in DATASET_EXCLUSIONS):
+            log.info("Skipping dataset %s", dataset_id)
+            continue
+        # Use the references carried by the list items instead of the
+        # deprecated Client.dataset() helper.
+        for table_item in bq_client.list_tables(dataset_item.reference):
+            # The list item is only a summary; fetch the full table for
+            # schema, counts and timestamps.
+            table = bq_client.get_table(table_item.reference)
+            fields = [
+                {
+                    "name": field.name,
+                    "type": map_bq_type(field.field_type),
+                    "description": field.description or None,
+                }
+                for field in table.schema
+            ]
+            asset = {
+                "name": table.table_id,
+                "database": project_id,  # ← SUBSTITUTE: use project or dataset as database
+                "schema": dataset_id,
+                # Anything that is not exactly "VIEW" is reported as "TABLE".
+                "type": "VIEW" if table.table_type == "VIEW" else "TABLE",
+                "description": table.description or None,
+                "fields": fields,
+                "volume": {
+                    "row_count": table.num_rows,
+                    "byte_count": table.num_bytes,
+                },
+                "freshness": {
+                    "last_updated_time": table.modified.isoformat() if table.modified else None,
+                },
+            }
+            assets.append(asset)
+            log.info("Queued %s.%s.%s", project_id, dataset_id, table.table_id)
+    return assets
+def collect(
+    project_id: str,
+    output_file: str = "metadata_output.json",
+) -> dict:
+    """
+    Gather table metadata for ``project_id``, dump it as a JSON manifest to
+    ``output_file``, and return that manifest.
+    """
+    client = bigquery.Client(project=project_id)  # ← SUBSTITUTE: adjust auth if needed
+    log.info("Collecting metadata from project %s ...", project_id)
+    collected = _collect_assets(client, project_id)
+    log.info("Collected %d asset(s).", len(collected))
+    # Key order is kept as-is because it determines the layout of the JSON file.
+    manifest = {
+        "resource_type": RESOURCE_TYPE,
+        "collected_at": datetime.now(timezone.utc).isoformat(),
+        "assets": collected,
+    }
+    with open(output_file, "w") as out:
+        json.dump(manifest, out, indent=2)
+    log.info("Asset manifest written to %s", output_file)
+    return manifest
+def main() -> None:
+    """CLI entry point: read options (with env-var defaults) and run ``collect``.
+    Exits through ``parser.error`` when no project ID is given on the command
+    line or in ``BIGQUERY_PROJECT_ID``.
+    """
+    parser = argparse.ArgumentParser(
+        description="Collect BigQuery metadata and write to a manifest file",
+    )
+    parser.add_argument("--project-id", default=os.getenv("BIGQUERY_PROJECT_ID"))  # ← SUBSTITUTE
+    parser.add_argument("--output-file", default="metadata_output.json")
+    args = parser.parse_args()
+    # Name the required options explicitly (as the lineage and query-log
+    # scripts do) so that adding an optional argument whose default is None
+    # can never be reported as missing.
+    required = ["project_id"]
+    missing = [k for k in required if getattr(args, k) is None]
+    if missing:
+        parser.error(f"Missing required arguments/env vars: {missing}")
+    collect(
+        project_id=args.project_id,
+        output_file=args.output_file,
+    )
+if __name__ == "__main__":
+    main()

package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_query_logs.py ADDED Viewed

@@ -0,0 +1,164 @@
+"""
+BigQuery — Query Log Collection (collect only)
+================================================
+Collects completed job query logs from BigQuery job history and writes them to
+a JSON manifest file for later push to Monte Carlo.
+Can be run standalone via CLI or imported (use the ``collect()`` function).
+Substitution points (search for "← SUBSTITUTE"):
+  - BIGQUERY_PROJECT_ID   : GCP project ID to collect query logs from
+  - GOOGLE_APPLICATION_CREDENTIALS : path to service-account JSON key file
+  - LOOKBACK_HOURS        : how many hours back to collect (default 25, skip last 1 h)
+  - STATEMENT_TYPE_FILTER : restrict to specific statement types, or leave empty for all
+  - MAX_JOBS              : cap on number of jobs to collect per run
+Prerequisites:
+  pip install google-cloud-bigquery
+"""
+from __future__ import annotations
+import argparse
+import json
+import logging
+import os
+from datetime import datetime, timedelta, timezone
+from google.cloud import bigquery
+# Configure root logging at import time so both CLI and imported use emit INFO logs.
+logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
+log = logging.getLogger(__name__)
+# Value written to the manifest's "log_type" field by collect().
+LOG_TYPE = "bigquery"
+# Defaults for the collection window used by collect(): the window ends
+# LOOKBACK_LAG_HOURS before the current time, and LOOKBACK_HOURS sets how far
+# back it reaches. Both are read from the environment once, at import time.
+# The lag avoids collecting in-flight jobs that have not yet completed.
+LOOKBACK_HOURS: int = int(os.getenv("LOOKBACK_HOURS", "25"))        # ← SUBSTITUTE
+LOOKBACK_LAG_HOURS: int = int(os.getenv("LOOKBACK_LAG_HOURS", "1")) # ← SUBSTITUTE
+# Limit statement types — e.g. ["SELECT", "CREATE_TABLE_AS_SELECT", "INSERT"]
+# Set to an empty list to collect all statement types.
+# Compared by exact membership against each job's statement_type.
+STATEMENT_TYPE_FILTER: list[str] = []  # ← SUBSTITUTE
+# Maximum number of jobs to collect in a single run to avoid runaway costs.
+# Counts entries actually kept (after filtering), not jobs listed.
+MAX_JOBS: int = int(os.getenv("MAX_JOBS", "10000"))  # ← SUBSTITUTE
+def _safe_isoformat(dt: datetime | None) -> str | None:
+    """Render ``dt`` as an ISO-8601 string, or return ``None`` when it is ``None``.
+    A naive datetime is labelled as UTC before formatting; an aware one is
+    formatted with its own offset.
+    """
+    if dt is None:
+        return None
+    aware = dt if dt.tzinfo is not None else dt.replace(tzinfo=timezone.utc)
+    return aware.isoformat()
+def _collect_query_logs(
+    bq_client: bigquery.Client,
+    project_id: str,
+    start_dt: datetime,
+    end_dt: datetime,
+) -> list[dict]:
+    """List jobs created between ``start_dt`` and ``end_dt`` and return one dict per kept job.
+    Jobs without SQL text are dropped, as are jobs whose statement type is not
+    in a non-empty ``STATEMENT_TYPE_FILTER``. Collection stops once
+    ``MAX_JOBS`` entries have been kept.
+    """
+    collected: list[dict] = []
+    log.info(
+        "Listing jobs for project=%s from %s to %s",
+        project_id, start_dt.isoformat(), end_dt.isoformat(),
+    )
+    job_iter = bq_client.list_jobs(
+        project=project_id,
+        all_users=True,
+        min_creation_time=start_dt,
+        max_creation_time=end_dt,
+    )
+    for job in job_iter:
+        # Non-query jobs expose no (or empty) SQL text and are ignored.
+        query_text: str = getattr(job, "query", None) or ""
+        if not query_text.strip():
+            continue
+        stmt_type: str = getattr(job, "statement_type", None) or ""
+        if STATEMENT_TYPE_FILTER:
+            if stmt_type not in STATEMENT_TYPE_FILTER:
+                continue  # ← SUBSTITUTE: adjust filter as needed
+        bytes_billed: int | None = getattr(job, "total_bytes_billed", None)
+        record = {
+            "query_id": job.job_id,
+            "query_text": query_text,
+            "start_time": _safe_isoformat(getattr(job, "created", None)),
+            "end_time": _safe_isoformat(getattr(job, "ended", None)),
+            "user": getattr(job, "user_email", None),
+            "total_bytes_billed": bytes_billed,
+            "statement_type": stmt_type or None,
+        }
+        collected.append(record)
+        if len(collected) < MAX_JOBS:
+            continue
+        log.warning("Reached MAX_JOBS=%d — stopping early", MAX_JOBS)
+        break
+    return collected
+def collect(
+    project_id: str,
+    lookback_hours: int = LOOKBACK_HOURS,
+    lookback_lag_hours: int = LOOKBACK_LAG_HOURS,
+    output_file: str = "query_logs_output.json",
+) -> dict:
+    """
+    Connect to BigQuery, collect query logs, and write a JSON manifest.
+    The collection window is
+    ``[now - lookback_hours, now - lookback_lag_hours]`` in UTC.
+    Args:
+        project_id: Project whose job history is read.
+        lookback_hours: How far back from now the window starts.
+        lookback_lag_hours: How far back from now the window ends.
+        output_file: Path the JSON manifest is written to.
+    Returns:
+        The manifest dict that was written.
+    Raises:
+        ValueError: If ``lookback_hours`` is not greater than
+            ``lookback_lag_hours``, which would leave an empty window.
+    """
+    # Reject an empty or inverted window up front, before any client is
+    # created, so a misconfiguration cannot silently produce an empty manifest.
+    if lookback_hours <= lookback_lag_hours:
+        raise ValueError(
+            f"lookback_hours ({lookback_hours}) must be greater than "
+            f"lookback_lag_hours ({lookback_lag_hours})"
+        )
+    bq_client = bigquery.Client(project=project_id)  # ← SUBSTITUTE: adjust auth if needed
+    # Measure both bounds from the same instant. Deriving the start from the
+    # lagged end would push the window back by an extra lookback_lag_hours.
+    now = datetime.now(timezone.utc)
+    end_dt = now - timedelta(hours=lookback_lag_hours)
+    start_dt = now - timedelta(hours=lookback_hours)
+    entries = _collect_query_logs(bq_client, project_id, start_dt, end_dt)
+    log.info("Collected %d query log entries.", len(entries))
+    manifest = {
+        "log_type": LOG_TYPE,
+        "collected_at": datetime.now(timezone.utc).isoformat(),
+        "window_start": start_dt.isoformat(),
+        "window_end": end_dt.isoformat(),
+        "query_log_count": len(entries),
+        "queries": entries,
+    }
+    with open(output_file, "w") as fh:
+        json.dump(manifest, fh, indent=2)
+    log.info("Query log manifest written to %s", output_file)
+    return manifest
+def main() -> None:
+    """CLI entry point: read options (with env-var defaults) and run ``collect``."""
+    cli = argparse.ArgumentParser(
+        description="Collect BigQuery query logs and write to a manifest file",
+    )
+    cli.add_argument("--project-id", default=os.getenv("BIGQUERY_PROJECT_ID"))  # ← SUBSTITUTE
+    cli.add_argument("--lookback-hours", type=int, default=LOOKBACK_HOURS)
+    cli.add_argument("--lookback-lag-hours", type=int, default=LOOKBACK_LAG_HOURS)
+    cli.add_argument("--output-file", default="query_logs_output.json")
+    opts = cli.parse_args()
+    # Only the project ID has no usable fallback; everything else has a default.
+    missing = [name for name in ["project_id"] if getattr(opts, name) is None]
+    if missing:
+        cli.error(f"Missing required arguments/env vars: {missing}")
+    collect(
+        project_id=opts.project_id,
+        lookback_hours=opts.lookback_hours,
+        lookback_lag_hours=opts.lookback_lag_hours,
+        output_file=opts.output_file,
+    )
+if __name__ == "__main__":
+    main()