npm - opencode-skills-collection - Versions diffs - 2.0.0 → 2.0.2 - Mend

opencode-skills-collection 2.0.0 → 2.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (90) hide show

package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/push_query_logs.py ADDED Viewed

@@ -0,0 +1,208 @@
+"""
+BigQuery Iceberg — Query Log Push (push only)
+=============================================
+Reads a JSON manifest produced by collect_query_logs.py and pushes query
+log entries to Monte Carlo using the pycarlo SDK's IngestionService.
+Uses dateutil.isoparse() to convert ISO8601 strings back to datetime
+objects (QueryLogEntry requires datetime, not str).
+Can be run standalone via CLI or imported (use the ``push()`` function).
+Substitution points (search for "← SUBSTITUTE"):
+  - MCD_INGEST_ID      : Monte Carlo Ingestion API key ID
+  - MCD_INGEST_TOKEN   : Monte Carlo Ingestion API key token
+  - MCD_RESOURCE_UUID  : Monte Carlo warehouse resource UUID
+Prerequisites:
+  pip install "pycarlo>=0.12.251" "python-dateutil>=2.8.0"
+"""
+from __future__ import annotations
+import argparse
+import json
+import logging
+import os
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from datetime import datetime, timezone
+from dateutil.parser import isoparse
+from pycarlo.core import Client, Session
+from pycarlo.features.ingestion import IngestionService
+from pycarlo.features.ingestion.models import QueryLogEntry
+# Configure root logging at import time so records carry a timestamp and level.
+logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
+log = logging.getLogger(__name__)
+# Fallback log_type used by push() when the manifest has no "log_type" key.
+LOG_TYPE = "bigquery"
+# Query logs include full SQL text — keep batches small to stay under the
+# 1 MB compressed payload limit.
+_BATCH_SIZE = 100
+# Truncate very long SQL to prevent 413 errors.
+_MAX_QUERY_TEXT_LEN = 10_000
+# Endpoint URL handed to the pycarlo Session created for each pushed batch.
+_ENDPOINT = "https://integrations.getmontecarlo.com"
+def _build_query_log_entries(queries: list[dict]) -> list[QueryLogEntry]:
+    """Convert manifest query dicts into QueryLogEntry objects."""
+    entries = []
+    truncated = 0
+    for q in queries:
+        query_text = q.get("query_text") or ""
+        if len(query_text) > _MAX_QUERY_TEXT_LEN:
+            query_text = query_text[:_MAX_QUERY_TEXT_LEN] + "... [TRUNCATED]"
+            truncated += 1
+        extra = {}
+        if q.get("total_bytes_billed") is not None:
+            extra["total_bytes_billed"] = q["total_bytes_billed"]
+        if q.get("statement_type") is not None:
+            extra["statement_type"] = q["statement_type"]
+        start_time = q.get("start_time")
+        end_time = q.get("end_time")
+        entry = QueryLogEntry(
+            query_id=q.get("query_id"),
+            query_text=query_text,
+            start_time=isoparse(start_time) if start_time else None,
+            end_time=isoparse(end_time) if end_time else None,
+            user=q.get("user"),
+            extra=extra or None,
+        )
+        entries.append(entry)
+    if truncated:
+        log.info("Truncated %d query text(s) exceeding %d chars", truncated, _MAX_QUERY_TEXT_LEN)
+    return entries
+def push(
+    input_file: str,
+    resource_uuid: str,
+    key_id: str,
+    key_token: str,
+    batch_size: int = _BATCH_SIZE,
+    output_file: str = "query_logs_push_result.json",
+) -> dict:
+    """Read a query log manifest and push entries to Monte Carlo in batches."""
+    endpoint = _ENDPOINT
+    log.info("Using endpoint: %s", endpoint)
+    with open(input_file) as fh:
+        manifest = json.load(fh)
+    queries = manifest.get("queries", [])
+    log_type = manifest.get("log_type", LOG_TYPE)
+    entries = _build_query_log_entries(queries)
+    log.info("Loaded %d query log entry/entries from %s", len(entries), input_file)
+    if not entries:
+        log.info("No query log entries to push.")
+        push_result = {
+            "resource_uuid": resource_uuid,
+            "log_type": log_type,
+            "invocation_ids": [],
+            "pushed_at": datetime.now(timezone.utc).isoformat(),
+            "total_entries": 0,
+            "batch_count": 0,
+            "batch_size": batch_size,
+        }
+        with open(output_file, "w") as fh:
+            json.dump(push_result, fh, indent=2)
+        return push_result
+    batches = [entries[i : i + batch_size] for i in range(0, len(entries), batch_size)]
+    total_batches = len(batches)
+    def _push_batch(batch: list[QueryLogEntry], batch_num: int) -> str | None:
+        client = Client(session=Session(
+            mcd_id=key_id, mcd_token=key_token, scope="Ingestion", endpoint=endpoint,
+        ))
+        service = IngestionService(mc_client=client)
+        result = service.send_query_logs(
+            resource_uuid=resource_uuid,
+            log_type=log_type,
+            events=batch,
+        )
+        invocation_id = service.extract_invocation_id(result)
+        log.info(
+            "Pushed batch %d/%d (%d entries) — invocation_id=%s",
+            batch_num, total_batches, len(batch), invocation_id,
+        )
+        return invocation_id
+    max_workers = min(4, total_batches)
+    invocation_ids: list[str | None] = [None] * total_batches
+    with ThreadPoolExecutor(max_workers=max_workers) as pool:
+        futures = {
+            pool.submit(_push_batch, batch, i + 1): i
+            for i, batch in enumerate(batches)
+        }
+        for future in as_completed(futures):
+            idx = futures[future]
+            try:
+                invocation_ids[idx] = future.result()
+            except Exception as exc:
+                log.error("ERROR pushing batch %d: %s", idx + 1, exc)
+                raise
+    log.info("All %d batch(es) pushed.", total_batches)
+    push_result = {
+        "resource_uuid": resource_uuid,
+        "log_type": log_type,
+        "invocation_ids": invocation_ids,
+        "pushed_at": datetime.now(timezone.utc).isoformat(),
+        "total_entries": len(entries),
+        "batch_count": total_batches,
+        "batch_size": batch_size,
+    }
+    with open(output_file, "w") as fh:
+        json.dump(push_result, fh, indent=2)
+    log.info("Push result written to %s", output_file)
+    return push_result
+def main() -> None:
+    parser = argparse.ArgumentParser(
+        description="Push BigQuery query logs from a manifest to Monte Carlo",
+    )
+    parser.add_argument("--resource-uuid", default=os.getenv("MCD_RESOURCE_UUID"))
+    parser.add_argument("--key-id", default=os.getenv("MCD_INGEST_ID"))
+    parser.add_argument("--key-token", default=os.getenv("MCD_INGEST_TOKEN"))
+    parser.add_argument("--input-file", default="query_logs_output.json")
+    parser.add_argument("--output-file", default="query_logs_push_result.json")
+    parser.add_argument(
+        "--batch-size",
+        type=int,
+        default=_BATCH_SIZE,
+        help=f"Max entries per push batch (default: {_BATCH_SIZE})",
+    )
+    args = parser.parse_args()
+    required = ["resource_uuid", "key_id", "key_token"]
+    missing = [k for k in required if getattr(args, k) is None]
+    if missing:
+        parser.error(f"Missing required arguments/env vars: {missing}")
+    push(
+        input_file=args.input_file,
+        resource_uuid=args.resource_uuid,
+        key_id=args.key_id,
+        key_token=args.key_token,
+        batch_size=args.batch_size,
+        output_file=args.output_file,
+    )
+if __name__ == "__main__":
+    main()

package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_and_push_lineage.py ADDED Viewed

@@ -0,0 +1,83 @@
+"""
+Databricks — Lineage Collect & Push (combined)
+================================================
+Collects table-level and (optionally) column-level lineage from Databricks Unity
+Catalog system tables, then pushes them to Monte Carlo via the push ingestion API.
+This script imports and calls collect() from collect_lineage and push() from
+push_lineage, running both in sequence.
+Substitution points (search for "← SUBSTITUTE"):
+  - DATABRICKS_HOST       : workspace hostname
+  - DATABRICKS_HTTP_PATH  : SQL warehouse HTTP path
+  - DATABRICKS_TOKEN      : PAT or service-principal secret
+  - LOOKBACK_DAYS         : how many days back to collect lineage (default 30)
+  - MCD_INGEST_ID / MCD_INGEST_TOKEN : Monte Carlo API credentials
+  - MCD_RESOURCE_UUID      : UUID of the Databricks connection in Monte Carlo
+  - PUSH_BATCH_SIZE       : number of events per API call (default 500)
+Use the --column-lineage flag to also push column-level lineage (disabled by default).
+Prerequisites:
+  pip install databricks-sql-connector pycarlo
+"""
+from __future__ import annotations
+import argparse
+import logging
+import os
+from collect_lineage import LOOKBACK_DAYS, collect
+from push_lineage import DEFAULT_BATCH_SIZE, push
+# Configure root logging; basicConfig() does nothing if the root logger
+# already has a handler (for example one installed by an imported module).
+logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
+log = logging.getLogger(__name__)
+def main() -> None:
+    parser = argparse.ArgumentParser(description="Collect and push Databricks lineage to Monte Carlo")
+    parser.add_argument("--host", default=os.getenv("DATABRICKS_HOST"))           # ← SUBSTITUTE
+    parser.add_argument("--http-path", default=os.getenv("DATABRICKS_HTTP_PATH")) # ← SUBSTITUTE
+    parser.add_argument("--token", default=os.getenv("DATABRICKS_TOKEN"))         # ← SUBSTITUTE
+    parser.add_argument("--resource-uuid", default=os.getenv("MCD_RESOURCE_UUID"))
+    parser.add_argument("--key-id", default=os.getenv("MCD_INGEST_ID"))
+    parser.add_argument("--key-token", default=os.getenv("MCD_INGEST_TOKEN"))
+    parser.add_argument("--lookback-days", type=int, default=LOOKBACK_DAYS)
+    parser.add_argument(
+        "--column-lineage", action="store_true",
+        help="Also collect column-level lineage (requires system.access.column_lineage access)",
+    )
+    parser.add_argument("--batch-size", type=int, default=DEFAULT_BATCH_SIZE)
+    parser.add_argument("--manifest", default="manifest_lineage.json")
+    args = parser.parse_args()
+    required = ["host", "http_path", "token", "resource_uuid", "key_id", "key_token"]
+    missing = [k for k in required if getattr(args, k) is None]
+    if missing:
+        parser.error(f"Missing required arguments/env vars: {missing}")
+    log.info("Step 1: Collecting lineage …")
+    collect(
+        host=args.host,
+        http_path=args.http_path,
+        token=args.token,
+        manifest_path=args.manifest,
+        include_column_lineage=args.column_lineage,
+        lookback_days=args.lookback_days,
+    )
+    log.info("Step 2: Pushing lineage to Monte Carlo …")
+    push(
+        manifest_path=args.manifest,
+        resource_uuid=args.resource_uuid,
+        key_id=args.key_id,
+        key_token=args.key_token,
+        batch_size=args.batch_size,
+    )
+    log.info("Done — collect and push complete.")
+if __name__ == "__main__":
+    main()

package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_and_push_metadata.py ADDED Viewed

@@ -0,0 +1,77 @@
+"""
+Databricks — Metadata Collect & Push (combined)
+=================================================
+Collects table schemas, row counts, and byte sizes from Databricks Unity Catalog,
+then pushes them to Monte Carlo via the push ingestion API.
+This script imports and calls collect() from collect_metadata and push() from
+push_metadata, running both in sequence.
+Substitution points (search for "← SUBSTITUTE"):
+  - DATABRICKS_HOST       : workspace hostname (e.g. adb-1234.azuredatabricks.net)
+  - DATABRICKS_HTTP_PATH  : SQL warehouse HTTP path (e.g. /sql/1.0/warehouses/abc123)
+  - DATABRICKS_TOKEN      : personal access token or service-principal secret
+  - DATABRICKS_CATALOG    : catalog to collect from (default: "hive_metastore"; set e.g. "main" to override)
+  - SCHEMA_EXCLUSIONS     : schemas to skip
+  - MCD_INGEST_ID / MCD_INGEST_TOKEN : Monte Carlo API credentials
+  - MCD_RESOURCE_UUID      : UUID of the Databricks connection in Monte Carlo
+  - PUSH_BATCH_SIZE       : number of assets per API call (default 500)
+Prerequisites:
+  pip install databricks-sql-connector pycarlo
+"""
+from __future__ import annotations
+import argparse
+import logging
+import os
+from collect_metadata import collect
+from push_metadata import DEFAULT_BATCH_SIZE, push
+# Configure root logging; basicConfig() does nothing if the root logger
+# already has a handler (for example one installed by an imported module).
+logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
+log = logging.getLogger(__name__)
+def main() -> None:
+    parser = argparse.ArgumentParser(description="Collect and push Databricks metadata to Monte Carlo")
+    parser.add_argument("--host", default=os.getenv("DATABRICKS_HOST"))           # ← SUBSTITUTE
+    parser.add_argument("--http-path", default=os.getenv("DATABRICKS_HTTP_PATH")) # ← SUBSTITUTE
+    parser.add_argument("--token", default=os.getenv("DATABRICKS_TOKEN"))         # ← SUBSTITUTE
+    parser.add_argument("--catalog", default=os.getenv("DATABRICKS_CATALOG", "hive_metastore"))
+    parser.add_argument("--resource-uuid", default=os.getenv("MCD_RESOURCE_UUID"))
+    parser.add_argument("--key-id", default=os.getenv("MCD_INGEST_ID"))
+    parser.add_argument("--key-token", default=os.getenv("MCD_INGEST_TOKEN"))
+    parser.add_argument("--batch-size", type=int, default=DEFAULT_BATCH_SIZE)
+    parser.add_argument("--manifest", default="manifest_metadata.json")
+    args = parser.parse_args()
+    required = ["host", "http_path", "token", "resource_uuid", "key_id", "key_token"]
+    missing = [k for k in required if getattr(args, k) is None]
+    if missing:
+        parser.error(f"Missing required arguments/env vars: {missing}")
+    log.info("Step 1: Collecting metadata …")
+    collect(
+        host=args.host,
+        http_path=args.http_path,
+        token=args.token,
+        catalog=args.catalog,
+        manifest_path=args.manifest,
+    )
+    log.info("Step 2: Pushing metadata to Monte Carlo …")
+    push(
+        manifest_path=args.manifest,
+        resource_uuid=args.resource_uuid,
+        key_id=args.key_id,
+        key_token=args.key_token,
+        batch_size=args.batch_size,
+    )
+    log.info("Done — collect and push complete.")
+if __name__ == "__main__":
+    main()

package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_and_push_query_logs.py ADDED Viewed

@@ -0,0 +1,83 @@
+"""
+Databricks — Query Log Collect & Push (combined)
+==================================================
+Collects finished query execution records from the Databricks system table
+system.query.history and pushes them to Monte Carlo for query-pattern analysis,
+lineage derivation, and usage attribution.
+This script imports and calls collect() from collect_query_logs and push() from
+push_query_logs, running both in sequence.
+Substitution points (search for "← SUBSTITUTE"):
+  - DATABRICKS_HOST       : workspace hostname
+  - DATABRICKS_HTTP_PATH  : SQL warehouse HTTP path
+  - DATABRICKS_TOKEN      : PAT or service-principal secret
+  - LOOKBACK_HOURS        : hours back from [now - LAG_HOURS] to collect (default 25)
+  - LOOKBACK_LAG_HOURS    : hours to lag behind now to avoid in-flight queries (default 1)
+  - MAX_ROWS              : maximum query rows to collect per run (default 10000)
+  - MCD_INGEST_ID / MCD_INGEST_TOKEN : Monte Carlo API credentials
+  - MCD_RESOURCE_UUID      : UUID of the Databricks connection in Monte Carlo
+  - PUSH_BATCH_SIZE       : number of entries per API call (default 250)
+Prerequisites:
+  pip install databricks-sql-connector pycarlo
+"""
+from __future__ import annotations
+import argparse
+import logging
+import os
+from collect_query_logs import LOOKBACK_HOURS, LOOKBACK_LAG_HOURS, MAX_ROWS, collect
+from push_query_logs import DEFAULT_BATCH_SIZE, push
+# Configure root logging; basicConfig() does nothing if the root logger
+# already has a handler (for example one installed by an imported module).
+logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
+log = logging.getLogger(__name__)
+def main() -> None:
+    parser = argparse.ArgumentParser(description="Collect and push Databricks query logs to Monte Carlo")
+    parser.add_argument("--host", default=os.getenv("DATABRICKS_HOST"))           # ← SUBSTITUTE
+    parser.add_argument("--http-path", default=os.getenv("DATABRICKS_HTTP_PATH")) # ← SUBSTITUTE
+    parser.add_argument("--token", default=os.getenv("DATABRICKS_TOKEN"))         # ← SUBSTITUTE
+    parser.add_argument("--resource-uuid", default=os.getenv("MCD_RESOURCE_UUID"))
+    parser.add_argument("--key-id", default=os.getenv("MCD_INGEST_ID"))
+    parser.add_argument("--key-token", default=os.getenv("MCD_INGEST_TOKEN"))
+    parser.add_argument("--lookback-hours", type=int, default=LOOKBACK_HOURS)
+    parser.add_argument("--lookback-lag-hours", type=int, default=LOOKBACK_LAG_HOURS)
+    parser.add_argument("--max-rows", type=int, default=MAX_ROWS)
+    parser.add_argument("--batch-size", type=int, default=DEFAULT_BATCH_SIZE)
+    parser.add_argument("--manifest", default="manifest_query_logs.json")
+    args = parser.parse_args()
+    required = ["host", "http_path", "token", "resource_uuid", "key_id", "key_token"]
+    missing = [k for k in required if getattr(args, k) is None]
+    if missing:
+        parser.error(f"Missing required arguments/env vars: {missing}")
+    log.info("Step 1: Collecting query logs …")
+    collect(
+        host=args.host,
+        http_path=args.http_path,
+        token=args.token,
+        manifest_path=args.manifest,
+        lookback_hours=args.lookback_hours,
+        lookback_lag_hours=args.lookback_lag_hours,
+        max_rows=args.max_rows,
+    )
+    log.info("Step 2: Pushing query logs to Monte Carlo …")
+    push(
+        manifest_path=args.manifest,
+        resource_uuid=args.resource_uuid,
+        key_id=args.key_id,
+        key_token=args.key_token,
+        batch_size=args.batch_size,
+    )
+    log.info("Done — collect and push complete.")
+if __name__ == "__main__":
+    main()

package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_lineage.py ADDED Viewed

@@ -0,0 +1,240 @@
+"""
+Databricks — Lineage Collection (collect-only)
+================================================
+Collects table-level and (optionally) column-level lineage from Databricks Unity
+Catalog system tables (system.access.table_lineage and system.access.column_lineage).
+No SQL parsing required — Databricks provides first-class lineage metadata.
+Writes a JSON manifest file that can be consumed by push_lineage.py.
+Substitution points (search for "← SUBSTITUTE"):
+  - DATABRICKS_HOST       : workspace hostname
+  - DATABRICKS_HTTP_PATH  : SQL warehouse HTTP path
+  - DATABRICKS_TOKEN      : PAT or service-principal secret
+  - LOOKBACK_DAYS         : how many days back to collect lineage (default 30)
+Use the --column-lineage flag to also collect column-level lineage (disabled by default).
+Prerequisites:
+  pip install databricks-sql-connector
+"""
+from __future__ import annotations
+import argparse
+import json
+import logging
+import os
+from datetime import datetime, timezone
+from typing import Any
+from databricks import sql
+# Configure root logging at import time so records carry a timestamp and level.
+logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
+log = logging.getLogger(__name__)
+# Value written to the manifest's "resource_type" field by collect().
+RESOURCE_TYPE = "databricks"
+# Default lookback window in days, read once at import from the LOOKBACK_DAYS
+# environment variable (falls back to 30).
+LOOKBACK_DAYS: int = int(os.getenv("LOOKBACK_DAYS", "30"))  # ← SUBSTITUTE
+def _check_available_memory(min_gb: float = 2.0) -> None:
+    """Warn if available memory is below the threshold."""
+    try:
+        if hasattr(os, "sysconf"):  # Linux / macOS
+            page_size = os.sysconf("SC_PAGE_SIZE")
+            avail_pages = os.sysconf("SC_AVPHYS_PAGES")
+            avail_gb = (page_size * avail_pages) / (1024 ** 3)
+        else:
+            return  # Windows — skip check
+    except (ValueError, OSError):
+        return
+    if avail_gb < min_gb:
+        log.warning(
+            "Only %.1f GB of memory available (minimum recommended: %.1f GB). "
+            "Consider reducing the collection scope or increasing available memory.",
+            avail_gb,
+            min_gb,
+        )
+def _query(cursor: Any, sql_text: str) -> list[dict[str, Any]]:
+    cursor.execute(sql_text)
+    cols = [d[0] for d in cursor.description]
+    rows = []
+    while True:
+        chunk = cursor.fetchmany(1000)
+        if not chunk:
+            break
+        rows.extend(dict(zip(cols, row)) for row in chunk)
+    return rows
+def _parse_full_name(full_name: str) -> tuple[str, str, str]:
+    """Split 'catalog.schema.table' into (catalog, schema, table)."""
+    parts = (full_name or "").split(".")
+    if len(parts) == 3:
+        return parts[0], parts[1], parts[2]
+    if len(parts) == 2:
+        return "", parts[0], parts[1]
+    return "", "", full_name
+def collect_table_lineage(cursor: Any, lookback_days: int) -> list[dict[str, Any]]:
+    rows = _query(
+        cursor,
+        f"""
+        SELECT DISTINCT
+            source_table_full_name,
+            target_table_full_name,
+            created_by,
+            MAX(event_time) AS last_seen
+        FROM system.access.table_lineage
+        WHERE event_time >= DATEADD(DAY, -{lookback_days}, CURRENT_TIMESTAMP())
+          AND source_table_full_name IS NOT NULL
+          AND target_table_full_name IS NOT NULL
+        GROUP BY source_table_full_name, target_table_full_name, created_by
+        LIMIT 50000
+        """,  # ← SUBSTITUTE: adjust lookback_days, LIMIT, or add catalog/schema filters
+    )
+    events: list[dict[str, Any]] = []
+    for row in rows:
+        src_catalog, src_schema, src_table = _parse_full_name(row["source_table_full_name"])
+        dst_catalog, dst_schema, dst_table = _parse_full_name(row["target_table_full_name"])
+        if not src_table or not dst_table:
+            continue
+        events.append({
+            "sources": [{"database": src_catalog, "schema": src_schema, "asset_name": src_table}],
+            "destination": {"database": dst_catalog, "schema": dst_schema, "asset_name": dst_table},
+            "lineage_type": "table",
+        })
+    return events
+def collect_column_lineage(cursor: Any, lookback_days: int) -> list[dict[str, Any]]:
+    rows = _query(
+        cursor,
+        f"""
+        SELECT DISTINCT
+            source_table_full_name,
+            source_column_name,
+            target_table_full_name,
+            target_column_name
+        FROM system.access.column_lineage
+        WHERE event_time >= DATEADD(DAY, -{lookback_days}, CURRENT_TIMESTAMP())
+          AND source_table_full_name IS NOT NULL
+          AND target_table_full_name IS NOT NULL
+        LIMIT 50000
+        """,  # ← SUBSTITUTE: adjust LIMIT or add catalog/schema filters if needed
+    )
+    # Group by destination table so we can build one event per destination
+    grouped: dict[str, dict[str, Any]] = {}
+    for row in rows:
+        dst_key = row["target_table_full_name"]
+        if dst_key not in grouped:
+            grouped[dst_key] = {"dst_full": dst_key, "columns": []}
+        grouped[dst_key]["columns"].append(row)
+    events: list[dict[str, Any]] = []
+    for dst_key, group in grouped.items():
+        dst_catalog, dst_schema, dst_table = _parse_full_name(group["dst_full"])
+        if not dst_table:
+            continue
+        col_fields: list[dict[str, Any]] = []
+        for row in group["columns"]:
+            src_catalog, src_schema, src_table = _parse_full_name(row["source_table_full_name"])
+            col_fields.append({
+                "destination_field": row["target_column_name"],
+                "sources": [{
+                    "database": src_catalog,
+                    "schema": src_schema,
+                    "asset_name": src_table,
+                    "field": row["source_column_name"],
+                }],
+            })
+        events.append({
+            "sources": [],  # column lineage carries source refs inside col_fields
+            "destination": {"database": dst_catalog, "schema": dst_schema, "asset_name": dst_table},
+            "column_lineage": col_fields,
+            "lineage_type": "column",
+        })
+    return events
+def collect(
+    host: str,
+    http_path: str,
+    token: str,
+    manifest_path: str = "manifest_lineage.json",
+    include_column_lineage: bool = False,
+    lookback_days: int = LOOKBACK_DAYS,
+) -> list[dict[str, Any]]:
+    """Connect to Databricks, collect lineage, write a JSON manifest, and return events."""
+    _check_available_memory(min_gb=2.0)
+    collected_at = datetime.now(timezone.utc).isoformat()
+    with sql.connect(
+        server_hostname=host,    # ← SUBSTITUTE
+        http_path=http_path,     # ← SUBSTITUTE
+        access_token=token,      # ← SUBSTITUTE
+    ) as conn:
+        with conn.cursor() as cursor:
+            table_events = collect_table_lineage(cursor, lookback_days)
+            col_events = collect_column_lineage(cursor, lookback_days) if include_column_lineage else []
+    all_events = table_events + col_events
+    log.info(
+        "Collected %d lineage events (%d table, %d column)",
+        len(all_events), len(table_events), len(col_events),
+    )
+    manifest = {
+        "resource_type": RESOURCE_TYPE,
+        "collected_at": collected_at,
+        "lookback_days": lookback_days,
+        "table_lineage_events": len(table_events),
+        "column_lineage_events": len(col_events),
+        "events": all_events,
+    }
+    with open(manifest_path, "w") as fh:
+        json.dump(manifest, fh, indent=2)
+    log.info("Manifest written to %s (%d events)", manifest_path, len(all_events))
+    return all_events
+def main() -> None:
+    parser = argparse.ArgumentParser(description="Collect Databricks lineage to a manifest file")
+    parser.add_argument("--host", default=os.getenv("DATABRICKS_HOST"))           # ← SUBSTITUTE
+    parser.add_argument("--http-path", default=os.getenv("DATABRICKS_HTTP_PATH")) # ← SUBSTITUTE
+    parser.add_argument("--token", default=os.getenv("DATABRICKS_TOKEN"))         # ← SUBSTITUTE
+    parser.add_argument("--lookback-days", type=int, default=LOOKBACK_DAYS)
+    parser.add_argument(
+        "--column-lineage", action="store_true",
+        help="Also collect column-level lineage (requires system.access.column_lineage access)",
+    )
+    parser.add_argument("--manifest", default="manifest_lineage.json")
+    args = parser.parse_args()
+    required = ["host", "http_path", "token"]
+    missing = [k for k in required if getattr(args, k) is None]
+    if missing:
+        parser.error(f"Missing required arguments/env vars: {missing}")
+    collect(
+        host=args.host,
+        http_path=args.http_path,
+        token=args.token,
+        manifest_path=args.manifest,
+        include_column_lineage=args.column_lineage,
+        lookback_days=args.lookback_days,
+    )
+if __name__ == "__main__":
+    main()