npm - opencode-skills-collection - Versions diffs - 2.0.0 → 2.0.3 - Mend

opencode-skills-collection 2.0.0 → 2.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (90) hide show

package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_metadata.py ADDED Viewed

@@ -0,0 +1,212 @@
+"""
+Databricks — Metadata Collection (collect-only)
+=================================================
+Collects table schemas, row counts, and byte sizes from Databricks Unity Catalog
+using INFORMATION_SCHEMA and DESCRIBE DETAIL, then writes a JSON manifest file
+that can be consumed by push_metadata.py.
+Substitution points (search for "← SUBSTITUTE"):
+  - DATABRICKS_HOST       : workspace hostname (e.g. adb-1234.azuredatabricks.net)
+  - DATABRICKS_HTTP_PATH  : SQL warehouse HTTP path (e.g. /sql/1.0/warehouses/abc123)
+  - DATABRICKS_TOKEN      : personal access token or service-principal secret
+  - DATABRICKS_CATALOG    : catalog to collect from (default: "hive_metastore"; pass e.g. "main" to collect another catalog)
+  - SCHEMA_EXCLUSIONS     : schemas to skip
+Prerequisites:
+  pip install databricks-sql-connector
+"""
+from __future__ import annotations
+import argparse
+import json
+import logging
+import os
+from datetime import datetime, timezone
+from typing import Any
+from databricks import sql
+# Configure logging at import time: INFO level, "<timestamp> <level> <message>" format.
+logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
+# Module-level logger shared by every function in this script.
+log = logging.getLogger(__name__)
+# Written to the manifest's "resource_type" field by collect().
+RESOURCE_TYPE = "databricks"
+# Schemas to skip across all catalogs
+# (collect_tables() renders these names into its "table_schema NOT IN (...)" filter).
+SCHEMA_EXCLUSIONS: set[str] = {  # ← SUBSTITUTE: add any internal schemas to skip
+    "information_schema",
+    "__databricks_internal",
+}
+def _check_available_memory(min_gb: float = 2.0) -> None:
+    """Log a warning when free physical memory is below ``min_gb`` gigabytes.
+
+    The probe uses ``os.sysconf``. When that function does not exist, or the
+    lookup raises ``ValueError``/``OSError``, the check is skipped silently.
+    """
+    if not hasattr(os, "sysconf"):
+        return  # Windows — skip check
+    try:
+        bytes_per_page = os.sysconf("SC_PAGE_SIZE")
+        free_pages = os.sysconf("SC_AVPHYS_PAGES")
+    except (ValueError, OSError):
+        return
+    free_gb = (bytes_per_page * free_pages) / (1024 ** 3)
+    if free_gb < min_gb:
+        log.warning(
+            "Only %.1f GB of memory available (minimum recommended: %.1f GB). "
+            "Consider reducing the collection scope or increasing available memory.",
+            free_gb,
+            min_gb,
+        )
+def _query(cursor: Any, sql_text: str, params: tuple | None = None) -> list[dict[str, Any]]:
+    """Run ``sql_text`` on ``cursor`` and return all rows as dicts.
+
+    Each row is converted to a ``{column_name: value}`` dict, with column names
+    taken from ``cursor.description``. Rows are pulled 1000 at a time through
+    ``fetchmany`` and accumulated into one list.
+    """
+    cursor.execute(sql_text, params)
+    column_names = [descriptor[0] for descriptor in cursor.description]
+    records: list[dict[str, Any]] = []
+    batch = cursor.fetchmany(1000)
+    while batch:
+        for values in batch:
+            records.append(dict(zip(column_names, values)))
+        batch = cursor.fetchmany(1000)
+    return records
+def collect_tables(cursor: Any, catalog: str) -> list[dict[str, Any]]:
+    """List the tables of ``catalog`` from its ``information_schema.tables`` view.
+
+    Rows whose schema is named in ``SCHEMA_EXCLUSIONS`` are filtered out, and the
+    result is ordered by schema and then table name.
+
+    Args:
+        cursor: Open database cursor; passed straight to ``_query``.
+        catalog: Name of the catalog to inspect.
+
+    Returns:
+        One dict per row with the keys ``table_catalog``, ``table_schema``,
+        ``table_name``, ``table_type`` and ``comment``.
+    """
+    # Backtick-quote the catalog (doubling embedded backticks), the same way
+    # collect_detail quotes identifiers, so names such as "my-catalog" parse.
+    quoted_catalog = "`" + catalog.replace("`", "``") + "`"
+    # Sort for a deterministic statement; escape backslash and single quote so
+    # an unusual schema name cannot terminate the string literal early.
+    literals = [
+        "'" + name.replace("\\", "\\\\").replace("'", "\\'") + "'"
+        for name in sorted(SCHEMA_EXCLUSIONS)
+    ]
+    # With nothing to exclude the predicate is dropped: "NOT IN ()" is invalid SQL.
+    where_clause = f"WHERE table_schema NOT IN ({', '.join(literals)})" if literals else ""
+    return _query(
+        cursor,
+        f"""
+        SELECT table_catalog, table_schema, table_name, table_type, comment
+        FROM {quoted_catalog}.information_schema.tables
+        {where_clause}
+        ORDER BY table_schema, table_name
+        """,  # ← SUBSTITUTE: add additional WHERE filters if needed
+    )
+def collect_columns(cursor: Any, catalog: str, schema: str, table: str) -> list[dict[str, Any]]:
+    """Return the columns of one table, ordered by ``ordinal_position``.
+
+    Args:
+        cursor: Open database cursor; passed straight to ``_query``.
+        catalog: Catalog whose ``information_schema.columns`` view is read.
+        schema: Schema name, matched against ``table_schema``.
+        table: Table name, matched against ``table_name``.
+
+    Returns:
+        One dict per column with the keys ``column_name``, ``data_type`` and
+        ``comment``.
+    """
+    def _literal(value: str) -> str:
+        # Escape backslash first, then the quote, so a name containing either
+        # character stays inside the single-quoted literal.
+        return "'" + value.replace("\\", "\\\\").replace("'", "\\'") + "'"
+    # Backtick-quote the catalog (doubling embedded backticks), the same way
+    # collect_detail quotes identifiers.
+    quoted_catalog = "`" + catalog.replace("`", "``") + "`"
+    return _query(
+        cursor,
+        f"""
+        SELECT column_name, data_type, comment
+        FROM {quoted_catalog}.information_schema.columns
+        WHERE table_schema = {_literal(schema)} AND table_name = {_literal(table)}
+        ORDER BY ordinal_position
+        """,
+    )
+def collect_detail(cursor: Any, catalog: str, schema: str, table: str) -> dict[str, Any] | None:
+    """Return the first ``DESCRIBE DETAIL`` row for a table, or ``None``.
+
+    ``None`` is returned when the statement produces no rows and also when it
+    raises; a failure is logged at DEBUG level with the traceback and is not
+    propagated.
+    """
+    try:
+        statement = f"DESCRIBE DETAIL `{catalog}`.`{schema}`.`{table}`"
+        detail_rows = _query(cursor, statement)
+        if detail_rows:
+            return detail_rows[0]
+        return None
+    except Exception:
+        log.debug("DESCRIBE DETAIL failed for %s.%s.%s", catalog, schema, table, exc_info=True)
+        return None
+def collect(
+    host: str,
+    http_path: str,
+    token: str,
+    catalog: str,
+    manifest_path: str = "manifest_metadata.json",
+) -> list[dict[str, Any]]:
+    """Connect to Databricks, collect metadata, write a JSON manifest, and return the asset dicts.
+
+    The manifest contains serialised asset dicts that push_metadata.py can read.
+
+    Args:
+        host: Workspace hostname, passed to ``sql.connect`` as ``server_hostname``.
+        http_path: SQL warehouse HTTP path.
+        token: Access token used to authenticate.
+        catalog: Catalog to collect from; also stored as each asset's ``database``.
+        manifest_path: Where the JSON manifest is written.
+
+    Returns:
+        The list of asset dicts that was written under the manifest's ``assets`` key.
+    """
+    _check_available_memory(min_gb=2.0)
+    collected_at = datetime.now(timezone.utc).isoformat()
+    assets: list[dict[str, Any]] = []
+    with sql.connect(
+        server_hostname=host,    # ← SUBSTITUTE
+        http_path=http_path,     # ← SUBSTITUTE
+        access_token=token,      # ← SUBSTITUTE
+    ) as conn:
+        with conn.cursor() as cursor:
+            tables = collect_tables(cursor, catalog)
+            log.info("Found %d tables in catalog %s", len(tables), catalog)
+            for row in tables:
+                schema = row["table_schema"]
+                table_name = row["table_name"]
+                columns = collect_columns(cursor, catalog, schema, table_name)
+                fields = [
+                    {
+                        "name": col["column_name"],
+                        "type": col["data_type"].upper(),
+                        "description": col.get("comment") or None,
+                    }
+                    for col in columns
+                ]
+                # Best effort: collect_detail returns None when DESCRIBE DETAIL fails,
+                # in which case the three statistics below stay None.
+                detail = collect_detail(cursor, catalog, schema, table_name)
+                row_count: int | None = None
+                byte_count: int | None = None
+                last_updated: str | None = None
+                if detail:
+                    # NOTE(review): confirm DESCRIBE DETAIL actually returns a
+                    # "numRows" column; if it does not, row_count is always None.
+                    row_count = detail.get("numRows")
+                    byte_count = detail.get("sizeInBytes")
+                    last_modified = detail.get("lastModified")
+                    if last_modified:
+                        # Datetime-like values are serialised via isoformat();
+                        # anything else falls back to str() so json.dump accepts it.
+                        last_updated = (
+                            last_modified.isoformat()
+                            if hasattr(last_modified, "isoformat")
+                            else str(last_modified)
+                        )
+                # "or ''" also covers a table_type that is present but NULL, which
+                # a .get() default alone would let through as None and crash .upper().
+                table_type = (row.get("table_type") or "").upper()
+                asset = {
+                    "asset_name": table_name,
+                    "database": catalog,    # ← SUBSTITUTE: use catalog as database
+                    "schema": schema,
+                    "asset_type": "VIEW" if table_type == "VIEW" else "TABLE",
+                    "description": row.get("comment") or None,
+                    "fields": fields,
+                    "row_count": row_count,
+                    "byte_count": byte_count,
+                    "last_updated": last_updated,
+                }
+                assets.append(asset)
+                log.info("Collected %s.%s.%s", catalog, schema, table_name)
+    manifest = {
+        "resource_type": RESOURCE_TYPE,
+        "collected_at": collected_at,
+        "catalog": catalog,
+        "asset_count": len(assets),
+        "assets": assets,
+    }
+    # Explicit encoding keeps the manifest independent of the platform default.
+    with open(manifest_path, "w", encoding="utf-8") as fh:
+        json.dump(manifest, fh, indent=2)
+    log.info("Manifest written to %s (%d assets)", manifest_path, len(assets))
+    return assets
+def main() -> None:
+    """Parse CLI arguments (with environment-variable defaults) and run ``collect``.
+
+    ``--host``, ``--http-path`` and ``--token`` default to DATABRICKS_HOST,
+    DATABRICKS_HTTP_PATH and DATABRICKS_TOKEN; the parser exits with an error
+    when any of them is missing or empty. ``--catalog`` defaults to
+    DATABRICKS_CATALOG or "hive_metastore".
+    """
+    # NOTE(review): collect_tables reads <catalog>.information_schema — confirm that
+    # the "hive_metastore" default exposes that schema in the target workspace.
+    parser = argparse.ArgumentParser(description="Collect Databricks metadata to a manifest file")
+    parser.add_argument("--host", default=os.getenv("DATABRICKS_HOST"))           # ← SUBSTITUTE
+    parser.add_argument("--http-path", default=os.getenv("DATABRICKS_HTTP_PATH")) # ← SUBSTITUTE
+    parser.add_argument("--token", default=os.getenv("DATABRICKS_TOKEN"))         # ← SUBSTITUTE
+    parser.add_argument("--catalog", default=os.getenv("DATABRICKS_CATALOG", "hive_metastore"))
+    parser.add_argument("--manifest", default="manifest_metadata.json")
+    args = parser.parse_args()
+    required = ["host", "http_path", "token"]
+    # Falsiness test rather than "is None": an env var that is set but empty
+    # must count as missing instead of reaching sql.connect as "".
+    missing = [k for k in required if not getattr(args, k)]
+    if missing:
+        parser.error(f"Missing required arguments/env vars: {missing}")
+    collect(
+        host=args.host,
+        http_path=args.http_path,
+        token=args.token,
+        catalog=args.catalog,
+        manifest_path=args.manifest,
+    )
+if __name__ == "__main__":
+    main()

package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_query_logs.py ADDED Viewed

@@ -0,0 +1,204 @@
+"""
+Databricks — Query Log Collection (collect-only)
+==================================================
+Collects finished query execution records from the Databricks system table
+system.query.history and writes a JSON manifest file that can be consumed
+by push_query_logs.py.
+Substitution points (search for "← SUBSTITUTE"):
+  - DATABRICKS_HOST       : workspace hostname
+  - DATABRICKS_HTTP_PATH  : SQL warehouse HTTP path
+  - DATABRICKS_TOKEN      : PAT or service-principal secret
+  - LOOKBACK_HOURS        : hours back from [now - LAG_HOURS] to collect (default 25)
+  - LOOKBACK_LAG_HOURS    : hours to lag behind now to avoid in-flight queries (default 1)
+  - MAX_ROWS              : maximum query rows to collect per run (default 10000)
+Prerequisites:
+  pip install databricks-sql-connector
+"""
+from __future__ import annotations
+import argparse
+import json
+import logging
+import os
+from datetime import datetime, timezone
+from typing import Any
+from databricks import sql
+# Configure logging at import time: INFO level, "<timestamp> <level> <message>" format.
+logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
+# Module-level logger shared by every function in this script.
+log = logging.getLogger(__name__)
+# Written to the manifest's "log_type" field by collect().
+LOG_TYPE = "databricks"
+# Defaults are read from the environment once, at import time; int() raises
+# ValueError if a variable is set to a non-integer string.
+LOOKBACK_HOURS: int = int(os.getenv("LOOKBACK_HOURS", "25"))        # ← SUBSTITUTE
+LOOKBACK_LAG_HOURS: int = int(os.getenv("LOOKBACK_LAG_HOURS", "1")) # ← SUBSTITUTE
+MAX_ROWS: int = int(os.getenv("MAX_ROWS", "10000"))                  # ← SUBSTITUTE
+# Statement template rendered with str.format() in collect_query_logs(); the
+# placeholders are {lookback_hours}, {lag_hours} and {max_rows}. It selects
+# rows whose start_time lies in [NOW - lookback_hours, NOW - lag_hours).
+# NOTE(review): confirm the filter column name against the system.query.history
+# schema — the table may expose it as "execution_status" rather than "status".
+_QUERY_LOG_SQL = """\
+SELECT
+    statement_id       AS query_id,
+    statement_text     AS query_text,
+    start_time,
+    end_time,
+    executed_by        AS user_name,
+    produced_rows      AS returned_rows,
+    total_task_duration_ms,
+    read_rows,
+    read_bytes
+FROM system.query.history
+WHERE start_time >= DATEADD(HOUR, -{lookback_hours}, NOW())
+  AND start_time <  DATEADD(HOUR, -{lag_hours}, NOW())
+  AND status = 'FINISHED'
+ORDER BY start_time
+LIMIT {max_rows}
+"""  # ← SUBSTITUTE: adjust status filter or add warehouse_id filter as needed
+def _check_available_memory(min_gb: float = 2.0) -> None:
+    """Emit a warning if less than ``min_gb`` gigabytes of physical memory are free.
+
+    Free memory is computed from ``os.sysconf`` page size and available page
+    count. Nothing happens when ``os.sysconf`` is missing or when the lookup
+    raises ``ValueError``/``OSError``.
+    """
+    sysconf = getattr(os, "sysconf", None)
+    if sysconf is None:
+        return  # Windows — skip check
+    try:
+        page_bytes = sysconf("SC_PAGE_SIZE")
+        pages_free = sysconf("SC_AVPHYS_PAGES")
+    except (ValueError, OSError):
+        return
+    gb_free = (page_bytes * pages_free) / (1024 ** 3)
+    if gb_free < min_gb:
+        log.warning(
+            "Only %.1f GB of memory available (minimum recommended: %.1f GB). "
+            "Consider reducing the collection scope or increasing available memory.",
+            gb_free,
+            min_gb,
+        )
+def _safe_isoformat(dt: Any) -> str | None:
+    """Convert a timestamp-like value to an ISO-8601 string.
+
+    Args:
+        dt: ``None``, a ``datetime``/``date``/``time``-like object, or any other value.
+
+    Returns:
+        ``None`` for ``None``; ``dt.isoformat()`` for objects that provide it,
+        with naive values that carry a ``tzinfo`` attribute first tagged as UTC;
+        ``str(dt)`` for everything else.
+    """
+    if dt is None:
+        return None
+    if not hasattr(dt, "isoformat"):
+        return str(dt)
+    # datetime.date has isoformat() but no tzinfo attribute, so only touch
+    # tzinfo when the object actually exposes it.
+    if hasattr(dt, "tzinfo") and dt.tzinfo is None:
+        dt = dt.replace(tzinfo=timezone.utc)
+    return dt.isoformat()
+def _query(cursor: Any, sql_text: str) -> list[dict[str, Any]]:
+    """Run ``sql_text`` on ``cursor`` and return all rows as dicts.
+
+    Each row is converted to a ``{column_name: value}`` dict, with column names
+    taken from ``cursor.description``. Rows are pulled 1000 at a time through
+    ``fetchmany`` and accumulated into one list.
+    """
+    cursor.execute(sql_text)
+    column_names = [descriptor[0] for descriptor in cursor.description]
+    records: list[dict[str, Any]] = []
+    batch = cursor.fetchmany(1000)
+    while batch:
+        for values in batch:
+            records.append(dict(zip(column_names, values)))
+        batch = cursor.fetchmany(1000)
+    return records
+def collect_query_logs(
+    cursor: Any,
+    lookback_hours: int,
+    lag_hours: int,
+    max_rows: int,
+) -> list[dict[str, Any]]:
+    """Fetch query-history rows and convert them to manifest entries.
+
+    Renders ``_QUERY_LOG_SQL`` for the window
+    ``[NOW - (lookback_hours + lag_hours), NOW - lag_hours)``, runs it, and
+    drops rows whose query text is empty or whitespace.
+
+    Args:
+        cursor: Open database cursor; passed straight to ``_query``.
+        lookback_hours: Length of the collection window in hours.
+        lag_hours: Hours to stay behind NOW().
+        max_rows: Value rendered into the statement's LIMIT clause.
+
+    Returns:
+        One dict per kept row with the keys ``query_id``, ``query_text``,
+        ``start_time``, ``end_time``, ``user``, ``returned_rows``,
+        ``total_task_duration_ms``, ``read_rows`` and ``read_bytes``.
+
+    Raises:
+        ValueError: If any of the three numeric arguments is negative.
+    """
+    # Coerce to int so that only integer literals are formatted into the SQL text.
+    lookback_hours, lag_hours, max_rows = int(lookback_hours), int(lag_hours), int(max_rows)
+    # The template writes "-{n}": a negative n would render "--n", which starts
+    # a SQL line comment, and a negative LIMIT is invalid as well.
+    if lookback_hours < 0 or lag_hours < 0 or max_rows < 0:
+        raise ValueError("lookback_hours, lag_hours and max_rows must be non-negative")
+    rendered_sql = _QUERY_LOG_SQL.format(
+        lookback_hours=lookback_hours + lag_hours,  # offset from NOW() to cover the window
+        lag_hours=lag_hours,
+        max_rows=max_rows,
+    )
+    rows = _query(cursor, rendered_sql)
+    log.info("Retrieved %d query log rows from system.query.history", len(rows))
+    entries: list[dict[str, Any]] = []
+    for row in rows:
+        query_text: str = row.get("query_text") or ""
+        if not query_text.strip():
+            continue  # ← SUBSTITUTE: decide whether to skip empty-text rows
+        entries.append(
+            {
+                "query_id": row.get("query_id"),
+                "query_text": query_text,
+                "start_time": _safe_isoformat(row.get("start_time")),
+                "end_time": _safe_isoformat(row.get("end_time")),
+                "user": row.get("user_name"),
+                "returned_rows": row.get("returned_rows"),
+                "total_task_duration_ms": row.get("total_task_duration_ms"),
+                "read_rows": row.get("read_rows"),
+                "read_bytes": row.get("read_bytes"),
+            }
+        )
+    return entries
+def collect(
+    host: str,
+    http_path: str,
+    token: str,
+    manifest_path: str = "manifest_query_logs.json",
+    lookback_hours: int = LOOKBACK_HOURS,
+    lookback_lag_hours: int = LOOKBACK_LAG_HOURS,
+    max_rows: int = MAX_ROWS,
+) -> list[dict[str, Any]]:
+    """Collect query logs from Databricks and persist them as a JSON manifest.
+
+    Opens a connection with the given credentials, gathers entries through
+    ``collect_query_logs``, writes them together with the collection settings
+    to ``manifest_path``, and returns the entries.
+    """
+    _check_available_memory(min_gb=2.0)
+    collected_at = datetime.now(timezone.utc).isoformat()
+    with sql.connect(
+        server_hostname=host,    # ← SUBSTITUTE
+        http_path=http_path,     # ← SUBSTITUTE
+        access_token=token,      # ← SUBSTITUTE
+    ) as conn, conn.cursor() as cursor:
+        entries = collect_query_logs(cursor, lookback_hours, lookback_lag_hours, max_rows)
+    entry_count = len(entries)
+    log.info("Collected %d query log entries", entry_count)
+    with open(manifest_path, "w") as fh:
+        json.dump(
+            {
+                "log_type": LOG_TYPE,
+                "collected_at": collected_at,
+                "lookback_hours": lookback_hours,
+                "lookback_lag_hours": lookback_lag_hours,
+                "query_log_count": entry_count,
+                "entries": entries,
+            },
+            fh,
+            indent=2,
+        )
+    log.info("Manifest written to %s (%d entries)", manifest_path, entry_count)
+    return entries
+def main() -> None:
+    """Parse CLI arguments (with environment-variable defaults) and run ``collect``.
+
+    ``--host``, ``--http-path`` and ``--token`` default to DATABRICKS_HOST,
+    DATABRICKS_HTTP_PATH and DATABRICKS_TOKEN; the parser exits with an error
+    when any of them is missing or empty. The window and row-limit options
+    default to the module-level LOOKBACK_HOURS, LOOKBACK_LAG_HOURS and MAX_ROWS.
+    """
+    parser = argparse.ArgumentParser(description="Collect Databricks query logs to a manifest file")
+    parser.add_argument("--host", default=os.getenv("DATABRICKS_HOST"))           # ← SUBSTITUTE
+    parser.add_argument("--http-path", default=os.getenv("DATABRICKS_HTTP_PATH")) # ← SUBSTITUTE
+    parser.add_argument("--token", default=os.getenv("DATABRICKS_TOKEN"))         # ← SUBSTITUTE
+    parser.add_argument("--lookback-hours", type=int, default=LOOKBACK_HOURS)
+    parser.add_argument("--lookback-lag-hours", type=int, default=LOOKBACK_LAG_HOURS)
+    parser.add_argument("--max-rows", type=int, default=MAX_ROWS)
+    parser.add_argument("--manifest", default="manifest_query_logs.json")
+    args = parser.parse_args()
+    required = ["host", "http_path", "token"]
+    # Falsiness test rather than "is None": an env var that is set but empty
+    # must count as missing instead of reaching sql.connect as "".
+    missing = [k for k in required if not getattr(args, k)]
+    if missing:
+        parser.error(f"Missing required arguments/env vars: {missing}")
+    collect(
+        host=args.host,
+        http_path=args.http_path,
+        token=args.token,
+        manifest_path=args.manifest,
+        lookback_hours=args.lookback_hours,
+        lookback_lag_hours=args.lookback_lag_hours,
+        max_rows=args.max_rows,
+    )
+if __name__ == "__main__":
+    main()

package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/push_lineage.py ADDED Viewed

@@ -0,0 +1,192 @@
+"""
+Databricks — Lineage Push (push-only)
+=======================================
+Reads a JSON manifest file produced by collect_lineage.py and pushes the lineage
+events to Monte Carlo via the push ingestion API, with configurable batching to
+keep compressed payloads under 1 MB.
+Substitution points (search for "← SUBSTITUTE"):
+  - MCD_INGEST_ID / MCD_INGEST_TOKEN : Monte Carlo API credentials
+  - MCD_RESOURCE_UUID      : UUID of the Databricks connection in Monte Carlo
+  - PUSH_BATCH_SIZE       : number of events per API call (default 500)
+Prerequisites:
+  pip install pycarlo
+"""
+from __future__ import annotations
+import argparse
+import json
+import logging
+import os
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from datetime import datetime, timezone
+from typing import Any
+from pycarlo.core import Client, Session
+from pycarlo.features.ingestion import IngestionService
+from pycarlo.features.ingestion.models import (
+    ColumnLineageField,
+    ColumnLineageSourceField,
+    LineageAssetRef,
+    LineageEvent,
+)
+# Configure logging at import time: INFO level, "<timestamp> <level> <message>" format.
+logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
+# Module-level logger shared by every function in this script.
+log = logging.getLogger(__name__)
+# Sent as resource_type on every send_lineage call and echoed in the push summary.
+RESOURCE_TYPE = "databricks"
+# Default number of lineage events per send_lineage call (overridable via --batch-size).
+DEFAULT_BATCH_SIZE = 500  # ← SUBSTITUTE: conservative default to stay under 1 MB compressed
+def _ref_from_dict(d: dict[str, Any]) -> LineageAssetRef:
+    """Build a ``LineageAssetRef`` of type "TABLE" from a manifest asset dict.
+
+    ``asset_name`` is required; ``database`` and ``schema`` default to the empty
+    string. The asset id is ``<database>__<schema>__<asset_name>``.
+    """
+    asset_name = d["asset_name"]
+    db_name = d.get("database", "")
+    schema_name = d.get("schema", "")
+    asset_id = f"{db_name}__{schema_name}__{asset_name}"
+    return LineageAssetRef(
+        type="TABLE",
+        name=asset_name,
+        database=db_name,
+        schema=schema_name,
+        asset_id=asset_id,
+    )
+def _event_from_dict(d: dict[str, Any]) -> LineageEvent:
+    """Rebuild a ``LineageEvent`` from one manifest event dict.
+
+    ``destination`` is required and ``sources`` is optional. When
+    ``column_lineage`` is present and non-empty, each of its entries becomes a
+    ``ColumnLineageField``; otherwise the event's ``fields`` is ``None``.
+    """
+    source_refs = [_ref_from_dict(src) for src in d.get("sources", [])]
+    destination_ref = _ref_from_dict(d["destination"])
+    column_fields: list[ColumnLineageField] | None = None
+    column_lineage = d.get("column_lineage")
+    if column_lineage:
+        column_fields = []
+        for mapping in column_lineage:
+            # Source columns are keyed by the same "<database>__<schema>__<name>" id
+            # that _ref_from_dict assigns to assets.
+            upstream = [
+                ColumnLineageSourceField(
+                    asset_id=f"{src.get('database', '')}__{src.get('schema', '')}__{src['asset_name']}",
+                    field_name=src["field"],
+                )
+                for src in mapping.get("sources", [])
+            ]
+            column_fields.append(
+                ColumnLineageField(name=mapping["destination_field"], source_fields=upstream)
+            )
+    return LineageEvent(sources=source_refs, destination=destination_ref, fields=column_fields)
+def push(
+    manifest_path: str,
+    resource_uuid: str,
+    key_id: str,
+    key_token: str,
+    batch_size: int = DEFAULT_BATCH_SIZE,
+) -> dict[str, Any]:
+    """Read a collect manifest and push lineage events to Monte Carlo in batches.
+
+    Args:
+        manifest_path: Path of the JSON manifest; its ``events`` list is pushed.
+        resource_uuid: Passed as ``resource_uuid`` to every ``send_lineage`` call.
+        key_id: Passed to ``Session`` as ``mcd_id``.
+        key_token: Passed to ``Session`` as ``mcd_token``.
+        batch_size: Maximum number of events per ``send_lineage`` call.
+
+    Returns:
+        A summary dict with invocation IDs and counts. The same dict is written
+        next to the manifest as ``<manifest stem>_push_result.json``.
+
+    Raises:
+        ValueError: If ``batch_size`` is smaller than 1.
+        Exception: Any error raised while pushing a batch is logged and re-raised.
+    """
+    if batch_size < 1:
+        raise ValueError("batch_size must be a positive integer")
+    with open(manifest_path, encoding="utf-8") as fh:
+        manifest = json.load(fh)
+    event_dicts: list[dict[str, Any]] = manifest["events"]
+    events = [_event_from_dict(d) for d in event_dicts]
+    log.info("Loaded %d lineage events from %s", len(events), manifest_path)
+    # Split into batches
+    batches = [events[i : i + batch_size] for i in range(0, len(events), batch_size)]
+    total_batches = len(batches)
+    def _push_batch(batch: list, batch_num: int) -> str | None:
+        """Push a single batch using a dedicated Session (thread-safe)."""
+        log.info("Pushing batch %d/%d (%d events) ...", batch_num, total_batches, len(batch))
+        client = Client(session=Session(mcd_id=key_id, mcd_token=key_token, scope="Ingestion"))
+        service = IngestionService(mc_client=client)
+        result = service.send_lineage(
+            resource_uuid=resource_uuid,
+            resource_type=RESOURCE_TYPE,
+            events=batch,
+        )
+        invocation_id = service.extract_invocation_id(result)
+        if invocation_id:
+            log.info("Batch %d: invocation_id=%s", batch_num, invocation_id)
+        return invocation_id
+    # Push batches in parallel (each thread gets its own pycarlo Session).
+    # ThreadPoolExecutor rejects max_workers=0, so keep one worker even when
+    # the manifest holds no events and there is nothing to submit.
+    max_workers = max(1, min(4, total_batches))
+    # Indexed by batch position so the IDs stay in batch order despite
+    # out-of-order completion.
+    invocation_ids: list[str | None] = [None] * total_batches
+    with ThreadPoolExecutor(max_workers=max_workers) as pool:
+        futures = {
+            pool.submit(_push_batch, batch, i + 1): i
+            for i, batch in enumerate(batches)
+        }
+        for future in as_completed(futures):
+            idx = futures[future]
+            try:
+                invocation_ids[idx] = future.result()
+            except Exception as exc:
+                log.error("ERROR pushing batch %d: %s", idx + 1, exc)
+                raise
+    log.info("All %d batches pushed (%d workers)", total_batches, max_workers)
+    pushed_at = datetime.now(timezone.utc).isoformat()
+    summary = {
+        "resource_uuid": resource_uuid,
+        "resource_type": RESOURCE_TYPE,
+        "invocation_ids": invocation_ids,
+        "pushed_at": pushed_at,
+        "event_count": len(events),
+        "batch_count": total_batches,
+        "batch_size": batch_size,
+        "lookback_days": manifest.get("lookback_days"),
+        "table_lineage_events": manifest.get("table_lineage_events"),
+        "column_lineage_events": manifest.get("column_lineage_events"),
+    }
+    # Swap only a trailing ".json" extension. A plain str.replace would rewrite
+    # every ".json" in the path, and for a manifest without that extension it
+    # would return the path unchanged and overwrite the input manifest.
+    stem, ext = os.path.splitext(manifest_path)
+    if ext == ".json":
+        push_manifest_path = stem + "_push_result.json"
+    else:
+        push_manifest_path = manifest_path + "_push_result.json"
+    with open(push_manifest_path, "w", encoding="utf-8") as fh:
+        json.dump(summary, fh, indent=2)
+    log.info("Push result written to %s", push_manifest_path)
+    return summary
+def main() -> None:
+    """Parse CLI arguments (with environment-variable defaults) and run ``push``.
+
+    ``--resource-uuid``, ``--key-id`` and ``--key-token`` default to
+    MCD_RESOURCE_UUID, MCD_INGEST_ID and MCD_INGEST_TOKEN; the parser exits with
+    an error when any of them is missing or empty.
+    """
+    parser = argparse.ArgumentParser(description="Push Databricks lineage to Monte Carlo from manifest")
+    parser.add_argument("--manifest", default="manifest_lineage.json")
+    parser.add_argument("--resource-uuid", default=os.getenv("MCD_RESOURCE_UUID"))
+    parser.add_argument("--key-id", default=os.getenv("MCD_INGEST_ID"))
+    parser.add_argument("--key-token", default=os.getenv("MCD_INGEST_TOKEN"))
+    parser.add_argument("--batch-size", type=int, default=DEFAULT_BATCH_SIZE)
+    args = parser.parse_args()
+    required = ["resource_uuid", "key_id", "key_token"]
+    # Falsiness test rather than "is None": an env var that is set but empty
+    # must count as missing instead of being sent as an empty credential.
+    missing = [k for k in required if not getattr(args, k)]
+    if missing:
+        parser.error(f"Missing required arguments/env vars: {missing}")
+    push(
+        manifest_path=args.manifest,
+        resource_uuid=args.resource_uuid,
+        key_id=args.key_id,
+        key_token=args.key_token,
+        batch_size=args.batch_size,
+    )
+if __name__ == "__main__":
+    main()