npm - opencode-skills-collection - Versions diffs - 2.0.0-beta.3 → 2.0.2 - Mend

opencode-skills-collection 2.0.0-beta.3 → 2.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (91) hide show

package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_query_logs.py ADDED Viewed

@@ -0,0 +1,284 @@
+#!/usr/bin/env python3
+"""
+Collect Hive query logs from a local HiveServer2 log file — collection only.
+Parses a plain-text HiveServer2 log for "Executing/Starting command" entries
+to extract query text, query ID, start time and end time.  Optionally reads
+per-query operation logs to populate ``returned_rows`` from SelectOperator
+``RECORDS_OUT`` counters.  Deduplicates entries by query ID.
+Can be run standalone via CLI or imported (use the ``collect()`` function).
+Substitution points
+-------------------
+- --log-file       path to local HiveServer2 log (default: /tmp/root/hive.log)
+- --op-logs-dir    optional directory of per-query <queryId>.log files
+Prerequisites
+-------------
+    pip install python-dateutil python-dotenv
+Usage
+-----
+    python collect_query_logs.py \\
+        --log-file /tmp/root/hive.log \\
+        [--op-logs-dir /var/log/hive/operation_logs] \\
+        --output-file query_logs_output.json
+"""
+from __future__ import annotations
+import argparse
+import json
+import re
+from datetime import datetime, timezone
+from io import StringIO
+from pathlib import Path
+from dateutil.parser import isoparse
+# Value written to the "log_type" field of every manifest built by collect().
+# NOTE: the normalizer requires "hive-s3" — do not change to "hive" or "data-lake"
+LOG_TYPE = "hive-s3"
+# Matches the start of a new query block in the Hive log: the word "Executing"
+# or "Starting", then ``command(queryId=<id>):`` and the rest of the line.
+# Named groups: ``query_id`` (may be empty, because ``\S*`` allows zero
+# characters) and ``command`` (everything after the colon and whitespace).
+_COMMAND_START_RE = re.compile(
+    r"(Executing|Starting)\s+command\(queryId=(?P<query_id>\S*)\):\s+(?P<command>.*)$"
+)
+# Extracts returned row counts from per-query Hive operation logs; group 1 is
+# the run of digits that follows ``RECORDS_OUT_OPERATOR_SEL_<n>:``.
+_RECORDS_OUT_RE = re.compile(r"RECORDS_OUT_OPERATOR_SEL_\d+:(\d+)")
+def _parse_log_entries(log_text: str) -> list[dict]:
+    """
+    Turn the text of a HiveServer2 log into a list of raw query records.
+    Every record is a dict with the keys ``query_id`` (str), ``start_time``
+    (datetime), ``end_time`` (datetime) and ``query`` (str).
+    A line counts as "timestamped" when its first whitespace-separated token
+    parses with ``isoparse``; timestamps without a timezone are treated as UTC.
+    A timestamped line matching ``_COMMAND_START_RE`` opens a record, lines
+    that are not timestamped are appended to the open query text, and the next
+    timestamped line (command or not) closes the open record and supplies its
+    ``end_time``.  A record still open at end of input is closed with the last
+    timestamp seen.
+    """
+    records: list[dict] = []
+    open_sql = ""
+    open_id = ""
+    opened_at: datetime | None = None
+    newest_stamp: datetime | None = None
+    def _close_open_record(closed_at: datetime) -> None:
+        # Reads the enclosing "open_*" state at call time.
+        records.append(
+            {
+                "query_id": open_id,
+                "start_time": opened_at,
+                "end_time": closed_at,
+                "query": open_sql,
+            }
+        )
+    for raw_line in StringIO(log_text):
+        tokens = raw_line.split()
+        if not tokens:
+            continue
+        try:
+            stamp = isoparse(tokens[0])
+        except ValueError:
+            # No leading timestamp: treat as the next line of a multi-line query.
+            if open_sql:
+                open_sql += "\n" + raw_line.rstrip()
+            continue
+        if not stamp.tzinfo:
+            stamp = stamp.replace(tzinfo=timezone.utc)
+        has_open_record = bool(open_sql and opened_at)
+        new_command = _COMMAND_START_RE.search(raw_line)
+        if new_command:
+            # The previous record (if any) ends where the new command begins.
+            if has_open_record:
+                _close_open_record(stamp)
+            open_id = new_command.group("query_id")
+            opened_at = stamp
+            open_sql = new_command.group("command").strip()
+        elif has_open_record:
+            # Any other timestamped line ends the record that is in progress.
+            _close_open_record(stamp)
+            open_sql = ""
+            open_id = ""
+            opened_at = None
+        newest_stamp = stamp
+    # A record left open at end of input ends at the last timestamp seen.
+    if open_sql and opened_at:
+        _close_open_record(newest_stamp or opened_at)
+    return records
+def _load_returned_rows(op_logs_dir: str) -> dict[str, int]:
+    """
+    Build a ``query_id -> row count`` mapping from per-query operation logs.
+    Every ``*.log`` file directly inside ``op_logs_dir`` is read; the file name
+    without its extension is used as the query ID.  The count is the integer
+    captured by the last ``_RECORDS_OUT_RE`` match in the file.  Files that
+    cannot be read, or that contain no match, are left out of the result.
+    """
+    counts_by_query: dict[str, int] = {}
+    for op_log in Path(op_logs_dir).glob("*.log"):
+        try:
+            contents = op_log.read_text(errors="replace")
+        except OSError:
+            # Best effort: an unreadable log just means no row count for that query.
+            continue
+        counters = [int(digits) for digits in _RECORDS_OUT_RE.findall(contents)]
+        if counters:
+            counts_by_query[op_log.stem] = counters[-1]
+    return counts_by_query
+def _build_query_log_entries(
+    raw_entries: list[dict],
+    rows_by_id: dict[str, int] | None = None,
+) -> list[dict]:
+    """
+    Convert raw parsed records into JSON-serialisable query log dicts.
+    Only the first record for each non-empty ``query_id`` is kept; records with
+    an empty ID are never treated as duplicates and are emitted with
+    ``query_id`` set to ``None``.  Timestamps are rendered with ``isoformat()``
+    and ``returned_rows`` is looked up in ``rows_by_id`` when both the mapping
+    and the ID are non-empty, otherwise it is ``None``.
+    """
+    emitted_ids: set[str] = set()
+    result = []
+    for raw in raw_entries:
+        query_id = raw["query_id"]
+        if query_id:
+            if query_id in emitted_ids:
+                continue
+            emitted_ids.add(query_id)
+        if rows_by_id and query_id:
+            row_count: int | None = rows_by_id.get(query_id)
+        else:
+            row_count = None
+        result.append(
+            {
+                "query_id": query_id or None,
+                "start_time": raw["start_time"].isoformat(),
+                "end_time": raw["end_time"].isoformat(),
+                "query_text": raw["query"],
+                "user": "hadoop",  # ← SUBSTITUTE: set the user appropriate for your cluster
+                "returned_rows": row_count,
+            }
+        )
+    return result
+def collect(
+    log_file: str,
+    op_logs_dir: str | None = None,
+) -> dict:
+    """
+    Read a HiveServer2 log file and build the query-log manifest.
+    Args:
+        log_file: Path to a local HiveServer2 log file.
+        op_logs_dir: Optional directory of per-query operation logs
+                     (<queryId>.log).  When given, ``returned_rows`` is filled
+                     in from the counts found by ``_load_returned_rows``.
+    Returns:
+        Manifest dict with keys: log_type, collected_at, entry_count,
+        window_start, window_end, queries.  ``entry_count`` counts the
+        de-duplicated queries, while the window is computed over every parsed
+        record.  When nothing was parsed the window values are ``None`` and
+        ``queries`` is empty.
+    """
+    print(f"Reading Hive log file: {log_file} ...")
+    with open(log_file, errors="replace") as handle:
+        contents = handle.read()
+    parsed = _parse_log_entries(contents)
+    print(f"  Parsed {len(parsed)} query log entry/entries.")
+    if not parsed:
+        print("No query log entries found.")
+        return {
+            "log_type": LOG_TYPE,
+            "collected_at": datetime.now(tz=timezone.utc).isoformat(),
+            "entry_count": 0,
+            "window_start": None,
+            "window_end": None,
+            "queries": [],
+        }
+    row_counts: dict[str, int] | None = None
+    if op_logs_dir:
+        row_counts = _load_returned_rows(op_logs_dir)
+        print(f"  Loaded row counts for {len(row_counts)} query/queries from {op_logs_dir}")
+    deduped = _build_query_log_entries(parsed, row_counts)
+    # `parsed` is non-empty here, so min()/max() always have something to work on.
+    earliest_start = min(record["start_time"] for record in parsed)
+    latest_end = max(record["end_time"] for record in parsed)
+    # The manifest exposes the SQL under "query" rather than "query_text".
+    manifest_queries = [
+        {
+            "query_id": item["query_id"],
+            "start_time": item["start_time"],
+            "end_time": item["end_time"],
+            "query": item["query_text"],
+            "user": item["user"],
+            "returned_rows": item["returned_rows"],
+        }
+        for item in deduped
+    ]
+    return {
+        "log_type": LOG_TYPE,
+        "collected_at": datetime.now(tz=timezone.utc).isoformat(),
+        "entry_count": len(deduped),
+        "window_start": earliest_start.isoformat(),
+        "window_end": latest_end.isoformat(),
+        "queries": manifest_queries,
+    }
+def main() -> None:
+    """Command-line entry point: parse arguments, collect, write the JSON manifest."""
+    cli = argparse.ArgumentParser(
+        description="Collect Hive query logs from a local log file and write a JSON manifest",
+    )
+    cli.add_argument(
+        "--log-file",
+        default="/tmp/root/hive.log",
+        help="Path to local HiveServer2 log file (default: /tmp/root/hive.log)",  # ← SUBSTITUTE: your log path
+    )
+    # ← SUBSTITUTE: e.g. /var/log/hive/operation_logs or wherever Hive writes op logs
+    op_logs_help = (
+        "Directory containing per-query Hive operation logs (<queryId>.log). "
+        "When provided, returned_rows is populated from SelectOperator RECORDS_OUT counts."
+    )
+    cli.add_argument("--op-logs-dir", default=None, help=op_logs_help)
+    cli.add_argument(
+        "--output-file",
+        default="query_logs_output.json",
+        help="Path to write the output manifest (default: query_logs_output.json)",
+    )
+    options = cli.parse_args()
+    result = collect(log_file=options.log_file, op_logs_dir=options.op_logs_dir)
+    with open(options.output_file, "w") as out:
+        json.dump(result, out, indent=2)
+    print(f"Query log manifest written to {options.output_file}")
+    print("Done.")
+if __name__ == "__main__":
+    main()

package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/push_lineage.py ADDED Viewed

@@ -0,0 +1,309 @@
+#!/usr/bin/env python3
+"""
+Push a collected Hive lineage manifest to Monte Carlo — push only.
+Reads a JSON manifest produced by ``collect_lineage.py``, builds LineageEvent
+objects (table-level or column-level), and calls ``send_lineage`` in batches.
+The manifest is updated in-place with ``resource_uuid`` and ``invocation_id``
+after a successful push.
+Can be run standalone via CLI or imported (use the ``push()`` function).
+Substitution points
+-------------------
+- MCD_INGEST_ID    (env) / --key-id        (CLI) : Monte Carlo ingestion key ID
+- MCD_INGEST_TOKEN (env) / --key-token      (CLI) : Monte Carlo ingestion key token
+- MCD_RESOURCE_UUID    (env) / --resource-uuid  (CLI) : MC resource UUID for this connection
+Prerequisites
+-------------
+    pip install pycarlo python-dotenv
+Usage (table-level):
+    python push_lineage.py \\
+        --key-id  <MCD_INGEST_ID> \\
+        --key-token <MCD_INGEST_TOKEN> \\
+        --resource-uuid <MCD_RESOURCE_UUID> \\
+        --input-file lineage_output.json
+Usage (column-level):
+    python push_lineage.py ... --column-lineage
+"""
+import argparse
+import json
+import os
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from pycarlo.core import Client, Session
+from pycarlo.features.ingestion import IngestionService
+from pycarlo.features.ingestion.models import (
+    ColumnLineageField,
+    ColumnLineageSourceField,
+    LineageAssetRef,
+    LineageEvent,
+)
+# ← SUBSTITUTE: set RESOURCE_TYPE to match your Monte Carlo connection type
+# Used by push() only as a fallback when the manifest has no "resource_type" key.
+RESOURCE_TYPE = "data-lake"
+# ← SUBSTITUTE: default batch size for lineage push (events per request)
+# Default for both push(batch_size=...) and the --batch-size CLI option.
+DEFAULT_BATCH_SIZE = 500
+# ← SUBSTITUTE: HTTP timeout for MC ingestion requests (seconds)
+# Default for push(timeout_seconds=...) and the --timeout CLI option.
+# NOTE(review): push() accepts this value but does not pass it on to any
+# pycarlo call in this file — confirm how the timeout is meant to be applied.
+DEFAULT_TIMEOUT_SECONDS = 120
+def _build_table_lineage(edges_data: list[dict]) -> list[LineageEvent]:
+    """
+    Create one table-level LineageEvent per edge that has at least one source.
+    Each edge dict is read for ``destination`` and ``sources``; every table
+    reference is read for ``table`` and ``database``.  The database name is
+    passed as both ``database`` and ``schema`` of the asset reference.  Edges
+    whose ``sources`` list is missing or empty are skipped.
+    """
+    def _table_ref(table_info: dict) -> LineageAssetRef:
+        # The manifest carries no separate schema, so the database name fills both.
+        return LineageAssetRef(
+            type="TABLE",
+            name=table_info["table"],
+            database=table_info["database"],
+            schema=table_info["database"],
+        )
+    lineage = []
+    for edge in edges_data:
+        upstream = edge.get("sources", [])
+        if upstream:
+            lineage.append(
+                LineageEvent(
+                    destination=_table_ref(edge["destination"]),
+                    sources=[_table_ref(source) for source in upstream],
+                )
+            )
+    return lineage
+def _build_column_lineage(edges_data: list[dict]) -> list[LineageEvent]:
+    """
+    Create one column-level LineageEvent per edge that has at least one source.
+    Asset IDs have the form ``<database>__<table>``.  Each entry of the edge's
+    optional ``col_mappings`` list (keys ``dest_col``, ``src_table``,
+    ``src_col``) adds a source field to the destination column it names.  A
+    mapping's source database is that of the first source whose table name
+    matches ``src_table``, falling back to the destination database.  ``fields``
+    is ``None`` when the edge has no column mappings.
+    """
+    def _table_ref(table_info: dict, asset_id: str) -> LineageAssetRef:
+        # The manifest carries no separate schema, so the database name fills both.
+        return LineageAssetRef(
+            type="TABLE",
+            name=table_info["table"],
+            database=table_info["database"],
+            schema=table_info["database"],
+            asset_id=asset_id,
+        )
+    lineage = []
+    for edge in edges_data:
+        upstream = edge.get("sources", [])
+        if not upstream:
+            continue
+        target = edge["destination"]
+        target_asset_id = f"{target['database']}__{target['table']}"
+        upstream_asset_ids = {
+            (source["database"], source["table"]): f"{source['database']}__{source['table']}"
+            for source in upstream
+        }
+        fields_by_column: dict[str, ColumnLineageField] = {}
+        for mapping in edge.get("col_mappings", []):
+            dest_col = mapping["dest_col"]
+            src_table = mapping["src_table"]
+            src_col = mapping["src_col"]
+            # Mappings name only the source table, so the database is resolved
+            # by table name against this edge's sources.
+            matching_dbs = (
+                source["database"] for source in upstream if source["table"] == src_table
+            )
+            src_db = next(matching_dbs, target["database"])
+            src_asset_id = upstream_asset_ids.get(
+                (src_db, src_table), f"{src_db}__{src_table}"
+            )
+            if dest_col in fields_by_column:
+                column_field = fields_by_column[dest_col]
+            else:
+                column_field = ColumnLineageField(name=dest_col, source_fields=[])
+                fields_by_column[dest_col] = column_field
+            column_field.source_fields.append(
+                ColumnLineageSourceField(asset_id=src_asset_id, field_name=src_col)
+            )
+        lineage.append(
+            LineageEvent(
+                destination=_table_ref(target, target_asset_id),
+                sources=[
+                    _table_ref(
+                        source,
+                        upstream_asset_ids[(source["database"], source["table"])],
+                    )
+                    for source in upstream
+                ],
+                fields=list(fields_by_column.values()) or None,
+            )
+        )
+    return lineage
+def push(
+    manifest: dict,
+    resource_uuid: str,
+    key_id: str,
+    key_token: str,
+    column_lineage: bool = False,
+    batch_size: int = DEFAULT_BATCH_SIZE,
+    timeout_seconds: int = DEFAULT_TIMEOUT_SECONDS,
+) -> str | None:
+    """
+    Push collected lineage to Monte Carlo and update the manifest in-place.
+    Events are built from ``manifest["edges"]``, split into batches of
+    ``batch_size`` and sent with up to four worker threads.  Afterwards the
+    manifest holds ``resource_uuid``, ``invocation_id`` (the last non-empty ID
+    in batch order) and, when more than one batch returned an ID,
+    ``invocation_ids`` (one slot per batch, ``None`` where no ID came back).
+    A stale ``invocation_ids`` key from an earlier run is removed otherwise.
+    Args:
+        manifest: Dict loaded from a ``collect_lineage.py`` output file.
+        resource_uuid: MC resource UUID for this Hive connection.
+        key_id: MC ingestion key ID.
+        key_token: MC ingestion key token.
+        column_lineage: When True, push column-level lineage; otherwise table-level.
+        batch_size: Events per request (default 500); must be at least 1.
+        timeout_seconds: Intended HTTP timeout per request (default 120).
+            NOTE(review): this value is currently not passed to any pycarlo
+            call below — confirm which pycarlo setting it should map to.
+    Returns:
+        The last non-empty invocation ID in batch order, or None when nothing
+        was pushed or no batch returned an ID.
+    Raises:
+        ValueError: If there are events to push and ``batch_size`` is below 1.
+        Exception: Whatever a failed batch push raised, after it is reported.
+    """
+    resource_type = manifest.get("resource_type", RESOURCE_TYPE)
+    edges_data = manifest.get("edges", [])
+    if column_lineage:
+        events = _build_column_lineage(edges_data)
+        label = "column-level"
+    else:
+        events = _build_table_lineage(edges_data)
+        label = "table-level"
+    print(f"Loaded {len(events)} {label} lineage event(s) from manifest")
+    if not events:
+        print("No lineage events to push.")
+        manifest["resource_uuid"] = resource_uuid
+        manifest["invocation_id"] = None
+        # Do not leave IDs from a previous push next to a null invocation_id.
+        manifest.pop("invocation_ids", None)
+        return None
+    # A step of zero breaks range() and a negative step yields no batches
+    # (and then zero workers), so fail here with a message naming the cause.
+    if batch_size < 1:
+        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")
+    batch_list = [
+        events[start : start + batch_size]
+        for start in range(0, len(events), batch_size)
+    ]
+    total_batches = len(batch_list)
+    def _push_batch(batch: list, batch_num: int) -> str | None:
+        """Push one batch through a Client/Session created just for this call."""
+        print(f"  Pushing batch {batch_num}/{total_batches} ({len(batch)} events) ...")
+        client = Client(session=Session(mcd_id=key_id, mcd_token=key_token, scope="Ingestion"))
+        service = IngestionService(mc_client=client)
+        result = service.send_lineage(
+            resource_uuid=resource_uuid,
+            resource_type=resource_type,
+            events=batch,
+        )
+        invocation_id = service.extract_invocation_id(result)
+        if invocation_id:
+            print(f"    Batch {batch_num}: invocation_id={invocation_id}")
+        return invocation_id
+    # Push batches in parallel; results are stored by batch index so the
+    # manifest order does not depend on which request finishes first.
+    max_workers = min(4, total_batches)
+    invocation_ids: list[str | None] = [None] * total_batches
+    with ThreadPoolExecutor(max_workers=max_workers) as pool:
+        futures = {
+            pool.submit(_push_batch, batch, i + 1): i
+            for i, batch in enumerate(batch_list)
+        }
+        for future in as_completed(futures):
+            idx = futures[future]
+            try:
+                invocation_ids[idx] = future.result()
+            except Exception as exc:
+                # Report which batch failed, then let the caller see the error.
+                print(f"    ERROR pushing batch {idx + 1}: {exc}")
+                raise
+    print(f"  All {total_batches} batches pushed ({max_workers} workers)")
+    returned_ids = [inv for inv in invocation_ids if inv]
+    # Use the last ID that actually came back, so a single ID from an earlier
+    # batch is not lost when the final batch returned none.
+    last_invocation_id = returned_ids[-1] if returned_ids else None
+    manifest["resource_uuid"] = resource_uuid
+    manifest["invocation_id"] = last_invocation_id
+    if len(returned_ids) > 1:
+        manifest["invocation_ids"] = invocation_ids
+    else:
+        manifest.pop("invocation_ids", None)
+    return last_invocation_id
+def main() -> None:
+    """
+    Command-line entry point: load the manifest, push it, write it back.
+    Credentials and the resource UUID come from the CLI options or, when an
+    option is omitted, from the MCD_INGEST_ID / MCD_INGEST_TOKEN /
+    MCD_RESOURCE_UUID environment variables.  Missing credentials, a missing
+    resource UUID or a non-positive ``--batch-size`` end the program through
+    ``parser.error`` before the manifest is read.  The input file is
+    overwritten with the updated manifest after ``push()`` returns.
+    """
+    parser = argparse.ArgumentParser(
+        description="Push a collected Hive lineage manifest to Monte Carlo",
+    )
+    parser.add_argument(
+        "--key-id",
+        default=os.environ.get("MCD_INGEST_ID"),
+        help="Monte Carlo ingestion key ID (env: MCD_INGEST_ID)",
+    )
+    parser.add_argument(
+        "--key-token",
+        default=os.environ.get("MCD_INGEST_TOKEN"),
+        help="Monte Carlo ingestion key token (env: MCD_INGEST_TOKEN)",
+    )
+    parser.add_argument(
+        "--resource-uuid",
+        default=os.environ.get("MCD_RESOURCE_UUID"),
+        help="Monte Carlo resource UUID for this Hive connection (env: MCD_RESOURCE_UUID)",
+    )
+    parser.add_argument(
+        "--input-file",
+        default="lineage_output.json",
+        help="Path to the JSON manifest written by collect_lineage.py (default: lineage_output.json)",
+    )
+    parser.add_argument(
+        "--column-lineage",
+        action="store_true",
+        help="Push column-level lineage instead of table-level",
+    )
+    parser.add_argument(
+        "--batch-size",
+        type=int,
+        default=DEFAULT_BATCH_SIZE,
+        metavar="N",
+        help=f"Max events per POST (default: {DEFAULT_BATCH_SIZE})",
+    )
+    parser.add_argument(
+        "--timeout",
+        type=int,
+        default=DEFAULT_TIMEOUT_SECONDS,
+        metavar="SEC",
+        help=f"HTTP timeout per request in seconds (default: {DEFAULT_TIMEOUT_SECONDS})",
+    )
+    args = parser.parse_args()
+    if not args.key_id or not args.key_token:
+        parser.error("--key-id and --key-token are required (or set MCD_INGEST_ID / MCD_INGEST_TOKEN)")
+    if not args.resource_uuid:
+        parser.error("--resource-uuid is required (or set MCD_RESOURCE_UUID)")
+    # Zero or a negative size cannot be used to slice batches; report it as a
+    # usage error instead of failing later with a traceback from push().
+    if args.batch_size < 1:
+        parser.error("--batch-size must be a positive integer")
+    with open(args.input_file) as fh:
+        manifest = json.load(fh)
+    push(
+        manifest=manifest,
+        resource_uuid=args.resource_uuid,
+        key_id=args.key_id,
+        key_token=args.key_token,
+        column_lineage=args.column_lineage,
+        batch_size=args.batch_size,
+        timeout_seconds=args.timeout,
+    )
+    # push() mutated `manifest`; persist the added resource/invocation fields.
+    with open(args.input_file, "w") as fh:
+        json.dump(manifest, fh, indent=2)
+    print(f"Manifest updated in-place: {args.input_file}")
+    print("Done.")
+if __name__ == "__main__":
+    main()