npm - opencode-skills-collection - Versions diffs - 2.0.0 → 2.0.2 - Mend

opencode-skills-collection 2.0.0 → 2.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (90) hide show

package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_and_push_query_logs.py ADDED Viewed

@@ -0,0 +1,117 @@
+#!/usr/bin/env python3
+"""
+Collect Hive query logs from a local log file and push them to Monte Carlo
+in one step.
+Thin wrapper that calls ``collect()`` from ``collect_query_logs`` followed by
+``push()`` from ``push_query_logs``, then writes the final manifest (with
+``resource_uuid`` and ``invocation_id``) to ``--output-file``.
+Substitution points
+-------------------
+- MCD_INGEST_ID    (env) / --key-id        (CLI) : Monte Carlo ingestion key ID
+- MCD_INGEST_TOKEN (env) / --key-token      (CLI) : Monte Carlo ingestion key token
+- MCD_RESOURCE_UUID    (env) / --resource-uuid  (CLI) : MC resource UUID (optional for query logs)
+- --log-file                  path to local HiveServer2 log (default: /tmp/root/hive.log)
+- --op-logs-dir               optional directory of per-query <queryId>.log files
+Prerequisites
+-------------
+    pip install pycarlo python-dateutil python-dotenv
+Usage
+-----
+    python collect_and_push_query_logs.py \\
+        --key-id  <MCD_INGEST_ID> \\
+        --key-token <MCD_INGEST_TOKEN> \\
+        --resource-uuid <MCD_RESOURCE_UUID> \\
+        --log-file /tmp/root/hive.log \\
+        [--op-logs-dir /var/log/hive/operation_logs]
+"""
+import argparse
+import json
+import os
+from collect_query_logs import collect
+from push_query_logs import DEFAULT_BATCH_SIZE, DEFAULT_TIMEOUT_SECONDS, push
def main() -> None:
    """Collect Hive query logs, push them to Monte Carlo, and save the manifest.

    Parses the command line (key ID, key token and resource UUID default to the
    ``MCD_INGEST_ID`` / ``MCD_INGEST_TOKEN`` / ``MCD_RESOURCE_UUID`` environment
    variables), calls ``collect()`` followed by ``push()``, and finally dumps
    the manifest object returned by ``collect()`` to ``--output-file`` as
    indented JSON. A missing key ID or key token is reported through the
    parser's ``error()`` before anything is collected.
    """
    from_env = os.environ.get
    cli = argparse.ArgumentParser(
        description="Collect Hive query logs from a local log file and push to Monte Carlo",
    )

    # Options are declared as (flag, keyword-arguments) pairs and registered in
    # this order so the generated --help output keeps its layout.
    option_table = [
        # Collect args
        (
            "--log-file",
            {
                "default": "/tmp/root/hive.log",
                # ← SUBSTITUTE: your log path
                "help": "Path to local HiveServer2 log file (default: /tmp/root/hive.log)",
            },
        ),
        (
            "--op-logs-dir",
            {
                "default": None,
                # ← SUBSTITUTE: e.g. /var/log/hive/operation_logs or wherever Hive writes op logs
                "help": (
                    "Directory containing per-query Hive operation logs (<queryId>.log). "
                    "When provided, returned_rows is populated from SelectOperator RECORDS_OUT counts."
                ),
            },
        ),
        # Push / MC args
        (
            "--key-id",
            {
                "default": from_env("MCD_INGEST_ID"),
                "help": "Monte Carlo ingestion key ID (env: MCD_INGEST_ID)",
            },
        ),
        (
            "--key-token",
            {
                "default": from_env("MCD_INGEST_TOKEN"),
                "help": "Monte Carlo ingestion key token (env: MCD_INGEST_TOKEN)",
            },
        ),
        (
            "--resource-uuid",
            {
                "default": from_env("MCD_RESOURCE_UUID"),
                "help": "Monte Carlo resource UUID (optional for query logs) (env: MCD_RESOURCE_UUID)",
            },
        ),
        (
            "--output-file",
            {
                "default": "query_logs_output.json",
                "help": "Path to write the output manifest (default: query_logs_output.json)",
            },
        ),
        (
            "--batch-size",
            {
                "type": int,
                "default": DEFAULT_BATCH_SIZE,
                "metavar": "N",
                "help": f"Max events per POST (default: {DEFAULT_BATCH_SIZE})",
            },
        ),
        (
            "--timeout",
            {
                "type": int,
                "default": DEFAULT_TIMEOUT_SECONDS,
                "metavar": "SEC",
                "help": f"HTTP timeout per request in seconds (default: {DEFAULT_TIMEOUT_SECONDS})",
            },
        ),
    ]
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)

    opts = cli.parse_args()

    # Both halves of the ingestion key are mandatory, from CLI or environment.
    if not (opts.key_id and opts.key_token):
        cli.error("--key-id and --key-token are required (or set MCD_INGEST_ID / MCD_INGEST_TOKEN)")

    manifest = collect(log_file=opts.log_file, op_logs_dir=opts.op_logs_dir)

    push_options = {
        "manifest": manifest,
        "key_id": opts.key_id,
        "key_token": opts.key_token,
        "resource_uuid": opts.resource_uuid,
        "batch_size": opts.batch_size,
        "timeout_seconds": opts.timeout,
    }
    push(**push_options)

    # The manifest is written only after push() has returned.
    with open(opts.output_file, "w") as out:
        json.dump(manifest, out, indent=2)
    print(f"Query log manifest written to {opts.output_file}")
    print("Done.")


if __name__ == "__main__":
    main()

package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_lineage.py ADDED Viewed

@@ -0,0 +1,265 @@
+#!/usr/bin/env python3
+"""
+Extract table and column lineage from a local HiveServer2 log file — collection only.
+Reads a plain-text Hive log file (not compressed), extracts SQL query blocks
+from "Executing command" / "Starting command" entries, detects CTAS and
+INSERT INTO ... SELECT patterns to build lineage edges, then writes a JSON
+manifest file.
+Can be run standalone via CLI or imported (use the ``collect()`` function).
+Substitution points
+-------------------
+- --log-file  path to local HiveServer2 log (default: /tmp/root/hive.log)
+Prerequisites
+-------------
+    pip install python-dotenv
+Usage
+-----
+    python collect_lineage.py \\
+        --log-file /tmp/root/hive.log \\
+        --output-file lineage_output.json
+"""
+from __future__ import annotations
+import argparse
+import json
+import re
+from dataclasses import dataclass, field
+from datetime import datetime, timezone
+# ← SUBSTITUTE: set RESOURCE_TYPE to match your Monte Carlo connection type
+RESOURCE_TYPE = "data-lake"
+# Regex for CTAS: CREATE TABLE [IF NOT EXISTS] db.table AS SELECT ... FROM db.table
+_CTAS_RE = re.compile(
+    r"CREATE\s+TABLE\s+(?:IF\s+NOT\s+EXISTS\s+)?"
+    r"(?P<dest_db>\w+)\.(?P<dest_table>\w+)"
+    r".*?AS\s+SELECT\s+(?P<select_cols>.+?)\s+FROM\s+(?P<src_db>\w+)\.(?P<src_table>\w+)",
+    re.IGNORECASE | re.DOTALL,
+)
+# Regex for INSERT INTO/OVERWRITE db.table SELECT ... FROM db.table
+_INSERT_RE = re.compile(
+    r"INSERT\s+(?:INTO|OVERWRITE)\s+(?:TABLE\s+)?(?P<dest_db>\w+)\.(?P<dest_table>\w+)"
+    r".*?SELECT\s+(?P<select_cols>.+?)\s+FROM\s+(?P<src_db>\w+)\.(?P<src_table>\w+)",
+    re.IGNORECASE | re.DOTALL,
+)
+# Regex to detect additional JOIN sources beyond the primary FROM clause
+_JOIN_RE = re.compile(r"JOIN\s+(?P<src_db>\w+)\.(?P<src_table>\w+)", re.IGNORECASE)
+# Simple column alias extraction: [alias.]col [AS dest]
+_COL_RE = re.compile(r"(?:(\w+)\.)?(\w+)(?:\s+AS\s+(\w+))?", re.IGNORECASE)
+# Hive string literals — strip before scanning so words inside 'status' AS ...
+# are not treated as column refs
+_STR_LITERAL_RE = re.compile(r"'(?:''|[^'])*'")
+# ROW_NUMBER() OVER (...) AS alias — whole expression has no single source column;
+# removing it avoids bogus tokens in col_mappings
+_WINDOW_AS_ALIAS_RE = re.compile(
+    r"\b(?:ROW_NUMBER|RANK|DENSE_RANK|NTILE)\s*\(\s*\)\s+OVER\s*\([^)]*\)\s+AS\s+\w+",
+    re.IGNORECASE,
+)
+# Regex to pull query text out of Hive log "Executing/Starting command" lines
+_COMMAND_START_RE = re.compile(
+    r"(?:Executing|Starting)\s+command\(queryId=\S*\):\s+(?P<query>.+?)(?=\n\d{4}-\d{2}-\d{2}|\Z)",
+    re.DOTALL,
+)
+# Tokens that are almost never real column names — SQL keywords, functions, casts, etc.
+_SQL_SCAN_NOISE = frozenset(
+    {
+        "ROW_NUMBER", "RANK", "DENSE_RANK", "NTILE", "OVER", "PARTITION",
+        "ORDER", "BY", "CASE", "WHEN", "THEN", "ELSE", "END", "AND", "OR",
+        "NOT", "IN", "IS", "DISTINCT", "CAST", "CONVERT", "CURRENT_TIMESTAMP",
+        "CURRENT_DATE", "TRUE", "FALSE", "NULL", "BETWEEN", "LIKE", "EXISTS",
+        "ASC", "DESC", "LIMIT", "OFFSET", "GROUP", "HAVING", "UNION", "ALL",
+        "INNER", "LEFT", "RIGHT", "FULL", "OUTER", "CROSS", "JOIN", "ON",
+        "WHERE", "SELECT", "FROM", "AS", "STRING", "BIGINT", "INT", "SMALLINT",
+        "TINYINT", "DOUBLE", "FLOAT", "REAL", "DECIMAL", "BOOLEAN", "DATE",
+        "TIMESTAMP", "VARCHAR", "CHAR", "BINARY", "ARRAY", "MAP", "STRUCT",
+        "SUM", "AVG", "COUNT", "MIN", "MAX", "STDDEV", "VARIANCE", "VAR_POP",
+        "COALESCE", "IF", "SUBSTRING", "YEAR", "MONTH", "DAY", "LEAD", "LAG",
+        "FIRST_VALUE", "LAST_VALUE",
+    }
+)
+@dataclass
+class _LineageEdge:
+    dest_db: str
+    dest_table: str
+    sources: list[tuple[str, str]] = field(default_factory=list)
+    # col_mappings: (dest_col, src_table, src_col)
+    col_mappings: list[tuple[str, str, str]] = field(default_factory=list)
def _prepare_select_for_col_scan(select_clause: str) -> str:
    """Blank out string literals and window-function headers in a SELECT list.

    Each removed span is replaced by a single space, which reduces the number
    of false positives ``_COL_RE`` produces on the returned text.
    """
    without_literals = _STR_LITERAL_RE.sub(" ", select_clause)
    return _WINDOW_AS_ALIAS_RE.sub(" ", without_literals)
+def _dedupe_col_mappings(mappings: list[tuple[str, str, str]]) -> list[tuple[str, str, str]]:
+    seen: set[tuple[str, str, str]] = set()
+    out: list[tuple[str, str, str]] = []
+    for t in mappings:
+        if t in seen:
+            continue
+        seen.add(t)
+        out.append(t)
+    return out
def _extract_query_blocks(log_text: str) -> list[str]:
    """Return the whitespace-trimmed SQL text of every command entry in a Hive log."""
    blocks: list[str] = []
    for hit in _COMMAND_START_RE.finditer(log_text):
        blocks.append(hit.group("query").strip())
    return blocks
def _parse_select_cols(select_clause: str, src_table: str) -> list[tuple[str, str, str]]:
    """
    Build lightweight column mappings from a SELECT list.

    Every ``alias.col AS dest`` or ``col AS dest`` token found in the clause
    yields a ``(dest_col, src_table, src_col)`` triple; without an ``AS`` the
    destination name equals the source name. All triples are attributed to
    ``src_table``. String literals and window-function headers are removed
    first, SQL keyword/noise tokens are dropped, and the result is
    de-duplicated in first-seen order.
    """
    structural_tokens = ("FROM", "SELECT", "WHERE", "JOIN", "ON", "AS", "*")
    scan_text = _prepare_select_for_col_scan(select_clause)

    found = []
    for hit in _COL_RE.finditer(scan_text):
        source_name = hit.group(2)
        target_name = hit.group(3) or source_name

        source_upper = source_name.upper()
        if source_upper in structural_tokens or source_upper in _SQL_SCAN_NOISE:
            continue
        if target_name.upper() in _SQL_SCAN_NOISE:
            continue

        # A stripped 'literal' AS col leaves " AS col" behind: the bare name
        # right after an AS has no source expression, so it is not a mapping.
        if target_name == source_name:
            text_before = scan_text[: hit.start()].rstrip()
            if text_before.upper().endswith("AS"):
                continue

        found.append((target_name, src_table, source_name))

    return _dedupe_col_mappings(found)
def _parse_edges(queries: list[str]) -> list[_LineageEdge]:
    """Turn SQL query strings into one ``_LineageEdge`` per destination table.

    Only queries matching the CTAS or INSERT ... SELECT patterns contribute.
    Queries writing to the same ``db.table`` are merged into a single edge,
    and edges are returned in the order their destination was first seen.
    """
    by_destination: dict[str, _LineageEdge] = {}

    for raw_sql in queries:
        # Blank out string literals (so quoted text cannot look like a table or
        # column) and collapse all whitespace runs to single spaces.
        no_literals = _STR_LITERAL_RE.sub(" ", raw_sql)
        flat_sql = re.sub(r"\s+", " ", no_literals).strip()

        # CTAS takes precedence; INSERT is only tried when CTAS does not match.
        hit = _CTAS_RE.search(flat_sql) or _INSERT_RE.search(flat_sql)
        if hit is None:
            continue

        dest_db, dest_table, src_db, src_table = (
            hit.group(name).lower()
            for name in ("dest_db", "dest_table", "src_db", "src_table")
        )

        dest_key = f"{dest_db}.{dest_table}"
        edge = by_destination.get(dest_key)
        if edge is None:
            edge = by_destination[dest_key] = _LineageEdge(dest_db=dest_db, dest_table=dest_table)

        # Primary FROM table first, then every JOINed table, skipping repeats.
        candidates = [(src_db, src_table)]
        candidates.extend(
            (join.group("src_db").lower(), join.group("src_table").lower())
            for join in _JOIN_RE.finditer(flat_sql)
        )
        for pair in candidates:
            if pair not in edge.sources:
                edge.sources.append(pair)

        edge.col_mappings.extend(_parse_select_cols(hit.group("select_cols"), src_table))

    # The same INSERT may appear many times in HS2 logs, so mappings are
    # de-duplicated once more per edge after all queries are merged.
    for edge in by_destination.values():
        edge.col_mappings = _dedupe_col_mappings(edge.col_mappings)

    return list(by_destination.values())
def collect(log_file: str) -> dict:
    """
    Read a HiveServer2 log file and return its lineage as a manifest dict.

    Args:
        log_file: Path to a local HiveServer2 log file. Undecodable bytes are
            replaced rather than raising.

    Returns:
        Dict with keys ``resource_type``, ``collected_at`` (UTC ISO-8601
        timestamp) and ``edges``. Every edge holds a ``destination`` dict, a
        ``sources`` list and a ``col_mappings`` list.
    """
    print(f"Reading Hive log file: {log_file} ...")
    with open(log_file, errors="replace") as handle:
        raw_log = handle.read()

    sql_blocks = _extract_query_blocks(raw_log)
    print(f"  Extracted {len(sql_blocks)} query block(s).")

    lineage = _parse_edges(sql_blocks)
    print(f"  Parsed {len(lineage)} lineage edge(s).")

    collected_at = datetime.now(tz=timezone.utc).isoformat()

    # Convert the internal edge objects into plain JSON-serializable dicts.
    serialized_edges = []
    for edge in lineage:
        source_entries = []
        for source_db, source_table in edge.sources:
            source_entries.append({"database": source_db, "table": source_table})

        mapping_entries = []
        for dest_col, mapped_table, src_col in edge.col_mappings:
            mapping_entries.append(
                {"dest_col": dest_col, "src_table": mapped_table, "src_col": src_col}
            )

        serialized_edges.append(
            {
                "destination": {"database": edge.dest_db, "table": edge.dest_table},
                "sources": source_entries,
                "col_mappings": mapping_entries,
            }
        )

    return {
        "resource_type": RESOURCE_TYPE,
        "collected_at": collected_at,
        "edges": serialized_edges,
    }
def main() -> None:
    """CLI entry point: parse a Hive log and write the lineage manifest as JSON.

    When no lineage edges are found a notice is printed and the function
    returns without creating the output file.
    """
    cli = argparse.ArgumentParser(
        description="Extract Hive lineage from a local log file and write a JSON manifest",
    )
    option_table = [
        (
            "--log-file",
            {
                "default": "/tmp/root/hive.log",
                # ← SUBSTITUTE: your log path
                "help": "Path to local HiveServer2 log file (default: /tmp/root/hive.log)",
            },
        ),
        (
            "--output-file",
            {
                "default": "lineage_output.json",
                "help": "Path to write the lineage manifest (default: lineage_output.json)",
            },
        ),
    ]
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)
    opts = cli.parse_args()

    manifest = collect(log_file=opts.log_file)

    if manifest["edges"]:
        with open(opts.output_file, "w") as out:
            json.dump(manifest, out, indent=2)
        print(f"Lineage manifest written to {opts.output_file}")
        print("Done.")
    else:
        print("No lineage edges detected — no CTAS or INSERT INTO ... SELECT patterns found.")


if __name__ == "__main__":
    main()

package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_metadata.py ADDED Viewed

@@ -0,0 +1,313 @@
+#!/usr/bin/env python3
+"""
+Collect table metadata from a Hive Metastore — collection only.
+Connects to HiveServer2 (default port 10000), discovers all databases and
+tables via SHOW DATABASES / SHOW TABLES, reads schema and table statistics
+via DESCRIBE FORMATTED, then writes a JSON manifest file.
+Can be run standalone via CLI or imported (use the ``collect()`` function).
+Substitution points
+-------------------
+- HIVE_HOST         (env) / --hive-host   (CLI) : HiveServer2 hostname
+- HIVE_PORT         (env) / --hive-port   (CLI) : HiveServer2 port (default 10000)
+Prerequisites
+-------------
+    pip install pyhive python-dotenv
+Usage
+-----
+    python collect_metadata.py \\
+        --hive-host <HIVESERVER2_HOSTNAME> \\
+        --output-file metadata_output.json
+"""
+import argparse
+import json
+import os
+import re
+from datetime import datetime, timezone
+from pyhive import hive
+def _check_available_memory(min_gb: float = 2.0) -> None:
+    """Warn if available memory is below the threshold."""
+    try:
+        if hasattr(os, "sysconf"):  # Linux / macOS
+            page_size = os.sysconf("SC_PAGE_SIZE")
+            avail_pages = os.sysconf("SC_AVPHYS_PAGES")
+            avail_gb = (page_size * avail_pages) / (1024 ** 3)
+        else:
+            return  # Windows — skip check
+    except (ValueError, OSError):
+        return
+    if avail_gb < min_gb:
+        print(
+            f"WARNING: Only {avail_gb:.1f} GB of memory available "
+            f"(minimum recommended: {min_gb:.1f} GB). "
+            f"Consider reducing the number of databases/tables or increasing available memory."
+        )
+# ← SUBSTITUTE: set RESOURCE_TYPE to match your Monte Carlo connection type
+RESOURCE_TYPE = "data-lake"
+# Map Hive native types to SQL-standard uppercase types expected by Monte Carlo
+_HIVE_TYPE_MAP: dict[str, str] = {
+    "tinyint": "TINYINT",
+    "smallint": "SMALLINT",
+    "int": "INTEGER",
+    "integer": "INTEGER",
+    "bigint": "BIGINT",
+    "float": "FLOAT",
+    "double": "DOUBLE",
+    "double precision": "DOUBLE",
+    "decimal": "DECIMAL",
+    "numeric": "DECIMAL",
+    "boolean": "BOOLEAN",
+    "string": "VARCHAR",
+    "varchar": "VARCHAR",
+    "char": "CHAR",
+    "binary": "BINARY",
+    "timestamp": "TIMESTAMP",
+    "date": "DATE",
+    "interval": "INTERVAL",
+    "array": "ARRAY",
+    "map": "MAP",
+    "struct": "STRUCT",
+    "uniontype": "UNION",
+}
+# ← SUBSTITUTE: add any internal table name prefixes you want to skip
+_INTERNAL_TABLE_PREFIXES = ("tmp_", "__", "hive_")
+def _normalize_hive_type(hive_type: str) -> str:
+    """Uppercase and normalize a Hive type string to a SQL-standard form.
+    Parametrized types like ``decimal(10,2)`` or ``varchar(255)`` keep their
+    suffix; the base type is mapped through ``_HIVE_TYPE_MAP``.
+    """
+    lower = hive_type.lower().strip()
+    base = lower.split("(")[0].strip()
+    suffix = hive_type[len(base):].strip()  # preserve original params, e.g. decimal(10,2)
+    return _HIVE_TYPE_MAP.get(base, base.upper()) + suffix
def _connect(host: str, port: int) -> hive.Connection:
    """Open a HiveServer2 connection to ``host:port`` as user ``hadoop`` with auth ``NONE``."""
    # ← SUBSTITUTE: update username/auth if your cluster requires Kerberos or LDAP
    connection_settings = {
        "host": host,
        "port": port,
        "username": "hadoop",
        "auth": "NONE",
    }
    return hive.connect(**connection_settings)
+def _fetch_rows(cursor, query: str) -> list[tuple]:
+    """Execute a query and fetch results in memory-safe chunks."""
+    cursor.execute(query)
+    rows: list[tuple] = []
+    while True:
+        chunk = cursor.fetchmany(1000)
+        if not chunk:
+            break
+        rows.extend(chunk)
+    return rows
+def _parse_describe_formatted(rows: list[tuple]) -> dict:
+    """
+    Parse DESCRIBE FORMATTED <db>.<table> output into a structured dict:
+      columns, row_count, total_size, last_modified, description, created_on
+    """
+    result: dict = {
+        "columns": [],
+        "row_count": None,
+        "total_size": None,
+        "last_modified": None,
+        "description": None,
+        "created_on": None,
+    }
+    in_col_info = False
+    in_table_info = False
+    for row in rows:
+        col_name = (row[0] or "").strip()
+        data_type = (row[1] or "").strip()
+        comment = (row[2] or "").strip() if len(row) > 2 else ""
+        if col_name.startswith("# col_name"):
+            in_col_info = True
+            in_table_info = False
+            continue
+        if col_name.startswith("# Detailed Table Information"):
+            in_col_info = False
+            in_table_info = True
+            continue
+        if col_name.startswith("#"):
+            in_col_info = False
+            continue
+        if in_col_info and col_name and data_type:
+            result["columns"].append(
+                {
+                    "name": col_name,
+                    "type": _normalize_hive_type(data_type),
+                    "description": comment or None,
+                }
+            )
+        if in_table_info:
+            # Table Parameters rows have an empty col_name; key is in data_type, value in comment
+            param_key = data_type.strip() if not col_name else col_name.strip().rstrip(":")
+            param_val = (comment.strip() if not col_name else data_type.strip()) or ""
+            if re.search(r"numRows", param_key, re.IGNORECASE):
+                try:
+                    result["row_count"] = int(param_val)
+                except (ValueError, TypeError):
+                    pass
+            elif re.search(r"totalSize", param_key, re.IGNORECASE):
+                try:
+                    result["total_size"] = int(param_val)
+                except (ValueError, TypeError):
+                    pass
+            elif re.search(r"last_modified_time", param_key, re.IGNORECASE):
+                try:
+                    result["last_modified"] = datetime.fromtimestamp(
+                        int(param_val), tz=timezone.utc
+                    ).isoformat()
+                except (ValueError, TypeError):
+                    pass
+            elif re.search(r"^CreateTime", param_key):
+                # e.g. "Wed Mar 18 20:15:40 UTC 2026"
+                try:
+                    result["created_on"] = datetime.strptime(
+                        param_val, "%a %b %d %H:%M:%S %Z %Y"
+                    ).replace(tzinfo=timezone.utc).isoformat()
+                except (ValueError, TypeError):
+                    pass
+            elif param_key == "comment" and not result["description"] and param_val:
+                result["description"] = param_val
+    return result
def collect(
    hive_host: str,
    hive_port: int = 10000,
) -> dict:
    """
    Connect to HiveServer2, discover all databases and tables, and return a
    manifest dict with collected asset metadata.

    Tables whose name starts with one of ``_INTERNAL_TABLE_PREFIXES`` are
    skipped, as are tables whose DESCRIBE FORMATTED query fails (a warning is
    printed for those). The cursor and connection are closed even when
    collection aborts with an exception.

    Args:
        hive_host: HiveServer2 hostname.
        hive_port: HiveServer2 port (default 10000).

    Returns:
        Manifest dict with keys: resource_type, collected_at, assets.
    """
    _check_available_memory()
    print(f"Connecting to HiveServer2 at {hive_host}:{hive_port} ...")
    conn = _connect(hive_host, hive_port)
    assets: list[dict] = []
    try:
        cursor = conn.cursor()
        try:
            print("Collecting table metadata ...")
            databases = [row[0] for row in _fetch_rows(cursor, "SHOW DATABASES")]
            print(f"  Found databases: {databases}")
            for db in databases:
                # ← SUBSTITUTE: add any system databases you want to skip
                if db in ("information_schema",):
                    continue
                tables = _fetch_rows(cursor, f"SHOW TABLES IN {db}")
                table_names = [row[0] for row in tables]
                print(f"  {db}: {len(table_names)} table(s)")
                for table in table_names:
                    if table.startswith(_INTERNAL_TABLE_PREFIXES):
                        continue
                    # Best effort per table: one table that cannot be described
                    # must not abort the whole run, so any failure is reported
                    # and the table is skipped.
                    try:
                        desc_rows = _fetch_rows(cursor, f"DESCRIBE FORMATTED {db}.{table}")
                    except Exception as exc:
                        print(f"    WARNING: could not describe {db}.{table}: {exc}")
                        continue
                    info = _parse_describe_formatted(desc_rows)
                    # Missing, zero or negative statistics are reported as None.
                    row_count = info["row_count"] if info["row_count"] and info["row_count"] > 0 else None
                    byte_count = info["total_size"] if info["total_size"] and info["total_size"] > 0 else None
                    assets.append(
                        {
                            "database": db,
                            "schema": db,
                            "name": table,
                            "description": info["description"],
                            "created_on": info["created_on"],
                            "row_count": row_count,
                            "byte_count": byte_count,
                            "last_modified": info["last_modified"],
                            "fields": [
                                {"name": col["name"], "type": col["type"], "description": col["description"]}
                                for col in info["columns"]
                            ],
                        }
                    )
                    print(
                        f"    + {db}.{table} ({len(info['columns'])} columns, "
                        f"desc={info['description']!r}, created={info['created_on']})"
                    )
        finally:
            # Release the cursor even if a SHOW/DESCRIBE query raised.
            cursor.close()
    finally:
        # Always give the HiveServer2 session back, success or failure.
        conn.close()

    print(f"\nCollected {len(assets)} table(s).")
    manifest = {
        "resource_type": RESOURCE_TYPE,
        "collected_at": datetime.now(tz=timezone.utc).isoformat(),
        "assets": assets,
    }
    return manifest
def main() -> None:
    """CLI entry point: collect Hive table metadata and write the manifest as JSON.

    ``--hive-host`` falls back to the ``HIVE_HOST`` environment variable and
    ``--hive-port`` to ``HIVE_PORT`` (10000 when that is unset or empty). A
    missing host is reported through the parser's ``error()``.
    """
    parser = argparse.ArgumentParser(
        description="Collect Hive table metadata and write a JSON manifest",
    )
    parser.add_argument(
        "--hive-host",
        default=os.environ.get("HIVE_HOST"),
        help="HiveServer2 hostname (env: HIVE_HOST)",  # ← SUBSTITUTE: your EMR master DNS or Hive host
    )
    parser.add_argument(
        "--hive-port",
        type=int,
        # Honor HIVE_PORT as the module documentation promises. The default is
        # passed as a string so argparse runs it through ``type=int`` and
        # reports an invalid value as a normal usage error.
        default=os.environ.get("HIVE_PORT") or "10000",
        help="HiveServer2 port (env: HIVE_PORT, default: 10000)",  # ← SUBSTITUTE if your cluster uses a non-standard port
    )
    parser.add_argument(
        "--output-file",
        default="metadata_output.json",
        help="Path to write the output manifest (default: metadata_output.json)",
    )
    args = parser.parse_args()
    if not args.hive_host:
        parser.error("--hive-host is required (or set HIVE_HOST)")

    manifest = collect(
        hive_host=args.hive_host,
        hive_port=args.hive_port,
    )
    with open(args.output_file, "w") as fh:
        json.dump(manifest, fh, indent=2)
    print(f"Asset manifest written to {args.output_file}")
    print("Done.")


if __name__ == "__main__":
    main()