opencode-skills-collection 2.0.0 → 2.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90) hide show
  1. package/bundled-skills/.antigravity-install-manifest.json +6 -1
  2. package/bundled-skills/docs/integrations/jetski-cortex.md +3 -3
  3. package/bundled-skills/docs/integrations/jetski-gemini-loader/README.md +1 -1
  4. package/bundled-skills/docs/maintainers/repo-growth-seo.md +3 -3
  5. package/bundled-skills/docs/maintainers/skills-update-guide.md +1 -1
  6. package/bundled-skills/docs/users/bundles.md +1 -1
  7. package/bundled-skills/docs/users/claude-code-skills.md +1 -1
  8. package/bundled-skills/docs/users/gemini-cli-skills.md +1 -1
  9. package/bundled-skills/docs/users/getting-started.md +1 -1
  10. package/bundled-skills/docs/users/kiro-integration.md +1 -1
  11. package/bundled-skills/docs/users/usage.md +4 -4
  12. package/bundled-skills/docs/users/visual-guide.md +4 -4
  13. package/bundled-skills/manage-skills/SKILL.md +187 -0
  14. package/bundled-skills/monte-carlo-monitor-creation/SKILL.md +222 -0
  15. package/bundled-skills/monte-carlo-monitor-creation/references/comparison-monitor.md +426 -0
  16. package/bundled-skills/monte-carlo-monitor-creation/references/custom-sql-monitor.md +207 -0
  17. package/bundled-skills/monte-carlo-monitor-creation/references/metric-monitor.md +292 -0
  18. package/bundled-skills/monte-carlo-monitor-creation/references/table-monitor.md +231 -0
  19. package/bundled-skills/monte-carlo-monitor-creation/references/validation-monitor.md +404 -0
  20. package/bundled-skills/monte-carlo-prevent/SKILL.md +252 -0
  21. package/bundled-skills/monte-carlo-prevent/references/TROUBLESHOOTING.md +23 -0
  22. package/bundled-skills/monte-carlo-prevent/references/parameters.md +32 -0
  23. package/bundled-skills/monte-carlo-prevent/references/workflows.md +478 -0
  24. package/bundled-skills/monte-carlo-push-ingestion/SKILL.md +363 -0
  25. package/bundled-skills/monte-carlo-push-ingestion/references/anomaly-detection.md +87 -0
  26. package/bundled-skills/monte-carlo-push-ingestion/references/custom-lineage.md +203 -0
  27. package/bundled-skills/monte-carlo-push-ingestion/references/direct-http-api.md +207 -0
  28. package/bundled-skills/monte-carlo-push-ingestion/references/prerequisites.md +150 -0
  29. package/bundled-skills/monte-carlo-push-ingestion/references/push-lineage.md +160 -0
  30. package/bundled-skills/monte-carlo-push-ingestion/references/push-metadata.md +158 -0
  31. package/bundled-skills/monte-carlo-push-ingestion/references/push-query-logs.md +219 -0
  32. package/bundled-skills/monte-carlo-push-ingestion/references/validation.md +257 -0
  33. package/bundled-skills/monte-carlo-push-ingestion/scripts/sample_verify.py +357 -0
  34. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_and_push_lineage.py +70 -0
  35. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_and_push_metadata.py +65 -0
  36. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_and_push_query_logs.py +70 -0
  37. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_lineage.py +214 -0
  38. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_metadata.py +160 -0
  39. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_query_logs.py +164 -0
  40. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/push_lineage.py +198 -0
  41. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/push_metadata.py +193 -0
  42. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/push_query_logs.py +207 -0
  43. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/collect_and_push_metadata.py +71 -0
  44. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/collect_and_push_query_logs.py +64 -0
  45. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/collect_metadata.py +253 -0
  46. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/collect_query_logs.py +149 -0
  47. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/push_metadata.py +190 -0
  48. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/push_query_logs.py +208 -0
  49. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_and_push_lineage.py +83 -0
  50. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_and_push_metadata.py +77 -0
  51. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_and_push_query_logs.py +83 -0
  52. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_lineage.py +240 -0
  53. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_metadata.py +212 -0
  54. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_query_logs.py +204 -0
  55. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/push_lineage.py +192 -0
  56. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/push_metadata.py +178 -0
  57. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/push_query_logs.py +200 -0
  58. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_and_push_lineage.py +119 -0
  59. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_and_push_metadata.py +119 -0
  60. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_and_push_query_logs.py +117 -0
  61. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_lineage.py +265 -0
  62. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_metadata.py +313 -0
  63. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_query_logs.py +284 -0
  64. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/push_lineage.py +309 -0
  65. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/push_metadata.py +245 -0
  66. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/push_query_logs.py +255 -0
  67. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_and_push_lineage.py +78 -0
  68. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_and_push_metadata.py +80 -0
  69. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_and_push_query_logs.py +88 -0
  70. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_lineage.py +235 -0
  71. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_metadata.py +219 -0
  72. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_query_logs.py +239 -0
  73. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/push_lineage.py +178 -0
  74. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/push_metadata.py +178 -0
  75. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/push_query_logs.py +196 -0
  76. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_and_push_lineage.py +154 -0
  77. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_and_push_metadata.py +137 -0
  78. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_and_push_query_logs.py +137 -0
  79. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_lineage.py +349 -0
  80. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_metadata.py +329 -0
  81. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_query_logs.py +254 -0
  82. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/push_lineage.py +307 -0
  83. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/push_metadata.py +228 -0
  84. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/push_query_logs.py +248 -0
  85. package/bundled-skills/monte-carlo-push-ingestion/scripts/test_template_sdk_usage.py +340 -0
  86. package/bundled-skills/monte-carlo-validation-notebook/SKILL.md +685 -0
  87. package/bundled-skills/monte-carlo-validation-notebook/scripts/generate_notebook_url.py +141 -0
  88. package/bundled-skills/monte-carlo-validation-notebook/scripts/resolve_dbt_schema.py +161 -0
  89. package/package.json +1 -1
  90. package/skills_index.json +503 -61
@@ -0,0 +1,117 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Collect Hive query logs from a local log file and push them to Monte Carlo
4
+ in one step.
5
+
6
+ Thin wrapper that calls ``collect()`` from ``collect_query_logs`` followed by
7
+ ``push()`` from ``push_query_logs``, then writes the final manifest (with
8
+ ``resource_uuid`` and ``invocation_id``) to ``--output-file``.
9
+
10
+ Substitution points
11
+ -------------------
12
+ - MCD_INGEST_ID (env) / --key-id (CLI) : Monte Carlo ingestion key ID
13
+ - MCD_INGEST_TOKEN (env) / --key-token (CLI) : Monte Carlo ingestion key token
14
+ - MCD_RESOURCE_UUID (env) / --resource-uuid (CLI) : MC resource UUID (optional for query logs)
15
+ - --log-file path to local HiveServer2 log (default: /tmp/root/hive.log)
16
+ - --op-logs-dir optional directory of per-query <queryId>.log files
17
+
18
+ Prerequisites
19
+ -------------
20
+ pip install pycarlo python-dateutil python-dotenv
21
+
22
+ Usage
23
+ -----
24
+ python collect_and_push_query_logs.py \\
25
+ --key-id <MCD_INGEST_ID> \\
26
+ --key-token <MCD_INGEST_TOKEN> \\
27
+ --resource-uuid <MCD_RESOURCE_UUID> \\
28
+ --log-file /tmp/root/hive.log \\
29
+ [--op-logs-dir /var/log/hive/operation_logs]
30
+ """
31
+
32
+ import argparse
33
+ import json
34
+ import os
35
+
36
+ from collect_query_logs import collect
37
+ from push_query_logs import DEFAULT_BATCH_SIZE, DEFAULT_TIMEOUT_SECONDS, push
38
+
39
+
40
def main() -> None:
    """Parse CLI options, collect Hive query logs, push them, and save the manifest.

    Credentials default to the ``MCD_INGEST_ID`` / ``MCD_INGEST_TOKEN``
    environment variables; the run aborts through ``parser.error`` when either
    one is missing. The manifest returned by ``collect()`` is passed to
    ``push()`` and then serialized as JSON to ``--output-file``.
    """
    cli = argparse.ArgumentParser(
        description="Collect Hive query logs from a local log file and push to Monte Carlo",
    )

    # (flag, add_argument keyword arguments), listed in --help order.
    option_table = [
        # Collect args
        (
            "--log-file",
            {
                "default": "/tmp/root/hive.log",
                # ← SUBSTITUTE: your log path
                "help": "Path to local HiveServer2 log file (default: /tmp/root/hive.log)",
            },
        ),
        (
            "--op-logs-dir",
            {
                "default": None,
                # ← SUBSTITUTE: e.g. /var/log/hive/operation_logs or wherever Hive writes op logs
                "help": (
                    "Directory containing per-query Hive operation logs (<queryId>.log). "
                    "When provided, returned_rows is populated from SelectOperator RECORDS_OUT counts."
                ),
            },
        ),
        # Push / MC args
        (
            "--key-id",
            {
                "default": os.environ.get("MCD_INGEST_ID"),
                "help": "Monte Carlo ingestion key ID (env: MCD_INGEST_ID)",
            },
        ),
        (
            "--key-token",
            {
                "default": os.environ.get("MCD_INGEST_TOKEN"),
                "help": "Monte Carlo ingestion key token (env: MCD_INGEST_TOKEN)",
            },
        ),
        (
            "--resource-uuid",
            {
                "default": os.environ.get("MCD_RESOURCE_UUID"),
                "help": "Monte Carlo resource UUID (optional for query logs) (env: MCD_RESOURCE_UUID)",
            },
        ),
        (
            "--output-file",
            {
                "default": "query_logs_output.json",
                "help": "Path to write the output manifest (default: query_logs_output.json)",
            },
        ),
        (
            "--batch-size",
            {
                "type": int,
                "default": DEFAULT_BATCH_SIZE,
                "metavar": "N",
                "help": f"Max events per POST (default: {DEFAULT_BATCH_SIZE})",
            },
        ),
        (
            "--timeout",
            {
                "type": int,
                "default": DEFAULT_TIMEOUT_SECONDS,
                "metavar": "SEC",
                "help": f"HTTP timeout per request in seconds (default: {DEFAULT_TIMEOUT_SECONDS})",
            },
        ),
    ]
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)
    opts = cli.parse_args()

    # Both halves of the ingestion key are mandatory, from CLI or environment.
    if not (opts.key_id and opts.key_token):
        cli.error("--key-id and --key-token are required (or set MCD_INGEST_ID / MCD_INGEST_TOKEN)")

    manifest = collect(log_file=opts.log_file, op_logs_dir=opts.op_logs_dir)

    # NOTE(review): the return value of push() is discarded, so any
    # resource_uuid / invocation_id in the written manifest presumably comes
    # from push() mutating ``manifest`` in place — confirm against push_query_logs.
    push(
        manifest=manifest,
        key_id=opts.key_id,
        key_token=opts.key_token,
        resource_uuid=opts.resource_uuid,
        batch_size=opts.batch_size,
        timeout_seconds=opts.timeout,
    )

    with open(opts.output_file, "w") as out:
        json.dump(manifest, out, indent=2)
    print(f"Query log manifest written to {opts.output_file}")
    print("Done.")


if __name__ == "__main__":
    main()
@@ -0,0 +1,265 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Extract table and column lineage from a local HiveServer2 log file — collection only.
4
+
5
+ Reads a plain-text Hive log file (not compressed), extracts SQL query blocks
6
+ from "Executing command" / "Starting command" entries, detects CTAS and
7
+ INSERT INTO ... SELECT patterns to build lineage edges, then writes a JSON
8
+ manifest file.
9
+
10
+ Can be run standalone via CLI or imported (use the ``collect()`` function).
11
+
12
+ Substitution points
13
+ -------------------
14
+ - --log-file path to local HiveServer2 log (default: /tmp/root/hive.log)
15
+
16
+ Prerequisites
17
+ -------------
18
+ pip install python-dotenv
19
+
20
+ Usage
21
+ -----
22
+ python collect_lineage.py \\
23
+ --log-file /tmp/root/hive.log \\
24
+ --output-file lineage_output.json
25
+ """
26
+
27
+ from __future__ import annotations
28
+
29
+ import argparse
30
+ import json
31
+ import re
32
+ from dataclasses import dataclass, field
33
+ from datetime import datetime, timezone
34
+
35
+ # ← SUBSTITUTE: set RESOURCE_TYPE to match your Monte Carlo connection type
36
+ RESOURCE_TYPE = "data-lake"
37
+
38
+ # Regex for CTAS: CREATE TABLE [IF NOT EXISTS] db.table AS SELECT ... FROM db.table
39
+ _CTAS_RE = re.compile(
40
+ r"CREATE\s+TABLE\s+(?:IF\s+NOT\s+EXISTS\s+)?"
41
+ r"(?P<dest_db>\w+)\.(?P<dest_table>\w+)"
42
+ r".*?AS\s+SELECT\s+(?P<select_cols>.+?)\s+FROM\s+(?P<src_db>\w+)\.(?P<src_table>\w+)",
43
+ re.IGNORECASE | re.DOTALL,
44
+ )
45
+
46
+ # Regex for INSERT INTO/OVERWRITE db.table SELECT ... FROM db.table
47
+ _INSERT_RE = re.compile(
48
+ r"INSERT\s+(?:INTO|OVERWRITE)\s+(?:TABLE\s+)?(?P<dest_db>\w+)\.(?P<dest_table>\w+)"
49
+ r".*?SELECT\s+(?P<select_cols>.+?)\s+FROM\s+(?P<src_db>\w+)\.(?P<src_table>\w+)",
50
+ re.IGNORECASE | re.DOTALL,
51
+ )
52
+
53
+ # Regex to detect additional JOIN sources beyond the primary FROM clause
54
+ _JOIN_RE = re.compile(r"JOIN\s+(?P<src_db>\w+)\.(?P<src_table>\w+)", re.IGNORECASE)
55
+
56
+ # Simple column alias extraction: [alias.]col [AS dest]
57
+ _COL_RE = re.compile(r"(?:(\w+)\.)?(\w+)(?:\s+AS\s+(\w+))?", re.IGNORECASE)
58
+
59
+ # Hive string literals — strip before scanning so words inside 'status' AS ...
60
+ # are not treated as column refs
61
+ _STR_LITERAL_RE = re.compile(r"'(?:''|[^'])*'")
62
+
63
+ # ROW_NUMBER() OVER (...) AS alias — whole expression has no single source column;
64
+ # removing it avoids bogus tokens in col_mappings
65
+ _WINDOW_AS_ALIAS_RE = re.compile(
66
+ r"\b(?:ROW_NUMBER|RANK|DENSE_RANK|NTILE)\s*\(\s*\)\s+OVER\s*\([^)]*\)\s+AS\s+\w+",
67
+ re.IGNORECASE,
68
+ )
69
+
70
+ # Regex to pull query text out of Hive log "Executing/Starting command" lines
71
+ _COMMAND_START_RE = re.compile(
72
+ r"(?:Executing|Starting)\s+command\(queryId=\S*\):\s+(?P<query>.+?)(?=\n\d{4}-\d{2}-\d{2}|\Z)",
73
+ re.DOTALL,
74
+ )
75
+
76
+ # Tokens that are almost never real column names — SQL keywords, functions, casts, etc.
77
+ _SQL_SCAN_NOISE = frozenset(
78
+ {
79
+ "ROW_NUMBER", "RANK", "DENSE_RANK", "NTILE", "OVER", "PARTITION",
80
+ "ORDER", "BY", "CASE", "WHEN", "THEN", "ELSE", "END", "AND", "OR",
81
+ "NOT", "IN", "IS", "DISTINCT", "CAST", "CONVERT", "CURRENT_TIMESTAMP",
82
+ "CURRENT_DATE", "TRUE", "FALSE", "NULL", "BETWEEN", "LIKE", "EXISTS",
83
+ "ASC", "DESC", "LIMIT", "OFFSET", "GROUP", "HAVING", "UNION", "ALL",
84
+ "INNER", "LEFT", "RIGHT", "FULL", "OUTER", "CROSS", "JOIN", "ON",
85
+ "WHERE", "SELECT", "FROM", "AS", "STRING", "BIGINT", "INT", "SMALLINT",
86
+ "TINYINT", "DOUBLE", "FLOAT", "REAL", "DECIMAL", "BOOLEAN", "DATE",
87
+ "TIMESTAMP", "VARCHAR", "CHAR", "BINARY", "ARRAY", "MAP", "STRUCT",
88
+ "SUM", "AVG", "COUNT", "MIN", "MAX", "STDDEV", "VARIANCE", "VAR_POP",
89
+ "COALESCE", "IF", "SUBSTRING", "YEAR", "MONTH", "DAY", "LEAD", "LAG",
90
+ "FIRST_VALUE", "LAST_VALUE",
91
+ }
92
+ )
93
+
94
+
95
@dataclass
class _LineageEdge:
    """Lineage collected for one destination table.

    ``_parse_edges`` keeps one instance per ``dest_db.dest_table`` key and
    appends sources and column mappings to it as matching queries are found.
    """

    # Destination database and table names (lower-cased by _parse_edges).
    dest_db: str
    dest_table: str
    # (database, table) pairs that feed the destination, in first-seen order.
    sources: list[tuple[str, str]] = field(default_factory=list)
    # col_mappings: (dest_col, src_table, src_col)
    col_mappings: list[tuple[str, str, str]] = field(default_factory=list)
102
+
103
+
104
def _prepare_select_for_col_scan(select_clause: str) -> str:
    """Blank out string literals, then aliased ranking-window calls.

    Each removed span is replaced with a single space so the remaining text
    gives ``_COL_RE`` fewer false-positive column tokens.
    """
    without_literals = _STR_LITERAL_RE.sub(" ", select_clause)
    return _WINDOW_AS_ALIAS_RE.sub(" ", without_literals)
109
+
110
+
111
+ def _dedupe_col_mappings(mappings: list[tuple[str, str, str]]) -> list[tuple[str, str, str]]:
112
+ seen: set[tuple[str, str, str]] = set()
113
+ out: list[tuple[str, str, str]] = []
114
+ for t in mappings:
115
+ if t in seen:
116
+ continue
117
+ seen.add(t)
118
+ out.append(t)
119
+ return out
120
+
121
+
122
def _extract_query_blocks(log_text: str) -> list[str]:
    """Return the stripped SQL text of every command entry found in *log_text*."""
    blocks: list[str] = []
    for entry in _COMMAND_START_RE.finditer(log_text):
        blocks.append(entry.group("query").strip())
    return blocks
125
+
126
+
127
def _parse_select_cols(select_clause: str, src_table: str) -> list[tuple[str, str, str]]:
    """
    Lightweight column mapping: for each `alias.col AS dest` or `col AS dest`
    in the SELECT clause, return (dest_col, src_table, src_col).

    Strips string literals and window function headers first to reduce false
    positives, and filters out SQL keywords/noise tokens and numeric literals.

    Args:
        select_clause: Text between SELECT and FROM of the matched statement.
        src_table: Table name recorded as the source of every mapping; any
            ``alias.`` qualifier on a column is ignored.

    Returns:
        De-duplicated ``(dest_col, src_table, src_col)`` tuples in first-seen order.
    """
    prepared = _prepare_select_for_col_scan(select_clause)
    mappings: list[tuple[str, str, str]] = []
    for m in _COL_RE.finditer(prepared):
        src_col = m.group(2)
        dest_col = m.group(3) or src_col
        # Numeric literals also match \w+ (`1 AS flag`, or `10.5`, which is read
        # as alias "10" + column "5"). They are constants, not source columns.
        if src_col.isdigit():
            continue
        # Keywords, functions and type names; this set already contains
        # FROM/SELECT/WHERE/JOIN/ON/AS, so no separate keyword check is needed.
        if src_col.upper() in _SQL_SCAN_NOISE or dest_col.upper() in _SQL_SCAN_NOISE:
            continue
        # After stripping 'literal' AS col, we get " AS col" — skip bare (col, col) with no source expr.
        if dest_col == src_col and prepared[: m.start()].rstrip().upper().endswith("AS"):
            continue
        mappings.append((dest_col, src_table, src_col))
    return _dedupe_col_mappings(mappings)
151
+
152
+
153
def _parse_edges(queries: list[str]) -> list[_LineageEdge]:
    """Build one _LineageEdge per destination table from raw SQL strings.

    A query contributes when it matches the CTAS pattern or, failing that, the
    INSERT ... SELECT pattern. Edges are returned in first-seen destination order.
    """
    by_destination: dict[str, _LineageEdge] = {}

    for raw_sql in queries:
        # Drop string literals (they can contain table-like text) and collapse whitespace.
        flattened = re.sub(r"\s+", " ", _STR_LITERAL_RE.sub(" ", raw_sql)).strip()

        # CTAS takes precedence; INSERT is only tried when CTAS does not match.
        hit = _CTAS_RE.search(flattened) or _INSERT_RE.search(flattened)
        if hit is None:
            continue

        dest_db, dest_table, src_db, src_table = (
            hit.group(part).lower()
            for part in ("dest_db", "dest_table", "src_db", "src_table")
        )

        edge = by_destination.setdefault(
            f"{dest_db}.{dest_table}",
            _LineageEdge(dest_db=dest_db, dest_table=dest_table),
        )

        # Primary FROM source first, then every JOIN source found in the statement.
        candidates = [(src_db, src_table)]
        candidates.extend(
            (joined.group("src_db").lower(), joined.group("src_table").lower())
            for joined in _JOIN_RE.finditer(flattened)
        )
        for pair in candidates:
            if pair not in edge.sources:
                edge.sources.append(pair)

        edge.col_mappings.extend(_parse_select_cols(hit.group("select_cols"), src_table))

    # The same statement may be logged many times in HS2 logs; dedupe per edge.
    for edge in by_destination.values():
        edge.col_mappings = _dedupe_col_mappings(edge.col_mappings)

    return list(by_destination.values())
195
+
196
+
197
def collect(log_file: str) -> dict:
    """
    Read a HiveServer2 log file and return its lineage as a manifest dict.

    Args:
        log_file: Path to a local HiveServer2 log file.

    Returns:
        Dict with keys ``resource_type``, ``collected_at`` (UTC ISO-8601
        timestamp) and ``edges``; each edge holds ``destination``,
        ``sources`` and ``col_mappings``.
    """
    print(f"Reading Hive log file: {log_file} ...")
    # Undecodable bytes are replaced rather than aborting the read.
    with open(log_file, errors="replace") as handle:
        raw_log = handle.read()

    statements = _extract_query_blocks(raw_log)
    print(f" Extracted {len(statements)} query block(s).")

    parsed_edges = _parse_edges(statements)
    print(f" Parsed {len(parsed_edges)} lineage edge(s).")

    edge_records = []
    for edge in parsed_edges:
        source_records = [
            {"database": source_db, "table": source_table}
            for source_db, source_table in edge.sources
        ]
        mapping_records = [
            {"dest_col": dest_col, "src_table": from_table, "src_col": from_col}
            for dest_col, from_table, from_col in edge.col_mappings
        ]
        edge_records.append(
            {
                "destination": {"database": edge.dest_db, "table": edge.dest_table},
                "sources": source_records,
                "col_mappings": mapping_records,
            }
        )

    return {
        "resource_type": RESOURCE_TYPE,
        "collected_at": datetime.now(tz=timezone.utc).isoformat(),
        "edges": edge_records,
    }
234
+
235
+
236
def main() -> None:
    """CLI entry point: collect lineage and write it as JSON.

    Nothing is written when the log yields no lineage edges; a notice is
    printed instead.
    """
    cli = argparse.ArgumentParser(
        description="Extract Hive lineage from a local log file and write a JSON manifest",
    )
    # ← SUBSTITUTE: your log path
    cli.add_argument(
        "--log-file",
        default="/tmp/root/hive.log",
        help="Path to local HiveServer2 log file (default: /tmp/root/hive.log)",
    )
    cli.add_argument(
        "--output-file",
        default="lineage_output.json",
        help="Path to write the lineage manifest (default: lineage_output.json)",
    )
    opts = cli.parse_args()

    manifest = collect(log_file=opts.log_file)

    if manifest["edges"]:
        with open(opts.output_file, "w") as out:
            json.dump(manifest, out, indent=2)
        print(f"Lineage manifest written to {opts.output_file}")
        print("Done.")
    else:
        print("No lineage edges detected — no CTAS or INSERT INTO ... SELECT patterns found.")


if __name__ == "__main__":
    main()
@@ -0,0 +1,313 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Collect table metadata from a Hive Metastore — collection only.
4
+
5
+ Connects to HiveServer2 (default port 10000), discovers all databases and
6
+ tables via SHOW DATABASES / SHOW TABLES, reads schema and table statistics
7
+ via DESCRIBE FORMATTED, then writes a JSON manifest file.
8
+
9
+ Can be run standalone via CLI or imported (use the ``collect()`` function).
10
+
11
+ Substitution points
12
+ -------------------
13
+ - HIVE_HOST (env) / --hive-host (CLI) : HiveServer2 hostname
14
+ - HIVE_PORT (env) / --hive-port (CLI) : HiveServer2 port (default 10000)
15
+
16
+ Prerequisites
17
+ -------------
18
+ pip install pyhive python-dotenv
19
+
20
+ Usage
21
+ -----
22
+ python collect_metadata.py \\
23
+ --hive-host <HIVESERVER2_HOSTNAME> \\
24
+ --output-file metadata_output.json
25
+ """
26
+
27
+ import argparse
28
+ import json
29
+ import os
30
+ import re
31
+ from datetime import datetime, timezone
32
+
33
+ from pyhive import hive
34
+
35
+
36
+ def _check_available_memory(min_gb: float = 2.0) -> None:
37
+ """Warn if available memory is below the threshold."""
38
+ try:
39
+ if hasattr(os, "sysconf"): # Linux / macOS
40
+ page_size = os.sysconf("SC_PAGE_SIZE")
41
+ avail_pages = os.sysconf("SC_AVPHYS_PAGES")
42
+ avail_gb = (page_size * avail_pages) / (1024 ** 3)
43
+ else:
44
+ return # Windows — skip check
45
+ except (ValueError, OSError):
46
+ return
47
+ if avail_gb < min_gb:
48
+ print(
49
+ f"WARNING: Only {avail_gb:.1f} GB of memory available "
50
+ f"(minimum recommended: {min_gb:.1f} GB). "
51
+ f"Consider reducing the number of databases/tables or increasing available memory."
52
+ )
53
+
54
+ # ← SUBSTITUTE: set RESOURCE_TYPE to match your Monte Carlo connection type
55
+ RESOURCE_TYPE = "data-lake"
56
+
57
+ # Map Hive native types to SQL-standard uppercase types expected by Monte Carlo
58
+ _HIVE_TYPE_MAP: dict[str, str] = {
59
+ "tinyint": "TINYINT",
60
+ "smallint": "SMALLINT",
61
+ "int": "INTEGER",
62
+ "integer": "INTEGER",
63
+ "bigint": "BIGINT",
64
+ "float": "FLOAT",
65
+ "double": "DOUBLE",
66
+ "double precision": "DOUBLE",
67
+ "decimal": "DECIMAL",
68
+ "numeric": "DECIMAL",
69
+ "boolean": "BOOLEAN",
70
+ "string": "VARCHAR",
71
+ "varchar": "VARCHAR",
72
+ "char": "CHAR",
73
+ "binary": "BINARY",
74
+ "timestamp": "TIMESTAMP",
75
+ "date": "DATE",
76
+ "interval": "INTERVAL",
77
+ "array": "ARRAY",
78
+ "map": "MAP",
79
+ "struct": "STRUCT",
80
+ "uniontype": "UNION",
81
+ }
82
+
83
+ # ← SUBSTITUTE: add any internal table name prefixes you want to skip
84
+ _INTERNAL_TABLE_PREFIXES = ("tmp_", "__", "hive_")
85
+
86
+
87
+ def _normalize_hive_type(hive_type: str) -> str:
88
+ """Uppercase and normalize a Hive type string to a SQL-standard form.
89
+
90
+ Parametrized types like ``decimal(10,2)`` or ``varchar(255)`` keep their
91
+ suffix; the base type is mapped through ``_HIVE_TYPE_MAP``.
92
+ """
93
+ lower = hive_type.lower().strip()
94
+ base = lower.split("(")[0].strip()
95
+ suffix = hive_type[len(base):].strip() # preserve original params, e.g. decimal(10,2)
96
+ return _HIVE_TYPE_MAP.get(base, base.upper()) + suffix
97
+
98
+
99
def _connect(host: str, port: int) -> hive.Connection:
    """Open a HiveServer2 connection as user ``hadoop`` with ``auth="NONE"``."""
    # ← SUBSTITUTE: update username/auth if your cluster requires Kerberos or LDAP
    connection_options = {
        "host": host,
        "port": port,
        "username": "hadoop",
        "auth": "NONE",
    }
    return hive.connect(**connection_options)
102
+
103
+
104
+ def _fetch_rows(cursor, query: str) -> list[tuple]:
105
+ """Execute a query and fetch results in memory-safe chunks."""
106
+ cursor.execute(query)
107
+ rows: list[tuple] = []
108
+ while True:
109
+ chunk = cursor.fetchmany(1000)
110
+ if not chunk:
111
+ break
112
+ rows.extend(chunk)
113
+ return rows
114
+
115
+
116
+ def _parse_describe_formatted(rows: list[tuple]) -> dict:
117
+ """
118
+ Parse DESCRIBE FORMATTED <db>.<table> output into a structured dict:
119
+ columns, row_count, total_size, last_modified, description, created_on
120
+ """
121
+ result: dict = {
122
+ "columns": [],
123
+ "row_count": None,
124
+ "total_size": None,
125
+ "last_modified": None,
126
+ "description": None,
127
+ "created_on": None,
128
+ }
129
+ in_col_info = False
130
+ in_table_info = False
131
+
132
+ for row in rows:
133
+ col_name = (row[0] or "").strip()
134
+ data_type = (row[1] or "").strip()
135
+ comment = (row[2] or "").strip() if len(row) > 2 else ""
136
+
137
+ if col_name.startswith("# col_name"):
138
+ in_col_info = True
139
+ in_table_info = False
140
+ continue
141
+ if col_name.startswith("# Detailed Table Information"):
142
+ in_col_info = False
143
+ in_table_info = True
144
+ continue
145
+ if col_name.startswith("#"):
146
+ in_col_info = False
147
+ continue
148
+
149
+ if in_col_info and col_name and data_type:
150
+ result["columns"].append(
151
+ {
152
+ "name": col_name,
153
+ "type": _normalize_hive_type(data_type),
154
+ "description": comment or None,
155
+ }
156
+ )
157
+
158
+ if in_table_info:
159
+ # Table Parameters rows have an empty col_name; key is in data_type, value in comment
160
+ param_key = data_type.strip() if not col_name else col_name.strip().rstrip(":")
161
+ param_val = (comment.strip() if not col_name else data_type.strip()) or ""
162
+
163
+ if re.search(r"numRows", param_key, re.IGNORECASE):
164
+ try:
165
+ result["row_count"] = int(param_val)
166
+ except (ValueError, TypeError):
167
+ pass
168
+ elif re.search(r"totalSize", param_key, re.IGNORECASE):
169
+ try:
170
+ result["total_size"] = int(param_val)
171
+ except (ValueError, TypeError):
172
+ pass
173
+ elif re.search(r"last_modified_time", param_key, re.IGNORECASE):
174
+ try:
175
+ result["last_modified"] = datetime.fromtimestamp(
176
+ int(param_val), tz=timezone.utc
177
+ ).isoformat()
178
+ except (ValueError, TypeError):
179
+ pass
180
+ elif re.search(r"^CreateTime", param_key):
181
+ # e.g. "Wed Mar 18 20:15:40 UTC 2026"
182
+ try:
183
+ result["created_on"] = datetime.strptime(
184
+ param_val, "%a %b %d %H:%M:%S %Z %Y"
185
+ ).replace(tzinfo=timezone.utc).isoformat()
186
+ except (ValueError, TypeError):
187
+ pass
188
+ elif param_key == "comment" and not result["description"] and param_val:
189
+ result["description"] = param_val
190
+
191
+ return result
192
+
193
+
194
def collect(
    hive_host: str,
    hive_port: int = 10000,
) -> dict:
    """
    Connect to HiveServer2, discover all databases and tables, and return a
    manifest dict with collected asset metadata.

    Tables whose name starts with one of ``_INTERNAL_TABLE_PREFIXES`` are
    skipped, as are tables whose DESCRIBE FORMATTED fails (a warning is
    printed). The cursor and connection are closed even if discovery raises.

    Args:
        hive_host: HiveServer2 hostname.
        hive_port: HiveServer2 port (default 10000).

    Returns:
        Manifest dict with keys: resource_type, collected_at, assets.
    """
    _check_available_memory()
    print(f"Connecting to HiveServer2 at {hive_host}:{hive_port} ...")
    conn = _connect(hive_host, hive_port)
    assets: list[dict] = []

    # try/finally so a failing query (e.g. SHOW TABLES on a database the user
    # cannot read) no longer leaks the cursor and the connection.
    try:
        cursor = conn.cursor()
        try:
            print("Collecting table metadata ...")
            databases = [row[0] for row in _fetch_rows(cursor, "SHOW DATABASES")]
            print(f" Found databases: {databases}")

            for db in databases:
                # ← SUBSTITUTE: add any system databases you want to skip
                if db in ("information_schema",):
                    continue

                tables = _fetch_rows(cursor, f"SHOW TABLES IN {db}")
                table_names = [row[0] for row in tables]
                print(f" {db}: {len(table_names)} table(s)")

                for table in table_names:
                    if table.startswith(_INTERNAL_TABLE_PREFIXES):
                        continue

                    # Best effort per table: one failing DESCRIBE must not
                    # abort the whole collection run.
                    try:
                        desc_rows = _fetch_rows(cursor, f"DESCRIBE FORMATTED {db}.{table}")
                    except Exception as exc:
                        print(f" WARNING: could not describe {db}.{table}: {exc}")
                        continue

                    info = _parse_describe_formatted(desc_rows)

                    # Missing, zero and negative statistics are all reported as None.
                    row_count = info["row_count"] if info["row_count"] and info["row_count"] > 0 else None
                    byte_count = info["total_size"] if info["total_size"] and info["total_size"] > 0 else None

                    assets.append(
                        {
                            "database": db,
                            "schema": db,
                            "name": table,
                            "description": info["description"],
                            "created_on": info["created_on"],
                            "row_count": row_count,
                            "byte_count": byte_count,
                            "last_modified": info["last_modified"],
                            "fields": [
                                {"name": col["name"], "type": col["type"], "description": col["description"]}
                                for col in info["columns"]
                            ],
                        }
                    )
                    print(
                        f" + {db}.{table} ({len(info['columns'])} columns, "
                        f"desc={info['description']!r}, created={info['created_on']})"
                    )
        finally:
            cursor.close()
    finally:
        conn.close()

    print(f"\nCollected {len(assets)} table(s).")

    manifest = {
        "resource_type": RESOURCE_TYPE,
        "collected_at": datetime.now(tz=timezone.utc).isoformat(),
        "assets": assets,
    }
    return manifest
274
+
275
+
276
def main() -> None:
    """CLI entry point: collect Hive table metadata and write it as JSON.

    ``--hive-host`` falls back to the ``HIVE_HOST`` environment variable and
    ``--hive-port`` to ``HIVE_PORT`` (10000 when unset or empty). The run
    aborts through ``parser.error`` when no host is available.
    """
    parser = argparse.ArgumentParser(
        description="Collect Hive table metadata and write a JSON manifest",
    )
    parser.add_argument(
        "--hive-host",
        default=os.environ.get("HIVE_HOST"),
        help="HiveServer2 hostname (env: HIVE_HOST)",  # ← SUBSTITUTE: your EMR master DNS or Hive host
    )
    parser.add_argument(
        "--hive-port",
        type=int,
        # The module docstring documents HIVE_PORT, which was previously
        # ignored. The default stays a string so argparse applies type=int to
        # it and reports a bad value as a normal usage error.
        default=os.environ.get("HIVE_PORT") or "10000",
        help="HiveServer2 port (env: HIVE_PORT, default: 10000)",  # ← SUBSTITUTE if your cluster uses a non-standard port
    )
    parser.add_argument(
        "--output-file",
        default="metadata_output.json",
        help="Path to write the output manifest (default: metadata_output.json)",
    )
    args = parser.parse_args()

    if not args.hive_host:
        parser.error("--hive-host is required (or set HIVE_HOST)")

    manifest = collect(
        hive_host=args.hive_host,
        hive_port=args.hive_port,
    )

    with open(args.output_file, "w") as fh:
        json.dump(manifest, fh, indent=2)
    print(f"Asset manifest written to {args.output_file}")
    print("Done.")


if __name__ == "__main__":
    main()