opencode-skills-collection 2.0.0-beta.3 → 2.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91) hide show
  1. package/README.md +1 -0
  2. package/bundled-skills/.antigravity-install-manifest.json +6 -1
  3. package/bundled-skills/docs/integrations/jetski-cortex.md +3 -3
  4. package/bundled-skills/docs/integrations/jetski-gemini-loader/README.md +1 -1
  5. package/bundled-skills/docs/maintainers/repo-growth-seo.md +3 -3
  6. package/bundled-skills/docs/maintainers/skills-update-guide.md +1 -1
  7. package/bundled-skills/docs/users/bundles.md +1 -1
  8. package/bundled-skills/docs/users/claude-code-skills.md +1 -1
  9. package/bundled-skills/docs/users/gemini-cli-skills.md +1 -1
  10. package/bundled-skills/docs/users/getting-started.md +1 -1
  11. package/bundled-skills/docs/users/kiro-integration.md +1 -1
  12. package/bundled-skills/docs/users/usage.md +4 -4
  13. package/bundled-skills/docs/users/visual-guide.md +4 -4
  14. package/bundled-skills/manage-skills/SKILL.md +187 -0
  15. package/bundled-skills/monte-carlo-monitor-creation/SKILL.md +222 -0
  16. package/bundled-skills/monte-carlo-monitor-creation/references/comparison-monitor.md +426 -0
  17. package/bundled-skills/monte-carlo-monitor-creation/references/custom-sql-monitor.md +207 -0
  18. package/bundled-skills/monte-carlo-monitor-creation/references/metric-monitor.md +292 -0
  19. package/bundled-skills/monte-carlo-monitor-creation/references/table-monitor.md +231 -0
  20. package/bundled-skills/monte-carlo-monitor-creation/references/validation-monitor.md +404 -0
  21. package/bundled-skills/monte-carlo-prevent/SKILL.md +252 -0
  22. package/bundled-skills/monte-carlo-prevent/references/TROUBLESHOOTING.md +23 -0
  23. package/bundled-skills/monte-carlo-prevent/references/parameters.md +32 -0
  24. package/bundled-skills/monte-carlo-prevent/references/workflows.md +478 -0
  25. package/bundled-skills/monte-carlo-push-ingestion/SKILL.md +363 -0
  26. package/bundled-skills/monte-carlo-push-ingestion/references/anomaly-detection.md +87 -0
  27. package/bundled-skills/monte-carlo-push-ingestion/references/custom-lineage.md +203 -0
  28. package/bundled-skills/monte-carlo-push-ingestion/references/direct-http-api.md +207 -0
  29. package/bundled-skills/monte-carlo-push-ingestion/references/prerequisites.md +150 -0
  30. package/bundled-skills/monte-carlo-push-ingestion/references/push-lineage.md +160 -0
  31. package/bundled-skills/monte-carlo-push-ingestion/references/push-metadata.md +158 -0
  32. package/bundled-skills/monte-carlo-push-ingestion/references/push-query-logs.md +219 -0
  33. package/bundled-skills/monte-carlo-push-ingestion/references/validation.md +257 -0
  34. package/bundled-skills/monte-carlo-push-ingestion/scripts/sample_verify.py +357 -0
  35. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_and_push_lineage.py +70 -0
  36. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_and_push_metadata.py +65 -0
  37. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_and_push_query_logs.py +70 -0
  38. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_lineage.py +214 -0
  39. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_metadata.py +160 -0
  40. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_query_logs.py +164 -0
  41. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/push_lineage.py +198 -0
  42. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/push_metadata.py +193 -0
  43. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/push_query_logs.py +207 -0
  44. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/collect_and_push_metadata.py +71 -0
  45. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/collect_and_push_query_logs.py +64 -0
  46. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/collect_metadata.py +253 -0
  47. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/collect_query_logs.py +149 -0
  48. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/push_metadata.py +190 -0
  49. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/push_query_logs.py +208 -0
  50. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_and_push_lineage.py +83 -0
  51. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_and_push_metadata.py +77 -0
  52. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_and_push_query_logs.py +83 -0
  53. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_lineage.py +240 -0
  54. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_metadata.py +212 -0
  55. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_query_logs.py +204 -0
  56. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/push_lineage.py +192 -0
  57. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/push_metadata.py +178 -0
  58. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/push_query_logs.py +200 -0
  59. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_and_push_lineage.py +119 -0
  60. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_and_push_metadata.py +119 -0
  61. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_and_push_query_logs.py +117 -0
  62. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_lineage.py +265 -0
  63. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_metadata.py +313 -0
  64. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_query_logs.py +284 -0
  65. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/push_lineage.py +309 -0
  66. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/push_metadata.py +245 -0
  67. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/push_query_logs.py +255 -0
  68. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_and_push_lineage.py +78 -0
  69. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_and_push_metadata.py +80 -0
  70. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_and_push_query_logs.py +88 -0
  71. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_lineage.py +235 -0
  72. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_metadata.py +219 -0
  73. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_query_logs.py +239 -0
  74. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/push_lineage.py +178 -0
  75. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/push_metadata.py +178 -0
  76. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/push_query_logs.py +196 -0
  77. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_and_push_lineage.py +154 -0
  78. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_and_push_metadata.py +137 -0
  79. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_and_push_query_logs.py +137 -0
  80. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_lineage.py +349 -0
  81. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_metadata.py +329 -0
  82. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_query_logs.py +254 -0
  83. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/push_lineage.py +307 -0
  84. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/push_metadata.py +228 -0
  85. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/push_query_logs.py +248 -0
  86. package/bundled-skills/monte-carlo-push-ingestion/scripts/test_template_sdk_usage.py +340 -0
  87. package/bundled-skills/monte-carlo-validation-notebook/SKILL.md +685 -0
  88. package/bundled-skills/monte-carlo-validation-notebook/scripts/generate_notebook_url.py +141 -0
  89. package/bundled-skills/monte-carlo-validation-notebook/scripts/resolve_dbt_schema.py +161 -0
  90. package/package.json +1 -1
  91. package/skills_index.json +503 -61
@@ -0,0 +1,284 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Collect Hive query logs from a local HiveServer2 log file — collection only.
4
+
5
+ Parses a plain-text HiveServer2 log for "Executing/Starting command" entries
6
+ to extract query text, query ID, start time and end time. Optionally reads
7
+ per-query operation logs to populate ``returned_rows`` from SelectOperator
8
+ ``RECORDS_OUT`` counters. Deduplicates entries by query ID.
9
+
10
+ Can be run standalone via CLI or imported (use the ``collect()`` function).
11
+
12
+ Substitution points
13
+ -------------------
14
+ - --log-file path to local HiveServer2 log (default: /tmp/root/hive.log)
15
+ - --op-logs-dir optional directory of per-query <queryId>.log files
16
+
17
+ Prerequisites
18
+ -------------
19
+ pip install python-dateutil python-dotenv
20
+
21
+ Usage
22
+ -----
23
+ python collect_query_logs.py \\
24
+ --log-file /tmp/root/hive.log \\
25
+ [--op-logs-dir /var/log/hive/operation_logs] \\
26
+ --output-file query_logs_output.json
27
+ """
28
+
29
+ from __future__ import annotations
30
+
31
+ import argparse
32
+ import json
33
+ import re
34
+ from datetime import datetime, timezone
35
+ from io import StringIO
36
+ from pathlib import Path
37
+
38
+ from dateutil.parser import isoparse
39
+
40
+ # NOTE: the normalizer requires "hive-s3" — do not change to "hive" or "data-lake"
41
+ LOG_TYPE = "hive-s3"
42
+
43
+ # Matches the start of a new query block in the Hive log
44
+ _COMMAND_START_RE = re.compile(
45
+ r"(Executing|Starting)\s+command\(queryId=(?P<query_id>\S*)\):\s+(?P<command>.*)$"
46
+ )
47
+
48
+ # Extracts returned row counts from per-query Hive operation logs
49
+ _RECORDS_OUT_RE = re.compile(r"RECORDS_OUT_OPERATOR_SEL_\d+:(\d+)")
50
+
51
+
52
def _parse_log_entries(log_text: str) -> list[dict]:
    """
    Extract raw query records from the text of a HiveServer2 log.

    A line is "timestamped" when its first whitespace-separated token parses
    as an ISO-8601 timestamp; timestamps without a timezone are treated as
    UTC. A timestamped line matching ``_COMMAND_START_RE`` opens a new record.
    Non-timestamped lines that follow are appended to the open record's query
    text (multi-line SQL). The next timestamped line of any kind closes the
    open record and supplies its ``end_time``. Whitespace-only lines are
    skipped entirely, including inside a multi-line query.

    Args:
        log_text: Full contents of the log file.

    Returns:
        One dict per record, in log order, with keys ``query_id`` (str),
        ``start_time`` (datetime), ``end_time`` (datetime) and ``query`` (str).
    """

    def _entry(qid: str, began: datetime, ended: datetime, sql: str) -> dict:
        # Single place that defines the shape of a raw record.
        return {
            "query_id": qid,
            "start_time": began,
            "end_time": ended,
            "query": sql,
        }

    parsed: list[dict] = []
    current_query = ""
    current_id = ""
    started_at: datetime | None = None
    latest_stamp: datetime | None = None

    for raw_line in StringIO(log_text):
        tokens = raw_line.split(maxsplit=1)
        if not tokens:
            continue

        try:
            stamp = isoparse(tokens[0])
        except ValueError:
            # No leading timestamp: treat as a continuation of the open query,
            # or ignore the line when no query is open.
            if current_query:
                current_query += "\n" + raw_line.rstrip()
            continue
        if not stamp.tzinfo:
            stamp = stamp.replace(tzinfo=timezone.utc)

        command_match = _COMMAND_START_RE.search(raw_line)
        has_open_entry = bool(current_query and started_at)

        if has_open_entry:
            # Any timestamped line (command or not) ends the record in progress.
            parsed.append(_entry(current_id, started_at, stamp, current_query))

        if command_match is not None:
            current_id = command_match.group("query_id")
            started_at = stamp
            current_query = command_match.group("command").strip()
        elif has_open_entry:
            current_query, current_id, started_at = "", "", None

        latest_stamp = stamp

    # A record still open at end of file ends at the last timestamp seen.
    if current_query and started_at:
        parsed.append(
            _entry(current_id, started_at, latest_stamp or started_at, current_query)
        )

    return parsed
125
+
126
+
127
def _load_returned_rows(op_logs_dir: str) -> dict[str, int]:
    """
    Build a ``query_id -> returned row count`` map from per-query operation logs.

    Every ``*.log`` file directly inside ``op_logs_dir`` is read; the file name
    without its extension is used as the query ID. The count is the number
    captured by the last ``_RECORDS_OUT_RE`` match in the file. Files that
    cannot be read, or that contain no match, contribute no entry.

    Args:
        op_logs_dir: Directory holding ``<queryId>.log`` files.

    Returns:
        Mapping of query ID to the last row count found in its log.
    """
    counts: dict[str, int] = {}
    for path in Path(op_logs_dir).glob("*.log"):
        try:
            contents = path.read_text(errors="replace")
        except OSError:
            # Unreadable operation log: leave this query without a row count.
            continue
        captured = _RECORDS_OUT_RE.findall(contents)
        if captured:
            counts[path.stem] = int(captured[-1])
    return counts
148
+
149
+
150
def _build_query_log_entries(
    raw_entries: list[dict],
    rows_by_id: dict[str, int] | None = None,
) -> list[dict]:
    """
    Turn raw parsed records into JSON-serializable query log dicts.

    Records sharing a non-empty ``query_id`` are collapsed to the first one
    seen; records with an empty ``query_id`` are always kept. Timestamps are
    converted to ISO-8601 strings and an empty ``query_id`` becomes ``None``.

    Args:
        raw_entries: Dicts with ``query_id``, ``start_time``, ``end_time``
            (datetimes) and ``query`` keys.
        rows_by_id: Optional ``query_id -> row count`` map used to fill
            ``returned_rows``; entries without a match get ``None``.

    Returns:
        Plain dicts with keys ``query_id``, ``start_time``, ``end_time``,
        ``query_text``, ``user`` and ``returned_rows``.
    """
    emitted_ids: set[str] = set()
    result: list[dict] = []

    for raw in raw_entries:
        query_id = raw["query_id"]
        if query_id:
            if query_id in emitted_ids:
                continue
            emitted_ids.add(query_id)

        # Row counts can only be looked up for records that carry an ID.
        if rows_by_id and query_id:
            row_count: int | None = rows_by_id.get(query_id)
        else:
            row_count = None

        result.append(
            {
                "query_id": query_id or None,
                "start_time": raw["start_time"].isoformat(),
                "end_time": raw["end_time"].isoformat(),
                "query_text": raw["query"],
                "user": "hadoop",  # ← SUBSTITUTE: set the user appropriate for your cluster
                "returned_rows": row_count,
            }
        )

    return result
182
+
183
+
184
def collect(
    log_file: str,
    op_logs_dir: str | None = None,
) -> dict:
    """
    Read a HiveServer2 log file and build the query-log manifest.

    The file is read in full (undecodable bytes are replaced), parsed with
    ``_parse_log_entries`` and deduplicated with ``_build_query_log_entries``.
    Progress messages are printed to stdout.

    Args:
        log_file: Path to a local HiveServer2 log file.
        op_logs_dir: Optional directory of per-query operation logs
            (<queryId>.log). When given, ``returned_rows`` is filled from the
            counts found by ``_load_returned_rows``.

    Returns:
        Manifest dict with keys ``log_type``, ``collected_at``,
        ``entry_count``, ``window_start``, ``window_end`` and ``queries``.
        ``entry_count`` counts deduplicated queries, while the window is
        computed over every parsed record. With no records, the window values
        are ``None`` and ``queries`` is empty.
    """
    print(f"Reading Hive log file: {log_file} ...")
    with open(log_file, errors="replace") as handle:
        contents = handle.read()

    parsed = _parse_log_entries(contents)
    print(f" Parsed {len(parsed)} query log entry/entries.")

    if not parsed:
        print("No query log entries found.")
        return {
            "log_type": LOG_TYPE,
            "collected_at": datetime.now(tz=timezone.utc).isoformat(),
            "entry_count": 0,
            "window_start": None,
            "window_end": None,
            "queries": [],
        }

    row_counts: dict[str, int] | None = None
    if op_logs_dir:
        row_counts = _load_returned_rows(op_logs_dir)
        print(f" Loaded row counts for {len(row_counts)} query/queries from {op_logs_dir}")

    deduped = _build_query_log_entries(parsed, row_counts)

    # The manifest stores the SQL under "query", not the builder's "query_text".
    manifest_queries = []
    for item in deduped:
        manifest_queries.append(
            {
                "query_id": item["query_id"],
                "start_time": item["start_time"],
                "end_time": item["end_time"],
                "query": item["query_text"],
                "user": item["user"],
                "returned_rows": item["returned_rows"],
            }
        )

    # ``parsed`` is known to be non-empty here, so min()/max() cannot fail.
    earliest = min(record["start_time"] for record in parsed)
    latest = max(record["end_time"] for record in parsed)

    return {
        "log_type": LOG_TYPE,
        "collected_at": datetime.now(tz=timezone.utc).isoformat(),
        "entry_count": len(deduped),
        "window_start": earliest.isoformat(),
        "window_end": latest.isoformat(),
        "queries": manifest_queries,
    }
248
+
249
+
250
def main() -> None:
    """
    CLI entry point: parse arguments, collect query logs, write the manifest.

    Reads ``--log-file`` (and optionally ``--op-logs-dir``), then writes the
    manifest returned by ``collect()`` as indented JSON to ``--output-file``.
    """
    cli = argparse.ArgumentParser(
        description="Collect Hive query logs from a local log file and write a JSON manifest",
    )
    cli.add_argument(
        "--log-file",
        default="/tmp/root/hive.log",
        help="Path to local HiveServer2 log file (default: /tmp/root/hive.log)",  # ← SUBSTITUTE: your log path
    )
    cli.add_argument(
        "--op-logs-dir",
        default=None,
        help=(
            "Directory containing per-query Hive operation logs (<queryId>.log). "
            "When provided, returned_rows is populated from SelectOperator RECORDS_OUT counts."
        ),
        # ← SUBSTITUTE: e.g. /var/log/hive/operation_logs or wherever Hive writes op logs
    )
    cli.add_argument(
        "--output-file",
        default="query_logs_output.json",
        help="Path to write the output manifest (default: query_logs_output.json)",
    )
    options = cli.parse_args()

    result = collect(log_file=options.log_file, op_logs_dir=options.op_logs_dir)

    with open(options.output_file, "w") as out:
        json.dump(result, out, indent=2)

    print(f"Query log manifest written to {options.output_file}")
    print("Done.")
281
+
282
+
283
+ if __name__ == "__main__":
284
+ main()
@@ -0,0 +1,309 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Push a collected Hive lineage manifest to Monte Carlo — push only.
4
+
5
+ Reads a JSON manifest produced by ``collect_lineage.py``, builds LineageEvent
6
+ objects (table-level or column-level), and calls ``send_lineage`` in batches.
7
+ The manifest is updated in-place with ``resource_uuid`` and ``invocation_id``
8
+ after a successful push.
9
+
10
+ Can be run standalone via CLI or imported (use the ``push()`` function).
11
+
12
+ Substitution points
13
+ -------------------
14
+ - MCD_INGEST_ID (env) / --key-id (CLI) : Monte Carlo ingestion key ID
15
+ - MCD_INGEST_TOKEN (env) / --key-token (CLI) : Monte Carlo ingestion key token
16
+ - MCD_RESOURCE_UUID (env) / --resource-uuid (CLI) : MC resource UUID for this connection
17
+
18
+ Prerequisites
19
+ -------------
20
+ pip install pycarlo python-dotenv
21
+
22
+ Usage (table-level):
23
+ python push_lineage.py \\
24
+ --key-id <MCD_INGEST_ID> \\
25
+ --key-token <MCD_INGEST_TOKEN> \\
26
+ --resource-uuid <MCD_RESOURCE_UUID> \\
27
+ --input-file lineage_output.json
28
+
29
+ Usage (column-level):
30
+ python push_lineage.py ... --column-lineage
31
+ """
32
+
33
+ import argparse
34
+ import json
35
+ import os
36
+ from concurrent.futures import ThreadPoolExecutor, as_completed
37
+
38
+ from pycarlo.core import Client, Session
39
+ from pycarlo.features.ingestion import IngestionService
40
+ from pycarlo.features.ingestion.models import (
41
+ ColumnLineageField,
42
+ ColumnLineageSourceField,
43
+ LineageAssetRef,
44
+ LineageEvent,
45
+ )
46
+
47
+ # ← SUBSTITUTE: set RESOURCE_TYPE to match your Monte Carlo connection type
48
+ RESOURCE_TYPE = "data-lake"
49
+
50
+ # ← SUBSTITUTE: default batch size for lineage push (events per request)
51
+ DEFAULT_BATCH_SIZE = 500
52
+
53
+ # ← SUBSTITUTE: HTTP timeout for MC ingestion requests (seconds)
54
+ DEFAULT_TIMEOUT_SECONDS = 120
55
+
56
+
57
def _build_table_lineage(edges_data: list[dict]) -> list[LineageEvent]:
    """
    Convert manifest edges into table-level ``LineageEvent`` objects.

    Edges whose ``sources`` list is missing or empty are skipped. For every
    asset the ``database`` value is passed as both ``database`` and ``schema``.

    Args:
        edges_data: Edge dicts, each with a ``destination`` dict and a
            ``sources`` list of dicts; each of those has ``database`` and
            ``table`` keys.

    Returns:
        One ``LineageEvent`` per edge that has at least one source.
    """

    def _table_ref(node: dict) -> LineageAssetRef:
        # The same database name fills both the database and schema slots.
        return LineageAssetRef(
            type="TABLE",
            name=node["table"],
            database=node["database"],
            schema=node["database"],
        )

    return [
        LineageEvent(
            destination=_table_ref(edge["destination"]),
            sources=[_table_ref(upstream) for upstream in edge["sources"]],
        )
        for edge in edges_data
        if edge.get("sources")
    ]
85
+
86
+
87
def _build_column_lineage(edges_data: list[dict]) -> list[LineageEvent]:
    """
    Convert manifest edges into column-level ``LineageEvent`` objects.

    Edges whose ``sources`` list is missing or empty are skipped. Every asset
    gets an ``asset_id`` of the form ``<database>__<table>``, and the
    ``database`` value is passed as both ``database`` and ``schema``. Each
    entry in the edge's ``col_mappings`` (``dest_col``, ``src_table``,
    ``src_col``) adds one source field to the destination column. The source
    database is that of the first source whose table name matches
    ``src_table``, falling back to the destination database when none matches.

    Args:
        edges_data: Edge dicts with ``destination``, ``sources`` and an
            optional ``col_mappings`` list.

    Returns:
        One ``LineageEvent`` per edge that has at least one source; ``fields``
        is ``None`` when the edge has no column mappings.
    """

    def _asset_id(database: str, table: str) -> str:
        return f"{database}__{table}"

    lineage_events = []
    for edge in edges_data:
        upstream = edge.get("sources", [])
        if not upstream:
            continue

        target = edge["destination"]
        target_id = _asset_id(target["database"], target["table"])

        ids_by_source: dict[tuple, str] = {}
        first_db_by_table: dict = {}
        for node in upstream:
            ids_by_source[(node["database"], node["table"])] = _asset_id(
                node["database"], node["table"]
            )
            # Keep the first database seen for a table name.
            first_db_by_table.setdefault(node["table"], node["database"])

        fields_by_column: dict[str, ColumnLineageField] = {}
        for mapping in edge.get("col_mappings", []):
            dest_col = mapping["dest_col"]
            src_table = mapping["src_table"]
            src_col = mapping["src_col"]

            src_db = first_db_by_table.get(src_table, target["database"])
            column = fields_by_column.get(dest_col)
            if column is None:
                column = ColumnLineageField(name=dest_col, source_fields=[])
                fields_by_column[dest_col] = column
            column.source_fields.append(
                ColumnLineageSourceField(
                    asset_id=_asset_id(src_db, src_table),
                    field_name=src_col,
                )
            )

        source_refs = [
            LineageAssetRef(
                type="TABLE",
                name=node["table"],
                database=node["database"],
                schema=node["database"],
                asset_id=ids_by_source[(node["database"], node["table"])],
            )
            for node in upstream
        ]
        destination_ref = LineageAssetRef(
            type="TABLE",
            name=target["table"],
            database=target["database"],
            schema=target["database"],
            asset_id=target_id,
        )
        column_fields = list(fields_by_column.values()) if fields_by_column else None

        lineage_events.append(
            LineageEvent(
                destination=destination_ref,
                sources=source_refs,
                fields=column_fields,
            )
        )
    return lineage_events
142
+
143
+
144
def push(
    manifest: dict,
    resource_uuid: str,
    key_id: str,
    key_token: str,
    column_lineage: bool = False,
    batch_size: int = DEFAULT_BATCH_SIZE,
    timeout_seconds: int = DEFAULT_TIMEOUT_SECONDS,
) -> str | None:
    """
    Push collected lineage to Monte Carlo and update the manifest in-place.

    Events are built from ``manifest["edges"]`` and sent in batches of
    ``batch_size`` by up to four worker threads. After a successful push the
    manifest gains ``resource_uuid`` and ``invocation_id``; when more than one
    batch returned an invocation ID, the per-batch list is stored under
    ``invocation_ids`` (index = batch order, ``None`` where none was returned).

    Args:
        manifest: Dict loaded from a ``collect_lineage.py`` output file.
        resource_uuid: MC resource UUID for this Hive connection.
        key_id: MC ingestion key ID.
        key_token: MC ingestion key token.
        column_lineage: When True, push column-level lineage; otherwise table-level.
        batch_size: Events per request (default 500). Must be at least 1.
        timeout_seconds: HTTP timeout per request (default 120).
            NOTE(review): this value is accepted but is not passed to any call
            in this function — confirm how the timeout should be applied.

    Returns:
        The invocation ID of the highest-numbered batch that returned one, or
        None when no batch returned an ID or there was nothing to push.

    Raises:
        ValueError: If ``batch_size`` is less than 1 and there are events to push.
        Exception: Any error raised while pushing a batch is printed and
            re-raised; the manifest is not updated in that case.
    """
    resource_type = manifest.get("resource_type", RESOURCE_TYPE)
    edges_data = manifest.get("edges", [])

    if column_lineage:
        events = _build_column_lineage(edges_data)
        label = "column-level"
    else:
        events = _build_table_lineage(edges_data)
        label = "table-level"

    print(f"Loaded {len(events)} {label} lineage event(s) from manifest")

    if not events:
        print("No lineage events to push.")
        manifest["resource_uuid"] = resource_uuid
        manifest["invocation_id"] = None
        return None

    # Fail with a clear message instead of an opaque error from range() or
    # ThreadPoolExecutor when the batch size is zero or negative.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    batch_list = [
        events[start : start + batch_size]
        for start in range(0, len(events), batch_size)
    ]
    total_batches = len(batch_list)

    def _push_batch(batch: list, batch_num: int) -> str | None:
        """Push one batch; each call builds its own Session and Client."""
        print(f" Pushing batch {batch_num}/{total_batches} ({len(batch)} events) ...")
        client = Client(session=Session(mcd_id=key_id, mcd_token=key_token, scope="Ingestion"))
        service = IngestionService(mc_client=client)
        result = service.send_lineage(
            resource_uuid=resource_uuid,
            resource_type=resource_type,
            events=batch,
        )
        invocation_id = service.extract_invocation_id(result)
        if invocation_id:
            print(f" Batch {batch_num}: invocation_id={invocation_id}")
        return invocation_id

    # Push batches in parallel; results are stored by batch index so the list
    # order does not depend on completion order.
    max_workers = min(4, total_batches)
    invocation_ids: list[str | None] = [None] * total_batches

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        futures = {
            pool.submit(_push_batch, batch, i + 1): i
            for i, batch in enumerate(batch_list)
        }
        for future in as_completed(futures):
            idx = futures[future]
            try:
                invocation_ids[idx] = future.result()
            except Exception as exc:
                print(f" ERROR pushing batch {idx + 1}: {exc}")
                raise

    print(f" All {total_batches} batches pushed ({max_workers} workers)")

    # Use the last batch that actually returned an ID, so a single returned ID
    # is never dropped just because the final batch returned none.
    returned_ids = [inv for inv in invocation_ids if inv]
    last_invocation_id = returned_ids[-1] if returned_ids else None

    manifest["resource_uuid"] = resource_uuid
    manifest["invocation_id"] = last_invocation_id
    if len(returned_ids) > 1:
        manifest["invocation_ids"] = invocation_ids
    else:
        # Drop a stale list left over from a previous push of this manifest.
        manifest.pop("invocation_ids", None)

    return last_invocation_id
237
+
238
+
239
def main() -> None:
    """
    CLI entry point: load a lineage manifest, push it, and rewrite it in place.

    Credentials and the resource UUID default to the ``MCD_INGEST_ID``,
    ``MCD_INGEST_TOKEN`` and ``MCD_RESOURCE_UUID`` environment variables; the
    parser exits with an error when any of them is missing. After ``push()``
    returns, the updated manifest is written back to ``--input-file``.
    """
    cli = argparse.ArgumentParser(
        description="Push a collected Hive lineage manifest to Monte Carlo",
    )
    cli.add_argument(
        "--key-id",
        default=os.environ.get("MCD_INGEST_ID"),
        help="Monte Carlo ingestion key ID (env: MCD_INGEST_ID)",
    )
    cli.add_argument(
        "--key-token",
        default=os.environ.get("MCD_INGEST_TOKEN"),
        help="Monte Carlo ingestion key token (env: MCD_INGEST_TOKEN)",
    )
    cli.add_argument(
        "--resource-uuid",
        default=os.environ.get("MCD_RESOURCE_UUID"),
        help="Monte Carlo resource UUID for this Hive connection (env: MCD_RESOURCE_UUID)",
    )
    cli.add_argument(
        "--input-file",
        default="lineage_output.json",
        help="Path to the JSON manifest written by collect_lineage.py (default: lineage_output.json)",
    )
    cli.add_argument(
        "--column-lineage",
        action="store_true",
        help="Push column-level lineage instead of table-level",
    )
    cli.add_argument(
        "--batch-size",
        type=int,
        default=DEFAULT_BATCH_SIZE,
        metavar="N",
        help=f"Max events per POST (default: {DEFAULT_BATCH_SIZE})",
    )
    cli.add_argument(
        "--timeout",
        type=int,
        default=DEFAULT_TIMEOUT_SECONDS,
        metavar="SEC",
        help=f"HTTP timeout per request in seconds (default: {DEFAULT_TIMEOUT_SECONDS})",
    )
    options = cli.parse_args()

    # Credentials may come from the environment, so they are checked here
    # rather than marked as required on the parser.
    if not (options.key_id and options.key_token):
        cli.error("--key-id and --key-token are required (or set MCD_INGEST_ID / MCD_INGEST_TOKEN)")
    if not options.resource_uuid:
        cli.error("--resource-uuid is required (or set MCD_RESOURCE_UUID)")

    manifest_path = options.input_file
    with open(manifest_path) as source:
        manifest = json.load(source)

    push(
        manifest=manifest,
        resource_uuid=options.resource_uuid,
        key_id=options.key_id,
        key_token=options.key_token,
        column_lineage=options.column_lineage,
        batch_size=options.batch_size,
        timeout_seconds=options.timeout,
    )

    # push() mutated the manifest; persist it over the input file.
    with open(manifest_path, "w") as sink:
        json.dump(manifest, sink, indent=2)

    print(f"Manifest updated in-place: {options.input_file}")
    print("Done.")
306
+
307
+
308
+ if __name__ == "__main__":
309
+ main()