opencode-skills-collection 2.0.0 → 2.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90) hide show
  1. package/bundled-skills/.antigravity-install-manifest.json +6 -1
  2. package/bundled-skills/docs/integrations/jetski-cortex.md +3 -3
  3. package/bundled-skills/docs/integrations/jetski-gemini-loader/README.md +1 -1
  4. package/bundled-skills/docs/maintainers/repo-growth-seo.md +3 -3
  5. package/bundled-skills/docs/maintainers/skills-update-guide.md +1 -1
  6. package/bundled-skills/docs/users/bundles.md +1 -1
  7. package/bundled-skills/docs/users/claude-code-skills.md +1 -1
  8. package/bundled-skills/docs/users/gemini-cli-skills.md +1 -1
  9. package/bundled-skills/docs/users/getting-started.md +1 -1
  10. package/bundled-skills/docs/users/kiro-integration.md +1 -1
  11. package/bundled-skills/docs/users/usage.md +4 -4
  12. package/bundled-skills/docs/users/visual-guide.md +4 -4
  13. package/bundled-skills/manage-skills/SKILL.md +187 -0
  14. package/bundled-skills/monte-carlo-monitor-creation/SKILL.md +222 -0
  15. package/bundled-skills/monte-carlo-monitor-creation/references/comparison-monitor.md +426 -0
  16. package/bundled-skills/monte-carlo-monitor-creation/references/custom-sql-monitor.md +207 -0
  17. package/bundled-skills/monte-carlo-monitor-creation/references/metric-monitor.md +292 -0
  18. package/bundled-skills/monte-carlo-monitor-creation/references/table-monitor.md +231 -0
  19. package/bundled-skills/monte-carlo-monitor-creation/references/validation-monitor.md +404 -0
  20. package/bundled-skills/monte-carlo-prevent/SKILL.md +252 -0
  21. package/bundled-skills/monte-carlo-prevent/references/TROUBLESHOOTING.md +23 -0
  22. package/bundled-skills/monte-carlo-prevent/references/parameters.md +32 -0
  23. package/bundled-skills/monte-carlo-prevent/references/workflows.md +478 -0
  24. package/bundled-skills/monte-carlo-push-ingestion/SKILL.md +363 -0
  25. package/bundled-skills/monte-carlo-push-ingestion/references/anomaly-detection.md +87 -0
  26. package/bundled-skills/monte-carlo-push-ingestion/references/custom-lineage.md +203 -0
  27. package/bundled-skills/monte-carlo-push-ingestion/references/direct-http-api.md +207 -0
  28. package/bundled-skills/monte-carlo-push-ingestion/references/prerequisites.md +150 -0
  29. package/bundled-skills/monte-carlo-push-ingestion/references/push-lineage.md +160 -0
  30. package/bundled-skills/monte-carlo-push-ingestion/references/push-metadata.md +158 -0
  31. package/bundled-skills/monte-carlo-push-ingestion/references/push-query-logs.md +219 -0
  32. package/bundled-skills/monte-carlo-push-ingestion/references/validation.md +257 -0
  33. package/bundled-skills/monte-carlo-push-ingestion/scripts/sample_verify.py +357 -0
  34. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_and_push_lineage.py +70 -0
  35. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_and_push_metadata.py +65 -0
  36. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_and_push_query_logs.py +70 -0
  37. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_lineage.py +214 -0
  38. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_metadata.py +160 -0
  39. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_query_logs.py +164 -0
  40. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/push_lineage.py +198 -0
  41. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/push_metadata.py +193 -0
  42. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/push_query_logs.py +207 -0
  43. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/collect_and_push_metadata.py +71 -0
  44. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/collect_and_push_query_logs.py +64 -0
  45. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/collect_metadata.py +253 -0
  46. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/collect_query_logs.py +149 -0
  47. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/push_metadata.py +190 -0
  48. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/push_query_logs.py +208 -0
  49. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_and_push_lineage.py +83 -0
  50. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_and_push_metadata.py +77 -0
  51. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_and_push_query_logs.py +83 -0
  52. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_lineage.py +240 -0
  53. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_metadata.py +212 -0
  54. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_query_logs.py +204 -0
  55. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/push_lineage.py +192 -0
  56. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/push_metadata.py +178 -0
  57. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/push_query_logs.py +200 -0
  58. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_and_push_lineage.py +119 -0
  59. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_and_push_metadata.py +119 -0
  60. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_and_push_query_logs.py +117 -0
  61. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_lineage.py +265 -0
  62. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_metadata.py +313 -0
  63. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_query_logs.py +284 -0
  64. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/push_lineage.py +309 -0
  65. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/push_metadata.py +245 -0
  66. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/push_query_logs.py +255 -0
  67. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_and_push_lineage.py +78 -0
  68. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_and_push_metadata.py +80 -0
  69. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_and_push_query_logs.py +88 -0
  70. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_lineage.py +235 -0
  71. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_metadata.py +219 -0
  72. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_query_logs.py +239 -0
  73. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/push_lineage.py +178 -0
  74. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/push_metadata.py +178 -0
  75. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/push_query_logs.py +196 -0
  76. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_and_push_lineage.py +154 -0
  77. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_and_push_metadata.py +137 -0
  78. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_and_push_query_logs.py +137 -0
  79. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_lineage.py +349 -0
  80. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_metadata.py +329 -0
  81. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_query_logs.py +254 -0
  82. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/push_lineage.py +307 -0
  83. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/push_metadata.py +228 -0
  84. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/push_query_logs.py +248 -0
  85. package/bundled-skills/monte-carlo-push-ingestion/scripts/test_template_sdk_usage.py +340 -0
  86. package/bundled-skills/monte-carlo-validation-notebook/SKILL.md +685 -0
  87. package/bundled-skills/monte-carlo-validation-notebook/scripts/generate_notebook_url.py +141 -0
  88. package/bundled-skills/monte-carlo-validation-notebook/scripts/resolve_dbt_schema.py +161 -0
  89. package/package.json +1 -1
  90. package/skills_index.json +503 -61
@@ -0,0 +1,254 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Collect query logs from Snowflake ACCOUNT_USAGE.QUERY_HISTORY — collection only.
4
+
5
+ Queries a 24-hour window ending 1 hour ago (ACCOUNT_USAGE views have an
6
+ approximate 45-minute ingestion latency, so the last hour is intentionally
7
+ skipped to avoid incomplete data). The collected query logs are written to a
8
+ JSON manifest file.
9
+
10
+ Can be run standalone via CLI or imported (use the ``collect()`` function).
11
+
12
+ Substitution points
13
+ -------------------
14
+ - SNOWFLAKE_ACCOUNT (env) / --account (CLI) : Snowflake account identifier
15
+ - SNOWFLAKE_USER (env) / --user (CLI) : Snowflake username
16
+ - SNOWFLAKE_PASSWORD (env) / --password (CLI) : Snowflake password
17
+ - SNOWFLAKE_WAREHOUSE (env) / --warehouse (CLI) : Snowflake virtual warehouse
18
+
19
+ Prerequisites
20
+ -------------
21
+ pip install snowflake-connector-python
22
+
23
+ Usage
24
+ -----
25
+ python collect_query_logs.py \\
26
+ --account <SNOWFLAKE_ACCOUNT> \\
27
+ --user <SNOWFLAKE_USER> \\
28
+ --password <SNOWFLAKE_PASSWORD> \\
29
+ --warehouse <SNOWFLAKE_WAREHOUSE>
30
+ """
31
+
32
+ import argparse
33
+ import json
34
+ import os
35
+ from datetime import datetime, timezone
36
+
37
+ import snowflake.connector
38
+
39
+ # ← SUBSTITUTE: set LOG_TYPE to match your warehouse type (query logs use log_type, not resource_type)
40
+ LOG_TYPE = "snowflake"
41
+
42
+
43
+ def _check_available_memory(min_gb: float = 2.0) -> None:
44
+ """Warn if available memory is below the threshold."""
45
+ try:
46
+ if hasattr(os, "sysconf"): # Linux / macOS
47
+ page_size = os.sysconf("SC_PAGE_SIZE")
48
+ avail_pages = os.sysconf("SC_AVPHYS_PAGES")
49
+ avail_gb = (page_size * avail_pages) / (1024 ** 3)
50
+ else:
51
+ return # Windows — skip check
52
+ except (ValueError, OSError):
53
+ return
54
+ if avail_gb < min_gb:
55
+ print(
56
+ f"WARNING: Only {avail_gb:.1f} GB of memory available "
57
+ f"(minimum recommended: {min_gb:.1f} GB). "
58
+ f"Consider reducing the lookback window or increasing available memory."
59
+ )
60
+
61
+ # How many hours before NOW the collection window starts (the window ends
62
+ # _TRAILING_SKIP_HOURS before NOW). ← SUBSTITUTE: set to your collection cadence plus _TRAILING_SKIP_HOURS (e.g. 3 for every-2-hours runs)
63
+ _WINDOW_HOURS = 25
64
+
65
+ # Hours to skip at the trailing edge — ACCOUNT_USAGE has ~45-minute latency;
66
+ # skipping 1 hour provides a comfortable buffer.
67
+ # ← SUBSTITUTE: lower to 0 if you have confirmed real-time access to ACCOUNT_USAGE
68
+ _TRAILING_SKIP_HOURS = 1
69
+
70
+ # Maximum rows to collect per run — increase if your warehouse has higher query volume
71
+ # ← SUBSTITUTE: adjust based on your Snowflake query volume
72
+ _QUERY_LIMIT = 10000
73
+
74
+
75
def _fetch_query_history(conn) -> list[dict]:
    """
    Fetch recent query history from SNOWFLAKE.ACCOUNT_USAGE.QUERY_HISTORY.

    Collection window: [NOW - _WINDOW_HOURS, NOW - _TRAILING_SKIP_HOURS)
    This intentionally excludes the most recent hour(s) to avoid the
    ACCOUNT_USAGE ingestion latency gap. Only queries whose EXECUTION_STATUS
    is 'SUCCESS' are returned.

    Rows are ordered by START_TIME ascending and capped at _QUERY_LIMIT, so
    when the window holds more rows than the limit it is the most recent
    queries that are left out.

    Args:
        conn: An open connection object exposing ``cursor()``.

    Returns:
        One dict per row, keyed by the column names reported by the cursor.
    """
    cursor = conn.cursor()
    try:
        cursor.execute(
            f"""
            SELECT
                QUERY_ID,
                QUERY_TEXT,
                START_TIME,
                END_TIME,
                USER_NAME,
                DATABASE_NAME,
                WAREHOUSE_NAME,
                BYTES_SCANNED,
                ROWS_PRODUCED,
                EXECUTION_STATUS,
                QUERY_TAG,
                ROLE_NAME
            FROM SNOWFLAKE.ACCOUNT_USAGE.QUERY_HISTORY
            WHERE START_TIME >= DATEADD(hour, -{_WINDOW_HOURS}, CURRENT_TIMESTAMP())
              AND START_TIME < DATEADD(hour, -{_TRAILING_SKIP_HOURS}, CURRENT_TIMESTAMP())
              AND EXECUTION_STATUS = 'SUCCESS'
            ORDER BY START_TIME
            LIMIT {_QUERY_LIMIT}
            """
            # ← SUBSTITUTE: add AND DATABASE_NAME = '<db>' or AND WAREHOUSE_NAME = '<wh>'
            # to restrict collection to a specific database or warehouse
        )
        columns = [col[0] for col in cursor.description]
        rows: list[dict] = []
        # Pull the result set in fixed-size chunks rather than one fetchall().
        while True:
            chunk = cursor.fetchmany(1000)
            if not chunk:
                break
            rows.extend(dict(zip(columns, row)) for row in chunk)
        return rows
    finally:
        # Release the cursor even if execute()/fetchmany() raised.
        cursor.close()
118
+
119
+
120
+ def _iso(dt: object) -> str | None:
121
+ if dt is None:
122
+ return None
123
+ return dt.isoformat() if hasattr(dt, "isoformat") else str(dt)
124
+
125
+
126
def collect(
    account: str,
    user: str,
    password: str,
    warehouse: str,
    output_file: str = "query_logs_output.json",
) -> dict:
    """
    Connect to Snowflake, collect query logs, and write a JSON manifest.

    The manifest is written to ``output_file`` even when no rows are found
    (with ``entry_count`` 0, null window bounds and an empty ``queries`` list).

    Args:
        account: Snowflake account identifier.
        user: Snowflake username.
        password: Snowflake password.
        warehouse: Snowflake virtual warehouse to run the query on.
        output_file: Path of the JSON manifest to write.

    Returns:
        The manifest dict that was written to ``output_file``.
    """
    _check_available_memory()
    print(f"Connecting to Snowflake account: {account} ...")
    conn = snowflake.connector.connect(
        account=account,
        user=user,
        password=password,
        warehouse=warehouse,
    )
    try:
        print(
            f"Fetching QUERY_HISTORY (last {_WINDOW_HOURS}h, excluding final {_TRAILING_SKIP_HOURS}h, "
            f"limit {_QUERY_LIMIT}) ..."
        )
        rows = _fetch_query_history(conn)
    finally:
        # Always release the connection, including when the fetch fails.
        conn.close()
    print(f" Retrieved {len(rows)} query log row(s).")

    if not rows:
        print("No query log rows found in the specified window.")

    # Window bounds reflect the data actually collected, not the requested
    # window; both are None when no row carries the respective timestamp.
    start_times = [r["START_TIME"] for r in rows if r.get("START_TIME") is not None]
    end_times = [r["END_TIME"] for r in rows if r.get("END_TIME") is not None]

    manifest = {
        "log_type": LOG_TYPE,
        "collected_at": datetime.now(tz=timezone.utc).isoformat(),
        "entry_count": len(rows),
        "window_start": _iso(min(start_times)) if start_times else None,
        "window_end": _iso(max(end_times)) if end_times else None,
        "queries": [
            {
                "query_id": r.get("QUERY_ID"),
                "query_text": r.get("QUERY_TEXT") or "",
                "start_time": _iso(r.get("START_TIME")),
                "end_time": _iso(r.get("END_TIME")),
                "user": r.get("USER_NAME"),
                "warehouse": r.get("WAREHOUSE_NAME"),
                "bytes_scanned": r.get("BYTES_SCANNED"),
                "rows_produced": r.get("ROWS_PRODUCED"),
            }
            for r in rows
        ],
    }
    with open(output_file, "w", encoding="utf-8") as fh:
        # default=str stringifies any value json cannot serialise natively.
        json.dump(manifest, fh, indent=2, default=str)
    print(f"Query log manifest written to {output_file}")

    return manifest
197
+
198
+
199
def main() -> None:
    """Command-line entry point: parse arguments, then run ``collect()``.

    Each connection option falls back to its environment variable; the
    parser exits with an error if any of them is still unset.
    """
    parser = argparse.ArgumentParser(
        description="Collect Snowflake query logs from ACCOUNT_USAGE and write to a manifest file",
    )

    # (flag, environment variable, help text) for every required option.
    connection_options = (
        (
            "--account",
            "SNOWFLAKE_ACCOUNT",
            "Snowflake account identifier, e.g. xy12345.us-east-1 (env: SNOWFLAKE_ACCOUNT)",  # ← SUBSTITUTE
        ),
        ("--user", "SNOWFLAKE_USER", "Snowflake username (env: SNOWFLAKE_USER)"),
        ("--password", "SNOWFLAKE_PASSWORD", "Snowflake password (env: SNOWFLAKE_PASSWORD)"),
        (
            "--warehouse",
            "SNOWFLAKE_WAREHOUSE",
            "Snowflake virtual warehouse (env: SNOWFLAKE_WAREHOUSE)",  # ← SUBSTITUTE
        ),
    )
    for flag, env_var, help_text in connection_options:
        parser.add_argument(flag, default=os.environ.get(env_var), help=help_text)
    parser.add_argument(
        "--output-file",
        default="query_logs_output.json",
        help="Path to write the output manifest (default: query_logs_output.json)",
    )
    args = parser.parse_args()

    # Options are not marked required=True so the env-var fallback can apply;
    # check here that each one ended up with a value.
    absent = [
        flag
        for flag, _env_var, _help in connection_options
        if not getattr(args, flag.lstrip("-"))
    ]
    if absent:
        parser.error(f"Missing required arguments: {', '.join(absent)}")

    collect(
        account=args.account,
        user=args.user,
        password=args.password,
        warehouse=args.warehouse,
        output_file=args.output_file,
    )
    print("Done.")
251
+
252
+
253
+ if __name__ == "__main__":
254
+ main()
@@ -0,0 +1,307 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Push lineage events to Monte Carlo from a JSON manifest — push only.
4
+
5
+ Reads a manifest file produced by ``collect_lineage.py`` and sends the lineage
6
+ events to Monte Carlo using the pycarlo push ingestion API. Large payloads are
7
+ split into batches to stay under the 1 MB compressed limit.
8
+
9
+ Can be run standalone via CLI or imported (use the ``push()`` function).
10
+
11
+ Substitution points
12
+ -------------------
13
+ - MCD_INGEST_ID (env) / --key-id (CLI) : Monte Carlo ingestion key ID
14
+ - MCD_INGEST_TOKEN (env) / --key-token (CLI) : Monte Carlo ingestion key token
15
+ - MCD_RESOURCE_UUID (env) / --resource-uuid (CLI) : MC resource UUID for this connection
16
+
17
+ Prerequisites
18
+ -------------
19
+ pip install pycarlo
20
+
21
+ Usage
22
+ -----
23
+ python push_lineage.py \\
24
+ --key-id <MCD_INGEST_ID> \\
25
+ --key-token <MCD_INGEST_TOKEN> \\
26
+ --resource-uuid <MCD_RESOURCE_UUID> \\
27
+ --input-file lineage_output.json
28
+ """
29
+
30
+ from __future__ import annotations
31
+
32
+ import argparse
33
+ import json
34
+ import os
35
+ from concurrent.futures import ThreadPoolExecutor, as_completed
36
+ from datetime import datetime, timezone
37
+
38
+ from pycarlo.core import Client, Session
39
+ from pycarlo.features.ingestion import IngestionService
40
+ from pycarlo.features.ingestion.models import (
41
+ ColumnLineageField,
42
+ ColumnLineageSourceField,
43
+ LineageAssetRef,
44
+ LineageEvent,
45
+ )
46
+
47
+ # ← SUBSTITUTE: set RESOURCE_TYPE to match your Monte Carlo connection type
48
+ RESOURCE_TYPE = "snowflake"
49
+
50
+ # Maximum events per batch — conservative default to keep compressed payload under 1 MB
51
+ # ← SUBSTITUTE: tune based on average edge complexity (number of sources, column mappings)
52
+ _BATCH_SIZE = 500
53
+
54
+
55
def _build_table_lineage_events(edges: list[dict]) -> list[LineageEvent]:
    """Turn manifest edge dicts into table-level ``LineageEvent`` objects.

    Each edge supplies a ``destination`` dict and an optional ``sources`` list
    whose entries carry ``table``, ``database`` and ``schema`` keys. Edges
    without any source are skipped.
    """

    def _table_ref(asset):
        # One TABLE reference built from a destination/source dict.
        return LineageAssetRef(
            type="TABLE",
            name=asset["table"],
            database=asset["database"],
            schema=asset["schema"],
        )

    events = []
    for edge in edges:
        destination = edge["destination"]
        upstream = edge.get("sources", [])
        if upstream:
            events.append(
                LineageEvent(
                    destination=_table_ref(destination),
                    sources=[_table_ref(src) for src in upstream],
                )
            )
    return events
83
+
84
+
85
def _build_column_lineage_events(edges: list[dict]) -> list[LineageEvent]:
    """Turn manifest edge dicts into column-level ``LineageEvent`` objects.

    Every table referenced by an edge gets an asset id of the form
    ``<database>__<schema>__<table>``. The edge's ``col_mappings`` entries
    (``dest_col`` / ``src_table`` / ``src_col``) are grouped per destination
    column. Edges without any source are skipped.
    """

    def _asset_id(asset):
        return f"{asset['database']}__{asset['schema']}__{asset['table']}"

    events = []
    for edge in edges:
        destination = edge["destination"]
        upstream = edge.get("sources", [])
        mappings = edge.get("col_mappings", [])
        if not upstream:
            continue

        destination_id = _asset_id(destination)
        upstream_ids = {
            (src["database"], src["schema"], src["table"]): _asset_id(src)
            for src in upstream
        }

        fields_by_column: dict[str, ColumnLineageField] = {}
        for mapping in mappings:
            target_column = mapping["dest_col"]
            origin_table = mapping["src_table"]
            origin_column = mapping["src_col"]
            # Resolve src_table by table name only (first match wins); when no
            # source has that name, fall back to the first source. `upstream`
            # is non-empty here, so a source is always found.
            origin = next(
                (src for src in upstream if src["table"] == origin_table),
                upstream[0],
            )
            origin_id = upstream_ids[(origin["database"], origin["schema"], origin["table"])]
            column_field = fields_by_column.get(target_column)
            if column_field is None:
                column_field = ColumnLineageField(name=target_column, source_fields=[])
                fields_by_column[target_column] = column_field
            column_field.source_fields.append(
                ColumnLineageSourceField(asset_id=origin_id, field_name=origin_column)
            )

        destination_ref = LineageAssetRef(
            type="TABLE",
            name=destination["table"],
            database=destination["database"],
            schema=destination["schema"],
            asset_id=destination_id,
        )
        upstream_refs = [
            LineageAssetRef(
                type="TABLE",
                name=src["table"],
                database=src["database"],
                schema=src["schema"],
                asset_id=upstream_ids[(src["database"], src["schema"], src["table"])],
            )
            for src in upstream
        ]
        events.append(
            LineageEvent(
                destination=destination_ref,
                sources=upstream_refs,
                # No column mappings -> pass None rather than an empty list.
                fields=list(fields_by_column.values()) if fields_by_column else None,
            )
        )
    return events
143
+
144
+
145
def push(
    input_file: str,
    resource_uuid: str,
    key_id: str,
    key_token: str,
    batch_size: int = _BATCH_SIZE,
    output_file: str = "lineage_push_result.json",
) -> dict:
    """
    Read a lineage manifest and push events to Monte Carlo in batches.

    The manifest's ``column_lineage`` flag selects column-level or table-level
    events; its ``resource_type`` (default: RESOURCE_TYPE) is forwarded with
    every batch. Batches are sent from up to four worker threads, and a result
    summary is written to ``output_file``.

    Args:
        input_file: Path of the JSON manifest to read.
        resource_uuid: Monte Carlo resource UUID the lineage belongs to.
        key_id: Monte Carlo ingestion key ID.
        key_token: Monte Carlo ingestion key token.
        batch_size: Maximum number of events per push request; must be >= 1.
        output_file: Path of the JSON push-result file to write.

    Returns:
        A result dict with one invocation ID slot per batch (in batch order).

    Raises:
        ValueError: If ``batch_size`` is less than 1.
        Exception: Any error raised while pushing a batch is re-raised after
            being reported; no result file is written in that case.
    """
    # Fail fast with a clear message instead of an opaque range()/executor
    # error later on.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    with open(input_file, encoding="utf-8") as fh:
        manifest = json.load(fh)

    edges = manifest.get("edges", [])
    resource_type = manifest.get("resource_type", RESOURCE_TYPE)
    column_lineage = manifest.get("column_lineage", False)

    if column_lineage:
        events = _build_column_lineage_events(edges)
        label = "column-level"
    else:
        events = _build_table_lineage_events(edges)
        label = "table-level"

    print(f"Loaded {len(events)} {label} lineage event(s) from {input_file}")

    if not events:
        print("No lineage events to push.")
        push_result = {
            "resource_uuid": resource_uuid,
            "resource_type": resource_type,
            "invocation_ids": [],
            "pushed_at": datetime.now(tz=timezone.utc).isoformat(),
            "total_events": 0,
            "batch_count": 0,
            "batch_size": batch_size,
        }
        with open(output_file, "w", encoding="utf-8") as fh:
            json.dump(push_result, fh, indent=2)
        return push_result

    # Split into batches
    batches = [events[i : i + batch_size] for i in range(0, len(events), batch_size)]
    total_batches = len(batches)

    def _push_batch(batch: list, batch_num: int) -> str | None:
        """Push a single batch using its own Client/Session."""
        print(f" Pushing batch {batch_num}/{total_batches} ({len(batch)} events) ...")
        client = Client(session=Session(mcd_id=key_id, mcd_token=key_token, scope="Ingestion"))
        service = IngestionService(mc_client=client)
        result = service.send_lineage(
            resource_uuid=resource_uuid,
            resource_type=resource_type,
            events=batch,
        )
        invocation_id = service.extract_invocation_id(result)
        if invocation_id:
            print(f" Batch {batch_num}: invocation_id={invocation_id}")
        return invocation_id

    # Push batches in parallel (each call creates its own pycarlo Session)
    max_workers = min(4, total_batches)
    # Pre-sized so results can be stored by batch index as futures complete
    # out of order.
    invocation_ids: list[str | None] = [None] * total_batches

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        futures = {
            pool.submit(_push_batch, batch, i + 1): i
            for i, batch in enumerate(batches)
        }
        for future in as_completed(futures):
            idx = futures[future]
            try:
                invocation_ids[idx] = future.result()
            except Exception as exc:
                print(f" ERROR pushing batch {idx + 1}: {exc}")
                raise

    print(f" All {total_batches} batches pushed ({max_workers} workers)")

    push_result = {
        "resource_uuid": resource_uuid,
        "resource_type": resource_type,
        "invocation_ids": invocation_ids,
        "pushed_at": datetime.now(tz=timezone.utc).isoformat(),
        "total_events": len(events),
        "batch_count": total_batches,
        "batch_size": batch_size,
        "edges": edges,  # preserve for downstream validation
    }
    with open(output_file, "w", encoding="utf-8") as fh:
        json.dump(push_result, fh, indent=2)
    print(f"Push result written to {output_file}")

    return push_result
244
+
245
+
246
def main() -> None:
    """Command-line entry point: parse arguments, then run ``push()``.

    The key ID, key token and resource UUID each fall back to an environment
    variable; the parser exits with an error if any of them is still unset.
    """
    parser = argparse.ArgumentParser(
        description="Push Snowflake lineage from a manifest to Monte Carlo",
    )

    # (flag, environment variable, help text) for every required option.
    credential_options = (
        ("--key-id", "MCD_INGEST_ID", "Monte Carlo ingestion key ID (env: MCD_INGEST_ID)"),
        (
            "--key-token",
            "MCD_INGEST_TOKEN",
            "Monte Carlo ingestion key token (env: MCD_INGEST_TOKEN)",
        ),
        (
            "--resource-uuid",
            "MCD_RESOURCE_UUID",
            "Monte Carlo resource UUID for this Snowflake connection (env: MCD_RESOURCE_UUID)",
        ),
    )
    for flag, env_var, help_text in credential_options:
        parser.add_argument(flag, default=os.environ.get(env_var), help=help_text)
    parser.add_argument(
        "--input-file",
        default="lineage_output.json",
        help="Path to the collect manifest to read (default: lineage_output.json)",
    )
    parser.add_argument(
        "--output-file",
        default="lineage_push_result.json",
        help="Path to write the push result (default: lineage_push_result.json)",
    )
    parser.add_argument(
        "--batch-size",
        type=int,
        default=_BATCH_SIZE,
        help=f"Max events per push batch (default: {_BATCH_SIZE})",
    )
    args = parser.parse_args()

    # Options are not marked required=True so the env-var fallback can apply;
    # check here that each one ended up with a value.
    absent = [
        flag
        for flag, _env_var, _help in credential_options
        if not getattr(args, flag[2:].replace("-", "_"))
    ]
    if absent:
        parser.error(f"Missing required arguments: {', '.join(absent)}")

    push(
        input_file=args.input_file,
        resource_uuid=args.resource_uuid,
        key_id=args.key_id,
        key_token=args.key_token,
        batch_size=args.batch_size,
        output_file=args.output_file,
    )
    print("Done.")
304
+
305
+
306
+ if __name__ == "__main__":
307
+ main()