opencode-skills-collection 2.0.0 → 2.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bundled-skills/.antigravity-install-manifest.json +6 -1
- package/bundled-skills/docs/integrations/jetski-cortex.md +3 -3
- package/bundled-skills/docs/integrations/jetski-gemini-loader/README.md +1 -1
- package/bundled-skills/docs/maintainers/repo-growth-seo.md +3 -3
- package/bundled-skills/docs/maintainers/skills-update-guide.md +1 -1
- package/bundled-skills/docs/users/bundles.md +1 -1
- package/bundled-skills/docs/users/claude-code-skills.md +1 -1
- package/bundled-skills/docs/users/gemini-cli-skills.md +1 -1
- package/bundled-skills/docs/users/getting-started.md +1 -1
- package/bundled-skills/docs/users/kiro-integration.md +1 -1
- package/bundled-skills/docs/users/usage.md +4 -4
- package/bundled-skills/docs/users/visual-guide.md +4 -4
- package/bundled-skills/manage-skills/SKILL.md +187 -0
- package/bundled-skills/monte-carlo-monitor-creation/SKILL.md +222 -0
- package/bundled-skills/monte-carlo-monitor-creation/references/comparison-monitor.md +426 -0
- package/bundled-skills/monte-carlo-monitor-creation/references/custom-sql-monitor.md +207 -0
- package/bundled-skills/monte-carlo-monitor-creation/references/metric-monitor.md +292 -0
- package/bundled-skills/monte-carlo-monitor-creation/references/table-monitor.md +231 -0
- package/bundled-skills/monte-carlo-monitor-creation/references/validation-monitor.md +404 -0
- package/bundled-skills/monte-carlo-prevent/SKILL.md +252 -0
- package/bundled-skills/monte-carlo-prevent/references/TROUBLESHOOTING.md +23 -0
- package/bundled-skills/monte-carlo-prevent/references/parameters.md +32 -0
- package/bundled-skills/monte-carlo-prevent/references/workflows.md +478 -0
- package/bundled-skills/monte-carlo-push-ingestion/SKILL.md +363 -0
- package/bundled-skills/monte-carlo-push-ingestion/references/anomaly-detection.md +87 -0
- package/bundled-skills/monte-carlo-push-ingestion/references/custom-lineage.md +203 -0
- package/bundled-skills/monte-carlo-push-ingestion/references/direct-http-api.md +207 -0
- package/bundled-skills/monte-carlo-push-ingestion/references/prerequisites.md +150 -0
- package/bundled-skills/monte-carlo-push-ingestion/references/push-lineage.md +160 -0
- package/bundled-skills/monte-carlo-push-ingestion/references/push-metadata.md +158 -0
- package/bundled-skills/monte-carlo-push-ingestion/references/push-query-logs.md +219 -0
- package/bundled-skills/monte-carlo-push-ingestion/references/validation.md +257 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/sample_verify.py +357 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_and_push_lineage.py +70 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_and_push_metadata.py +65 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_and_push_query_logs.py +70 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_lineage.py +214 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_metadata.py +160 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_query_logs.py +164 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/push_lineage.py +198 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/push_metadata.py +193 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/push_query_logs.py +207 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/collect_and_push_metadata.py +71 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/collect_and_push_query_logs.py +64 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/collect_metadata.py +253 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/collect_query_logs.py +149 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/push_metadata.py +190 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/push_query_logs.py +208 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_and_push_lineage.py +83 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_and_push_metadata.py +77 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_and_push_query_logs.py +83 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_lineage.py +240 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_metadata.py +212 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_query_logs.py +204 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/push_lineage.py +192 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/push_metadata.py +178 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/push_query_logs.py +200 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_and_push_lineage.py +119 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_and_push_metadata.py +119 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_and_push_query_logs.py +117 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_lineage.py +265 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_metadata.py +313 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_query_logs.py +284 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/push_lineage.py +309 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/push_metadata.py +245 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/push_query_logs.py +255 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_and_push_lineage.py +78 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_and_push_metadata.py +80 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_and_push_query_logs.py +88 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_lineage.py +235 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_metadata.py +219 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_query_logs.py +239 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/push_lineage.py +178 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/push_metadata.py +178 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/push_query_logs.py +196 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_and_push_lineage.py +154 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_and_push_metadata.py +137 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_and_push_query_logs.py +137 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_lineage.py +349 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_metadata.py +329 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_query_logs.py +254 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/push_lineage.py +307 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/push_metadata.py +228 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/push_query_logs.py +248 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/test_template_sdk_usage.py +340 -0
- package/bundled-skills/monte-carlo-validation-notebook/SKILL.md +685 -0
- package/bundled-skills/monte-carlo-validation-notebook/scripts/generate_notebook_url.py +141 -0
- package/bundled-skills/monte-carlo-validation-notebook/scripts/resolve_dbt_schema.py +161 -0
- package/package.json +1 -1
- package/skills_index.json +503 -61
package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_lineage.py
ADDED
|
@@ -0,0 +1,349 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Collect table and column lineage from Snowflake — collection only.
|
|
4
|
+
|
|
5
|
+
Queries ACCOUNT_USAGE for DML/DDL statements in the last 24 hours, parses each
|
|
6
|
+
QUERY_TEXT with regex to extract source and destination tables, then writes the
|
|
7
|
+
resulting lineage edges to a JSON manifest file.
|
|
8
|
+
|
|
9
|
+
Can be run standalone via CLI or imported (use the ``collect()`` function).
|
|
10
|
+
|
|
11
|
+
Note: ACCOUNT_USAGE views have an approximate latency of 45 minutes, so very
|
|
12
|
+
recent queries may not yet appear.
|
|
13
|
+
|
|
14
|
+
Substitution points
|
|
15
|
+
-------------------
|
|
16
|
+
- SNOWFLAKE_ACCOUNT (env) / --account (CLI) : Snowflake account identifier
|
|
17
|
+
- SNOWFLAKE_USER (env) / --user (CLI) : Snowflake username
|
|
18
|
+
- SNOWFLAKE_PASSWORD (env) / --password (CLI) : Snowflake password
|
|
19
|
+
- SNOWFLAKE_WAREHOUSE (env) / --warehouse (CLI) : Snowflake virtual warehouse
|
|
20
|
+
|
|
21
|
+
Prerequisites
|
|
22
|
+
-------------
|
|
23
|
+
pip install snowflake-connector-python
|
|
24
|
+
|
|
25
|
+
Usage (table-level):
|
|
26
|
+
python collect_lineage.py \\
|
|
27
|
+
--account <SNOWFLAKE_ACCOUNT> \\
|
|
28
|
+
--user <SNOWFLAKE_USER> \\
|
|
29
|
+
--password <SNOWFLAKE_PASSWORD> \\
|
|
30
|
+
--warehouse <SNOWFLAKE_WAREHOUSE>
|
|
31
|
+
|
|
32
|
+
Usage (column-level):
|
|
33
|
+
python collect_lineage.py ... --column-lineage
|
|
34
|
+
"""
|
|
35
|
+
|
|
36
|
+
from __future__ import annotations
|
|
37
|
+
|
|
38
|
+
import argparse
|
|
39
|
+
import json
|
|
40
|
+
import os
|
|
41
|
+
import re
|
|
42
|
+
from dataclasses import dataclass, field
|
|
43
|
+
from datetime import datetime, timezone
|
|
44
|
+
|
|
45
|
+
import snowflake.connector
|
|
46
|
+
|
|
47
|
+
# ← SUBSTITUTE: set RESOURCE_TYPE to match your Monte Carlo connection type
|
|
48
|
+
RESOURCE_TYPE = "snowflake"
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def _check_available_memory(min_gb: float = 2.0) -> None:
|
|
52
|
+
"""Warn if available memory is below the threshold."""
|
|
53
|
+
try:
|
|
54
|
+
if hasattr(os, "sysconf"): # Linux / macOS
|
|
55
|
+
page_size = os.sysconf("SC_PAGE_SIZE")
|
|
56
|
+
avail_pages = os.sysconf("SC_AVPHYS_PAGES")
|
|
57
|
+
avail_gb = (page_size * avail_pages) / (1024 ** 3)
|
|
58
|
+
else:
|
|
59
|
+
return # Windows — skip check
|
|
60
|
+
except (ValueError, OSError):
|
|
61
|
+
return
|
|
62
|
+
if avail_gb < min_gb:
|
|
63
|
+
print(
|
|
64
|
+
f"WARNING: Only {avail_gb:.1f} GB of memory available "
|
|
65
|
+
f"(minimum recommended: {min_gb:.1f} GB). "
|
|
66
|
+
f"Consider reducing the lookback window or increasing available memory."
|
|
67
|
+
)
|
|
68
|
+
|
|
69
|
+
# Hours to look back in ACCOUNT_USAGE.QUERY_HISTORY
|
|
70
|
+
# ← SUBSTITUTE: adjust the lookback window to match your collection cadence
|
|
71
|
+
_LOOKBACK_HOURS = 24
|
|
72
|
+
|
|
73
|
+
# Regex for CTAS: CREATE [OR REPLACE] [TRANSIENT] TABLE [IF NOT EXISTS] [db.][schema.]table AS SELECT
|
|
74
|
+
_CTAS_RE = re.compile(
|
|
75
|
+
r"CREATE\s+(?:OR\s+REPLACE\s+)?(?:TRANSIENT\s+)?TABLE\s+(?:IF\s+NOT\s+EXISTS\s+)?"
|
|
76
|
+
r"(?:(?P<dest_db>\w+)\.)?(?:(?P<dest_schema>\w+)\.)?(?P<dest_table>\w+)"
|
|
77
|
+
r".*?AS\s+SELECT\s+(?P<select_cols>.+?)\s+FROM\s+"
|
|
78
|
+
r"(?:(?P<src_db>\w+)\.)?(?:(?P<src_schema>\w+)\.)?(?P<src_table>\w+)",
|
|
79
|
+
re.IGNORECASE | re.DOTALL,
|
|
80
|
+
)
|
|
81
|
+
|
|
82
|
+
# Regex for INSERT INTO [db.][schema.]table SELECT ... FROM [db.][schema.]table
|
|
83
|
+
_INSERT_RE = re.compile(
|
|
84
|
+
r"INSERT\s+(?:INTO|OVERWRITE)\s+"
|
|
85
|
+
r"(?:(?P<dest_db>\w+)\.)?(?:(?P<dest_schema>\w+)\.)?(?P<dest_table>\w+)"
|
|
86
|
+
r".*?SELECT\s+(?P<select_cols>.+?)\s+FROM\s+"
|
|
87
|
+
r"(?:(?P<src_db>\w+)\.)?(?:(?P<src_schema>\w+)\.)?(?P<src_table>\w+)",
|
|
88
|
+
re.IGNORECASE | re.DOTALL,
|
|
89
|
+
)
|
|
90
|
+
|
|
91
|
+
# Regex for CREATE [OR REPLACE] VIEW [db.][schema.]view AS SELECT ... FROM ...
|
|
92
|
+
_CREATE_VIEW_RE = re.compile(
|
|
93
|
+
r"CREATE\s+(?:OR\s+REPLACE\s+)?(?:SECURE\s+)?VIEW\s+"
|
|
94
|
+
r"(?:(?P<dest_db>\w+)\.)?(?:(?P<dest_schema>\w+)\.)?(?P<dest_table>\w+)"
|
|
95
|
+
r".*?AS\s+SELECT\s+(?P<select_cols>.+?)\s+FROM\s+"
|
|
96
|
+
r"(?:(?P<src_db>\w+)\.)?(?:(?P<src_schema>\w+)\.)?(?P<src_table>\w+)",
|
|
97
|
+
re.IGNORECASE | re.DOTALL,
|
|
98
|
+
)
|
|
99
|
+
|
|
100
|
+
# Additional JOIN sources
|
|
101
|
+
_JOIN_RE = re.compile(
|
|
102
|
+
r"JOIN\s+(?:(?P<src_db>\w+)\.)?(?:(?P<src_schema>\w+)\.)?(?P<src_table>\w+)",
|
|
103
|
+
re.IGNORECASE,
|
|
104
|
+
)
|
|
105
|
+
|
|
106
|
+
# Simple column alias extraction from SELECT clause
|
|
107
|
+
_COL_RE = re.compile(r"(?:(\w+)\.)?(\w+)(?:\s+AS\s+(\w+))?", re.IGNORECASE)
|
|
108
|
+
_SQL_KEYWORDS = {
|
|
109
|
+
"FROM", "SELECT", "WHERE", "JOIN", "ON", "AS", "*", "AND", "OR",
|
|
110
|
+
"GROUP", "ORDER", "BY", "HAVING", "LIMIT", "DISTINCT", "CASE", "WHEN",
|
|
111
|
+
"THEN", "ELSE", "END", "NULL", "NOT", "IN", "IS", "BETWEEN",
|
|
112
|
+
}
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
@dataclass
|
|
116
|
+
class _LineageEdge:
|
|
117
|
+
dest_db: str
|
|
118
|
+
dest_schema: str
|
|
119
|
+
dest_table: str
|
|
120
|
+
sources: list[tuple[str, str, str]] = field(default_factory=list)
|
|
121
|
+
# col_mappings: (dest_col, src_table, src_col)
|
|
122
|
+
col_mappings: list[tuple[str, str, str]] = field(default_factory=list)
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def _parse_select_cols(select_clause: str, src_table: str) -> list[tuple[str, str, str]]:
    """Turn a SELECT column list into ``(dest_col, src_table, src_col)`` triples.

    Every ``_COL_RE`` hit in ``select_clause`` becomes one triple unless its
    column token is listed in ``_SQL_KEYWORDS``. The destination column is the
    ``AS`` alias when one was captured, otherwise the source column itself.
    ``src_table`` is copied into each triple unchanged.
    """
    return [
        (hit.group(3) or hit.group(2), src_table, hit.group(2))
        for hit in _COL_RE.finditer(select_clause)
        if hit.group(2).upper() not in _SQL_KEYWORDS
    ]
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def _resolve_db_schema(db_part, schema_part, fallback_db, fallback_schema):
    """Resolve regex-captured name parts to a lower-cased ``(database, schema)`` pair.

    The lineage regexes fill their optional name groups left to right, so a
    two-part name such as ``sales.orders`` arrives with ``db_part="sales"`` and
    ``schema_part=None``. Snowflake resolves a two-part name as
    ``schema.table``, so that case is shifted over before fallbacks apply.
    """
    if db_part and not schema_part:
        db_part, schema_part = None, db_part
    return (db_part or fallback_db).lower(), (schema_part or fallback_schema).lower()


def _parse_edges(rows: list[dict]) -> list[_LineageEdge]:
    """Parse QUERY_HISTORY rows into _LineageEdge objects.

    Args:
        rows: dicts with at least ``QUERY_TEXT``; ``DATABASE_NAME`` and
            ``SCHEMA_NAME`` are used, when present, as defaults for names that
            are not fully qualified (schema falls back to ``"public"``).

    Returns:
        One edge per distinct destination table, in first-seen order. Sources
        from every matching query for that destination are merged without
        duplicates; column mappings are appended per query.
    """
    edges: dict[str, _LineageEdge] = {}

    for row in rows:
        query_text = row.get("QUERY_TEXT") or ""
        default_db = (row.get("DATABASE_NAME") or "").lower()
        default_schema = (row.get("SCHEMA_NAME") or "public").lower()
        # Collapse all whitespace so the patterns see a single-line statement.
        sql_clean = re.sub(r"\s+", " ", query_text).strip()

        for pattern in (_CTAS_RE, _INSERT_RE, _CREATE_VIEW_RE):
            m = pattern.search(sql_clean)
            if not m:
                continue

            dest_db, dest_schema = _resolve_db_schema(
                m.group("dest_db"), m.group("dest_schema"), default_db, default_schema
            )
            dest_table = m.group("dest_table").lower()
            src_db, src_schema = _resolve_db_schema(
                m.group("src_db"), m.group("src_schema"), default_db, default_schema
            )
            src_table = m.group("src_table").lower()
            select_cols = m.group("select_cols")

            key = f"{dest_db}.{dest_schema}.{dest_table}"
            if key not in edges:
                edges[key] = _LineageEdge(
                    dest_db=dest_db, dest_schema=dest_schema, dest_table=dest_table
                )

            edge = edges[key]
            src_triple = (src_db, src_schema, src_table)
            if src_triple not in edge.sources:
                edge.sources.append(src_triple)

            # JOINed tables are extra sources; unqualified parts inherit the
            # database/schema of the primary FROM table.
            for jm in _JOIN_RE.finditer(sql_clean):
                jdb, jschema = _resolve_db_schema(
                    jm.group("src_db"), jm.group("src_schema"), src_db, src_schema
                )
                jp = (jdb, jschema, jm.group("src_table").lower())
                if jp not in edge.sources:
                    edge.sources.append(jp)

            edge.col_mappings.extend(_parse_select_cols(select_cols, src_table))
            break  # only the first matching statement pattern is used per query

    return list(edges.values())
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
def _fetch_query_history(conn, lookback_hours: int) -> list[dict]:
|
|
184
|
+
cursor = conn.cursor()
|
|
185
|
+
cursor.execute(
|
|
186
|
+
f"""
|
|
187
|
+
SELECT QUERY_ID, QUERY_TEXT, START_TIME, END_TIME, USER_NAME, DATABASE_NAME, EXECUTION_STATUS
|
|
188
|
+
FROM SNOWFLAKE.ACCOUNT_USAGE.QUERY_HISTORY
|
|
189
|
+
WHERE START_TIME >= DATEADD(hour, -{lookback_hours}, CURRENT_TIMESTAMP())
|
|
190
|
+
AND EXECUTION_STATUS = 'SUCCESS'
|
|
191
|
+
AND QUERY_TYPE IN ('CREATE_TABLE_AS_SELECT', 'INSERT', 'MERGE', 'CREATE_VIEW')
|
|
192
|
+
ORDER BY START_TIME
|
|
193
|
+
LIMIT 50000
|
|
194
|
+
"""
|
|
195
|
+
# ← SUBSTITUTE: adjust QUERY_TYPE list, LIMIT, or add a WHERE clause to scope to specific databases
|
|
196
|
+
)
|
|
197
|
+
columns = [col[0] for col in cursor.description]
|
|
198
|
+
rows = []
|
|
199
|
+
while True:
|
|
200
|
+
batch = cursor.fetchmany(1000)
|
|
201
|
+
if not batch:
|
|
202
|
+
break
|
|
203
|
+
rows.extend(dict(zip(columns, row)) for row in batch)
|
|
204
|
+
cursor.close()
|
|
205
|
+
return rows
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
def collect(
    account: str,
    user: str,
    password: str,
    warehouse: str,
    lookback_hours: int = _LOOKBACK_HOURS,
    column_lineage: bool = False,
    output_file: str = "lineage_output.json",
) -> dict:
    """
    Connect to Snowflake, collect lineage edges, and write a JSON manifest.

    Args:
        account, user, password, warehouse: passed to ``snowflake.connector.connect``.
        lookback_hours: hours of QUERY_HISTORY to scan.
        column_lineage: recorded verbatim in the manifest's ``column_lineage``
            field; column mappings are written either way.
        output_file: path the manifest is written to (overwritten if present).

    Returns the manifest dict. When no qualifying queries are found the
    manifest is still written, with an empty ``edges`` list.
    """
    _check_available_memory()
    print(f"Connecting to Snowflake account: {account} ...")
    conn = snowflake.connector.connect(
        account=account,
        user=user,
        password=password,
        warehouse=warehouse,
    )

    try:
        print(f"Fetching QUERY_HISTORY for the last {lookback_hours} hour(s) ...")
        rows = _fetch_query_history(conn, lookback_hours)
    finally:
        # Close the connection even if the query fails.
        conn.close()
    print(f" Retrieved {len(rows)} qualifying query/queries.")

    if rows:
        edges = _parse_edges(rows)
        print(f" Parsed {len(edges)} lineage edge(s).")
    else:
        print("No lineage queries found in the specified window.")
        edges = []

    manifest = {
        "resource_type": RESOURCE_TYPE,
        "collected_at": datetime.now(tz=timezone.utc).isoformat(),
        "column_lineage": column_lineage,
        "edges": [
            {
                "destination": {
                    "database": e.dest_db,
                    "schema": e.dest_schema,
                    "table": e.dest_table,
                },
                "sources": [
                    {"database": sdb, "schema": sschema, "table": stbl}
                    for sdb, sschema, stbl in e.sources
                ],
                "col_mappings": [
                    {"dest_col": dc, "src_table": st, "src_col": sc}
                    for dc, st, sc in e.col_mappings
                ],
            }
            for e in edges
        ],
    }
    with open(output_file, "w", encoding="utf-8") as fh:
        json.dump(manifest, fh, indent=2)
    print(f"Lineage manifest written to {output_file}")

    return manifest
|
|
279
|
+
|
|
280
|
+
|
|
281
|
+
def main() -> None:
    """CLI entry point: parse arguments, validate credentials, run ``collect``.

    Each connection option defaults to its environment variable. If any of
    them is still empty after parsing, the parser reports the missing flags
    and exits via ``ArgumentParser.error``.
    """
    cli = argparse.ArgumentParser(
        description="Collect Snowflake lineage from ACCOUNT_USAGE and write to a manifest file",
    )

    # Connection options: (flag, environment variable, help text).
    connection_options = (
        ("--account", "SNOWFLAKE_ACCOUNT", "Snowflake account identifier (env: SNOWFLAKE_ACCOUNT)"),
        ("--user", "SNOWFLAKE_USER", "Snowflake username (env: SNOWFLAKE_USER)"),
        ("--password", "SNOWFLAKE_PASSWORD", "Snowflake password (env: SNOWFLAKE_PASSWORD)"),
        ("--warehouse", "SNOWFLAKE_WAREHOUSE", "Snowflake virtual warehouse (env: SNOWFLAKE_WAREHOUSE)"),
    )
    for flag, env_var, help_text in connection_options:
        cli.add_argument(flag, default=os.environ.get(env_var), help=help_text)

    cli.add_argument(
        "--lookback-hours",
        type=int,
        default=_LOOKBACK_HOURS,
        help=f"Hours of QUERY_HISTORY to scan (default: {_LOOKBACK_HOURS})",
    )
    cli.add_argument(
        "--column-lineage",
        action="store_true",
        help="Include column-level lineage mappings in the manifest",
    )
    cli.add_argument(
        "--output-file",
        default="lineage_output.json",
        help="Path to write the lineage manifest (default: lineage_output.json)",
    )
    opts = cli.parse_args()

    # The flags have no argparse-level "required" because the environment may
    # supply them; check the resolved values instead.
    resolved = {
        "--account": opts.account,
        "--user": opts.user,
        "--password": opts.password,
        "--warehouse": opts.warehouse,
    }
    absent = [flag for flag, value in resolved.items() if not value]
    if absent:
        cli.error(f"Missing required arguments: {', '.join(absent)}")

    collect(
        account=opts.account,
        user=opts.user,
        password=opts.password,
        warehouse=opts.warehouse,
        lookback_hours=opts.lookback_hours,
        column_lineage=opts.column_lineage,
        output_file=opts.output_file,
    )
    print("Done.")


if __name__ == "__main__":
    main()
|
package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_metadata.py
ADDED
|
@@ -0,0 +1,329 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Collect table metadata from Snowflake — collection only.
|
|
4
|
+
|
|
5
|
+
Connects to Snowflake, discovers all accessible databases and schemas, then
|
|
6
|
+
queries INFORMATION_SCHEMA.TABLES for volume/freshness and
|
|
7
|
+
INFORMATION_SCHEMA.COLUMNS for field definitions. The collected assets are
|
|
8
|
+
written to a JSON manifest file.
|
|
9
|
+
|
|
10
|
+
Can be run standalone via CLI or imported (use the ``collect()`` function).
|
|
11
|
+
|
|
12
|
+
Substitution points
|
|
13
|
+
-------------------
|
|
14
|
+
- SNOWFLAKE_ACCOUNT (env) / --account (CLI) : Snowflake account identifier (e.g. xy12345.us-east-1)
|
|
15
|
+
- SNOWFLAKE_USER (env) / --user (CLI) : Snowflake username
|
|
16
|
+
- SNOWFLAKE_PASSWORD (env) / --password (CLI) : Snowflake password
|
|
17
|
+
- SNOWFLAKE_WAREHOUSE (env) / --warehouse (CLI) : Snowflake virtual warehouse
|
|
18
|
+
|
|
19
|
+
Prerequisites
|
|
20
|
+
-------------
|
|
21
|
+
pip install snowflake-connector-python
|
|
22
|
+
|
|
23
|
+
Usage
|
|
24
|
+
-----
|
|
25
|
+
python collect_metadata.py \\
|
|
26
|
+
--account <SNOWFLAKE_ACCOUNT> \\
|
|
27
|
+
--user <SNOWFLAKE_USER> \\
|
|
28
|
+
--password <SNOWFLAKE_PASSWORD> \\
|
|
29
|
+
--warehouse <SNOWFLAKE_WAREHOUSE>
|
|
30
|
+
"""
|
|
31
|
+
|
|
32
|
+
import argparse
|
|
33
|
+
import json
|
|
34
|
+
import os
|
|
35
|
+
from datetime import datetime, timezone
|
|
36
|
+
|
|
37
|
+
import snowflake.connector
|
|
38
|
+
|
|
39
|
+
# ← SUBSTITUTE: set RESOURCE_TYPE to match your Monte Carlo connection type
|
|
40
|
+
RESOURCE_TYPE = "snowflake"
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def _check_available_memory(min_gb: float = 2.0) -> None:
|
|
44
|
+
"""Warn if available memory is below the threshold."""
|
|
45
|
+
try:
|
|
46
|
+
if hasattr(os, "sysconf"): # Linux / macOS
|
|
47
|
+
page_size = os.sysconf("SC_PAGE_SIZE")
|
|
48
|
+
avail_pages = os.sysconf("SC_AVPHYS_PAGES")
|
|
49
|
+
avail_gb = (page_size * avail_pages) / (1024 ** 3)
|
|
50
|
+
else:
|
|
51
|
+
return # Windows — skip check
|
|
52
|
+
except (ValueError, OSError):
|
|
53
|
+
return
|
|
54
|
+
if avail_gb < min_gb:
|
|
55
|
+
print(
|
|
56
|
+
f"WARNING: Only {avail_gb:.1f} GB of memory available "
|
|
57
|
+
f"(minimum recommended: {min_gb:.1f} GB). "
|
|
58
|
+
f"Consider reducing the lookback window or increasing available memory."
|
|
59
|
+
)
|
|
60
|
+
|
|
61
|
+
# Databases that are Snowflake system databases — skip them
|
|
62
|
+
_SKIP_DATABASES = {"SNOWFLAKE", "SNOWFLAKE_SAMPLE_DATA"}
|
|
63
|
+
|
|
64
|
+
# Schemas that are Snowflake system schemas — skip them
|
|
65
|
+
_SKIP_SCHEMAS = {"INFORMATION_SCHEMA"}
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
# Snowflake TABLE_TYPE → Monte Carlo RelationalAsset.type mapping.
|
|
69
|
+
# The MC API only accepts "TABLE" or "VIEW" (uppercase).
|
|
70
|
+
_TABLE_TYPE_MAP = {
|
|
71
|
+
"BASE TABLE": "TABLE",
|
|
72
|
+
"TABLE": "TABLE",
|
|
73
|
+
"DYNAMIC TABLE": "TABLE",
|
|
74
|
+
"EXTERNAL TABLE": "TABLE",
|
|
75
|
+
"VIEW": "VIEW",
|
|
76
|
+
"MATERIALIZED VIEW": "VIEW",
|
|
77
|
+
"SECURE VIEW": "VIEW",
|
|
78
|
+
}
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def _normalize_table_type(raw_type: str | None) -> str:
|
|
82
|
+
"""Map Snowflake's TABLE_TYPE value to MC-accepted 'TABLE' or 'VIEW'."""
|
|
83
|
+
if not raw_type:
|
|
84
|
+
return "TABLE"
|
|
85
|
+
return _TABLE_TYPE_MAP.get(raw_type.upper(), "TABLE")
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def _connect(account: str, user: str, password: str, warehouse: str):
    """Open a Snowflake connection and return it.

    The four arguments are forwarded as keyword arguments of the same name to
    ``snowflake.connector.connect``; any exception it raises propagates.
    """
    # ← SUBSTITUTE: add role= or authenticator= entries if your org requires them
    connect_kwargs = {
        "account": account,
        "user": user,
        "password": password,
        "warehouse": warehouse,
    }
    return snowflake.connector.connect(**connect_kwargs)
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def _fetch_all_rows(cursor, batch_size: int = 1000) -> list:
    """Drain the cursor's current result set, reading ``batch_size`` rows at a time."""
    rows: list = []
    while True:
        chunk = cursor.fetchmany(batch_size)
        if not chunk:
            return rows
        rows.extend(chunk)


def _quote_identifier(name: str) -> str:
    """Return ``name`` as a double-quoted identifier, doubling any embedded quote."""
    return '"' + name.replace('"', '""') + '"'


def _collect_assets(conn) -> list[dict]:
    """Collect table metadata from Snowflake and return as a list of dicts.

    For every database listed by ``SHOW DATABASES`` (minus ``_SKIP_DATABASES``)
    this reads ``INFORMATION_SCHEMA.TABLES`` once and
    ``INFORMATION_SCHEMA.COLUMNS`` once per schema that has tables. A database
    or schema whose query fails is reported with a WARNING line and skipped;
    collection continues with the rest.

    Each returned dict has the keys ``type``, ``database``, ``schema``,
    ``name``, ``description``, ``fields``, ``volume`` and ``freshness``.
    """
    cursor = conn.cursor()
    assets: list[dict] = []

    try:
        # --- Discover databases ---
        cursor.execute("SHOW DATABASES")
        # SHOW DATABASES returns (created_on, name, …); column index 1 is the name
        databases = [
            row[1] for row in _fetch_all_rows(cursor) if row[1] not in _SKIP_DATABASES
        ]
        print(f" Found {len(databases)} database(s): {databases}")

        for db in databases:
            quoted_db = _quote_identifier(db)

            # --- Discover schemas in each database ---
            try:
                cursor.execute(f"SHOW SCHEMAS IN DATABASE {quoted_db}")
            except Exception as exc:  # best effort: skip databases we cannot read
                print(f" WARNING: could not list schemas in {db}: {exc}")
                continue

            # Column index 1 is the schema name
            schemas = {
                row[1] for row in _fetch_all_rows(cursor) if row[1] not in _SKIP_SCHEMAS
            }

            # --- Collect tables, volume, and freshness via INFORMATION_SCHEMA ---
            try:
                cursor.execute(
                    f"""
                    SELECT
                        TABLE_CATALOG,
                        TABLE_SCHEMA,
                        TABLE_NAME,
                        TABLE_TYPE,
                        ROW_COUNT,
                        BYTES,
                        LAST_ALTERED,
                        COMMENT
                    FROM {quoted_db}.INFORMATION_SCHEMA.TABLES
                    WHERE TABLE_SCHEMA != 'INFORMATION_SCHEMA'
                    ORDER BY TABLE_SCHEMA, TABLE_NAME
                    """
                )
            except Exception as exc:
                print(f" WARNING: could not query INFORMATION_SCHEMA.TABLES in {db}: {exc}")
                continue

            table_rows = _fetch_all_rows(cursor)
            print(f" {db}: {len(table_rows)} table(s)")

            # Only schemas that actually contain tables need a COLUMNS query.
            schemas_with_tables: set[str] = {row[1] for row in table_rows}

            # Pre-fetch all columns for this database in one query per schema
            columns_by_table: dict[tuple[str, str], list[dict]] = {}
            for schema in schemas_with_tables:
                if schema not in schemas:
                    continue  # respect the earlier schema skip list
                try:
                    cursor.execute(
                        f"""
                        SELECT TABLE_NAME, COLUMN_NAME, DATA_TYPE, COMMENT
                        FROM {quoted_db}.INFORMATION_SCHEMA.COLUMNS
                        WHERE TABLE_SCHEMA = %s
                        ORDER BY TABLE_NAME, ORDINAL_POSITION
                        """,
                        (schema,),
                    )
                except Exception as exc:
                    print(f" WARNING: could not fetch columns for {db}.{schema}: {exc}")
                    continue

                for table_name, col_name, data_type, col_comment in _fetch_all_rows(cursor):
                    columns_by_table.setdefault((schema, table_name), []).append(
                        {
                            "name": col_name,
                            "type": data_type,
                            "description": col_comment or None,
                        }
                    )

            # Build asset dicts
            for row in table_rows:
                (
                    tbl_catalog,
                    tbl_schema,
                    tbl_name,
                    tbl_type,
                    row_count,
                    byte_count,
                    last_altered,
                    tbl_comment,
                ) = row

                # Volume is omitted entirely when neither figure is reported.
                volume = None
                if row_count is not None or byte_count is not None:
                    volume = {
                        "row_count": int(row_count) if row_count is not None else None,
                        "byte_count": int(byte_count) if byte_count is not None else None,
                    }

                freshness = None
                if last_altered is not None:
                    freshness = {
                        "last_update_time": (
                            last_altered.isoformat()
                            if hasattr(last_altered, "isoformat")
                            else str(last_altered)
                        ),
                    }

                fields = columns_by_table.get((tbl_schema, tbl_name), [])

                assets.append(
                    {
                        "type": _normalize_table_type(tbl_type),
                        "database": tbl_catalog,
                        "schema": tbl_schema,
                        "name": tbl_name,
                        "description": tbl_comment or None,
                        "fields": fields,
                        "volume": volume,
                        "freshness": freshness,
                    }
                )
                print(f" + {tbl_catalog}.{tbl_schema}.{tbl_name} ({len(fields)} columns)")
    finally:
        # Release the cursor even if discovery or a fetch raises.
        cursor.close()

    return assets
|
|
239
|
+
|
|
240
|
+
|
|
241
|
+
def collect(
    account: str,
    user: str,
    password: str,
    warehouse: str,
    output_file: str = "metadata_output.json",
) -> dict:
    """
    Connect to Snowflake, collect table metadata, and write a JSON manifest.

    Args:
        account, user, password, warehouse: connection settings passed to ``_connect``.
        output_file: path the manifest is written to (overwritten if present).

    Returns the manifest dict, with keys ``resource_type``, ``collected_at``
    (UTC, ISO-8601) and ``assets``.
    """
    _check_available_memory()
    print(f"Connecting to Snowflake account: {account} ...")
    conn = _connect(account, user, password, warehouse)

    try:
        print("Collecting table metadata ...")
        assets = _collect_assets(conn)
    finally:
        # Close the connection even if collection fails part-way.
        conn.close()
    print(f"\nCollected {len(assets)} table(s).")

    manifest = {
        "resource_type": RESOURCE_TYPE,
        "collected_at": datetime.now(tz=timezone.utc).isoformat(),
        "assets": assets,
    }
    with open(output_file, "w", encoding="utf-8") as fh:
        json.dump(manifest, fh, indent=2)
    print(f"Asset manifest written to {output_file}")

    return manifest
|
|
272
|
+
|
|
273
|
+
|
|
274
|
+
def main() -> None:
    """CLI entry point: parse arguments, validate credentials, run ``collect``.

    Each connection option defaults to its environment variable. If any of
    them is still empty after parsing, the parser reports the missing flags
    and exits via ``ArgumentParser.error``.
    """
    cli = argparse.ArgumentParser(
        description="Collect Snowflake table metadata and write to a manifest file",
    )

    # ← SUBSTITUTE: connection options as (flag, environment variable, help text)
    connection_options = (
        (
            "--account",
            "SNOWFLAKE_ACCOUNT",
            "Snowflake account identifier, e.g. xy12345.us-east-1 (env: SNOWFLAKE_ACCOUNT)",
        ),
        ("--user", "SNOWFLAKE_USER", "Snowflake username (env: SNOWFLAKE_USER)"),
        ("--password", "SNOWFLAKE_PASSWORD", "Snowflake password (env: SNOWFLAKE_PASSWORD)"),
        ("--warehouse", "SNOWFLAKE_WAREHOUSE", "Snowflake virtual warehouse (env: SNOWFLAKE_WAREHOUSE)"),
    )
    for flag, env_var, help_text in connection_options:
        cli.add_argument(flag, default=os.environ.get(env_var), help=help_text)

    cli.add_argument(
        "--output-file",
        default="metadata_output.json",
        help="Path to write the output manifest (default: metadata_output.json)",
    )
    opts = cli.parse_args()

    # The flags have no argparse-level "required" because the environment may
    # supply them; check the resolved values instead.
    resolved = {
        "--account": opts.account,
        "--user": opts.user,
        "--password": opts.password,
        "--warehouse": opts.warehouse,
    }
    absent = [flag for flag, value in resolved.items() if not value]
    if absent:
        cli.error(f"Missing required arguments: {', '.join(absent)}")

    collect(
        account=opts.account,
        user=opts.user,
        password=opts.password,
        warehouse=opts.warehouse,
        output_file=opts.output_file,
    )
    print("Done.")


if __name__ == "__main__":
    main()
|