opencode-skills-collection 2.0.0 → 2.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90) hide show
  1. package/bundled-skills/.antigravity-install-manifest.json +6 -1
  2. package/bundled-skills/docs/integrations/jetski-cortex.md +3 -3
  3. package/bundled-skills/docs/integrations/jetski-gemini-loader/README.md +1 -1
  4. package/bundled-skills/docs/maintainers/repo-growth-seo.md +3 -3
  5. package/bundled-skills/docs/maintainers/skills-update-guide.md +1 -1
  6. package/bundled-skills/docs/users/bundles.md +1 -1
  7. package/bundled-skills/docs/users/claude-code-skills.md +1 -1
  8. package/bundled-skills/docs/users/gemini-cli-skills.md +1 -1
  9. package/bundled-skills/docs/users/getting-started.md +1 -1
  10. package/bundled-skills/docs/users/kiro-integration.md +1 -1
  11. package/bundled-skills/docs/users/usage.md +4 -4
  12. package/bundled-skills/docs/users/visual-guide.md +4 -4
  13. package/bundled-skills/manage-skills/SKILL.md +187 -0
  14. package/bundled-skills/monte-carlo-monitor-creation/SKILL.md +222 -0
  15. package/bundled-skills/monte-carlo-monitor-creation/references/comparison-monitor.md +426 -0
  16. package/bundled-skills/monte-carlo-monitor-creation/references/custom-sql-monitor.md +207 -0
  17. package/bundled-skills/monte-carlo-monitor-creation/references/metric-monitor.md +292 -0
  18. package/bundled-skills/monte-carlo-monitor-creation/references/table-monitor.md +231 -0
  19. package/bundled-skills/monte-carlo-monitor-creation/references/validation-monitor.md +404 -0
  20. package/bundled-skills/monte-carlo-prevent/SKILL.md +252 -0
  21. package/bundled-skills/monte-carlo-prevent/references/TROUBLESHOOTING.md +23 -0
  22. package/bundled-skills/monte-carlo-prevent/references/parameters.md +32 -0
  23. package/bundled-skills/monte-carlo-prevent/references/workflows.md +478 -0
  24. package/bundled-skills/monte-carlo-push-ingestion/SKILL.md +363 -0
  25. package/bundled-skills/monte-carlo-push-ingestion/references/anomaly-detection.md +87 -0
  26. package/bundled-skills/monte-carlo-push-ingestion/references/custom-lineage.md +203 -0
  27. package/bundled-skills/monte-carlo-push-ingestion/references/direct-http-api.md +207 -0
  28. package/bundled-skills/monte-carlo-push-ingestion/references/prerequisites.md +150 -0
  29. package/bundled-skills/monte-carlo-push-ingestion/references/push-lineage.md +160 -0
  30. package/bundled-skills/monte-carlo-push-ingestion/references/push-metadata.md +158 -0
  31. package/bundled-skills/monte-carlo-push-ingestion/references/push-query-logs.md +219 -0
  32. package/bundled-skills/monte-carlo-push-ingestion/references/validation.md +257 -0
  33. package/bundled-skills/monte-carlo-push-ingestion/scripts/sample_verify.py +357 -0
  34. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_and_push_lineage.py +70 -0
  35. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_and_push_metadata.py +65 -0
  36. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_and_push_query_logs.py +70 -0
  37. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_lineage.py +214 -0
  38. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_metadata.py +160 -0
  39. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_query_logs.py +164 -0
  40. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/push_lineage.py +198 -0
  41. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/push_metadata.py +193 -0
  42. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/push_query_logs.py +207 -0
  43. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/collect_and_push_metadata.py +71 -0
  44. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/collect_and_push_query_logs.py +64 -0
  45. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/collect_metadata.py +253 -0
  46. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/collect_query_logs.py +149 -0
  47. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/push_metadata.py +190 -0
  48. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/push_query_logs.py +208 -0
  49. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_and_push_lineage.py +83 -0
  50. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_and_push_metadata.py +77 -0
  51. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_and_push_query_logs.py +83 -0
  52. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_lineage.py +240 -0
  53. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_metadata.py +212 -0
  54. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_query_logs.py +204 -0
  55. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/push_lineage.py +192 -0
  56. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/push_metadata.py +178 -0
  57. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/push_query_logs.py +200 -0
  58. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_and_push_lineage.py +119 -0
  59. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_and_push_metadata.py +119 -0
  60. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_and_push_query_logs.py +117 -0
  61. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_lineage.py +265 -0
  62. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_metadata.py +313 -0
  63. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_query_logs.py +284 -0
  64. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/push_lineage.py +309 -0
  65. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/push_metadata.py +245 -0
  66. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/push_query_logs.py +255 -0
  67. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_and_push_lineage.py +78 -0
  68. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_and_push_metadata.py +80 -0
  69. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_and_push_query_logs.py +88 -0
  70. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_lineage.py +235 -0
  71. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_metadata.py +219 -0
  72. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_query_logs.py +239 -0
  73. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/push_lineage.py +178 -0
  74. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/push_metadata.py +178 -0
  75. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/push_query_logs.py +196 -0
  76. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_and_push_lineage.py +154 -0
  77. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_and_push_metadata.py +137 -0
  78. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_and_push_query_logs.py +137 -0
  79. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_lineage.py +349 -0
  80. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_metadata.py +329 -0
  81. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_query_logs.py +254 -0
  82. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/push_lineage.py +307 -0
  83. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/push_metadata.py +228 -0
  84. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/push_query_logs.py +248 -0
  85. package/bundled-skills/monte-carlo-push-ingestion/scripts/test_template_sdk_usage.py +340 -0
  86. package/bundled-skills/monte-carlo-validation-notebook/SKILL.md +685 -0
  87. package/bundled-skills/monte-carlo-validation-notebook/scripts/generate_notebook_url.py +141 -0
  88. package/bundled-skills/monte-carlo-validation-notebook/scripts/resolve_dbt_schema.py +161 -0
  89. package/package.json +1 -1
  90. package/skills_index.json +503 -61
@@ -0,0 +1,349 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Collect table and column lineage from Snowflake — collection only.
4
+
5
+ Queries ACCOUNT_USAGE for DML/DDL statements in the last 24 hours, parses each
6
+ QUERY_TEXT with regex to extract source and destination tables, then writes the
7
+ resulting lineage edges to a JSON manifest file.
8
+
9
+ Can be run standalone via CLI or imported (use the ``collect()`` function).
10
+
11
+ Note: ACCOUNT_USAGE views have an approximate latency of 45 minutes, so very
12
+ recent queries may not yet appear.
13
+
14
+ Substitution points
15
+ -------------------
16
+ - SNOWFLAKE_ACCOUNT (env) / --account (CLI) : Snowflake account identifier
17
+ - SNOWFLAKE_USER (env) / --user (CLI) : Snowflake username
18
+ - SNOWFLAKE_PASSWORD (env) / --password (CLI) : Snowflake password
19
+ - SNOWFLAKE_WAREHOUSE (env) / --warehouse (CLI) : Snowflake virtual warehouse
20
+
21
+ Prerequisites
22
+ -------------
23
+ pip install snowflake-connector-python
24
+
25
+ Usage (table-level):
26
+ python collect_lineage.py \\
27
+ --account <SNOWFLAKE_ACCOUNT> \\
28
+ --user <SNOWFLAKE_USER> \\
29
+ --password <SNOWFLAKE_PASSWORD> \\
30
+ --warehouse <SNOWFLAKE_WAREHOUSE>
31
+
32
+ Usage (column-level):
33
+ python collect_lineage.py ... --column-lineage
34
+ """
35
+
36
+ from __future__ import annotations
37
+
38
+ import argparse
39
+ import json
40
+ import os
41
+ import re
42
+ from dataclasses import dataclass, field
43
+ from datetime import datetime, timezone
44
+
45
+ import snowflake.connector
46
+
47
+ # ← SUBSTITUTE: set RESOURCE_TYPE to match your Monte Carlo connection type
48
+ RESOURCE_TYPE = "snowflake"
49
+
50
+
51
+ def _check_available_memory(min_gb: float = 2.0) -> None:
52
+ """Warn if available memory is below the threshold."""
53
+ try:
54
+ if hasattr(os, "sysconf"): # Linux / macOS
55
+ page_size = os.sysconf("SC_PAGE_SIZE")
56
+ avail_pages = os.sysconf("SC_AVPHYS_PAGES")
57
+ avail_gb = (page_size * avail_pages) / (1024 ** 3)
58
+ else:
59
+ return # Windows — skip check
60
+ except (ValueError, OSError):
61
+ return
62
+ if avail_gb < min_gb:
63
+ print(
64
+ f"WARNING: Only {avail_gb:.1f} GB of memory available "
65
+ f"(minimum recommended: {min_gb:.1f} GB). "
66
+ f"Consider reducing the lookback window or increasing available memory."
67
+ )
68
+
69
+ # Hours to look back in ACCOUNT_USAGE.QUERY_HISTORY
70
+ # ← SUBSTITUTE: adjust the lookback window to match your collection cadence
71
+ _LOOKBACK_HOURS = 24
72
+
73
+ # Regex for CTAS: CREATE [OR REPLACE] [TRANSIENT] TABLE [IF NOT EXISTS] [db.][schema.]table AS SELECT
74
+ _CTAS_RE = re.compile(
75
+ r"CREATE\s+(?:OR\s+REPLACE\s+)?(?:TRANSIENT\s+)?TABLE\s+(?:IF\s+NOT\s+EXISTS\s+)?"
76
+ r"(?:(?P<dest_db>\w+)\.)?(?:(?P<dest_schema>\w+)\.)?(?P<dest_table>\w+)"
77
+ r".*?AS\s+SELECT\s+(?P<select_cols>.+?)\s+FROM\s+"
78
+ r"(?:(?P<src_db>\w+)\.)?(?:(?P<src_schema>\w+)\.)?(?P<src_table>\w+)",
79
+ re.IGNORECASE | re.DOTALL,
80
+ )
81
+
82
+ # Regex for INSERT INTO [db.][schema.]table SELECT ... FROM [db.][schema.]table
83
+ _INSERT_RE = re.compile(
84
+ r"INSERT\s+(?:INTO|OVERWRITE)\s+"
85
+ r"(?:(?P<dest_db>\w+)\.)?(?:(?P<dest_schema>\w+)\.)?(?P<dest_table>\w+)"
86
+ r".*?SELECT\s+(?P<select_cols>.+?)\s+FROM\s+"
87
+ r"(?:(?P<src_db>\w+)\.)?(?:(?P<src_schema>\w+)\.)?(?P<src_table>\w+)",
88
+ re.IGNORECASE | re.DOTALL,
89
+ )
90
+
91
+ # Regex for CREATE [OR REPLACE] VIEW [db.][schema.]view AS SELECT ... FROM ...
92
+ _CREATE_VIEW_RE = re.compile(
93
+ r"CREATE\s+(?:OR\s+REPLACE\s+)?(?:SECURE\s+)?VIEW\s+"
94
+ r"(?:(?P<dest_db>\w+)\.)?(?:(?P<dest_schema>\w+)\.)?(?P<dest_table>\w+)"
95
+ r".*?AS\s+SELECT\s+(?P<select_cols>.+?)\s+FROM\s+"
96
+ r"(?:(?P<src_db>\w+)\.)?(?:(?P<src_schema>\w+)\.)?(?P<src_table>\w+)",
97
+ re.IGNORECASE | re.DOTALL,
98
+ )
99
+
100
+ # Additional JOIN sources
101
+ _JOIN_RE = re.compile(
102
+ r"JOIN\s+(?:(?P<src_db>\w+)\.)?(?:(?P<src_schema>\w+)\.)?(?P<src_table>\w+)",
103
+ re.IGNORECASE,
104
+ )
105
+
106
+ # Simple column alias extraction from SELECT clause
107
+ _COL_RE = re.compile(r"(?:(\w+)\.)?(\w+)(?:\s+AS\s+(\w+))?", re.IGNORECASE)
108
+ _SQL_KEYWORDS = {
109
+ "FROM", "SELECT", "WHERE", "JOIN", "ON", "AS", "*", "AND", "OR",
110
+ "GROUP", "ORDER", "BY", "HAVING", "LIMIT", "DISTINCT", "CASE", "WHEN",
111
+ "THEN", "ELSE", "END", "NULL", "NOT", "IN", "IS", "BETWEEN",
112
+ }
113
+
114
+
115
+ @dataclass
116
+ class _LineageEdge:
117
+ dest_db: str
118
+ dest_schema: str
119
+ dest_table: str
120
+ sources: list[tuple[str, str, str]] = field(default_factory=list)
121
+ # col_mappings: (dest_col, src_table, src_col)
122
+ col_mappings: list[tuple[str, str, str]] = field(default_factory=list)
123
+
124
+
125
+ def _parse_select_cols(select_clause: str, src_table: str) -> list[tuple[str, str, str]]:
126
+ mappings = []
127
+ for m in _COL_RE.finditer(select_clause):
128
+ src_col = m.group(2)
129
+ dest_col = m.group(3) or src_col
130
+ if src_col.upper() in _SQL_KEYWORDS:
131
+ continue
132
+ mappings.append((dest_col, src_table, src_col))
133
+ return mappings
134
+
135
+
136
+ def _parse_edges(rows: list[dict]) -> list[_LineageEdge]:
137
+ """Parse QUERY_HISTORY rows into _LineageEdge objects."""
138
+ edges: dict[str, _LineageEdge] = {}
139
+
140
+ for row in rows:
141
+ query_text = row.get("QUERY_TEXT") or ""
142
+ default_db = (row.get("DATABASE_NAME") or "").lower()
143
+ sql_clean = re.sub(r"\s+", " ", query_text).strip()
144
+
145
+ for pattern in (_CTAS_RE, _INSERT_RE, _CREATE_VIEW_RE):
146
+ m = pattern.search(sql_clean)
147
+ if not m:
148
+ continue
149
+
150
+ dest_db = (m.group("dest_db") or default_db).lower()
151
+ dest_schema = (m.group("dest_schema") or "public").lower()
152
+ dest_table = m.group("dest_table").lower()
153
+ src_db = (m.group("src_db") or default_db).lower()
154
+ src_schema = (m.group("src_schema") or "public").lower()
155
+ src_table = m.group("src_table").lower()
156
+ select_cols = m.group("select_cols")
157
+
158
+ key = f"{dest_db}.{dest_schema}.{dest_table}"
159
+ if key not in edges:
160
+ edges[key] = _LineageEdge(
161
+ dest_db=dest_db, dest_schema=dest_schema, dest_table=dest_table
162
+ )
163
+
164
+ edge = edges[key]
165
+ src_triple = (src_db, src_schema, src_table)
166
+ if src_triple not in edge.sources:
167
+ edge.sources.append(src_triple)
168
+
169
+ for jm in _JOIN_RE.finditer(sql_clean):
170
+ jt = jm.group("src_table").lower()
171
+ jschema = (jm.group("src_schema") or src_schema).lower()
172
+ jdb = (jm.group("src_db") or src_db).lower()
173
+ jp = (jdb, jschema, jt)
174
+ if jp not in edge.sources:
175
+ edge.sources.append(jp)
176
+
177
+ edge.col_mappings.extend(_parse_select_cols(select_cols, src_table))
178
+ break
179
+
180
+ return list(edges.values())
181
+
182
+
183
+ def _fetch_query_history(conn, lookback_hours: int) -> list[dict]:
184
+ cursor = conn.cursor()
185
+ cursor.execute(
186
+ f"""
187
+ SELECT QUERY_ID, QUERY_TEXT, START_TIME, END_TIME, USER_NAME, DATABASE_NAME, EXECUTION_STATUS
188
+ FROM SNOWFLAKE.ACCOUNT_USAGE.QUERY_HISTORY
189
+ WHERE START_TIME >= DATEADD(hour, -{lookback_hours}, CURRENT_TIMESTAMP())
190
+ AND EXECUTION_STATUS = 'SUCCESS'
191
+ AND QUERY_TYPE IN ('CREATE_TABLE_AS_SELECT', 'INSERT', 'MERGE', 'CREATE_VIEW')
192
+ ORDER BY START_TIME
193
+ LIMIT 50000
194
+ """
195
+ # ← SUBSTITUTE: adjust QUERY_TYPE list, LIMIT, or add a WHERE clause to scope to specific databases
196
+ )
197
+ columns = [col[0] for col in cursor.description]
198
+ rows = []
199
+ while True:
200
+ batch = cursor.fetchmany(1000)
201
+ if not batch:
202
+ break
203
+ rows.extend(dict(zip(columns, row)) for row in batch)
204
+ cursor.close()
205
+ return rows
206
+
207
+
208
def collect(
    account: str,
    user: str,
    password: str,
    warehouse: str,
    lookback_hours: int = _LOOKBACK_HOURS,
    column_lineage: bool = False,
    output_file: str = "lineage_output.json",
) -> dict:
    """
    Connect to Snowflake, collect lineage edges, and write a JSON manifest.

    Args:
        account: Snowflake account identifier.
        user: Snowflake username.
        password: Snowflake password.
        warehouse: Snowflake virtual warehouse.
        lookback_hours: Hours of QUERY_HISTORY to scan.
        column_lineage: Recorded verbatim under the manifest's
            ``column_lineage`` key. The ``col_mappings`` of each edge are
            written regardless of this flag.
        output_file: Path of the JSON manifest to (over)write.

    Returns the manifest dict.
    """
    _check_available_memory()
    print(f"Connecting to Snowflake account: {account} ...")
    conn = snowflake.connector.connect(
        account=account,
        user=user,
        password=password,
        warehouse=warehouse,
    )

    # Close the connection even when the history query fails.
    try:
        print(f"Fetching QUERY_HISTORY for the last {lookback_hours} hour(s) ...")
        rows = _fetch_query_history(conn, lookback_hours)
    finally:
        conn.close()
    print(f" Retrieved {len(rows)} qualifying query/queries.")

    if rows:
        edges = _parse_edges(rows)
        print(f" Parsed {len(edges)} lineage edge(s).")
    else:
        print("No lineage queries found in the specified window.")
        edges = []

    # An empty window still produces a manifest, with an empty edge list.
    manifest = {
        "resource_type": RESOURCE_TYPE,
        "collected_at": datetime.now(tz=timezone.utc).isoformat(),
        "column_lineage": column_lineage,
        "edges": [
            {
                "destination": {
                    "database": e.dest_db,
                    "schema": e.dest_schema,
                    "table": e.dest_table,
                },
                "sources": [
                    {"database": sdb, "schema": sschema, "table": stbl}
                    for sdb, sschema, stbl in e.sources
                ],
                "col_mappings": [
                    {"dest_col": dc, "src_table": st, "src_col": sc}
                    for dc, st, sc in e.col_mappings
                ],
            }
            for e in edges
        ],
    }
    with open(output_file, "w", encoding="utf-8") as fh:
        json.dump(manifest, fh, indent=2)
    print(f"Lineage manifest written to {output_file}")

    return manifest
279
+
280
+
281
def main() -> None:
    """Command-line entry point: parse arguments and run ``collect()``.

    Each connection option falls back to its environment variable. If any of
    them is still empty after parsing, the parser reports an error and exits.
    """
    parser = argparse.ArgumentParser(
        description="Collect Snowflake lineage from ACCOUNT_USAGE and write to a manifest file",
    )

    # (flag, environment variable, help text) for the required connection options
    connection_options = (
        ("--account", "SNOWFLAKE_ACCOUNT", "Snowflake account identifier (env: SNOWFLAKE_ACCOUNT)"),
        ("--user", "SNOWFLAKE_USER", "Snowflake username (env: SNOWFLAKE_USER)"),
        ("--password", "SNOWFLAKE_PASSWORD", "Snowflake password (env: SNOWFLAKE_PASSWORD)"),
        ("--warehouse", "SNOWFLAKE_WAREHOUSE", "Snowflake virtual warehouse (env: SNOWFLAKE_WAREHOUSE)"),
    )
    for flag, env_var, help_text in connection_options:
        parser.add_argument(flag, default=os.environ.get(env_var), help=help_text)

    parser.add_argument(
        "--lookback-hours",
        type=int,
        default=_LOOKBACK_HOURS,
        help=f"Hours of QUERY_HISTORY to scan (default: {_LOOKBACK_HOURS})",
    )
    parser.add_argument(
        "--column-lineage",
        action="store_true",
        help="Include column-level lineage mappings in the manifest",
    )
    parser.add_argument(
        "--output-file",
        default="lineage_output.json",
        help="Path to write the lineage manifest (default: lineage_output.json)",
    )
    args = parser.parse_args()

    # argparse stores "--account" as attribute "account", and so on.
    missing = [
        flag
        for flag, _env_var, _help_text in connection_options
        if not getattr(args, flag.lstrip("-"))
    ]
    if missing:
        parser.error(f"Missing required arguments: {', '.join(missing)}")

    collect(
        account=args.account,
        user=args.user,
        password=args.password,
        warehouse=args.warehouse,
        lookback_hours=args.lookback_hours,
        column_lineage=args.column_lineage,
        output_file=args.output_file,
    )
    print("Done.")
346
+
347
+
348
+ if __name__ == "__main__":
349
+ main()
@@ -0,0 +1,329 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Collect table metadata from Snowflake — collection only.
4
+
5
+ Connects to Snowflake, discovers all accessible databases and schemas, then
6
+ queries INFORMATION_SCHEMA.TABLES for volume/freshness and
7
+ INFORMATION_SCHEMA.COLUMNS for field definitions. The collected assets are
8
+ written to a JSON manifest file.
9
+
10
+ Can be run standalone via CLI or imported (use the ``collect()`` function).
11
+
12
+ Substitution points
13
+ -------------------
14
+ - SNOWFLAKE_ACCOUNT (env) / --account (CLI) : Snowflake account identifier (e.g. xy12345.us-east-1)
15
+ - SNOWFLAKE_USER (env) / --user (CLI) : Snowflake username
16
+ - SNOWFLAKE_PASSWORD (env) / --password (CLI) : Snowflake password
17
+ - SNOWFLAKE_WAREHOUSE (env) / --warehouse (CLI) : Snowflake virtual warehouse
18
+
19
+ Prerequisites
20
+ -------------
21
+ pip install snowflake-connector-python
22
+
23
+ Usage
24
+ -----
25
+ python collect_metadata.py \\
26
+ --account <SNOWFLAKE_ACCOUNT> \\
27
+ --user <SNOWFLAKE_USER> \\
28
+ --password <SNOWFLAKE_PASSWORD> \\
29
+ --warehouse <SNOWFLAKE_WAREHOUSE>
30
+ """
31
+
32
+ import argparse
33
+ import json
34
+ import os
35
+ from datetime import datetime, timezone
36
+
37
+ import snowflake.connector
38
+
39
+ # ← SUBSTITUTE: set RESOURCE_TYPE to match your Monte Carlo connection type
40
+ RESOURCE_TYPE = "snowflake"
41
+
42
+
43
+ def _check_available_memory(min_gb: float = 2.0) -> None:
44
+ """Warn if available memory is below the threshold."""
45
+ try:
46
+ if hasattr(os, "sysconf"): # Linux / macOS
47
+ page_size = os.sysconf("SC_PAGE_SIZE")
48
+ avail_pages = os.sysconf("SC_AVPHYS_PAGES")
49
+ avail_gb = (page_size * avail_pages) / (1024 ** 3)
50
+ else:
51
+ return # Windows — skip check
52
+ except (ValueError, OSError):
53
+ return
54
+ if avail_gb < min_gb:
55
+ print(
56
+ f"WARNING: Only {avail_gb:.1f} GB of memory available "
57
+ f"(minimum recommended: {min_gb:.1f} GB). "
58
+ f"Consider reducing the lookback window or increasing available memory."
59
+ )
60
+
61
+ # Databases that are Snowflake system databases — skip them
62
+ _SKIP_DATABASES = {"SNOWFLAKE", "SNOWFLAKE_SAMPLE_DATA"}
63
+
64
+ # Schemas that are Snowflake system schemas — skip them
65
+ _SKIP_SCHEMAS = {"INFORMATION_SCHEMA"}
66
+
67
+
68
+ # Snowflake TABLE_TYPE → Monte Carlo RelationalAsset.type mapping.
69
+ # The MC API only accepts "TABLE" or "VIEW" (uppercase).
70
+ _TABLE_TYPE_MAP = {
71
+ "BASE TABLE": "TABLE",
72
+ "TABLE": "TABLE",
73
+ "DYNAMIC TABLE": "TABLE",
74
+ "EXTERNAL TABLE": "TABLE",
75
+ "VIEW": "VIEW",
76
+ "MATERIALIZED VIEW": "VIEW",
77
+ "SECURE VIEW": "VIEW",
78
+ }
79
+
80
+
81
+ def _normalize_table_type(raw_type: str | None) -> str:
82
+ """Map Snowflake's TABLE_TYPE value to MC-accepted 'TABLE' or 'VIEW'."""
83
+ if not raw_type:
84
+ return "TABLE"
85
+ return _TABLE_TYPE_MAP.get(raw_type.upper(), "TABLE")
86
+
87
+
88
def _connect(account: str, user: str, password: str, warehouse: str):
    """Open a Snowflake connection with the given credentials and return it."""
    # ← SUBSTITUTE: add role= or authenticator= kwargs if your org requires them
    connection_kwargs = {
        "account": account,
        "user": user,
        "password": password,
        "warehouse": warehouse,
    }
    return snowflake.connector.connect(**connection_kwargs)
96
+
97
+
98
def _fetch_all_rows(cursor, batch_size: int = 1000) -> list:
    """Drain the cursor's current result set in batches of ``batch_size``."""
    rows: list = []
    while True:
        chunk = cursor.fetchmany(batch_size)
        if not chunk:
            return rows
        rows.extend(chunk)


def _quote_identifier(name: str) -> str:
    """Return ``name`` as a double-quoted identifier, doubling embedded quotes."""
    return '"' + name.replace('"', '""') + '"'


def _collect_assets(conn) -> list[dict]:
    """Collect table metadata from Snowflake and return as a list of dicts.

    For every database that is not in ``_SKIP_DATABASES`` this lists the
    schemas, reads INFORMATION_SCHEMA.TABLES, and then reads
    INFORMATION_SCHEMA.COLUMNS once per schema that contains tables. A
    database or schema whose query fails is reported with a warning and
    skipped; collection continues with the next one.

    Args:
        conn: An open connection object exposing ``cursor()``.

    Returns:
        One dict per table/view with the keys ``type``, ``database``,
        ``schema``, ``name``, ``description``, ``fields``, ``volume`` and
        ``freshness``.
    """
    assets: list[dict] = []
    cursor = conn.cursor()
    try:
        # --- Discover databases ---
        cursor.execute("SHOW DATABASES")
        # SHOW DATABASES returns (created_on, name, …); column index 1 is the name
        databases = [
            row[1] for row in _fetch_all_rows(cursor) if row[1] not in _SKIP_DATABASES
        ]
        print(f" Found {len(databases)} database(s): {databases}")

        for db in databases:
            quoted_db = _quote_identifier(db)

            # --- Discover schemas in each database ---
            try:
                cursor.execute(f"SHOW SCHEMAS IN DATABASE {quoted_db}")
            except Exception as exc:
                print(f" WARNING: could not list schemas in {db}: {exc}")
                continue

            # Column index 1 is the schema name
            schemas = {
                row[1] for row in _fetch_all_rows(cursor) if row[1] not in _SKIP_SCHEMAS
            }

            # --- Collect tables, volume, and freshness via INFORMATION_SCHEMA ---
            try:
                cursor.execute(
                    f"""
                    SELECT
                        TABLE_CATALOG,
                        TABLE_SCHEMA,
                        TABLE_NAME,
                        TABLE_TYPE,
                        ROW_COUNT,
                        BYTES,
                        LAST_ALTERED,
                        COMMENT
                    FROM {quoted_db}.INFORMATION_SCHEMA.TABLES
                    WHERE TABLE_SCHEMA != 'INFORMATION_SCHEMA'
                    ORDER BY TABLE_SCHEMA, TABLE_NAME
                    """
                )
            except Exception as exc:
                print(f" WARNING: could not query INFORMATION_SCHEMA.TABLES in {db}: {exc}")
                continue

            table_rows = _fetch_all_rows(cursor)
            print(f" {db}: {len(table_rows)} table(s)")

            # Only schemas that actually contain tables need a COLUMNS query.
            schemas_with_tables = {row[1] for row in table_rows}

            # Pre-fetch all columns for this database in one query per schema
            columns_by_table: dict[tuple[str, str], list[dict]] = {}
            for schema in schemas_with_tables:
                if schema not in schemas:
                    continue  # respect the earlier schema skip list
                try:
                    cursor.execute(
                        f"""
                        SELECT TABLE_NAME, COLUMN_NAME, DATA_TYPE, COMMENT
                        FROM {quoted_db}.INFORMATION_SCHEMA.COLUMNS
                        WHERE TABLE_SCHEMA = %s
                        ORDER BY TABLE_NAME, ORDINAL_POSITION
                        """,
                        (schema,),
                    )
                except Exception as exc:
                    print(f" WARNING: could not fetch columns for {db}.{schema}: {exc}")
                    continue

                for table_name, col_name, data_type, col_comment in _fetch_all_rows(cursor):
                    columns_by_table.setdefault((schema, table_name), []).append(
                        {
                            "name": col_name,
                            "type": data_type,
                            "description": col_comment or None,
                        }
                    )

            # Build asset dicts
            for row in table_rows:
                (
                    tbl_catalog,
                    tbl_schema,
                    tbl_name,
                    tbl_type,
                    row_count,
                    byte_count,
                    last_altered,
                    tbl_comment,
                ) = row

                # Volume is emitted only when at least one size metric is present.
                volume = None
                if row_count is not None or byte_count is not None:
                    volume = {
                        "row_count": int(row_count) if row_count is not None else None,
                        "byte_count": int(byte_count) if byte_count is not None else None,
                    }

                freshness = None
                if last_altered is not None:
                    # Use isoformat() where the value offers it; otherwise fall back to str().
                    freshness = {
                        "last_update_time": (
                            last_altered.isoformat()
                            if hasattr(last_altered, "isoformat")
                            else str(last_altered)
                        ),
                    }

                fields = columns_by_table.get((tbl_schema, tbl_name), [])

                assets.append(
                    {
                        "type": _normalize_table_type(tbl_type),
                        "database": tbl_catalog,
                        "schema": tbl_schema,
                        "name": tbl_name,
                        "description": tbl_comment or None,
                        "fields": fields,
                        "volume": volume,
                        "freshness": freshness,
                    }
                )
                print(f" + {tbl_catalog}.{tbl_schema}.{tbl_name} ({len(fields)} columns)")
    finally:
        # Release the cursor even if an unexpected error aborts collection.
        cursor.close()

    return assets
239
+
240
+
241
def collect(
    account: str,
    user: str,
    password: str,
    warehouse: str,
    output_file: str = "metadata_output.json",
) -> dict:
    """
    Connect to Snowflake, collect table metadata, and write a JSON manifest.

    Args:
        account: Snowflake account identifier.
        user: Snowflake username.
        password: Snowflake password.
        warehouse: Snowflake virtual warehouse.
        output_file: Path of the JSON manifest to (over)write.

    Returns the manifest dict.
    """
    _check_available_memory()
    print(f"Connecting to Snowflake account: {account} ...")
    conn = _connect(account, user, password, warehouse)

    # Close the connection even when collection fails part-way through.
    try:
        print("Collecting table metadata ...")
        assets = _collect_assets(conn)
    finally:
        conn.close()
    print(f"\nCollected {len(assets)} table(s).")

    manifest = {
        "resource_type": RESOURCE_TYPE,
        "collected_at": datetime.now(tz=timezone.utc).isoformat(),
        "assets": assets,
    }
    with open(output_file, "w", encoding="utf-8") as fh:
        json.dump(manifest, fh, indent=2)
    print(f"Asset manifest written to {output_file}")

    return manifest
272
+
273
+
274
def main() -> None:
    """Command-line entry point: parse arguments and run ``collect()``.

    Each connection option falls back to its environment variable. If any of
    them is still empty after parsing, the parser reports an error and exits.
    """
    parser = argparse.ArgumentParser(
        description="Collect Snowflake table metadata and write to a manifest file",
    )

    # (flag, environment variable, help text) for the required connection options
    connection_options = (
        (
            "--account",
            "SNOWFLAKE_ACCOUNT",
            "Snowflake account identifier, e.g. xy12345.us-east-1 (env: SNOWFLAKE_ACCOUNT)",  # ← SUBSTITUTE
        ),
        (
            "--user",
            "SNOWFLAKE_USER",
            "Snowflake username (env: SNOWFLAKE_USER)",  # ← SUBSTITUTE
        ),
        (
            "--password",
            "SNOWFLAKE_PASSWORD",
            "Snowflake password (env: SNOWFLAKE_PASSWORD)",  # ← SUBSTITUTE
        ),
        (
            "--warehouse",
            "SNOWFLAKE_WAREHOUSE",
            "Snowflake virtual warehouse (env: SNOWFLAKE_WAREHOUSE)",  # ← SUBSTITUTE
        ),
    )
    for flag, env_var, help_text in connection_options:
        parser.add_argument(flag, default=os.environ.get(env_var), help=help_text)

    parser.add_argument(
        "--output-file",
        default="metadata_output.json",
        help="Path to write the output manifest (default: metadata_output.json)",
    )
    args = parser.parse_args()

    # argparse stores "--account" as attribute "account", and so on.
    missing = [
        flag
        for flag, _env_var, _help_text in connection_options
        if not getattr(args, flag.lstrip("-"))
    ]
    if missing:
        parser.error(f"Missing required arguments: {', '.join(missing)}")

    collect(
        account=args.account,
        user=args.user,
        password=args.password,
        warehouse=args.warehouse,
        output_file=args.output_file,
    )
    print("Done.")
326
+
327
+
328
+ if __name__ == "__main__":
329
+ main()