opencode-skills-collection 2.0.0 → 2.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90)
  1. package/bundled-skills/.antigravity-install-manifest.json +6 -1
  2. package/bundled-skills/docs/integrations/jetski-cortex.md +3 -3
  3. package/bundled-skills/docs/integrations/jetski-gemini-loader/README.md +1 -1
  4. package/bundled-skills/docs/maintainers/repo-growth-seo.md +3 -3
  5. package/bundled-skills/docs/maintainers/skills-update-guide.md +1 -1
  6. package/bundled-skills/docs/users/bundles.md +1 -1
  7. package/bundled-skills/docs/users/claude-code-skills.md +1 -1
  8. package/bundled-skills/docs/users/gemini-cli-skills.md +1 -1
  9. package/bundled-skills/docs/users/getting-started.md +1 -1
  10. package/bundled-skills/docs/users/kiro-integration.md +1 -1
  11. package/bundled-skills/docs/users/usage.md +4 -4
  12. package/bundled-skills/docs/users/visual-guide.md +4 -4
  13. package/bundled-skills/manage-skills/SKILL.md +187 -0
  14. package/bundled-skills/monte-carlo-monitor-creation/SKILL.md +222 -0
  15. package/bundled-skills/monte-carlo-monitor-creation/references/comparison-monitor.md +426 -0
  16. package/bundled-skills/monte-carlo-monitor-creation/references/custom-sql-monitor.md +207 -0
  17. package/bundled-skills/monte-carlo-monitor-creation/references/metric-monitor.md +292 -0
  18. package/bundled-skills/monte-carlo-monitor-creation/references/table-monitor.md +231 -0
  19. package/bundled-skills/monte-carlo-monitor-creation/references/validation-monitor.md +404 -0
  20. package/bundled-skills/monte-carlo-prevent/SKILL.md +252 -0
  21. package/bundled-skills/monte-carlo-prevent/references/TROUBLESHOOTING.md +23 -0
  22. package/bundled-skills/monte-carlo-prevent/references/parameters.md +32 -0
  23. package/bundled-skills/monte-carlo-prevent/references/workflows.md +478 -0
  24. package/bundled-skills/monte-carlo-push-ingestion/SKILL.md +363 -0
  25. package/bundled-skills/monte-carlo-push-ingestion/references/anomaly-detection.md +87 -0
  26. package/bundled-skills/monte-carlo-push-ingestion/references/custom-lineage.md +203 -0
  27. package/bundled-skills/monte-carlo-push-ingestion/references/direct-http-api.md +207 -0
  28. package/bundled-skills/monte-carlo-push-ingestion/references/prerequisites.md +150 -0
  29. package/bundled-skills/monte-carlo-push-ingestion/references/push-lineage.md +160 -0
  30. package/bundled-skills/monte-carlo-push-ingestion/references/push-metadata.md +158 -0
  31. package/bundled-skills/monte-carlo-push-ingestion/references/push-query-logs.md +219 -0
  32. package/bundled-skills/monte-carlo-push-ingestion/references/validation.md +257 -0
  33. package/bundled-skills/monte-carlo-push-ingestion/scripts/sample_verify.py +357 -0
  34. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_and_push_lineage.py +70 -0
  35. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_and_push_metadata.py +65 -0
  36. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_and_push_query_logs.py +70 -0
  37. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_lineage.py +214 -0
  38. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_metadata.py +160 -0
  39. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_query_logs.py +164 -0
  40. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/push_lineage.py +198 -0
  41. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/push_metadata.py +193 -0
  42. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/push_query_logs.py +207 -0
  43. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/collect_and_push_metadata.py +71 -0
  44. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/collect_and_push_query_logs.py +64 -0
  45. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/collect_metadata.py +253 -0
  46. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/collect_query_logs.py +149 -0
  47. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/push_metadata.py +190 -0
  48. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/push_query_logs.py +208 -0
  49. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_and_push_lineage.py +83 -0
  50. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_and_push_metadata.py +77 -0
  51. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_and_push_query_logs.py +83 -0
  52. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_lineage.py +240 -0
  53. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_metadata.py +212 -0
  54. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_query_logs.py +204 -0
  55. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/push_lineage.py +192 -0
  56. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/push_metadata.py +178 -0
  57. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/push_query_logs.py +200 -0
  58. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_and_push_lineage.py +119 -0
  59. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_and_push_metadata.py +119 -0
  60. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_and_push_query_logs.py +117 -0
  61. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_lineage.py +265 -0
  62. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_metadata.py +313 -0
  63. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_query_logs.py +284 -0
  64. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/push_lineage.py +309 -0
  65. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/push_metadata.py +245 -0
  66. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/push_query_logs.py +255 -0
  67. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_and_push_lineage.py +78 -0
  68. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_and_push_metadata.py +80 -0
  69. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_and_push_query_logs.py +88 -0
  70. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_lineage.py +235 -0
  71. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_metadata.py +219 -0
  72. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_query_logs.py +239 -0
  73. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/push_lineage.py +178 -0
  74. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/push_metadata.py +178 -0
  75. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/push_query_logs.py +196 -0
  76. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_and_push_lineage.py +154 -0
  77. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_and_push_metadata.py +137 -0
  78. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_and_push_query_logs.py +137 -0
  79. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_lineage.py +349 -0
  80. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_metadata.py +329 -0
  81. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_query_logs.py +254 -0
  82. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/push_lineage.py +307 -0
  83. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/push_metadata.py +228 -0
  84. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/push_query_logs.py +248 -0
  85. package/bundled-skills/monte-carlo-push-ingestion/scripts/test_template_sdk_usage.py +340 -0
  86. package/bundled-skills/monte-carlo-validation-notebook/SKILL.md +685 -0
  87. package/bundled-skills/monte-carlo-validation-notebook/scripts/generate_notebook_url.py +141 -0
  88. package/bundled-skills/monte-carlo-validation-notebook/scripts/resolve_dbt_schema.py +161 -0
  89. package/package.json +1 -1
  90. package/skills_index.json +503 -61
@@ -0,0 +1,212 @@
1
+ """
2
+ Databricks — Metadata Collection (collect-only)
3
+ =================================================
4
+ Collects table schemas, row counts, and byte sizes from Databricks Unity Catalog
5
+ using INFORMATION_SCHEMA and DESCRIBE DETAIL, then writes a JSON manifest file
6
+ that can be consumed by push_metadata.py.
7
+
8
+ Substitution points (search for "← SUBSTITUTE"):
9
+ - DATABRICKS_HOST : workspace hostname (e.g. adb-1234.azuredatabricks.net)
10
+ - DATABRICKS_HTTP_PATH : SQL warehouse HTTP path (e.g. /sql/1.0/warehouses/abc123)
11
+ - DATABRICKS_TOKEN : personal access token or service-principal secret
12
+ - DATABRICKS_CATALOG : catalog to collect from (default: "hive_metastore" or "main")
13
+ - SCHEMA_EXCLUSIONS : schemas to skip
14
+
15
+ Prerequisites:
16
+ pip install databricks-sql-connector
17
+ """
18
+
19
+ from __future__ import annotations
20
+
21
+ import argparse
22
+ import json
23
+ import logging
24
+ import os
25
+ from datetime import datetime, timezone
26
+ from typing import Any
27
+
28
+ from databricks import sql
29
+
30
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
31
+ log = logging.getLogger(__name__)
32
+
33
+ RESOURCE_TYPE = "databricks"
34
+
35
+ # Schemas to skip across all catalogs
36
+ SCHEMA_EXCLUSIONS: set[str] = { # ← SUBSTITUTE: add any internal schemas to skip
37
+ "information_schema",
38
+ "__databricks_internal",
39
+ }
40
+
41
+
42
+ def _check_available_memory(min_gb: float = 2.0) -> None:
43
+ """Warn if available memory is below the threshold."""
44
+ try:
45
+ if hasattr(os, "sysconf"): # Linux / macOS
46
+ page_size = os.sysconf("SC_PAGE_SIZE")
47
+ avail_pages = os.sysconf("SC_AVPHYS_PAGES")
48
+ avail_gb = (page_size * avail_pages) / (1024 ** 3)
49
+ else:
50
+ return # Windows — skip check
51
+ except (ValueError, OSError):
52
+ return
53
+ if avail_gb < min_gb:
54
+ log.warning(
55
+ "Only %.1f GB of memory available (minimum recommended: %.1f GB). "
56
+ "Consider reducing the collection scope or increasing available memory.",
57
+ avail_gb,
58
+ min_gb,
59
+ )
60
+
61
+
62
+ def _query(cursor: Any, sql_text: str, params: tuple | None = None) -> list[dict[str, Any]]:
63
+ cursor.execute(sql_text, params)
64
+ cols = [d[0] for d in cursor.description]
65
+ rows = []
66
+ while True:
67
+ chunk = cursor.fetchmany(1000)
68
+ if not chunk:
69
+ break
70
+ rows.extend(dict(zip(cols, row)) for row in chunk)
71
+ return rows
72
+
73
+
74
def collect_tables(cursor: Any, catalog: str) -> list[dict[str, Any]]:
    """List the tables and views of ``catalog`` from its information schema.

    Args:
        cursor: Open DB-API style cursor passed through to ``_query``.
        catalog: Raw (unquoted) catalog name; it is backtick-quoted here.

    Returns:
        One dict per row with the keys ``table_catalog``, ``table_schema``,
        ``table_name``, ``table_type`` and ``comment``, ordered by schema and
        then table name. Schemas listed in ``SCHEMA_EXCLUSIONS`` are omitted.
    """
    # Backtick-quote the catalog (doubling embedded backticks) so that names
    # needing delimiters, e.g. ones with hyphens, parse — the same quoting
    # collect_detail applies to its identifiers.
    quoted_catalog = "`" + catalog.replace("`", "``") + "`"
    # Sort for a stable SQL text; a set's iteration order is arbitrary.
    excluded = ", ".join(f"'{s}'" for s in sorted(SCHEMA_EXCLUSIONS))
    # An emptied exclusion set would otherwise render "NOT IN ()", which is
    # not valid SQL, so the filter is dropped entirely in that case.
    where_clause = f"WHERE table_schema NOT IN ({excluded})" if excluded else ""
    # NOTE(review): information_schema is queried per catalog — confirm the
    # catalog passed in (the CLI default is "hive_metastore") exposes it.
    return _query(
        cursor,
        f"""
        SELECT table_catalog, table_schema, table_name, table_type, comment
        FROM {quoted_catalog}.information_schema.tables
        {where_clause}
        ORDER BY table_schema, table_name
        """,  # ← SUBSTITUTE: add additional WHERE filters if needed
    )
84
+
85
+
86
def collect_columns(cursor: Any, catalog: str, schema: str, table: str) -> list[dict[str, Any]]:
    """Return the columns of one table, in ordinal position order.

    Args:
        cursor: Open DB-API style cursor passed through to ``_query``.
        catalog: Raw (unquoted) catalog name; it is backtick-quoted here.
        schema: Schema name, matched against ``table_schema``.
        table: Table name, matched against ``table_name``.

    Returns:
        One dict per column with the keys ``column_name``, ``data_type`` and
        ``comment``.
    """
    # Quote the catalog the same way collect_detail does so catalog names that
    # need delimiters (e.g. containing hyphens) work in both queries.
    quoted_catalog = "`" + catalog.replace("`", "``") + "`"
    # NOTE(review): schema and table are interpolated as string literals; a
    # name containing a single quote would break this statement — confirm
    # such names cannot occur, or switch to connector-level parameters.
    return _query(
        cursor,
        f"""
        SELECT column_name, data_type, comment
        FROM {quoted_catalog}.information_schema.columns
        WHERE table_schema = '{schema}' AND table_name = '{table}'
        ORDER BY ordinal_position
        """,
    )
96
+
97
+
98
def collect_detail(cursor: Any, catalog: str, schema: str, table: str) -> dict[str, Any] | None:
    """Return the first ``DESCRIBE DETAIL`` row for a table, or ``None``.

    This is deliberately best-effort: any exception raised while running the
    statement is logged at DEBUG level (with traceback) and reported as
    ``None`` so that one failing table does not abort the whole collection.
    """
    statement = f"DESCRIBE DETAIL `{catalog}`.`{schema}`.`{table}`"
    try:
        detail_rows = _query(cursor, statement)
    except Exception:
        log.debug("DESCRIBE DETAIL failed for %s.%s.%s", catalog, schema, table, exc_info=True)
        return None
    if detail_rows:
        return detail_rows[0]
    return None
105
+
106
+
107
def collect(
    host: str,
    http_path: str,
    token: str,
    catalog: str,
    manifest_path: str = "manifest_metadata.json",
) -> list[dict[str, Any]]:
    """Collect table metadata for one catalog and write it to a JSON manifest.

    For every row returned by ``collect_tables`` this gathers the column list
    (``collect_columns``) and the optional ``DESCRIBE DETAIL`` row
    (``collect_detail``), builds an asset dict, and finally dumps all assets
    to ``manifest_path``.

    Args:
        host: Workspace hostname passed as ``server_hostname``.
        http_path: SQL warehouse HTTP path.
        token: Access token passed as ``access_token``.
        catalog: Catalog to collect from; also stored as each asset's
            ``database``.
        manifest_path: Destination file for the manifest.

    Returns:
        The list of asset dicts stored under the manifest's ``"assets"`` key.
    """
    _check_available_memory(min_gb=2.0)
    collected_at = datetime.now(timezone.utc).isoformat()
    assets: list[dict[str, Any]] = []

    with sql.connect(
        server_hostname=host,  # ← SUBSTITUTE
        http_path=http_path,  # ← SUBSTITUTE
        access_token=token,  # ← SUBSTITUTE
    ) as conn, conn.cursor() as cursor:
        tables = collect_tables(cursor, catalog)
        log.info("Found %d tables in catalog %s", len(tables), catalog)

        for table_row in tables:
            schema = table_row["table_schema"]
            table_name = table_row["table_name"]

            fields = []
            for column in collect_columns(cursor, catalog, schema, table_name):
                fields.append(
                    {
                        "name": column["column_name"],
                        "type": column["data_type"].upper(),
                        "description": column.get("comment") or None,
                    }
                )

            # Size/freshness stats are optional: collect_detail returns None
            # when DESCRIBE DETAIL fails, and all three values then stay None.
            row_count: int | None = None
            byte_count: int | None = None
            last_updated: str | None = None
            detail = collect_detail(cursor, catalog, schema, table_name)
            if detail:
                # NOTE(review): confirm DESCRIBE DETAIL exposes a "numRows"
                # column; row_count stays None whenever the key is absent.
                row_count = detail.get("numRows")
                byte_count = detail.get("sizeInBytes")
                last_modified = detail.get("lastModified")
                if last_modified:
                    if hasattr(last_modified, "isoformat"):
                        last_updated = last_modified.isoformat()
                    else:
                        last_updated = str(last_modified)

            is_view = table_row.get("table_type", "").upper() == "VIEW"
            assets.append(
                {
                    "asset_name": table_name,
                    "database": catalog,  # ← SUBSTITUTE: use catalog as database
                    "schema": schema,
                    "asset_type": "VIEW" if is_view else "TABLE",
                    "description": table_row.get("comment") or None,
                    "fields": fields,
                    "row_count": row_count,
                    "byte_count": byte_count,
                    "last_updated": last_updated,
                }
            )
            log.info("Collected %s.%s.%s", catalog, schema, table_name)

    asset_count = len(assets)
    manifest = {
        "resource_type": RESOURCE_TYPE,
        "collected_at": collected_at,
        "catalog": catalog,
        "asset_count": asset_count,
        "assets": assets,
    }
    with open(manifest_path, "w") as fh:
        json.dump(manifest, fh, indent=2)
    log.info("Manifest written to %s (%d assets)", manifest_path, asset_count)

    return assets
186
+
187
+
188
def main() -> None:
    """Command-line entry point: parse options and run ``collect``.

    Connection options default to the DATABRICKS_* environment variables.
    The parser exits with an error when host, HTTP path or token is still
    ``None`` after parsing.
    """
    parser = argparse.ArgumentParser(description="Collect Databricks metadata to a manifest file")
    option_defaults = (
        ("--host", os.getenv("DATABRICKS_HOST")),  # ← SUBSTITUTE
        ("--http-path", os.getenv("DATABRICKS_HTTP_PATH")),  # ← SUBSTITUTE
        ("--token", os.getenv("DATABRICKS_TOKEN")),  # ← SUBSTITUTE
        ("--catalog", os.getenv("DATABRICKS_CATALOG", "hive_metastore")),
        ("--manifest", "manifest_metadata.json"),
    )
    for flag, default in option_defaults:
        parser.add_argument(flag, default=default)
    args = parser.parse_args()

    missing = []
    for name in ("host", "http_path", "token"):
        if getattr(args, name) is None:
            missing.append(name)
    if missing:
        parser.error(f"Missing required arguments/env vars: {missing}")

    collect(
        host=args.host,
        http_path=args.http_path,
        token=args.token,
        catalog=args.catalog,
        manifest_path=args.manifest,
    )
209
+
210
+
211
+ if __name__ == "__main__":
212
+ main()
@@ -0,0 +1,204 @@
1
+ """
2
+ Databricks — Query Log Collection (collect-only)
3
+ ==================================================
4
+ Collects finished query execution records from the Databricks system table
5
+ system.query.history and writes a JSON manifest file that can be consumed
6
+ by push_query_logs.py.
7
+
8
+ Substitution points (search for "← SUBSTITUTE"):
9
+ - DATABRICKS_HOST : workspace hostname
10
+ - DATABRICKS_HTTP_PATH : SQL warehouse HTTP path
11
+ - DATABRICKS_TOKEN : PAT or service-principal secret
12
+ - LOOKBACK_HOURS : hours back from [now - LAG_HOURS] to collect (default 25)
13
+ - LOOKBACK_LAG_HOURS : hours to lag behind now to avoid in-flight queries (default 1)
14
+ - MAX_ROWS : maximum query rows to collect per run (default 10000)
15
+
16
+ Prerequisites:
17
+ pip install databricks-sql-connector
18
+ """
19
+
20
+ from __future__ import annotations
21
+
22
+ import argparse
23
+ import json
24
+ import logging
25
+ import os
26
+ from datetime import datetime, timezone
27
+ from typing import Any
28
+
29
+ from databricks import sql
30
+
31
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
32
+ log = logging.getLogger(__name__)
33
+
34
+ LOG_TYPE = "databricks"
35
+
36
+ LOOKBACK_HOURS: int = int(os.getenv("LOOKBACK_HOURS", "25")) # ← SUBSTITUTE
37
+ LOOKBACK_LAG_HOURS: int = int(os.getenv("LOOKBACK_LAG_HOURS", "1")) # ← SUBSTITUTE
38
+ MAX_ROWS: int = int(os.getenv("MAX_ROWS", "10000")) # ← SUBSTITUTE
39
+
40
+ _QUERY_LOG_SQL = """\
41
+ SELECT
42
+ statement_id AS query_id,
43
+ statement_text AS query_text,
44
+ start_time,
45
+ end_time,
46
+ executed_by AS user_name,
47
+ produced_rows AS returned_rows,
48
+ total_task_duration_ms,
49
+ read_rows,
50
+ read_bytes
51
+ FROM system.query.history
52
+ WHERE start_time >= DATEADD(HOUR, -{lookback_hours}, NOW())
53
+ AND start_time < DATEADD(HOUR, -{lag_hours}, NOW())
54
+ AND status = 'FINISHED'
55
+ ORDER BY start_time
56
+ LIMIT {max_rows}
57
+ """ # ← SUBSTITUTE: adjust status filter or add warehouse_id filter as needed
58
+
59
+
60
+ def _check_available_memory(min_gb: float = 2.0) -> None:
61
+ """Warn if available memory is below the threshold."""
62
+ try:
63
+ if hasattr(os, "sysconf"): # Linux / macOS
64
+ page_size = os.sysconf("SC_PAGE_SIZE")
65
+ avail_pages = os.sysconf("SC_AVPHYS_PAGES")
66
+ avail_gb = (page_size * avail_pages) / (1024 ** 3)
67
+ else:
68
+ return # Windows — skip check
69
+ except (ValueError, OSError):
70
+ return
71
+ if avail_gb < min_gb:
72
+ log.warning(
73
+ "Only %.1f GB of memory available (minimum recommended: %.1f GB). "
74
+ "Consider reducing the collection scope or increasing available memory.",
75
+ avail_gb,
76
+ min_gb,
77
+ )
78
+
79
+
80
+ def _safe_isoformat(dt: Any) -> str | None:
81
+ if dt is None:
82
+ return None
83
+ if hasattr(dt, "isoformat"):
84
+ if dt.tzinfo is None:
85
+ dt = dt.replace(tzinfo=timezone.utc)
86
+ return dt.isoformat()
87
+ return str(dt)
88
+
89
+
90
+ def _query(cursor: Any, sql_text: str) -> list[dict[str, Any]]:
91
+ cursor.execute(sql_text)
92
+ cols = [d[0] for d in cursor.description]
93
+ rows = []
94
+ while True:
95
+ chunk = cursor.fetchmany(1000)
96
+ if not chunk:
97
+ break
98
+ rows.extend(dict(zip(cols, row)) for row in chunk)
99
+ return rows
100
+
101
+
102
def collect_query_logs(
    cursor: Any,
    lookback_hours: int,
    lag_hours: int,
    max_rows: int,
) -> list[dict[str, Any]]:
    """Fetch finished queries from ``system.query.history`` as manifest entries.

    The window covers ``lookback_hours`` hours ending ``lag_hours`` hours
    before now; at most ``max_rows`` rows are requested, oldest first.

    Args:
        cursor: Open DB-API style cursor passed through to ``_query``.
        lookback_hours: Length of the collection window in hours.
        lag_hours: Hours to stay behind the current time.
        max_rows: Value rendered into the ``LIMIT`` clause.

    Returns:
        One dict per row with non-blank query text, with timestamps rendered
        by ``_safe_isoformat``.

    Raises:
        ValueError: If any of the three numeric arguments is negative.
    """
    # The SQL template renders "-{hours}"; a negative value would produce
    # "--N", which SQL reads as the start of a line comment, and a negative
    # LIMIT is invalid as well. Reject such input before building the query.
    for label, value in (
        ("lookback_hours", lookback_hours),
        ("lag_hours", lag_hours),
        ("max_rows", max_rows),
    ):
        if value < 0:
            raise ValueError(f"{label} must be non-negative, got {value}")

    rendered_sql = _QUERY_LOG_SQL.format(
        lookback_hours=lookback_hours + lag_hours,  # offset from NOW() to cover the window
        lag_hours=lag_hours,
        max_rows=max_rows,
    )
    rows = _query(cursor, rendered_sql)
    log.info("Retrieved %d query log rows from system.query.history", len(rows))

    entries: list[dict[str, Any]] = []
    for row in rows:
        query_text: str = row.get("query_text") or ""
        if not query_text.strip():
            continue  # ← SUBSTITUTE: decide whether to skip empty-text rows

        entries.append(
            {
                "query_id": row.get("query_id"),
                "query_text": query_text,
                "start_time": _safe_isoformat(row.get("start_time")),
                "end_time": _safe_isoformat(row.get("end_time")),
                "user": row.get("user_name"),
                "returned_rows": row.get("returned_rows"),
                "total_task_duration_ms": row.get("total_task_duration_ms"),
                "read_rows": row.get("read_rows"),
                "read_bytes": row.get("read_bytes"),
            }
        )

    return entries
136
+
137
+
138
def collect(
    host: str,
    http_path: str,
    token: str,
    manifest_path: str = "manifest_query_logs.json",
    lookback_hours: int = LOOKBACK_HOURS,
    lookback_lag_hours: int = LOOKBACK_LAG_HOURS,
    max_rows: int = MAX_ROWS,
) -> list[dict[str, Any]]:
    """Collect query-log entries and write them to a JSON manifest.

    Args:
        host: Workspace hostname passed as ``server_hostname``.
        http_path: SQL warehouse HTTP path.
        token: Access token passed as ``access_token``.
        manifest_path: Destination file for the manifest.
        lookback_hours: Window length forwarded to ``collect_query_logs``.
        lookback_lag_hours: Lag forwarded to ``collect_query_logs``.
        max_rows: Row cap forwarded to ``collect_query_logs``.

    Returns:
        The entries stored under the manifest's ``"entries"`` key.
    """
    _check_available_memory(min_gb=2.0)
    collected_at = datetime.now(timezone.utc).isoformat()

    with sql.connect(
        server_hostname=host,  # ← SUBSTITUTE
        http_path=http_path,  # ← SUBSTITUTE
        access_token=token,  # ← SUBSTITUTE
    ) as conn, conn.cursor() as cursor:
        entries = collect_query_logs(cursor, lookback_hours, lookback_lag_hours, max_rows)

    entry_count = len(entries)
    log.info("Collected %d query log entries", entry_count)

    manifest = {
        "log_type": LOG_TYPE,
        "collected_at": collected_at,
        "lookback_hours": lookback_hours,
        "lookback_lag_hours": lookback_lag_hours,
        "query_log_count": entry_count,
        "entries": entries,
    }
    with open(manifest_path, "w") as fh:
        json.dump(manifest, fh, indent=2)
    log.info("Manifest written to %s (%d entries)", manifest_path, entry_count)

    return entries
174
+
175
+
176
def main() -> None:
    """Command-line entry point: parse options and run ``collect``.

    Connection options default to the DATABRICKS_* environment variables and
    the numeric options to the module-level defaults. The parser exits with
    an error when host, HTTP path or token is still ``None`` after parsing.
    """
    parser = argparse.ArgumentParser(description="Collect Databricks query logs to a manifest file")
    string_options = (
        ("--host", os.getenv("DATABRICKS_HOST")),  # ← SUBSTITUTE
        ("--http-path", os.getenv("DATABRICKS_HTTP_PATH")),  # ← SUBSTITUTE
        ("--token", os.getenv("DATABRICKS_TOKEN")),  # ← SUBSTITUTE
    )
    for flag, default in string_options:
        parser.add_argument(flag, default=default)
    integer_options = (
        ("--lookback-hours", LOOKBACK_HOURS),
        ("--lookback-lag-hours", LOOKBACK_LAG_HOURS),
        ("--max-rows", MAX_ROWS),
    )
    for flag, default in integer_options:
        parser.add_argument(flag, type=int, default=default)
    parser.add_argument("--manifest", default="manifest_query_logs.json")
    args = parser.parse_args()

    missing = []
    for name in ("host", "http_path", "token"):
        if getattr(args, name) is None:
            missing.append(name)
    if missing:
        parser.error(f"Missing required arguments/env vars: {missing}")

    collect(
        host=args.host,
        http_path=args.http_path,
        token=args.token,
        manifest_path=args.manifest,
        lookback_hours=args.lookback_hours,
        lookback_lag_hours=args.lookback_lag_hours,
        max_rows=args.max_rows,
    )
201
+
202
+
203
+ if __name__ == "__main__":
204
+ main()
@@ -0,0 +1,192 @@
1
+ """
2
+ Databricks — Lineage Push (push-only)
3
+ =======================================
4
+ Reads a JSON manifest file produced by collect_lineage.py and pushes the lineage
5
+ events to Monte Carlo via the push ingestion API, with configurable batching to
6
+ keep compressed payloads under 1 MB.
7
+
8
+ Substitution points (search for "← SUBSTITUTE"):
9
+ - MCD_INGEST_ID / MCD_INGEST_TOKEN : Monte Carlo API credentials
10
+ - MCD_RESOURCE_UUID : UUID of the Databricks connection in Monte Carlo
11
+ - PUSH_BATCH_SIZE : number of events per API call (default 500)
12
+
13
+ Prerequisites:
14
+ pip install pycarlo
15
+ """
16
+
17
+ from __future__ import annotations
18
+
19
+ import argparse
20
+ import json
21
+ import logging
22
+ import os
23
+ from concurrent.futures import ThreadPoolExecutor, as_completed
24
+ from datetime import datetime, timezone
25
+ from typing import Any
26
+
27
+ from pycarlo.core import Client, Session
28
+ from pycarlo.features.ingestion import IngestionService
29
+ from pycarlo.features.ingestion.models import (
30
+ ColumnLineageField,
31
+ ColumnLineageSourceField,
32
+ LineageAssetRef,
33
+ LineageEvent,
34
+ )
35
+
36
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
37
+ log = logging.getLogger(__name__)
38
+
39
+ RESOURCE_TYPE = "databricks"
40
+ DEFAULT_BATCH_SIZE = 500 # ← SUBSTITUTE: conservative default to stay under 1 MB compressed
41
+
42
+
43
def _ref_from_dict(d: dict[str, Any]) -> LineageAssetRef:
    """Build a ``LineageAssetRef`` from a manifest asset dict.

    ``database`` and ``schema`` fall back to an empty string when missing;
    ``asset_name`` is required. The asset id joins the three parts with a
    double underscore, and the type is always ``"TABLE"``.
    """
    db_name = d.get("database", "")
    schema_name = d.get("schema", "")
    asset_name = d["asset_name"]
    asset_id = f"{db_name}__{schema_name}__{asset_name}"
    return LineageAssetRef(
        type="TABLE",
        name=asset_name,
        database=db_name,
        schema=schema_name,
        asset_id=asset_id,
    )
54
+
55
+
56
def _event_from_dict(d: dict[str, Any]) -> LineageEvent:
    """Rebuild a ``LineageEvent`` from one manifest event dict.

    ``sources`` is optional and ``destination`` is required. When the dict
    carries a non-empty ``column_lineage`` list, each entry becomes a
    ``ColumnLineageField``; otherwise the event's ``fields`` is ``None``.
    """
    sources = [_ref_from_dict(src) for src in d.get("sources", [])]
    destination = _ref_from_dict(d["destination"])

    fields: list[ColumnLineageField] | None = None
    if d.get("column_lineage"):
        fields = []
        for mapping in d["column_lineage"]:
            # Source columns are identified by the same database__schema__name
            # asset id that _ref_from_dict produces for tables.
            source_columns = [
                ColumnLineageSourceField(
                    asset_id=(
                        f"{src.get('database', '')}__{src.get('schema', '')}__{src['asset_name']}"
                    ),
                    field_name=src["field"],
                )
                for src in mapping.get("sources", [])
            ]
            column_field = ColumnLineageField(
                name=mapping["destination_field"],
                source_fields=source_columns,
            )
            fields.append(column_field)

    return LineageEvent(sources=sources, destination=destination, fields=fields)
86
+
87
+
88
def push(
    manifest_path: str,
    resource_uuid: str,
    key_id: str,
    key_token: str,
    batch_size: int = DEFAULT_BATCH_SIZE,
) -> dict[str, Any]:
    """Read a collect manifest and push its lineage events in batches.

    Batches are sent from up to four worker threads, and a summary is written
    next to the manifest as ``<manifest without .json>_push_result.json``.

    Args:
        manifest_path: JSON manifest containing an ``"events"`` list.
        resource_uuid: UUID of the target resource.
        key_id: Ingestion key id.
        key_token: Ingestion key token.
        batch_size: Number of events per ``send_lineage`` call; must be >= 1.

    Returns:
        A summary dict with the invocation ids (one slot per batch, in batch
        order) and counts.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        Exception: Whatever a failing batch raised, after it has been logged.
    """
    # A zero step makes range() fail with an unrelated message and a negative
    # one yields no batches at all, i.e. nothing pushed without any error.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    with open(manifest_path, encoding="utf-8") as fh:
        manifest = json.load(fh)

    event_dicts: list[dict[str, Any]] = manifest["events"]
    events = [_event_from_dict(d) for d in event_dicts]
    log.info("Loaded %d lineage events from %s", len(events), manifest_path)

    # Split into batches
    batches = [events[i : i + batch_size] for i in range(0, len(events), batch_size)]
    total_batches = len(batches)

    def _push_batch(batch: list, batch_num: int) -> str | None:
        """Push one batch through a Client/Session created for this call."""
        log.info("Pushing batch %d/%d (%d events) ...", batch_num, total_batches, len(batch))
        client = Client(session=Session(mcd_id=key_id, mcd_token=key_token, scope="Ingestion"))
        service = IngestionService(mc_client=client)
        result = service.send_lineage(
            resource_uuid=resource_uuid,
            resource_type=RESOURCE_TYPE,
            events=batch,
        )
        invocation_id = service.extract_invocation_id(result)
        if invocation_id:
            log.info("Batch %d: invocation_id=%s", batch_num, invocation_id)
        return invocation_id

    # ThreadPoolExecutor rejects max_workers=0, which an empty manifest would
    # otherwise request; keep at least one worker so that case is a no-op.
    max_workers = max(1, min(4, total_batches))
    invocation_ids: list[str | None] = [None] * total_batches

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        futures = {
            pool.submit(_push_batch, batch, i + 1): i
            for i, batch in enumerate(batches)
        }
        for future in as_completed(futures):
            idx = futures[future]
            try:
                invocation_ids[idx] = future.result()
            except Exception as exc:
                log.error("ERROR pushing batch %d: %s", idx + 1, exc)
                raise

    log.info("All %d batches pushed (%d workers)", total_batches, max_workers)

    pushed_at = datetime.now(timezone.utc).isoformat()
    summary = {
        "resource_uuid": resource_uuid,
        "resource_type": RESOURCE_TYPE,
        "invocation_ids": invocation_ids,
        "pushed_at": pushed_at,
        "event_count": len(events),
        "batch_count": total_batches,
        "batch_size": batch_size,
        "lookback_days": manifest.get("lookback_days"),
        "table_lineage_events": manifest.get("table_lineage_events"),
        "column_lineage_events": manifest.get("column_lineage_events"),
    }

    # Strip only a trailing ".json". A plain str.replace would rewrite every
    # ".json" in the path and, when there is none, return the manifest path
    # itself — overwriting the input manifest with the summary.
    suffix = ".json"
    stem = manifest_path[: -len(suffix)] if manifest_path.endswith(suffix) else manifest_path
    push_manifest_path = f"{stem}_push_result.json"
    with open(push_manifest_path, "w") as fh:
        json.dump(summary, fh, indent=2)
    log.info("Push result written to %s", push_manifest_path)

    return summary
166
+
167
+
168
def main() -> None:
    """Command-line entry point: parse options and run ``push``.

    Credentials and the resource UUID default to the MCD_* environment
    variables. The parser exits with an error when any of them is still
    ``None`` after parsing.
    """
    parser = argparse.ArgumentParser(description="Push Databricks lineage to Monte Carlo from manifest")
    parser.add_argument("--manifest", default="manifest_lineage.json")
    env_backed_options = (
        ("--resource-uuid", "MCD_RESOURCE_UUID"),
        ("--key-id", "MCD_INGEST_ID"),
        ("--key-token", "MCD_INGEST_TOKEN"),
    )
    for flag, env_name in env_backed_options:
        parser.add_argument(flag, default=os.getenv(env_name))
    parser.add_argument("--batch-size", type=int, default=DEFAULT_BATCH_SIZE)
    args = parser.parse_args()

    missing = []
    for name in ("resource_uuid", "key_id", "key_token"):
        if getattr(args, name) is None:
            missing.append(name)
    if missing:
        parser.error(f"Missing required arguments/env vars: {missing}")

    push(
        manifest_path=args.manifest,
        resource_uuid=args.resource_uuid,
        key_id=args.key_id,
        key_token=args.key_token,
        batch_size=args.batch_size,
    )
189
+
190
+
191
+ if __name__ == "__main__":
192
+ main()