opencode-skills-collection 2.0.0-beta.3 → 2.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91) hide show
  1. package/README.md +1 -0
  2. package/bundled-skills/.antigravity-install-manifest.json +6 -1
  3. package/bundled-skills/docs/integrations/jetski-cortex.md +3 -3
  4. package/bundled-skills/docs/integrations/jetski-gemini-loader/README.md +1 -1
  5. package/bundled-skills/docs/maintainers/repo-growth-seo.md +3 -3
  6. package/bundled-skills/docs/maintainers/skills-update-guide.md +1 -1
  7. package/bundled-skills/docs/users/bundles.md +1 -1
  8. package/bundled-skills/docs/users/claude-code-skills.md +1 -1
  9. package/bundled-skills/docs/users/gemini-cli-skills.md +1 -1
  10. package/bundled-skills/docs/users/getting-started.md +1 -1
  11. package/bundled-skills/docs/users/kiro-integration.md +1 -1
  12. package/bundled-skills/docs/users/usage.md +4 -4
  13. package/bundled-skills/docs/users/visual-guide.md +4 -4
  14. package/bundled-skills/manage-skills/SKILL.md +187 -0
  15. package/bundled-skills/monte-carlo-monitor-creation/SKILL.md +222 -0
  16. package/bundled-skills/monte-carlo-monitor-creation/references/comparison-monitor.md +426 -0
  17. package/bundled-skills/monte-carlo-monitor-creation/references/custom-sql-monitor.md +207 -0
  18. package/bundled-skills/monte-carlo-monitor-creation/references/metric-monitor.md +292 -0
  19. package/bundled-skills/monte-carlo-monitor-creation/references/table-monitor.md +231 -0
  20. package/bundled-skills/monte-carlo-monitor-creation/references/validation-monitor.md +404 -0
  21. package/bundled-skills/monte-carlo-prevent/SKILL.md +252 -0
  22. package/bundled-skills/monte-carlo-prevent/references/TROUBLESHOOTING.md +23 -0
  23. package/bundled-skills/monte-carlo-prevent/references/parameters.md +32 -0
  24. package/bundled-skills/monte-carlo-prevent/references/workflows.md +478 -0
  25. package/bundled-skills/monte-carlo-push-ingestion/SKILL.md +363 -0
  26. package/bundled-skills/monte-carlo-push-ingestion/references/anomaly-detection.md +87 -0
  27. package/bundled-skills/monte-carlo-push-ingestion/references/custom-lineage.md +203 -0
  28. package/bundled-skills/monte-carlo-push-ingestion/references/direct-http-api.md +207 -0
  29. package/bundled-skills/monte-carlo-push-ingestion/references/prerequisites.md +150 -0
  30. package/bundled-skills/monte-carlo-push-ingestion/references/push-lineage.md +160 -0
  31. package/bundled-skills/monte-carlo-push-ingestion/references/push-metadata.md +158 -0
  32. package/bundled-skills/monte-carlo-push-ingestion/references/push-query-logs.md +219 -0
  33. package/bundled-skills/monte-carlo-push-ingestion/references/validation.md +257 -0
  34. package/bundled-skills/monte-carlo-push-ingestion/scripts/sample_verify.py +357 -0
  35. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_and_push_lineage.py +70 -0
  36. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_and_push_metadata.py +65 -0
  37. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_and_push_query_logs.py +70 -0
  38. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_lineage.py +214 -0
  39. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_metadata.py +160 -0
  40. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_query_logs.py +164 -0
  41. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/push_lineage.py +198 -0
  42. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/push_metadata.py +193 -0
  43. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/push_query_logs.py +207 -0
  44. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/collect_and_push_metadata.py +71 -0
  45. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/collect_and_push_query_logs.py +64 -0
  46. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/collect_metadata.py +253 -0
  47. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/collect_query_logs.py +149 -0
  48. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/push_metadata.py +190 -0
  49. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/push_query_logs.py +208 -0
  50. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_and_push_lineage.py +83 -0
  51. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_and_push_metadata.py +77 -0
  52. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_and_push_query_logs.py +83 -0
  53. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_lineage.py +240 -0
  54. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_metadata.py +212 -0
  55. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_query_logs.py +204 -0
  56. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/push_lineage.py +192 -0
  57. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/push_metadata.py +178 -0
  58. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/push_query_logs.py +200 -0
  59. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_and_push_lineage.py +119 -0
  60. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_and_push_metadata.py +119 -0
  61. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_and_push_query_logs.py +117 -0
  62. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_lineage.py +265 -0
  63. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_metadata.py +313 -0
  64. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_query_logs.py +284 -0
  65. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/push_lineage.py +309 -0
  66. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/push_metadata.py +245 -0
  67. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/push_query_logs.py +255 -0
  68. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_and_push_lineage.py +78 -0
  69. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_and_push_metadata.py +80 -0
  70. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_and_push_query_logs.py +88 -0
  71. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_lineage.py +235 -0
  72. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_metadata.py +219 -0
  73. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_query_logs.py +239 -0
  74. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/push_lineage.py +178 -0
  75. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/push_metadata.py +178 -0
  76. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/push_query_logs.py +196 -0
  77. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_and_push_lineage.py +154 -0
  78. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_and_push_metadata.py +137 -0
  79. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_and_push_query_logs.py +137 -0
  80. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_lineage.py +349 -0
  81. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_metadata.py +329 -0
  82. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_query_logs.py +254 -0
  83. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/push_lineage.py +307 -0
  84. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/push_metadata.py +228 -0
  85. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/push_query_logs.py +248 -0
  86. package/bundled-skills/monte-carlo-push-ingestion/scripts/test_template_sdk_usage.py +340 -0
  87. package/bundled-skills/monte-carlo-validation-notebook/SKILL.md +685 -0
  88. package/bundled-skills/monte-carlo-validation-notebook/scripts/generate_notebook_url.py +141 -0
  89. package/bundled-skills/monte-carlo-validation-notebook/scripts/resolve_dbt_schema.py +161 -0
  90. package/package.json +1 -1
  91. package/skills_index.json +503 -61
@@ -0,0 +1,208 @@
1
+ """
2
+ BigQuery Iceberg — Query Log Push (push only)
3
+ =============================================
4
+ Reads a JSON manifest produced by collect_query_logs.py and pushes query
5
+ log entries to Monte Carlo using the pycarlo SDK's IngestionService.
6
+
7
+ Uses dateutil.isoparse() to convert ISO8601 strings back to datetime
8
+ objects (QueryLogEntry requires datetime, not str).
9
+
10
+ Can be run standalone via CLI or imported (use the ``push()`` function).
11
+
12
+ Substitution points (search for "← SUBSTITUTE"):
13
+ - MCD_INGEST_ID : Monte Carlo Ingestion API key ID
14
+ - MCD_INGEST_TOKEN : Monte Carlo Ingestion API key token
15
+ - MCD_RESOURCE_UUID : Monte Carlo warehouse resource UUID
16
+
17
+ Prerequisites:
18
+ pip install "pycarlo>=0.12.251" "python-dateutil>=2.8.0"
19
+ """
20
+
21
+ from __future__ import annotations
22
+
23
+ import argparse
24
+ import json
25
+ import logging
26
+ import os
27
+ from concurrent.futures import ThreadPoolExecutor, as_completed
28
+ from datetime import datetime, timezone
29
+
30
+ from dateutil.parser import isoparse
31
+
32
+ from pycarlo.core import Client, Session
33
+ from pycarlo.features.ingestion import IngestionService
34
+ from pycarlo.features.ingestion.models import QueryLogEntry
35
+
36
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
37
+ log = logging.getLogger(__name__)
38
+
39
+ LOG_TYPE = "bigquery"
40
+
41
+ # Query logs include full SQL text — keep batches small to stay under the
42
+ # 1 MB compressed payload limit.
43
+ _BATCH_SIZE = 100
44
+
45
+ # Truncate very long SQL to prevent 413 errors.
46
+ _MAX_QUERY_TEXT_LEN = 10_000
47
+
48
+ _ENDPOINT = "https://integrations.getmontecarlo.com"
49
+
50
+
51
+ def _build_query_log_entries(queries: list[dict]) -> list[QueryLogEntry]:
52
+ """Convert manifest query dicts into QueryLogEntry objects."""
53
+ entries = []
54
+ truncated = 0
55
+ for q in queries:
56
+ query_text = q.get("query_text") or ""
57
+
58
+ if len(query_text) > _MAX_QUERY_TEXT_LEN:
59
+ query_text = query_text[:_MAX_QUERY_TEXT_LEN] + "... [TRUNCATED]"
60
+ truncated += 1
61
+
62
+ extra = {}
63
+ if q.get("total_bytes_billed") is not None:
64
+ extra["total_bytes_billed"] = q["total_bytes_billed"]
65
+ if q.get("statement_type") is not None:
66
+ extra["statement_type"] = q["statement_type"]
67
+
68
+ start_time = q.get("start_time")
69
+ end_time = q.get("end_time")
70
+
71
+ entry = QueryLogEntry(
72
+ query_id=q.get("query_id"),
73
+ query_text=query_text,
74
+ start_time=isoparse(start_time) if start_time else None,
75
+ end_time=isoparse(end_time) if end_time else None,
76
+ user=q.get("user"),
77
+ extra=extra or None,
78
+ )
79
+ entries.append(entry)
80
+
81
+ if truncated:
82
+ log.info("Truncated %d query text(s) exceeding %d chars", truncated, _MAX_QUERY_TEXT_LEN)
83
+ return entries
84
+
85
+
86
def _write_push_result(
    output_file: str,
    resource_uuid: str,
    log_type: str,
    invocation_ids: list,
    total_entries: int,
    batch_count: int,
    batch_size: int,
) -> dict:
    """Build the push summary, write it to ``output_file`` as JSON, and return it."""
    push_result = {
        "resource_uuid": resource_uuid,
        "log_type": log_type,
        "invocation_ids": invocation_ids,
        "pushed_at": datetime.now(timezone.utc).isoformat(),
        "total_entries": total_entries,
        "batch_count": batch_count,
        "batch_size": batch_size,
    }
    with open(output_file, "w", encoding="utf-8") as fh:
        json.dump(push_result, fh, indent=2)
    return push_result


def push(
    input_file: str,
    resource_uuid: str,
    key_id: str,
    key_token: str,
    batch_size: int = _BATCH_SIZE,
    output_file: str = "query_logs_push_result.json",
) -> dict:
    """Read a query log manifest and push its entries to Monte Carlo in batches.

    Args:
        input_file: Path of the JSON manifest; its ``queries`` list is pushed
            and its ``log_type`` (default ``LOG_TYPE``) is sent with each batch.
        resource_uuid: Monte Carlo resource UUID the logs belong to.
        key_id: Ingestion API key ID.
        key_token: Ingestion API key token.
        batch_size: Maximum number of entries per ``send_query_logs`` call.
        output_file: Path the JSON push summary is written to.

    Returns:
        The push summary dict that was written to ``output_file``
        (resource UUID, log type, per-batch invocation IDs in batch order,
        timestamp, entry count, batch count, batch size).

    Raises:
        ValueError: If there are entries to push and ``batch_size`` is < 1.
        Exception: Whatever a failed batch raised, re-raised after logging;
            no summary file is written in that case.
    """
    endpoint = _ENDPOINT
    log.info("Using endpoint: %s", endpoint)

    with open(input_file, encoding="utf-8") as fh:
        manifest = json.load(fh)

    queries = manifest.get("queries", [])
    log_type = manifest.get("log_type", LOG_TYPE)
    entries = _build_query_log_entries(queries)
    log.info("Loaded %d query log entry/entries from %s", len(entries), input_file)

    if not entries:
        log.info("No query log entries to push.")
        return _write_push_result(
            output_file, resource_uuid, log_type, [], 0, 0, batch_size,
        )

    if batch_size < 1:
        # Without this check a zero step makes range() fail and a negative one
        # yields zero batches and a zero-worker executor; both surface as a
        # ValueError with a message unrelated to the real cause.
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    batches = [entries[i : i + batch_size] for i in range(0, len(entries), batch_size)]
    total_batches = len(batches)

    def _push_batch(batch: list[QueryLogEntry], batch_num: int) -> str | None:
        # Each batch builds its own client/session, so worker threads share none.
        client = Client(session=Session(
            mcd_id=key_id, mcd_token=key_token, scope="Ingestion", endpoint=endpoint,
        ))
        service = IngestionService(mc_client=client)
        result = service.send_query_logs(
            resource_uuid=resource_uuid,
            log_type=log_type,
            events=batch,
        )
        invocation_id = service.extract_invocation_id(result)
        log.info(
            "Pushed batch %d/%d (%d entries) — invocation_id=%s",
            batch_num, total_batches, len(batch), invocation_id,
        )
        return invocation_id

    max_workers = min(4, total_batches)
    # Pre-sized so results land at their batch index regardless of finish order.
    invocation_ids: list[str | None] = [None] * total_batches

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        futures = {
            pool.submit(_push_batch, batch, i + 1): i
            for i, batch in enumerate(batches)
        }
        for future in as_completed(futures):
            idx = futures[future]
            try:
                invocation_ids[idx] = future.result()
            except Exception as exc:
                log.error("ERROR pushing batch %d: %s", idx + 1, exc)
                # The call is about to fail, so do not start batches that are
                # still queued; cancel() is a no-op for running/finished ones.
                for pending in futures:
                    pending.cancel()
                raise

    log.info("All %d batch(es) pushed.", total_batches)

    push_result = _write_push_result(
        output_file,
        resource_uuid,
        log_type,
        invocation_ids,
        len(entries),
        total_batches,
        batch_size,
    )
    log.info("Push result written to %s", output_file)

    return push_result
173
+
174
+
175
def main() -> None:
    """CLI entry point: parse arguments and push a query-log manifest.

    The resource UUID and credentials default to the ``MCD_RESOURCE_UUID``,
    ``MCD_INGEST_ID`` and ``MCD_INGEST_TOKEN`` environment variables. Exits
    through ``parser.error`` when any of them is missing or empty, or when
    ``--batch-size`` is not positive.
    """
    parser = argparse.ArgumentParser(
        description="Push BigQuery query logs from a manifest to Monte Carlo",
    )
    parser.add_argument("--resource-uuid", default=os.getenv("MCD_RESOURCE_UUID"))
    parser.add_argument("--key-id", default=os.getenv("MCD_INGEST_ID"))
    parser.add_argument("--key-token", default=os.getenv("MCD_INGEST_TOKEN"))
    parser.add_argument("--input-file", default="query_logs_output.json")
    parser.add_argument("--output-file", default="query_logs_push_result.json")
    parser.add_argument(
        "--batch-size",
        type=int,
        default=_BATCH_SIZE,
        help=f"Max entries per push batch (default: {_BATCH_SIZE})",
    )
    args = parser.parse_args()

    required = ["resource_uuid", "key_id", "key_token"]
    # An exported-but-empty variable (e.g. MCD_INGEST_ID="") is as unusable as
    # an unset one, so test truthiness rather than ``is None``.
    missing = [k for k in required if not getattr(args, k)]
    if missing:
        parser.error(f"Missing required arguments/env vars: {missing}")
    if args.batch_size < 1:
        parser.error("--batch-size must be a positive integer")

    push(
        input_file=args.input_file,
        resource_uuid=args.resource_uuid,
        key_id=args.key_id,
        key_token=args.key_token,
        batch_size=args.batch_size,
        output_file=args.output_file,
    )
205
+
206
+
207
+ if __name__ == "__main__":
208
+ main()
@@ -0,0 +1,83 @@
1
+ """
2
+ Databricks — Lineage Collect & Push (combined)
3
+ ================================================
4
+ Collects table-level and (optionally) column-level lineage from Databricks Unity
5
+ Catalog system tables, then pushes them to Monte Carlo via the push ingestion API.
6
+
7
+ This script imports and calls collect() from collect_lineage and push() from
8
+ push_lineage, running both in sequence.
9
+
10
+ Substitution points (search for "← SUBSTITUTE"):
11
+ - DATABRICKS_HOST : workspace hostname
12
+ - DATABRICKS_HTTP_PATH : SQL warehouse HTTP path
13
+ - DATABRICKS_TOKEN : PAT or service-principal secret
14
+ - LOOKBACK_DAYS : how many days back to collect lineage (default 30)
15
+ - MCD_INGEST_ID / MCD_INGEST_TOKEN : Monte Carlo API credentials
16
+ - MCD_RESOURCE_UUID : UUID of the Databricks connection in Monte Carlo
17
+ - PUSH_BATCH_SIZE : number of events per API call (default 500)
18
+
19
+ Use the --column-lineage flag to also push column-level lineage (disabled by default).
20
+
21
+ Prerequisites:
22
+ pip install databricks-sql-connector pycarlo
23
+ """
24
+
25
+ from __future__ import annotations
26
+
27
+ import argparse
28
+ import logging
29
+ import os
30
+
31
+ from collect_lineage import LOOKBACK_DAYS, collect
32
+ from push_lineage import DEFAULT_BATCH_SIZE, push
33
+
34
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
35
+ log = logging.getLogger(__name__)
36
+
37
+
38
+ def main() -> None:
39
+ parser = argparse.ArgumentParser(description="Collect and push Databricks lineage to Monte Carlo")
40
+ parser.add_argument("--host", default=os.getenv("DATABRICKS_HOST")) # ← SUBSTITUTE
41
+ parser.add_argument("--http-path", default=os.getenv("DATABRICKS_HTTP_PATH")) # ← SUBSTITUTE
42
+ parser.add_argument("--token", default=os.getenv("DATABRICKS_TOKEN")) # ← SUBSTITUTE
43
+ parser.add_argument("--resource-uuid", default=os.getenv("MCD_RESOURCE_UUID"))
44
+ parser.add_argument("--key-id", default=os.getenv("MCD_INGEST_ID"))
45
+ parser.add_argument("--key-token", default=os.getenv("MCD_INGEST_TOKEN"))
46
+ parser.add_argument("--lookback-days", type=int, default=LOOKBACK_DAYS)
47
+ parser.add_argument(
48
+ "--column-lineage", action="store_true",
49
+ help="Also collect column-level lineage (requires system.access.column_lineage access)",
50
+ )
51
+ parser.add_argument("--batch-size", type=int, default=DEFAULT_BATCH_SIZE)
52
+ parser.add_argument("--manifest", default="manifest_lineage.json")
53
+ args = parser.parse_args()
54
+
55
+ required = ["host", "http_path", "token", "resource_uuid", "key_id", "key_token"]
56
+ missing = [k for k in required if getattr(args, k) is None]
57
+ if missing:
58
+ parser.error(f"Missing required arguments/env vars: {missing}")
59
+
60
+ log.info("Step 1: Collecting lineage …")
61
+ collect(
62
+ host=args.host,
63
+ http_path=args.http_path,
64
+ token=args.token,
65
+ manifest_path=args.manifest,
66
+ include_column_lineage=args.column_lineage,
67
+ lookback_days=args.lookback_days,
68
+ )
69
+
70
+ log.info("Step 2: Pushing lineage to Monte Carlo …")
71
+ push(
72
+ manifest_path=args.manifest,
73
+ resource_uuid=args.resource_uuid,
74
+ key_id=args.key_id,
75
+ key_token=args.key_token,
76
+ batch_size=args.batch_size,
77
+ )
78
+
79
+ log.info("Done — collect and push complete.")
80
+
81
+
82
+ if __name__ == "__main__":
83
+ main()
@@ -0,0 +1,77 @@
1
+ """
2
+ Databricks — Metadata Collect & Push (combined)
3
+ =================================================
4
+ Collects table schemas, row counts, and byte sizes from Databricks Unity Catalog,
5
+ then pushes them to Monte Carlo via the push ingestion API.
6
+
7
+ This script imports and calls collect() from collect_metadata and push() from
8
+ push_metadata, running both in sequence.
9
+
10
+ Substitution points (search for "← SUBSTITUTE"):
11
+ - DATABRICKS_HOST : workspace hostname (e.g. adb-1234.azuredatabricks.net)
12
+ - DATABRICKS_HTTP_PATH : SQL warehouse HTTP path (e.g. /sql/1.0/warehouses/abc123)
13
+ - DATABRICKS_TOKEN : personal access token or service-principal secret
14
+ - DATABRICKS_CATALOG : catalog to collect from (default: "hive_metastore"; pass e.g. "main" to collect from another catalog)
15
+ - SCHEMA_EXCLUSIONS : schemas to skip
16
+ - MCD_INGEST_ID / MCD_INGEST_TOKEN : Monte Carlo API credentials
17
+ - MCD_RESOURCE_UUID : UUID of the Databricks connection in Monte Carlo
18
+ - PUSH_BATCH_SIZE : number of assets per API call (default 500)
19
+
20
+ Prerequisites:
21
+ pip install databricks-sql-connector pycarlo
22
+ """
23
+
24
+ from __future__ import annotations
25
+
26
+ import argparse
27
+ import logging
28
+ import os
29
+
30
+ from collect_metadata import collect
31
+ from push_metadata import DEFAULT_BATCH_SIZE, push
32
+
33
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
34
+ log = logging.getLogger(__name__)
35
+
36
+
37
def main() -> None:
    """CLI entry point: collect Databricks metadata, then push it to Monte Carlo.

    Connection settings and credentials default to the ``DATABRICKS_*`` and
    ``MCD_*`` environment variables; the catalog defaults to
    ``DATABRICKS_CATALOG`` or ``"hive_metastore"``. The manifest path given to
    ``collect`` is the same one handed to ``push``. Exits through
    ``parser.error`` when a required value is missing or empty.
    """
    parser = argparse.ArgumentParser(description="Collect and push Databricks metadata to Monte Carlo")
    parser.add_argument("--host", default=os.getenv("DATABRICKS_HOST"))  # ← SUBSTITUTE
    parser.add_argument("--http-path", default=os.getenv("DATABRICKS_HTTP_PATH"))  # ← SUBSTITUTE
    parser.add_argument("--token", default=os.getenv("DATABRICKS_TOKEN"))  # ← SUBSTITUTE
    parser.add_argument("--catalog", default=os.getenv("DATABRICKS_CATALOG", "hive_metastore"))
    parser.add_argument("--resource-uuid", default=os.getenv("MCD_RESOURCE_UUID"))
    parser.add_argument("--key-id", default=os.getenv("MCD_INGEST_ID"))
    parser.add_argument("--key-token", default=os.getenv("MCD_INGEST_TOKEN"))
    parser.add_argument("--batch-size", type=int, default=DEFAULT_BATCH_SIZE)
    parser.add_argument("--manifest", default="manifest_metadata.json")
    args = parser.parse_args()

    required = ["host", "http_path", "token", "resource_uuid", "key_id", "key_token"]
    # An exported-but-empty variable is as unusable as an unset one, so test
    # truthiness rather than ``is None``.
    missing = [k for k in required if not getattr(args, k)]
    if missing:
        parser.error(f"Missing required arguments/env vars: {missing}")

    log.info("Step 1: Collecting metadata …")
    collect(
        host=args.host,
        http_path=args.http_path,
        token=args.token,
        catalog=args.catalog,
        manifest_path=args.manifest,
    )

    log.info("Step 2: Pushing metadata to Monte Carlo …")
    push(
        manifest_path=args.manifest,
        resource_uuid=args.resource_uuid,
        key_id=args.key_id,
        key_token=args.key_token,
        batch_size=args.batch_size,
    )

    log.info("Done — collect and push complete.")
74
+
75
+
76
+ if __name__ == "__main__":
77
+ main()
@@ -0,0 +1,83 @@
1
+ """
2
+ Databricks — Query Log Collect & Push (combined)
3
+ ==================================================
4
+ Collects finished query execution records from the Databricks system table
5
+ system.query.history and pushes them to Monte Carlo for query-pattern analysis,
6
+ lineage derivation, and usage attribution.
7
+
8
+ This script imports and calls collect() from collect_query_logs and push() from
9
+ push_query_logs, running both in sequence.
10
+
11
+ Substitution points (search for "← SUBSTITUTE"):
12
+ - DATABRICKS_HOST : workspace hostname
13
+ - DATABRICKS_HTTP_PATH : SQL warehouse HTTP path
14
+ - DATABRICKS_TOKEN : PAT or service-principal secret
15
+ - LOOKBACK_HOURS : hours back from [now - LAG_HOURS] to collect (default 25)
16
+ - LOOKBACK_LAG_HOURS : hours to lag behind now to avoid in-flight queries (default 1)
17
+ - MAX_ROWS : maximum query rows to collect per run (default 10000)
18
+ - MCD_INGEST_ID / MCD_INGEST_TOKEN : Monte Carlo API credentials
19
+ - MCD_RESOURCE_UUID : UUID of the Databricks connection in Monte Carlo
20
+ - PUSH_BATCH_SIZE : number of entries per API call (default 250)
21
+
22
+ Prerequisites:
23
+ pip install databricks-sql-connector pycarlo
24
+ """
25
+
26
+ from __future__ import annotations
27
+
28
+ import argparse
29
+ import logging
30
+ import os
31
+
32
+ from collect_query_logs import LOOKBACK_HOURS, LOOKBACK_LAG_HOURS, MAX_ROWS, collect
33
+ from push_query_logs import DEFAULT_BATCH_SIZE, push
34
+
35
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
36
+ log = logging.getLogger(__name__)
37
+
38
+
39
def main() -> None:
    """CLI entry point: collect Databricks query logs, then push them to Monte Carlo.

    Connection settings and credentials default to the ``DATABRICKS_*`` and
    ``MCD_*`` environment variables; the collection window and row cap default
    to the constants imported from ``collect_query_logs``. The manifest path
    given to ``collect`` is the same one handed to ``push``. Exits through
    ``parser.error`` when a required value is missing or empty.
    """
    parser = argparse.ArgumentParser(description="Collect and push Databricks query logs to Monte Carlo")
    parser.add_argument("--host", default=os.getenv("DATABRICKS_HOST"))  # ← SUBSTITUTE
    parser.add_argument("--http-path", default=os.getenv("DATABRICKS_HTTP_PATH"))  # ← SUBSTITUTE
    parser.add_argument("--token", default=os.getenv("DATABRICKS_TOKEN"))  # ← SUBSTITUTE
    parser.add_argument("--resource-uuid", default=os.getenv("MCD_RESOURCE_UUID"))
    parser.add_argument("--key-id", default=os.getenv("MCD_INGEST_ID"))
    parser.add_argument("--key-token", default=os.getenv("MCD_INGEST_TOKEN"))
    parser.add_argument("--lookback-hours", type=int, default=LOOKBACK_HOURS)
    parser.add_argument("--lookback-lag-hours", type=int, default=LOOKBACK_LAG_HOURS)
    parser.add_argument("--max-rows", type=int, default=MAX_ROWS)
    parser.add_argument("--batch-size", type=int, default=DEFAULT_BATCH_SIZE)
    parser.add_argument("--manifest", default="manifest_query_logs.json")
    args = parser.parse_args()

    required = ["host", "http_path", "token", "resource_uuid", "key_id", "key_token"]
    # An exported-but-empty variable is as unusable as an unset one, so test
    # truthiness rather than ``is None``.
    missing = [k for k in required if not getattr(args, k)]
    if missing:
        parser.error(f"Missing required arguments/env vars: {missing}")

    log.info("Step 1: Collecting query logs …")
    collect(
        host=args.host,
        http_path=args.http_path,
        token=args.token,
        manifest_path=args.manifest,
        lookback_hours=args.lookback_hours,
        lookback_lag_hours=args.lookback_lag_hours,
        max_rows=args.max_rows,
    )

    log.info("Step 2: Pushing query logs to Monte Carlo …")
    push(
        manifest_path=args.manifest,
        resource_uuid=args.resource_uuid,
        key_id=args.key_id,
        key_token=args.key_token,
        batch_size=args.batch_size,
    )

    log.info("Done — collect and push complete.")
80
+
81
+
82
+ if __name__ == "__main__":
83
+ main()
@@ -0,0 +1,240 @@
1
+ """
2
+ Databricks — Lineage Collection (collect-only)
3
+ ================================================
4
+ Collects table-level and (optionally) column-level lineage from Databricks Unity
5
+ Catalog system tables (system.access.table_lineage and system.access.column_lineage).
6
+ No SQL parsing required — Databricks provides first-class lineage metadata.
7
+
8
+ Writes a JSON manifest file that can be consumed by push_lineage.py.
9
+
10
+ Substitution points (search for "← SUBSTITUTE"):
11
+ - DATABRICKS_HOST : workspace hostname
12
+ - DATABRICKS_HTTP_PATH : SQL warehouse HTTP path
13
+ - DATABRICKS_TOKEN : PAT or service-principal secret
14
+ - LOOKBACK_DAYS : how many days back to collect lineage (default 30)
15
+
16
+ Use the --column-lineage flag to also collect column-level lineage (disabled by default).
17
+
18
+ Prerequisites:
19
+ pip install databricks-sql-connector
20
+ """
21
+
22
+ from __future__ import annotations
23
+
24
+ import argparse
25
+ import json
26
+ import logging
27
+ import os
28
+ from datetime import datetime, timezone
29
+ from typing import Any
30
+
31
+ from databricks import sql
32
+
33
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
34
+ log = logging.getLogger(__name__)
35
+
36
+ RESOURCE_TYPE = "databricks"
37
+ LOOKBACK_DAYS: int = int(os.getenv("LOOKBACK_DAYS", "30")) # ← SUBSTITUTE
38
+
39
+
40
+ def _check_available_memory(min_gb: float = 2.0) -> None:
41
+ """Warn if available memory is below the threshold."""
42
+ try:
43
+ if hasattr(os, "sysconf"): # Linux / macOS
44
+ page_size = os.sysconf("SC_PAGE_SIZE")
45
+ avail_pages = os.sysconf("SC_AVPHYS_PAGES")
46
+ avail_gb = (page_size * avail_pages) / (1024 ** 3)
47
+ else:
48
+ return # Windows — skip check
49
+ except (ValueError, OSError):
50
+ return
51
+ if avail_gb < min_gb:
52
+ log.warning(
53
+ "Only %.1f GB of memory available (minimum recommended: %.1f GB). "
54
+ "Consider reducing the collection scope or increasing available memory.",
55
+ avail_gb,
56
+ min_gb,
57
+ )
58
+
59
+
60
+ def _query(cursor: Any, sql_text: str) -> list[dict[str, Any]]:
61
+ cursor.execute(sql_text)
62
+ cols = [d[0] for d in cursor.description]
63
+ rows = []
64
+ while True:
65
+ chunk = cursor.fetchmany(1000)
66
+ if not chunk:
67
+ break
68
+ rows.extend(dict(zip(cols, row)) for row in chunk)
69
+ return rows
70
+
71
+
72
+ def _parse_full_name(full_name: str) -> tuple[str, str, str]:
73
+ """Split 'catalog.schema.table' into (catalog, schema, table)."""
74
+ parts = (full_name or "").split(".")
75
+ if len(parts) == 3:
76
+ return parts[0], parts[1], parts[2]
77
+ if len(parts) == 2:
78
+ return "", parts[0], parts[1]
79
+ return "", "", full_name
80
+
81
+
82
def collect_table_lineage(cursor: Any, lookback_days: int) -> list[dict[str, Any]]:
    """Collect table-level lineage events from ``system.access.table_lineage``.

    Args:
        cursor: Open cursor the query is executed on.
        lookback_days: Only rows whose ``event_time`` falls within this many
            days before the current timestamp are read.

    Returns:
        One event dict per distinct source/destination table pair, each with
        a single-element ``sources`` list, a ``destination`` and
        ``lineage_type`` set to ``"table"``. Rows whose source or destination
        name yields an empty table component are skipped.
    """
    rows = _query(
        cursor,
        f"""
        SELECT DISTINCT
            source_table_full_name,
            target_table_full_name,
            created_by,
            MAX(event_time) AS last_seen
        FROM system.access.table_lineage
        WHERE event_time >= DATEADD(DAY, -{lookback_days}, CURRENT_TIMESTAMP())
          AND source_table_full_name IS NOT NULL
          AND target_table_full_name IS NOT NULL
        GROUP BY source_table_full_name, target_table_full_name, created_by
        LIMIT 50000
        """,  # ← SUBSTITUTE: adjust lookback_days, LIMIT, or add catalog/schema filters
    )

    events: list[dict[str, Any]] = []
    # The query groups by created_by as well, so the same source -> target
    # pair comes back once per creator; emit each edge only once.
    seen_edges: set[tuple[str, str, str, str, str, str]] = set()
    for row in rows:
        src_catalog, src_schema, src_table = _parse_full_name(row["source_table_full_name"])
        dst_catalog, dst_schema, dst_table = _parse_full_name(row["target_table_full_name"])

        if not src_table or not dst_table:
            continue

        edge = (src_catalog, src_schema, src_table, dst_catalog, dst_schema, dst_table)
        if edge in seen_edges:
            continue
        seen_edges.add(edge)

        events.append({
            "sources": [{"database": src_catalog, "schema": src_schema, "asset_name": src_table}],
            "destination": {"database": dst_catalog, "schema": dst_schema, "asset_name": dst_table},
            "lineage_type": "table",
        })
    return events
114
+
115
+
116
def collect_column_lineage(cursor: Any, lookback_days: int) -> list[dict[str, Any]]:
    """Collect column-level lineage, one event per destination table.

    Reads distinct source/target column pairs from
    ``system.access.column_lineage`` whose ``event_time`` falls within the
    last ``lookback_days`` days (capped at 50000 rows), buckets the rows by
    ``target_table_full_name``, and emits one event per destination whose
    ``column_lineage`` list holds one entry per row. Destinations whose name
    yields an empty table component are skipped.
    """
    lineage_sql = f"""
        SELECT DISTINCT
            source_table_full_name,
            source_column_name,
            target_table_full_name,
            target_column_name
        FROM system.access.column_lineage
        WHERE event_time >= DATEADD(DAY, -{lookback_days}, CURRENT_TIMESTAMP())
          AND source_table_full_name IS NOT NULL
          AND target_table_full_name IS NOT NULL
        LIMIT 50000
        """  # ← SUBSTITUTE: adjust LIMIT or add catalog/schema filters if needed
    fetched = _query(cursor, lineage_sql)

    # Bucket rows by destination table; dict order keeps first-seen order.
    rows_by_destination: dict[str, list[dict[str, Any]]] = {}
    for record in fetched:
        rows_by_destination.setdefault(record["target_table_full_name"], []).append(record)

    def _field_mapping(record: dict[str, Any]) -> dict[str, Any]:
        # One destination column together with the single source column feeding it.
        catalog, schema, table = _parse_full_name(record["source_table_full_name"])
        return {
            "destination_field": record["target_column_name"],
            "sources": [{
                "database": catalog,
                "schema": schema,
                "asset_name": table,
                "field": record["source_column_name"],
            }],
        }

    events: list[dict[str, Any]] = []
    for destination, members in rows_by_destination.items():
        dst_catalog, dst_schema, dst_table = _parse_full_name(destination)
        if dst_table:
            events.append({
                "sources": [],  # column lineage carries source refs inside column_lineage
                "destination": {"database": dst_catalog, "schema": dst_schema, "asset_name": dst_table},
                "column_lineage": [_field_mapping(member) for member in members],
                "lineage_type": "column",
            })
    return events
167
+
168
+
169
def collect(
    host: str,
    http_path: str,
    token: str,
    manifest_path: str = "manifest_lineage.json",
    include_column_lineage: bool = False,
    lookback_days: int = LOOKBACK_DAYS,
) -> list[dict[str, Any]]:
    """Collect Databricks lineage, write it to a JSON manifest, and return it.

    Table lineage is always collected; column lineage only when
    ``include_column_lineage`` is true. The manifest written to
    ``manifest_path`` records the resource type, the collection timestamp
    (taken before querying), the lookback window, per-kind event counts and
    the events themselves. The returned list is table events followed by
    column events.
    """
    _check_available_memory(min_gb=2.0)
    collected_at = datetime.now(timezone.utc).isoformat()

    connection = sql.connect(
        server_hostname=host,  # ← SUBSTITUTE
        http_path=http_path,  # ← SUBSTITUTE
        access_token=token,  # ← SUBSTITUTE
    )
    with connection as conn, conn.cursor() as cursor:
        table_events = collect_table_lineage(cursor, lookback_days)
        if include_column_lineage:
            col_events = collect_column_lineage(cursor, lookback_days)
        else:
            col_events = []

    all_events = table_events + col_events
    table_count = len(table_events)
    column_count = len(col_events)
    log.info(
        "Collected %d lineage events (%d table, %d column)",
        len(all_events), table_count, column_count,
    )

    with open(manifest_path, "w") as fh:
        json.dump(
            {
                "resource_type": RESOURCE_TYPE,
                "collected_at": collected_at,
                "lookback_days": lookback_days,
                "table_lineage_events": table_count,
                "column_lineage_events": column_count,
                "events": all_events,
            },
            fh,
            indent=2,
        )
    log.info("Manifest written to %s (%d events)", manifest_path, len(all_events))

    return all_events
209
+
210
+
211
def main() -> None:
    """CLI entry point: collect Databricks lineage into a manifest file.

    Connection settings default to the ``DATABRICKS_HOST``,
    ``DATABRICKS_HTTP_PATH`` and ``DATABRICKS_TOKEN`` environment variables.
    Exits through ``parser.error`` when any of them is missing or empty.
    """
    parser = argparse.ArgumentParser(description="Collect Databricks lineage to a manifest file")
    parser.add_argument("--host", default=os.getenv("DATABRICKS_HOST"))  # ← SUBSTITUTE
    parser.add_argument("--http-path", default=os.getenv("DATABRICKS_HTTP_PATH"))  # ← SUBSTITUTE
    parser.add_argument("--token", default=os.getenv("DATABRICKS_TOKEN"))  # ← SUBSTITUTE
    parser.add_argument("--lookback-days", type=int, default=LOOKBACK_DAYS)
    parser.add_argument(
        "--column-lineage", action="store_true",
        help="Also collect column-level lineage (requires system.access.column_lineage access)",
    )
    parser.add_argument("--manifest", default="manifest_lineage.json")
    args = parser.parse_args()

    required = ["host", "http_path", "token"]
    # An exported-but-empty variable is as unusable as an unset one, so test
    # truthiness rather than ``is None``.
    missing = [k for k in required if not getattr(args, k)]
    if missing:
        parser.error(f"Missing required arguments/env vars: {missing}")

    collect(
        host=args.host,
        http_path=args.http_path,
        token=args.token,
        manifest_path=args.manifest,
        include_column_lineage=args.column_lineage,
        lookback_days=args.lookback_days,
    )
237
+
238
+
239
+ if __name__ == "__main__":
240
+ main()