opencode-skills-collection 2.0.0 → 2.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90) hide show
  1. package/bundled-skills/.antigravity-install-manifest.json +6 -1
  2. package/bundled-skills/docs/integrations/jetski-cortex.md +3 -3
  3. package/bundled-skills/docs/integrations/jetski-gemini-loader/README.md +1 -1
  4. package/bundled-skills/docs/maintainers/repo-growth-seo.md +3 -3
  5. package/bundled-skills/docs/maintainers/skills-update-guide.md +1 -1
  6. package/bundled-skills/docs/users/bundles.md +1 -1
  7. package/bundled-skills/docs/users/claude-code-skills.md +1 -1
  8. package/bundled-skills/docs/users/gemini-cli-skills.md +1 -1
  9. package/bundled-skills/docs/users/getting-started.md +1 -1
  10. package/bundled-skills/docs/users/kiro-integration.md +1 -1
  11. package/bundled-skills/docs/users/usage.md +4 -4
  12. package/bundled-skills/docs/users/visual-guide.md +4 -4
  13. package/bundled-skills/manage-skills/SKILL.md +187 -0
  14. package/bundled-skills/monte-carlo-monitor-creation/SKILL.md +222 -0
  15. package/bundled-skills/monte-carlo-monitor-creation/references/comparison-monitor.md +426 -0
  16. package/bundled-skills/monte-carlo-monitor-creation/references/custom-sql-monitor.md +207 -0
  17. package/bundled-skills/monte-carlo-monitor-creation/references/metric-monitor.md +292 -0
  18. package/bundled-skills/monte-carlo-monitor-creation/references/table-monitor.md +231 -0
  19. package/bundled-skills/monte-carlo-monitor-creation/references/validation-monitor.md +404 -0
  20. package/bundled-skills/monte-carlo-prevent/SKILL.md +252 -0
  21. package/bundled-skills/monte-carlo-prevent/references/TROUBLESHOOTING.md +23 -0
  22. package/bundled-skills/monte-carlo-prevent/references/parameters.md +32 -0
  23. package/bundled-skills/monte-carlo-prevent/references/workflows.md +478 -0
  24. package/bundled-skills/monte-carlo-push-ingestion/SKILL.md +363 -0
  25. package/bundled-skills/monte-carlo-push-ingestion/references/anomaly-detection.md +87 -0
  26. package/bundled-skills/monte-carlo-push-ingestion/references/custom-lineage.md +203 -0
  27. package/bundled-skills/monte-carlo-push-ingestion/references/direct-http-api.md +207 -0
  28. package/bundled-skills/monte-carlo-push-ingestion/references/prerequisites.md +150 -0
  29. package/bundled-skills/monte-carlo-push-ingestion/references/push-lineage.md +160 -0
  30. package/bundled-skills/monte-carlo-push-ingestion/references/push-metadata.md +158 -0
  31. package/bundled-skills/monte-carlo-push-ingestion/references/push-query-logs.md +219 -0
  32. package/bundled-skills/monte-carlo-push-ingestion/references/validation.md +257 -0
  33. package/bundled-skills/monte-carlo-push-ingestion/scripts/sample_verify.py +357 -0
  34. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_and_push_lineage.py +70 -0
  35. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_and_push_metadata.py +65 -0
  36. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_and_push_query_logs.py +70 -0
  37. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_lineage.py +214 -0
  38. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_metadata.py +160 -0
  39. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_query_logs.py +164 -0
  40. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/push_lineage.py +198 -0
  41. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/push_metadata.py +193 -0
  42. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/push_query_logs.py +207 -0
  43. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/collect_and_push_metadata.py +71 -0
  44. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/collect_and_push_query_logs.py +64 -0
  45. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/collect_metadata.py +253 -0
  46. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/collect_query_logs.py +149 -0
  47. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/push_metadata.py +190 -0
  48. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/push_query_logs.py +208 -0
  49. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_and_push_lineage.py +83 -0
  50. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_and_push_metadata.py +77 -0
  51. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_and_push_query_logs.py +83 -0
  52. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_lineage.py +240 -0
  53. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_metadata.py +212 -0
  54. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_query_logs.py +204 -0
  55. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/push_lineage.py +192 -0
  56. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/push_metadata.py +178 -0
  57. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/push_query_logs.py +200 -0
  58. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_and_push_lineage.py +119 -0
  59. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_and_push_metadata.py +119 -0
  60. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_and_push_query_logs.py +117 -0
  61. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_lineage.py +265 -0
  62. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_metadata.py +313 -0
  63. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_query_logs.py +284 -0
  64. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/push_lineage.py +309 -0
  65. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/push_metadata.py +245 -0
  66. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/push_query_logs.py +255 -0
  67. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_and_push_lineage.py +78 -0
  68. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_and_push_metadata.py +80 -0
  69. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_and_push_query_logs.py +88 -0
  70. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_lineage.py +235 -0
  71. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_metadata.py +219 -0
  72. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_query_logs.py +239 -0
  73. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/push_lineage.py +178 -0
  74. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/push_metadata.py +178 -0
  75. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/push_query_logs.py +196 -0
  76. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_and_push_lineage.py +154 -0
  77. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_and_push_metadata.py +137 -0
  78. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_and_push_query_logs.py +137 -0
  79. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_lineage.py +349 -0
  80. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_metadata.py +329 -0
  81. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_query_logs.py +254 -0
  82. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/push_lineage.py +307 -0
  83. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/push_metadata.py +228 -0
  84. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/push_query_logs.py +248 -0
  85. package/bundled-skills/monte-carlo-push-ingestion/scripts/test_template_sdk_usage.py +340 -0
  86. package/bundled-skills/monte-carlo-validation-notebook/SKILL.md +685 -0
  87. package/bundled-skills/monte-carlo-validation-notebook/scripts/generate_notebook_url.py +141 -0
  88. package/bundled-skills/monte-carlo-validation-notebook/scripts/resolve_dbt_schema.py +161 -0
  89. package/package.json +1 -1
  90. package/skills_index.json +503 -61
@@ -0,0 +1,178 @@
1
+ """
2
+ Databricks — Metadata Push (push-only)
3
+ ========================================
4
+ Reads a JSON manifest file produced by collect_metadata.py and pushes the assets
5
+ to Monte Carlo via the push ingestion API, with configurable batching to keep
6
+ compressed payloads under 1 MB.
7
+
8
+ Substitution points (search for "← SUBSTITUTE"):
9
+ - MCD_INGEST_ID / MCD_INGEST_TOKEN : Monte Carlo API credentials
10
+ - MCD_RESOURCE_UUID : UUID of the Databricks connection in Monte Carlo
11
+ - DEFAULT_BATCH_SIZE / --batch-size : number of assets per API call (default 500)
12
+
13
+ Prerequisites:
14
+ pip install pycarlo
15
+ """
16
+
17
+ from __future__ import annotations
18
+
19
+ import argparse
20
+ import json
21
+ import logging
22
+ import os
23
+ from concurrent.futures import ThreadPoolExecutor, as_completed
24
+ from datetime import datetime, timezone
25
+ from typing import Any
26
+
27
+ from pycarlo.core import Client, Session
28
+ from pycarlo.features.ingestion import IngestionService
29
+ from pycarlo.features.ingestion.models import (
30
+ AssetField,
31
+ AssetFreshness,
32
+ AssetMetadata,
33
+ AssetVolume,
34
+ RelationalAsset,
35
+ )
36
+
37
# Module-wide logging setup: timestamped INFO-level records.
logging.basicConfig(format="%(asctime)s %(levelname)s %(message)s", level=logging.INFO)
log = logging.getLogger(__name__)

# Value passed as ``resource_type`` on every metadata push.
RESOURCE_TYPE = "databricks"
# ← SUBSTITUTE: conservative default to stay under 1 MB compressed
DEFAULT_BATCH_SIZE = 500
42
+
43
+
44
def _asset_from_dict(d: dict[str, Any]) -> RelationalAsset:
    """Build a RelationalAsset from one asset entry of the collect manifest.

    ``asset_name``, ``database`` and ``schema`` are required keys (a missing
    one raises ``KeyError``), as is ``name`` on every entry of ``fields``.
    Volume is attached only when a row or byte count is present, and
    freshness only when ``last_updated`` is present.
    """
    columns = []
    for col in d.get("fields", []):
        columns.append(
            AssetField(
                name=col["name"],
                type=col.get("type"),
                description=col.get("description"),
            )
        )

    row_count = d.get("row_count")
    byte_count = d.get("byte_count")
    if row_count is None and byte_count is None:
        volume = None
    else:
        volume = AssetVolume(row_count=row_count, byte_count=byte_count)

    last_updated = d.get("last_updated")
    freshness = (
        AssetFreshness(last_update_time=last_updated)
        if last_updated is not None
        else None
    )

    asset_type = d.get("asset_type", "TABLE")
    metadata = AssetMetadata(
        name=d["asset_name"],
        database=d["database"],  # ← SUBSTITUTE: use catalog as database
        schema=d["schema"],
        description=d.get("description"),
    )
    return RelationalAsset(
        type=asset_type,
        metadata=metadata,
        fields=columns,
        volume=volume,
        freshness=freshness,
    )
75
+
76
+
77
def push(
    manifest_path: str,
    resource_uuid: str,
    key_id: str,
    key_token: str,
    batch_size: int = DEFAULT_BATCH_SIZE,
) -> dict[str, Any]:
    """Read a collect manifest and push its assets to Monte Carlo in batches.

    Args:
        manifest_path: Path to the JSON manifest; it must contain an
            ``"assets"`` list.
        resource_uuid: Forwarded to ``send_metadata`` as ``resource_uuid``.
        key_id: Ingestion key ID used to open each pycarlo ``Session``.
        key_token: Ingestion key token used to open each pycarlo ``Session``.
        batch_size: Maximum number of assets per ``send_metadata`` call.

    Returns:
        A summary dict with the resource identifiers, the invocation IDs in
        batch order, counts and a UTC timestamp. The same dict is written as
        JSON next to the manifest.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        KeyError: If the manifest has no ``"assets"`` key.
        Exception: Whatever a failed batch raised, re-raised after logging.
    """
    # Fail fast with a clear message; otherwise range() (for 0) or
    # ThreadPoolExecutor (for negatives) raises a far less obvious ValueError.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    with open(manifest_path) as fh:
        manifest = json.load(fh)

    asset_dicts: list[dict[str, Any]] = manifest["assets"]
    assets = [_asset_from_dict(d) for d in asset_dicts]
    log.info("Loaded %d assets from %s", len(assets), manifest_path)

    # max(..., 1) keeps a single (empty) batch when the manifest has no
    # assets, so one API call is still made.
    # NOTE(review): confirm that pushing an empty batch is intended.
    batches = [
        assets[start : start + batch_size]
        for start in range(0, max(len(assets), 1), batch_size)
    ]
    total_batches = len(batches)

    def _push_batch(batch: list, batch_num: int) -> str | None:
        """Push one batch; each call builds its own Session/Client so worker threads share none."""
        client = Client(session=Session(mcd_id=key_id, mcd_token=key_token, scope="Ingestion"))
        service = IngestionService(mc_client=client)
        result = service.send_metadata(
            resource_uuid=resource_uuid,
            resource_type=RESOURCE_TYPE,
            events=batch,
        )
        invocation_id = service.extract_invocation_id(result)
        log.info("Pushed batch %d/%d (%d assets) — invocation_id=%s", batch_num, total_batches, len(batch), invocation_id)
        return invocation_id

    # Push batches in parallel, capped at four workers.
    max_workers = min(4, total_batches)
    # Pre-sized so results land at their batch index regardless of completion order.
    invocation_ids: list[str | None] = [None] * total_batches

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        futures = {
            pool.submit(_push_batch, batch, i + 1): i
            for i, batch in enumerate(batches)
        }
        for future in as_completed(futures):
            idx = futures[future]
            try:
                invocation_ids[idx] = future.result()
            except Exception as exc:
                log.error("ERROR pushing batch %d: %s", idx + 1, exc)
                raise

    log.info("All %d batches pushed (%d workers)", total_batches, max_workers)

    pushed_at = datetime.now(timezone.utc).isoformat()
    summary = {
        "resource_uuid": resource_uuid,
        "resource_type": RESOURCE_TYPE,
        "invocation_ids": invocation_ids,
        "pushed_at": pushed_at,
        "asset_count": len(assets),
        "batch_count": total_batches,
        "batch_size": batch_size,
        "catalog": manifest.get("catalog"),
    }

    # Write the push result alongside the collect manifest. Only a trailing
    # ".json" extension is swapped out: str.replace() would rewrite every
    # ".json" in the path and, when there is none, would return the manifest
    # path itself and overwrite the input file.
    root, ext = os.path.splitext(manifest_path)
    if ext.lower() == ".json":
        push_manifest_path = f"{root}_push_result.json"
    else:
        push_manifest_path = f"{manifest_path}_push_result.json"
    with open(push_manifest_path, "w") as fh:
        json.dump(summary, fh, indent=2)
    log.info("Push result written to %s", push_manifest_path)

    return summary
152
+
153
+
154
def main() -> None:
    """CLI entry point: parse and validate arguments, then run ``push()``.

    Credentials and the resource UUID default to the ``MCD_INGEST_ID``,
    ``MCD_INGEST_TOKEN`` and ``MCD_RESOURCE_UUID`` environment variables.
    Exits through ``parser.error`` when one of them is missing or empty, or
    when ``--batch-size`` is not a positive integer.
    """
    parser = argparse.ArgumentParser(description="Push Databricks metadata to Monte Carlo from manifest")
    parser.add_argument("--manifest", default="manifest_metadata.json")
    parser.add_argument("--resource-uuid", default=os.getenv("MCD_RESOURCE_UUID"))
    parser.add_argument("--key-id", default=os.getenv("MCD_INGEST_ID"))
    parser.add_argument("--key-token", default=os.getenv("MCD_INGEST_TOKEN"))
    parser.add_argument("--batch-size", type=int, default=DEFAULT_BATCH_SIZE)
    args = parser.parse_args()

    required = ["resource_uuid", "key_id", "key_token"]
    # Falsy check rather than ``is None``: an environment variable that is set
    # but empty is just as unusable as one that is not set at all.
    missing = [k for k in required if not getattr(args, k)]
    if missing:
        parser.error(f"Missing required arguments/env vars: {missing}")
    # Reject sizes that cannot form a batch before any file or network I/O.
    if args.batch_size < 1:
        parser.error(f"--batch-size must be a positive integer (got {args.batch_size})")

    push(
        manifest_path=args.manifest,
        resource_uuid=args.resource_uuid,
        key_id=args.key_id,
        key_token=args.key_token,
        batch_size=args.batch_size,
    )


if __name__ == "__main__":
    main()
@@ -0,0 +1,200 @@
1
+ """
2
+ Databricks — Query Log Push (push-only)
3
+ =========================================
4
+ Reads a JSON manifest file produced by collect_query_logs.py and pushes the query
5
+ log entries to Monte Carlo via the push ingestion API, with configurable batching
6
+ to keep compressed payloads under 1 MB.
7
+
8
+ Substitution points (search for "← SUBSTITUTE"):
9
+ - MCD_INGEST_ID / MCD_INGEST_TOKEN : Monte Carlo API credentials
10
+ - MCD_RESOURCE_UUID : UUID of the Databricks connection in Monte Carlo
11
+ - DEFAULT_BATCH_SIZE / --batch-size : number of entries per API call (default 100)
12
+
13
+ Prerequisites:
14
+ pip install pycarlo python-dateutil
15
+ """
16
+
17
+ from __future__ import annotations
18
+
19
+ import argparse
20
+ import json
21
+ import logging
22
+ import os
23
+ from concurrent.futures import ThreadPoolExecutor, as_completed
24
+ from datetime import datetime, timezone
25
+ from typing import Any
26
+
27
+ from dateutil.parser import isoparse
28
+ from pycarlo.core import Client, Session
29
+ from pycarlo.features.ingestion import IngestionService
30
+ from pycarlo.features.ingestion.models import QueryLogEntry
31
+
32
# Module-wide logging setup: timestamped INFO-level records.
logging.basicConfig(format="%(asctime)s %(levelname)s %(message)s", level=logging.INFO)
log = logging.getLogger(__name__)

# Value passed as ``log_type`` on every query-log push.
LOG_TYPE = "databricks"
# ← SUBSTITUTE: conservative default to stay under 1 MB compressed
DEFAULT_BATCH_SIZE = 100

# Upper bound on query_text length; longer statements are truncated to
# prevent 413 errors. Some SQL statements (e.g., generated by BI tools) can
# be 100KB+ and blow up compressed payloads even at small batch sizes.
_MAX_QUERY_TEXT_LEN = 10_000
42
+
43
+
44
def _build_query_log_entries(entry_dicts: list[dict[str, Any]]) -> list[QueryLogEntry]:
    """Turn raw manifest query records into QueryLogEntry objects.

    Query text longer than ``_MAX_QUERY_TEXT_LEN`` is cut to that length and
    suffixed with a truncation marker; the number of truncated statements is
    logged once at the end. Optional metrics that are present and non-null
    are carried in ``extra``.
    """
    optional_metrics = ("total_task_duration_ms", "read_rows", "read_bytes")
    results = []
    n_truncated = 0

    for record in entry_dicts:
        sql = record.get("query_text") or ""

        # Truncate very long SQL to prevent 413 Request Too Large
        if len(sql) > _MAX_QUERY_TEXT_LEN:
            sql = sql[:_MAX_QUERY_TEXT_LEN] + "... [TRUNCATED]"
            n_truncated += 1

        metrics = {
            key: record[key]
            for key in optional_metrics
            if record.get(key) is not None
        }

        started = record.get("start_time")
        ended = record.get("end_time")

        entry = QueryLogEntry(
            query_id=record.get("query_id"),
            query_text=sql,
            start_time=isoparse(started) if started else None,
            end_time=isoparse(ended) if ended else None,
            user=record.get("user"),
            returned_rows=record.get("returned_rows"),
            extra=metrics or None,
        )
        results.append(entry)

    if n_truncated:
        log.info("Truncated %d query text(s) exceeding %d chars", n_truncated, _MAX_QUERY_TEXT_LEN)
    return results
81
+
82
+
83
def push(
    manifest_path: str,
    resource_uuid: str,
    key_id: str,
    key_token: str,
    batch_size: int = DEFAULT_BATCH_SIZE,
) -> dict[str, Any]:
    """Read a collect manifest and push its query log entries to Monte Carlo in batches.

    Args:
        manifest_path: Path to the JSON manifest; it must contain an
            ``"entries"`` list.
        resource_uuid: Forwarded to ``send_query_logs`` as ``resource_uuid``.
        key_id: Ingestion key ID used to open each pycarlo ``Session``.
        key_token: Ingestion key token used to open each pycarlo ``Session``.
        batch_size: Maximum number of entries per ``send_query_logs`` call.

    Returns:
        A summary dict with the resource identifiers, the invocation IDs in
        batch order, counts and a UTC timestamp. The same dict is written as
        JSON next to the manifest. When the manifest holds no entries nothing
        is pushed and a zero-count summary is written and returned instead.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        KeyError: If the manifest has no ``"entries"`` key.
        Exception: Whatever a failed batch raised, re-raised after logging.
    """
    # Fail fast with a clear message; otherwise range() (for 0) or
    # ThreadPoolExecutor (for negatives) raises a far less obvious ValueError.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    with open(manifest_path) as fh:
        manifest = json.load(fh)

    entry_dicts: list[dict[str, Any]] = manifest["entries"]
    entries = _build_query_log_entries(entry_dicts)
    log.info("Loaded %d query log entries from %s", len(entries), manifest_path)

    # The push result is written alongside the collect manifest. Only a
    # trailing ".json" extension is swapped out: str.replace() would rewrite
    # every ".json" in the path and, when there is none, would return the
    # manifest path itself and overwrite the input file.
    root, ext = os.path.splitext(manifest_path)
    if ext.lower() == ".json":
        push_manifest_path = f"{root}_push_result.json"
    else:
        push_manifest_path = f"{manifest_path}_push_result.json"

    if not entries:
        log.info("No query log entries to push.")
        summary = {
            "resource_uuid": resource_uuid,
            "log_type": LOG_TYPE,
            "invocation_ids": [],
            "pushed_at": datetime.now(timezone.utc).isoformat(),
            "query_log_count": 0,
            "batch_count": 0,
            "batch_size": batch_size,
        }
        with open(push_manifest_path, "w") as fh:
            json.dump(summary, fh, indent=2)
        return summary

    # Split into batches of at most batch_size entries.
    batches = [
        entries[start : start + batch_size]
        for start in range(0, len(entries), batch_size)
    ]
    total_batches = len(batches)

    def _push_batch(batch: list, batch_num: int) -> str | None:
        """Push one batch; each call builds its own Session/Client so worker threads share none."""
        client = Client(session=Session(mcd_id=key_id, mcd_token=key_token, scope="Ingestion"))
        service = IngestionService(mc_client=client)
        result = service.send_query_logs(
            resource_uuid=resource_uuid,
            log_type=LOG_TYPE,
            events=batch,
        )
        invocation_id = service.extract_invocation_id(result)
        log.info("Pushed batch %d/%d (%d entries) — invocation_id=%s", batch_num, total_batches, len(batch), invocation_id)
        return invocation_id

    # Push batches in parallel, capped at four workers.
    max_workers = min(4, total_batches)
    # Pre-sized so results land at their batch index regardless of completion order.
    invocation_ids: list[str | None] = [None] * total_batches

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        futures = {
            pool.submit(_push_batch, batch, i + 1): i
            for i, batch in enumerate(batches)
        }
        for future in as_completed(futures):
            idx = futures[future]
            try:
                invocation_ids[idx] = future.result()
            except Exception as exc:
                log.error("ERROR pushing batch %d: %s", idx + 1, exc)
                raise

    log.info("All %d batches pushed (%d workers)", total_batches, max_workers)

    pushed_at = datetime.now(timezone.utc).isoformat()
    summary = {
        "resource_uuid": resource_uuid,
        "log_type": LOG_TYPE,
        "invocation_ids": invocation_ids,
        "pushed_at": pushed_at,
        "query_log_count": len(entries),
        "batch_count": total_batches,
        "batch_size": batch_size,
        "lookback_hours": manifest.get("lookback_hours"),
        "lookback_lag_hours": manifest.get("lookback_lag_hours"),
    }

    with open(push_manifest_path, "w") as fh:
        json.dump(summary, fh, indent=2)
    log.info("Push result written to %s", push_manifest_path)

    return summary
174
+
175
+
176
def main() -> None:
    """CLI entry point: parse and validate arguments, then run ``push()``.

    Credentials and the resource UUID default to the ``MCD_INGEST_ID``,
    ``MCD_INGEST_TOKEN`` and ``MCD_RESOURCE_UUID`` environment variables.
    Exits through ``parser.error`` when one of them is missing or empty, or
    when ``--batch-size`` is not a positive integer.
    """
    parser = argparse.ArgumentParser(description="Push Databricks query logs to Monte Carlo from manifest")
    parser.add_argument("--manifest", default="manifest_query_logs.json")
    parser.add_argument("--resource-uuid", default=os.getenv("MCD_RESOURCE_UUID"))
    parser.add_argument("--key-id", default=os.getenv("MCD_INGEST_ID"))
    parser.add_argument("--key-token", default=os.getenv("MCD_INGEST_TOKEN"))
    parser.add_argument("--batch-size", type=int, default=DEFAULT_BATCH_SIZE)
    args = parser.parse_args()

    required = ["resource_uuid", "key_id", "key_token"]
    # Falsy check rather than ``is None``: an environment variable that is set
    # but empty is just as unusable as one that is not set at all.
    missing = [k for k in required if not getattr(args, k)]
    if missing:
        parser.error(f"Missing required arguments/env vars: {missing}")
    # Reject sizes that cannot form a batch before any file or network I/O.
    if args.batch_size < 1:
        parser.error(f"--batch-size must be a positive integer (got {args.batch_size})")

    push(
        manifest_path=args.manifest,
        resource_uuid=args.resource_uuid,
        key_id=args.key_id,
        key_token=args.key_token,
        batch_size=args.batch_size,
    )


if __name__ == "__main__":
    main()
@@ -0,0 +1,119 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Extract Hive lineage from a local log file and push it to Monte Carlo in one step.
4
+
5
+ Thin wrapper that calls ``collect()`` from ``collect_lineage`` followed by
6
+ ``push()`` from ``push_lineage``, then writes the final manifest (with
7
+ ``resource_uuid`` and ``invocation_id``) to ``--output-file``.
8
+
9
+ Substitution points
10
+ -------------------
11
+ - MCD_INGEST_ID (env) / --key-id (CLI) : Monte Carlo ingestion key ID
12
+ - MCD_INGEST_TOKEN (env) / --key-token (CLI) : Monte Carlo ingestion key token
13
+ - MCD_RESOURCE_UUID (env) / --resource-uuid (CLI) : MC resource UUID for this connection
14
+ - --log-file : path to local HiveServer2 log
15
+
16
+ Prerequisites
17
+ -------------
18
+ pip install pycarlo python-dotenv
19
+
20
+ Usage (table-level):
21
+ python collect_and_push_lineage.py \\
22
+ --key-id <MCD_INGEST_ID> \\
23
+ --key-token <MCD_INGEST_TOKEN> \\
24
+ --resource-uuid <MCD_RESOURCE_UUID> \\
25
+ --log-file /tmp/root/hive.log
26
+
27
+ Usage (column-level):
28
+ python collect_and_push_lineage.py ... --column-lineage
29
+ """
30
+
31
+ import argparse
32
+ import json
33
+ import os
34
+
35
+ from collect_lineage import collect
36
+ from push_lineage import DEFAULT_BATCH_SIZE, DEFAULT_TIMEOUT_SECONDS, push
37
+
38
+
39
def main() -> None:
    """CLI entry point: collect lineage from a Hive log, push it, then save the manifest.

    Exits through ``parser.error`` when the ingestion key pair or the
    resource UUID is missing. When ``collect()`` reports no edges, a notice
    is printed and nothing is pushed or written.
    """
    env = os.environ
    parser = argparse.ArgumentParser(
        description="Extract Hive lineage from a local log file and push to Monte Carlo",
    )
    add = parser.add_argument

    # Collect args
    add(
        "--log-file",
        default="/tmp/root/hive.log",
        help="Path to local HiveServer2 log file (default: /tmp/root/hive.log)",  # ← SUBSTITUTE: your log path
    )

    # Push / MC args
    add(
        "--key-id",
        default=env.get("MCD_INGEST_ID"),
        help="Monte Carlo ingestion key ID (env: MCD_INGEST_ID)",
    )
    add(
        "--key-token",
        default=env.get("MCD_INGEST_TOKEN"),
        help="Monte Carlo ingestion key token (env: MCD_INGEST_TOKEN)",
    )
    add(
        "--resource-uuid",
        default=env.get("MCD_RESOURCE_UUID"),
        help="Monte Carlo resource UUID for this Hive connection (env: MCD_RESOURCE_UUID)",
    )
    add(
        "--column-lineage",
        action="store_true",
        help="Push column-level lineage instead of table-level",
    )
    add(
        "--output-file",
        default="lineage_output.json",
        help="Path to write the lineage manifest (default: lineage_output.json)",
    )
    add(
        "--batch-size",
        type=int,
        default=DEFAULT_BATCH_SIZE,
        metavar="N",
        help=f"Max events per POST (default: {DEFAULT_BATCH_SIZE})",
    )
    add(
        "--timeout",
        type=int,
        default=DEFAULT_TIMEOUT_SECONDS,
        metavar="SEC",
        help=f"HTTP timeout per request in seconds (default: {DEFAULT_TIMEOUT_SECONDS})",
    )
    opts = parser.parse_args()

    # Both halves of the ingestion key pair must be supplied.
    if not (opts.key_id and opts.key_token):
        parser.error("--key-id and --key-token are required (or set MCD_INGEST_ID / MCD_INGEST_TOKEN)")
    if not opts.resource_uuid:
        parser.error("--resource-uuid is required (or set MCD_RESOURCE_UUID)")

    manifest = collect(log_file=opts.log_file)

    if manifest["edges"]:
        push(
            manifest=manifest,
            resource_uuid=opts.resource_uuid,
            key_id=opts.key_id,
            key_token=opts.key_token,
            column_lineage=opts.column_lineage,
            batch_size=opts.batch_size,
            timeout_seconds=opts.timeout,
        )

        # The manifest is saved only after push() has returned.
        with open(opts.output_file, "w") as out:
            json.dump(manifest, out, indent=2)
        print(f"Lineage manifest written to {opts.output_file}")
        print("Done.")
    else:
        print("No lineage edges detected — no CTAS or INSERT INTO ... SELECT patterns found.")


if __name__ == "__main__":
    main()
@@ -0,0 +1,119 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Collect Hive table metadata and push it to Monte Carlo in one step.
4
+
5
+ Thin wrapper that calls ``collect()`` from ``collect_metadata`` followed by
6
+ ``push()`` from ``push_metadata``, then writes the final manifest (with
7
+ ``resource_uuid`` and ``invocation_id``) to ``--output-file``.
8
+
9
+ Substitution points
10
+ -------------------
11
+ - HIVE_HOST (env) / --hive-host (CLI) : HiveServer2 hostname
12
+ - MCD_INGEST_ID (env) / --key-id (CLI) : Monte Carlo ingestion key ID
13
+ - MCD_INGEST_TOKEN (env) / --key-token (CLI) : Monte Carlo ingestion key token
14
+ - MCD_RESOURCE_UUID (env) / --resource-uuid (CLI) : MC resource UUID for this connection
15
+
16
+ Prerequisites
17
+ -------------
18
+ pip install pycarlo pyhive python-dotenv
19
+
20
+ Usage
21
+ -----
22
+ python collect_and_push_metadata.py \\
23
+ --key-id <MCD_INGEST_ID> \\
24
+ --key-token <MCD_INGEST_TOKEN> \\
25
+ --resource-uuid <MCD_RESOURCE_UUID> \\
26
+ --hive-host <HIVESERVER2_HOSTNAME>
27
+ """
28
+
29
+ import argparse
30
+ import json
31
+ import os
32
+
33
+ from collect_metadata import collect
34
+ from push_metadata import DEFAULT_BATCH_SIZE, DEFAULT_TIMEOUT_SECONDS, push
35
+
36
+
37
def main() -> None:
    """CLI entry point: collect Hive table metadata, push it, then save the manifest.

    Exits through ``parser.error`` when the Hive host, the ingestion key
    pair or the resource UUID is missing.
    """
    env = os.environ
    parser = argparse.ArgumentParser(
        description="Collect Hive table metadata and push to Monte Carlo",
    )
    add = parser.add_argument

    # Hive / collect args
    add(
        "--hive-host",
        default=env.get("HIVE_HOST"),
        help="HiveServer2 hostname (env: HIVE_HOST)",  # ← SUBSTITUTE: your EMR master DNS or Hive host
    )
    add(
        "--hive-port",
        type=int,
        default=10000,
        help="HiveServer2 port (default: 10000)",  # ← SUBSTITUTE if your cluster uses a non-standard port
    )

    # Push / MC args
    add(
        "--key-id",
        default=env.get("MCD_INGEST_ID"),
        help="Monte Carlo ingestion key ID (env: MCD_INGEST_ID)",  # ← SUBSTITUTE env var name if different
    )
    add(
        "--key-token",
        default=env.get("MCD_INGEST_TOKEN"),
        help="Monte Carlo ingestion key token (env: MCD_INGEST_TOKEN)",  # ← SUBSTITUTE env var name if different
    )
    add(
        "--resource-uuid",
        default=env.get("MCD_RESOURCE_UUID"),
        required=False,
        help="Monte Carlo resource UUID for this Hive connection (env: MCD_RESOURCE_UUID)",
    )
    add(
        "--output-file",
        default="metadata_output.json",
        help="Path to write the output manifest (default: metadata_output.json)",
    )
    add(
        "--batch-size",
        type=int,
        default=DEFAULT_BATCH_SIZE,
        metavar="N",
        help=f"Max assets per POST (default: {DEFAULT_BATCH_SIZE})",
    )
    add(
        "--timeout",
        type=int,
        default=DEFAULT_TIMEOUT_SECONDS,
        metavar="SEC",
        help=f"HTTP timeout per request in seconds (default: {DEFAULT_TIMEOUT_SECONDS})",
    )
    opts = parser.parse_args()

    if not opts.hive_host:
        parser.error("--hive-host is required (or set HIVE_HOST)")
    # Both halves of the ingestion key pair must be supplied.
    if not (opts.key_id and opts.key_token):
        parser.error("--key-id and --key-token are required (or set MCD_INGEST_ID / MCD_INGEST_TOKEN)")
    if not opts.resource_uuid:
        parser.error("--resource-uuid is required (or set MCD_RESOURCE_UUID)")

    manifest = collect(hive_host=opts.hive_host, hive_port=opts.hive_port)

    push(
        manifest=manifest,
        resource_uuid=opts.resource_uuid,
        key_id=opts.key_id,
        key_token=opts.key_token,
        batch_size=opts.batch_size,
        timeout_seconds=opts.timeout,
    )

    # The manifest is saved only after push() has returned.
    with open(opts.output_file, "w") as out:
        json.dump(manifest, out, indent=2)
    print(f"Manifest written to {opts.output_file}")
    print("Done.")


if __name__ == "__main__":
    main()