opencode-skills-collection 2.0.0 → 2.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bundled-skills/.antigravity-install-manifest.json +6 -1
- package/bundled-skills/docs/integrations/jetski-cortex.md +3 -3
- package/bundled-skills/docs/integrations/jetski-gemini-loader/README.md +1 -1
- package/bundled-skills/docs/maintainers/repo-growth-seo.md +3 -3
- package/bundled-skills/docs/maintainers/skills-update-guide.md +1 -1
- package/bundled-skills/docs/users/bundles.md +1 -1
- package/bundled-skills/docs/users/claude-code-skills.md +1 -1
- package/bundled-skills/docs/users/gemini-cli-skills.md +1 -1
- package/bundled-skills/docs/users/getting-started.md +1 -1
- package/bundled-skills/docs/users/kiro-integration.md +1 -1
- package/bundled-skills/docs/users/usage.md +4 -4
- package/bundled-skills/docs/users/visual-guide.md +4 -4
- package/bundled-skills/manage-skills/SKILL.md +187 -0
- package/bundled-skills/monte-carlo-monitor-creation/SKILL.md +222 -0
- package/bundled-skills/monte-carlo-monitor-creation/references/comparison-monitor.md +426 -0
- package/bundled-skills/monte-carlo-monitor-creation/references/custom-sql-monitor.md +207 -0
- package/bundled-skills/monte-carlo-monitor-creation/references/metric-monitor.md +292 -0
- package/bundled-skills/monte-carlo-monitor-creation/references/table-monitor.md +231 -0
- package/bundled-skills/monte-carlo-monitor-creation/references/validation-monitor.md +404 -0
- package/bundled-skills/monte-carlo-prevent/SKILL.md +252 -0
- package/bundled-skills/monte-carlo-prevent/references/TROUBLESHOOTING.md +23 -0
- package/bundled-skills/monte-carlo-prevent/references/parameters.md +32 -0
- package/bundled-skills/monte-carlo-prevent/references/workflows.md +478 -0
- package/bundled-skills/monte-carlo-push-ingestion/SKILL.md +363 -0
- package/bundled-skills/monte-carlo-push-ingestion/references/anomaly-detection.md +87 -0
- package/bundled-skills/monte-carlo-push-ingestion/references/custom-lineage.md +203 -0
- package/bundled-skills/monte-carlo-push-ingestion/references/direct-http-api.md +207 -0
- package/bundled-skills/monte-carlo-push-ingestion/references/prerequisites.md +150 -0
- package/bundled-skills/monte-carlo-push-ingestion/references/push-lineage.md +160 -0
- package/bundled-skills/monte-carlo-push-ingestion/references/push-metadata.md +158 -0
- package/bundled-skills/monte-carlo-push-ingestion/references/push-query-logs.md +219 -0
- package/bundled-skills/monte-carlo-push-ingestion/references/validation.md +257 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/sample_verify.py +357 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_and_push_lineage.py +70 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_and_push_metadata.py +65 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_and_push_query_logs.py +70 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_lineage.py +214 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_metadata.py +160 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_query_logs.py +164 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/push_lineage.py +198 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/push_metadata.py +193 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/push_query_logs.py +207 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/collect_and_push_metadata.py +71 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/collect_and_push_query_logs.py +64 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/collect_metadata.py +253 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/collect_query_logs.py +149 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/push_metadata.py +190 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/push_query_logs.py +208 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_and_push_lineage.py +83 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_and_push_metadata.py +77 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_and_push_query_logs.py +83 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_lineage.py +240 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_metadata.py +212 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_query_logs.py +204 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/push_lineage.py +192 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/push_metadata.py +178 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/push_query_logs.py +200 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_and_push_lineage.py +119 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_and_push_metadata.py +119 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_and_push_query_logs.py +117 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_lineage.py +265 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_metadata.py +313 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_query_logs.py +284 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/push_lineage.py +309 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/push_metadata.py +245 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/push_query_logs.py +255 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_and_push_lineage.py +78 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_and_push_metadata.py +80 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_and_push_query_logs.py +88 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_lineage.py +235 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_metadata.py +219 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_query_logs.py +239 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/push_lineage.py +178 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/push_metadata.py +178 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/push_query_logs.py +196 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_and_push_lineage.py +154 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_and_push_metadata.py +137 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_and_push_query_logs.py +137 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_lineage.py +349 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_metadata.py +329 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_query_logs.py +254 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/push_lineage.py +307 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/push_metadata.py +228 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/push_query_logs.py +248 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/test_template_sdk_usage.py +340 -0
- package/bundled-skills/monte-carlo-validation-notebook/SKILL.md +685 -0
- package/bundled-skills/monte-carlo-validation-notebook/scripts/generate_notebook_url.py +141 -0
- package/bundled-skills/monte-carlo-validation-notebook/scripts/resolve_dbt_schema.py +161 -0
- package/package.json +1 -1
- package/skills_index.json +503 -61
package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/push_metadata.py
ADDED
|
@@ -0,0 +1,178 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Databricks — Metadata Push (push-only)
|
|
3
|
+
========================================
|
|
4
|
+
Reads a JSON manifest file produced by collect_metadata.py and pushes the assets
|
|
5
|
+
to Monte Carlo via the push ingestion API, with configurable batching to keep
|
|
6
|
+
compressed payloads under 1 MB.
|
|
7
|
+
|
|
8
|
+
Substitution points (search for "← SUBSTITUTE"):
|
|
9
|
+
- MCD_INGEST_ID / MCD_INGEST_TOKEN : Monte Carlo API credentials
|
|
10
|
+
- MCD_RESOURCE_UUID : UUID of the Databricks connection in Monte Carlo
|
|
11
|
+
- DEFAULT_BATCH_SIZE / --batch-size : number of assets per API call (default 500)
|
|
12
|
+
|
|
13
|
+
Prerequisites:
|
|
14
|
+
pip install pycarlo
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from __future__ import annotations
|
|
18
|
+
|
|
19
|
+
import argparse
|
|
20
|
+
import json
|
|
21
|
+
import logging
|
|
22
|
+
import os
|
|
23
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
24
|
+
from datetime import datetime, timezone
|
|
25
|
+
from typing import Any
|
|
26
|
+
|
|
27
|
+
from pycarlo.core import Client, Session
|
|
28
|
+
from pycarlo.features.ingestion import IngestionService
|
|
29
|
+
from pycarlo.features.ingestion.models import (
|
|
30
|
+
AssetField,
|
|
31
|
+
AssetFreshness,
|
|
32
|
+
AssetMetadata,
|
|
33
|
+
AssetVolume,
|
|
34
|
+
RelationalAsset,
|
|
35
|
+
)
|
|
36
|
+
|
|
37
|
+
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
|
|
38
|
+
log = logging.getLogger(__name__)
|
|
39
|
+
|
|
40
|
+
RESOURCE_TYPE = "databricks"
|
|
41
|
+
DEFAULT_BATCH_SIZE = 500 # ← SUBSTITUTE: conservative default to stay under 1 MB compressed
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def _asset_from_dict(d: dict[str, Any]) -> RelationalAsset:
    """Build the RelationalAsset described by one manifest entry.

    ``asset_name``, ``database`` and ``schema`` are mandatory keys (a missing
    one raises ``KeyError``), as is ``name`` inside every ``fields`` item.
    All other keys are optional and default to ``None`` (``asset_type``
    defaults to ``"TABLE"``).
    """
    columns = []
    for column in d.get("fields", []):
        columns.append(
            AssetField(
                name=column["name"],
                type=column.get("type"),
                description=column.get("description"),
            )
        )

    # Volume is attached as soon as either counter is present in the manifest.
    rows = d.get("row_count")
    size_in_bytes = d.get("byte_count")
    no_volume_info = rows is None and size_in_bytes is None
    volume = None if no_volume_info else AssetVolume(row_count=rows, byte_count=size_in_bytes)

    # Freshness is attached only when a last-update value was recorded.
    updated_at = d.get("last_updated")
    freshness = None if updated_at is None else AssetFreshness(last_update_time=updated_at)

    identity = AssetMetadata(
        name=d["asset_name"],
        database=d["database"],  # ← SUBSTITUTE: use catalog as database
        schema=d["schema"],
        description=d.get("description"),
    )
    return RelationalAsset(
        type=d.get("asset_type", "TABLE"),
        metadata=identity,
        fields=columns,
        volume=volume,
        freshness=freshness,
    )
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def push(
    manifest_path: str,
    resource_uuid: str,
    key_id: str,
    key_token: str,
    batch_size: int = DEFAULT_BATCH_SIZE,
) -> dict[str, Any]:
    """Read a collect manifest and push its assets to Monte Carlo in batches.

    Args:
        manifest_path: Path of the JSON manifest to read; it must contain an
            ``assets`` list of asset dicts.
        resource_uuid: Forwarded to ``send_metadata`` as ``resource_uuid``.
        key_id: Key ID used to build the pycarlo ``Session`` of every batch.
        key_token: Key token used to build the pycarlo ``Session`` of every batch.
        batch_size: Maximum number of assets per ``send_metadata`` call.

    Returns:
        A summary dict holding the resource identifiers, one invocation ID per
        batch (in batch order), the push timestamp (UTC, ISO 8601), the asset
        and batch counts, the batch size and the manifest's ``catalog`` value.
        The same dict is written as JSON next to the manifest.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        KeyError: If the manifest has no ``assets`` key.
        Exception: Whatever a failing batch raised, re-raised after logging.
    """
    with open(manifest_path) as fh:
        manifest = json.load(fh)

    asset_dicts: list[dict[str, Any]] = manifest["assets"]
    assets = [_asset_from_dict(d) for d in asset_dicts]
    log.info("Loaded %d assets from %s", len(assets), manifest_path)

    # A zero step makes range() fail with an unrelated message and a negative
    # step yields no batches at all (which then breaks the thread pool), so
    # reject both here with a clear error.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    # Split into batches. max(..., 1) keeps a single (empty) batch when the
    # manifest holds no assets, so one call is still sent in that case.
    batches = [assets[i : i + batch_size] for i in range(0, max(len(assets), 1), batch_size)]
    total_batches = len(batches)

    def _push_batch(batch: list, batch_num: int) -> str | None:
        """Push a single batch using a Session created just for this call."""
        client = Client(session=Session(mcd_id=key_id, mcd_token=key_token, scope="Ingestion"))
        service = IngestionService(mc_client=client)
        result = service.send_metadata(
            resource_uuid=resource_uuid,
            resource_type=RESOURCE_TYPE,
            events=batch,
        )
        invocation_id = service.extract_invocation_id(result)
        log.info("Pushed batch %d/%d (%d assets) — invocation_id=%s", batch_num, total_batches, len(batch), invocation_id)
        return invocation_id

    # Push batches in parallel (every batch call builds its own pycarlo Session)
    max_workers = min(4, total_batches)
    invocation_ids: list[str | None] = [None] * total_batches

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        futures = {
            pool.submit(_push_batch, batch, i + 1): i
            for i, batch in enumerate(batches)
        }
        for future in as_completed(futures):
            # Store by batch index so the IDs stay in batch order even though
            # the futures complete in arbitrary order.
            idx = futures[future]
            try:
                invocation_ids[idx] = future.result()
            except Exception as exc:
                log.error("ERROR pushing batch %d: %s", idx + 1, exc)
                raise

    log.info("All %d batches pushed (%d workers)", total_batches, max_workers)

    pushed_at = datetime.now(timezone.utc).isoformat()
    summary = {
        "resource_uuid": resource_uuid,
        "resource_type": RESOURCE_TYPE,
        "invocation_ids": invocation_ids,
        "pushed_at": pushed_at,
        "asset_count": len(assets),
        "batch_count": total_batches,
        "batch_size": batch_size,
        "catalog": manifest.get("catalog"),
    }

    # Write push result alongside the collect manifest. The path is derived
    # from the file extension only: a plain str.replace(".json", ...) is a
    # no-op for a manifest without a ".json" suffix (the result would then
    # overwrite the manifest itself) and also rewrites ".json" appearing in
    # directory names.
    stem, ext = os.path.splitext(manifest_path)
    if ext.lower() != ".json":
        stem = manifest_path
    push_manifest_path = f"{stem}_push_result.json"
    with open(push_manifest_path, "w") as fh:
        json.dump(summary, fh, indent=2)
    log.info("Push result written to %s", push_manifest_path)

    return summary
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
def main() -> None:
    """Parse the command line and push the metadata manifest.

    ``--resource-uuid``, ``--key-id`` and ``--key-token`` default to the
    ``MCD_RESOURCE_UUID``, ``MCD_INGEST_ID`` and ``MCD_INGEST_TOKEN``
    environment variables. If any of them ends up unset or empty the parser
    reports an error and exits; otherwise :func:`push` is called.
    """
    parser = argparse.ArgumentParser(description="Push Databricks metadata to Monte Carlo from manifest")
    parser.add_argument("--manifest", default="manifest_metadata.json")
    parser.add_argument("--resource-uuid", default=os.getenv("MCD_RESOURCE_UUID"))
    parser.add_argument("--key-id", default=os.getenv("MCD_INGEST_ID"))
    parser.add_argument("--key-token", default=os.getenv("MCD_INGEST_TOKEN"))
    parser.add_argument("--batch-size", type=int, default=DEFAULT_BATCH_SIZE)
    args = parser.parse_args()

    required = ["resource_uuid", "key_id", "key_token"]
    # Falsy check rather than "is None": an environment variable exported as
    # an empty string must count as missing instead of being sent as a blank
    # credential.
    missing = [k for k in required if not getattr(args, k)]
    if missing:
        parser.error(f"Missing required arguments/env vars: {missing}")

    push(
        manifest_path=args.manifest,
        resource_uuid=args.resource_uuid,
        key_id=args.key_id,
        key_token=args.key_token,
        batch_size=args.batch_size,
    )
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
if __name__ == "__main__":
|
|
178
|
+
main()
|
package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/push_query_logs.py
ADDED
|
@@ -0,0 +1,200 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Databricks — Query Log Push (push-only)
|
|
3
|
+
=========================================
|
|
4
|
+
Reads a JSON manifest file produced by collect_query_logs.py and pushes the query
|
|
5
|
+
log entries to Monte Carlo via the push ingestion API, with configurable batching
|
|
6
|
+
to keep compressed payloads under 1 MB.
|
|
7
|
+
|
|
8
|
+
Substitution points (search for "← SUBSTITUTE"):
|
|
9
|
+
- MCD_INGEST_ID / MCD_INGEST_TOKEN : Monte Carlo API credentials
|
|
10
|
+
- MCD_RESOURCE_UUID : UUID of the Databricks connection in Monte Carlo
|
|
11
|
+
- DEFAULT_BATCH_SIZE / --batch-size : number of entries per API call (default 100)
|
|
12
|
+
|
|
13
|
+
Prerequisites:
|
|
14
|
+
pip install pycarlo python-dateutil
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from __future__ import annotations
|
|
18
|
+
|
|
19
|
+
import argparse
|
|
20
|
+
import json
|
|
21
|
+
import logging
|
|
22
|
+
import os
|
|
23
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
24
|
+
from datetime import datetime, timezone
|
|
25
|
+
from typing import Any
|
|
26
|
+
|
|
27
|
+
from dateutil.parser import isoparse
|
|
28
|
+
from pycarlo.core import Client, Session
|
|
29
|
+
from pycarlo.features.ingestion import IngestionService
|
|
30
|
+
from pycarlo.features.ingestion.models import QueryLogEntry
|
|
31
|
+
|
|
32
|
+
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
|
|
33
|
+
log = logging.getLogger(__name__)
|
|
34
|
+
|
|
35
|
+
LOG_TYPE = "databricks"
|
|
36
|
+
DEFAULT_BATCH_SIZE = 100 # ← SUBSTITUTE: conservative default to stay under 1 MB compressed
|
|
37
|
+
|
|
38
|
+
# Truncate query_text longer than this to prevent 413 errors.
|
|
39
|
+
# Some SQL statements (e.g., generated by BI tools) can be 100KB+ and blow up
|
|
40
|
+
# compressed payloads even at small batch sizes.
|
|
41
|
+
_MAX_QUERY_TEXT_LEN = 10_000
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def _build_query_log_entries(entry_dicts: list[dict[str, Any]]) -> list[QueryLogEntry]:
    """Turn the manifest's query dicts into QueryLogEntry objects.

    Query text longer than ``_MAX_QUERY_TEXT_LEN`` characters is cut to that
    length and suffixed with a truncation marker; the number of truncated
    entries is logged once at the end. ``start_time`` / ``end_time`` strings
    are parsed with ``isoparse`` when present.
    """
    optional_metrics = ("total_task_duration_ms", "read_rows", "read_bytes")
    built = []
    clipped = 0

    for record in entry_dicts:
        sql = record.get("query_text") or ""

        # Cap very long SQL to prevent 413 Request Too Large
        if len(sql) > _MAX_QUERY_TEXT_LEN:
            sql = sql[:_MAX_QUERY_TEXT_LEN] + "... [TRUNCATED]"
            clipped += 1

        # Only metrics that carry a value are forwarded in `extra`.
        metrics = {key: record[key] for key in optional_metrics if record.get(key) is not None}

        started = record.get("start_time")
        ended = record.get("end_time")

        entry = QueryLogEntry(
            query_id=record.get("query_id"),
            query_text=sql,
            start_time=isoparse(started) if started else None,
            end_time=isoparse(ended) if ended else None,
            user=record.get("user"),
            returned_rows=record.get("returned_rows"),
            extra=metrics or None,
        )
        built.append(entry)

    if clipped:
        log.info("Truncated %d query text(s) exceeding %d chars", clipped, _MAX_QUERY_TEXT_LEN)
    return built
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def push(
    manifest_path: str,
    resource_uuid: str,
    key_id: str,
    key_token: str,
    batch_size: int = DEFAULT_BATCH_SIZE,
) -> dict[str, Any]:
    """Read a collect manifest and push its query log entries to Monte Carlo in batches.

    Args:
        manifest_path: Path of the JSON manifest to read; it must contain an
            ``entries`` list of query dicts.
        resource_uuid: Forwarded to ``send_query_logs`` as ``resource_uuid``.
        key_id: Key ID used to build the pycarlo ``Session`` of every batch.
        key_token: Key token used to build the pycarlo ``Session`` of every batch.
        batch_size: Maximum number of entries per ``send_query_logs`` call.

    Returns:
        A summary dict with the resource UUID, log type, one invocation ID per
        batch (in batch order), the push timestamp (UTC, ISO 8601), the entry
        and batch counts and the batch size. When entries were pushed it also
        carries the manifest's ``lookback_hours`` and ``lookback_lag_hours``.
        The same dict is written as JSON next to the manifest.

    Raises:
        ValueError: If there are entries to push and ``batch_size`` is smaller
            than 1.
        KeyError: If the manifest has no ``entries`` key.
        Exception: Whatever a failing batch raised, re-raised after logging.
    """
    with open(manifest_path) as fh:
        manifest = json.load(fh)

    entry_dicts: list[dict[str, Any]] = manifest["entries"]
    entries = _build_query_log_entries(entry_dicts)
    log.info("Loaded %d query log entries from %s", len(entries), manifest_path)

    # Derive the result path once, from the file extension only. A plain
    # str.replace(".json", ...) is a no-op for a manifest without a ".json"
    # suffix (the result would then overwrite the manifest itself) and also
    # rewrites ".json" appearing in directory names.
    stem, ext = os.path.splitext(manifest_path)
    if ext.lower() != ".json":
        stem = manifest_path
    push_manifest_path = f"{stem}_push_result.json"

    if not entries:
        # Nothing to send: record an empty result and skip the API entirely.
        log.info("No query log entries to push.")
        summary = {
            "resource_uuid": resource_uuid,
            "log_type": LOG_TYPE,
            "invocation_ids": [],
            "pushed_at": datetime.now(timezone.utc).isoformat(),
            "query_log_count": 0,
            "batch_count": 0,
            "batch_size": batch_size,
        }
        with open(push_manifest_path, "w") as fh:
            json.dump(summary, fh, indent=2)
        return summary

    # A zero step makes range() fail with an unrelated message and a negative
    # step yields no batches at all (which then breaks the thread pool), so
    # reject both here with a clear error.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    # Split into batches
    batches = [entries[i : i + batch_size] for i in range(0, len(entries), batch_size)]
    total_batches = len(batches)

    def _push_batch(batch: list, batch_num: int) -> str | None:
        """Push a single batch using a Session created just for this call."""
        client = Client(session=Session(mcd_id=key_id, mcd_token=key_token, scope="Ingestion"))
        service = IngestionService(mc_client=client)
        result = service.send_query_logs(
            resource_uuid=resource_uuid,
            log_type=LOG_TYPE,
            events=batch,
        )
        invocation_id = service.extract_invocation_id(result)
        log.info("Pushed batch %d/%d (%d entries) — invocation_id=%s", batch_num, total_batches, len(batch), invocation_id)
        return invocation_id

    # Push batches in parallel (every batch call builds its own pycarlo Session)
    max_workers = min(4, total_batches)
    invocation_ids: list[str | None] = [None] * total_batches

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        futures = {
            pool.submit(_push_batch, batch, i + 1): i
            for i, batch in enumerate(batches)
        }
        for future in as_completed(futures):
            # Store by batch index so the IDs stay in batch order even though
            # the futures complete in arbitrary order.
            idx = futures[future]
            try:
                invocation_ids[idx] = future.result()
            except Exception as exc:
                log.error("ERROR pushing batch %d: %s", idx + 1, exc)
                raise

    log.info("All %d batches pushed (%d workers)", total_batches, max_workers)

    pushed_at = datetime.now(timezone.utc).isoformat()
    summary = {
        "resource_uuid": resource_uuid,
        "log_type": LOG_TYPE,
        "invocation_ids": invocation_ids,
        "pushed_at": pushed_at,
        "query_log_count": len(entries),
        "batch_count": total_batches,
        "batch_size": batch_size,
        "lookback_hours": manifest.get("lookback_hours"),
        "lookback_lag_hours": manifest.get("lookback_lag_hours"),
    }

    with open(push_manifest_path, "w") as fh:
        json.dump(summary, fh, indent=2)
    log.info("Push result written to %s", push_manifest_path)

    return summary
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
def main() -> None:
    """Parse the command line and push the query log manifest.

    ``--resource-uuid``, ``--key-id`` and ``--key-token`` default to the
    ``MCD_RESOURCE_UUID``, ``MCD_INGEST_ID`` and ``MCD_INGEST_TOKEN``
    environment variables. If any of them ends up unset or empty the parser
    reports an error and exits; otherwise :func:`push` is called.
    """
    parser = argparse.ArgumentParser(description="Push Databricks query logs to Monte Carlo from manifest")
    parser.add_argument("--manifest", default="manifest_query_logs.json")
    parser.add_argument("--resource-uuid", default=os.getenv("MCD_RESOURCE_UUID"))
    parser.add_argument("--key-id", default=os.getenv("MCD_INGEST_ID"))
    parser.add_argument("--key-token", default=os.getenv("MCD_INGEST_TOKEN"))
    parser.add_argument("--batch-size", type=int, default=DEFAULT_BATCH_SIZE)
    args = parser.parse_args()

    required = ["resource_uuid", "key_id", "key_token"]
    # Falsy check rather than "is None": an environment variable exported as
    # an empty string must count as missing instead of being sent as a blank
    # credential.
    missing = [k for k in required if not getattr(args, k)]
    if missing:
        parser.error(f"Missing required arguments/env vars: {missing}")

    push(
        manifest_path=args.manifest,
        resource_uuid=args.resource_uuid,
        key_id=args.key_id,
        key_token=args.key_token,
        batch_size=args.batch_size,
    )
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
if __name__ == "__main__":
|
|
200
|
+
main()
|
package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_and_push_lineage.py
ADDED
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Extract Hive lineage from a local log file and push it to Monte Carlo in one step.
|
|
4
|
+
|
|
5
|
+
Thin wrapper that calls ``collect()`` from ``collect_lineage`` followed by
|
|
6
|
+
``push()`` from ``push_lineage``, then writes the final manifest (with
|
|
7
|
+
``resource_uuid`` and ``invocation_id``) to ``--output-file``.
|
|
8
|
+
|
|
9
|
+
Substitution points
|
|
10
|
+
-------------------
|
|
11
|
+
- MCD_INGEST_ID (env) / --key-id (CLI) : Monte Carlo ingestion key ID
|
|
12
|
+
- MCD_INGEST_TOKEN (env) / --key-token (CLI) : Monte Carlo ingestion key token
|
|
13
|
+
- MCD_RESOURCE_UUID (env) / --resource-uuid (CLI) : MC resource UUID for this connection
|
|
14
|
+
- --log-file : path to local HiveServer2 log
|
|
15
|
+
|
|
16
|
+
Prerequisites
|
|
17
|
+
-------------
|
|
18
|
+
pip install pycarlo python-dotenv
|
|
19
|
+
|
|
20
|
+
Usage (table-level):
|
|
21
|
+
python collect_and_push_lineage.py \\
|
|
22
|
+
--key-id <MCD_INGEST_ID> \\
|
|
23
|
+
--key-token <MCD_INGEST_TOKEN> \\
|
|
24
|
+
--resource-uuid <MCD_RESOURCE_UUID> \\
|
|
25
|
+
--log-file /tmp/root/hive.log
|
|
26
|
+
|
|
27
|
+
Usage (column-level):
|
|
28
|
+
python collect_and_push_lineage.py ... --column-lineage
|
|
29
|
+
"""
|
|
30
|
+
|
|
31
|
+
import argparse
|
|
32
|
+
import json
|
|
33
|
+
import os
|
|
34
|
+
|
|
35
|
+
from collect_lineage import collect
|
|
36
|
+
from push_lineage import DEFAULT_BATCH_SIZE, DEFAULT_TIMEOUT_SECONDS, push
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def main() -> None:
    """Command-line entry point: collect lineage from a Hive log, then push it.

    Credentials and the resource UUID come from the CLI or, failing that,
    from ``MCD_INGEST_ID`` / ``MCD_INGEST_TOKEN`` / ``MCD_RESOURCE_UUID``.
    When ``collect()`` reports no edges a notice is printed and nothing is
    pushed or written. Otherwise ``push()`` is called and the manifest dict
    is then dumped as JSON to ``--output-file``.
    """
    cli = argparse.ArgumentParser(description="Extract Hive lineage from a local log file and push to Monte Carlo")

    # --- collection options ---
    cli.add_argument(
        "--log-file",
        help="Path to local HiveServer2 log file (default: /tmp/root/hive.log)",  # ← SUBSTITUTE: your log path
        default="/tmp/root/hive.log",
    )

    # --- Monte Carlo / push options ---
    cli.add_argument(
        "--key-id",
        help="Monte Carlo ingestion key ID (env: MCD_INGEST_ID)",
        default=os.environ.get("MCD_INGEST_ID"),
    )
    cli.add_argument(
        "--key-token",
        help="Monte Carlo ingestion key token (env: MCD_INGEST_TOKEN)",
        default=os.environ.get("MCD_INGEST_TOKEN"),
    )
    cli.add_argument(
        "--resource-uuid",
        help="Monte Carlo resource UUID for this Hive connection (env: MCD_RESOURCE_UUID)",
        default=os.environ.get("MCD_RESOURCE_UUID"),
    )
    cli.add_argument(
        "--column-lineage",
        help="Push column-level lineage instead of table-level",
        action="store_true",
    )
    cli.add_argument(
        "--output-file",
        help="Path to write the lineage manifest (default: lineage_output.json)",
        default="lineage_output.json",
    )
    cli.add_argument(
        "--batch-size",
        help=f"Max events per POST (default: {DEFAULT_BATCH_SIZE})",
        metavar="N",
        default=DEFAULT_BATCH_SIZE,
        type=int,
    )
    cli.add_argument(
        "--timeout",
        help=f"HTTP timeout per request in seconds (default: {DEFAULT_TIMEOUT_SECONDS})",
        metavar="SEC",
        default=DEFAULT_TIMEOUT_SECONDS,
        type=int,
    )
    opts = cli.parse_args()

    # Both halves of the ingestion key are needed, then the resource UUID.
    if not (opts.key_id and opts.key_token):
        cli.error("--key-id and --key-token are required (or set MCD_INGEST_ID / MCD_INGEST_TOKEN)")
    if not opts.resource_uuid:
        cli.error("--resource-uuid is required (or set MCD_RESOURCE_UUID)")

    manifest = collect(log_file=opts.log_file)

    found_edges = manifest["edges"]
    if not found_edges:
        # Nothing to push: leave without touching the output file.
        print("No lineage edges detected — no CTAS or INSERT INTO ... SELECT patterns found.")
        return

    push(
        manifest=manifest,
        resource_uuid=opts.resource_uuid,
        key_id=opts.key_id,
        key_token=opts.key_token,
        column_lineage=opts.column_lineage,
        batch_size=opts.batch_size,
        timeout_seconds=opts.timeout,
    )

    with open(opts.output_file, "w") as sink:
        json.dump(manifest, sink, indent=2)

    print(f"Lineage manifest written to {opts.output_file}")
    print("Done.")
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
if __name__ == "__main__":
|
|
119
|
+
main()
|
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Collect Hive table metadata and push it to Monte Carlo in one step.
|
|
4
|
+
|
|
5
|
+
Thin wrapper that calls ``collect()`` from ``collect_metadata`` followed by
|
|
6
|
+
``push()`` from ``push_metadata``, then writes the final manifest (with
|
|
7
|
+
``resource_uuid`` and ``invocation_id``) to ``--output-file``.
|
|
8
|
+
|
|
9
|
+
Substitution points
|
|
10
|
+
-------------------
|
|
11
|
+
- HIVE_HOST (env) / --hive-host (CLI) : HiveServer2 hostname
|
|
12
|
+
- MCD_INGEST_ID (env) / --key-id (CLI) : Monte Carlo ingestion key ID
|
|
13
|
+
- MCD_INGEST_TOKEN (env) / --key-token (CLI) : Monte Carlo ingestion key token
|
|
14
|
+
- MCD_RESOURCE_UUID (env) / --resource-uuid (CLI) : MC resource UUID for this connection
|
|
15
|
+
|
|
16
|
+
Prerequisites
|
|
17
|
+
-------------
|
|
18
|
+
pip install pycarlo pyhive python-dotenv
|
|
19
|
+
|
|
20
|
+
Usage
|
|
21
|
+
-----
|
|
22
|
+
python collect_and_push_metadata.py \\
|
|
23
|
+
--key-id <MCD_INGEST_ID> \\
|
|
24
|
+
--key-token <MCD_INGEST_TOKEN> \\
|
|
25
|
+
--resource-uuid <MCD_RESOURCE_UUID> \\
|
|
26
|
+
--hive-host <HIVESERVER2_HOSTNAME>
|
|
27
|
+
"""
|
|
28
|
+
|
|
29
|
+
import argparse
|
|
30
|
+
import json
|
|
31
|
+
import os
|
|
32
|
+
|
|
33
|
+
from collect_metadata import collect
|
|
34
|
+
from push_metadata import DEFAULT_BATCH_SIZE, DEFAULT_TIMEOUT_SECONDS, push
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def main() -> None:
    """Command-line entry point: collect Hive table metadata, then push it.

    The Hive host, ingestion key and resource UUID come from the CLI or,
    failing that, from ``HIVE_HOST`` / ``MCD_INGEST_ID`` /
    ``MCD_INGEST_TOKEN`` / ``MCD_RESOURCE_UUID``; a missing value makes the
    parser report an error. After ``collect()`` and ``push()`` have run, the
    manifest dict is dumped as JSON to ``--output-file``.
    """
    cli = argparse.ArgumentParser(description="Collect Hive table metadata and push to Monte Carlo")

    # --- Hive / collection options ---
    cli.add_argument(
        "--hive-host",
        help="HiveServer2 hostname (env: HIVE_HOST)",  # ← SUBSTITUTE: your EMR master DNS or Hive host
        default=os.environ.get("HIVE_HOST"),
    )
    cli.add_argument(
        "--hive-port",
        help="HiveServer2 port (default: 10000)",  # ← SUBSTITUTE if your cluster uses a non-standard port
        default=10000,
        type=int,
    )

    # --- Monte Carlo / push options ---
    cli.add_argument(
        "--key-id",
        help="Monte Carlo ingestion key ID (env: MCD_INGEST_ID)",  # ← SUBSTITUTE env var name if different
        default=os.environ.get("MCD_INGEST_ID"),
    )
    cli.add_argument(
        "--key-token",
        help="Monte Carlo ingestion key token (env: MCD_INGEST_TOKEN)",  # ← SUBSTITUTE env var name if different
        default=os.environ.get("MCD_INGEST_TOKEN"),
    )
    cli.add_argument(
        "--resource-uuid",
        help="Monte Carlo resource UUID for this Hive connection (env: MCD_RESOURCE_UUID)",
        default=os.environ.get("MCD_RESOURCE_UUID"),
        required=False,
    )
    cli.add_argument(
        "--output-file",
        help="Path to write the output manifest (default: metadata_output.json)",
        default="metadata_output.json",
    )
    cli.add_argument(
        "--batch-size",
        help=f"Max assets per POST (default: {DEFAULT_BATCH_SIZE})",
        metavar="N",
        default=DEFAULT_BATCH_SIZE,
        type=int,
    )
    cli.add_argument(
        "--timeout",
        help=f"HTTP timeout per request in seconds (default: {DEFAULT_TIMEOUT_SECONDS})",
        metavar="SEC",
        default=DEFAULT_TIMEOUT_SECONDS,
        type=int,
    )
    opts = cli.parse_args()

    # Validate in the same order as the options are consumed below.
    if not opts.hive_host:
        cli.error("--hive-host is required (or set HIVE_HOST)")
    if not (opts.key_id and opts.key_token):
        cli.error("--key-id and --key-token are required (or set MCD_INGEST_ID / MCD_INGEST_TOKEN)")
    if not opts.resource_uuid:
        cli.error("--resource-uuid is required (or set MCD_RESOURCE_UUID)")

    manifest = collect(hive_host=opts.hive_host, hive_port=opts.hive_port)

    push(
        manifest=manifest,
        resource_uuid=opts.resource_uuid,
        key_id=opts.key_id,
        key_token=opts.key_token,
        batch_size=opts.batch_size,
        timeout_seconds=opts.timeout,
    )

    with open(opts.output_file, "w") as sink:
        json.dump(manifest, sink, indent=2)

    print(f"Manifest written to {opts.output_file}")
    print("Done.")
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
if __name__ == "__main__":
|
|
119
|
+
main()
|