opencode-skills-collection 2.0.0 → 2.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bundled-skills/.antigravity-install-manifest.json +6 -1
- package/bundled-skills/docs/integrations/jetski-cortex.md +3 -3
- package/bundled-skills/docs/integrations/jetski-gemini-loader/README.md +1 -1
- package/bundled-skills/docs/maintainers/repo-growth-seo.md +3 -3
- package/bundled-skills/docs/maintainers/skills-update-guide.md +1 -1
- package/bundled-skills/docs/users/bundles.md +1 -1
- package/bundled-skills/docs/users/claude-code-skills.md +1 -1
- package/bundled-skills/docs/users/gemini-cli-skills.md +1 -1
- package/bundled-skills/docs/users/getting-started.md +1 -1
- package/bundled-skills/docs/users/kiro-integration.md +1 -1
- package/bundled-skills/docs/users/usage.md +4 -4
- package/bundled-skills/docs/users/visual-guide.md +4 -4
- package/bundled-skills/manage-skills/SKILL.md +187 -0
- package/bundled-skills/monte-carlo-monitor-creation/SKILL.md +222 -0
- package/bundled-skills/monte-carlo-monitor-creation/references/comparison-monitor.md +426 -0
- package/bundled-skills/monte-carlo-monitor-creation/references/custom-sql-monitor.md +207 -0
- package/bundled-skills/monte-carlo-monitor-creation/references/metric-monitor.md +292 -0
- package/bundled-skills/monte-carlo-monitor-creation/references/table-monitor.md +231 -0
- package/bundled-skills/monte-carlo-monitor-creation/references/validation-monitor.md +404 -0
- package/bundled-skills/monte-carlo-prevent/SKILL.md +252 -0
- package/bundled-skills/monte-carlo-prevent/references/TROUBLESHOOTING.md +23 -0
- package/bundled-skills/monte-carlo-prevent/references/parameters.md +32 -0
- package/bundled-skills/monte-carlo-prevent/references/workflows.md +478 -0
- package/bundled-skills/monte-carlo-push-ingestion/SKILL.md +363 -0
- package/bundled-skills/monte-carlo-push-ingestion/references/anomaly-detection.md +87 -0
- package/bundled-skills/monte-carlo-push-ingestion/references/custom-lineage.md +203 -0
- package/bundled-skills/monte-carlo-push-ingestion/references/direct-http-api.md +207 -0
- package/bundled-skills/monte-carlo-push-ingestion/references/prerequisites.md +150 -0
- package/bundled-skills/monte-carlo-push-ingestion/references/push-lineage.md +160 -0
- package/bundled-skills/monte-carlo-push-ingestion/references/push-metadata.md +158 -0
- package/bundled-skills/monte-carlo-push-ingestion/references/push-query-logs.md +219 -0
- package/bundled-skills/monte-carlo-push-ingestion/references/validation.md +257 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/sample_verify.py +357 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_and_push_lineage.py +70 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_and_push_metadata.py +65 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_and_push_query_logs.py +70 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_lineage.py +214 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_metadata.py +160 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_query_logs.py +164 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/push_lineage.py +198 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/push_metadata.py +193 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/push_query_logs.py +207 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/collect_and_push_metadata.py +71 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/collect_and_push_query_logs.py +64 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/collect_metadata.py +253 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/collect_query_logs.py +149 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/push_metadata.py +190 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/push_query_logs.py +208 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_and_push_lineage.py +83 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_and_push_metadata.py +77 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_and_push_query_logs.py +83 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_lineage.py +240 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_metadata.py +212 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_query_logs.py +204 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/push_lineage.py +192 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/push_metadata.py +178 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/push_query_logs.py +200 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_and_push_lineage.py +119 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_and_push_metadata.py +119 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_and_push_query_logs.py +117 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_lineage.py +265 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_metadata.py +313 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_query_logs.py +284 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/push_lineage.py +309 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/push_metadata.py +245 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/push_query_logs.py +255 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_and_push_lineage.py +78 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_and_push_metadata.py +80 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_and_push_query_logs.py +88 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_lineage.py +235 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_metadata.py +219 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_query_logs.py +239 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/push_lineage.py +178 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/push_metadata.py +178 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/push_query_logs.py +196 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_and_push_lineage.py +154 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_and_push_metadata.py +137 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_and_push_query_logs.py +137 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_lineage.py +349 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_metadata.py +329 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_query_logs.py +254 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/push_lineage.py +307 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/push_metadata.py +228 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/push_query_logs.py +248 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/test_template_sdk_usage.py +340 -0
- package/bundled-skills/monte-carlo-validation-notebook/SKILL.md +685 -0
- package/bundled-skills/monte-carlo-validation-notebook/scripts/generate_notebook_url.py +141 -0
- package/bundled-skills/monte-carlo-validation-notebook/scripts/resolve_dbt_schema.py +161 -0
- package/package.json +1 -1
- package/skills_index.json +503 -61
|
@@ -0,0 +1,198 @@
|
|
|
1
|
+
"""
|
|
2
|
+
BigQuery — Lineage Push (push only)
|
|
3
|
+
====================================
|
|
4
|
+
Reads a manifest file produced by ``collect_lineage.py`` and pushes the lineage
|
|
5
|
+
events to Monte Carlo using the pycarlo push ingestion API. Large payloads are
|
|
6
|
+
split into batches to stay under the 1 MB compressed limit.
|
|
7
|
+
|
|
8
|
+
Can be run standalone via CLI or imported (use the ``push()`` function).
|
|
9
|
+
|
|
10
|
+
Substitution points (search for "← SUBSTITUTE"):
|
|
11
|
+
- MCD_INGEST_ID / MCD_INGEST_TOKEN : Monte Carlo API credentials
|
|
12
|
+
- MCD_RESOURCE_UUID : UUID of the BigQuery connection in Monte Carlo
|
|
13
|
+
|
|
14
|
+
Prerequisites:
|
|
15
|
+
pip install pycarlo
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
from __future__ import annotations
|
|
19
|
+
|
|
20
|
+
import argparse
|
|
21
|
+
import json
|
|
22
|
+
import logging
|
|
23
|
+
import os
|
|
24
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
25
|
+
from datetime import datetime, timezone
|
|
26
|
+
|
|
27
|
+
from pycarlo.core import Client, Session
|
|
28
|
+
from pycarlo.features.ingestion import IngestionService
|
|
29
|
+
from pycarlo.features.ingestion.models import (
|
|
30
|
+
LineageAssetRef,
|
|
31
|
+
LineageEvent,
|
|
32
|
+
)
|
|
33
|
+
|
|
34
|
+
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
|
|
35
|
+
log = logging.getLogger(__name__)
|
|
36
|
+
|
|
37
|
+
RESOURCE_TYPE = "bigquery"
|
|
38
|
+
|
|
39
|
+
# Maximum events per batch — conservative default to keep compressed payload under 1 MB
|
|
40
|
+
# ← SUBSTITUTE: tune based on average edge complexity (number of sources per event)
|
|
41
|
+
_BATCH_SIZE = 500
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def _make_ref(database: str, schema: str, table: str) -> LineageAssetRef:
    """Build a ``TABLE``-typed ``LineageAssetRef`` from database, schema and table names."""
    ref_kwargs = {
        "type": "TABLE",
        "name": table,
        "database": database,
        "schema": schema,
    }
    return LineageAssetRef(**ref_kwargs)
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def _build_events(edges: list[dict]) -> list[LineageEvent]:
    """Convert manifest edge dicts into ``LineageEvent`` objects.

    Every edge must carry a ``destination`` mapping with ``database``,
    ``schema`` and ``table`` keys. Edges whose ``sources`` entry is missing
    or empty are skipped.
    """
    built: list[LineageEvent] = []
    for edge in edges:
        # Read the destination first so a malformed edge fails even when it
        # has no sources.
        target = edge["destination"]
        upstream = edge.get("sources", [])
        if upstream:
            target_ref = _make_ref(target["database"], target["schema"], target["table"])
            upstream_refs = [
                _make_ref(src["database"], src["schema"], src["table"])
                for src in upstream
            ]
            built.append(LineageEvent(destination=target_ref, sources=upstream_refs))
    return built
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def push(
    input_file: str,
    resource_uuid: str,
    key_id: str,
    key_token: str,
    batch_size: int = _BATCH_SIZE,
    output_file: str = "lineage_push_result.json",
) -> dict:
    """
    Read a lineage manifest and push events to Monte Carlo in batches.

    Args:
        input_file: Path to the JSON manifest. Its ``edges`` list is turned
            into lineage events; an optional ``resource_type`` key overrides
            the module default ``RESOURCE_TYPE``.
        resource_uuid: Passed to ``IngestionService.send_lineage`` as the
            target resource.
        key_id: Ingestion key id used to build each pycarlo ``Session``.
        key_token: Ingestion key token used to build each pycarlo ``Session``.
        batch_size: Maximum number of events per push call. Must be >= 1.
        output_file: Path the result dict is written to as JSON.

    Returns:
        A result dict with the invocation ID of each batch (in batch order),
        the push timestamp (UTC, ISO 8601), and event/batch counts.

    Raises:
        ValueError: If there are events to push and ``batch_size`` is < 1.
        Exception: Any error raised while pushing a batch is logged and
            re-raised; the result file is not written in that case.
    """
    with open(input_file, encoding="utf-8") as fh:
        manifest = json.load(fh)

    edges = manifest.get("edges", [])
    resource_type = manifest.get("resource_type", RESOURCE_TYPE)
    events = _build_events(edges)
    log.info("Loaded %d lineage event(s) from %s", len(events), input_file)

    def _write_result(invocation_ids: list, batch_count: int) -> dict:
        """Assemble the result dict and persist it to ``output_file``."""
        push_result = {
            "resource_uuid": resource_uuid,
            "resource_type": resource_type,
            "invocation_ids": invocation_ids,
            "pushed_at": datetime.now(timezone.utc).isoformat(),
            "total_events": len(events),
            "batch_count": batch_count,
            "batch_size": batch_size,
        }
        with open(output_file, "w", encoding="utf-8") as fh:
            json.dump(push_result, fh, indent=2)
        return push_result

    if not events:
        log.info("No lineage events to push.")
        return _write_result([], 0)

    # A zero step makes range() raise and a negative one yields no batches
    # (so the executor is asked for 0 workers); fail with a clear message.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    # Split into batches
    batches = [events[i : i + batch_size] for i in range(0, len(events), batch_size)]
    total_batches = len(batches)

    def _push_batch(batch: list, batch_num: int) -> str | None:
        """Push a single batch using a Client/Session created for this call."""
        log.info("Pushing batch %d/%d (%d events) ...", batch_num, total_batches, len(batch))
        client = Client(session=Session(mcd_id=key_id, mcd_token=key_token, scope="Ingestion"))
        service = IngestionService(mc_client=client)
        result = service.send_lineage(
            resource_uuid=resource_uuid,
            resource_type=resource_type,
            events=batch,
        )
        invocation_id = service.extract_invocation_id(result)
        if invocation_id:
            log.info(" Batch %d: invocation_id=%s", batch_num, invocation_id)
        return invocation_id

    # Push batches in parallel (each call builds its own pycarlo Session)
    max_workers = min(4, total_batches)
    invocation_ids: list[str | None] = [None] * total_batches

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        futures = {
            pool.submit(_push_batch, batch, i + 1): i
            for i, batch in enumerate(batches)
        }
        for future in as_completed(futures):
            idx = futures[future]
            try:
                # Store by batch index so the output order matches batch order
                # regardless of completion order.
                invocation_ids[idx] = future.result()
            except Exception as exc:
                log.error("ERROR pushing batch %d: %s", idx + 1, exc)
                raise

    log.info("All %d batches pushed (%d workers)", total_batches, max_workers)

    push_result = _write_result(invocation_ids, total_batches)
    log.info("Push result written to %s", output_file)

    return push_result
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
def main() -> None:
    """CLI entry point: parse arguments and call ``push()``.

    ``--resource-uuid``, ``--key-id`` and ``--key-token`` default to the
    ``MCD_RESOURCE_UUID``, ``MCD_INGEST_ID`` and ``MCD_INGEST_TOKEN``
    environment variables. Exits via ``parser.error`` when any of them is
    missing or empty, or when ``--batch-size`` is not positive.
    """
    parser = argparse.ArgumentParser(
        description="Push BigQuery lineage from a manifest to Monte Carlo",
    )
    parser.add_argument("--resource-uuid", default=os.getenv("MCD_RESOURCE_UUID"))
    parser.add_argument("--key-id", default=os.getenv("MCD_INGEST_ID"))
    parser.add_argument("--key-token", default=os.getenv("MCD_INGEST_TOKEN"))
    parser.add_argument("--input-file", default="lineage_output.json")
    parser.add_argument("--output-file", default="lineage_push_result.json")
    parser.add_argument(
        "--batch-size",
        type=int,
        default=_BATCH_SIZE,
        help=f"Max events per push batch (default: {_BATCH_SIZE})",
    )
    args = parser.parse_args()

    required = ["resource_uuid", "key_id", "key_token"]
    # Treat empty strings (e.g. an env var exported as "") as missing too, so
    # the failure is reported here rather than as an API error later.
    missing = [k for k in required if not getattr(args, k)]
    if missing:
        parser.error(f"Missing required arguments/env vars: {missing}")

    if args.batch_size < 1:
        parser.error("--batch-size must be a positive integer")

    push(
        input_file=args.input_file,
        resource_uuid=args.resource_uuid,
        key_id=args.key_id,
        key_token=args.key_token,
        batch_size=args.batch_size,
        output_file=args.output_file,
    )
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
if __name__ == "__main__":
|
|
198
|
+
main()
|
|
@@ -0,0 +1,193 @@
|
|
|
1
|
+
"""
|
|
2
|
+
BigQuery — Metadata Push (push only)
|
|
3
|
+
=====================================
|
|
4
|
+
Reads a manifest file produced by ``collect_metadata.py`` and pushes the assets
|
|
5
|
+
to Monte Carlo using the pycarlo push ingestion API. Large payloads are split
|
|
6
|
+
into batches to stay under the 1 MB compressed limit.
|
|
7
|
+
|
|
8
|
+
Can be run standalone via CLI or imported (use the ``push()`` function).
|
|
9
|
+
|
|
10
|
+
Substitution points (search for "← SUBSTITUTE"):
|
|
11
|
+
- MCD_INGEST_ID / MCD_INGEST_TOKEN : Monte Carlo API credentials
|
|
12
|
+
- MCD_RESOURCE_UUID : UUID of the BigQuery connection in Monte Carlo
|
|
13
|
+
|
|
14
|
+
Prerequisites:
|
|
15
|
+
pip install pycarlo
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
from __future__ import annotations
|
|
19
|
+
|
|
20
|
+
import argparse
|
|
21
|
+
import json
|
|
22
|
+
import logging
|
|
23
|
+
import os
|
|
24
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
25
|
+
from datetime import datetime, timezone
|
|
26
|
+
|
|
27
|
+
from pycarlo.core import Client, Session
|
|
28
|
+
from pycarlo.features.ingestion import IngestionService
|
|
29
|
+
from pycarlo.features.ingestion.models import (
|
|
30
|
+
AssetField,
|
|
31
|
+
AssetFreshness,
|
|
32
|
+
AssetMetadata,
|
|
33
|
+
AssetVolume,
|
|
34
|
+
RelationalAsset,
|
|
35
|
+
)
|
|
36
|
+
|
|
37
|
+
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
|
|
38
|
+
log = logging.getLogger(__name__)
|
|
39
|
+
|
|
40
|
+
RESOURCE_TYPE = "bigquery"
|
|
41
|
+
|
|
42
|
+
# Maximum assets per batch — conservative default to keep compressed payload under 1 MB
|
|
43
|
+
# ← SUBSTITUTE: tune based on average asset size (fields per table, description length, etc.)
|
|
44
|
+
_BATCH_SIZE = 500
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def _asset_from_dict(d: dict) -> RelationalAsset:
    """Rebuild a ``RelationalAsset`` from one asset entry of the metadata manifest.

    ``name``, ``database`` and ``schema`` are required keys (as is ``name`` on
    every field entry). ``type`` defaults to ``"TABLE"``; volume and freshness
    are only attached when the manifest entry provides a non-empty value.
    """
    columns = []
    for col in d.get("fields", []):
        columns.append(
            AssetField(
                name=col["name"],
                type=col.get("type"),
                description=col.get("description"),
            )
        )

    raw_volume = d.get("volume")
    if raw_volume:
        volume = AssetVolume(
            row_count=raw_volume.get("row_count"),
            byte_count=raw_volume.get("byte_count"),
        )
    else:
        volume = None

    raw_freshness = d.get("freshness")
    if raw_freshness:
        freshness = AssetFreshness(last_update_time=raw_freshness.get("last_update_time"))
    else:
        freshness = None

    asset_type = d.get("type", "TABLE")
    metadata = AssetMetadata(
        name=d["name"],
        database=d["database"],  # ← SUBSTITUTE: use project or dataset as database
        schema=d["schema"],
        description=d.get("description"),
    )
    return RelationalAsset(
        type=asset_type,
        metadata=metadata,
        fields=columns,
        volume=volume,
        freshness=freshness,
    )
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def push(
    input_file: str,
    resource_uuid: str,
    key_id: str,
    key_token: str,
    batch_size: int = _BATCH_SIZE,
    output_file: str = "metadata_push_result.json",
) -> dict:
    """
    Read a metadata manifest and push assets to Monte Carlo in batches.

    Args:
        input_file: Path to the JSON manifest. Its ``assets`` list is turned
            into ``RelationalAsset`` objects; an optional ``resource_type``
            key overrides the module default ``RESOURCE_TYPE``.
        resource_uuid: Passed to ``IngestionService.send_metadata`` as the
            target resource.
        key_id: Ingestion key id used to build each pycarlo ``Session``.
        key_token: Ingestion key token used to build each pycarlo ``Session``.
        batch_size: Maximum number of assets per push call. Must be >= 1.
        output_file: Path the result dict is written to as JSON.

    Returns:
        A result dict with the invocation ID of each batch (in batch order),
        the push timestamp (UTC, ISO 8601), and asset/batch counts. When the
        manifest holds no assets nothing is sent and the counts are zero.

    Raises:
        ValueError: If there are assets to push and ``batch_size`` is < 1.
        Exception: Any error raised while pushing a batch is logged and
            re-raised; the result file is not written in that case.
    """
    with open(input_file, encoding="utf-8") as fh:
        manifest = json.load(fh)

    asset_dicts = manifest.get("assets", [])
    resource_type = manifest.get("resource_type", RESOURCE_TYPE)
    assets = [_asset_from_dict(d) for d in asset_dicts]
    log.info("Loaded %d asset(s) from %s", len(assets), input_file)

    def _write_result(invocation_ids: list, batch_count: int) -> dict:
        """Assemble the result dict and persist it to ``output_file``."""
        push_result = {
            "resource_uuid": resource_uuid,
            "resource_type": resource_type,
            "invocation_ids": invocation_ids,
            "pushed_at": datetime.now(timezone.utc).isoformat(),
            "total_assets": len(assets),
            "batch_count": batch_count,
            "batch_size": batch_size,
        }
        with open(output_file, "w", encoding="utf-8") as fh:
            json.dump(push_result, fh, indent=2)
        return push_result

    # Nothing to send: record an empty result instead of pushing an empty
    # batch to the API.
    if not assets:
        log.info("No assets to push.")
        return _write_result([], 0)

    # A zero step makes range() raise and a negative one yields no batches
    # (so the executor is asked for 0 workers); fail with a clear message.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    # Split into batches
    batches = [assets[i : i + batch_size] for i in range(0, len(assets), batch_size)]
    total_batches = len(batches)

    def _push_batch(batch: list, batch_num: int) -> str | None:
        """Push a single batch using a Client/Session created for this call."""
        client = Client(session=Session(mcd_id=key_id, mcd_token=key_token, scope="Ingestion"))
        service = IngestionService(mc_client=client)
        result = service.send_metadata(
            resource_uuid=resource_uuid,
            resource_type=resource_type,
            events=batch,
        )
        invocation_id = service.extract_invocation_id(result)
        log.info("Pushed batch %d/%d (%d assets) — invocation_id=%s", batch_num, total_batches, len(batch), invocation_id)
        return invocation_id

    # Push batches in parallel (each call builds its own pycarlo Session)
    max_workers = min(4, total_batches)
    invocation_ids: list[str | None] = [None] * total_batches

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        futures = {
            pool.submit(_push_batch, batch, i + 1): i
            for i, batch in enumerate(batches)
        }
        for future in as_completed(futures):
            idx = futures[future]
            try:
                # Store by batch index so the output order matches batch order
                # regardless of completion order.
                invocation_ids[idx] = future.result()
            except Exception as exc:
                log.error("ERROR pushing batch %d: %s", idx + 1, exc)
                raise

    log.info("All %d batches pushed (%d workers)", total_batches, max_workers)

    push_result = _write_result(invocation_ids, total_batches)
    log.info("Push result written to %s", output_file)

    return push_result
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
def main() -> None:
    """CLI entry point: parse arguments and call ``push()``.

    ``--resource-uuid``, ``--key-id`` and ``--key-token`` default to the
    ``MCD_RESOURCE_UUID``, ``MCD_INGEST_ID`` and ``MCD_INGEST_TOKEN``
    environment variables. Exits via ``parser.error`` when any of them is
    missing or empty, or when ``--batch-size`` is not positive.
    """
    parser = argparse.ArgumentParser(
        description="Push BigQuery metadata from a manifest to Monte Carlo",
    )
    parser.add_argument("--resource-uuid", default=os.getenv("MCD_RESOURCE_UUID"))
    parser.add_argument("--key-id", default=os.getenv("MCD_INGEST_ID"))
    parser.add_argument("--key-token", default=os.getenv("MCD_INGEST_TOKEN"))
    parser.add_argument("--input-file", default="metadata_output.json")
    parser.add_argument("--output-file", default="metadata_push_result.json")
    parser.add_argument(
        "--batch-size",
        type=int,
        default=_BATCH_SIZE,
        help=f"Max assets per push batch (default: {_BATCH_SIZE})",
    )
    args = parser.parse_args()

    required = ["resource_uuid", "key_id", "key_token"]
    # Treat empty strings (e.g. an env var exported as "") as missing too, so
    # the failure is reported here rather than as an API error later.
    missing = [k for k in required if not getattr(args, k)]
    if missing:
        parser.error(f"Missing required arguments/env vars: {missing}")

    if args.batch_size < 1:
        parser.error("--batch-size must be a positive integer")

    push(
        input_file=args.input_file,
        resource_uuid=args.resource_uuid,
        key_id=args.key_id,
        key_token=args.key_token,
        batch_size=args.batch_size,
        output_file=args.output_file,
    )
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
if __name__ == "__main__":
|
|
193
|
+
main()
|
package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/push_query_logs.py
ADDED
|
@@ -0,0 +1,207 @@
|
|
|
1
|
+
"""
|
|
2
|
+
BigQuery — Query Log Push (push only)
|
|
3
|
+
======================================
|
|
4
|
+
Reads a manifest file produced by ``collect_query_logs.py`` and pushes the query
|
|
5
|
+
log entries to Monte Carlo using the pycarlo push ingestion API. Large payloads
|
|
6
|
+
are split into batches to stay under the 1 MB compressed limit.
|
|
7
|
+
|
|
8
|
+
Can be run standalone via CLI or imported (use the ``push()`` function).
|
|
9
|
+
|
|
10
|
+
Substitution points (search for "← SUBSTITUTE"):
|
|
11
|
+
- MCD_INGEST_ID / MCD_INGEST_TOKEN : Monte Carlo API credentials
|
|
12
|
+
- MCD_RESOURCE_UUID : UUID of the BigQuery connection in Monte Carlo
|
|
13
|
+
|
|
14
|
+
Prerequisites:
|
|
15
|
+
pip install pycarlo
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
from __future__ import annotations
|
|
19
|
+
|
|
20
|
+
import argparse
|
|
21
|
+
import json
|
|
22
|
+
import logging
|
|
23
|
+
import os
|
|
24
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
25
|
+
from datetime import datetime, timezone
|
|
26
|
+
|
|
27
|
+
from dateutil.parser import isoparse
|
|
28
|
+
from pycarlo.core import Client, Session
|
|
29
|
+
from pycarlo.features.ingestion import IngestionService
|
|
30
|
+
from pycarlo.features.ingestion.models import QueryLogEntry
|
|
31
|
+
|
|
32
|
+
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
|
|
33
|
+
log = logging.getLogger(__name__)
|
|
34
|
+
|
|
35
|
+
LOG_TYPE = "bigquery"
|
|
36
|
+
|
|
37
|
+
# Maximum entries per batch — conservative default to keep compressed payload under 1 MB.
|
|
38
|
+
# Query logs include full SQL text — keep batches small to stay under the 1 MB
|
|
39
|
+
# compressed payload limit. Even 50 entries can trigger a 413 on active warehouses,
# so lower the default below if pushes are rejected.
|
|
40
|
+
# ← SUBSTITUTE: tune based on average query length
|
|
41
|
+
_BATCH_SIZE = 100
|
|
42
|
+
|
|
43
|
+
# Truncate query_text longer than this to prevent 413 errors.
|
|
44
|
+
# Some SQL statements (e.g., generated by BI tools) can be 100KB+ and blow up
|
|
45
|
+
# compressed payloads even at small batch sizes.
|
|
46
|
+
_MAX_QUERY_TEXT_LEN = 10_000
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def _build_query_log_entries(queries: list[dict]) -> list[QueryLogEntry]:
    """Turn manifest query dicts into ``QueryLogEntry`` objects.

    Query text longer than ``_MAX_QUERY_TEXT_LEN`` characters is cut to that
    length and suffixed with ``"... [TRUNCATED]"``; the number of truncated
    queries is logged once at the end. ``total_bytes_billed`` and
    ``statement_type`` are forwarded in ``extra`` when present and not None.
    """
    optional_extra_keys = ("total_bytes_billed", "statement_type")
    built = []
    n_truncated = 0

    for q in queries:
        sql = q.get("query_text") or ""

        # Truncate very long SQL to prevent 413 Request Too Large
        if len(sql) > _MAX_QUERY_TEXT_LEN:
            sql = sql[:_MAX_QUERY_TEXT_LEN] + "... [TRUNCATED]"
            n_truncated += 1

        extra = {
            key: q[key]
            for key in optional_extra_keys
            if q.get(key) is not None
        }

        started = q.get("start_time")
        ended = q.get("end_time")

        built.append(
            QueryLogEntry(
                query_id=q.get("query_id"),
                query_text=sql,
                start_time=isoparse(started) if started else None,
                end_time=isoparse(ended) if ended else None,
                user=q.get("user"),
                extra=extra or None,
            )
        )

    if n_truncated:
        log.info("Truncated %d query text(s) exceeding %d chars", n_truncated, _MAX_QUERY_TEXT_LEN)
    return built
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def push(
    input_file: str,
    resource_uuid: str,
    key_id: str,
    key_token: str,
    batch_size: int = _BATCH_SIZE,
    output_file: str = "query_logs_push_result.json",
) -> dict:
    """
    Read a query log manifest and push entries to Monte Carlo in batches.

    Args:
        input_file: Path to the JSON manifest. Its ``queries`` list is turned
            into query log entries; an optional ``log_type`` key overrides
            the module default ``LOG_TYPE``.
        resource_uuid: Passed to ``IngestionService.send_query_logs`` as the
            target resource.
        key_id: Ingestion key id used to build each pycarlo ``Session``.
        key_token: Ingestion key token used to build each pycarlo ``Session``.
        batch_size: Maximum number of entries per push call. Must be >= 1.
        output_file: Path the result dict is written to as JSON.

    Returns:
        A result dict with the invocation ID of each batch (in batch order),
        the push timestamp (UTC, ISO 8601), and entry/batch counts.

    Raises:
        ValueError: If there are entries to push and ``batch_size`` is < 1.
        Exception: Any error raised while pushing a batch is logged and
            re-raised; the result file is not written in that case.
    """
    with open(input_file, encoding="utf-8") as fh:
        manifest = json.load(fh)

    queries = manifest.get("queries", [])
    log_type = manifest.get("log_type", LOG_TYPE)
    entries = _build_query_log_entries(queries)
    log.info("Loaded %d query log entry/entries from %s", len(entries), input_file)

    def _write_result(invocation_ids: list, batch_count: int) -> dict:
        """Assemble the result dict and persist it to ``output_file``."""
        push_result = {
            "resource_uuid": resource_uuid,
            "log_type": log_type,
            "invocation_ids": invocation_ids,
            "pushed_at": datetime.now(timezone.utc).isoformat(),
            "total_entries": len(entries),
            "batch_count": batch_count,
            "batch_size": batch_size,
        }
        with open(output_file, "w", encoding="utf-8") as fh:
            json.dump(push_result, fh, indent=2)
        return push_result

    if not entries:
        log.info("No query log entries to push.")
        return _write_result([], 0)

    # A zero step makes range() raise and a negative one yields no batches
    # (so the executor is asked for 0 workers); fail with a clear message.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    # Split into batches
    batches = [entries[i : i + batch_size] for i in range(0, len(entries), batch_size)]
    total_batches = len(batches)

    def _push_batch(batch: list, batch_num: int) -> str | None:
        """Push a single batch using a Client/Session created for this call."""
        client = Client(session=Session(mcd_id=key_id, mcd_token=key_token, scope="Ingestion"))
        service = IngestionService(mc_client=client)
        result = service.send_query_logs(
            resource_uuid=resource_uuid,
            log_type=log_type,
            events=batch,
        )
        invocation_id = service.extract_invocation_id(result)
        log.info("Pushed batch %d/%d (%d entries) — invocation_id=%s", batch_num, total_batches, len(batch), invocation_id)
        return invocation_id

    # Push batches in parallel (each call builds its own pycarlo Session)
    max_workers = min(4, total_batches)
    invocation_ids: list[str | None] = [None] * total_batches

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        futures = {
            pool.submit(_push_batch, batch, i + 1): i
            for i, batch in enumerate(batches)
        }
        for future in as_completed(futures):
            idx = futures[future]
            try:
                # Store by batch index so the output order matches batch order
                # regardless of completion order.
                invocation_ids[idx] = future.result()
            except Exception as exc:
                log.error("ERROR pushing batch %d: %s", idx + 1, exc)
                raise

    log.info("All %d batches pushed (%d workers)", total_batches, max_workers)

    push_result = _write_result(invocation_ids, total_batches)
    log.info("Push result written to %s", output_file)

    return push_result
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
def main() -> None:
    """Parse command-line arguments and push query logs to Monte Carlo.

    The resource UUID and ingestion credentials fall back to the
    ``MCD_RESOURCE_UUID``, ``MCD_INGEST_ID`` and ``MCD_INGEST_TOKEN``
    environment variables when the matching flag is omitted.

    Exits through ``parser.error`` (usage message on stderr, exit status 2)
    when a required value is missing or empty, or when ``--batch-size`` is
    not a positive integer.
    """
    parser = argparse.ArgumentParser(
        description="Push BigQuery query logs from a manifest to Monte Carlo",
    )
    parser.add_argument("--resource-uuid", default=os.getenv("MCD_RESOURCE_UUID"))
    parser.add_argument("--key-id", default=os.getenv("MCD_INGEST_ID"))
    parser.add_argument("--key-token", default=os.getenv("MCD_INGEST_TOKEN"))
    parser.add_argument("--input-file", default="query_logs_output.json")
    parser.add_argument("--output-file", default="query_logs_push_result.json")
    parser.add_argument(
        "--batch-size",
        type=int,
        default=_BATCH_SIZE,
        help=f"Max entries per push batch (default: {_BATCH_SIZE})",
    )
    args = parser.parse_args()

    required = ["resource_uuid", "key_id", "key_token"]
    # Truthiness rather than `is None`: an env var exported as "" is just as
    # unusable as an unset one and should fail here, not deep inside push().
    missing = [k for k in required if not getattr(args, k)]
    if missing:
        parser.error(f"Missing required arguments/env vars: {missing}")
    # A batch must hold at least one entry; reject nonsensical sizes up front.
    if args.batch_size < 1:
        parser.error(f"--batch-size must be a positive integer, got {args.batch_size}")

    push(
        input_file=args.input_file,
        resource_uuid=args.resource_uuid,
        key_id=args.key_id,
        key_token=args.key_token,
        batch_size=args.batch_size,
        output_file=args.output_file,
    )
|
|
204
|
+
|
|
205
|
+
|
|
206
|
+
if __name__ == "__main__":
|
|
207
|
+
main()
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
"""
|
|
2
|
+
BigQuery Iceberg — Metadata Collect & Push (combined)
|
|
3
|
+
=====================================================
|
|
4
|
+
Convenience wrapper that runs collect_metadata.collect() followed by
|
|
5
|
+
push_metadata.push() in a single invocation. Supports
|
|
6
|
+
``--only-freshness-and-volume`` for fast periodic pushes.
|
|
7
|
+
|
|
8
|
+
Prerequisites:
|
|
9
|
+
pip install google-cloud-bigquery "pycarlo>=0.12.251"
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import argparse
|
|
15
|
+
import os
|
|
16
|
+
|
|
17
|
+
from collect_metadata import collect
|
|
18
|
+
from push_metadata import push
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def main() -> None:
    """Collect BigQuery Iceberg metadata, then push it to Monte Carlo.

    Runs ``collect()`` to write the manifest file and then ``push()`` to send
    that manifest. The project id, resource UUID and ingestion credentials
    fall back to the ``BIGQUERY_PROJECT_ID``, ``MCD_RESOURCE_UUID``,
    ``MCD_INGEST_ID`` and ``MCD_INGEST_TOKEN`` environment variables when the
    matching flag is omitted.

    All argument validation happens before collection starts, so a bad
    invocation exits through ``parser.error`` (usage message on stderr, exit
    status 2) without running the collection step.
    """
    parser = argparse.ArgumentParser(
        description="Collect BigQuery Iceberg metadata and push to Monte Carlo",
    )
    # Collection args
    parser.add_argument("--project-id", default=os.getenv("BIGQUERY_PROJECT_ID"))
    parser.add_argument("--datasets", nargs="+", default=None)
    parser.add_argument("--tables", nargs="+", default=None)
    parser.add_argument(
        "--only-freshness-and-volume",
        action="store_true",
        help="Skip field/schema collection — only collect freshness and volume.",
    )
    parser.add_argument("--manifest-file", default="metadata_output.json")

    # Push args
    parser.add_argument("--resource-uuid", default=os.getenv("MCD_RESOURCE_UUID"))
    parser.add_argument("--key-id", default=os.getenv("MCD_INGEST_ID"))
    parser.add_argument("--key-token", default=os.getenv("MCD_INGEST_TOKEN"))
    parser.add_argument("--batch-size", type=int, default=500)
    parser.add_argument("--push-result-file", default="metadata_push_result.json")

    args = parser.parse_args()

    if not args.project_id:
        parser.error("--project-id or BIGQUERY_PROJECT_ID env var is required")
    required_push = ["resource_uuid", "key_id", "key_token"]
    # Same truthiness rule as --project-id: an env var exported as "" must be
    # rejected here rather than after the (slow) collection step has run.
    missing = [k for k in required_push if not getattr(args, k)]
    if missing:
        parser.error(f"Missing required push arguments/env vars: {missing}")
    # A batch must hold at least one entry; reject nonsensical sizes up front.
    if args.batch_size < 1:
        parser.error(f"--batch-size must be a positive integer, got {args.batch_size}")

    collect(
        project_id=args.project_id,
        datasets=args.datasets,
        tables=args.tables,
        only_freshness_and_volume=args.only_freshness_and_volume,
        output_file=args.manifest_file,
    )

    push(
        input_file=args.manifest_file,
        resource_uuid=args.resource_uuid,
        key_id=args.key_id,
        key_token=args.key_token,
        batch_size=args.batch_size,
        output_file=args.push_result_file,
    )
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
if __name__ == "__main__":
|
|
71
|
+
main()
|