opencode-skills-collection 2.0.0-beta.3 → 2.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -0
- package/bundled-skills/.antigravity-install-manifest.json +6 -1
- package/bundled-skills/docs/integrations/jetski-cortex.md +3 -3
- package/bundled-skills/docs/integrations/jetski-gemini-loader/README.md +1 -1
- package/bundled-skills/docs/maintainers/repo-growth-seo.md +3 -3
- package/bundled-skills/docs/maintainers/skills-update-guide.md +1 -1
- package/bundled-skills/docs/users/bundles.md +1 -1
- package/bundled-skills/docs/users/claude-code-skills.md +1 -1
- package/bundled-skills/docs/users/gemini-cli-skills.md +1 -1
- package/bundled-skills/docs/users/getting-started.md +1 -1
- package/bundled-skills/docs/users/kiro-integration.md +1 -1
- package/bundled-skills/docs/users/usage.md +4 -4
- package/bundled-skills/docs/users/visual-guide.md +4 -4
- package/bundled-skills/manage-skills/SKILL.md +187 -0
- package/bundled-skills/monte-carlo-monitor-creation/SKILL.md +222 -0
- package/bundled-skills/monte-carlo-monitor-creation/references/comparison-monitor.md +426 -0
- package/bundled-skills/monte-carlo-monitor-creation/references/custom-sql-monitor.md +207 -0
- package/bundled-skills/monte-carlo-monitor-creation/references/metric-monitor.md +292 -0
- package/bundled-skills/monte-carlo-monitor-creation/references/table-monitor.md +231 -0
- package/bundled-skills/monte-carlo-monitor-creation/references/validation-monitor.md +404 -0
- package/bundled-skills/monte-carlo-prevent/SKILL.md +252 -0
- package/bundled-skills/monte-carlo-prevent/references/TROUBLESHOOTING.md +23 -0
- package/bundled-skills/monte-carlo-prevent/references/parameters.md +32 -0
- package/bundled-skills/monte-carlo-prevent/references/workflows.md +478 -0
- package/bundled-skills/monte-carlo-push-ingestion/SKILL.md +363 -0
- package/bundled-skills/monte-carlo-push-ingestion/references/anomaly-detection.md +87 -0
- package/bundled-skills/monte-carlo-push-ingestion/references/custom-lineage.md +203 -0
- package/bundled-skills/monte-carlo-push-ingestion/references/direct-http-api.md +207 -0
- package/bundled-skills/monte-carlo-push-ingestion/references/prerequisites.md +150 -0
- package/bundled-skills/monte-carlo-push-ingestion/references/push-lineage.md +160 -0
- package/bundled-skills/monte-carlo-push-ingestion/references/push-metadata.md +158 -0
- package/bundled-skills/monte-carlo-push-ingestion/references/push-query-logs.md +219 -0
- package/bundled-skills/monte-carlo-push-ingestion/references/validation.md +257 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/sample_verify.py +357 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_and_push_lineage.py +70 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_and_push_metadata.py +65 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_and_push_query_logs.py +70 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_lineage.py +214 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_metadata.py +160 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_query_logs.py +164 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/push_lineage.py +198 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/push_metadata.py +193 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/push_query_logs.py +207 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/collect_and_push_metadata.py +71 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/collect_and_push_query_logs.py +64 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/collect_metadata.py +253 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/collect_query_logs.py +149 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/push_metadata.py +190 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/push_query_logs.py +208 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_and_push_lineage.py +83 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_and_push_metadata.py +77 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_and_push_query_logs.py +83 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_lineage.py +240 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_metadata.py +212 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_query_logs.py +204 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/push_lineage.py +192 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/push_metadata.py +178 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/push_query_logs.py +200 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_and_push_lineage.py +119 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_and_push_metadata.py +119 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_and_push_query_logs.py +117 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_lineage.py +265 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_metadata.py +313 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_query_logs.py +284 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/push_lineage.py +309 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/push_metadata.py +245 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/push_query_logs.py +255 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_and_push_lineage.py +78 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_and_push_metadata.py +80 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_and_push_query_logs.py +88 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_lineage.py +235 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_metadata.py +219 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_query_logs.py +239 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/push_lineage.py +178 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/push_metadata.py +178 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/push_query_logs.py +196 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_and_push_lineage.py +154 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_and_push_metadata.py +137 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_and_push_query_logs.py +137 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_lineage.py +349 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_metadata.py +329 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_query_logs.py +254 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/push_lineage.py +307 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/push_metadata.py +228 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/push_query_logs.py +248 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/test_template_sdk_usage.py +340 -0
- package/bundled-skills/monte-carlo-validation-notebook/SKILL.md +685 -0
- package/bundled-skills/monte-carlo-validation-notebook/scripts/generate_notebook_url.py +141 -0
- package/bundled-skills/monte-carlo-validation-notebook/scripts/resolve_dbt_schema.py +161 -0
- package/package.json +1 -1
- package/skills_index.json +503 -61
|
@@ -0,0 +1,245 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Push a collected Hive metadata manifest to Monte Carlo — push only.
|
|
4
|
+
|
|
5
|
+
Reads a JSON manifest produced by ``collect_metadata.py``, builds
|
|
6
|
+
RelationalAsset objects, and calls ``send_metadata`` in batches. The manifest
|
|
7
|
+
is updated in-place with ``resource_uuid`` and ``invocation_id`` after a
|
|
8
|
+
successful push.
|
|
9
|
+
|
|
10
|
+
Can be run standalone via CLI or imported (use the ``push()`` function).
|
|
11
|
+
|
|
12
|
+
Substitution points
|
|
13
|
+
-------------------
|
|
14
|
+
- MCD_INGEST_ID (env) / --key-id (CLI) : Monte Carlo ingestion key ID
|
|
15
|
+
- MCD_INGEST_TOKEN (env) / --key-token (CLI) : Monte Carlo ingestion key token
|
|
16
|
+
- MCD_RESOURCE_UUID (env) / --resource-uuid (CLI) : MC resource UUID for this connection
|
|
17
|
+
|
|
18
|
+
Prerequisites
|
|
19
|
+
-------------
|
|
20
|
+
pip install pycarlo python-dotenv
|
|
21
|
+
|
|
22
|
+
Usage
|
|
23
|
+
-----
|
|
24
|
+
python push_metadata.py \\
|
|
25
|
+
--key-id <MCD_INGEST_ID> \\
|
|
26
|
+
--key-token <MCD_INGEST_TOKEN> \\
|
|
27
|
+
--resource-uuid <MCD_RESOURCE_UUID> \\
|
|
28
|
+
--input-file metadata_output.json
|
|
29
|
+
"""
|
|
30
|
+
|
|
31
|
+
import argparse
|
|
32
|
+
import json
|
|
33
|
+
import os
|
|
34
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
35
|
+
from datetime import datetime, timezone
|
|
36
|
+
|
|
37
|
+
from pycarlo.core import Client, Session
|
|
38
|
+
from pycarlo.features.ingestion import IngestionService
|
|
39
|
+
from pycarlo.features.ingestion.models import (
|
|
40
|
+
AssetField,
|
|
41
|
+
AssetFreshness,
|
|
42
|
+
AssetMetadata,
|
|
43
|
+
AssetVolume,
|
|
44
|
+
RelationalAsset,
|
|
45
|
+
)
|
|
46
|
+
|
|
47
|
+
# ← SUBSTITUTE: default batch size for metadata push (assets per request)
|
|
48
|
+
DEFAULT_BATCH_SIZE = 500
|
|
49
|
+
|
|
50
|
+
# ← SUBSTITUTE: HTTP timeout for MC ingestion requests (seconds)
|
|
51
|
+
DEFAULT_TIMEOUT_SECONDS = 120
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def _build_assets(manifest: dict) -> list[RelationalAsset]:
    """Rebuild RelationalAsset objects from a collected metadata manifest.

    Args:
        manifest: Dict with an optional ``"assets"`` list. Every asset entry
            must provide ``name``, ``database`` and ``schema``; ``fields``,
            ``row_count``, ``byte_count``, ``last_modified``, ``description``
            and ``created_on`` are optional.

    Returns:
        One ``RelationalAsset`` of type ``"TABLE"`` per manifest entry, in
        manifest order. An empty or missing ``"assets"`` list yields ``[]``.

    Raises:
        KeyError: If an asset lacks ``name``/``database``/``schema`` or one of
            its fields lacks ``name``/``type``.
    """
    assets = []
    for entry in manifest.get("assets", []):
        fields = [
            AssetField(
                name=f["name"],
                type=f["type"],
                description=f.get("description"),
            )
            for f in entry.get("fields", [])
        ]

        # Missing, zero or negative counts are treated as "unknown".
        row_count = entry.get("row_count")
        byte_count = entry.get("byte_count")
        row_count = row_count if row_count and row_count > 0 else None
        byte_count = byte_count if byte_count and byte_count > 0 else None

        # Only attach a volume when at least one usable count survived the
        # normalisation above; otherwise an AssetVolume with both values set
        # to None would be sent.
        volume = None
        if row_count is not None or byte_count is not None:
            volume = AssetVolume(row_count=row_count, byte_count=byte_count)

        # Freshness is attached only when the manifest recorded a timestamp.
        last_modified = entry.get("last_modified")
        freshness = AssetFreshness(last_update_time=last_modified) if last_modified else None

        assets.append(
            RelationalAsset(
                type="TABLE",
                metadata=AssetMetadata(
                    name=entry["name"],
                    database=entry["database"],
                    schema=entry["schema"],
                    description=entry.get("description"),
                    created_on=entry.get("created_on"),
                ),
                fields=fields,
                volume=volume,
                freshness=freshness,
            )
        )
    return assets
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def push(
    manifest: dict,
    resource_uuid: str,
    key_id: str,
    key_token: str,
    batch_size: int = DEFAULT_BATCH_SIZE,
    timeout_seconds: int = DEFAULT_TIMEOUT_SECONDS,
) -> str | None:
    """
    Push collected metadata to Monte Carlo and update the manifest in-place.

    Assets are split into batches of at most ``batch_size`` and sent by up to
    four worker threads. At least one request is always made, even when the
    manifest holds no assets. After every batch has succeeded the manifest
    gains ``resource_uuid``, ``invocation_id`` (from the last batch) and, when
    more than one batch returned an ID, ``invocation_ids`` (one slot per
    batch, in batch order). If any batch fails the manifest is left untouched.

    Args:
        manifest: Dict loaded from a ``collect_metadata.py`` output file.
        resource_uuid: MC resource UUID for this Hive connection.
        key_id: MC ingestion key ID.
        key_token: MC ingestion key token.
        batch_size: Assets per POST request (default 500). Must be >= 1.
        timeout_seconds: Accepted for interface compatibility (default 120).
            NOTE: this value is not currently forwarded to any call made by
            this function.

    Returns:
        The invocation ID of the last batch if returned by MC, otherwise None.

    Raises:
        ValueError: If ``batch_size`` is less than 1.
        Exception: Any error raised while pushing a batch is reported on
            stdout and then re-raised unchanged.
    """
    # Fail fast with a clear message instead of the opaque ValueError that
    # range() / ThreadPoolExecutor would raise further down.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    resource_type = manifest.get("resource_type", "data-lake")

    assets = _build_assets(manifest)
    n = len(assets)

    print(f"Loaded {n} asset(s) from manifest")

    # max(n, 1) guarantees one (possibly empty) batch so a request is still
    # made for an empty manifest and the worker count below is never zero.
    batch_list = [assets[i : i + batch_size] for i in range(0, max(n, 1), batch_size)]
    total_batches = len(batch_list)

    def _push_batch(batch: list, batch_num: int) -> str | None:
        """Push one batch through its own Session/Client so threads share nothing."""
        client = Client(session=Session(mcd_id=key_id, mcd_token=key_token, scope="Ingestion"))
        service = IngestionService(mc_client=client)
        result = service.send_metadata(
            resource_uuid=resource_uuid,
            resource_type=resource_type,
            events=batch,
        )
        invocation_id = service.extract_invocation_id(result)
        print(f" Pushed batch {batch_num}/{total_batches} ({len(batch)} assets) — invocation_id={invocation_id}")
        return invocation_id

    # Push batches in parallel (each thread gets its own pycarlo Session)
    max_workers = min(4, total_batches)
    invocation_ids: list[str | None] = [None] * total_batches

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        # Map each future back to its batch index so results land in batch
        # order even though they complete out of order.
        futures = {
            pool.submit(_push_batch, batch, i + 1): i
            for i, batch in enumerate(batch_list)
        }
        for future in as_completed(futures):
            idx = futures[future]
            try:
                invocation_ids[idx] = future.result()
            except Exception as exc:
                print(f" ERROR pushing batch {idx + 1}: {exc}")
                raise

    print(f" All {total_batches} batches pushed ({max_workers} workers)")

    manifest["resource_uuid"] = resource_uuid
    manifest["invocation_id"] = invocation_ids[-1] if invocation_ids else None
    if sum(1 for inv in invocation_ids if inv) > 1:
        manifest["invocation_ids"] = invocation_ids
    else:
        # Drop a stale list left over from an earlier multi-batch push.
        manifest.pop("invocation_ids", None)

    return manifest.get("invocation_id")
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
def main() -> None:
    """CLI entry point: load a metadata manifest, push it, write it back.

    Credentials and the resource UUID come from CLI flags, falling back to the
    ``MCD_INGEST_ID`` / ``MCD_INGEST_TOKEN`` / ``MCD_RESOURCE_UUID``
    environment variables. The input file is rewritten in place with the
    manifest as updated by ``push()``. Missing credentials, a missing resource
    UUID, or a non-positive ``--batch-size`` abort via ``parser.error``.
    """
    parser = argparse.ArgumentParser(
        description="Push a collected Hive metadata manifest to Monte Carlo",
    )
    parser.add_argument(
        "--key-id",
        default=os.environ.get("MCD_INGEST_ID"),
        help="Monte Carlo ingestion key ID (env: MCD_INGEST_ID)",  # ← SUBSTITUTE env var name if different
    )
    parser.add_argument(
        "--key-token",
        default=os.environ.get("MCD_INGEST_TOKEN"),
        help="Monte Carlo ingestion key token (env: MCD_INGEST_TOKEN)",  # ← SUBSTITUTE env var name if different
    )
    parser.add_argument(
        "--resource-uuid",
        default=os.environ.get("MCD_RESOURCE_UUID"),
        # Not marked required so the env-var fallback can satisfy it; the
        # explicit check after parsing enforces presence.
        required=False,
        help="Monte Carlo resource UUID for this Hive connection (env: MCD_RESOURCE_UUID)",
    )
    parser.add_argument(
        "--input-file",
        default="metadata_output.json",
        help="Path to the JSON manifest written by collect_metadata.py (default: metadata_output.json)",
    )
    parser.add_argument(
        "--batch-size",
        type=int,
        default=DEFAULT_BATCH_SIZE,
        metavar="N",
        help=f"Max assets per POST (default: {DEFAULT_BATCH_SIZE})",
    )
    parser.add_argument(
        "--timeout",
        type=int,
        default=DEFAULT_TIMEOUT_SECONDS,
        metavar="SEC",
        help=f"HTTP timeout per request in seconds (default: {DEFAULT_TIMEOUT_SECONDS})",
    )
    args = parser.parse_args()

    if not args.key_id or not args.key_token:
        parser.error("--key-id and --key-token are required (or set MCD_INGEST_ID / MCD_INGEST_TOKEN)")
    if not args.resource_uuid:
        parser.error("--resource-uuid is required (or set MCD_RESOURCE_UUID)")
    # Reject an unusable batch size here rather than failing with a traceback
    # from deep inside push().
    if args.batch_size < 1:
        parser.error("--batch-size must be a positive integer")

    # Explicit UTF-8 so reading/writing the manifest does not depend on the
    # platform's locale encoding.
    with open(args.input_file, encoding="utf-8") as fh:
        manifest = json.load(fh)

    push(
        manifest=manifest,
        resource_uuid=args.resource_uuid,
        key_id=args.key_id,
        key_token=args.key_token,
        batch_size=args.batch_size,
        timeout_seconds=args.timeout,
    )

    with open(args.input_file, "w", encoding="utf-8") as fh:
        json.dump(manifest, fh, indent=2)
    print(f"Manifest updated in-place: {args.input_file}")
    print("Done.")
|
|
242
|
+
|
|
243
|
+
|
|
244
|
+
if __name__ == "__main__":
|
|
245
|
+
main()
|
|
@@ -0,0 +1,255 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Push a collected Hive query log manifest to Monte Carlo — push only.
|
|
4
|
+
|
|
5
|
+
Reads a JSON manifest produced by ``collect_query_logs.py``, builds
|
|
6
|
+
QueryLogEntry objects, and calls ``send_query_logs`` in batches. The manifest
|
|
7
|
+
is updated in-place with ``resource_uuid`` and ``invocation_id`` after a
|
|
8
|
+
successful push.
|
|
9
|
+
|
|
10
|
+
Can be run standalone via CLI or imported (use the ``push()`` function).
|
|
11
|
+
|
|
12
|
+
Substitution points
|
|
13
|
+
-------------------
|
|
14
|
+
- MCD_INGEST_ID (env) / --key-id (CLI) : Monte Carlo ingestion key ID
|
|
15
|
+
- MCD_INGEST_TOKEN (env) / --key-token (CLI) : Monte Carlo ingestion key token
|
|
16
|
+
- MCD_RESOURCE_UUID (env) / --resource-uuid (CLI) : MC resource UUID (optional for query logs)
|
|
17
|
+
|
|
18
|
+
Prerequisites
|
|
19
|
+
-------------
|
|
20
|
+
pip install pycarlo python-dateutil python-dotenv
|
|
21
|
+
|
|
22
|
+
Usage
|
|
23
|
+
-----
|
|
24
|
+
python push_query_logs.py \\
|
|
25
|
+
--key-id <MCD_INGEST_ID> \\
|
|
26
|
+
--key-token <MCD_INGEST_TOKEN> \\
|
|
27
|
+
--resource-uuid <MCD_RESOURCE_UUID> \\
|
|
28
|
+
--input-file query_logs_output.json
|
|
29
|
+
"""
|
|
30
|
+
|
|
31
|
+
import argparse
|
|
32
|
+
import json
|
|
33
|
+
import os
|
|
34
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
35
|
+
from datetime import datetime, timezone
|
|
36
|
+
|
|
37
|
+
from dateutil.parser import isoparse
|
|
38
|
+
|
|
39
|
+
from pycarlo.core import Client, Session
|
|
40
|
+
from pycarlo.features.ingestion import IngestionService
|
|
41
|
+
from pycarlo.features.ingestion.models import QueryLogEntry
|
|
42
|
+
|
|
43
|
+
# ← SUBSTITUTE: default batch size for query log push (events per request)
|
|
44
|
+
# Query logs include full SQL text — keep batches small to stay under the 1 MB
|
|
45
|
+
# compressed payload limit. Even 50 entries can trigger 413 on active warehouses; lower this value if you see 413 responses.
|
|
46
|
+
DEFAULT_BATCH_SIZE = 100
|
|
47
|
+
|
|
48
|
+
# ← SUBSTITUTE: HTTP timeout for MC ingestion requests (seconds)
|
|
49
|
+
DEFAULT_TIMEOUT_SECONDS = 120
|
|
50
|
+
|
|
51
|
+
# Truncate query_text longer than this to prevent 413 errors.
|
|
52
|
+
# Some SQL statements (e.g., generated by BI tools) can be 100KB+ and blow up
|
|
53
|
+
# compressed payloads even at small batch sizes.
|
|
54
|
+
_MAX_QUERY_TEXT_LEN = 10_000
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def _build_events(manifest: dict) -> list[QueryLogEntry]:
    """
    Turn the ``"queries"`` list of a collected manifest into QueryLogEntry objects.

    ``start_time`` / ``end_time`` are parsed from ISO strings; a parsed value
    without tzinfo is stamped as UTC. A record whose non-empty ``query_id``
    was already seen is skipped. Query text longer than
    ``_MAX_QUERY_TEXT_LEN`` is cut and suffixed with a truncation marker, and
    the number of truncated entries is reported on stdout.
    """

    def _parse_utc(raw):
        # Parse an ISO timestamp; assume UTC when no tzinfo is present.
        parsed = isoparse(raw)
        return parsed if parsed.tzinfo else parsed.replace(tzinfo=timezone.utc)

    entries = []
    known_ids: set[str] = set()
    n_truncated = 0

    for record in manifest.get("queries", []):
        query_id = record.get("query_id")
        if query_id:
            # Records without an ID are never deduplicated.
            if query_id in known_ids:
                continue
            known_ids.add(query_id)

        started = _parse_utc(record["start_time"])
        finished = _parse_utc(record["end_time"])

        sql = record.get("query") or ""
        # Truncate very long SQL to prevent 413 Request Too Large
        if len(sql) > _MAX_QUERY_TEXT_LEN:
            sql = sql[:_MAX_QUERY_TEXT_LEN] + "... [TRUNCATED]"
            n_truncated += 1

        entry = QueryLogEntry(
            start_time=started,
            end_time=finished,
            query_text=sql,
            query_id=query_id or None,
            user=record.get("user", "hadoop"),  # ← SUBSTITUTE: set the user appropriate for your cluster
            returned_rows=record.get("returned_rows"),
        )
        entries.append(entry)

    if n_truncated:
        print(f" Truncated {n_truncated} query text(s) exceeding {_MAX_QUERY_TEXT_LEN} chars")
    return entries
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def push(
    manifest: dict,
    key_id: str,
    key_token: str,
    resource_uuid: str | None = None,
    batch_size: int = DEFAULT_BATCH_SIZE,
    timeout_seconds: int = DEFAULT_TIMEOUT_SECONDS,
) -> str | None:
    """
    Push collected query logs to Monte Carlo and update the manifest in-place.

    Events are split into batches of at most ``batch_size`` and sent by up to
    four worker threads. When the manifest yields no events nothing is sent;
    the manifest is still stamped with ``log_type``, ``resource_uuid`` (if
    given) and ``invocation_id = None``. After a successful push the manifest
    gains ``log_type``, ``resource_uuid`` (if given), ``invocation_id`` (from
    the last batch) and, when more than one batch returned an ID,
    ``invocation_ids`` (one slot per batch, in batch order). If any batch
    fails the manifest is left untouched.

    Args:
        manifest: Dict loaded from a ``collect_query_logs.py`` output file.
        key_id: MC ingestion key ID.
        key_token: MC ingestion key token.
        resource_uuid: Optional MC resource UUID.
        batch_size: Events per POST request (default 100). Must be >= 1 when
            there are events to push.
        timeout_seconds: Accepted for interface compatibility (default 120).
            NOTE: this value is not currently forwarded to any call made by
            this function.

    Returns:
        The invocation ID of the last batch if returned by MC, otherwise None.

    Raises:
        ValueError: If there are events to push and ``batch_size`` is less
            than 1.
        Exception: Any error raised while pushing a batch is reported on
            stdout and then re-raised unchanged.
    """
    log_type = manifest.get("log_type", "hive-s3")

    events = _build_events(manifest)
    n = len(events)
    print(f"Loaded {n} query log entry/entries from manifest")

    if not events:
        print("No query log entries to push.")
        manifest["log_type"] = log_type
        if resource_uuid is not None:
            manifest["resource_uuid"] = resource_uuid
        manifest["invocation_id"] = None
        return None

    # Checked after the empty-manifest return so that path behaves as before;
    # replaces the opaque ValueError range()/ThreadPoolExecutor would raise.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    batch_list = [events[i : i + batch_size] for i in range(0, n, batch_size)]
    total_batches = len(batch_list)

    def _push_batch(batch: list, batch_num: int) -> str | None:
        """Push one batch through its own Session/Client so threads share nothing."""
        client = Client(session=Session(mcd_id=key_id, mcd_token=key_token, scope="Ingestion"))
        service = IngestionService(mc_client=client)
        result = service.send_query_logs(
            resource_uuid=resource_uuid,
            log_type=log_type,
            events=batch,
        )
        invocation_id = service.extract_invocation_id(result)
        print(f" Pushed batch {batch_num}/{total_batches} ({len(batch)} entries) — invocation_id={invocation_id}")
        return invocation_id

    # Push batches in parallel (each thread gets its own pycarlo Session)
    max_workers = min(4, total_batches)
    invocation_ids: list[str | None] = [None] * total_batches

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        # Map each future back to its batch index so results land in batch
        # order even though they complete out of order.
        futures = {
            pool.submit(_push_batch, batch, i + 1): i
            for i, batch in enumerate(batch_list)
        }
        for future in as_completed(futures):
            idx = futures[future]
            try:
                invocation_ids[idx] = future.result()
            except Exception as exc:
                print(f" ERROR pushing batch {idx + 1}: {exc}")
                raise

    print(f" All {total_batches} batches pushed ({max_workers} workers)")

    manifest["log_type"] = log_type
    if resource_uuid is not None:
        manifest["resource_uuid"] = resource_uuid
    manifest["invocation_id"] = invocation_ids[-1] if invocation_ids else None
    if sum(1 for inv in invocation_ids if inv) > 1:
        manifest["invocation_ids"] = invocation_ids
    else:
        # Drop a stale list left over from an earlier multi-batch push.
        manifest.pop("invocation_ids", None)

    return manifest.get("invocation_id")
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
def main() -> None:
    """CLI entry point: load a query log manifest, push it, write it back.

    Credentials come from CLI flags, falling back to the ``MCD_INGEST_ID`` /
    ``MCD_INGEST_TOKEN`` environment variables; the resource UUID
    (``--resource-uuid`` / ``MCD_RESOURCE_UUID``) is optional. The input file
    is rewritten in place with the manifest as updated by ``push()``. Missing
    credentials or a non-positive ``--batch-size`` abort via ``parser.error``.
    """
    parser = argparse.ArgumentParser(
        description="Push a collected Hive query log manifest to Monte Carlo",
    )
    parser.add_argument(
        "--key-id",
        default=os.environ.get("MCD_INGEST_ID"),
        help="Monte Carlo ingestion key ID (env: MCD_INGEST_ID)",
    )
    parser.add_argument(
        "--key-token",
        default=os.environ.get("MCD_INGEST_TOKEN"),
        help="Monte Carlo ingestion key token (env: MCD_INGEST_TOKEN)",
    )
    parser.add_argument(
        "--resource-uuid",
        default=os.environ.get("MCD_RESOURCE_UUID"),
        help="Monte Carlo resource UUID (optional for query logs) (env: MCD_RESOURCE_UUID)",
    )
    parser.add_argument(
        "--input-file",
        default="query_logs_output.json",
        help="Path to the JSON manifest written by collect_query_logs.py (default: query_logs_output.json)",
    )
    parser.add_argument(
        "--batch-size",
        type=int,
        default=DEFAULT_BATCH_SIZE,
        metavar="N",
        help=f"Max events per POST (default: {DEFAULT_BATCH_SIZE})",
    )
    parser.add_argument(
        "--timeout",
        type=int,
        default=DEFAULT_TIMEOUT_SECONDS,
        metavar="SEC",
        help=f"HTTP timeout per request in seconds (default: {DEFAULT_TIMEOUT_SECONDS})",
    )
    args = parser.parse_args()

    if not args.key_id or not args.key_token:
        parser.error("--key-id and --key-token are required (or set MCD_INGEST_ID / MCD_INGEST_TOKEN)")
    # Reject an unusable batch size here rather than failing with a traceback
    # from deep inside push().
    if args.batch_size < 1:
        parser.error("--batch-size must be a positive integer")

    # Explicit UTF-8 so reading/writing the manifest does not depend on the
    # platform's locale encoding.
    with open(args.input_file, encoding="utf-8") as fh:
        manifest = json.load(fh)

    push(
        manifest=manifest,
        key_id=args.key_id,
        key_token=args.key_token,
        resource_uuid=args.resource_uuid,
        batch_size=args.batch_size,
        timeout_seconds=args.timeout,
    )

    with open(args.input_file, "w", encoding="utf-8") as fh:
        json.dump(manifest, fh, indent=2)
    print(f"Manifest updated in-place: {args.input_file}")
    print("Done.")
|
|
252
|
+
|
|
253
|
+
|
|
254
|
+
if __name__ == "__main__":
|
|
255
|
+
main()
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Redshift — Lineage Collect & Push (combined)
|
|
3
|
+
==============================================
|
|
4
|
+
Collects table-level lineage from Redshift by parsing query history, then pushes
|
|
5
|
+
the derived lineage events to Monte Carlo via the push ingestion API.
|
|
6
|
+
|
|
7
|
+
This script imports and calls collect() from collect_lineage and push() from
|
|
8
|
+
push_lineage, running both in sequence.
|
|
9
|
+
|
|
10
|
+
Substitution points (search for "← SUBSTITUTE"):
|
|
11
|
+
- REDSHIFT_HOST / REDSHIFT_DB / REDSHIFT_USER / REDSHIFT_PASSWORD : connection
|
|
12
|
+
- LOOKBACK_HOURS : how far back to scan query history (default 24 h)
|
|
13
|
+
- MCD_INGEST_ID / MCD_INGEST_TOKEN : Monte Carlo API credentials
|
|
14
|
+
- MCD_RESOURCE_UUID : UUID of the Redshift connection in Monte Carlo
|
|
15
|
+
- --batch-size (DEFAULT_BATCH_SIZE in push_lineage) : number of events per API call (default 500)
|
|
16
|
+
|
|
17
|
+
Prerequisites:
|
|
18
|
+
pip install psycopg2-binary pycarlo
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
from __future__ import annotations
|
|
22
|
+
|
|
23
|
+
import argparse
|
|
24
|
+
import logging
|
|
25
|
+
import os
|
|
26
|
+
|
|
27
|
+
from collect_lineage import LOOKBACK_HOURS, collect
|
|
28
|
+
from push_lineage import DEFAULT_BATCH_SIZE, push
|
|
29
|
+
|
|
30
|
+
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
|
|
31
|
+
log = logging.getLogger(__name__)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def main() -> None:
    """Collect Redshift lineage and push it to Monte Carlo in one run.

    Each option falls back to an environment variable (or to a default
    imported from ``collect_lineage`` / ``push_lineage``) when its flag is
    omitted. ``collect()`` is called first with the Redshift connection
    settings and the manifest path; ``push()`` is then called with the same
    manifest path and the Monte Carlo credentials.

    Exits via ``parser.error`` when a required connection or credential value
    is missing or empty, or when the port is not a valid integer.
    """
    parser = argparse.ArgumentParser(description="Collect and push Redshift lineage to Monte Carlo")
    parser.add_argument("--host", default=os.getenv("REDSHIFT_HOST"))  # ← SUBSTITUTE
    parser.add_argument("--db", default=os.getenv("REDSHIFT_DB"))  # ← SUBSTITUTE
    parser.add_argument("--user", default=os.getenv("REDSHIFT_USER"))  # ← SUBSTITUTE
    parser.add_argument("--password", default=os.getenv("REDSHIFT_PASSWORD"))  # ← SUBSTITUTE
    # Keep the default as a string: argparse runs string defaults through
    # type=int itself, so a malformed REDSHIFT_PORT becomes a usage error
    # instead of a ValueError traceback raised while building the parser.
    # An empty REDSHIFT_PORT falls back to the standard port.
    parser.add_argument("--port", type=int, default=os.getenv("REDSHIFT_PORT") or "5439")
    parser.add_argument("--resource-uuid", default=os.getenv("MCD_RESOURCE_UUID"))
    parser.add_argument("--key-id", default=os.getenv("MCD_INGEST_ID"))
    parser.add_argument("--key-token", default=os.getenv("MCD_INGEST_TOKEN"))
    parser.add_argument("--lookback-hours", type=int, default=LOOKBACK_HOURS)
    parser.add_argument("--batch-size", type=int, default=DEFAULT_BATCH_SIZE)
    parser.add_argument("--manifest", default="manifest_lineage.json")
    args = parser.parse_args()

    required = ["host", "db", "user", "password", "resource_uuid", "key_id", "key_token"]
    # Falsy check rather than "is None": an env var exported as an empty
    # string is just as unusable as one that is not set at all.
    missing = [k for k in required if not getattr(args, k)]
    if missing:
        parser.error(f"Missing required arguments/env vars: {missing}")

    log.info("Step 1: Collecting lineage …")
    collect(
        host=args.host,
        db=args.db,
        user=args.user,
        password=args.password,
        manifest_path=args.manifest,
        port=args.port,
        lookback_hours=args.lookback_hours,
    )

    log.info("Step 2: Pushing lineage to Monte Carlo …")
    push(
        manifest_path=args.manifest,
        resource_uuid=args.resource_uuid,
        key_id=args.key_id,
        key_token=args.key_token,
        batch_size=args.batch_size,
    )

    log.info("Done — collect and push complete.")
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
if __name__ == "__main__":
|
|
78
|
+
main()
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Redshift — Metadata Collect & Push (combined)
|
|
3
|
+
===============================================
|
|
4
|
+
Collects table schemas, row counts, and byte sizes from Amazon Redshift,
|
|
5
|
+
then pushes them to Monte Carlo via the push ingestion API.
|
|
6
|
+
|
|
7
|
+
This script imports and calls collect() from collect_metadata and push() from
|
|
8
|
+
push_metadata, running both in sequence.
|
|
9
|
+
|
|
10
|
+
Substitution points (search for "← SUBSTITUTE"):
|
|
11
|
+
- REDSHIFT_HOST : Redshift cluster endpoint or serverless workgroup endpoint
|
|
12
|
+
- REDSHIFT_DB : database name to connect to
|
|
13
|
+
- REDSHIFT_USER : database user (or IAM role user)
|
|
14
|
+
- REDSHIFT_PASSWORD : database password
|
|
15
|
+
- DB_EXCLUSIONS : databases to skip
|
|
16
|
+
- SCHEMA_EXCLUSIONS : schemas to skip in every database
|
|
17
|
+
- MCD_INGEST_ID / MCD_INGEST_TOKEN : Monte Carlo API credentials
|
|
18
|
+
- MCD_RESOURCE_UUID : UUID of the Redshift connection in Monte Carlo
|
|
19
|
+
- PUSH_BATCH_SIZE : number of assets per API call (default 500)
|
|
20
|
+
|
|
21
|
+
Prerequisites:
|
|
22
|
+
pip install psycopg2-binary pycarlo
|
|
23
|
+
"""
|
|
24
|
+
|
|
25
|
+
from __future__ import annotations
|
|
26
|
+
|
|
27
|
+
import argparse
|
|
28
|
+
import logging
|
|
29
|
+
import os
|
|
30
|
+
|
|
31
|
+
from collect_metadata import collect
|
|
32
|
+
from push_metadata import DEFAULT_BATCH_SIZE, push
|
|
33
|
+
|
|
34
|
+
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
|
|
35
|
+
log = logging.getLogger(__name__)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def main() -> None:
    """Collect Redshift metadata and push it to Monte Carlo in one run.

    Each option falls back to an environment variable (or to the default
    imported from ``push_metadata``) when its flag is omitted. ``collect()``
    is called first with the Redshift connection settings and the manifest
    path; ``push()`` is then called with the same manifest path and the
    Monte Carlo credentials.

    Exits via ``parser.error`` when a required connection or credential value
    is missing or empty, or when the port is not a valid integer.
    """
    parser = argparse.ArgumentParser(description="Collect and push Redshift metadata to Monte Carlo")
    parser.add_argument("--host", default=os.getenv("REDSHIFT_HOST"))  # ← SUBSTITUTE
    parser.add_argument("--db", default=os.getenv("REDSHIFT_DB"))  # ← SUBSTITUTE
    parser.add_argument("--user", default=os.getenv("REDSHIFT_USER"))  # ← SUBSTITUTE
    parser.add_argument("--password", default=os.getenv("REDSHIFT_PASSWORD"))  # ← SUBSTITUTE
    # Keep the default as a string: argparse runs string defaults through
    # type=int itself, so a malformed REDSHIFT_PORT becomes a usage error
    # instead of a ValueError traceback raised while building the parser.
    # An empty REDSHIFT_PORT falls back to the standard port.
    parser.add_argument("--port", type=int, default=os.getenv("REDSHIFT_PORT") or "5439")
    parser.add_argument("--resource-uuid", default=os.getenv("MCD_RESOURCE_UUID"))
    parser.add_argument("--key-id", default=os.getenv("MCD_INGEST_ID"))
    parser.add_argument("--key-token", default=os.getenv("MCD_INGEST_TOKEN"))
    parser.add_argument("--batch-size", type=int, default=DEFAULT_BATCH_SIZE)
    parser.add_argument("--manifest", default="manifest_metadata.json")
    args = parser.parse_args()

    required = ["host", "db", "user", "password", "resource_uuid", "key_id", "key_token"]
    # Falsy check rather than "is None": an env var exported as an empty
    # string is just as unusable as one that is not set at all.
    missing = [k for k in required if not getattr(args, k)]
    if missing:
        parser.error(f"Missing required arguments/env vars: {missing}")

    log.info("Step 1: Collecting metadata …")
    collect(
        host=args.host,
        db=args.db,
        user=args.user,
        password=args.password,
        manifest_path=args.manifest,
        port=args.port,
    )

    log.info("Step 2: Pushing metadata to Monte Carlo …")
    push(
        manifest_path=args.manifest,
        resource_uuid=args.resource_uuid,
        key_id=args.key_id,
        key_token=args.key_token,
        batch_size=args.batch_size,
    )

    log.info("Done — collect and push complete.")
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
if __name__ == "__main__":
|
|
80
|
+
main()
|