opencode-skills-collection 2.0.0 → 2.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bundled-skills/.antigravity-install-manifest.json +6 -1
- package/bundled-skills/docs/integrations/jetski-cortex.md +3 -3
- package/bundled-skills/docs/integrations/jetski-gemini-loader/README.md +1 -1
- package/bundled-skills/docs/maintainers/repo-growth-seo.md +3 -3
- package/bundled-skills/docs/maintainers/skills-update-guide.md +1 -1
- package/bundled-skills/docs/users/bundles.md +1 -1
- package/bundled-skills/docs/users/claude-code-skills.md +1 -1
- package/bundled-skills/docs/users/gemini-cli-skills.md +1 -1
- package/bundled-skills/docs/users/getting-started.md +1 -1
- package/bundled-skills/docs/users/kiro-integration.md +1 -1
- package/bundled-skills/docs/users/usage.md +4 -4
- package/bundled-skills/docs/users/visual-guide.md +4 -4
- package/bundled-skills/manage-skills/SKILL.md +187 -0
- package/bundled-skills/monte-carlo-monitor-creation/SKILL.md +222 -0
- package/bundled-skills/monte-carlo-monitor-creation/references/comparison-monitor.md +426 -0
- package/bundled-skills/monte-carlo-monitor-creation/references/custom-sql-monitor.md +207 -0
- package/bundled-skills/monte-carlo-monitor-creation/references/metric-monitor.md +292 -0
- package/bundled-skills/monte-carlo-monitor-creation/references/table-monitor.md +231 -0
- package/bundled-skills/monte-carlo-monitor-creation/references/validation-monitor.md +404 -0
- package/bundled-skills/monte-carlo-prevent/SKILL.md +252 -0
- package/bundled-skills/monte-carlo-prevent/references/TROUBLESHOOTING.md +23 -0
- package/bundled-skills/monte-carlo-prevent/references/parameters.md +32 -0
- package/bundled-skills/monte-carlo-prevent/references/workflows.md +478 -0
- package/bundled-skills/monte-carlo-push-ingestion/SKILL.md +363 -0
- package/bundled-skills/monte-carlo-push-ingestion/references/anomaly-detection.md +87 -0
- package/bundled-skills/monte-carlo-push-ingestion/references/custom-lineage.md +203 -0
- package/bundled-skills/monte-carlo-push-ingestion/references/direct-http-api.md +207 -0
- package/bundled-skills/monte-carlo-push-ingestion/references/prerequisites.md +150 -0
- package/bundled-skills/monte-carlo-push-ingestion/references/push-lineage.md +160 -0
- package/bundled-skills/monte-carlo-push-ingestion/references/push-metadata.md +158 -0
- package/bundled-skills/monte-carlo-push-ingestion/references/push-query-logs.md +219 -0
- package/bundled-skills/monte-carlo-push-ingestion/references/validation.md +257 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/sample_verify.py +357 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_and_push_lineage.py +70 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_and_push_metadata.py +65 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_and_push_query_logs.py +70 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_lineage.py +214 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_metadata.py +160 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_query_logs.py +164 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/push_lineage.py +198 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/push_metadata.py +193 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/push_query_logs.py +207 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/collect_and_push_metadata.py +71 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/collect_and_push_query_logs.py +64 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/collect_metadata.py +253 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/collect_query_logs.py +149 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/push_metadata.py +190 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/push_query_logs.py +208 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_and_push_lineage.py +83 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_and_push_metadata.py +77 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_and_push_query_logs.py +83 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_lineage.py +240 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_metadata.py +212 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_query_logs.py +204 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/push_lineage.py +192 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/push_metadata.py +178 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/push_query_logs.py +200 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_and_push_lineage.py +119 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_and_push_metadata.py +119 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_and_push_query_logs.py +117 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_lineage.py +265 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_metadata.py +313 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_query_logs.py +284 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/push_lineage.py +309 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/push_metadata.py +245 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/push_query_logs.py +255 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_and_push_lineage.py +78 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_and_push_metadata.py +80 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_and_push_query_logs.py +88 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_lineage.py +235 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_metadata.py +219 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_query_logs.py +239 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/push_lineage.py +178 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/push_metadata.py +178 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/push_query_logs.py +196 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_and_push_lineage.py +154 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_and_push_metadata.py +137 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_and_push_query_logs.py +137 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_lineage.py +349 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_metadata.py +329 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_query_logs.py +254 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/push_lineage.py +307 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/push_metadata.py +228 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/push_query_logs.py +248 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/test_template_sdk_usage.py +340 -0
- package/bundled-skills/monte-carlo-validation-notebook/SKILL.md +685 -0
- package/bundled-skills/monte-carlo-validation-notebook/scripts/generate_notebook_url.py +141 -0
- package/bundled-skills/monte-carlo-validation-notebook/scripts/resolve_dbt_schema.py +161 -0
- package/package.json +1 -1
- package/skills_index.json +503 -61
package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_query_logs.py
ADDED
|
@@ -0,0 +1,239 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Redshift — Query Log Collection (collect-only)
|
|
3
|
+
================================================
|
|
4
|
+
Collects completed query execution records from Redshift using sys_query_history
|
|
5
|
+
and sys_querytext (modern RA3/serverless), assembles full SQL text from
|
|
6
|
+
multi-row text chunks, and writes a JSON manifest file that can be consumed
|
|
7
|
+
by push_query_logs.py.
|
|
8
|
+
|
|
9
|
+
Substitution points (search for "← SUBSTITUTE"):
|
|
10
|
+
- REDSHIFT_HOST / REDSHIFT_DB / REDSHIFT_USER / REDSHIFT_PASSWORD : connection
|
|
11
|
+
- LOOKBACK_HOURS : hours before now at which the collection window starts (default 25)
|
|
12
|
+
- LOOKBACK_LAG_HOURS: lag behind now to avoid in-flight queries (default 1)
|
|
13
|
+
- BATCH_SIZE : number of query_ids to fetch texts for in one SQL call
|
|
14
|
+
- MAX_QUERIES : maximum query rows to process per run
|
|
15
|
+
|
|
16
|
+
Prerequisites:
|
|
17
|
+
pip install psycopg2-binary
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
from __future__ import annotations
|
|
21
|
+
|
|
22
|
+
import argparse
|
|
23
|
+
import json
|
|
24
|
+
import logging
|
|
25
|
+
import os
|
|
26
|
+
from datetime import datetime, timezone
|
|
27
|
+
from typing import Any
|
|
28
|
+
|
|
29
|
+
import psycopg2
|
|
30
|
+
|
|
31
|
+
# Timestamped INFO-level logging for the whole script.
logging.basicConfig(
    format="%(asctime)s %(levelname)s %(message)s",
    level=logging.INFO,
)
log = logging.getLogger(__name__)

# Written to the manifest's "log_type" field.
LOG_TYPE = "redshift"

# Tunables: each one is read from the environment variable of the same name
# and falls back to the default shown when that variable is unset.
LOOKBACK_HOURS: int = int(os.environ.get("LOOKBACK_HOURS", "25"))  # ← SUBSTITUTE
LOOKBACK_LAG_HOURS: int = int(os.environ.get("LOOKBACK_LAG_HOURS", "1"))  # ← SUBSTITUTE
BATCH_SIZE: int = int(os.environ.get("BATCH_SIZE", "200"))  # ← SUBSTITUTE
MAX_QUERIES: int = int(os.environ.get("MAX_QUERIES", "10000"))  # ← SUBSTITUTE
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def _check_available_memory(min_gb: float = 2.0) -> None:
|
|
43
|
+
"""Warn if available memory is below the threshold."""
|
|
44
|
+
try:
|
|
45
|
+
if hasattr(os, "sysconf"): # Linux / macOS
|
|
46
|
+
page_size = os.sysconf("SC_PAGE_SIZE")
|
|
47
|
+
avail_pages = os.sysconf("SC_AVPHYS_PAGES")
|
|
48
|
+
avail_gb = (page_size * avail_pages) / (1024 ** 3)
|
|
49
|
+
else:
|
|
50
|
+
return # Windows — skip check
|
|
51
|
+
except (ValueError, OSError):
|
|
52
|
+
return
|
|
53
|
+
if avail_gb < min_gb:
|
|
54
|
+
log.warning(
|
|
55
|
+
"Only %.1f GB of memory available (minimum recommended: %.1f GB). "
|
|
56
|
+
"Consider reducing the collection scope or increasing available memory.",
|
|
57
|
+
avail_gb,
|
|
58
|
+
min_gb,
|
|
59
|
+
)
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def _dictfetch(cursor: Any, sql: str, params: tuple | None = None) -> list[dict[str, Any]]:
|
|
63
|
+
cursor.execute(sql, params)
|
|
64
|
+
cols = [d.name for d in cursor.description]
|
|
65
|
+
rows = []
|
|
66
|
+
while True:
|
|
67
|
+
chunk = cursor.fetchmany(1000)
|
|
68
|
+
if not chunk:
|
|
69
|
+
break
|
|
70
|
+
rows.extend(dict(zip(cols, row)) for row in chunk)
|
|
71
|
+
return rows
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def _safe_isoformat(dt: Any) -> str | None:
|
|
75
|
+
if dt is None:
|
|
76
|
+
return None
|
|
77
|
+
if hasattr(dt, "isoformat"):
|
|
78
|
+
if dt.tzinfo is None:
|
|
79
|
+
dt = dt.replace(tzinfo=timezone.utc)
|
|
80
|
+
return dt.isoformat()
|
|
81
|
+
return str(dt)
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def fetch_query_metadata(
    cursor: Any,
    lookback_hours: int,
    lag_hours: int,
    max_queries: int,
) -> list[dict[str, Any]]:
    """Fetch execution metadata for successful queries from sys_query_history.

    The window is ``[GETDATE() - lookback_hours, GETDATE() - lag_hours)``,
    evaluated by the server. Rows are ordered by ``start_time`` and capped at
    ``max_queries``.

    Args:
        cursor: Open cursor passed through to ``_dictfetch``.
        lookback_hours: Hours before now at which the window starts.
        lag_hours: Hours before now at which the window ends.
        max_queries: Maximum number of rows to return.

    Returns:
        One dict per query with the selected sys_query_history columns.

    Raises:
        ValueError, TypeError: If a numeric argument cannot be coerced to int.
    """
    # Coerce before interpolating so only integers can reach the SQL text.
    # Interpolating the signed offset (not "-{value}") keeps a negative input
    # from producing "--N", which SQL would parse as the start of a comment.
    start_offset = -int(lookback_hours)
    end_offset = -int(lag_hours)
    row_limit = int(max_queries)
    return _dictfetch(
        cursor,
        f"""
        SELECT
            query_id,
            start_time,
            end_time,
            status,
            user_id,
            database_name,
            elapsed_time
        FROM sys_query_history
        WHERE start_time >= DATEADD(hour, {start_offset}, GETDATE())
          AND start_time < DATEADD(hour, {end_offset}, GETDATE())
          AND status = 'success'
        ORDER BY start_time
        LIMIT {row_limit}
        """,  # ← SUBSTITUTE: add AND database_name = 'mydb' to narrow scope
    )
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def fetch_query_texts_batch(cursor: Any, query_ids: list[int]) -> dict[int, str]:
|
|
113
|
+
"""Batch-fetch and assemble multi-row query texts for a list of query_ids."""
|
|
114
|
+
if not query_ids:
|
|
115
|
+
return {}
|
|
116
|
+
|
|
117
|
+
# Build a VALUES list for the IN clause to avoid large parameter arrays
|
|
118
|
+
id_list = ", ".join(str(qid) for qid in query_ids)
|
|
119
|
+
rows = _dictfetch(
|
|
120
|
+
cursor,
|
|
121
|
+
f"""
|
|
122
|
+
SELECT
|
|
123
|
+
query_id,
|
|
124
|
+
LISTAGG(
|
|
125
|
+
CASE WHEN LEN(text) <= 200 THEN text ELSE LEFT(text, 200) END,
|
|
126
|
+
''
|
|
127
|
+
) WITHIN GROUP (ORDER BY sequence) AS query_text
|
|
128
|
+
FROM sys_querytext
|
|
129
|
+
WHERE query_id IN ({id_list})
|
|
130
|
+
GROUP BY query_id
|
|
131
|
+
""",
|
|
132
|
+
)
|
|
133
|
+
return {r["query_id"]: r["query_text"] for r in rows if r.get("query_text")}
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def collect(
    host: str,
    db: str,
    user: str,
    password: str,
    manifest_path: str = "manifest_query_logs.json",
    port: int = 5439,
    lookback_hours: int = LOOKBACK_HOURS,
    lookback_lag_hours: int = LOOKBACK_LAG_HOURS,
    batch_size: int = BATCH_SIZE,
    max_queries: int = MAX_QUERIES,
) -> list[dict[str, Any]]:
    """Collect Redshift query logs into a JSON manifest and return the entries.

    Steps: warn on low memory, connect, fetch query metadata, fetch the query
    texts in batches of ``batch_size``, close the connection, then write
    ``manifest_path``. Queries whose text is missing or blank are left out.

    Args:
        host, db, user, password, port: Redshift connection settings.
        manifest_path: Output file for the manifest.
        lookback_hours, lookback_lag_hours, max_queries: Passed to
            ``fetch_query_metadata``.
        batch_size: Number of query ids per text lookup.

    Returns:
        The list of entry dicts that was written under ``"entries"``.
    """
    _check_available_memory()
    collected_at = datetime.now(timezone.utc).isoformat()

    connection = psycopg2.connect(
        host=host,
        port=port,
        dbname=db,
        user=user,
        password=password,
        connect_timeout=30,
    )
    try:
        with connection.cursor() as cur:
            query_meta = fetch_query_metadata(cur, lookback_hours, lookback_lag_hours, max_queries)
            log.info("Retrieved %d query metadata rows", len(query_meta))

            # Look texts up one slice of ids at a time to keep each
            # statement small.
            all_ids = [meta["query_id"] for meta in query_meta]
            texts_by_id: dict[int, str] = {}
            for start in range(0, len(all_ids), batch_size):
                id_slice = all_ids[start : start + batch_size]
                texts_by_id.update(fetch_query_texts_batch(cur, id_slice))
                log.debug("Fetched texts for batch %d–%d", start, start + len(id_slice))
    finally:
        connection.close()

    entries: list[dict[str, Any]] = []
    for meta in query_meta:
        qid = meta["query_id"]
        sql_text = texts_by_id.get(qid, "")
        # ← SUBSTITUTE: decide whether to push rows with missing text
        if sql_text.strip():
            user_id = meta.get("user_id")
            entries.append(
                {
                    "query_id": str(qid),
                    "query_text": sql_text,
                    "start_time": _safe_isoformat(meta.get("start_time")),
                    "end_time": _safe_isoformat(meta.get("end_time")),
                    "user": None if user_id is None else str(user_id),
                    "database_name": meta.get("database_name"),
                    "elapsed_time_us": meta.get("elapsed_time"),
                }
            )

    entry_count = len(entries)
    log.info("Collected %d query log entries", entry_count)

    manifest = {
        "log_type": LOG_TYPE,
        "collected_at": collected_at,
        "lookback_hours": lookback_hours,
        "lookback_lag_hours": lookback_lag_hours,
        "query_log_count": entry_count,
        "entries": entries,
    }
    with open(manifest_path, "w") as out:
        json.dump(manifest, out, indent=2)
    log.info("Manifest written to %s (%d entries)", manifest_path, entry_count)

    return entries
|
|
203
|
+
|
|
204
|
+
|
|
205
|
+
def main() -> None:
    """Parse CLI flags (falling back to environment variables) and run ``collect``.

    Exits through ``parser.error`` when host, db, user or password is supplied
    neither as a flag nor through its ``REDSHIFT_*`` environment variable.
    """
    parser = argparse.ArgumentParser(description="Collect Redshift query logs to a manifest file")

    # Connection settings default to the matching environment variable.
    # ← SUBSTITUTE
    connection_flags = (
        ("--host", "REDSHIFT_HOST"),
        ("--db", "REDSHIFT_DB"),
        ("--user", "REDSHIFT_USER"),
        ("--password", "REDSHIFT_PASSWORD"),
    )
    for flag, env_var in connection_flags:
        parser.add_argument(flag, default=os.getenv(env_var))
    parser.add_argument("--port", type=int, default=int(os.getenv("REDSHIFT_PORT", "5439")))

    # Integer tunables default to the module-level constants.
    tunable_flags = (
        ("--lookback-hours", LOOKBACK_HOURS),
        ("--lookback-lag-hours", LOOKBACK_LAG_HOURS),
        ("--batch-size", BATCH_SIZE),
        ("--max-queries", MAX_QUERIES),
    )
    for flag, default in tunable_flags:
        parser.add_argument(flag, type=int, default=default)
    parser.add_argument("--manifest", default="manifest_query_logs.json")

    args = parser.parse_args()

    missing = [name for name in ("host", "db", "user", "password") if getattr(args, name) is None]
    if missing:
        parser.error(f"Missing required arguments/env vars: {missing}")

    collect(
        host=args.host,
        db=args.db,
        user=args.user,
        password=args.password,
        manifest_path=args.manifest,
        port=args.port,
        lookback_hours=args.lookback_hours,
        lookback_lag_hours=args.lookback_lag_hours,
        batch_size=args.batch_size,
        max_queries=args.max_queries,
    )


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,178 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Redshift — Lineage Push (push-only)
|
|
3
|
+
=====================================
|
|
4
|
+
Reads a JSON manifest file produced by collect_lineage.py and pushes the lineage
|
|
5
|
+
events to Monte Carlo via the push ingestion API, with configurable batching to
|
|
6
|
+
keep compressed payloads under 1 MB.
|
|
7
|
+
|
|
8
|
+
Substitution points (search for "← SUBSTITUTE"):
|
|
9
|
+
- MCD_INGEST_ID / MCD_INGEST_TOKEN : Monte Carlo API credentials
|
|
10
|
+
- MCD_RESOURCE_UUID : UUID of the Redshift connection in Monte Carlo
|
|
11
|
+
- DEFAULT_BATCH_SIZE (or --batch-size) : number of events per API call (default 500)
|
|
12
|
+
|
|
13
|
+
Prerequisites:
|
|
14
|
+
pip install pycarlo
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from __future__ import annotations
|
|
18
|
+
|
|
19
|
+
import argparse
|
|
20
|
+
import json
|
|
21
|
+
import logging
|
|
22
|
+
import os
|
|
23
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
24
|
+
from datetime import datetime, timezone
|
|
25
|
+
from typing import Any
|
|
26
|
+
|
|
27
|
+
from pycarlo.core import Client, Session
|
|
28
|
+
from pycarlo.features.ingestion import IngestionService
|
|
29
|
+
from pycarlo.features.ingestion.models import (
|
|
30
|
+
LineageAssetRef,
|
|
31
|
+
LineageEvent,
|
|
32
|
+
)
|
|
33
|
+
|
|
34
|
+
# Timestamped INFO-level logging for the whole script.
logging.basicConfig(
    format="%(asctime)s %(levelname)s %(message)s",
    level=logging.INFO,
)
log = logging.getLogger(__name__)

# Resource type sent with every lineage push and recorded in the summary.
RESOURCE_TYPE: str = "redshift"
# ← SUBSTITUTE: conservative default to stay under 1 MB compressed
DEFAULT_BATCH_SIZE: int = 500
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def _ref_from_dict(d: dict[str, Any]) -> LineageAssetRef:
    """Build a ``TABLE``-typed ``LineageAssetRef`` from a manifest asset dict.

    ``asset_name`` is required; ``database`` and ``schema`` fall back to an
    empty string when absent.
    """
    asset_name = d["asset_name"]
    database = d.get("database", "")
    schema = d.get("schema", "")
    return LineageAssetRef(type="TABLE", name=asset_name, database=database, schema=schema)
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def _event_from_dict(d: dict[str, Any]) -> LineageEvent:
    """Rebuild a ``LineageEvent`` from one manifest event dict.

    ``sources`` is optional (defaults to no sources); ``destination`` is
    required. Each asset dict is converted with ``_ref_from_dict``.
    """
    return LineageEvent(
        sources=[_ref_from_dict(src) for src in d.get("sources", [])],
        destination=_ref_from_dict(d["destination"]),
    )
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def _push_result_path(manifest_path: str) -> str:
    """Return the path of the push-summary file that belongs to ``manifest_path``.

    A trailing ``.json`` extension becomes ``_push_result.json``; any other
    name gets the suffix appended. The result therefore never equals the input
    path, and ``.json`` fragments elsewhere in the path are left untouched.
    """
    root, ext = os.path.splitext(manifest_path)
    if ext.lower() != ".json":
        root = manifest_path
    return f"{root}_push_result.json"


def push(
    manifest_path: str,
    resource_uuid: str,
    key_id: str,
    key_token: str,
    batch_size: int = DEFAULT_BATCH_SIZE,
) -> dict[str, Any]:
    """Read a collect manifest and push lineage events to Monte Carlo in batches.

    Args:
        manifest_path: JSON manifest containing an ``"events"`` list.
        resource_uuid: Target resource UUID sent with every batch.
        key_id: Ingestion key id used to open each session.
        key_token: Ingestion key token used to open each session.
        batch_size: Maximum number of events per ``send_lineage`` call.

    Returns:
        A summary dict with invocation IDs and counts. The same dict is also
        written next to the manifest as ``<manifest>_push_result.json``.

    Raises:
        ValueError: If there are events to push and ``batch_size`` is < 1.
        Exception: Any error raised while pushing a batch is logged and
            re-raised; no summary file is written in that case.
    """
    with open(manifest_path) as fh:
        manifest = json.load(fh)

    event_dicts: list[dict[str, Any]] = manifest["events"]
    events = [_event_from_dict(d) for d in event_dicts]
    log.info("Loaded %d lineage events from %s", len(events), manifest_path)

    push_manifest_path = _push_result_path(manifest_path)

    if not events:
        log.info("No lineage events to push.")
        summary = {
            "resource_uuid": resource_uuid,
            "resource_type": RESOURCE_TYPE,
            "invocation_ids": [],
            "pushed_at": datetime.now(timezone.utc).isoformat(),
            "event_count": 0,
            "batch_count": 0,
            "batch_size": batch_size,
        }
        with open(push_manifest_path, "w") as fh:
            json.dump(summary, fh, indent=2)
        return summary

    # Fail with a clear message instead of an opaque range()/executor error.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    # Split into batches
    batches = [events[i : i + batch_size] for i in range(0, len(events), batch_size)]
    total_batches = len(batches)

    def _push_batch(batch: list, batch_num: int) -> str | None:
        """Push a single batch through a Client/Session created for this call."""
        log.info("Pushing batch %d/%d (%d events) ...", batch_num, total_batches, len(batch))
        client = Client(session=Session(mcd_id=key_id, mcd_token=key_token, scope="Ingestion"))
        service = IngestionService(mc_client=client)
        result = service.send_lineage(
            resource_uuid=resource_uuid,
            resource_type=RESOURCE_TYPE,
            events=batch,
        )
        invocation_id = service.extract_invocation_id(result)
        if invocation_id:
            log.info("Batch %d: invocation_id=%s", batch_num, invocation_id)
        return invocation_id

    # Push batches in parallel (each call builds its own pycarlo Session)
    max_workers = min(4, total_batches)
    # Pre-sized so results can be stored by batch index as futures complete
    # out of order.
    invocation_ids: list[str | None] = [None] * total_batches

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        futures = {
            pool.submit(_push_batch, batch, i + 1): i
            for i, batch in enumerate(batches)
        }
        for future in as_completed(futures):
            idx = futures[future]
            try:
                invocation_ids[idx] = future.result()
            except Exception as exc:
                log.error("ERROR pushing batch %d: %s", idx + 1, exc)
                raise

    log.info("All %d batches pushed (%d workers)", total_batches, max_workers)

    summary = {
        "resource_uuid": resource_uuid,
        "resource_type": RESOURCE_TYPE,
        "invocation_ids": invocation_ids,
        "pushed_at": datetime.now(timezone.utc).isoformat(),
        "event_count": len(events),
        "batch_count": total_batches,
        "batch_size": batch_size,
        "lookback_hours": manifest.get("lookback_hours"),
        "queries_scanned": manifest.get("queries_scanned"),
    }

    with open(push_manifest_path, "w") as fh:
        json.dump(summary, fh, indent=2)
    log.info("Push result written to %s", push_manifest_path)

    return summary
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
def main() -> None:
    """Parse CLI flags (falling back to environment variables) and run ``push``.

    Exits through ``parser.error`` when the resource UUID, key id or key token
    is supplied neither as a flag nor through its ``MCD_*`` environment
    variable.
    """
    parser = argparse.ArgumentParser(description="Push Redshift lineage to Monte Carlo from manifest")
    parser.add_argument("--manifest", default="manifest_lineage.json")

    # Credentials default to the matching environment variable.
    credential_flags = (
        ("--resource-uuid", "MCD_RESOURCE_UUID"),
        ("--key-id", "MCD_INGEST_ID"),
        ("--key-token", "MCD_INGEST_TOKEN"),
    )
    for flag, env_var in credential_flags:
        parser.add_argument(flag, default=os.getenv(env_var))
    parser.add_argument("--batch-size", type=int, default=DEFAULT_BATCH_SIZE)

    args = parser.parse_args()

    missing = [
        name
        for name in ("resource_uuid", "key_id", "key_token")
        if getattr(args, name) is None
    ]
    if missing:
        parser.error(f"Missing required arguments/env vars: {missing}")

    push(
        manifest_path=args.manifest,
        resource_uuid=args.resource_uuid,
        key_id=args.key_id,
        key_token=args.key_token,
        batch_size=args.batch_size,
    )


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,178 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Redshift — Metadata Push (push-only)
|
|
3
|
+
======================================
|
|
4
|
+
Reads a JSON manifest file produced by collect_metadata.py and pushes the assets
|
|
5
|
+
to Monte Carlo via the push ingestion API, with configurable batching to keep
|
|
6
|
+
compressed payloads under 1 MB.
|
|
7
|
+
|
|
8
|
+
Substitution points (search for "← SUBSTITUTE"):
|
|
9
|
+
- MCD_INGEST_ID / MCD_INGEST_TOKEN : Monte Carlo API credentials
|
|
10
|
+
- MCD_RESOURCE_UUID : UUID of the Redshift connection in Monte Carlo
|
|
11
|
+
- DEFAULT_BATCH_SIZE (or --batch-size) : number of assets per API call (default 500)
|
|
12
|
+
|
|
13
|
+
Prerequisites:
|
|
14
|
+
pip install pycarlo
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from __future__ import annotations
|
|
18
|
+
|
|
19
|
+
import argparse
|
|
20
|
+
import json
|
|
21
|
+
import logging
|
|
22
|
+
import os
|
|
23
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
24
|
+
from datetime import datetime, timezone
|
|
25
|
+
from typing import Any
|
|
26
|
+
|
|
27
|
+
from pycarlo.core import Client, Session
|
|
28
|
+
from pycarlo.features.ingestion import IngestionService
|
|
29
|
+
from pycarlo.features.ingestion.models import (
|
|
30
|
+
AssetField,
|
|
31
|
+
AssetFreshness,
|
|
32
|
+
AssetMetadata,
|
|
33
|
+
AssetVolume,
|
|
34
|
+
RelationalAsset,
|
|
35
|
+
)
|
|
36
|
+
|
|
37
|
+
# Timestamped INFO-level logging for the whole script.
logging.basicConfig(
    format="%(asctime)s %(levelname)s %(message)s",
    level=logging.INFO,
)
log = logging.getLogger(__name__)

# Resource type sent with every metadata push and recorded in the summary.
RESOURCE_TYPE: str = "redshift"
# ← SUBSTITUTE: conservative default to stay under 1 MB compressed
DEFAULT_BATCH_SIZE: int = 500
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def _asset_from_dict(d: dict[str, Any]) -> RelationalAsset:
    """Rebuild a ``RelationalAsset`` from one manifest asset dict.

    Required keys: ``asset_name``, ``database``, ``schema`` (and ``name`` on
    every entry of ``fields``). Volume is attached only when ``row_count`` or
    ``byte_count`` is present; freshness only when ``last_updated`` is.
    ``asset_type`` defaults to ``"TABLE"``.
    """
    columns = [
        AssetField(
            name=col["name"],
            type=col.get("type"),
            description=col.get("description"),
        )
        for col in d.get("fields", [])
    ]

    row_count = d.get("row_count")
    byte_count = d.get("byte_count")
    has_volume = row_count is not None or byte_count is not None
    volume = AssetVolume(row_count=row_count, byte_count=byte_count) if has_volume else None

    last_updated = d.get("last_updated")
    freshness = None if last_updated is None else AssetFreshness(last_update_time=last_updated)

    metadata = AssetMetadata(
        name=d["asset_name"],
        database=d["database"],  # ← SUBSTITUTE: use database as top-level namespace
        schema=d["schema"],
        description=d.get("description"),
    )
    return RelationalAsset(
        type=d.get("asset_type", "TABLE"),
        metadata=metadata,
        fields=columns,
        volume=volume,
        freshness=freshness,
    )
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def _push_result_path(manifest_path: str) -> str:
    """Return the path of the push-summary file that belongs to ``manifest_path``.

    A trailing ``.json`` extension becomes ``_push_result.json``; any other
    name gets the suffix appended. The result therefore never equals the input
    path, and ``.json`` fragments elsewhere in the path are left untouched.
    """
    root, ext = os.path.splitext(manifest_path)
    if ext.lower() != ".json":
        root = manifest_path
    return f"{root}_push_result.json"


def push(
    manifest_path: str,
    resource_uuid: str,
    key_id: str,
    key_token: str,
    batch_size: int = DEFAULT_BATCH_SIZE,
) -> dict[str, Any]:
    """Read a collect manifest and push assets to Monte Carlo in batches.

    Args:
        manifest_path: JSON manifest containing an ``"assets"`` list.
        resource_uuid: Target resource UUID sent with every batch.
        key_id: Ingestion key id used to open each session.
        key_token: Ingestion key token used to open each session.
        batch_size: Maximum number of assets per ``send_metadata`` call.

    Returns:
        A summary dict with invocation IDs and counts. The same dict is also
        written next to the manifest as ``<manifest>_push_result.json``.

    Raises:
        ValueError: If ``batch_size`` is < 1.
        Exception: Any error raised while pushing a batch is logged and
            re-raised; no summary file is written in that case.
    """
    with open(manifest_path) as fh:
        manifest = json.load(fh)

    asset_dicts: list[dict[str, Any]] = manifest["assets"]
    assets = [_asset_from_dict(d) for d in asset_dicts]
    log.info("Loaded %d assets from %s", len(assets), manifest_path)

    # Fail with a clear message instead of an opaque range()/executor error.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    # Split into batches. max(..., 1) keeps one (empty) batch when the
    # manifest has no assets, so a push call is still issued.
    batches = [
        assets[i : i + batch_size]
        for i in range(0, max(len(assets), 1), batch_size)
    ]
    total_batches = len(batches)

    def _push_batch(batch: list, batch_num: int) -> str | None:
        """Push a single batch through a Client/Session created for this call."""
        client = Client(session=Session(mcd_id=key_id, mcd_token=key_token, scope="Ingestion"))
        service = IngestionService(mc_client=client)
        result = service.send_metadata(
            resource_uuid=resource_uuid,
            resource_type=RESOURCE_TYPE,
            events=batch,
        )
        invocation_id = service.extract_invocation_id(result)
        log.info("Pushed batch %d/%d (%d assets) — invocation_id=%s", batch_num, total_batches, len(batch), invocation_id)
        return invocation_id

    # Push batches in parallel (each call builds its own pycarlo Session)
    max_workers = min(4, total_batches)
    # Pre-sized so results can be stored by batch index as futures complete
    # out of order.
    invocation_ids: list[str | None] = [None] * total_batches

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        futures = {
            pool.submit(_push_batch, batch, i + 1): i
            for i, batch in enumerate(batches)
        }
        for future in as_completed(futures):
            idx = futures[future]
            try:
                invocation_ids[idx] = future.result()
            except Exception as exc:
                log.error("ERROR pushing batch %d: %s", idx + 1, exc)
                raise

    log.info("All %d batches pushed (%d workers)", total_batches, max_workers)

    summary = {
        "resource_uuid": resource_uuid,
        "resource_type": RESOURCE_TYPE,
        "invocation_ids": invocation_ids,
        "pushed_at": datetime.now(timezone.utc).isoformat(),
        "asset_count": len(assets),
        "batch_count": total_batches,
        "batch_size": batch_size,
    }

    push_manifest_path = _push_result_path(manifest_path)
    with open(push_manifest_path, "w") as fh:
        json.dump(summary, fh, indent=2)
    log.info("Push result written to %s", push_manifest_path)

    return summary
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
def main() -> None:
    """Parse CLI flags (falling back to environment variables) and run ``push``.

    Exits through ``parser.error`` when the resource UUID, key id or key token
    is supplied neither as a flag nor through its ``MCD_*`` environment
    variable.
    """
    parser = argparse.ArgumentParser(description="Push Redshift metadata to Monte Carlo from manifest")
    parser.add_argument("--manifest", default="manifest_metadata.json")

    # Credentials default to the matching environment variable.
    credential_flags = (
        ("--resource-uuid", "MCD_RESOURCE_UUID"),
        ("--key-id", "MCD_INGEST_ID"),
        ("--key-token", "MCD_INGEST_TOKEN"),
    )
    for flag, env_var in credential_flags:
        parser.add_argument(flag, default=os.getenv(env_var))
    parser.add_argument("--batch-size", type=int, default=DEFAULT_BATCH_SIZE)

    args = parser.parse_args()

    missing = [
        name
        for name in ("resource_uuid", "key_id", "key_token")
        if getattr(args, name) is None
    ]
    if missing:
        parser.error(f"Missing required arguments/env vars: {missing}")

    push(
        manifest_path=args.manifest,
        resource_uuid=args.resource_uuid,
        key_id=args.key_id,
        key_token=args.key_token,
        batch_size=args.batch_size,
    )


if __name__ == "__main__":
    main()
|