opencode-skills-collection 2.0.0 → 2.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90) hide show
  1. package/bundled-skills/.antigravity-install-manifest.json +6 -1
  2. package/bundled-skills/docs/integrations/jetski-cortex.md +3 -3
  3. package/bundled-skills/docs/integrations/jetski-gemini-loader/README.md +1 -1
  4. package/bundled-skills/docs/maintainers/repo-growth-seo.md +3 -3
  5. package/bundled-skills/docs/maintainers/skills-update-guide.md +1 -1
  6. package/bundled-skills/docs/users/bundles.md +1 -1
  7. package/bundled-skills/docs/users/claude-code-skills.md +1 -1
  8. package/bundled-skills/docs/users/gemini-cli-skills.md +1 -1
  9. package/bundled-skills/docs/users/getting-started.md +1 -1
  10. package/bundled-skills/docs/users/kiro-integration.md +1 -1
  11. package/bundled-skills/docs/users/usage.md +4 -4
  12. package/bundled-skills/docs/users/visual-guide.md +4 -4
  13. package/bundled-skills/manage-skills/SKILL.md +187 -0
  14. package/bundled-skills/monte-carlo-monitor-creation/SKILL.md +222 -0
  15. package/bundled-skills/monte-carlo-monitor-creation/references/comparison-monitor.md +426 -0
  16. package/bundled-skills/monte-carlo-monitor-creation/references/custom-sql-monitor.md +207 -0
  17. package/bundled-skills/monte-carlo-monitor-creation/references/metric-monitor.md +292 -0
  18. package/bundled-skills/monte-carlo-monitor-creation/references/table-monitor.md +231 -0
  19. package/bundled-skills/monte-carlo-monitor-creation/references/validation-monitor.md +404 -0
  20. package/bundled-skills/monte-carlo-prevent/SKILL.md +252 -0
  21. package/bundled-skills/monte-carlo-prevent/references/TROUBLESHOOTING.md +23 -0
  22. package/bundled-skills/monte-carlo-prevent/references/parameters.md +32 -0
  23. package/bundled-skills/monte-carlo-prevent/references/workflows.md +478 -0
  24. package/bundled-skills/monte-carlo-push-ingestion/SKILL.md +363 -0
  25. package/bundled-skills/monte-carlo-push-ingestion/references/anomaly-detection.md +87 -0
  26. package/bundled-skills/monte-carlo-push-ingestion/references/custom-lineage.md +203 -0
  27. package/bundled-skills/monte-carlo-push-ingestion/references/direct-http-api.md +207 -0
  28. package/bundled-skills/monte-carlo-push-ingestion/references/prerequisites.md +150 -0
  29. package/bundled-skills/monte-carlo-push-ingestion/references/push-lineage.md +160 -0
  30. package/bundled-skills/monte-carlo-push-ingestion/references/push-metadata.md +158 -0
  31. package/bundled-skills/monte-carlo-push-ingestion/references/push-query-logs.md +219 -0
  32. package/bundled-skills/monte-carlo-push-ingestion/references/validation.md +257 -0
  33. package/bundled-skills/monte-carlo-push-ingestion/scripts/sample_verify.py +357 -0
  34. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_and_push_lineage.py +70 -0
  35. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_and_push_metadata.py +65 -0
  36. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_and_push_query_logs.py +70 -0
  37. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_lineage.py +214 -0
  38. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_metadata.py +160 -0
  39. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_query_logs.py +164 -0
  40. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/push_lineage.py +198 -0
  41. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/push_metadata.py +193 -0
  42. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/push_query_logs.py +207 -0
  43. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/collect_and_push_metadata.py +71 -0
  44. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/collect_and_push_query_logs.py +64 -0
  45. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/collect_metadata.py +253 -0
  46. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/collect_query_logs.py +149 -0
  47. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/push_metadata.py +190 -0
  48. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/push_query_logs.py +208 -0
  49. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_and_push_lineage.py +83 -0
  50. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_and_push_metadata.py +77 -0
  51. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_and_push_query_logs.py +83 -0
  52. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_lineage.py +240 -0
  53. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_metadata.py +212 -0
  54. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_query_logs.py +204 -0
  55. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/push_lineage.py +192 -0
  56. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/push_metadata.py +178 -0
  57. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/push_query_logs.py +200 -0
  58. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_and_push_lineage.py +119 -0
  59. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_and_push_metadata.py +119 -0
  60. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_and_push_query_logs.py +117 -0
  61. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_lineage.py +265 -0
  62. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_metadata.py +313 -0
  63. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_query_logs.py +284 -0
  64. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/push_lineage.py +309 -0
  65. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/push_metadata.py +245 -0
  66. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/push_query_logs.py +255 -0
  67. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_and_push_lineage.py +78 -0
  68. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_and_push_metadata.py +80 -0
  69. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_and_push_query_logs.py +88 -0
  70. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_lineage.py +235 -0
  71. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_metadata.py +219 -0
  72. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_query_logs.py +239 -0
  73. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/push_lineage.py +178 -0
  74. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/push_metadata.py +178 -0
  75. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/push_query_logs.py +196 -0
  76. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_and_push_lineage.py +154 -0
  77. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_and_push_metadata.py +137 -0
  78. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_and_push_query_logs.py +137 -0
  79. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_lineage.py +349 -0
  80. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_metadata.py +329 -0
  81. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_query_logs.py +254 -0
  82. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/push_lineage.py +307 -0
  83. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/push_metadata.py +228 -0
  84. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/push_query_logs.py +248 -0
  85. package/bundled-skills/monte-carlo-push-ingestion/scripts/test_template_sdk_usage.py +340 -0
  86. package/bundled-skills/monte-carlo-validation-notebook/SKILL.md +685 -0
  87. package/bundled-skills/monte-carlo-validation-notebook/scripts/generate_notebook_url.py +141 -0
  88. package/bundled-skills/monte-carlo-validation-notebook/scripts/resolve_dbt_schema.py +161 -0
  89. package/package.json +1 -1
  90. package/skills_index.json +503 -61
@@ -0,0 +1,239 @@
1
+ """
2
+ Redshift — Query Log Collection (collect-only)
3
+ ================================================
4
+ Collects completed query execution records from Redshift using sys_query_history
5
+ and sys_querytext (modern RA3/serverless), assembles full SQL text from
6
+ multi-row text chunks, and writes a JSON manifest file that can be consumed
7
+ by push_query_logs.py.
8
+
9
+ Substitution points (search for "← SUBSTITUTE"):
10
+ - REDSHIFT_HOST / REDSHIFT_DB / REDSHIFT_USER / REDSHIFT_PASSWORD : connection
11
+ - LOOKBACK_HOURS : start of the collection window, in hours before now (default 25)
12
+ - LOOKBACK_LAG_HOURS: lag behind now to avoid in-flight queries (default 1)
13
+ - BATCH_SIZE : number of query_ids to fetch texts for in one SQL call
14
+ - MAX_QUERIES : maximum query rows to process per run
15
+
16
+ Prerequisites:
17
+ pip install psycopg2-binary
18
+ """
19
+
20
+ from __future__ import annotations
21
+
22
+ import argparse
23
+ import json
24
+ import logging
25
+ import os
26
+ from datetime import datetime, timezone
27
+ from typing import Any
28
+
29
+ import psycopg2
30
+
31
# Configure logging at import time: INFO level, "timestamp level message" format.
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
log = logging.getLogger(__name__)

# Written to the manifest under the "log_type" key.
LOG_TYPE = "redshift"

# Tunables; each one reads an environment variable of the same name and falls
# back to the literal default shown. They are also the CLI defaults in main().
LOOKBACK_HOURS: int = int(os.getenv("LOOKBACK_HOURS", "25"))  # ← SUBSTITUTE
LOOKBACK_LAG_HOURS: int = int(os.getenv("LOOKBACK_LAG_HOURS", "1"))  # ← SUBSTITUTE
BATCH_SIZE: int = int(os.getenv("BATCH_SIZE", "200"))  # ← SUBSTITUTE
MAX_QUERIES: int = int(os.getenv("MAX_QUERIES", "10000"))  # ← SUBSTITUTE
40
+
41
+
42
+ def _check_available_memory(min_gb: float = 2.0) -> None:
43
+ """Warn if available memory is below the threshold."""
44
+ try:
45
+ if hasattr(os, "sysconf"): # Linux / macOS
46
+ page_size = os.sysconf("SC_PAGE_SIZE")
47
+ avail_pages = os.sysconf("SC_AVPHYS_PAGES")
48
+ avail_gb = (page_size * avail_pages) / (1024 ** 3)
49
+ else:
50
+ return # Windows — skip check
51
+ except (ValueError, OSError):
52
+ return
53
+ if avail_gb < min_gb:
54
+ log.warning(
55
+ "Only %.1f GB of memory available (minimum recommended: %.1f GB). "
56
+ "Consider reducing the collection scope or increasing available memory.",
57
+ avail_gb,
58
+ min_gb,
59
+ )
60
+
61
+
62
+ def _dictfetch(cursor: Any, sql: str, params: tuple | None = None) -> list[dict[str, Any]]:
63
+ cursor.execute(sql, params)
64
+ cols = [d.name for d in cursor.description]
65
+ rows = []
66
+ while True:
67
+ chunk = cursor.fetchmany(1000)
68
+ if not chunk:
69
+ break
70
+ rows.extend(dict(zip(cols, row)) for row in chunk)
71
+ return rows
72
+
73
+
74
+ def _safe_isoformat(dt: Any) -> str | None:
75
+ if dt is None:
76
+ return None
77
+ if hasattr(dt, "isoformat"):
78
+ if dt.tzinfo is None:
79
+ dt = dt.replace(tzinfo=timezone.utc)
80
+ return dt.isoformat()
81
+ return str(dt)
82
+
83
+
84
def fetch_query_metadata(
    cursor: Any,
    lookback_hours: int,
    lag_hours: int,
    max_queries: int,
) -> list[dict[str, Any]]:
    """Fetch query execution metadata from sys_query_history.

    Selects rows with status 'success' whose start_time falls in
    [now - lookback_hours, now - lag_hours), ordered by start_time and capped
    at ``max_queries`` rows.

    Raises:
        ValueError: if any argument is not convertible to ``int`` or is negative.
    """
    # These values are interpolated into the SQL text, so force them to plain
    # ints first; a non-numeric value would otherwise be injected verbatim.
    lookback_hours = int(lookback_hours)
    lag_hours = int(lag_hours)
    max_queries = int(max_queries)
    # A negative value would render as "--N", which starts a SQL line comment.
    if lookback_hours < 0 or lag_hours < 0 or max_queries < 0:
        raise ValueError(
            "lookback_hours, lag_hours and max_queries must be non-negative, got "
            f"{lookback_hours}, {lag_hours}, {max_queries}"
        )
    return _dictfetch(
        cursor,
        f"""
        SELECT
            query_id,
            start_time,
            end_time,
            status,
            user_id,
            database_name,
            elapsed_time
        FROM sys_query_history
        WHERE start_time >= DATEADD(hour, -{lookback_hours}, GETDATE())
          AND start_time < DATEADD(hour, -{lag_hours}, GETDATE())
          AND status = 'success'
        ORDER BY start_time
        LIMIT {max_queries}
        """,  # ← SUBSTITUTE: add AND database_name = 'mydb' to narrow scope
    )
110
+
111
+
112
def fetch_query_texts_batch(cursor: Any, query_ids: list[int]) -> dict[int, str]:
    """Batch-fetch and assemble multi-row query texts for a list of query_ids.

    Returns a dict mapping query_id to the concatenated text; ids whose
    assembled text is empty or NULL are omitted. An empty ``query_ids`` list
    returns ``{}`` without touching the cursor.

    Raises:
        ValueError: if an element of ``query_ids`` is not convertible to ``int``.
    """
    if not query_ids:
        return {}

    # The ids are inlined into the IN clause, so coerce each one to int to
    # guarantee that only numeric literals reach the SQL text.
    id_list = ", ".join(str(int(qid)) for qid in query_ids)
    # SYS_QUERY_TEXT is the text view that pairs with sys_query_history. Its
    # chunks are wider than the 200-character chunks of the legacy
    # STL_QUERYTEXT, so each chunk is concatenated whole: cutting it at 200
    # characters would drop the rest of the chunk.
    # NOTE(review): LISTAGG has a result-size limit — confirm how very long
    # statements should be handled.
    rows = _dictfetch(
        cursor,
        f"""
        SELECT
            query_id,
            LISTAGG(text, '') WITHIN GROUP (ORDER BY sequence) AS query_text
        FROM sys_query_text
        WHERE query_id IN ({id_list})
        GROUP BY query_id
        """,
    )
    return {r["query_id"]: r["query_text"] for r in rows if r.get("query_text")}
134
+
135
+
136
def collect(
    host: str,
    db: str,
    user: str,
    password: str,
    manifest_path: str = "manifest_query_logs.json",
    port: int = 5439,
    lookback_hours: int = LOOKBACK_HOURS,
    lookback_lag_hours: int = LOOKBACK_LAG_HOURS,
    batch_size: int = BATCH_SIZE,
    max_queries: int = MAX_QUERIES,
) -> list[dict[str, Any]]:
    """Collect Redshift query logs and write them to a JSON manifest.

    Opens a psycopg2 connection, fetches query metadata, fetches query texts in
    slices of ``batch_size`` ids, closes the connection, then writes the
    manifest to ``manifest_path``. Rows without non-blank text are skipped.

    Returns the list of entries that was written under the manifest's
    "entries" key.
    """
    _check_available_memory()
    collected_at = datetime.now(timezone.utc).isoformat()

    connection = psycopg2.connect(
        host=host,
        port=port,
        dbname=db,
        user=user,
        password=password,
        connect_timeout=30,
    )
    try:
        with connection.cursor() as cur:
            query_meta = fetch_query_metadata(cur, lookback_hours, lookback_lag_hours, max_queries)
            log.info("Retrieved %d query metadata rows", len(query_meta))

            # Fetch texts slice by slice so no single IN clause grows too large.
            all_ids = [meta["query_id"] for meta in query_meta]
            texts_by_id: dict[int, str] = {}
            for start in range(0, len(all_ids), batch_size):
                id_slice = all_ids[start : start + batch_size]
                texts_by_id.update(fetch_query_texts_batch(cur, id_slice))
                log.debug("Fetched texts for batch %d–%d", start, start + len(id_slice))
    finally:
        connection.close()

    entries: list[dict[str, Any]] = []
    for meta in query_meta:
        qid = meta["query_id"]
        sql_text = texts_by_id.get(qid, "")
        if not sql_text.strip():
            continue  # ← SUBSTITUTE: decide whether to push rows with missing text

        user_id = meta.get("user_id")
        entries.append(
            {
                "query_id": str(qid),
                "query_text": sql_text,
                "start_time": _safe_isoformat(meta.get("start_time")),
                "end_time": _safe_isoformat(meta.get("end_time")),
                "user": None if user_id is None else str(user_id),
                "database_name": meta.get("database_name"),
                "elapsed_time_us": meta.get("elapsed_time"),
            }
        )

    entry_count = len(entries)
    log.info("Collected %d query log entries", entry_count)

    manifest = {
        "log_type": LOG_TYPE,
        "collected_at": collected_at,
        "lookback_hours": lookback_hours,
        "lookback_lag_hours": lookback_lag_hours,
        "query_log_count": entry_count,
        "entries": entries,
    }
    with open(manifest_path, "w") as fh:
        json.dump(manifest, fh, indent=2)
    log.info("Manifest written to %s (%d entries)", manifest_path, entry_count)

    return entries
203
+
204
+
205
def main() -> None:
    """Parse CLI arguments (with environment-variable fallbacks) and run collect().

    Exits through ``parser.error`` when host, db, user or password is missing
    from both the command line and the environment.
    """
    cli = argparse.ArgumentParser(description="Collect Redshift query logs to a manifest file")
    # Connection settings default to environment variables.
    for flag, env_var in (
        ("--host", "REDSHIFT_HOST"),
        ("--db", "REDSHIFT_DB"),
        ("--user", "REDSHIFT_USER"),
        ("--password", "REDSHIFT_PASSWORD"),
    ):
        cli.add_argument(flag, default=os.getenv(env_var))  # ← SUBSTITUTE
    cli.add_argument("--port", type=int, default=int(os.getenv("REDSHIFT_PORT", "5439")))
    # Integer tunables default to the module-level constants.
    for flag, fallback in (
        ("--lookback-hours", LOOKBACK_HOURS),
        ("--lookback-lag-hours", LOOKBACK_LAG_HOURS),
        ("--batch-size", BATCH_SIZE),
        ("--max-queries", MAX_QUERIES),
    ):
        cli.add_argument(flag, type=int, default=fallback)
    cli.add_argument("--manifest", default="manifest_query_logs.json")
    opts = cli.parse_args()

    missing = [name for name in ("host", "db", "user", "password") if getattr(opts, name) is None]
    if missing:
        cli.error(f"Missing required arguments/env vars: {missing}")

    collect(
        host=opts.host,
        db=opts.db,
        user=opts.user,
        password=opts.password,
        manifest_path=opts.manifest,
        port=opts.port,
        lookback_hours=opts.lookback_hours,
        lookback_lag_hours=opts.lookback_lag_hours,
        batch_size=opts.batch_size,
        max_queries=opts.max_queries,
    )
236
+
237
+
238
# Run the CLI only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()
@@ -0,0 +1,178 @@
1
+ """
2
+ Redshift — Lineage Push (push-only)
3
+ =====================================
4
+ Reads a JSON manifest file produced by collect_lineage.py and pushes the lineage
5
+ events to Monte Carlo via the push ingestion API, with configurable batching to
6
+ keep compressed payloads under 1 MB.
7
+
8
+ Substitution points (search for "← SUBSTITUTE"):
9
+ - MCD_INGEST_ID / MCD_INGEST_TOKEN : Monte Carlo API credentials
10
+ - MCD_RESOURCE_UUID : UUID of the Redshift connection in Monte Carlo
11
+ - DEFAULT_BATCH_SIZE / --batch-size : number of events per API call (default 500)
12
+
13
+ Prerequisites:
14
+ pip install pycarlo
15
+ """
16
+
17
+ from __future__ import annotations
18
+
19
+ import argparse
20
+ import json
21
+ import logging
22
+ import os
23
+ from concurrent.futures import ThreadPoolExecutor, as_completed
24
+ from datetime import datetime, timezone
25
+ from typing import Any
26
+
27
+ from pycarlo.core import Client, Session
28
+ from pycarlo.features.ingestion import IngestionService
29
+ from pycarlo.features.ingestion.models import (
30
+ LineageAssetRef,
31
+ LineageEvent,
32
+ )
33
+
34
# Configure logging at import time: INFO level, "timestamp level message" format.
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
log = logging.getLogger(__name__)

# Sent as ``resource_type`` on every push call and recorded in the push summary.
RESOURCE_TYPE = "redshift"
# Default number of lineage events per API call; also the --batch-size default.
DEFAULT_BATCH_SIZE = 500  # ← SUBSTITUTE: conservative default to stay under 1 MB compressed
39
+
40
+
41
def _ref_from_dict(d: dict[str, Any]) -> LineageAssetRef:
    """Build a ``LineageAssetRef`` of type "TABLE" from one manifest asset dict.

    "asset_name" is required; "database" and "schema" default to "".
    """
    ref_kwargs = {
        "type": "TABLE",
        "name": d["asset_name"],
        "database": d.get("database", ""),
        "schema": d.get("schema", ""),
    }
    return LineageAssetRef(**ref_kwargs)
48
+
49
+
50
def _event_from_dict(d: dict[str, Any]) -> LineageEvent:
    """Rebuild a ``LineageEvent`` from a manifest dict.

    "sources" is optional (defaults to no sources); "destination" is required.
    """
    upstream = []
    for source in d.get("sources", []):
        upstream.append(_ref_from_dict(source))
    target = _ref_from_dict(d["destination"])
    return LineageEvent(sources=upstream, destination=target)
58
+
59
+
60
def _push_result_path(manifest_path: str) -> str:
    """Return the path the push summary for ``manifest_path`` is written to.

    A trailing ".json" is swapped for "_push_result.json"; any other name gets
    "_push_result.json" appended, so the result never equals the input path.
    """
    suffix = ".json"
    stem = manifest_path[: -len(suffix)] if manifest_path.endswith(suffix) else manifest_path
    return f"{stem}_push_result{suffix}"


def push(
    manifest_path: str,
    resource_uuid: str,
    key_id: str,
    key_token: str,
    batch_size: int = DEFAULT_BATCH_SIZE,
) -> dict[str, Any]:
    """Read a collect manifest and push lineage events to Monte Carlo in batches.

    The manifest's "events" list is rebuilt into ``LineageEvent`` objects, split
    into slices of ``batch_size`` and sent with up to four worker threads. A
    summary is written next to the manifest (see ``_push_result_path``) and
    returned. With no events, nothing is sent and an empty summary is written.

    Returns a summary dict with invocation IDs and counts.

    Raises:
        ValueError: if ``batch_size`` is less than 1.
        Exception: the first failure raised by a batch push is logged and re-raised.
    """
    # Fail early with a clear message instead of an unrelated error from
    # range() or ThreadPoolExecutor further down.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    with open(manifest_path) as fh:
        manifest = json.load(fh)

    event_dicts: list[dict[str, Any]] = manifest["events"]
    events = [_event_from_dict(d) for d in event_dicts]
    log.info("Loaded %d lineage events from %s", len(events), manifest_path)

    # str.replace(".json", ...) would return the manifest path itself when it
    # has no ".json", overwriting the input; derive the output path from the
    # suffix only.
    push_manifest_path = _push_result_path(manifest_path)

    if not events:
        log.info("No lineage events to push.")
        summary = {
            "resource_uuid": resource_uuid,
            "resource_type": RESOURCE_TYPE,
            "invocation_ids": [],
            "pushed_at": datetime.now(timezone.utc).isoformat(),
            "event_count": 0,
            "batch_count": 0,
            "batch_size": batch_size,
        }
        with open(push_manifest_path, "w") as fh:
            json.dump(summary, fh, indent=2)
        return summary

    # Split into batches
    batches = [events[i : i + batch_size] for i in range(0, len(events), batch_size)]
    total_batches = len(batches)

    def _push_batch(batch: list, batch_num: int) -> str | None:
        """Push a single batch through a Client/Session created for this call."""
        log.info("Pushing batch %d/%d (%d events) ...", batch_num, total_batches, len(batch))
        client = Client(session=Session(mcd_id=key_id, mcd_token=key_token, scope="Ingestion"))
        service = IngestionService(mc_client=client)
        result = service.send_lineage(
            resource_uuid=resource_uuid,
            resource_type=RESOURCE_TYPE,
            events=batch,
        )
        invocation_id = service.extract_invocation_id(result)
        if invocation_id:
            log.info("Batch %d: invocation_id=%s", batch_num, invocation_id)
        return invocation_id

    # Push batches in parallel (each call builds its own pycarlo Session)
    max_workers = min(4, total_batches)
    # Pre-sized so results land at their batch index regardless of completion order.
    invocation_ids: list[str | None] = [None] * total_batches

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        futures = {
            pool.submit(_push_batch, batch, i + 1): i
            for i, batch in enumerate(batches)
        }
        for future in as_completed(futures):
            idx = futures[future]
            try:
                invocation_ids[idx] = future.result()
            except Exception as exc:
                log.error("ERROR pushing batch %d: %s", idx + 1, exc)
                raise

    log.info("All %d batches pushed (%d workers)", total_batches, max_workers)

    summary = {
        "resource_uuid": resource_uuid,
        "resource_type": RESOURCE_TYPE,
        "invocation_ids": invocation_ids,
        "pushed_at": datetime.now(timezone.utc).isoformat(),
        "event_count": len(events),
        "batch_count": total_batches,
        "batch_size": batch_size,
        "lookback_hours": manifest.get("lookback_hours"),
        "queries_scanned": manifest.get("queries_scanned"),
    }

    with open(push_manifest_path, "w") as fh:
        json.dump(summary, fh, indent=2)
    log.info("Push result written to %s", push_manifest_path)

    return summary
152
+
153
+
154
def main() -> None:
    """Parse CLI arguments (with environment-variable fallbacks) and run push().

    Exits through ``parser.error`` when the resource UUID, key id or key token
    is missing from both the command line and the environment.
    """
    cli = argparse.ArgumentParser(description="Push Redshift lineage to Monte Carlo from manifest")
    cli.add_argument("--manifest", default="manifest_lineage.json")
    # Credentials and the resource UUID default to environment variables.
    for flag, env_var in (
        ("--resource-uuid", "MCD_RESOURCE_UUID"),
        ("--key-id", "MCD_INGEST_ID"),
        ("--key-token", "MCD_INGEST_TOKEN"),
    ):
        cli.add_argument(flag, default=os.getenv(env_var))
    cli.add_argument("--batch-size", type=int, default=DEFAULT_BATCH_SIZE)
    opts = cli.parse_args()

    missing = [name for name in ("resource_uuid", "key_id", "key_token") if getattr(opts, name) is None]
    if missing:
        cli.error(f"Missing required arguments/env vars: {missing}")

    push(
        manifest_path=opts.manifest,
        resource_uuid=opts.resource_uuid,
        key_id=opts.key_id,
        key_token=opts.key_token,
        batch_size=opts.batch_size,
    )
175
+
176
+
177
# Run the CLI only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()
@@ -0,0 +1,178 @@
1
+ """
2
+ Redshift — Metadata Push (push-only)
3
+ ======================================
4
+ Reads a JSON manifest file produced by collect_metadata.py and pushes the assets
5
+ to Monte Carlo via the push ingestion API, with configurable batching to keep
6
+ compressed payloads under 1 MB.
7
+
8
+ Substitution points (search for "← SUBSTITUTE"):
9
+ - MCD_INGEST_ID / MCD_INGEST_TOKEN : Monte Carlo API credentials
10
+ - MCD_RESOURCE_UUID : UUID of the Redshift connection in Monte Carlo
11
+ - DEFAULT_BATCH_SIZE / --batch-size : number of assets per API call (default 500)
12
+
13
+ Prerequisites:
14
+ pip install pycarlo
15
+ """
16
+
17
+ from __future__ import annotations
18
+
19
+ import argparse
20
+ import json
21
+ import logging
22
+ import os
23
+ from concurrent.futures import ThreadPoolExecutor, as_completed
24
+ from datetime import datetime, timezone
25
+ from typing import Any
26
+
27
+ from pycarlo.core import Client, Session
28
+ from pycarlo.features.ingestion import IngestionService
29
+ from pycarlo.features.ingestion.models import (
30
+ AssetField,
31
+ AssetFreshness,
32
+ AssetMetadata,
33
+ AssetVolume,
34
+ RelationalAsset,
35
+ )
36
+
37
# Configure logging at import time: INFO level, "timestamp level message" format.
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
log = logging.getLogger(__name__)

# Sent as ``resource_type`` on every push call and recorded in the push summary.
RESOURCE_TYPE = "redshift"
# Default number of assets per API call; also the --batch-size default.
DEFAULT_BATCH_SIZE = 500  # ← SUBSTITUTE: conservative default to stay under 1 MB compressed
42
+
43
+
44
def _asset_from_dict(d: dict[str, Any]) -> RelationalAsset:
    """Rebuild a ``RelationalAsset`` from a manifest dict.

    "asset_name", "database" and "schema" are required. Volume is attached only
    when "row_count" or "byte_count" is present, and freshness only when
    "last_updated" is present. "asset_type" defaults to "TABLE".
    """
    columns = []
    for column in d.get("fields", []):
        columns.append(
            AssetField(
                name=column["name"],
                type=column.get("type"),
                description=column.get("description"),
            )
        )

    row_count = d.get("row_count")
    byte_count = d.get("byte_count")
    if row_count is None and byte_count is None:
        volume = None
    else:
        volume = AssetVolume(row_count=row_count, byte_count=byte_count)

    last_updated = d.get("last_updated")
    freshness = None if last_updated is None else AssetFreshness(last_update_time=last_updated)

    metadata = AssetMetadata(
        name=d["asset_name"],
        database=d["database"],  # ← SUBSTITUTE: use database as top-level namespace
        schema=d["schema"],
        description=d.get("description"),
    )
    return RelationalAsset(
        type=d.get("asset_type", "TABLE"),
        metadata=metadata,
        fields=columns,
        volume=volume,
        freshness=freshness,
    )
78
+
79
+
80
def _push_result_path(manifest_path: str) -> str:
    """Return the path the push summary for ``manifest_path`` is written to.

    A trailing ".json" is swapped for "_push_result.json"; any other name gets
    "_push_result.json" appended, so the result never equals the input path.
    """
    suffix = ".json"
    stem = manifest_path[: -len(suffix)] if manifest_path.endswith(suffix) else manifest_path
    return f"{stem}_push_result{suffix}"


def push(
    manifest_path: str,
    resource_uuid: str,
    key_id: str,
    key_token: str,
    batch_size: int = DEFAULT_BATCH_SIZE,
) -> dict[str, Any]:
    """Read a collect manifest and push assets to Monte Carlo in batches.

    The manifest's "assets" list is rebuilt into ``RelationalAsset`` objects,
    split into slices of ``batch_size`` and sent with up to four worker
    threads. A summary is written next to the manifest (see
    ``_push_result_path``) and returned.

    Returns a summary dict with invocation IDs and counts.

    Raises:
        ValueError: if ``batch_size`` is less than 1.
        Exception: the first failure raised by a batch push is logged and re-raised.
    """
    # Fail early with a clear message instead of an unrelated error from
    # range() or ThreadPoolExecutor further down.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    with open(manifest_path) as fh:
        manifest = json.load(fh)

    asset_dicts: list[dict[str, Any]] = manifest["assets"]
    assets = [_asset_from_dict(d) for d in asset_dicts]
    log.info("Loaded %d assets from %s", len(assets), manifest_path)

    # Split into batches. max(..., 1) keeps one (empty) batch when the manifest
    # has no assets, so a single call is still made.
    # NOTE(review): confirm that sending an empty batch is intended.
    batches = [assets[i : i + batch_size] for i in range(0, max(len(assets), 1), batch_size)]
    total_batches = len(batches)

    def _push_batch(batch: list, batch_num: int) -> str | None:
        """Push a single batch through a Client/Session created for this call."""
        client = Client(session=Session(mcd_id=key_id, mcd_token=key_token, scope="Ingestion"))
        service = IngestionService(mc_client=client)
        result = service.send_metadata(
            resource_uuid=resource_uuid,
            resource_type=RESOURCE_TYPE,
            events=batch,
        )
        invocation_id = service.extract_invocation_id(result)
        log.info("Pushed batch %d/%d (%d assets) — invocation_id=%s", batch_num, total_batches, len(batch), invocation_id)
        return invocation_id

    # Push batches in parallel (each call builds its own pycarlo Session)
    max_workers = min(4, total_batches)
    # Pre-sized so results land at their batch index regardless of completion order.
    invocation_ids: list[str | None] = [None] * total_batches

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        futures = {
            pool.submit(_push_batch, batch, i + 1): i
            for i, batch in enumerate(batches)
        }
        for future in as_completed(futures):
            idx = futures[future]
            try:
                invocation_ids[idx] = future.result()
            except Exception as exc:
                log.error("ERROR pushing batch %d: %s", idx + 1, exc)
                raise

    log.info("All %d batches pushed (%d workers)", total_batches, max_workers)

    summary = {
        "resource_uuid": resource_uuid,
        "resource_type": RESOURCE_TYPE,
        "invocation_ids": invocation_ids,
        "pushed_at": datetime.now(timezone.utc).isoformat(),
        "asset_count": len(assets),
        "batch_count": total_batches,
        "batch_size": batch_size,
    }

    # str.replace(".json", ...) would return the manifest path itself when it
    # has no ".json", overwriting the input; derive the output path from the
    # suffix only.
    push_manifest_path = _push_result_path(manifest_path)
    with open(push_manifest_path, "w") as fh:
        json.dump(summary, fh, indent=2)
    log.info("Push result written to %s", push_manifest_path)

    return summary
152
+
153
+
154
def main() -> None:
    """Parse CLI arguments (with environment-variable fallbacks) and run push().

    Exits through ``parser.error`` when the resource UUID, key id or key token
    is missing from both the command line and the environment.
    """
    cli = argparse.ArgumentParser(description="Push Redshift metadata to Monte Carlo from manifest")
    cli.add_argument("--manifest", default="manifest_metadata.json")
    # Credentials and the resource UUID default to environment variables.
    for flag, env_var in (
        ("--resource-uuid", "MCD_RESOURCE_UUID"),
        ("--key-id", "MCD_INGEST_ID"),
        ("--key-token", "MCD_INGEST_TOKEN"),
    ):
        cli.add_argument(flag, default=os.getenv(env_var))
    cli.add_argument("--batch-size", type=int, default=DEFAULT_BATCH_SIZE)
    opts = cli.parse_args()

    missing = [name for name in ("resource_uuid", "key_id", "key_token") if getattr(opts, name) is None]
    if missing:
        cli.error(f"Missing required arguments/env vars: {missing}")

    push(
        manifest_path=opts.manifest,
        resource_uuid=opts.resource_uuid,
        key_id=opts.key_id,
        key_token=opts.key_token,
        batch_size=opts.batch_size,
    )
175
+
176
+
177
# Run the CLI only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()