opencode-skills-collection 2.0.0 → 2.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90) hide show
  1. package/bundled-skills/.antigravity-install-manifest.json +6 -1
  2. package/bundled-skills/docs/integrations/jetski-cortex.md +3 -3
  3. package/bundled-skills/docs/integrations/jetski-gemini-loader/README.md +1 -1
  4. package/bundled-skills/docs/maintainers/repo-growth-seo.md +3 -3
  5. package/bundled-skills/docs/maintainers/skills-update-guide.md +1 -1
  6. package/bundled-skills/docs/users/bundles.md +1 -1
  7. package/bundled-skills/docs/users/claude-code-skills.md +1 -1
  8. package/bundled-skills/docs/users/gemini-cli-skills.md +1 -1
  9. package/bundled-skills/docs/users/getting-started.md +1 -1
  10. package/bundled-skills/docs/users/kiro-integration.md +1 -1
  11. package/bundled-skills/docs/users/usage.md +4 -4
  12. package/bundled-skills/docs/users/visual-guide.md +4 -4
  13. package/bundled-skills/manage-skills/SKILL.md +187 -0
  14. package/bundled-skills/monte-carlo-monitor-creation/SKILL.md +222 -0
  15. package/bundled-skills/monte-carlo-monitor-creation/references/comparison-monitor.md +426 -0
  16. package/bundled-skills/monte-carlo-monitor-creation/references/custom-sql-monitor.md +207 -0
  17. package/bundled-skills/monte-carlo-monitor-creation/references/metric-monitor.md +292 -0
  18. package/bundled-skills/monte-carlo-monitor-creation/references/table-monitor.md +231 -0
  19. package/bundled-skills/monte-carlo-monitor-creation/references/validation-monitor.md +404 -0
  20. package/bundled-skills/monte-carlo-prevent/SKILL.md +252 -0
  21. package/bundled-skills/monte-carlo-prevent/references/TROUBLESHOOTING.md +23 -0
  22. package/bundled-skills/monte-carlo-prevent/references/parameters.md +32 -0
  23. package/bundled-skills/monte-carlo-prevent/references/workflows.md +478 -0
  24. package/bundled-skills/monte-carlo-push-ingestion/SKILL.md +363 -0
  25. package/bundled-skills/monte-carlo-push-ingestion/references/anomaly-detection.md +87 -0
  26. package/bundled-skills/monte-carlo-push-ingestion/references/custom-lineage.md +203 -0
  27. package/bundled-skills/monte-carlo-push-ingestion/references/direct-http-api.md +207 -0
  28. package/bundled-skills/monte-carlo-push-ingestion/references/prerequisites.md +150 -0
  29. package/bundled-skills/monte-carlo-push-ingestion/references/push-lineage.md +160 -0
  30. package/bundled-skills/monte-carlo-push-ingestion/references/push-metadata.md +158 -0
  31. package/bundled-skills/monte-carlo-push-ingestion/references/push-query-logs.md +219 -0
  32. package/bundled-skills/monte-carlo-push-ingestion/references/validation.md +257 -0
  33. package/bundled-skills/monte-carlo-push-ingestion/scripts/sample_verify.py +357 -0
  34. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_and_push_lineage.py +70 -0
  35. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_and_push_metadata.py +65 -0
  36. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_and_push_query_logs.py +70 -0
  37. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_lineage.py +214 -0
  38. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_metadata.py +160 -0
  39. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_query_logs.py +164 -0
  40. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/push_lineage.py +198 -0
  41. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/push_metadata.py +193 -0
  42. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/push_query_logs.py +207 -0
  43. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/collect_and_push_metadata.py +71 -0
  44. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/collect_and_push_query_logs.py +64 -0
  45. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/collect_metadata.py +253 -0
  46. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/collect_query_logs.py +149 -0
  47. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/push_metadata.py +190 -0
  48. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/push_query_logs.py +208 -0
  49. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_and_push_lineage.py +83 -0
  50. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_and_push_metadata.py +77 -0
  51. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_and_push_query_logs.py +83 -0
  52. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_lineage.py +240 -0
  53. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_metadata.py +212 -0
  54. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_query_logs.py +204 -0
  55. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/push_lineage.py +192 -0
  56. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/push_metadata.py +178 -0
  57. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/push_query_logs.py +200 -0
  58. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_and_push_lineage.py +119 -0
  59. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_and_push_metadata.py +119 -0
  60. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_and_push_query_logs.py +117 -0
  61. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_lineage.py +265 -0
  62. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_metadata.py +313 -0
  63. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_query_logs.py +284 -0
  64. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/push_lineage.py +309 -0
  65. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/push_metadata.py +245 -0
  66. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/push_query_logs.py +255 -0
  67. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_and_push_lineage.py +78 -0
  68. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_and_push_metadata.py +80 -0
  69. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_and_push_query_logs.py +88 -0
  70. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_lineage.py +235 -0
  71. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_metadata.py +219 -0
  72. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_query_logs.py +239 -0
  73. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/push_lineage.py +178 -0
  74. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/push_metadata.py +178 -0
  75. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/push_query_logs.py +196 -0
  76. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_and_push_lineage.py +154 -0
  77. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_and_push_metadata.py +137 -0
  78. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_and_push_query_logs.py +137 -0
  79. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_lineage.py +349 -0
  80. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_metadata.py +329 -0
  81. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_query_logs.py +254 -0
  82. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/push_lineage.py +307 -0
  83. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/push_metadata.py +228 -0
  84. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/push_query_logs.py +248 -0
  85. package/bundled-skills/monte-carlo-push-ingestion/scripts/test_template_sdk_usage.py +340 -0
  86. package/bundled-skills/monte-carlo-validation-notebook/SKILL.md +685 -0
  87. package/bundled-skills/monte-carlo-validation-notebook/scripts/generate_notebook_url.py +141 -0
  88. package/bundled-skills/monte-carlo-validation-notebook/scripts/resolve_dbt_schema.py +161 -0
  89. package/package.json +1 -1
  90. package/skills_index.json +503 -61
@@ -0,0 +1,245 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Push a collected Hive metadata manifest to Monte Carlo — push only.
4
+
5
+ Reads a JSON manifest produced by ``collect_metadata.py``, builds
6
+ RelationalAsset objects, and calls ``send_metadata`` in batches. The manifest
7
+ is updated in-place with ``resource_uuid`` and ``invocation_id`` after a
8
+ successful push.
9
+
10
+ Can be run standalone via CLI or imported (use the ``push()`` function).
11
+
12
+ Substitution points
13
+ -------------------
14
+ - MCD_INGEST_ID (env) / --key-id (CLI) : Monte Carlo ingestion key ID
15
+ - MCD_INGEST_TOKEN (env) / --key-token (CLI) : Monte Carlo ingestion key token
16
+ - MCD_RESOURCE_UUID (env) / --resource-uuid (CLI) : MC resource UUID for this connection
17
+
18
+ Prerequisites
19
+ -------------
20
+ pip install pycarlo python-dotenv
21
+
22
+ Usage
23
+ -----
24
+ python push_metadata.py \\
25
+ --key-id <MCD_INGEST_ID> \\
26
+ --key-token <MCD_INGEST_TOKEN> \\
27
+ --resource-uuid <MCD_RESOURCE_UUID> \\
28
+ --input-file metadata_output.json
29
+ """
30
+
31
+ import argparse
32
+ import json
33
+ import os
34
+ from concurrent.futures import ThreadPoolExecutor, as_completed
35
+ from datetime import datetime, timezone
36
+
37
+ from pycarlo.core import Client, Session
38
+ from pycarlo.features.ingestion import IngestionService
39
+ from pycarlo.features.ingestion.models import (
40
+ AssetField,
41
+ AssetFreshness,
42
+ AssetMetadata,
43
+ AssetVolume,
44
+ RelationalAsset,
45
+ )
46
+
47
+ # ← SUBSTITUTE: default batch size for metadata push (assets per request)
48
+ DEFAULT_BATCH_SIZE = 500
49
+
50
+ # ← SUBSTITUTE: HTTP timeout for MC ingestion requests (seconds)
51
+ DEFAULT_TIMEOUT_SECONDS = 120
52
+
53
+
54
def _build_assets(manifest: dict) -> list[RelationalAsset]:
    """Rebuild RelationalAsset objects from a collected metadata manifest.

    Every entry of ``manifest["assets"]`` must carry ``name``, ``database``
    and ``schema``. ``description``, ``created_on``, ``fields``,
    ``row_count``, ``byte_count`` and ``last_modified`` are optional.

    Args:
        manifest: Dict loaded from a ``collect_metadata.py`` output file.

    Returns:
        One ``RelationalAsset`` of type ``"TABLE"`` per manifest entry, in
        manifest order. An empty or missing ``assets`` list yields ``[]``.

    Raises:
        KeyError: If an asset lacks ``name``/``database``/``schema`` or a
            field lacks ``name``/``type``.
    """
    assets = []
    for entry in manifest.get("assets", []):
        fields = [
            AssetField(
                name=col["name"],
                type=col["type"],
                description=col.get("description"),
            )
            for col in entry.get("fields", [])
        ]

        # Only strictly positive counts are forwarded; zero, negative or
        # missing values are treated as "unknown".
        row_count = entry.get("row_count")
        byte_count = entry.get("byte_count")
        row_count = row_count if row_count and row_count > 0 else None
        byte_count = byte_count if byte_count and byte_count > 0 else None

        # Attach a volume only when at least one usable count remains, so an
        # entry such as row_count=-1 does not produce an AssetVolume whose
        # values are both None.
        volume = None
        if row_count is not None or byte_count is not None:
            volume = AssetVolume(row_count=row_count, byte_count=byte_count)

        freshness = None
        last_modified = entry.get("last_modified")
        if last_modified:
            freshness = AssetFreshness(last_update_time=last_modified)

        assets.append(
            RelationalAsset(
                type="TABLE",
                metadata=AssetMetadata(
                    name=entry["name"],
                    database=entry["database"],
                    schema=entry["schema"],
                    description=entry.get("description"),
                    created_on=entry.get("created_on"),
                ),
                fields=fields,
                volume=volume,
                freshness=freshness,
            )
        )
    return assets
97
+
98
+
99
def push(
    manifest: dict,
    resource_uuid: str,
    key_id: str,
    key_token: str,
    batch_size: int = DEFAULT_BATCH_SIZE,
    timeout_seconds: int = DEFAULT_TIMEOUT_SECONDS,
) -> str | None:
    """
    Push collected metadata to Monte Carlo and update the manifest in-place.

    Assets are sent in batches of ``batch_size`` (default 500) to avoid
    oversized payloads, using up to four worker threads. After all batches
    succeed the manifest gains ``resource_uuid`` and ``invocation_id`` (the
    ID returned for the highest-numbered batch). When more than one batch
    returned an ID, the full per-batch list is stored under
    ``invocation_ids``; otherwise any existing ``invocation_ids`` key is
    removed.

    Args:
        manifest: Dict loaded from a ``collect_metadata.py`` output file.
        resource_uuid: MC resource UUID for this Hive connection.
        key_id: MC ingestion key ID.
        key_token: MC ingestion key token.
        batch_size: Assets per request (default 500). Must be >= 1.
        timeout_seconds: Accepted for interface compatibility.
            NOTE(review): this function does not currently forward the
            value to pycarlo, so it has no effect on requests.

    Returns:
        The invocation ID of the last batch if returned by MC, otherwise None.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        Exception: Any error raised while pushing a batch is printed and
            re-raised; the manifest is left unmodified in that case.
    """
    resource_type = manifest.get("resource_type", "data-lake")

    assets = _build_assets(manifest)
    n = len(assets)

    print(f"Loaded {n} asset(s) from manifest")

    # Fail early with a clear message; otherwise a zero/negative value
    # surfaces as an opaque ValueError from range() or ThreadPoolExecutor.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    # max(n, 1) keeps a single (empty) batch when the manifest has no assets,
    # so one request is still issued in that case.
    batch_list = [assets[i : i + batch_size] for i in range(0, max(n, 1), batch_size)]
    total_batches = len(batch_list)

    def _push_batch(batch: list, batch_num: int) -> str | None:
        """Push a single batch through a Client/Session created for this call."""
        client = Client(session=Session(mcd_id=key_id, mcd_token=key_token, scope="Ingestion"))
        service = IngestionService(mc_client=client)
        result = service.send_metadata(
            resource_uuid=resource_uuid,
            resource_type=resource_type,
            events=batch,
        )
        invocation_id = service.extract_invocation_id(result)
        print(f" Pushed batch {batch_num}/{total_batches} ({len(batch)} assets) — invocation_id={invocation_id}")
        return invocation_id

    # Push batches in parallel; results are stored by batch index so the
    # order of invocation_ids matches the order of batches, not completion.
    max_workers = min(4, total_batches)
    invocation_ids: list[str | None] = [None] * total_batches

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        futures = {
            pool.submit(_push_batch, batch, i + 1): i
            for i, batch in enumerate(batch_list)
        }
        for future in as_completed(futures):
            idx = futures[future]
            try:
                invocation_ids[idx] = future.result()
            except Exception as exc:
                print(f" ERROR pushing batch {idx + 1}: {exc}")
                raise

    print(f" All {total_batches} batches pushed ({max_workers} workers)")

    manifest["resource_uuid"] = resource_uuid
    manifest["invocation_id"] = invocation_ids[-1]
    if sum(1 for inv in invocation_ids if inv) > 1:
        manifest["invocation_ids"] = invocation_ids
    else:
        manifest.pop("invocation_ids", None)

    return manifest["invocation_id"]
178
+
179
+
180
def main() -> None:
    """CLI entry point: load a metadata manifest, push it, and rewrite the file.

    Credentials and the resource UUID come from CLI flags, falling back to
    the ``MCD_INGEST_ID`` / ``MCD_INGEST_TOKEN`` / ``MCD_RESOURCE_UUID``
    environment variables. Missing values end the run via ``parser.error``.
    """
    cli = argparse.ArgumentParser(
        description="Push a collected Hive metadata manifest to Monte Carlo",
    )

    # (flag, add_argument keyword arguments) — registered in this order.
    option_specs = [
        (
            "--key-id",
            {
                "default": os.environ.get("MCD_INGEST_ID"),
                "help": "Monte Carlo ingestion key ID (env: MCD_INGEST_ID)",  # ← SUBSTITUTE env var name if different
            },
        ),
        (
            "--key-token",
            {
                "default": os.environ.get("MCD_INGEST_TOKEN"),
                "help": "Monte Carlo ingestion key token (env: MCD_INGEST_TOKEN)",  # ← SUBSTITUTE env var name if different
            },
        ),
        (
            "--resource-uuid",
            {
                "default": os.environ.get("MCD_RESOURCE_UUID"),
                "help": "Monte Carlo resource UUID for this Hive connection (env: MCD_RESOURCE_UUID)",
            },
        ),
        (
            "--input-file",
            {
                "default": "metadata_output.json",
                "help": "Path to the JSON manifest written by collect_metadata.py (default: metadata_output.json)",
            },
        ),
        (
            "--batch-size",
            {
                "type": int,
                "default": DEFAULT_BATCH_SIZE,
                "metavar": "N",
                "help": f"Max assets per POST (default: {DEFAULT_BATCH_SIZE})",
            },
        ),
        (
            "--timeout",
            {
                "type": int,
                "default": DEFAULT_TIMEOUT_SECONDS,
                "metavar": "SEC",
                "help": f"HTTP timeout per request in seconds (default: {DEFAULT_TIMEOUT_SECONDS})",
            },
        ),
    ]
    for flag, spec in option_specs:
        cli.add_argument(flag, **spec)
    opts = cli.parse_args()

    # Credentials are checked before the resource UUID, so the first missing
    # group is the one reported.
    if not (opts.key_id and opts.key_token):
        cli.error("--key-id and --key-token are required (or set MCD_INGEST_ID / MCD_INGEST_TOKEN)")
    if not opts.resource_uuid:
        cli.error("--resource-uuid is required (or set MCD_RESOURCE_UUID)")

    manifest_path = opts.input_file
    with open(manifest_path) as src:
        manifest = json.load(src)

    # push() mutates the manifest in place; the return value is not needed here.
    push(
        manifest=manifest,
        resource_uuid=opts.resource_uuid,
        key_id=opts.key_id,
        key_token=opts.key_token,
        batch_size=opts.batch_size,
        timeout_seconds=opts.timeout,
    )

    with open(manifest_path, "w") as dst:
        json.dump(manifest, dst, indent=2)
    print(f"Manifest updated in-place: {manifest_path}")
    print("Done.")
242
+
243
+
244
+ if __name__ == "__main__":
245
+ main()
@@ -0,0 +1,255 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Push a collected Hive query log manifest to Monte Carlo — push only.
4
+
5
+ Reads a JSON manifest produced by ``collect_query_logs.py``, builds
6
+ QueryLogEntry objects, and calls ``send_query_logs`` in batches. The manifest
7
+ is updated in-place with ``resource_uuid`` and ``invocation_id`` after a
8
+ successful push.
9
+
10
+ Can be run standalone via CLI or imported (use the ``push()`` function).
11
+
12
+ Substitution points
13
+ -------------------
14
+ - MCD_INGEST_ID (env) / --key-id (CLI) : Monte Carlo ingestion key ID
15
+ - MCD_INGEST_TOKEN (env) / --key-token (CLI) : Monte Carlo ingestion key token
16
+ - MCD_RESOURCE_UUID (env) / --resource-uuid (CLI) : MC resource UUID (optional for query logs)
17
+
18
+ Prerequisites
19
+ -------------
20
+ pip install pycarlo python-dateutil python-dotenv
21
+
22
+ Usage
23
+ -----
24
+ python push_query_logs.py \\
25
+ --key-id <MCD_INGEST_ID> \\
26
+ --key-token <MCD_INGEST_TOKEN> \\
27
+ --resource-uuid <MCD_RESOURCE_UUID> \\
28
+ --input-file query_logs_output.json
29
+ """
30
+
31
+ import argparse
32
+ import json
33
+ import os
34
+ from concurrent.futures import ThreadPoolExecutor, as_completed
35
+ from datetime import datetime, timezone
36
+
37
+ from dateutil.parser import isoparse
38
+
39
+ from pycarlo.core import Client, Session
40
+ from pycarlo.features.ingestion import IngestionService
41
+ from pycarlo.features.ingestion.models import QueryLogEntry
42
+
43
+ # ← SUBSTITUTE: default batch size for query log push (events per request)
44
+ # Query logs include full SQL text — keep batches small to stay under the 1 MB
45
+ # compressed payload limit. Even 50 entries can trigger 413 on active warehouses, so lower this value if you see 413 responses.
46
+ DEFAULT_BATCH_SIZE = 100
47
+
48
+ # ← SUBSTITUTE: HTTP timeout for MC ingestion requests (seconds)
49
+ DEFAULT_TIMEOUT_SECONDS = 120
50
+
51
+ # Truncate query_text longer than this to prevent 413 errors.
52
+ # Some SQL statements (e.g., generated by BI tools) can be 100KB+ and blow up
53
+ # compressed payloads even at small batch sizes.
54
+ _MAX_QUERY_TEXT_LEN = 10_000
55
+
56
+
57
def _build_events(manifest: dict) -> list[QueryLogEntry]:
    """
    Turn the ``queries`` list of a collected manifest into QueryLogEntry objects.

    ``start_time`` / ``end_time`` ISO strings are parsed to datetimes, with
    UTC assumed when no timezone is present. Entries repeating an already
    seen non-empty ``query_id`` are skipped, and query text longer than
    ``_MAX_QUERY_TEXT_LEN`` characters is cut and marked as truncated.
    """

    def _parse_utc(raw: str) -> datetime:
        # Naive timestamps are interpreted as UTC.
        parsed = isoparse(raw)
        return parsed if parsed.tzinfo else parsed.replace(tzinfo=timezone.utc)

    known_ids: set[str] = set()
    entries = []
    n_truncated = 0

    for record in manifest.get("queries", []):
        query_id = record.get("query_id")
        if query_id:
            # Entries without an ID are never deduplicated.
            if query_id in known_ids:
                continue
            known_ids.add(query_id)

        started = _parse_utc(record["start_time"])
        finished = _parse_utc(record["end_time"])

        sql = record.get("query") or ""
        # Truncate very long SQL to prevent 413 Request Too Large
        if len(sql) > _MAX_QUERY_TEXT_LEN:
            sql = sql[:_MAX_QUERY_TEXT_LEN] + "... [TRUNCATED]"
            n_truncated += 1

        entry = QueryLogEntry(
            start_time=started,
            end_time=finished,
            query_text=sql,
            query_id=query_id or None,
            user=record.get("user", "hadoop"),  # ← SUBSTITUTE: set the user appropriate for your cluster
            returned_rows=record.get("returned_rows"),
        )
        entries.append(entry)

    if n_truncated:
        print(f" Truncated {n_truncated} query text(s) exceeding {_MAX_QUERY_TEXT_LEN} chars")
    return entries
102
+
103
+
104
def push(
    manifest: dict,
    key_id: str,
    key_token: str,
    resource_uuid: str | None = None,
    batch_size: int = DEFAULT_BATCH_SIZE,
    timeout_seconds: int = DEFAULT_TIMEOUT_SECONDS,
) -> str | None:
    """
    Push collected query logs to Monte Carlo and update the manifest in-place.

    Events are sent in batches of ``batch_size`` (default 100) to avoid
    oversized payloads, using up to four worker threads. The manifest gains
    ``log_type``, ``resource_uuid`` (when given) and ``invocation_id`` (the
    ID returned for the highest-numbered batch). When more than one batch
    returned an ID, the per-batch list is stored under ``invocation_ids``;
    otherwise any existing ``invocation_ids`` key is removed.

    Args:
        manifest: Dict loaded from a ``collect_query_logs.py`` output file.
        key_id: MC ingestion key ID.
        key_token: MC ingestion key token.
        resource_uuid: Optional MC resource UUID.
        batch_size: Events per request (default 100). Must be >= 1 whenever
            there is at least one event to push.
        timeout_seconds: Accepted for interface compatibility.
            NOTE(review): this function does not currently forward the
            value to pycarlo, so it has no effect on requests.

    Returns:
        The invocation ID of the last batch if returned by MC, otherwise
        None (also None when the manifest holds no query log entries).

    Raises:
        ValueError: If there are events to push and ``batch_size`` < 1.
        Exception: Any error raised while pushing a batch is printed and
            re-raised; the manifest is left unmodified in that case.
    """
    log_type = manifest.get("log_type", "hive-s3")

    events = _build_events(manifest)
    n = len(events)
    print(f"Loaded {n} query log entry/entries from manifest")

    if not events:
        print("No query log entries to push.")
        manifest["log_type"] = log_type
        if resource_uuid is not None:
            manifest["resource_uuid"] = resource_uuid
        manifest["invocation_id"] = None
        # Nothing was pushed, so drop any per-batch ID list left over from an
        # earlier run instead of keeping it next to invocation_id=None.
        manifest.pop("invocation_ids", None)
        return None

    # Fail early with a clear message; otherwise a zero/negative value
    # surfaces as an opaque ValueError from range() or ThreadPoolExecutor.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    batch_list = [events[i : i + batch_size] for i in range(0, n, batch_size)]
    total_batches = len(batch_list)

    def _push_batch(batch: list, batch_num: int) -> str | None:
        """Push a single batch through a Client/Session created for this call."""
        client = Client(session=Session(mcd_id=key_id, mcd_token=key_token, scope="Ingestion"))
        service = IngestionService(mc_client=client)
        result = service.send_query_logs(
            resource_uuid=resource_uuid,
            log_type=log_type,
            events=batch,
        )
        invocation_id = service.extract_invocation_id(result)
        print(f" Pushed batch {batch_num}/{total_batches} ({len(batch)} entries) — invocation_id={invocation_id}")
        return invocation_id

    # Push batches in parallel; results are stored by batch index so the
    # order of invocation_ids matches the order of batches, not completion.
    max_workers = min(4, total_batches)
    invocation_ids: list[str | None] = [None] * total_batches

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        futures = {
            pool.submit(_push_batch, batch, i + 1): i
            for i, batch in enumerate(batch_list)
        }
        for future in as_completed(futures):
            idx = futures[future]
            try:
                invocation_ids[idx] = future.result()
            except Exception as exc:
                print(f" ERROR pushing batch {idx + 1}: {exc}")
                raise

    print(f" All {total_batches} batches pushed ({max_workers} workers)")

    manifest["log_type"] = log_type
    if resource_uuid is not None:
        manifest["resource_uuid"] = resource_uuid
    manifest["invocation_id"] = invocation_ids[-1]
    if sum(1 for inv in invocation_ids if inv) > 1:
        manifest["invocation_ids"] = invocation_ids
    else:
        manifest.pop("invocation_ids", None)

    return manifest["invocation_id"]
191
+
192
+
193
def main() -> None:
    """CLI entry point: load a query log manifest, push it, and rewrite the file.

    Credentials come from CLI flags, falling back to the ``MCD_INGEST_ID`` /
    ``MCD_INGEST_TOKEN`` environment variables; the resource UUID is
    optional. Missing credentials end the run via ``parser.error``.
    """
    cli = argparse.ArgumentParser(
        description="Push a collected Hive query log manifest to Monte Carlo",
    )

    # (flag, add_argument keyword arguments) — registered in this order.
    option_specs = [
        (
            "--key-id",
            {
                "default": os.environ.get("MCD_INGEST_ID"),
                "help": "Monte Carlo ingestion key ID (env: MCD_INGEST_ID)",
            },
        ),
        (
            "--key-token",
            {
                "default": os.environ.get("MCD_INGEST_TOKEN"),
                "help": "Monte Carlo ingestion key token (env: MCD_INGEST_TOKEN)",
            },
        ),
        (
            "--resource-uuid",
            {
                "default": os.environ.get("MCD_RESOURCE_UUID"),
                "help": "Monte Carlo resource UUID (optional for query logs) (env: MCD_RESOURCE_UUID)",
            },
        ),
        (
            "--input-file",
            {
                "default": "query_logs_output.json",
                "help": "Path to the JSON manifest written by collect_query_logs.py (default: query_logs_output.json)",
            },
        ),
        (
            "--batch-size",
            {
                "type": int,
                "default": DEFAULT_BATCH_SIZE,
                "metavar": "N",
                "help": f"Max events per POST (default: {DEFAULT_BATCH_SIZE})",
            },
        ),
        (
            "--timeout",
            {
                "type": int,
                "default": DEFAULT_TIMEOUT_SECONDS,
                "metavar": "SEC",
                "help": f"HTTP timeout per request in seconds (default: {DEFAULT_TIMEOUT_SECONDS})",
            },
        ),
    ]
    for flag, spec in option_specs:
        cli.add_argument(flag, **spec)
    opts = cli.parse_args()

    if not (opts.key_id and opts.key_token):
        cli.error("--key-id and --key-token are required (or set MCD_INGEST_ID / MCD_INGEST_TOKEN)")

    manifest_path = opts.input_file
    with open(manifest_path) as src:
        manifest = json.load(src)

    # push() mutates the manifest in place; the return value is not needed here.
    push(
        manifest=manifest,
        key_id=opts.key_id,
        key_token=opts.key_token,
        resource_uuid=opts.resource_uuid,
        batch_size=opts.batch_size,
        timeout_seconds=opts.timeout,
    )

    with open(manifest_path, "w") as dst:
        json.dump(manifest, dst, indent=2)
    print(f"Manifest updated in-place: {manifest_path}")
    print("Done.")
252
+
253
+
254
+ if __name__ == "__main__":
255
+ main()
@@ -0,0 +1,78 @@
1
+ """
2
+ Redshift — Lineage Collect & Push (combined)
3
+ ==============================================
4
+ Collects table-level lineage from Redshift by parsing query history, then pushes
5
+ the derived lineage events to Monte Carlo via the push ingestion API.
6
+
7
+ This script imports and calls collect() from collect_lineage and push() from
8
+ push_lineage, running both in sequence.
9
+
10
+ Substitution points (search for "← SUBSTITUTE"):
11
+ - REDSHIFT_HOST / REDSHIFT_DB / REDSHIFT_USER / REDSHIFT_PASSWORD : connection
12
+ - LOOKBACK_HOURS : how far back to scan query history (default 24 h)
13
+ - MCD_INGEST_ID / MCD_INGEST_TOKEN : Monte Carlo API credentials
14
+ - MCD_RESOURCE_UUID : UUID of the Redshift connection in Monte Carlo
15
+ - PUSH_BATCH_SIZE : number of events per API call (default 500)
16
+
17
+ Prerequisites:
18
+ pip install psycopg2-binary pycarlo
19
+ """
20
+
21
+ from __future__ import annotations
22
+
23
+ import argparse
24
+ import logging
25
+ import os
26
+
27
+ from collect_lineage import LOOKBACK_HOURS, collect
28
+ from push_lineage import DEFAULT_BATCH_SIZE, push
29
+
30
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
31
+ log = logging.getLogger(__name__)
32
+
33
+
34
def main() -> None:
    """CLI entry point: collect Redshift lineage, then push it to Monte Carlo.

    Connection settings and Monte Carlo credentials are read from CLI flags,
    falling back to the ``REDSHIFT_*`` and ``MCD_*`` environment variables.
    ``collect()`` writes the manifest to ``--manifest`` and ``push()`` reads
    it back from the same path.

    Exits via ``parser.error`` when a required value is missing or empty, or
    when ``REDSHIFT_PORT`` / ``--port`` is not an integer.
    """
    parser = argparse.ArgumentParser(description="Collect and push Redshift lineage to Monte Carlo")
    parser.add_argument("--host", default=os.getenv("REDSHIFT_HOST"))  # ← SUBSTITUTE
    parser.add_argument("--db", default=os.getenv("REDSHIFT_DB"))  # ← SUBSTITUTE
    parser.add_argument("--user", default=os.getenv("REDSHIFT_USER"))  # ← SUBSTITUTE
    parser.add_argument("--password", default=os.getenv("REDSHIFT_PASSWORD"))  # ← SUBSTITUTE
    # The default stays a string on purpose: argparse runs string defaults
    # through type=int, so a malformed REDSHIFT_PORT produces a normal usage
    # error instead of an uncaught ValueError while building the parser.
    parser.add_argument("--port", type=int, default=os.getenv("REDSHIFT_PORT", "5439"))
    parser.add_argument("--resource-uuid", default=os.getenv("MCD_RESOURCE_UUID"))
    parser.add_argument("--key-id", default=os.getenv("MCD_INGEST_ID"))
    parser.add_argument("--key-token", default=os.getenv("MCD_INGEST_TOKEN"))
    parser.add_argument("--lookback-hours", type=int, default=LOOKBACK_HOURS)
    parser.add_argument("--batch-size", type=int, default=DEFAULT_BATCH_SIZE)
    parser.add_argument("--manifest", default="manifest_lineage.json")
    args = parser.parse_args()

    # Treat empty strings (e.g. an env var exported with no value) as missing,
    # not only None, so the failure is reported here and not during connect.
    required = ["host", "db", "user", "password", "resource_uuid", "key_id", "key_token"]
    missing = [k for k in required if not getattr(args, k)]
    if missing:
        parser.error(f"Missing required arguments/env vars: {missing}")

    log.info("Step 1: Collecting lineage …")
    collect(
        host=args.host,
        db=args.db,
        user=args.user,
        password=args.password,
        manifest_path=args.manifest,
        port=args.port,
        lookback_hours=args.lookback_hours,
    )

    log.info("Step 2: Pushing lineage to Monte Carlo …")
    push(
        manifest_path=args.manifest,
        resource_uuid=args.resource_uuid,
        key_id=args.key_id,
        key_token=args.key_token,
        batch_size=args.batch_size,
    )

    log.info("Done — collect and push complete.")
75
+
76
+
77
+ if __name__ == "__main__":
78
+ main()
@@ -0,0 +1,80 @@
1
+ """
2
+ Redshift — Metadata Collect & Push (combined)
3
+ ===============================================
4
+ Collects table schemas, row counts, and byte sizes from Amazon Redshift,
5
+ then pushes them to Monte Carlo via the push ingestion API.
6
+
7
+ This script imports and calls collect() from collect_metadata and push() from
8
+ push_metadata, running both in sequence.
9
+
10
+ Substitution points (search for "← SUBSTITUTE"):
11
+ - REDSHIFT_HOST : Redshift cluster endpoint or serverless workgroup endpoint
12
+ - REDSHIFT_DB : database name to connect to
13
+ - REDSHIFT_USER : database user (or IAM role user)
14
+ - REDSHIFT_PASSWORD : database password
15
+ - DB_EXCLUSIONS : databases to skip
16
+ - SCHEMA_EXCLUSIONS : schemas to skip in every database
17
+ - MCD_INGEST_ID / MCD_INGEST_TOKEN : Monte Carlo API credentials
18
+ - MCD_RESOURCE_UUID : UUID of the Redshift connection in Monte Carlo
19
+ - PUSH_BATCH_SIZE : number of assets per API call (default 500)
20
+
21
+ Prerequisites:
22
+ pip install psycopg2-binary pycarlo
23
+ """
24
+
25
+ from __future__ import annotations
26
+
27
+ import argparse
28
+ import logging
29
+ import os
30
+
31
+ from collect_metadata import collect
32
+ from push_metadata import DEFAULT_BATCH_SIZE, push
33
+
34
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
35
+ log = logging.getLogger(__name__)
36
+
37
+
38
def main() -> None:
    """CLI entry point: collect Redshift metadata, then push it to Monte Carlo.

    Connection settings and Monte Carlo credentials are read from CLI flags,
    falling back to the ``REDSHIFT_*`` and ``MCD_*`` environment variables.
    ``collect()`` writes the manifest to ``--manifest`` and ``push()`` reads
    it back from the same path.

    Exits via ``parser.error`` when a required value is missing or empty, or
    when ``REDSHIFT_PORT`` / ``--port`` is not an integer.
    """
    parser = argparse.ArgumentParser(description="Collect and push Redshift metadata to Monte Carlo")
    parser.add_argument("--host", default=os.getenv("REDSHIFT_HOST"))  # ← SUBSTITUTE
    parser.add_argument("--db", default=os.getenv("REDSHIFT_DB"))  # ← SUBSTITUTE
    parser.add_argument("--user", default=os.getenv("REDSHIFT_USER"))  # ← SUBSTITUTE
    parser.add_argument("--password", default=os.getenv("REDSHIFT_PASSWORD"))  # ← SUBSTITUTE
    # The default stays a string on purpose: argparse runs string defaults
    # through type=int, so a malformed REDSHIFT_PORT produces a normal usage
    # error instead of an uncaught ValueError while building the parser.
    parser.add_argument("--port", type=int, default=os.getenv("REDSHIFT_PORT", "5439"))
    parser.add_argument("--resource-uuid", default=os.getenv("MCD_RESOURCE_UUID"))
    parser.add_argument("--key-id", default=os.getenv("MCD_INGEST_ID"))
    parser.add_argument("--key-token", default=os.getenv("MCD_INGEST_TOKEN"))
    parser.add_argument("--batch-size", type=int, default=DEFAULT_BATCH_SIZE)
    parser.add_argument("--manifest", default="manifest_metadata.json")
    args = parser.parse_args()

    # Treat empty strings (e.g. an env var exported with no value) as missing,
    # not only None, so the failure is reported here and not during connect.
    required = ["host", "db", "user", "password", "resource_uuid", "key_id", "key_token"]
    missing = [k for k in required if not getattr(args, k)]
    if missing:
        parser.error(f"Missing required arguments/env vars: {missing}")

    log.info("Step 1: Collecting metadata …")
    collect(
        host=args.host,
        db=args.db,
        user=args.user,
        password=args.password,
        manifest_path=args.manifest,
        port=args.port,
    )

    log.info("Step 2: Pushing metadata to Monte Carlo …")
    push(
        manifest_path=args.manifest,
        resource_uuid=args.resource_uuid,
        key_id=args.key_id,
        key_token=args.key_token,
        batch_size=args.batch_size,
    )

    log.info("Done — collect and push complete.")
77
+
78
+
79
+ if __name__ == "__main__":
80
+ main()