opencode-skills-collection 2.0.0 → 2.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90) hide show
  1. package/bundled-skills/.antigravity-install-manifest.json +6 -1
  2. package/bundled-skills/docs/integrations/jetski-cortex.md +3 -3
  3. package/bundled-skills/docs/integrations/jetski-gemini-loader/README.md +1 -1
  4. package/bundled-skills/docs/maintainers/repo-growth-seo.md +3 -3
  5. package/bundled-skills/docs/maintainers/skills-update-guide.md +1 -1
  6. package/bundled-skills/docs/users/bundles.md +1 -1
  7. package/bundled-skills/docs/users/claude-code-skills.md +1 -1
  8. package/bundled-skills/docs/users/gemini-cli-skills.md +1 -1
  9. package/bundled-skills/docs/users/getting-started.md +1 -1
  10. package/bundled-skills/docs/users/kiro-integration.md +1 -1
  11. package/bundled-skills/docs/users/usage.md +4 -4
  12. package/bundled-skills/docs/users/visual-guide.md +4 -4
  13. package/bundled-skills/manage-skills/SKILL.md +187 -0
  14. package/bundled-skills/monte-carlo-monitor-creation/SKILL.md +222 -0
  15. package/bundled-skills/monte-carlo-monitor-creation/references/comparison-monitor.md +426 -0
  16. package/bundled-skills/monte-carlo-monitor-creation/references/custom-sql-monitor.md +207 -0
  17. package/bundled-skills/monte-carlo-monitor-creation/references/metric-monitor.md +292 -0
  18. package/bundled-skills/monte-carlo-monitor-creation/references/table-monitor.md +231 -0
  19. package/bundled-skills/monte-carlo-monitor-creation/references/validation-monitor.md +404 -0
  20. package/bundled-skills/monte-carlo-prevent/SKILL.md +252 -0
  21. package/bundled-skills/monte-carlo-prevent/references/TROUBLESHOOTING.md +23 -0
  22. package/bundled-skills/monte-carlo-prevent/references/parameters.md +32 -0
  23. package/bundled-skills/monte-carlo-prevent/references/workflows.md +478 -0
  24. package/bundled-skills/monte-carlo-push-ingestion/SKILL.md +363 -0
  25. package/bundled-skills/monte-carlo-push-ingestion/references/anomaly-detection.md +87 -0
  26. package/bundled-skills/monte-carlo-push-ingestion/references/custom-lineage.md +203 -0
  27. package/bundled-skills/monte-carlo-push-ingestion/references/direct-http-api.md +207 -0
  28. package/bundled-skills/monte-carlo-push-ingestion/references/prerequisites.md +150 -0
  29. package/bundled-skills/monte-carlo-push-ingestion/references/push-lineage.md +160 -0
  30. package/bundled-skills/monte-carlo-push-ingestion/references/push-metadata.md +158 -0
  31. package/bundled-skills/monte-carlo-push-ingestion/references/push-query-logs.md +219 -0
  32. package/bundled-skills/monte-carlo-push-ingestion/references/validation.md +257 -0
  33. package/bundled-skills/monte-carlo-push-ingestion/scripts/sample_verify.py +357 -0
  34. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_and_push_lineage.py +70 -0
  35. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_and_push_metadata.py +65 -0
  36. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_and_push_query_logs.py +70 -0
  37. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_lineage.py +214 -0
  38. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_metadata.py +160 -0
  39. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_query_logs.py +164 -0
  40. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/push_lineage.py +198 -0
  41. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/push_metadata.py +193 -0
  42. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/push_query_logs.py +207 -0
  43. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/collect_and_push_metadata.py +71 -0
  44. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/collect_and_push_query_logs.py +64 -0
  45. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/collect_metadata.py +253 -0
  46. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/collect_query_logs.py +149 -0
  47. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/push_metadata.py +190 -0
  48. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/push_query_logs.py +208 -0
  49. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_and_push_lineage.py +83 -0
  50. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_and_push_metadata.py +77 -0
  51. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_and_push_query_logs.py +83 -0
  52. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_lineage.py +240 -0
  53. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_metadata.py +212 -0
  54. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_query_logs.py +204 -0
  55. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/push_lineage.py +192 -0
  56. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/push_metadata.py +178 -0
  57. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/push_query_logs.py +200 -0
  58. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_and_push_lineage.py +119 -0
  59. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_and_push_metadata.py +119 -0
  60. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_and_push_query_logs.py +117 -0
  61. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_lineage.py +265 -0
  62. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_metadata.py +313 -0
  63. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_query_logs.py +284 -0
  64. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/push_lineage.py +309 -0
  65. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/push_metadata.py +245 -0
  66. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/push_query_logs.py +255 -0
  67. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_and_push_lineage.py +78 -0
  68. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_and_push_metadata.py +80 -0
  69. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_and_push_query_logs.py +88 -0
  70. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_lineage.py +235 -0
  71. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_metadata.py +219 -0
  72. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_query_logs.py +239 -0
  73. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/push_lineage.py +178 -0
  74. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/push_metadata.py +178 -0
  75. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/push_query_logs.py +196 -0
  76. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_and_push_lineage.py +154 -0
  77. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_and_push_metadata.py +137 -0
  78. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_and_push_query_logs.py +137 -0
  79. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_lineage.py +349 -0
  80. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_metadata.py +329 -0
  81. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_query_logs.py +254 -0
  82. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/push_lineage.py +307 -0
  83. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/push_metadata.py +228 -0
  84. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/push_query_logs.py +248 -0
  85. package/bundled-skills/monte-carlo-push-ingestion/scripts/test_template_sdk_usage.py +340 -0
  86. package/bundled-skills/monte-carlo-validation-notebook/SKILL.md +685 -0
  87. package/bundled-skills/monte-carlo-validation-notebook/scripts/generate_notebook_url.py +141 -0
  88. package/bundled-skills/monte-carlo-validation-notebook/scripts/resolve_dbt_schema.py +161 -0
  89. package/package.json +1 -1
  90. package/skills_index.json +503 -61
@@ -0,0 +1,198 @@
1
+ """
2
+ BigQuery — Lineage Push (push only)
3
+ ====================================
4
+ Reads a manifest file produced by ``collect_lineage.py`` and pushes the lineage
5
+ events to Monte Carlo using the pycarlo push ingestion API. Large payloads are
6
+ split into batches to stay under the 1 MB compressed limit.
7
+
8
+ Can be run standalone via CLI or imported (use the ``push()`` function).
9
+
10
+ Substitution points (search for "← SUBSTITUTE"):
11
+ - MCD_INGEST_ID / MCD_INGEST_TOKEN : Monte Carlo API credentials
12
+ - MCD_RESOURCE_UUID : UUID of the BigQuery connection in Monte Carlo
13
+
14
+ Prerequisites:
15
+ pip install pycarlo
16
+ """
17
+
18
+ from __future__ import annotations
19
+
20
+ import argparse
21
+ import json
22
+ import logging
23
+ import os
24
+ from concurrent.futures import ThreadPoolExecutor, as_completed
25
+ from datetime import datetime, timezone
26
+
27
+ from pycarlo.core import Client, Session
28
+ from pycarlo.features.ingestion import IngestionService
29
+ from pycarlo.features.ingestion.models import (
30
+ LineageAssetRef,
31
+ LineageEvent,
32
+ )
33
+
34
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
35
+ log = logging.getLogger(__name__)
36
+
37
+ RESOURCE_TYPE = "bigquery"
38
+
39
+ # Maximum events per batch — conservative default to keep compressed payload under 1 MB
40
+ # ← SUBSTITUTE: tune based on average edge complexity (number of sources per event)
41
+ _BATCH_SIZE = 500
42
+
43
+
44
def _make_ref(database: str, schema: str, table: str) -> LineageAssetRef:
    """Return a ``TABLE``-typed LineageAssetRef for the given database/schema/table."""
    ref_kwargs = {
        "type": "TABLE",
        "name": table,
        "database": database,
        "schema": schema,
    }
    return LineageAssetRef(**ref_kwargs)
51
+
52
+
53
+ def _build_events(edges: list[dict]) -> list[LineageEvent]:
54
+ """Build LineageEvent objects from manifest edge dicts."""
55
+ events = []
56
+ for edge in edges:
57
+ dest = edge["destination"]
58
+ sources = edge.get("sources", [])
59
+ if not sources:
60
+ continue
61
+ events.append(
62
+ LineageEvent(
63
+ destination=_make_ref(dest["database"], dest["schema"], dest["table"]),
64
+ sources=[
65
+ _make_ref(s["database"], s["schema"], s["table"])
66
+ for s in sources
67
+ ],
68
+ )
69
+ )
70
+ return events
71
+
72
+
73
def push(
    input_file: str,
    resource_uuid: str,
    key_id: str,
    key_token: str,
    batch_size: int = _BATCH_SIZE,
    output_file: str = "lineage_push_result.json",
) -> dict:
    """
    Read a lineage manifest and push events to Monte Carlo in batches.

    Args:
        input_file: Path of the JSON manifest; its ``edges`` list is converted
            to lineage events and its optional ``resource_type`` overrides
            ``RESOURCE_TYPE``.
        resource_uuid: UUID of the Monte Carlo resource to push to.
        key_id: Ingestion key id used to open each pycarlo Session.
        key_token: Ingestion key token used to open each pycarlo Session.
        batch_size: Maximum number of events per push call; must be >= 1.
        output_file: Path the JSON push summary is written to.

    Returns:
        The push summary dict (also written to ``output_file``), including one
        invocation id per batch in batch order.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        Exception: Whatever a failed batch push raised, re-raised after being
            logged; no summary file is written in that case.
    """
    # Fail fast with a clear message instead of the cryptic ValueError that
    # range()/ThreadPoolExecutor would raise later for a non-positive size.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(input_file, encoding="utf-8") as fh:
        manifest = json.load(fh)

    edges = manifest.get("edges", [])
    resource_type = manifest.get("resource_type", RESOURCE_TYPE)
    events = _build_events(edges)
    log.info("Loaded %d lineage event(s) from %s", len(events), input_file)

    def _write_result(invocation_ids: list, batch_count: int) -> dict:
        """Build the push summary, persist it to ``output_file`` and return it."""
        push_result = {
            "resource_uuid": resource_uuid,
            "resource_type": resource_type,
            "invocation_ids": invocation_ids,
            "pushed_at": datetime.now(timezone.utc).isoformat(),
            "total_events": len(events),
            "batch_count": batch_count,
            "batch_size": batch_size,
        }
        with open(output_file, "w", encoding="utf-8") as fh:
            json.dump(push_result, fh, indent=2)
        return push_result

    if not events:
        log.info("No lineage events to push.")
        return _write_result([], 0)

    # Split into batches
    batches = [events[i : i + batch_size] for i in range(0, len(events), batch_size)]
    total_batches = len(batches)

    def _push_batch(batch: list, batch_num: int) -> str | None:
        """Push a single batch using a dedicated Session (thread-safe)."""
        log.info("Pushing batch %d/%d (%d events) ...", batch_num, total_batches, len(batch))
        # A fresh Client/Session per call means no session is shared between workers.
        client = Client(session=Session(mcd_id=key_id, mcd_token=key_token, scope="Ingestion"))
        service = IngestionService(mc_client=client)
        result = service.send_lineage(
            resource_uuid=resource_uuid,
            resource_type=resource_type,
            events=batch,
        )
        invocation_id = service.extract_invocation_id(result)
        if invocation_id:
            log.info(" Batch %d: invocation_id=%s", batch_num, invocation_id)
        return invocation_id

    # Push batches in parallel (each batch gets its own pycarlo Session)
    max_workers = min(4, total_batches)
    # Pre-sized so results can be stored by batch index as futures complete
    # out of order.
    invocation_ids: list[str | None] = [None] * total_batches

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        futures = {
            pool.submit(_push_batch, batch, i + 1): i
            for i, batch in enumerate(batches)
        }
        for future in as_completed(futures):
            idx = futures[future]
            try:
                invocation_ids[idx] = future.result()
            except Exception as exc:
                log.error("ERROR pushing batch %d: %s", idx + 1, exc)
                raise

    log.info("All %d batches pushed (%d workers)", total_batches, max_workers)

    push_result = _write_result(invocation_ids, total_batches)
    log.info("Push result written to %s", output_file)

    return push_result
163
+
164
+
165
def main() -> None:
    """CLI entry point: parse arguments, validate them and call ``push()``.

    ``--resource-uuid``, ``--key-id`` and ``--key-token`` default to the
    ``MCD_RESOURCE_UUID``, ``MCD_INGEST_ID`` and ``MCD_INGEST_TOKEN``
    environment variables. Exits through ``parser.error`` when a required
    value is missing or blank, or when ``--batch-size`` is not positive.
    """
    parser = argparse.ArgumentParser(
        description="Push BigQuery lineage from a manifest to Monte Carlo",
    )
    parser.add_argument("--resource-uuid", default=os.getenv("MCD_RESOURCE_UUID"))
    parser.add_argument("--key-id", default=os.getenv("MCD_INGEST_ID"))
    parser.add_argument("--key-token", default=os.getenv("MCD_INGEST_TOKEN"))
    parser.add_argument("--input-file", default="lineage_output.json")
    parser.add_argument("--output-file", default="lineage_push_result.json")
    parser.add_argument(
        "--batch-size",
        type=int,
        default=_BATCH_SIZE,
        help=f"Max events per push batch (default: {_BATCH_SIZE})",
    )
    args = parser.parse_args()

    required = ["resource_uuid", "key_id", "key_token"]
    # Truthiness (not ``is None``) so an exported-but-blank env var or an
    # empty CLI value is reported here rather than sent as a credential.
    missing = [k for k in required if not getattr(args, k)]
    if missing:
        parser.error(f"Missing required arguments/env vars: {missing}")
    if args.batch_size < 1:
        parser.error("--batch-size must be a positive integer")

    push(
        input_file=args.input_file,
        resource_uuid=args.resource_uuid,
        key_id=args.key_id,
        key_token=args.key_token,
        batch_size=args.batch_size,
        output_file=args.output_file,
    )
195
+
196
+
197
+ if __name__ == "__main__":
198
+ main()
@@ -0,0 +1,193 @@
1
+ """
2
+ BigQuery — Metadata Push (push only)
3
+ =====================================
4
+ Reads a manifest file produced by ``collect_metadata.py`` and pushes the assets
5
+ to Monte Carlo using the pycarlo push ingestion API. Large payloads are split
6
+ into batches to stay under the 1 MB compressed limit.
7
+
8
+ Can be run standalone via CLI or imported (use the ``push()`` function).
9
+
10
+ Substitution points (search for "← SUBSTITUTE"):
11
+ - MCD_INGEST_ID / MCD_INGEST_TOKEN : Monte Carlo API credentials
12
+ - MCD_RESOURCE_UUID : UUID of the BigQuery connection in Monte Carlo
13
+
14
+ Prerequisites:
15
+ pip install pycarlo
16
+ """
17
+
18
+ from __future__ import annotations
19
+
20
+ import argparse
21
+ import json
22
+ import logging
23
+ import os
24
+ from concurrent.futures import ThreadPoolExecutor, as_completed
25
+ from datetime import datetime, timezone
26
+
27
+ from pycarlo.core import Client, Session
28
+ from pycarlo.features.ingestion import IngestionService
29
+ from pycarlo.features.ingestion.models import (
30
+ AssetField,
31
+ AssetFreshness,
32
+ AssetMetadata,
33
+ AssetVolume,
34
+ RelationalAsset,
35
+ )
36
+
37
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
38
+ log = logging.getLogger(__name__)
39
+
40
+ RESOURCE_TYPE = "bigquery"
41
+
42
+ # Maximum assets per batch — conservative default to keep compressed payload under 1 MB
43
+ # ← SUBSTITUTE: tune based on average asset size (fields per table, description length, etc.)
44
+ _BATCH_SIZE = 500
45
+
46
+
47
def _asset_from_dict(d: dict) -> RelationalAsset:
    """Rebuild a RelationalAsset from one manifest asset dict.

    ``name``, ``database`` and ``schema`` are required keys. ``type`` falls
    back to ``"TABLE"``; ``volume`` and ``freshness`` are only built when the
    manifest entry has a non-empty value for them.
    """
    columns = []
    for col in d.get("fields", []):
        columns.append(
            AssetField(
                name=col["name"],
                type=col.get("type"),
                description=col.get("description"),
            )
        )

    vol_info = d.get("volume")
    if vol_info:
        volume = AssetVolume(
            row_count=vol_info.get("row_count"),
            byte_count=vol_info.get("byte_count"),
        )
    else:
        volume = None

    fresh_info = d.get("freshness")
    if fresh_info:
        freshness = AssetFreshness(last_update_time=fresh_info.get("last_update_time"))
    else:
        freshness = None

    asset_type = d.get("type", "TABLE")
    meta = AssetMetadata(
        name=d["name"],
        database=d["database"],  # ← SUBSTITUTE: use project or dataset as database
        schema=d["schema"],
        description=d.get("description"),
    )
    return RelationalAsset(
        type=asset_type,
        metadata=meta,
        fields=columns,
        volume=volume,
        freshness=freshness,
    )
83
+
84
+
85
def push(
    input_file: str,
    resource_uuid: str,
    key_id: str,
    key_token: str,
    batch_size: int = _BATCH_SIZE,
    output_file: str = "metadata_push_result.json",
) -> dict:
    """
    Read a metadata manifest and push assets to Monte Carlo in batches.

    Args:
        input_file: Path of the JSON manifest; its ``assets`` list is converted
            to RelationalAsset objects and its optional ``resource_type``
            overrides ``RESOURCE_TYPE``.
        resource_uuid: UUID of the Monte Carlo resource to push to.
        key_id: Ingestion key id used to open each pycarlo Session.
        key_token: Ingestion key token used to open each pycarlo Session.
        batch_size: Maximum number of assets per push call; must be >= 1.
        output_file: Path the JSON push summary is written to.

    Returns:
        The push summary dict (also written to ``output_file``), including one
        invocation id per batch in batch order.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        Exception: Whatever a failed batch push raised, re-raised after being
            logged; no summary file is written in that case.
    """
    # Fail fast with a clear message instead of the cryptic ValueError that
    # range()/ThreadPoolExecutor would raise later for a non-positive size.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(input_file, encoding="utf-8") as fh:
        manifest = json.load(fh)

    asset_dicts = manifest.get("assets", [])
    resource_type = manifest.get("resource_type", RESOURCE_TYPE)
    assets = [_asset_from_dict(d) for d in asset_dicts]
    log.info("Loaded %d asset(s) from %s", len(assets), input_file)

    # Split into batches. The max(..., 1) bound means an empty manifest still
    # yields a single empty batch that is pushed.
    # NOTE(review): presumably intentional — confirm with the ingestion API owners.
    batches = [
        assets[i : i + batch_size]
        for i in range(0, max(len(assets), 1), batch_size)
    ]
    total_batches = len(batches)

    def _push_batch(batch: list, batch_num: int) -> str | None:
        """Push a single batch using a dedicated Session (thread-safe)."""
        # A fresh Client/Session per call means no session is shared between workers.
        client = Client(session=Session(mcd_id=key_id, mcd_token=key_token, scope="Ingestion"))
        service = IngestionService(mc_client=client)
        result = service.send_metadata(
            resource_uuid=resource_uuid,
            resource_type=resource_type,
            events=batch,
        )
        invocation_id = service.extract_invocation_id(result)
        log.info("Pushed batch %d/%d (%d assets) — invocation_id=%s", batch_num, total_batches, len(batch), invocation_id)
        return invocation_id

    # Push batches in parallel (each batch gets its own pycarlo Session)
    max_workers = min(4, total_batches)
    # Pre-sized so results can be stored by batch index as futures complete
    # out of order.
    invocation_ids: list[str | None] = [None] * total_batches

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        futures = {
            pool.submit(_push_batch, batch, i + 1): i
            for i, batch in enumerate(batches)
        }
        for future in as_completed(futures):
            idx = futures[future]
            try:
                invocation_ids[idx] = future.result()
            except Exception as exc:
                log.error("ERROR pushing batch %d: %s", idx + 1, exc)
                raise

    log.info("All %d batches pushed (%d workers)", total_batches, max_workers)

    push_result = {
        "resource_uuid": resource_uuid,
        "resource_type": resource_type,
        "invocation_ids": invocation_ids,
        "pushed_at": datetime.now(timezone.utc).isoformat(),
        "total_assets": len(assets),
        "batch_count": total_batches,
        "batch_size": batch_size,
    }
    with open(output_file, "w", encoding="utf-8") as fh:
        json.dump(push_result, fh, indent=2)
    log.info("Push result written to %s", output_file)

    return push_result
158
+
159
+
160
def main() -> None:
    """CLI entry point: parse arguments, validate them and call ``push()``.

    ``--resource-uuid``, ``--key-id`` and ``--key-token`` default to the
    ``MCD_RESOURCE_UUID``, ``MCD_INGEST_ID`` and ``MCD_INGEST_TOKEN``
    environment variables. Exits through ``parser.error`` when a required
    value is missing or blank, or when ``--batch-size`` is not positive.
    """
    parser = argparse.ArgumentParser(
        description="Push BigQuery metadata from a manifest to Monte Carlo",
    )
    parser.add_argument("--resource-uuid", default=os.getenv("MCD_RESOURCE_UUID"))
    parser.add_argument("--key-id", default=os.getenv("MCD_INGEST_ID"))
    parser.add_argument("--key-token", default=os.getenv("MCD_INGEST_TOKEN"))
    parser.add_argument("--input-file", default="metadata_output.json")
    parser.add_argument("--output-file", default="metadata_push_result.json")
    parser.add_argument(
        "--batch-size",
        type=int,
        default=_BATCH_SIZE,
        help=f"Max assets per push batch (default: {_BATCH_SIZE})",
    )
    args = parser.parse_args()

    required = ["resource_uuid", "key_id", "key_token"]
    # Truthiness (not ``is None``) so an exported-but-blank env var or an
    # empty CLI value is reported here rather than sent as a credential.
    missing = [k for k in required if not getattr(args, k)]
    if missing:
        parser.error(f"Missing required arguments/env vars: {missing}")
    if args.batch_size < 1:
        parser.error("--batch-size must be a positive integer")

    push(
        input_file=args.input_file,
        resource_uuid=args.resource_uuid,
        key_id=args.key_id,
        key_token=args.key_token,
        batch_size=args.batch_size,
        output_file=args.output_file,
    )
190
+
191
+
192
+ if __name__ == "__main__":
193
+ main()
@@ -0,0 +1,207 @@
1
+ """
2
+ BigQuery — Query Log Push (push only)
3
+ ======================================
4
+ Reads a manifest file produced by ``collect_query_logs.py`` and pushes the query
5
+ log entries to Monte Carlo using the pycarlo push ingestion API. Large payloads
6
+ are split into batches to stay under the 1 MB compressed limit.
7
+
8
+ Can be run standalone via CLI or imported (use the ``push()`` function).
9
+
10
+ Substitution points (search for "← SUBSTITUTE"):
11
+ - MCD_INGEST_ID / MCD_INGEST_TOKEN : Monte Carlo API credentials
12
+ - MCD_RESOURCE_UUID : UUID of the BigQuery connection in Monte Carlo
13
+
14
+ Prerequisites:
15
+ pip install pycarlo
16
+ """
17
+
18
+ from __future__ import annotations
19
+
20
+ import argparse
21
+ import json
22
+ import logging
23
+ import os
24
+ from concurrent.futures import ThreadPoolExecutor, as_completed
25
+ from datetime import datetime, timezone
26
+
27
+ from dateutil.parser import isoparse
28
+ from pycarlo.core import Client, Session
29
+ from pycarlo.features.ingestion import IngestionService
30
+ from pycarlo.features.ingestion.models import QueryLogEntry
31
+
32
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
33
+ log = logging.getLogger(__name__)
34
+
35
+ LOG_TYPE = "bigquery"
36
+
37
+ # Maximum entries per batch — conservative default to keep compressed payload under 1 MB.
38
+ # Query logs include full SQL text — keep batches small to stay under the 1 MB
39
+ # compressed payload limit. 50 entries can trigger 413 on active warehouses.
40
+ # ← SUBSTITUTE: tune based on average query length
41
+ _BATCH_SIZE = 100
42
+
43
+ # Truncate query_text longer than this to prevent 413 errors.
44
+ # Some SQL statements (e.g., generated by BI tools) can be 100KB+ and blow up
45
+ # compressed payloads even at small batch sizes.
46
+ _MAX_QUERY_TEXT_LEN = 10_000
47
+
48
+
49
+ def _build_query_log_entries(queries: list[dict]) -> list[QueryLogEntry]:
50
+ """Convert manifest query dicts into QueryLogEntry objects."""
51
+ entries = []
52
+ truncated = 0
53
+ for q in queries:
54
+ query_text = q.get("query_text") or ""
55
+
56
+ # Truncate very long SQL to prevent 413 Request Too Large
57
+ if len(query_text) > _MAX_QUERY_TEXT_LEN:
58
+ query_text = query_text[:_MAX_QUERY_TEXT_LEN] + "... [TRUNCATED]"
59
+ truncated += 1
60
+
61
+ extra = {}
62
+ if q.get("total_bytes_billed") is not None:
63
+ extra["total_bytes_billed"] = q["total_bytes_billed"]
64
+ if q.get("statement_type") is not None:
65
+ extra["statement_type"] = q["statement_type"]
66
+
67
+ start_time = q.get("start_time")
68
+ end_time = q.get("end_time")
69
+
70
+ entry = QueryLogEntry(
71
+ query_id=q.get("query_id"),
72
+ query_text=query_text,
73
+ start_time=isoparse(start_time) if start_time else None,
74
+ end_time=isoparse(end_time) if end_time else None,
75
+ user=q.get("user"),
76
+ extra=extra or None,
77
+ )
78
+ entries.append(entry)
79
+ if truncated:
80
+ log.info("Truncated %d query text(s) exceeding %d chars", truncated, _MAX_QUERY_TEXT_LEN)
81
+ return entries
82
+
83
+
84
def push(
    input_file: str,
    resource_uuid: str,
    key_id: str,
    key_token: str,
    batch_size: int = _BATCH_SIZE,
    output_file: str = "query_logs_push_result.json",
) -> dict:
    """
    Read a query log manifest and push entries to Monte Carlo in batches.

    Args:
        input_file: Path of the JSON manifest; its ``queries`` list is
            converted to QueryLogEntry objects and its optional ``log_type``
            overrides ``LOG_TYPE``.
        resource_uuid: UUID of the Monte Carlo resource to push to.
        key_id: Ingestion key id used to open each pycarlo Session.
        key_token: Ingestion key token used to open each pycarlo Session.
        batch_size: Maximum number of entries per push call; must be >= 1.
        output_file: Path the JSON push summary is written to.

    Returns:
        The push summary dict (also written to ``output_file``), including one
        invocation id per batch in batch order.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        Exception: Whatever a failed batch push raised, re-raised after being
            logged; no summary file is written in that case.
    """
    # Fail fast with a clear message instead of the cryptic ValueError that
    # range()/ThreadPoolExecutor would raise later for a non-positive size.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(input_file, encoding="utf-8") as fh:
        manifest = json.load(fh)

    queries = manifest.get("queries", [])
    log_type = manifest.get("log_type", LOG_TYPE)
    entries = _build_query_log_entries(queries)
    log.info("Loaded %d query log entry/entries from %s", len(entries), input_file)

    def _write_result(invocation_ids: list, batch_count: int) -> dict:
        """Build the push summary, persist it to ``output_file`` and return it."""
        push_result = {
            "resource_uuid": resource_uuid,
            "log_type": log_type,
            "invocation_ids": invocation_ids,
            "pushed_at": datetime.now(timezone.utc).isoformat(),
            "total_entries": len(entries),
            "batch_count": batch_count,
            "batch_size": batch_size,
        }
        with open(output_file, "w", encoding="utf-8") as fh:
            json.dump(push_result, fh, indent=2)
        return push_result

    if not entries:
        log.info("No query log entries to push.")
        return _write_result([], 0)

    # Split into batches
    batches = [entries[i : i + batch_size] for i in range(0, len(entries), batch_size)]
    total_batches = len(batches)

    def _push_batch(batch: list, batch_num: int) -> str | None:
        """Push a single batch using a dedicated Session (thread-safe)."""
        # A fresh Client/Session per call means no session is shared between workers.
        client = Client(session=Session(mcd_id=key_id, mcd_token=key_token, scope="Ingestion"))
        service = IngestionService(mc_client=client)
        result = service.send_query_logs(
            resource_uuid=resource_uuid,
            log_type=log_type,
            events=batch,
        )
        invocation_id = service.extract_invocation_id(result)
        log.info("Pushed batch %d/%d (%d entries) — invocation_id=%s", batch_num, total_batches, len(batch), invocation_id)
        return invocation_id

    # Push batches in parallel (each batch gets its own pycarlo Session)
    max_workers = min(4, total_batches)
    # Pre-sized so results can be stored by batch index as futures complete
    # out of order.
    invocation_ids: list[str | None] = [None] * total_batches

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        futures = {
            pool.submit(_push_batch, batch, i + 1): i
            for i, batch in enumerate(batches)
        }
        for future in as_completed(futures):
            idx = futures[future]
            try:
                invocation_ids[idx] = future.result()
            except Exception as exc:
                log.error("ERROR pushing batch %d: %s", idx + 1, exc)
                raise

    log.info("All %d batches pushed (%d workers)", total_batches, max_workers)

    push_result = _write_result(invocation_ids, total_batches)
    log.info("Push result written to %s", output_file)

    return push_result
172
+
173
+
174
def main() -> None:
    """CLI entry point: parse arguments, validate them and call ``push()``.

    ``--resource-uuid``, ``--key-id`` and ``--key-token`` default to the
    ``MCD_RESOURCE_UUID``, ``MCD_INGEST_ID`` and ``MCD_INGEST_TOKEN``
    environment variables. Exits through ``parser.error`` when a required
    value is missing or blank, or when ``--batch-size`` is not positive.
    """
    parser = argparse.ArgumentParser(
        description="Push BigQuery query logs from a manifest to Monte Carlo",
    )
    parser.add_argument("--resource-uuid", default=os.getenv("MCD_RESOURCE_UUID"))
    parser.add_argument("--key-id", default=os.getenv("MCD_INGEST_ID"))
    parser.add_argument("--key-token", default=os.getenv("MCD_INGEST_TOKEN"))
    parser.add_argument("--input-file", default="query_logs_output.json")
    parser.add_argument("--output-file", default="query_logs_push_result.json")
    parser.add_argument(
        "--batch-size",
        type=int,
        default=_BATCH_SIZE,
        help=f"Max entries per push batch (default: {_BATCH_SIZE})",
    )
    args = parser.parse_args()

    required = ["resource_uuid", "key_id", "key_token"]
    # Truthiness (not ``is None``) so an exported-but-blank env var or an
    # empty CLI value is reported here rather than sent as a credential.
    missing = [k for k in required if not getattr(args, k)]
    if missing:
        parser.error(f"Missing required arguments/env vars: {missing}")
    if args.batch_size < 1:
        parser.error("--batch-size must be a positive integer")

    push(
        input_file=args.input_file,
        resource_uuid=args.resource_uuid,
        key_id=args.key_id,
        key_token=args.key_token,
        batch_size=args.batch_size,
        output_file=args.output_file,
    )
204
+
205
+
206
+ if __name__ == "__main__":
207
+ main()
@@ -0,0 +1,71 @@
1
+ """
2
+ BigQuery Iceberg — Metadata Collect & Push (combined)
3
+ =====================================================
4
+ Convenience wrapper that runs collect_metadata.collect() followed by
5
+ push_metadata.push() in a single invocation. Supports
6
+ ``--only-freshness-and-volume`` for fast periodic pushes.
7
+
8
+ Prerequisites:
9
+ pip install google-cloud-bigquery pycarlo>=0.12.251
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import argparse
15
+ import os
16
+
17
+ from collect_metadata import collect
18
+ from push_metadata import push
19
+
20
+
21
def main() -> None:
    """CLI entry point: collect BigQuery Iceberg metadata, then push it.

    Runs ``collect()`` to write the manifest to ``--manifest-file`` and then
    ``push()`` to send that manifest, writing the push summary to
    ``--push-result-file``. ``--project-id`` defaults to ``BIGQUERY_PROJECT_ID``;
    the push credentials default to ``MCD_RESOURCE_UUID``, ``MCD_INGEST_ID``
    and ``MCD_INGEST_TOKEN``. All validation happens before collection starts,
    so a bad push argument does not cost a full collection run.
    """
    parser = argparse.ArgumentParser(
        description="Collect BigQuery Iceberg metadata and push to Monte Carlo",
    )
    # Collection args
    parser.add_argument("--project-id", default=os.getenv("BIGQUERY_PROJECT_ID"))
    parser.add_argument("--datasets", nargs="+", default=None)
    parser.add_argument("--tables", nargs="+", default=None)
    parser.add_argument(
        "--only-freshness-and-volume",
        action="store_true",
        help="Skip field/schema collection — only collect freshness and volume.",
    )
    parser.add_argument("--manifest-file", default="metadata_output.json")

    # Push args
    parser.add_argument("--resource-uuid", default=os.getenv("MCD_RESOURCE_UUID"))
    parser.add_argument("--key-id", default=os.getenv("MCD_INGEST_ID"))
    parser.add_argument("--key-token", default=os.getenv("MCD_INGEST_TOKEN"))
    parser.add_argument("--batch-size", type=int, default=500)
    parser.add_argument("--push-result-file", default="metadata_push_result.json")

    args = parser.parse_args()

    if not args.project_id:
        parser.error("--project-id or BIGQUERY_PROJECT_ID env var is required")
    required_push = ["resource_uuid", "key_id", "key_token"]
    # Same truthiness rule as --project-id: a blank env var or empty CLI value
    # counts as missing instead of being forwarded as a credential.
    missing = [k for k in required_push if not getattr(args, k)]
    if missing:
        parser.error(f"Missing required push arguments/env vars: {missing}")
    if args.batch_size < 1:
        parser.error("--batch-size must be a positive integer")

    collect(
        project_id=args.project_id,
        datasets=args.datasets,
        tables=args.tables,
        only_freshness_and_volume=args.only_freshness_and_volume,
        output_file=args.manifest_file,
    )

    push(
        input_file=args.manifest_file,
        resource_uuid=args.resource_uuid,
        key_id=args.key_id,
        key_token=args.key_token,
        batch_size=args.batch_size,
        output_file=args.push_result_file,
    )
68
+
69
+
70
+ if __name__ == "__main__":
71
+ main()