opencode-skills-collection 2.0.0 → 2.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90)
  1. package/bundled-skills/.antigravity-install-manifest.json +6 -1
  2. package/bundled-skills/docs/integrations/jetski-cortex.md +3 -3
  3. package/bundled-skills/docs/integrations/jetski-gemini-loader/README.md +1 -1
  4. package/bundled-skills/docs/maintainers/repo-growth-seo.md +3 -3
  5. package/bundled-skills/docs/maintainers/skills-update-guide.md +1 -1
  6. package/bundled-skills/docs/users/bundles.md +1 -1
  7. package/bundled-skills/docs/users/claude-code-skills.md +1 -1
  8. package/bundled-skills/docs/users/gemini-cli-skills.md +1 -1
  9. package/bundled-skills/docs/users/getting-started.md +1 -1
  10. package/bundled-skills/docs/users/kiro-integration.md +1 -1
  11. package/bundled-skills/docs/users/usage.md +4 -4
  12. package/bundled-skills/docs/users/visual-guide.md +4 -4
  13. package/bundled-skills/manage-skills/SKILL.md +187 -0
  14. package/bundled-skills/monte-carlo-monitor-creation/SKILL.md +222 -0
  15. package/bundled-skills/monte-carlo-monitor-creation/references/comparison-monitor.md +426 -0
  16. package/bundled-skills/monte-carlo-monitor-creation/references/custom-sql-monitor.md +207 -0
  17. package/bundled-skills/monte-carlo-monitor-creation/references/metric-monitor.md +292 -0
  18. package/bundled-skills/monte-carlo-monitor-creation/references/table-monitor.md +231 -0
  19. package/bundled-skills/monte-carlo-monitor-creation/references/validation-monitor.md +404 -0
  20. package/bundled-skills/monte-carlo-prevent/SKILL.md +252 -0
  21. package/bundled-skills/monte-carlo-prevent/references/TROUBLESHOOTING.md +23 -0
  22. package/bundled-skills/monte-carlo-prevent/references/parameters.md +32 -0
  23. package/bundled-skills/monte-carlo-prevent/references/workflows.md +478 -0
  24. package/bundled-skills/monte-carlo-push-ingestion/SKILL.md +363 -0
  25. package/bundled-skills/monte-carlo-push-ingestion/references/anomaly-detection.md +87 -0
  26. package/bundled-skills/monte-carlo-push-ingestion/references/custom-lineage.md +203 -0
  27. package/bundled-skills/monte-carlo-push-ingestion/references/direct-http-api.md +207 -0
  28. package/bundled-skills/monte-carlo-push-ingestion/references/prerequisites.md +150 -0
  29. package/bundled-skills/monte-carlo-push-ingestion/references/push-lineage.md +160 -0
  30. package/bundled-skills/monte-carlo-push-ingestion/references/push-metadata.md +158 -0
  31. package/bundled-skills/monte-carlo-push-ingestion/references/push-query-logs.md +219 -0
  32. package/bundled-skills/monte-carlo-push-ingestion/references/validation.md +257 -0
  33. package/bundled-skills/monte-carlo-push-ingestion/scripts/sample_verify.py +357 -0
  34. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_and_push_lineage.py +70 -0
  35. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_and_push_metadata.py +65 -0
  36. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_and_push_query_logs.py +70 -0
  37. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_lineage.py +214 -0
  38. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_metadata.py +160 -0
  39. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_query_logs.py +164 -0
  40. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/push_lineage.py +198 -0
  41. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/push_metadata.py +193 -0
  42. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/push_query_logs.py +207 -0
  43. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/collect_and_push_metadata.py +71 -0
  44. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/collect_and_push_query_logs.py +64 -0
  45. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/collect_metadata.py +253 -0
  46. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/collect_query_logs.py +149 -0
  47. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/push_metadata.py +190 -0
  48. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/push_query_logs.py +208 -0
  49. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_and_push_lineage.py +83 -0
  50. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_and_push_metadata.py +77 -0
  51. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_and_push_query_logs.py +83 -0
  52. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_lineage.py +240 -0
  53. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_metadata.py +212 -0
  54. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_query_logs.py +204 -0
  55. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/push_lineage.py +192 -0
  56. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/push_metadata.py +178 -0
  57. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/push_query_logs.py +200 -0
  58. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_and_push_lineage.py +119 -0
  59. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_and_push_metadata.py +119 -0
  60. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_and_push_query_logs.py +117 -0
  61. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_lineage.py +265 -0
  62. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_metadata.py +313 -0
  63. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_query_logs.py +284 -0
  64. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/push_lineage.py +309 -0
  65. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/push_metadata.py +245 -0
  66. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/push_query_logs.py +255 -0
  67. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_and_push_lineage.py +78 -0
  68. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_and_push_metadata.py +80 -0
  69. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_and_push_query_logs.py +88 -0
  70. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_lineage.py +235 -0
  71. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_metadata.py +219 -0
  72. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_query_logs.py +239 -0
  73. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/push_lineage.py +178 -0
  74. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/push_metadata.py +178 -0
  75. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/push_query_logs.py +196 -0
  76. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_and_push_lineage.py +154 -0
  77. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_and_push_metadata.py +137 -0
  78. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_and_push_query_logs.py +137 -0
  79. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_lineage.py +349 -0
  80. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_metadata.py +329 -0
  81. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_query_logs.py +254 -0
  82. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/push_lineage.py +307 -0
  83. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/push_metadata.py +228 -0
  84. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/push_query_logs.py +248 -0
  85. package/bundled-skills/monte-carlo-push-ingestion/scripts/test_template_sdk_usage.py +340 -0
  86. package/bundled-skills/monte-carlo-validation-notebook/SKILL.md +685 -0
  87. package/bundled-skills/monte-carlo-validation-notebook/scripts/generate_notebook_url.py +141 -0
  88. package/bundled-skills/monte-carlo-validation-notebook/scripts/resolve_dbt_schema.py +161 -0
  89. package/package.json +1 -1
  90. package/skills_index.json +503 -61
@@ -0,0 +1,64 @@
1
+ """
2
+ BigQuery Iceberg — Query Log Collect & Push (combined)
3
+ =====================================================
4
+ Convenience wrapper that runs collect_query_logs.collect() followed by
5
+ push_query_logs.push() in a single invocation.
6
+
7
+ Prerequisites:
8
+ pip install google-cloud-bigquery pycarlo>=0.12.251 python-dateutil>=2.8.0
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import argparse
14
+ import os
15
+
16
+ from collect_query_logs import LOOKBACK_HOURS, LOOKBACK_LAG_HOURS, collect
17
+ from push_query_logs import push
18
+
19
+
20
def main() -> None:
    """Parse CLI arguments, collect query logs, then push them to Monte Carlo.

    Every option falls back to an environment variable or a module-level
    default. All required values are validated before ``collect()`` runs so
    that a missing or blank credential fails fast instead of after the
    collection step has already done its work.
    """
    parser = argparse.ArgumentParser(
        description="Collect BigQuery query logs and push to Monte Carlo",
    )
    # Collection args
    parser.add_argument("--project-id", default=os.getenv("BIGQUERY_PROJECT_ID"))
    parser.add_argument("--lookback-hours", type=int, default=LOOKBACK_HOURS)
    parser.add_argument("--lookback-lag-hours", type=int, default=LOOKBACK_LAG_HOURS)
    parser.add_argument("--manifest-file", default="query_logs_output.json")

    # Push args
    parser.add_argument("--resource-uuid", default=os.getenv("MCD_RESOURCE_UUID"))
    parser.add_argument("--key-id", default=os.getenv("MCD_INGEST_ID"))
    parser.add_argument("--key-token", default=os.getenv("MCD_INGEST_TOKEN"))
    parser.add_argument("--batch-size", type=int, default=100)
    parser.add_argument("--push-result-file", default="query_logs_push_result.json")

    args = parser.parse_args()

    if not args.project_id:
        parser.error("--project-id or BIGQUERY_PROJECT_ID env var is required")
    # Use truthiness (not ``is None``) so an exported-but-blank env var is
    # reported as missing, matching the --project-id check above.
    required_push = ["resource_uuid", "key_id", "key_token"]
    missing = [k for k in required_push if not getattr(args, k)]
    if missing:
        parser.error(f"Missing required push arguments/env vars: {missing}")

    # The manifest written by collect() is the input file read by push().
    collect(
        project_id=args.project_id,
        lookback_hours=args.lookback_hours,
        lookback_lag_hours=args.lookback_lag_hours,
        output_file=args.manifest_file,
    )

    push(
        input_file=args.manifest_file,
        resource_uuid=args.resource_uuid,
        key_id=args.key_id,
        key_token=args.key_token,
        batch_size=args.batch_size,
        output_file=args.push_result_file,
    )


if __name__ == "__main__":
    main()
@@ -0,0 +1,253 @@
1
+ """
2
+ BigQuery Iceberg — Metadata Collection (collect only)
3
+ =====================================================
4
+ Collects table schemas, row counts, byte sizes, and freshness for BigQuery
5
+ Iceberg (BigLake-managed) tables using INFORMATION_SCHEMA.TABLE_STORAGE and
6
+ INFORMATION_SCHEMA.COLUMNS. Standard BigQuery collection uses __TABLES__ which
7
+ does not include Iceberg tables — this template fills that gap.
8
+
9
+ Can be run standalone via CLI or imported (use the ``collect()`` function).
10
+
11
+ Supports a ``--only-freshness-and-volume`` flag to skip the COLUMNS query for
12
+ fast periodic pushes after the initial full metadata push.
13
+
14
+ Substitution points (search for "← SUBSTITUTE"):
15
+ - BIGQUERY_PROJECT_ID : GCP project ID to collect from
16
+ - GOOGLE_APPLICATION_CREDENTIALS : path to service-account JSON key file
17
+ - REGION : BigQuery region (default "us")
18
+
19
+ Prerequisites:
20
+ pip install google-cloud-bigquery
21
+ """
22
+
23
+ from __future__ import annotations
24
+
25
+ import argparse
26
+ import json
27
+ import logging
28
+ import os
29
+ from datetime import datetime, timezone
30
+
31
+ from google.cloud import bigquery
32
+
33
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
34
+ log = logging.getLogger(__name__)
35
+
36
+ RESOURCE_TYPE = "bigquery"
37
+
38
# Lookup table: BigQuery column type name → Monte Carlo canonical type name.
BQ_TYPE_MAP: dict[str, str] = {
    # numeric
    "INT64": "INTEGER",
    "INTEGER": "INTEGER",
    "FLOAT64": "FLOAT",
    "FLOAT": "FLOAT",
    # boolean
    "BOOL": "BOOLEAN",
    "BOOLEAN": "BOOLEAN",
    # text / binary
    "STRING": "VARCHAR",
    "BYTES": "BINARY",
    # date and time
    "DATE": "DATE",
    "DATETIME": "DATETIME",
    "TIMESTAMP": "TIMESTAMP",
    "TIME": "TIME",
    # fixed-point
    "NUMERIC": "DECIMAL",
    "BIGNUMERIC": "DECIMAL",
    # nested / repeated
    "RECORD": "STRUCT",
    "STRUCT": "STRUCT",
    "REPEATED": "ARRAY",
    # other
    "JSON": "JSON",
    "GEOGRAPHY": "GEOGRAPHY",
}


def map_bq_type(bq_type: str) -> str:
    """Translate a BigQuery type string into its canonical type name.

    Everything from the first ``(`` onward is ignored for the lookup, and the
    remaining text is trimmed and upper-cased before it is looked up in
    ``BQ_TYPE_MAP``. A type with no entry in the table is returned unchanged
    apart from being upper-cased (including any parenthesised suffix).
    """
    head, _, _ = bq_type.partition("(")
    key = head.strip().upper()
    if key in BQ_TYPE_MAP:
        return BQ_TYPE_MAP[key]
    return bq_type.upper()
65
+
66
+
67
def _fetch_iceberg_tables(
    client: bigquery.Client,
    project_id: str,
    datasets: list[str] | None = None,
    tables: list[str] | None = None,
    region: str = "us",  # ← SUBSTITUTE: BigQuery region that holds the datasets
) -> list[dict]:
    """Query TABLE_STORAGE for BigLake (Iceberg) tables.

    Args:
        client: BigQuery client used to run the query.
        project_id: Project whose regional INFORMATION_SCHEMA is queried.
        datasets: Optional dataset names to restrict the scan to.
        tables: Optional table names to restrict the scan to.
        region: Region suffix used in the ``region-<region>`` qualifier.

    Returns:
        One dict per matching row with the keys ``table_schema``,
        ``table_name``, ``total_rows``, ``current_physical_bytes``,
        ``storage_last_modified_time`` and ``creation_time``, ordered by
        dataset and then table name.
    """
    conditions = [
        "managed_table_type = 'BIGLAKE'",
        "deleted = FALSE",
    ]
    # Filter values are bound as query parameters instead of being spliced
    # into the SQL text, so a name containing a quote cannot break or alter
    # the statement.
    query_params: list[bigquery.ArrayQueryParameter] = []
    if datasets:
        conditions.append("table_schema IN UNNEST(@datasets)")
        query_params.append(
            bigquery.ArrayQueryParameter("datasets", "STRING", list(datasets))
        )
    if tables:
        conditions.append("table_name IN UNNEST(@tables)")
        query_params.append(
            bigquery.ArrayQueryParameter("tables", "STRING", list(tables))
        )

    where = " AND ".join(conditions)
    query = f"""
        SELECT
            table_schema,
            table_name,
            total_rows,
            current_physical_bytes,
            storage_last_modified_time,
            creation_time
        FROM `{project_id}.region-{region}`.INFORMATION_SCHEMA.TABLE_STORAGE
        WHERE {where}
        ORDER BY table_schema, table_name
    """
    job_config = bigquery.QueryJobConfig(query_parameters=query_params)

    log.info("Querying TABLE_STORAGE for Iceberg tables ...")
    rows = list(client.query(query, job_config=job_config).result())
    log.info("Found %d Iceberg table(s).", len(rows))
    return [dict(row) for row in rows]
102
+
103
+
104
def _fetch_columns(
    client: bigquery.Client,
    project_id: str,
    dataset: str,
    table_name: str,
) -> list[dict]:
    """Return the columns of one table as ``{"name", "type"}`` dicts.

    Runs a query against the dataset's INFORMATION_SCHEMA.COLUMNS view,
    ordered by ordinal position, and converts each ``data_type`` with
    ``map_bq_type``.
    """
    sql = f"""
        SELECT column_name, data_type, ordinal_position, is_nullable, column_default
        FROM `{project_id}.{dataset}.INFORMATION_SCHEMA.COLUMNS`
        WHERE table_name = '{table_name}'
        ORDER BY ordinal_position
    """
    columns: list[dict] = []
    for record in client.query(sql).result():
        column = {
            "name": record["column_name"],
            "type": map_bq_type(record["data_type"]),
        }
        columns.append(column)
    return columns
124
+
125
+
126
def _resolve_freshness(row: dict) -> str:
    """Pick the freshness timestamp for a table row, as an ISO8601 string.

    When ``storage_last_modified_time`` is present and truthy its
    ``isoformat()`` is returned. Otherwise a warning naming the table is
    logged and the current UTC time is returned instead.
    """
    modified = row.get("storage_last_modified_time")
    if not modified:
        log.warning(
            "storage_last_modified_time is NULL for %s.%s — "
            "falling back to current time. Google's TABLE_STORAGE update "
            "for Iceberg tables may not have shipped yet.",
            row["table_schema"],
            row["table_name"],
        )
        return datetime.now(timezone.utc).isoformat()

    return modified.isoformat()
143
+
144
+
145
def collect(
    project_id: str,
    datasets: list[str] | None = None,
    tables: list[str] | None = None,
    only_freshness_and_volume: bool = False,
    output_file: str = "metadata_output.json",
) -> dict:
    """Collect Iceberg table metadata and write a JSON manifest.

    When only_freshness_and_volume is True, skips the COLUMNS query and
    omits fields from the manifest. Use this for periodic hourly pushes
    after the initial full metadata push.

    The manifest is written even when no tables match, so a following push
    step never reads a missing file or a stale manifest from an earlier run.

    Args:
        project_id: GCP project to collect from.
        datasets: Optional dataset names to restrict collection to.
        tables: Optional table names to restrict collection to.
        only_freshness_and_volume: Skip per-table column collection.
        output_file: Path the JSON manifest is written to.

    Returns:
        The manifest dict with ``resource_type``, ``collected_at`` and
        ``assets``.
    """
    client = bigquery.Client(project=project_id)  # ← SUBSTITUTE: adjust auth if needed

    if only_freshness_and_volume:
        log.info("Running in freshness+volume only mode (skipping fields).")

    iceberg_tables = _fetch_iceberg_tables(client, project_id, datasets, tables)
    if not iceberg_tables:
        # Not fatal: fall through and write an empty manifest.
        log.warning("No Iceberg tables found matching the criteria.")

    assets: list[dict] = []
    for row in iceberg_tables:
        dataset = row["table_schema"]
        name = row["table_name"]

        asset = {
            "name": name,
            "database": project_id,
            "schema": dataset,
            "type": "TABLE",
            "volume": {
                "row_count": row["total_rows"],
                "byte_count": row["current_physical_bytes"],
            },
            "freshness": {
                "last_updated_time": _resolve_freshness(row),
            },
        }

        if not only_freshness_and_volume:
            asset["description"] = None
            # One COLUMNS query per table.
            asset["fields"] = _fetch_columns(client, project_id, dataset, name)

        assets.append(asset)
        log.info(
            "Collected %s.%s.%s — rows=%s, bytes=%s",
            project_id, dataset, name,
            row["total_rows"], row["current_physical_bytes"],
        )

    manifest = {
        "resource_type": RESOURCE_TYPE,
        "collected_at": datetime.now(timezone.utc).isoformat(),
        "assets": assets,
    }
    with open(output_file, "w", encoding="utf-8") as fh:
        json.dump(manifest, fh, indent=2)
    log.info("Manifest written to %s (%d assets)", output_file, len(assets))

    return manifest
208
+
209
+
210
def main() -> None:
    """CLI entry point: read the options and run ``collect()`` with them.

    ``--project-id`` defaults to the BIGQUERY_PROJECT_ID environment variable;
    the parser exits with an error when neither supplies a value.
    """
    cli = argparse.ArgumentParser(
        description="Collect BigQuery Iceberg table metadata into a JSON manifest",
    )

    project_default = os.getenv("BIGQUERY_PROJECT_ID")  # ← SUBSTITUTE
    cli.add_argument(
        "--project-id",
        default=project_default,
        help="GCP project ID (or set BIGQUERY_PROJECT_ID env var)",
    )

    # Optional scoping filters; both accept one or more values.
    cli.add_argument(
        "--datasets",
        nargs="+",
        default=None,
        help="Limit to specific dataset(s). Omit to scan all datasets.",
    )
    cli.add_argument(
        "--tables",
        nargs="+",
        default=None,
        help="Limit to specific table name(s) within the datasets.",
    )

    cli.add_argument(
        "--only-freshness-and-volume",
        action="store_true",
        help="Skip field/schema collection — only collect freshness and volume. "
        "Use for periodic hourly pushes after the initial full metadata push.",
    )
    cli.add_argument("--output-file", default="metadata_output.json")

    opts = cli.parse_args()
    if not opts.project_id:
        cli.error("--project-id or BIGQUERY_PROJECT_ID env var is required")

    collect(
        project_id=opts.project_id,
        datasets=opts.datasets,
        tables=opts.tables,
        only_freshness_and_volume=opts.only_freshness_and_volume,
        output_file=opts.output_file,
    )


if __name__ == "__main__":
    main()
@@ -0,0 +1,149 @@
1
+ """
2
+ BigQuery Iceberg — Query Log Collection (collect only)
3
+ ======================================================
4
+ Queries the BigQuery Jobs API for completed query jobs within a time
5
+ window and writes a JSON manifest that can be fed to push_query_logs.py.
6
+
7
+ Can be run standalone via CLI or imported (use the ``collect()`` function).
8
+
9
+ Substitution points (search for "← SUBSTITUTE"):
10
+ - BIGQUERY_PROJECT_ID : GCP project ID to collect from
11
+ - GOOGLE_APPLICATION_CREDENTIALS : path to service-account JSON key file
12
+
13
+ Prerequisites:
14
+ pip install google-cloud-bigquery
15
+ """
16
+
17
+ from __future__ import annotations
18
+
19
+ import argparse
20
+ import json
21
+ import logging
22
+ import os
23
+ from datetime import datetime, timedelta, timezone
24
+
25
+ from google.cloud import bigquery
26
+
27
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
28
+ log = logging.getLogger(__name__)
29
+
30
# Value written to the manifest's "log_type" field.
LOG_TYPE = "bigquery"

# Collection window and size limit. Each value is read once at import time
# from the environment variable of the same name and parsed as an integer.
LOOKBACK_HOURS: int = int(os.environ.get("LOOKBACK_HOURS", "25"))
LOOKBACK_LAG_HOURS: int = int(os.environ.get("LOOKBACK_LAG_HOURS", "1"))
MAX_JOBS: int = int(os.environ.get("MAX_JOBS", "10000"))

# Statement types to keep; an empty list disables the filter (collect all).
STATEMENT_TYPE_FILTER: list[str] = []
38
+
39
+
40
def _safe_isoformat(dt: datetime | None) -> str | None:
    """Format *dt* as ISO8601, passing ``None`` through unchanged.

    A naive datetime is labelled as UTC before formatting; an aware
    datetime keeps its own offset.
    """
    if dt is None:
        return None
    aware = dt if dt.tzinfo is not None else dt.replace(tzinfo=timezone.utc)
    return aware.isoformat()
46
+
47
+
48
def _collect_query_logs(
    bq_client: bigquery.Client,
    project_id: str,
    start_dt: datetime,
    end_dt: datetime,
) -> list[dict]:
    """Collect query logs from BigQuery job history.

    Lists finished jobs of all users created between *start_dt* and
    *end_dt*, keeps those that carry non-blank SQL text (and, when
    ``STATEMENT_TYPE_FILTER`` is non-empty, a matching statement type), and
    stops once ``MAX_JOBS`` entries have been gathered.

    Returns:
        One dict per kept job with the keys ``query_id``, ``query_text``,
        ``start_time``, ``end_time``, ``user``, ``total_bytes_billed`` and
        ``statement_type``.
    """
    entries: list[dict] = []

    log.info(
        "Listing jobs for project=%s from %s to %s",
        project_id, start_dt.isoformat(), end_dt.isoformat(),
    )

    for job in bq_client.list_jobs(
        project=project_id,
        all_users=True,
        # Only finished jobs: the manifest is documented as "completed query
        # jobs", and pending/running jobs have no end time yet.
        state_filter="done",
        min_creation_time=start_dt,
        max_creation_time=end_dt,
    ):
        # Job objects without a ``query`` attribute, or with blank SQL, are skipped.
        sql: str = getattr(job, "query", None) or ""
        if not sql.strip():
            continue

        statement_type: str = getattr(job, "statement_type", None) or ""
        if STATEMENT_TYPE_FILTER and statement_type not in STATEMENT_TYPE_FILTER:
            continue

        entries.append({
            "query_id": job.job_id,
            "query_text": sql,
            "start_time": _safe_isoformat(getattr(job, "created", None)),
            "end_time": _safe_isoformat(getattr(job, "ended", None)),
            "user": getattr(job, "user_email", None),
            "total_bytes_billed": getattr(job, "total_bytes_billed", None),
            "statement_type": statement_type or None,
        })

        if len(entries) >= MAX_JOBS:
            log.warning("Reached MAX_JOBS=%d — stopping early", MAX_JOBS)
            break

    return entries
91
+
92
+
93
def collect(
    project_id: str,
    lookback_hours: int = LOOKBACK_HOURS,
    lookback_lag_hours: int = LOOKBACK_LAG_HOURS,
    output_file: str = "query_logs_output.json",
) -> dict:
    """Gather query logs for a time window and save them as a JSON manifest.

    The window ends ``lookback_lag_hours`` before the current UTC time and
    spans ``lookback_hours``. The manifest is written to *output_file* and
    also returned.
    """
    bq_client = bigquery.Client(project=project_id)

    # Work out the window back from "now": apply the lag first, then the span.
    lag = timedelta(hours=lookback_lag_hours)
    span = timedelta(hours=lookback_hours)
    window_end = datetime.now(timezone.utc) - lag
    window_start = window_end - span

    queries = _collect_query_logs(bq_client, project_id, window_start, window_end)
    query_count = len(queries)
    log.info("Collected %d query log entries.", query_count)

    manifest = {
        "log_type": LOG_TYPE,
        "collected_at": datetime.now(timezone.utc).isoformat(),
        "window_start": window_start.isoformat(),
        "window_end": window_end.isoformat(),
        "query_log_count": query_count,
        "queries": queries,
    }

    with open(output_file, "w") as out:
        json.dump(manifest, out, indent=2)
    log.info("Query log manifest written to %s", output_file)

    return manifest
121
+
122
+
123
def main() -> None:
    """CLI entry point: read the options and run ``collect()`` with them.

    ``--project-id`` defaults to the BIGQUERY_PROJECT_ID environment variable;
    the parser exits with an error when neither supplies a value.
    """
    cli = argparse.ArgumentParser(
        description="Collect BigQuery query logs into a JSON manifest",
    )
    cli.add_argument(
        "--project-id",
        default=os.getenv("BIGQUERY_PROJECT_ID"),
        help="GCP project ID (or set BIGQUERY_PROJECT_ID env var)",
    )
    # Window options default to the module-level settings.
    cli.add_argument("--lookback-hours", type=int, default=LOOKBACK_HOURS)
    cli.add_argument("--lookback-lag-hours", type=int, default=LOOKBACK_LAG_HOURS)
    cli.add_argument("--output-file", default="query_logs_output.json")

    opts = cli.parse_args()
    if not opts.project_id:
        cli.error("--project-id or BIGQUERY_PROJECT_ID env var is required")

    collect(
        project_id=opts.project_id,
        lookback_hours=opts.lookback_hours,
        lookback_lag_hours=opts.lookback_lag_hours,
        output_file=opts.output_file,
    )


if __name__ == "__main__":
    main()
@@ -0,0 +1,190 @@
1
+ """
2
+ BigQuery Iceberg — Metadata Push (push only)
3
+ ============================================
4
+ Reads a JSON manifest produced by collect_metadata.py and pushes table
5
+ metadata to Monte Carlo using the pycarlo SDK's IngestionService.
6
+
7
+ Can be run standalone via CLI or imported (use the ``push()`` function).
8
+
9
+ Substitution points (search for "← SUBSTITUTE"):
10
+ - MCD_INGEST_ID : Monte Carlo Ingestion API key ID
11
+ - MCD_INGEST_TOKEN : Monte Carlo Ingestion API key token
12
+ - MCD_RESOURCE_UUID : Monte Carlo warehouse resource UUID
13
+
14
+ Prerequisites:
15
+ pip install pycarlo>=0.12.251
16
+ """
17
+
18
+ from __future__ import annotations
19
+
20
+ import argparse
21
+ import json
22
+ import logging
23
+ import os
24
+ from concurrent.futures import ThreadPoolExecutor, as_completed
25
+ from datetime import datetime, timezone
26
+
27
+ from pycarlo.core import Client, Session
28
+ from pycarlo.features.ingestion import IngestionService
29
+ from pycarlo.features.ingestion.models import (
30
+ AssetField,
31
+ AssetFreshness,
32
+ AssetMetadata,
33
+ AssetVolume,
34
+ RelationalAsset,
35
+ )
36
+
37
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
38
+ log = logging.getLogger(__name__)
39
+
40
+ RESOURCE_TYPE = "bigquery"
41
+ _BATCH_SIZE = 500
42
+
43
+ _ENDPOINT = "https://integrations.getmontecarlo.com"
44
+
45
+
46
def _asset_from_dict(d: dict) -> RelationalAsset:
    """Build a RelationalAsset from one entry of the manifest's asset list.

    ``name``, ``database`` and ``schema`` are required keys. ``fields``,
    ``volume``, ``freshness``, ``description`` and ``type`` are optional;
    ``type`` defaults to ``"TABLE"`` and a missing or empty ``volume`` or
    ``freshness`` leaves the corresponding attribute as ``None``.
    """
    field_models = []
    for spec in d.get("fields", []):
        field_models.append(
            AssetField(
                name=spec["name"],
                type=spec.get("type"),
                description=spec.get("description"),
            )
        )

    volume_spec = d.get("volume")
    volume_model = (
        AssetVolume(
            row_count=volume_spec.get("row_count"),
            byte_count=volume_spec.get("byte_count"),
        )
        if volume_spec
        else None
    )

    # The manifest key is "last_updated_time"; the model keyword is
    # "last_update_time".
    freshness_spec = d.get("freshness")
    freshness_model = None
    if freshness_spec and freshness_spec.get("last_updated_time"):
        freshness_model = AssetFreshness(
            last_update_time=freshness_spec["last_updated_time"],
        )

    asset_type = d.get("type", "TABLE")
    metadata_model = AssetMetadata(
        name=d["name"],
        database=d["database"],
        schema=d["schema"],
        description=d.get("description"),
    )
    return RelationalAsset(
        type=asset_type,
        metadata=metadata_model,
        fields=field_models,
        volume=volume_model,
        freshness=freshness_model,
    )
82
+
83
+
84
def push(
    input_file: str,
    resource_uuid: str,
    key_id: str,
    key_token: str,
    batch_size: int = _BATCH_SIZE,
    output_file: str = "metadata_push_result.json",
) -> dict:
    """Read a metadata manifest and push assets to Monte Carlo in batches.

    Args:
        input_file: Path of the JSON manifest to read.
        resource_uuid: Passed to ``send_metadata`` as ``resource_uuid``.
        key_id: Passed to the pycarlo ``Session`` as ``mcd_id``.
        key_token: Passed to the pycarlo ``Session`` as ``mcd_token``.
        batch_size: Maximum number of assets per batch; must be at least 1.
        output_file: Path the JSON push summary is written to.

    Returns:
        The push summary dict (also written to *output_file*), including one
        invocation id per batch in batch order.

    Raises:
        ValueError: If *batch_size* is smaller than 1.
        Exception: Any error raised while pushing a batch is logged and
            re-raised.
    """
    # Reject a non-positive batch size up front; otherwise it only surfaces
    # later as an unrelated ValueError from range() or ThreadPoolExecutor.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    endpoint = _ENDPOINT
    log.info("Using endpoint: %s", endpoint)
    with open(input_file, encoding="utf-8") as fh:
        manifest = json.load(fh)

    asset_dicts = manifest.get("assets", [])
    resource_type = manifest.get("resource_type", RESOURCE_TYPE)
    assets = [_asset_from_dict(d) for d in asset_dicts]
    log.info("Loaded %d asset(s) from %s", len(assets), input_file)

    # max(..., 1) keeps at least one (possibly empty) batch, so an empty
    # manifest still yields exactly one push and a worker count of one.
    batches = [assets[i : i + batch_size] for i in range(0, max(len(assets), 1), batch_size)]
    total_batches = len(batches)

    def _push_batch(batch: list[RelationalAsset], batch_num: int) -> str | None:
        # Each call builds its own client and service rather than sharing one
        # across worker threads.
        client = Client(session=Session(
            mcd_id=key_id, mcd_token=key_token, scope="Ingestion", endpoint=endpoint,
        ))
        service = IngestionService(mc_client=client)
        result = service.send_metadata(
            resource_uuid=resource_uuid,
            resource_type=resource_type,
            events=batch,
        )
        invocation_id = service.extract_invocation_id(result)
        log.info(
            "Pushed batch %d/%d (%d assets) — invocation_id=%s",
            batch_num, total_batches, len(batch), invocation_id,
        )
        return invocation_id

    max_workers = min(4, total_batches)
    # Pre-sized so results land at their batch index even though batches
    # finish out of order.
    invocation_ids: list[str | None] = [None] * total_batches

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        futures = {
            pool.submit(_push_batch, batch, i + 1): i
            for i, batch in enumerate(batches)
        }
        for future in as_completed(futures):
            idx = futures[future]
            try:
                invocation_ids[idx] = future.result()
            except Exception as exc:
                log.error("ERROR pushing batch %d: %s", idx + 1, exc)
                raise

    log.info("All %d batch(es) pushed.", total_batches)

    push_result = {
        "resource_uuid": resource_uuid,
        "resource_type": resource_type,
        "invocation_ids": invocation_ids,
        "pushed_at": datetime.now(timezone.utc).isoformat(),
        "total_assets": len(assets),
        "batch_count": total_batches,
        "batch_size": batch_size,
    }
    with open(output_file, "w", encoding="utf-8") as fh:
        json.dump(push_result, fh, indent=2)
    log.info("Push result written to %s", output_file)

    return push_result
155
+
156
+
157
def main() -> None:
    """Parse CLI arguments and push a metadata manifest to Monte Carlo.

    The resource UUID and both key values default to the MCD_RESOURCE_UUID,
    MCD_INGEST_ID and MCD_INGEST_TOKEN environment variables. The parser
    exits with an error when any of them is missing or blank.
    """
    parser = argparse.ArgumentParser(
        description="Push BigQuery Iceberg metadata from a manifest to Monte Carlo",
    )
    parser.add_argument("--resource-uuid", default=os.getenv("MCD_RESOURCE_UUID"))
    parser.add_argument("--key-id", default=os.getenv("MCD_INGEST_ID"))
    parser.add_argument("--key-token", default=os.getenv("MCD_INGEST_TOKEN"))
    parser.add_argument("--input-file", default="metadata_output.json")
    parser.add_argument("--output-file", default="metadata_push_result.json")
    parser.add_argument(
        "--batch-size",
        type=int,
        default=_BATCH_SIZE,
        help=f"Max assets per push batch (default: {_BATCH_SIZE})",
    )
    args = parser.parse_args()

    # Use truthiness (not ``is None``) so an exported-but-blank env var is
    # reported here instead of being passed on to push() as a credential.
    required = ["resource_uuid", "key_id", "key_token"]
    missing = [k for k in required if not getattr(args, k)]
    if missing:
        parser.error(f"Missing required arguments/env vars: {missing}")

    push(
        input_file=args.input_file,
        resource_uuid=args.resource_uuid,
        key_id=args.key_id,
        key_token=args.key_token,
        batch_size=args.batch_size,
        output_file=args.output_file,
    )


if __name__ == "__main__":
    main()