opencode-skills-collection 2.0.0 → 2.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90) hide show
  1. package/bundled-skills/.antigravity-install-manifest.json +6 -1
  2. package/bundled-skills/docs/integrations/jetski-cortex.md +3 -3
  3. package/bundled-skills/docs/integrations/jetski-gemini-loader/README.md +1 -1
  4. package/bundled-skills/docs/maintainers/repo-growth-seo.md +3 -3
  5. package/bundled-skills/docs/maintainers/skills-update-guide.md +1 -1
  6. package/bundled-skills/docs/users/bundles.md +1 -1
  7. package/bundled-skills/docs/users/claude-code-skills.md +1 -1
  8. package/bundled-skills/docs/users/gemini-cli-skills.md +1 -1
  9. package/bundled-skills/docs/users/getting-started.md +1 -1
  10. package/bundled-skills/docs/users/kiro-integration.md +1 -1
  11. package/bundled-skills/docs/users/usage.md +4 -4
  12. package/bundled-skills/docs/users/visual-guide.md +4 -4
  13. package/bundled-skills/manage-skills/SKILL.md +187 -0
  14. package/bundled-skills/monte-carlo-monitor-creation/SKILL.md +222 -0
  15. package/bundled-skills/monte-carlo-monitor-creation/references/comparison-monitor.md +426 -0
  16. package/bundled-skills/monte-carlo-monitor-creation/references/custom-sql-monitor.md +207 -0
  17. package/bundled-skills/monte-carlo-monitor-creation/references/metric-monitor.md +292 -0
  18. package/bundled-skills/monte-carlo-monitor-creation/references/table-monitor.md +231 -0
  19. package/bundled-skills/monte-carlo-monitor-creation/references/validation-monitor.md +404 -0
  20. package/bundled-skills/monte-carlo-prevent/SKILL.md +252 -0
  21. package/bundled-skills/monte-carlo-prevent/references/TROUBLESHOOTING.md +23 -0
  22. package/bundled-skills/monte-carlo-prevent/references/parameters.md +32 -0
  23. package/bundled-skills/monte-carlo-prevent/references/workflows.md +478 -0
  24. package/bundled-skills/monte-carlo-push-ingestion/SKILL.md +363 -0
  25. package/bundled-skills/monte-carlo-push-ingestion/references/anomaly-detection.md +87 -0
  26. package/bundled-skills/monte-carlo-push-ingestion/references/custom-lineage.md +203 -0
  27. package/bundled-skills/monte-carlo-push-ingestion/references/direct-http-api.md +207 -0
  28. package/bundled-skills/monte-carlo-push-ingestion/references/prerequisites.md +150 -0
  29. package/bundled-skills/monte-carlo-push-ingestion/references/push-lineage.md +160 -0
  30. package/bundled-skills/monte-carlo-push-ingestion/references/push-metadata.md +158 -0
  31. package/bundled-skills/monte-carlo-push-ingestion/references/push-query-logs.md +219 -0
  32. package/bundled-skills/monte-carlo-push-ingestion/references/validation.md +257 -0
  33. package/bundled-skills/monte-carlo-push-ingestion/scripts/sample_verify.py +357 -0
  34. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_and_push_lineage.py +70 -0
  35. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_and_push_metadata.py +65 -0
  36. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_and_push_query_logs.py +70 -0
  37. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_lineage.py +214 -0
  38. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_metadata.py +160 -0
  39. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_query_logs.py +164 -0
  40. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/push_lineage.py +198 -0
  41. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/push_metadata.py +193 -0
  42. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/push_query_logs.py +207 -0
  43. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/collect_and_push_metadata.py +71 -0
  44. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/collect_and_push_query_logs.py +64 -0
  45. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/collect_metadata.py +253 -0
  46. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/collect_query_logs.py +149 -0
  47. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/push_metadata.py +190 -0
  48. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/push_query_logs.py +208 -0
  49. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_and_push_lineage.py +83 -0
  50. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_and_push_metadata.py +77 -0
  51. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_and_push_query_logs.py +83 -0
  52. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_lineage.py +240 -0
  53. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_metadata.py +212 -0
  54. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_query_logs.py +204 -0
  55. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/push_lineage.py +192 -0
  56. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/push_metadata.py +178 -0
  57. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/push_query_logs.py +200 -0
  58. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_and_push_lineage.py +119 -0
  59. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_and_push_metadata.py +119 -0
  60. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_and_push_query_logs.py +117 -0
  61. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_lineage.py +265 -0
  62. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_metadata.py +313 -0
  63. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_query_logs.py +284 -0
  64. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/push_lineage.py +309 -0
  65. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/push_metadata.py +245 -0
  66. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/push_query_logs.py +255 -0
  67. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_and_push_lineage.py +78 -0
  68. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_and_push_metadata.py +80 -0
  69. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_and_push_query_logs.py +88 -0
  70. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_lineage.py +235 -0
  71. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_metadata.py +219 -0
  72. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_query_logs.py +239 -0
  73. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/push_lineage.py +178 -0
  74. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/push_metadata.py +178 -0
  75. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/push_query_logs.py +196 -0
  76. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_and_push_lineage.py +154 -0
  77. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_and_push_metadata.py +137 -0
  78. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_and_push_query_logs.py +137 -0
  79. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_lineage.py +349 -0
  80. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_metadata.py +329 -0
  81. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_query_logs.py +254 -0
  82. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/push_lineage.py +307 -0
  83. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/push_metadata.py +228 -0
  84. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/push_query_logs.py +248 -0
  85. package/bundled-skills/monte-carlo-push-ingestion/scripts/test_template_sdk_usage.py +340 -0
  86. package/bundled-skills/monte-carlo-validation-notebook/SKILL.md +685 -0
  87. package/bundled-skills/monte-carlo-validation-notebook/scripts/generate_notebook_url.py +141 -0
  88. package/bundled-skills/monte-carlo-validation-notebook/scripts/resolve_dbt_schema.py +161 -0
  89. package/package.json +1 -1
  90. package/skills_index.json +503 -61
@@ -0,0 +1,214 @@
1
+ """
2
+ BigQuery — Lineage Collection (collect only)
3
+ =============================================
4
+ Collects table-level lineage from two sources:
5
+ 1. INFORMATION_SCHEMA.SCHEMATA_LINKS — cross-project dataset shares (per region)
6
+ 2. Job query history — SQL parsing for CREATE TABLE AS SELECT and INSERT INTO
7
+ SELECT patterns to derive source->destination relationships.
8
+
9
+ Writes the collected lineage edges to a JSON manifest file.
10
+
11
+ Can be run standalone via CLI or imported (use the ``collect()`` function).
12
+
13
+ Substitution points (search for "← SUBSTITUTE"):
14
+ - BIGQUERY_PROJECT_ID : GCP project ID to collect from
15
+ - BIGQUERY_REGION : BigQuery region for INFORMATION_SCHEMA queries (e.g. "us", "eu")
16
+ - LOOKBACK_HOURS : how far back to scan job history (default 24 h)
17
+
18
+ Prerequisites:
19
+ pip install google-cloud-bigquery
20
+ """
21
+
22
+ from __future__ import annotations
23
+
24
+ import argparse
25
+ import json
26
+ import logging
27
+ import os
28
+ import re
29
+ from datetime import datetime, timedelta, timezone
30
+
31
+ from google.cloud import bigquery
32
+
33
# Logging is configured at import time, so importing this module also sets up
# INFO-level, timestamped output on the root logger.
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
log = logging.getLogger(__name__)

# Written to the manifest under the "resource_type" key.
RESOURCE_TYPE = "bigquery"
# Default job-history lookback in hours. Read once at import from the
# LOOKBACK_HOURS environment variable and parsed with int().
LOOKBACK_HOURS = int(os.getenv("LOOKBACK_HOURS", "24"))  # ← SUBSTITUTE: adjust lookback window
38
+
39
# Patterns used to pull lineage out of BigQuery SQL text.
# Both destination patterns expose the target name through the "dest" group;
# optional surrounding backticks are matched outside the group.
_DEST_NAME = r"`?(?P<dest>[\w.\-]+)`?"
_STATEMENT_FLAGS = re.IGNORECASE | re.DOTALL

# CREATE [OR REPLACE] TABLE|VIEW <dest> ... [AS] SELECT
_CTAS_PATTERN = re.compile(
    r"CREATE\s+(?:OR\s+REPLACE\s+)?(?:TABLE|VIEW)\s+" + _DEST_NAME + r".*?(?:AS\s+)?SELECT\b",
    _STATEMENT_FLAGS,
)
# INSERT [INTO] <dest> ... SELECT
_INSERT_PATTERN = re.compile(
    r"INSERT\s+(?:INTO\s+)?" + _DEST_NAME + r".*?SELECT\b",
    _STATEMENT_FLAGS,
)
# Any three-part dotted name, optionally backtick-quoted; group 1 is the bare name.
_TABLE_REF_PATTERN = re.compile(r"`?([\w\-]+\.[\w\-]+\.[\w\-]+)`?", re.IGNORECASE)
50
+
51
+
52
def _parse_full_name(full_name: str) -> tuple[str, str, str]:
    """Break a dotted BigQuery name into ``(project, dataset, table)``.

    Backticks are removed before splitting on ".". Missing leading parts are
    returned as empty strings: a two-part name yields ``("", dataset, table)``.
    Any other number of parts (one, or more than three) yields
    ``("", "", first_part)``.
    """
    pieces = full_name.replace("`", "").split(".")
    count = len(pieces)
    if count == 3:
        project, dataset, table = pieces
        return project, dataset, table
    if count == 2:
        dataset, table = pieces
        return "", dataset, table
    return "", "", pieces[0]
60
+
61
+
62
def _collect_schema_link_lineage(
    bq_client: bigquery.Client,
    project_id: str,
    region: str,
) -> list[dict]:
    """Collect dataset-level lineage from INFORMATION_SCHEMA.SCHEMATA_LINKS.

    Each returned row becomes one edge whose source and destination are whole
    datasets; the table is recorded as ``"*"`` on both sides.

    Args:
        bq_client: Client used to run the query.
        project_id: Project whose region-scoped INFORMATION_SCHEMA is queried.
        region: BigQuery location, either bare (``"us"``, ``"eu"``) or already
            qualified (``"region-us"``). Bare values are prefixed with
            ``region-``, the qualifier BigQuery expects for region-scoped
            INFORMATION_SCHEMA views.

    Returns:
        A list of edge dicts shaped
        ``{"destination": {...}, "sources": [{...}]}``. If the query fails the
        error is logged as a warning (not raised) and whatever edges were
        gathered so far — normally none — are returned.
    """
    # A bare location such as "us" would otherwise be read as a dataset name
    # and the query would fail, silently disabling this lineage source.
    if region.lower().startswith("region-"):
        region_qualifier = region
    else:
        region_qualifier = f"region-{region}"

    query = f"""
        SELECT
            CATALOG_NAME AS source_project,
            SCHEMA_NAME AS source_dataset,
            LINKED_SCHEMA_CATALOG_NAME AS destination_project,
            LINKED_SCHEMA_NAME AS destination_dataset
        FROM `{project_id}`.`{region_qualifier}`.INFORMATION_SCHEMA.SCHEMATA_LINKS
    """  # ← SUBSTITUTE: update project_id and region as needed
    edges: list[dict] = []
    try:
        for row in bq_client.query(query).result():
            edges.append(
                {
                    "destination": {
                        "database": row.destination_project,
                        "schema": row.destination_dataset,
                        "table": "*",
                    },
                    "sources": [
                        {
                            "database": row.source_project,
                            "schema": row.source_dataset,
                            "table": "*",
                        }
                    ],
                }
            )
    except Exception:
        # Best effort: dataset-share lineage is optional, so report and carry on.
        log.warning("SCHEMATA_LINKS query failed — skipping dataset-share lineage", exc_info=True)
    return edges
98
+
99
+
100
def _collect_query_lineage(
    bq_client: bigquery.Client,
    project_id: str,
    lookback_hours: int,
) -> list[dict]:
    """Derive lineage edges by parsing CTAS/INSERT statements in job history.

    Jobs created in the last ``lookback_hours`` hours are listed for
    ``project_id`` (all users). For each job with SQL text, the destination is
    taken from the first CTAS match, falling back to the first INSERT match.
    Every three-part ``project.dataset.table`` name found in the SQL is
    treated as a source, except the destination itself.

    Args:
        bq_client: Client used to list jobs.
        project_id: Project whose jobs are listed; also used as the
            destination project when the SQL names the destination without one.
        lookback_hours: Size of the job-history window, ending now (UTC).

    Returns:
        A list of edge dicts shaped
        ``{"destination": {...}, "sources": [{...}, ...]}``. Jobs without a
        recognised destination or without any source reference produce no edge.
    """
    end_dt = datetime.now(timezone.utc)
    start_dt = end_dt - timedelta(hours=lookback_hours)

    edges: list[dict] = []
    jobs = bq_client.list_jobs(
        project=project_id,
        all_users=True,
        min_creation_time=start_dt,
        max_creation_time=end_dt,
    )
    for job in jobs:
        sql: str = getattr(job, "query", None) or ""
        if not sql.strip():
            continue

        dest_match = _CTAS_PATTERN.search(sql) or _INSERT_PATTERN.search(sql)
        if not dest_match:
            continue

        dest_project, dest_dataset, dest_table = _parse_full_name(dest_match.group("dest"))
        if not dest_table:
            continue

        # Compare on the fully resolved name so a destination written as
        # "dataset.table" is still recognised when the same statement refers
        # to it as "project.dataset.table" (avoids emitting a self-edge).
        dest_key = (dest_project or project_id, dest_dataset, dest_table)

        source_keys = [
            _parse_full_name(m.group(1))
            for m in _TABLE_REF_PATTERN.finditer(sql)
        ]
        # dict.fromkeys de-duplicates while keeping first-seen order.
        unique_sources = [key for key in dict.fromkeys(source_keys) if key != dest_key]
        if not unique_sources:
            continue

        edges.append(
            {
                "destination": {
                    "database": dest_key[0],
                    "schema": dest_dataset,
                    "table": dest_table,
                },
                "sources": [
                    {"database": p, "schema": d, "table": t}
                    for p, d, t in unique_sources
                ],
            }
        )

    return edges
151
+
152
+
153
def collect(
    project_id: str,
    region: str = "us",
    lookback_hours: int = LOOKBACK_HOURS,
    output_file: str = "lineage_output.json",
) -> dict:
    """Gather BigQuery lineage edges and persist them as a JSON manifest.

    Schema-link edges and query-derived edges are collected separately,
    concatenated (schema-link first), and written to ``output_file`` together
    with the per-source counts and a UTC collection timestamp.

    Returns:
        The manifest dict that was written to disk.
    """
    client = bigquery.Client(project=project_id)

    log.info("Collecting lineage from project %s ...", project_id)
    link_edges = _collect_schema_link_lineage(client, project_id, region)
    parsed_edges = _collect_query_lineage(client, project_id, lookback_hours)
    link_count, parsed_count = len(link_edges), len(parsed_edges)
    combined = [*link_edges, *parsed_edges]

    log.info(
        "Collected %d lineage edges (%d schema-link, %d query-derived)",
        len(combined), link_count, parsed_count,
    )

    manifest = dict(
        resource_type=RESOURCE_TYPE,
        collected_at=datetime.now(timezone.utc).isoformat(),
        schema_link_edges=link_count,
        query_derived_edges=parsed_count,
        edges=combined,
    )
    with open(output_file, "w") as out:
        json.dump(manifest, out, indent=2)
    log.info("Lineage manifest written to %s", output_file)

    return manifest
188
+
189
+
190
def main() -> None:
    """Parse CLI flags (with environment-variable fallbacks) and run ``collect()``."""
    cli = argparse.ArgumentParser(
        description="Collect BigQuery lineage and write to a manifest file",
    )
    cli.add_argument("--project-id", default=os.getenv("BIGQUERY_PROJECT_ID"))  # ← SUBSTITUTE
    cli.add_argument("--region", default=os.getenv("BIGQUERY_REGION", "us"))  # ← SUBSTITUTE
    cli.add_argument("--lookback-hours", type=int, default=LOOKBACK_HOURS)
    cli.add_argument("--output-file", default="lineage_output.json")
    options = cli.parse_args()

    # --project-id is the only option that can end up unset: it is None when
    # neither the flag nor BIGQUERY_PROJECT_ID is provided.
    unset = [name for name in ("project_id",) if getattr(options, name) is None]
    if unset:
        cli.error(f"Missing required arguments/env vars: {unset}")

    collect(
        project_id=options.project_id,
        region=options.region,
        lookback_hours=options.lookback_hours,
        output_file=options.output_file,
    )


if __name__ == "__main__":
    main()
@@ -0,0 +1,160 @@
1
+ """
2
+ BigQuery — Metadata Collection (collect only)
3
+ ==============================================
4
+ Collects table schemas, row counts, byte sizes, and descriptions from all
5
+ datasets in a BigQuery project and writes them to a JSON manifest file.
6
+
7
+ Can be run standalone via CLI or imported (use the ``collect()`` function).
8
+
9
+ Substitution points (search for "← SUBSTITUTE"):
10
+ - BIGQUERY_PROJECT_ID : GCP project ID to collect from
11
+ - GOOGLE_APPLICATION_CREDENTIALS : path to service-account JSON key file
12
+ - DATASET_EXCLUSIONS : datasets to skip (informational / system datasets)
13
+
14
+ Prerequisites:
15
+ pip install google-cloud-bigquery
16
+ """
17
+
18
+ from __future__ import annotations
19
+
20
+ import argparse
21
+ import json
22
+ import logging
23
+ import os
24
+ from datetime import datetime, timezone
25
+
26
+ from google.cloud import bigquery
27
+
28
# Logging is configured at import time, so importing this module also sets up
# INFO-level, timestamped output on the root logger.
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
log = logging.getLogger(__name__)

# Written to the manifest under the "resource_type" key.
RESOURCE_TYPE = "bigquery"

# Datasets to skip — add any internal / system datasets here.
# _collect_assets() tests each entry with ``in`` against the dataset ID, so an
# entry excludes every dataset whose ID merely contains it.
DATASET_EXCLUSIONS = {  # ← SUBSTITUTE: add datasets to exclude
    "_bqc_",
    "INFORMATION_SCHEMA",
}
38
+
39
# BigQuery type → Monte Carlo canonical type.
# Keys are upper-case; map_bq_type() upper-cases its input before the lookup.
# NOTE(review): BigQuery appears to report REPEATED as a field *mode* rather
# than a field type, so the "REPEATED" entry may never be reached when the
# input comes from a schema field's type — confirm against the caller.
BQ_TYPE_MAP: dict[str, str] = dict(
    INT64="INTEGER",
    INTEGER="INTEGER",
    FLOAT64="FLOAT",
    FLOAT="FLOAT",
    BOOL="BOOLEAN",
    BOOLEAN="BOOLEAN",
    STRING="VARCHAR",
    BYTES="BINARY",
    DATE="DATE",
    DATETIME="DATETIME",
    TIMESTAMP="TIMESTAMP",
    TIME="TIME",
    NUMERIC="DECIMAL",
    BIGNUMERIC="DECIMAL",
    RECORD="STRUCT",
    STRUCT="STRUCT",
    REPEATED="ARRAY",
    JSON="JSON",
    GEOGRAPHY="GEOGRAPHY",
)


def map_bq_type(bq_type: str) -> str:
    """Return the canonical type for *bq_type*.

    The lookup is case-insensitive. A type with no entry in ``BQ_TYPE_MAP``
    is returned upper-cased and otherwise unchanged.
    """
    normalized = bq_type.upper()
    return BQ_TYPE_MAP.get(normalized, normalized)
65
+
66
+
67
def _collect_assets(bq_client: bigquery.Client, project_id: str) -> list[dict]:
    """Collect table metadata for every non-excluded dataset in ``project_id``.

    Datasets whose ID contains any entry of ``DATASET_EXCLUSIONS`` are skipped.
    For each remaining table the full table resource is fetched, which costs
    one ``get_table`` call per table.

    Args:
        bq_client: Client used for all listing and lookup calls.
        project_id: Project to enumerate; also recorded as each asset's
            ``"database"``.

    Returns:
        One dict per table with keys ``name``, ``database``, ``schema``,
        ``type`` (``"VIEW"`` or ``"TABLE"``), ``description``, ``fields``,
        ``volume`` and ``freshness``.
    """
    assets: list[dict] = []

    # Name the project explicitly so the datasets listed always belong to the
    # project recorded as "database", whatever the client's default project is.
    for dataset_item in bq_client.list_datasets(project=project_id):
        dataset_id = dataset_item.dataset_id

        if any(exc in dataset_id for exc in DATASET_EXCLUSIONS):
            log.info("Skipping dataset %s", dataset_id)
            continue

        # Use the references carried by the list items instead of the
        # deprecated Client.dataset() helper.
        for table_item in bq_client.list_tables(dataset_item.reference):
            table = bq_client.get_table(table_item.reference)

            # NOTE(review): only field_type is mapped; a field's mode (for
            # example REPEATED) is not reflected in the emitted type — confirm
            # whether array columns should be reported differently.
            fields = [
                {
                    "name": field.name,
                    "type": map_bq_type(field.field_type),
                    "description": field.description or None,
                }
                for field in table.schema
            ]

            asset = {
                "name": table.table_id,
                "database": project_id,  # ← SUBSTITUTE: use project or dataset as database
                "schema": dataset_id,
                # Anything that is not exactly "VIEW" is reported as a table.
                "type": "VIEW" if table.table_type == "VIEW" else "TABLE",
                "description": table.description or None,
                "fields": fields,
                "volume": {
                    "row_count": table.num_rows,
                    "byte_count": table.num_bytes,
                },
                "freshness": {
                    "last_updated_time": table.modified.isoformat() if table.modified else None,
                },
            }
            assets.append(asset)
            log.info("Queued %s.%s.%s", project_id, dataset_id, table.table_id)

    return assets
112
+
113
+
114
def collect(
    project_id: str,
    output_file: str = "metadata_output.json",
) -> dict:
    """Gather BigQuery table metadata and persist it as a JSON manifest.

    The manifest holds the resource type, a UTC collection timestamp and the
    list of collected assets; it is written to ``output_file``.

    Returns:
        The manifest dict that was written to disk.
    """
    client = bigquery.Client(project=project_id)  # ← SUBSTITUTE: adjust auth if needed

    log.info("Collecting metadata from project %s ...", project_id)
    collected = _collect_assets(client, project_id)
    log.info("Collected %d asset(s).", len(collected))

    manifest = dict(
        resource_type=RESOURCE_TYPE,
        collected_at=datetime.now(timezone.utc).isoformat(),
        assets=collected,
    )
    with open(output_file, "w") as out:
        json.dump(manifest, out, indent=2)
    log.info("Asset manifest written to %s", output_file)

    return manifest
139
+
140
+
141
def main() -> None:
    """Parse CLI flags (with environment-variable fallbacks) and run ``collect()``."""
    cli = argparse.ArgumentParser(
        description="Collect BigQuery metadata and write to a manifest file",
    )
    cli.add_argument("--project-id", default=os.getenv("BIGQUERY_PROJECT_ID"))  # ← SUBSTITUTE
    cli.add_argument("--output-file", default="metadata_output.json")
    options = cli.parse_args()

    # --project-id is the only option that can end up unset: it is None when
    # neither the flag nor BIGQUERY_PROJECT_ID is provided.
    unset = [name for name in ("project_id",) if getattr(options, name) is None]
    if unset:
        cli.error(f"Missing required arguments/env vars: {unset}")

    collect(
        project_id=options.project_id,
        output_file=options.output_file,
    )


if __name__ == "__main__":
    main()
@@ -0,0 +1,164 @@
1
+ """
2
+ BigQuery — Query Log Collection (collect only)
3
+ ================================================
4
+ Collects completed job query logs from BigQuery job history and writes them to
5
+ a JSON manifest file for later push to Monte Carlo.
6
+
7
+ Can be run standalone via CLI or imported (use the ``collect()`` function).
8
+
9
+ Substitution points (search for "← SUBSTITUTE"):
10
+ - BIGQUERY_PROJECT_ID : GCP project ID to collect query logs from
11
+ - GOOGLE_APPLICATION_CREDENTIALS : path to service-account JSON key file
12
+ - LOOKBACK_HOURS : how many hours back to collect (default 25, skip last 1 h)
13
+ - STATEMENT_TYPE_FILTER : restrict to specific statement types, or leave empty for all
14
+ - MAX_JOBS : cap on number of jobs to collect per run
15
+
16
+ Prerequisites:
17
+ pip install google-cloud-bigquery
18
+ """
19
+
20
+ from __future__ import annotations
21
+
22
+ import argparse
23
+ import json
24
+ import logging
25
+ import os
26
+ from datetime import datetime, timedelta, timezone
27
+
28
+ from google.cloud import bigquery
29
+
30
# Logging is configured at import time, so importing this module also sets up
# INFO-level, timestamped output on the root logger.
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
log = logging.getLogger(__name__)

# Written to the manifest under the "log_type" key.
LOG_TYPE = "bigquery"

# Defaults for the collection window computed in collect(). The window always
# ends LOOKBACK_LAG_HOURS before "now"; the lag avoids collecting in-flight
# jobs that have not yet completed. Both values are read from the environment
# once, at import time.
LOOKBACK_HOURS: int = int(os.getenv("LOOKBACK_HOURS", "25"))  # ← SUBSTITUTE
LOOKBACK_LAG_HOURS: int = int(os.getenv("LOOKBACK_LAG_HOURS", "1"))  # ← SUBSTITUTE

# Limit statement types — e.g. ["SELECT", "CREATE_TABLE_AS_SELECT", "INSERT"]
# Set to an empty list to collect all statement types.
STATEMENT_TYPE_FILTER: list[str] = []  # ← SUBSTITUTE

# Maximum number of jobs to collect in a single run to avoid runaway costs.
# Read from the MAX_JOBS environment variable at import time.
MAX_JOBS: int = int(os.getenv("MAX_JOBS", "10000"))  # ← SUBSTITUTE
46
+
47
+
48
def _safe_isoformat(dt: datetime | None) -> str | None:
    """Return *dt* as an ISO-8601 string, or ``None`` when *dt* is ``None``.

    A naive datetime is labelled as UTC: the tzinfo is attached without
    shifting the clock time. Aware datetimes keep their own offset.
    """
    if dt is None:
        return None
    aware = dt if dt.tzinfo is not None else dt.replace(tzinfo=timezone.utc)
    return aware.isoformat()
54
+
55
+
56
def _collect_query_logs(
    bq_client: bigquery.Client,
    project_id: str,
    start_dt: datetime,
    end_dt: datetime,
) -> list[dict]:
    """Collect finished query jobs created between ``start_dt`` and ``end_dt``.

    Jobs are listed for ``project_id`` across all users. Jobs without SQL
    text are ignored, as are jobs whose statement type is not in
    ``STATEMENT_TYPE_FILTER`` when that filter is non-empty. Collection stops
    once ``MAX_JOBS`` entries have been gathered.

    Args:
        bq_client: Client used to list jobs.
        project_id: Project whose job history is read.
        start_dt: Lower bound on job creation time.
        end_dt: Upper bound on job creation time.

    Returns:
        One dict per collected job with keys ``query_id``, ``query_text``,
        ``start_time``, ``end_time``, ``user``, ``total_bytes_billed`` and
        ``statement_type``.
    """
    entries: list[dict] = []

    log.info(
        "Listing jobs for project=%s from %s to %s",
        project_id, start_dt.isoformat(), end_dt.isoformat(),
    )

    for job in bq_client.list_jobs(
        project=project_id,
        all_users=True,
        # Ask only for finished jobs. The window's lag alone cannot exclude a
        # job that has been running for longer than the lag, and such a job
        # would be emitted without an end time.
        state_filter="done",
        min_creation_time=start_dt,
        max_creation_time=end_dt,
    ):
        # Only process query jobs that have SQL text
        sql: str = getattr(job, "query", None) or ""
        if not sql.strip():
            continue

        statement_type: str = getattr(job, "statement_type", None) or ""
        if STATEMENT_TYPE_FILTER and statement_type not in STATEMENT_TYPE_FILTER:
            continue  # ← SUBSTITUTE: adjust filter as needed

        total_bytes_billed: int | None = getattr(job, "total_bytes_billed", None)

        entries.append(
            {
                "query_id": job.job_id,
                "query_text": sql,
                "start_time": _safe_isoformat(getattr(job, "created", None)),
                "end_time": _safe_isoformat(getattr(job, "ended", None)),
                "user": getattr(job, "user_email", None),
                "total_bytes_billed": total_bytes_billed,
                "statement_type": statement_type or None,
            }
        )

        # The cap counts collected entries, not jobs inspected.
        if len(entries) >= MAX_JOBS:
            log.warning("Reached MAX_JOBS=%d — stopping early", MAX_JOBS)
            break

    return entries
104
+
105
+
106
def collect(
    project_id: str,
    lookback_hours: int = LOOKBACK_HOURS,
    lookback_lag_hours: int = LOOKBACK_LAG_HOURS,
    output_file: str = "query_logs_output.json",
) -> dict:
    """Collect BigQuery query logs and persist them as a JSON manifest.

    The collection window runs from ``now - lookback_hours`` to
    ``now - lookback_lag_hours`` (UTC). With the defaults of 25 and 1 that is
    a 24-hour window that skips the most recent hour, so runs scheduled one
    day apart cover adjacent, non-overlapping periods.

    Args:
        project_id: Project whose job history is read.
        lookback_hours: How far back from now the window starts.
        lookback_lag_hours: How far back from now the window ends.
        output_file: Path the manifest is written to.

    Returns:
        The manifest dict that was written to disk.
    """
    bq_client = bigquery.Client(project=project_id)  # ← SUBSTITUTE: adjust auth if needed

    # Anchor both bounds on the same instant. Measuring the start from the
    # lagged end instead would push it back by an extra lookback_lag_hours.
    now = datetime.now(timezone.utc)
    end_dt = now - timedelta(hours=lookback_lag_hours)
    start_dt = now - timedelta(hours=lookback_hours)
    if start_dt >= end_dt:
        log.warning(
            "Empty collection window: lookback_hours=%d should exceed lookback_lag_hours=%d",
            lookback_hours, lookback_lag_hours,
        )

    entries = _collect_query_logs(bq_client, project_id, start_dt, end_dt)
    log.info("Collected %d query log entries.", len(entries))

    manifest = {
        "log_type": LOG_TYPE,
        "collected_at": datetime.now(timezone.utc).isoformat(),
        "window_start": start_dt.isoformat(),
        "window_end": end_dt.isoformat(),
        "query_log_count": len(entries),
        "queries": entries,
    }
    with open(output_file, "w") as fh:
        json.dump(manifest, fh, indent=2)
    log.info("Query log manifest written to %s", output_file)

    return manifest
138
+
139
+
140
def main() -> None:
    """Parse CLI flags (with environment-variable fallbacks) and run ``collect()``."""
    cli = argparse.ArgumentParser(
        description="Collect BigQuery query logs and write to a manifest file",
    )
    cli.add_argument("--project-id", default=os.getenv("BIGQUERY_PROJECT_ID"))  # ← SUBSTITUTE
    cli.add_argument("--lookback-hours", type=int, default=LOOKBACK_HOURS)
    cli.add_argument("--lookback-lag-hours", type=int, default=LOOKBACK_LAG_HOURS)
    cli.add_argument("--output-file", default="query_logs_output.json")
    options = cli.parse_args()

    # --project-id is the only option that can end up unset: it is None when
    # neither the flag nor BIGQUERY_PROJECT_ID is provided.
    unset = [name for name in ("project_id",) if getattr(options, name) is None]
    if unset:
        cli.error(f"Missing required arguments/env vars: {unset}")

    collect(
        project_id=options.project_id,
        lookback_hours=options.lookback_hours,
        lookback_lag_hours=options.lookback_lag_hours,
        output_file=options.output_file,
    )


if __name__ == "__main__":
    main()