opencode-skills-collection 2.0.0 → 2.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bundled-skills/.antigravity-install-manifest.json +6 -1
- package/bundled-skills/docs/integrations/jetski-cortex.md +3 -3
- package/bundled-skills/docs/integrations/jetski-gemini-loader/README.md +1 -1
- package/bundled-skills/docs/maintainers/repo-growth-seo.md +3 -3
- package/bundled-skills/docs/maintainers/skills-update-guide.md +1 -1
- package/bundled-skills/docs/users/bundles.md +1 -1
- package/bundled-skills/docs/users/claude-code-skills.md +1 -1
- package/bundled-skills/docs/users/gemini-cli-skills.md +1 -1
- package/bundled-skills/docs/users/getting-started.md +1 -1
- package/bundled-skills/docs/users/kiro-integration.md +1 -1
- package/bundled-skills/docs/users/usage.md +4 -4
- package/bundled-skills/docs/users/visual-guide.md +4 -4
- package/bundled-skills/manage-skills/SKILL.md +187 -0
- package/bundled-skills/monte-carlo-monitor-creation/SKILL.md +222 -0
- package/bundled-skills/monte-carlo-monitor-creation/references/comparison-monitor.md +426 -0
- package/bundled-skills/monte-carlo-monitor-creation/references/custom-sql-monitor.md +207 -0
- package/bundled-skills/monte-carlo-monitor-creation/references/metric-monitor.md +292 -0
- package/bundled-skills/monte-carlo-monitor-creation/references/table-monitor.md +231 -0
- package/bundled-skills/monte-carlo-monitor-creation/references/validation-monitor.md +404 -0
- package/bundled-skills/monte-carlo-prevent/SKILL.md +252 -0
- package/bundled-skills/monte-carlo-prevent/references/TROUBLESHOOTING.md +23 -0
- package/bundled-skills/monte-carlo-prevent/references/parameters.md +32 -0
- package/bundled-skills/monte-carlo-prevent/references/workflows.md +478 -0
- package/bundled-skills/monte-carlo-push-ingestion/SKILL.md +363 -0
- package/bundled-skills/monte-carlo-push-ingestion/references/anomaly-detection.md +87 -0
- package/bundled-skills/monte-carlo-push-ingestion/references/custom-lineage.md +203 -0
- package/bundled-skills/monte-carlo-push-ingestion/references/direct-http-api.md +207 -0
- package/bundled-skills/monte-carlo-push-ingestion/references/prerequisites.md +150 -0
- package/bundled-skills/monte-carlo-push-ingestion/references/push-lineage.md +160 -0
- package/bundled-skills/monte-carlo-push-ingestion/references/push-metadata.md +158 -0
- package/bundled-skills/monte-carlo-push-ingestion/references/push-query-logs.md +219 -0
- package/bundled-skills/monte-carlo-push-ingestion/references/validation.md +257 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/sample_verify.py +357 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_and_push_lineage.py +70 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_and_push_metadata.py +65 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_and_push_query_logs.py +70 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_lineage.py +214 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_metadata.py +160 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_query_logs.py +164 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/push_lineage.py +198 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/push_metadata.py +193 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/push_query_logs.py +207 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/collect_and_push_metadata.py +71 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/collect_and_push_query_logs.py +64 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/collect_metadata.py +253 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/collect_query_logs.py +149 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/push_metadata.py +190 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/push_query_logs.py +208 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_and_push_lineage.py +83 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_and_push_metadata.py +77 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_and_push_query_logs.py +83 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_lineage.py +240 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_metadata.py +212 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_query_logs.py +204 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/push_lineage.py +192 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/push_metadata.py +178 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/push_query_logs.py +200 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_and_push_lineage.py +119 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_and_push_metadata.py +119 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_and_push_query_logs.py +117 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_lineage.py +265 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_metadata.py +313 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_query_logs.py +284 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/push_lineage.py +309 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/push_metadata.py +245 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/push_query_logs.py +255 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_and_push_lineage.py +78 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_and_push_metadata.py +80 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_and_push_query_logs.py +88 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_lineage.py +235 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_metadata.py +219 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_query_logs.py +239 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/push_lineage.py +178 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/push_metadata.py +178 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/push_query_logs.py +196 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_and_push_lineage.py +154 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_and_push_metadata.py +137 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_and_push_query_logs.py +137 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_lineage.py +349 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_metadata.py +329 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_query_logs.py +254 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/push_lineage.py +307 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/push_metadata.py +228 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/push_query_logs.py +248 -0
- package/bundled-skills/monte-carlo-push-ingestion/scripts/test_template_sdk_usage.py +340 -0
- package/bundled-skills/monte-carlo-validation-notebook/SKILL.md +685 -0
- package/bundled-skills/monte-carlo-validation-notebook/scripts/generate_notebook_url.py +141 -0
- package/bundled-skills/monte-carlo-validation-notebook/scripts/resolve_dbt_schema.py +161 -0
- package/package.json +1 -1
- package/skills_index.json +503 -61
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
"""
|
|
2
|
+
BigQuery Iceberg — Query Log Collect & Push (combined)
|
|
3
|
+
=====================================================
|
|
4
|
+
Convenience wrapper that runs collect_query_logs.collect() followed by
|
|
5
|
+
push_query_logs.push() in a single invocation.
|
|
6
|
+
|
|
7
|
+
Prerequisites:
|
|
8
|
+
pip install google-cloud-bigquery pycarlo>=0.12.251 python-dateutil>=2.8.0
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import argparse
|
|
14
|
+
import os
|
|
15
|
+
|
|
16
|
+
from collect_query_logs import LOOKBACK_HOURS, LOOKBACK_LAG_HOURS, collect
|
|
17
|
+
from push_query_logs import push
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def main() -> None:
    """Collect BigQuery query logs and push them to Monte Carlo in one run.

    Parses collection and push options from the command line (several of
    them fall back to environment variables), validates that the required
    values are present, then calls ``collect()`` followed by ``push()``.
    The same manifest path is passed as ``collect``'s ``output_file`` and
    ``push``'s ``input_file``.
    """
    parser = argparse.ArgumentParser(
        description="Collect BigQuery query logs and push to Monte Carlo",
    )
    # Collection args
    parser.add_argument("--project-id", default=os.getenv("BIGQUERY_PROJECT_ID"))
    parser.add_argument("--lookback-hours", type=int, default=LOOKBACK_HOURS)
    parser.add_argument("--lookback-lag-hours", type=int, default=LOOKBACK_LAG_HOURS)
    parser.add_argument("--manifest-file", default="query_logs_output.json")

    # Push args
    parser.add_argument("--resource-uuid", default=os.getenv("MCD_RESOURCE_UUID"))
    parser.add_argument("--key-id", default=os.getenv("MCD_INGEST_ID"))
    parser.add_argument("--key-token", default=os.getenv("MCD_INGEST_TOKEN"))
    parser.add_argument("--batch-size", type=int, default=100)
    parser.add_argument("--push-result-file", default="query_logs_push_result.json")

    args = parser.parse_args()

    if not args.project_id:
        parser.error("--project-id or BIGQUERY_PROJECT_ID env var is required")
    required_push = ["resource_uuid", "key_id", "key_token"]
    # Test falsiness rather than ``is None`` so that an environment variable
    # that is set but empty is reported as missing, matching the
    # --project-id check above.
    missing = [k for k in required_push if not getattr(args, k)]
    if missing:
        parser.error(f"Missing required push arguments/env vars: {missing}")

    collect(
        project_id=args.project_id,
        lookback_hours=args.lookback_hours,
        lookback_lag_hours=args.lookback_lag_hours,
        output_file=args.manifest_file,
    )

    push(
        input_file=args.manifest_file,
        resource_uuid=args.resource_uuid,
        key_id=args.key_id,
        key_token=args.key_token,
        batch_size=args.batch_size,
        output_file=args.push_result_file,
    )


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,253 @@
|
|
|
1
|
+
"""
|
|
2
|
+
BigQuery Iceberg — Metadata Collection (collect only)
|
|
3
|
+
=====================================================
|
|
4
|
+
Collects table schemas, row counts, byte sizes, and freshness for BigQuery
|
|
5
|
+
Iceberg (BigLake-managed) tables using INFORMATION_SCHEMA.TABLE_STORAGE and
|
|
6
|
+
INFORMATION_SCHEMA.COLUMNS. Standard BigQuery collection uses __TABLES__ which
|
|
7
|
+
does not include Iceberg tables — this template fills that gap.
|
|
8
|
+
|
|
9
|
+
Can be run standalone via CLI or imported (use the ``collect()`` function).
|
|
10
|
+
|
|
11
|
+
Supports a ``--only-freshness-and-volume`` flag to skip the COLUMNS query for
|
|
12
|
+
fast periodic pushes after the initial full metadata push.
|
|
13
|
+
|
|
14
|
+
Substitution points (search for "← SUBSTITUTE"):
|
|
15
|
+
- BIGQUERY_PROJECT_ID : GCP project ID to collect from
|
|
16
|
+
- GOOGLE_APPLICATION_CREDENTIALS : path to service-account JSON key file
|
|
17
|
+
- REGION : BigQuery region (default "us")
|
|
18
|
+
|
|
19
|
+
Prerequisites:
|
|
20
|
+
pip install google-cloud-bigquery
|
|
21
|
+
"""
|
|
22
|
+
|
|
23
|
+
from __future__ import annotations
|
|
24
|
+
|
|
25
|
+
import argparse
|
|
26
|
+
import json
|
|
27
|
+
import logging
|
|
28
|
+
import os
|
|
29
|
+
from datetime import datetime, timezone
|
|
30
|
+
|
|
31
|
+
from google.cloud import bigquery
|
|
32
|
+
|
|
33
|
+
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
|
|
34
|
+
log = logging.getLogger(__name__)
|
|
35
|
+
|
|
36
|
+
RESOURCE_TYPE = "bigquery"
|
|
37
|
+
|
|
38
|
+
# BigQuery type → Monte Carlo canonical type
|
|
39
|
+
BQ_TYPE_MAP: dict[str, str] = {
    "INT64": "INTEGER",
    "INTEGER": "INTEGER",
    "FLOAT64": "FLOAT",
    "FLOAT": "FLOAT",
    "BOOL": "BOOLEAN",
    "BOOLEAN": "BOOLEAN",
    "STRING": "VARCHAR",
    "BYTES": "BINARY",
    "DATE": "DATE",
    "DATETIME": "DATETIME",
    "TIMESTAMP": "TIMESTAMP",
    "TIME": "TIME",
    "NUMERIC": "DECIMAL",
    "BIGNUMERIC": "DECIMAL",
    "RECORD": "STRUCT",
    "STRUCT": "STRUCT",
    "REPEATED": "ARRAY",
    "ARRAY": "ARRAY",
    "JSON": "JSON",
    "GEOGRAPHY": "GEOGRAPHY",
}


def map_bq_type(bq_type: str) -> str:
    """Translate a BigQuery column type string to its canonical type name.

    The type is reduced to its base name before the lookup: anything from
    the first ``(`` (precision/length, e.g. ``NUMERIC(10, 2)``) or the first
    ``<`` (element/field list, e.g. ``ARRAY<STRING>`` or ``STRUCT<a INT64>``)
    onward is dropped, surrounding whitespace is stripped and the result is
    upper-cased.

    Args:
        bq_type: Type string, e.g. the ``data_type`` column of
            ``INFORMATION_SCHEMA.COLUMNS``.

    Returns:
        The mapped value from ``BQ_TYPE_MAP``; if the base name is not in the
        map, the full original string upper-cased.
    """
    # Without the "<" split, "STRUCT<...>" and "ARRAY<...>" never match the
    # map and the nested field names end up upper-cased in the type string.
    base = bq_type.split("(")[0].split("<")[0].strip().upper()
    return BQ_TYPE_MAP.get(base, bq_type.upper())
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def _fetch_iceberg_tables(
    client: bigquery.Client,
    project_id: str,
    datasets: list[str] | None = None,
    tables: list[str] | None = None,
) -> list[dict]:
    """Query TABLE_STORAGE for BigLake (Iceberg) tables.

    Args:
        client: BigQuery client used to run the query.
        project_id: Project whose ``region-us`` INFORMATION_SCHEMA is queried.
        datasets: Optional dataset names; when given, only rows whose
            ``table_schema`` is in this list are returned.
        tables: Optional table names; when given, only rows whose
            ``table_name`` is in this list are returned.

    Returns:
        One dict per result row with the selected TABLE_STORAGE columns,
        ordered by ``table_schema`` then ``table_name``.
    """
    conditions = [
        "managed_table_type = 'BIGLAKE'",
        "deleted = FALSE",
    ]
    # Filter values are bound as query parameters instead of being quoted
    # into the SQL text, so names containing quotes cannot alter the query.
    query_parameters = []
    if datasets:
        conditions.append("table_schema IN UNNEST(@datasets)")
        query_parameters.append(
            bigquery.ArrayQueryParameter("datasets", "STRING", list(datasets))
        )
    if tables:
        conditions.append("table_name IN UNNEST(@tables)")
        query_parameters.append(
            bigquery.ArrayQueryParameter("tables", "STRING", list(tables))
        )

    where = " AND ".join(conditions)
    query = f"""
        SELECT
            table_schema,
            table_name,
            total_rows,
            current_physical_bytes,
            storage_last_modified_time,
            creation_time
        FROM `{project_id}.region-us`.INFORMATION_SCHEMA.TABLE_STORAGE  -- ← SUBSTITUTE: change region if needed
        WHERE {where}
        ORDER BY table_schema, table_name
    """
    job_config = bigquery.QueryJobConfig(query_parameters=query_parameters)
    log.info("Querying TABLE_STORAGE for Iceberg tables ...")
    rows = list(client.query(query, job_config=job_config).result())
    log.info("Found %d Iceberg table(s).", len(rows))
    return [dict(row) for row in rows]
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def _fetch_columns(
    client: bigquery.Client,
    project_id: str,
    dataset: str,
    table_name: str,
) -> list[dict]:
    """Fetch column metadata for a specific table.

    Args:
        client: BigQuery client used to run the query.
        project_id: Project that owns the dataset.
        dataset: Dataset whose ``INFORMATION_SCHEMA.COLUMNS`` view is queried.
        table_name: Table whose columns are returned.

    Returns:
        A list of ``{"name": ..., "type": ...}`` dicts in ordinal-position
        order, where ``type`` is the column's ``data_type`` passed through
        ``map_bq_type``.
    """
    # The table name is bound as a query parameter rather than quoted into
    # the SQL text; the dataset is part of the view path and cannot be bound.
    query = f"""
        SELECT column_name, data_type, ordinal_position, is_nullable, column_default
        FROM `{project_id}.{dataset}.INFORMATION_SCHEMA.COLUMNS`
        WHERE table_name = @table_name
        ORDER BY ordinal_position
    """
    job_config = bigquery.QueryJobConfig(
        query_parameters=[
            bigquery.ScalarQueryParameter("table_name", "STRING", table_name),
        ],
    )
    return [
        {
            "name": row["column_name"],
            "type": map_bq_type(row["data_type"]),
        }
        for row in client.query(query, job_config=job_config).result()
    ]
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def _resolve_freshness(row: dict) -> str:
|
|
127
|
+
"""Return the best available freshness timestamp as ISO8601.
|
|
128
|
+
|
|
129
|
+
Uses storage_last_modified_time if Google has populated it (expected
|
|
130
|
+
early April 2026). Falls back to current time with a warning.
|
|
131
|
+
"""
|
|
132
|
+
if row.get("storage_last_modified_time"):
|
|
133
|
+
return row["storage_last_modified_time"].isoformat()
|
|
134
|
+
|
|
135
|
+
log.warning(
|
|
136
|
+
"storage_last_modified_time is NULL for %s.%s — "
|
|
137
|
+
"falling back to current time. Google's TABLE_STORAGE update "
|
|
138
|
+
"for Iceberg tables may not have shipped yet.",
|
|
139
|
+
row["table_schema"],
|
|
140
|
+
row["table_name"],
|
|
141
|
+
)
|
|
142
|
+
return datetime.now(timezone.utc).isoformat()
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
def collect(
    project_id: str,
    datasets: list[str] | None = None,
    tables: list[str] | None = None,
    only_freshness_and_volume: bool = False,
    output_file: str = "metadata_output.json",
) -> dict:
    """Collect Iceberg table metadata and write a JSON manifest.

    When only_freshness_and_volume is True, skips the COLUMNS query and
    omits fields from the manifest. Use this for periodic hourly pushes
    after the initial full metadata push.

    The manifest is always written to ``output_file``, including when no
    tables match (``assets`` is then an empty list), so a later push step
    never reads a missing or stale file.

    Args:
        project_id: GCP project to collect from.
        datasets: Optional dataset names to restrict the scan to.
        tables: Optional table names to restrict the scan to.
        only_freshness_and_volume: Skip per-table column collection.
        output_file: Path the JSON manifest is written to.

    Returns:
        The manifest dict with ``resource_type``, ``collected_at`` and
        ``assets`` keys.
    """
    client = bigquery.Client(project=project_id)  # ← SUBSTITUTE: adjust auth if needed

    if only_freshness_and_volume:
        log.info("Running in freshness+volume only mode (skipping fields).")

    iceberg_tables = _fetch_iceberg_tables(client, project_id, datasets, tables)
    if not iceberg_tables:
        # Fall through: the loop below is a no-op and an empty manifest is
        # still written.
        log.warning("No Iceberg tables found matching the criteria.")

    assets: list[dict] = []
    for row in iceberg_tables:
        dataset = row["table_schema"]
        name = row["table_name"]

        asset = {
            "name": name,
            "database": project_id,
            "schema": dataset,
            "type": "TABLE",
            "volume": {
                "row_count": row["total_rows"],
                "byte_count": row["current_physical_bytes"],
            },
            "freshness": {
                "last_updated_time": _resolve_freshness(row),
            },
        }

        if not only_freshness_and_volume:
            asset["description"] = None
            asset["fields"] = _fetch_columns(client, project_id, dataset, name)

        assets.append(asset)
        log.info(
            "Collected %s.%s.%s — rows=%s, bytes=%s",
            project_id, dataset, name,
            row["total_rows"], row["current_physical_bytes"],
        )

    manifest = {
        "resource_type": RESOURCE_TYPE,
        "collected_at": datetime.now(timezone.utc).isoformat(),
        "assets": assets,
    }
    with open(output_file, "w", encoding="utf-8") as fh:
        json.dump(manifest, fh, indent=2)
    log.info("Manifest written to %s (%d assets)", output_file, len(assets))

    return manifest
|
|
208
|
+
|
|
209
|
+
|
|
210
|
+
def main() -> None:
    """Command-line entry point for Iceberg metadata collection.

    Builds the argument parser, requires a project ID (from the flag or the
    BIGQUERY_PROJECT_ID environment variable) and passes the parsed options
    to ``collect()``.
    """
    cli = argparse.ArgumentParser(
        description="Collect BigQuery Iceberg table metadata into a JSON manifest",
    )
    cli.add_argument(
        "--project-id",
        help="GCP project ID (or set BIGQUERY_PROJECT_ID env var)",
        default=os.getenv("BIGQUERY_PROJECT_ID"),  # ← SUBSTITUTE
    )
    cli.add_argument(
        "--datasets",
        help="Limit to specific dataset(s). Omit to scan all datasets.",
        nargs="+",
        default=None,
    )
    cli.add_argument(
        "--tables",
        help="Limit to specific table name(s) within the datasets.",
        nargs="+",
        default=None,
    )
    cli.add_argument(
        "--only-freshness-and-volume",
        help="Skip field/schema collection — only collect freshness and volume. "
             "Use for periodic hourly pushes after the initial full metadata push.",
        action="store_true",
    )
    cli.add_argument("--output-file", default="metadata_output.json")
    opts = cli.parse_args()

    if not opts.project_id:
        cli.error("--project-id or BIGQUERY_PROJECT_ID env var is required")

    collect(
        opts.project_id,
        datasets=opts.datasets,
        tables=opts.tables,
        only_freshness_and_volume=opts.only_freshness_and_volume,
        output_file=opts.output_file,
    )


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,149 @@
|
|
|
1
|
+
"""
|
|
2
|
+
BigQuery Iceberg — Query Log Collection (collect only)
|
|
3
|
+
======================================================
|
|
4
|
+
Queries the BigQuery Jobs API for completed query jobs within a time
|
|
5
|
+
window and writes a JSON manifest that can be fed to push_query_logs.py.
|
|
6
|
+
|
|
7
|
+
Can be run standalone via CLI or imported (use the ``collect()`` function).
|
|
8
|
+
|
|
9
|
+
Substitution points (search for "← SUBSTITUTE"):
|
|
10
|
+
- BIGQUERY_PROJECT_ID : GCP project ID to collect from
|
|
11
|
+
- GOOGLE_APPLICATION_CREDENTIALS : path to service-account JSON key file
|
|
12
|
+
|
|
13
|
+
Prerequisites:
|
|
14
|
+
pip install google-cloud-bigquery
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from __future__ import annotations
|
|
18
|
+
|
|
19
|
+
import argparse
|
|
20
|
+
import json
|
|
21
|
+
import logging
|
|
22
|
+
import os
|
|
23
|
+
from datetime import datetime, timedelta, timezone
|
|
24
|
+
|
|
25
|
+
from google.cloud import bigquery
|
|
26
|
+
|
|
27
|
+
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
|
|
28
|
+
log = logging.getLogger(__name__)
|
|
29
|
+
|
|
30
|
+
LOG_TYPE = "bigquery"
|
|
31
|
+
|
|
32
|
+
LOOKBACK_HOURS: int = int(os.getenv("LOOKBACK_HOURS", "25"))
|
|
33
|
+
LOOKBACK_LAG_HOURS: int = int(os.getenv("LOOKBACK_LAG_HOURS", "1"))
|
|
34
|
+
MAX_JOBS: int = int(os.getenv("MAX_JOBS", "10000"))
|
|
35
|
+
|
|
36
|
+
# Limit to specific statement types — empty list means collect all.
|
|
37
|
+
STATEMENT_TYPE_FILTER: list[str] = []
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def _safe_isoformat(dt: datetime | None) -> str | None:
|
|
41
|
+
if dt is None:
|
|
42
|
+
return None
|
|
43
|
+
if dt.tzinfo is None:
|
|
44
|
+
dt = dt.replace(tzinfo=timezone.utc)
|
|
45
|
+
return dt.isoformat()
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _collect_query_logs(
    bq_client: bigquery.Client,
    project_id: str,
    start_dt: datetime,
    end_dt: datetime,
) -> list[dict]:
    """Build query-log entries from the project's BigQuery job listing.

    Jobs created between ``start_dt`` and ``end_dt`` are listed for all
    users. Jobs without non-blank query text are skipped, as are jobs whose
    statement type is excluded by a non-empty ``STATEMENT_TYPE_FILTER``.
    Iteration stops once ``MAX_JOBS`` entries have been gathered.
    """
    log.info(
        "Listing jobs for project=%s from %s to %s",
        project_id, start_dt.isoformat(), end_dt.isoformat(),
    )

    job_listing = bq_client.list_jobs(
        project=project_id,
        all_users=True,
        min_creation_time=start_dt,
        max_creation_time=end_dt,
    )

    collected: list[dict] = []
    for job in job_listing:
        query_text: str = getattr(job, "query", None) or ""
        if not query_text.strip():
            # Job has no SQL text attached; nothing to record.
            continue

        stmt_type: str = getattr(job, "statement_type", None) or ""
        allowed = not STATEMENT_TYPE_FILTER or stmt_type in STATEMENT_TYPE_FILTER
        if not allowed:
            continue

        record = {
            "query_id": job.job_id,
            "query_text": query_text,
            "start_time": _safe_isoformat(getattr(job, "created", None)),
            "end_time": _safe_isoformat(getattr(job, "ended", None)),
            "user": getattr(job, "user_email", None),
            "total_bytes_billed": getattr(job, "total_bytes_billed", None),
            "statement_type": stmt_type or None,
        }
        collected.append(record)

        if len(collected) >= MAX_JOBS:
            log.warning("Reached MAX_JOBS=%d — stopping early", MAX_JOBS)
            break

    return collected
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def collect(
    project_id: str,
    lookback_hours: int = LOOKBACK_HOURS,
    lookback_lag_hours: int = LOOKBACK_LAG_HOURS,
    output_file: str = "query_logs_output.json",
) -> dict:
    """Gather query logs for a time window and save them as a JSON manifest.

    The window ends ``lookback_lag_hours`` before the current UTC time and
    spans ``lookback_hours``. The manifest is written to ``output_file``
    and also returned.
    """
    bq_client = bigquery.Client(project=project_id)

    lag = timedelta(hours=lookback_lag_hours)
    span = timedelta(hours=lookback_hours)
    window_end = datetime.now(timezone.utc) - lag
    window_start = window_end - span

    queries = _collect_query_logs(bq_client, project_id, window_start, window_end)
    log.info("Collected %d query log entries.", len(queries))

    manifest = dict(
        log_type=LOG_TYPE,
        collected_at=datetime.now(timezone.utc).isoformat(),
        window_start=window_start.isoformat(),
        window_end=window_end.isoformat(),
        query_log_count=len(queries),
        queries=queries,
    )
    with open(output_file, "w") as out:
        json.dump(manifest, out, indent=2)
    log.info("Query log manifest written to %s", output_file)

    return manifest
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def main() -> None:
    """Command-line entry point for query-log collection.

    Parses the options, requires a project ID (from the flag or the
    BIGQUERY_PROJECT_ID environment variable) and hands the parsed values
    to ``collect()``.
    """
    cli = argparse.ArgumentParser(
        description="Collect BigQuery query logs into a JSON manifest",
    )
    cli.add_argument(
        "--project-id",
        help="GCP project ID (or set BIGQUERY_PROJECT_ID env var)",
        default=os.getenv("BIGQUERY_PROJECT_ID"),
    )
    cli.add_argument("--lookback-hours", default=LOOKBACK_HOURS, type=int)
    cli.add_argument("--lookback-lag-hours", default=LOOKBACK_LAG_HOURS, type=int)
    cli.add_argument("--output-file", default="query_logs_output.json")
    opts = cli.parse_args()

    if not opts.project_id:
        cli.error("--project-id or BIGQUERY_PROJECT_ID env var is required")

    collect(
        opts.project_id,
        lookback_hours=opts.lookback_hours,
        lookback_lag_hours=opts.lookback_lag_hours,
        output_file=opts.output_file,
    )


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,190 @@
|
|
|
1
|
+
"""
|
|
2
|
+
BigQuery Iceberg — Metadata Push (push only)
|
|
3
|
+
============================================
|
|
4
|
+
Reads a JSON manifest produced by collect_metadata.py and pushes table
|
|
5
|
+
metadata to Monte Carlo using the pycarlo SDK's IngestionService.
|
|
6
|
+
|
|
7
|
+
Can be run standalone via CLI or imported (use the ``push()`` function).
|
|
8
|
+
|
|
9
|
+
Substitution points (search for "← SUBSTITUTE"):
|
|
10
|
+
- MCD_INGEST_ID : Monte Carlo Ingestion API key ID
|
|
11
|
+
- MCD_INGEST_TOKEN : Monte Carlo Ingestion API key token
|
|
12
|
+
- MCD_RESOURCE_UUID : Monte Carlo warehouse resource UUID
|
|
13
|
+
|
|
14
|
+
Prerequisites:
|
|
15
|
+
pip install pycarlo>=0.12.251
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
from __future__ import annotations
|
|
19
|
+
|
|
20
|
+
import argparse
|
|
21
|
+
import json
|
|
22
|
+
import logging
|
|
23
|
+
import os
|
|
24
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
25
|
+
from datetime import datetime, timezone
|
|
26
|
+
|
|
27
|
+
from pycarlo.core import Client, Session
|
|
28
|
+
from pycarlo.features.ingestion import IngestionService
|
|
29
|
+
from pycarlo.features.ingestion.models import (
|
|
30
|
+
AssetField,
|
|
31
|
+
AssetFreshness,
|
|
32
|
+
AssetMetadata,
|
|
33
|
+
AssetVolume,
|
|
34
|
+
RelationalAsset,
|
|
35
|
+
)
|
|
36
|
+
|
|
37
|
+
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
|
|
38
|
+
log = logging.getLogger(__name__)
|
|
39
|
+
|
|
40
|
+
RESOURCE_TYPE = "bigquery"
|
|
41
|
+
_BATCH_SIZE = 500
|
|
42
|
+
|
|
43
|
+
_ENDPOINT = "https://integrations.getmontecarlo.com"
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def _asset_from_dict(d: dict) -> RelationalAsset:
    """Turn one manifest asset entry back into a RelationalAsset.

    ``fields`` defaults to an empty list when absent. ``volume`` is only
    built when the entry has a truthy ``volume`` mapping, and ``freshness``
    only when a truthy ``last_updated_time`` is present under ``freshness``.
    """
    volume_info = d.get("volume")
    if volume_info:
        volume = AssetVolume(
            row_count=volume_info.get("row_count"),
            byte_count=volume_info.get("byte_count"),
        )
    else:
        volume = None

    freshness_info = d.get("freshness")
    last_updated = freshness_info.get("last_updated_time") if freshness_info else None
    freshness = AssetFreshness(last_update_time=last_updated) if last_updated else None

    field_models = []
    for entry in d.get("fields", []):
        field_models.append(
            AssetField(
                name=entry["name"],
                type=entry.get("type"),
                description=entry.get("description"),
            )
        )

    metadata = AssetMetadata(
        name=d["name"],
        database=d["database"],
        schema=d["schema"],
        description=d.get("description"),
    )
    return RelationalAsset(
        type=d.get("type", "TABLE"),
        metadata=metadata,
        fields=field_models,
        volume=volume,
        freshness=freshness,
    )
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def push(
    input_file: str,
    resource_uuid: str,
    key_id: str,
    key_token: str,
    batch_size: int = _BATCH_SIZE,
    output_file: str = "metadata_push_result.json",
) -> dict:
    """Read a metadata manifest and push assets to Monte Carlo in batches.

    Assets are split into batches of at most ``batch_size`` and sent from up
    to four worker threads. A manifest with no assets still produces a
    single, empty batch. A summary is written to ``output_file``.

    Args:
        input_file: Path of the JSON manifest to read.
        resource_uuid: Monte Carlo resource UUID passed to ``send_metadata``.
        key_id: Ingestion API key ID.
        key_token: Ingestion API key token.
        batch_size: Maximum number of assets per batch; must be at least 1.
        output_file: Path the push-result JSON is written to.

    Returns:
        The push-result dict, including one invocation ID per batch in
        batch order.

    Raises:
        ValueError: If ``batch_size`` is less than 1.
        Exception: Whatever a failed batch push raised, after logging it.
    """
    # Fail early with a clear message instead of the opaque ValueError that
    # range() (step 0) or ThreadPoolExecutor (0 workers) would raise later.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    endpoint = _ENDPOINT
    log.info("Using endpoint: %s", endpoint)
    with open(input_file, encoding="utf-8") as fh:
        manifest = json.load(fh)

    asset_dicts = manifest.get("assets", [])
    resource_type = manifest.get("resource_type", RESOURCE_TYPE)
    assets = [_asset_from_dict(d) for d in asset_dicts]
    log.info("Loaded %d asset(s) from %s", len(assets), input_file)

    # max(..., 1) keeps one (empty) batch when the manifest has no assets.
    batches = [assets[i : i + batch_size] for i in range(0, max(len(assets), 1), batch_size)]
    total_batches = len(batches)

    def _push_batch(batch: list[RelationalAsset], batch_num: int) -> str | None:
        # Each batch builds its own client/session rather than sharing one
        # across worker threads.
        client = Client(session=Session(
            mcd_id=key_id, mcd_token=key_token, scope="Ingestion", endpoint=endpoint,
        ))
        service = IngestionService(mc_client=client)
        result = service.send_metadata(
            resource_uuid=resource_uuid,
            resource_type=resource_type,
            events=batch,
        )
        invocation_id = service.extract_invocation_id(result)
        log.info(
            "Pushed batch %d/%d (%d assets) — invocation_id=%s",
            batch_num, total_batches, len(batch), invocation_id,
        )
        return invocation_id

    max_workers = min(4, total_batches)
    # Pre-sized so results land at their batch index regardless of the order
    # in which futures complete.
    invocation_ids: list[str | None] = [None] * total_batches

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        futures = {
            pool.submit(_push_batch, batch, i + 1): i
            for i, batch in enumerate(batches)
        }
        for future in as_completed(futures):
            idx = futures[future]
            try:
                invocation_ids[idx] = future.result()
            except Exception as exc:
                log.error("ERROR pushing batch %d: %s", idx + 1, exc)
                raise

    log.info("All %d batch(es) pushed.", total_batches)

    push_result = {
        "resource_uuid": resource_uuid,
        "resource_type": resource_type,
        "invocation_ids": invocation_ids,
        "pushed_at": datetime.now(timezone.utc).isoformat(),
        "total_assets": len(assets),
        "batch_count": total_batches,
        "batch_size": batch_size,
    }
    with open(output_file, "w", encoding="utf-8") as fh:
        json.dump(push_result, fh, indent=2)
    log.info("Push result written to %s", output_file)

    return push_result
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
def main() -> None:
    """Command-line entry point for pushing a metadata manifest.

    Parses the options (credentials and resource UUID fall back to the
    MCD_RESOURCE_UUID, MCD_INGEST_ID and MCD_INGEST_TOKEN environment
    variables), checks that all three are present, and calls ``push()``.
    """
    parser = argparse.ArgumentParser(
        description="Push BigQuery Iceberg metadata from a manifest to Monte Carlo",
    )
    parser.add_argument("--resource-uuid", default=os.getenv("MCD_RESOURCE_UUID"))
    parser.add_argument("--key-id", default=os.getenv("MCD_INGEST_ID"))
    parser.add_argument("--key-token", default=os.getenv("MCD_INGEST_TOKEN"))
    parser.add_argument("--input-file", default="metadata_output.json")
    parser.add_argument("--output-file", default="metadata_push_result.json")
    parser.add_argument(
        "--batch-size",
        type=int,
        default=_BATCH_SIZE,
        help=f"Max assets per push batch (default: {_BATCH_SIZE})",
    )
    args = parser.parse_args()

    required = ["resource_uuid", "key_id", "key_token"]
    # Test falsiness rather than ``is None`` so that an environment variable
    # that is set but empty is reported as missing instead of being sent on
    # as an empty credential.
    missing = [k for k in required if not getattr(args, k)]
    if missing:
        parser.error(f"Missing required arguments/env vars: {missing}")

    push(
        input_file=args.input_file,
        resource_uuid=args.resource_uuid,
        key_id=args.key_id,
        key_token=args.key_token,
        batch_size=args.batch_size,
        output_file=args.output_file,
    )


if __name__ == "__main__":
    main()
|