opencode-skills-collection 2.0.0-beta.3 → 2.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91) hide show
  1. package/README.md +1 -0
  2. package/bundled-skills/.antigravity-install-manifest.json +6 -1
  3. package/bundled-skills/docs/integrations/jetski-cortex.md +3 -3
  4. package/bundled-skills/docs/integrations/jetski-gemini-loader/README.md +1 -1
  5. package/bundled-skills/docs/maintainers/repo-growth-seo.md +3 -3
  6. package/bundled-skills/docs/maintainers/skills-update-guide.md +1 -1
  7. package/bundled-skills/docs/users/bundles.md +1 -1
  8. package/bundled-skills/docs/users/claude-code-skills.md +1 -1
  9. package/bundled-skills/docs/users/gemini-cli-skills.md +1 -1
  10. package/bundled-skills/docs/users/getting-started.md +1 -1
  11. package/bundled-skills/docs/users/kiro-integration.md +1 -1
  12. package/bundled-skills/docs/users/usage.md +4 -4
  13. package/bundled-skills/docs/users/visual-guide.md +4 -4
  14. package/bundled-skills/manage-skills/SKILL.md +187 -0
  15. package/bundled-skills/monte-carlo-monitor-creation/SKILL.md +222 -0
  16. package/bundled-skills/monte-carlo-monitor-creation/references/comparison-monitor.md +426 -0
  17. package/bundled-skills/monte-carlo-monitor-creation/references/custom-sql-monitor.md +207 -0
  18. package/bundled-skills/monte-carlo-monitor-creation/references/metric-monitor.md +292 -0
  19. package/bundled-skills/monte-carlo-monitor-creation/references/table-monitor.md +231 -0
  20. package/bundled-skills/monte-carlo-monitor-creation/references/validation-monitor.md +404 -0
  21. package/bundled-skills/monte-carlo-prevent/SKILL.md +252 -0
  22. package/bundled-skills/monte-carlo-prevent/references/TROUBLESHOOTING.md +23 -0
  23. package/bundled-skills/monte-carlo-prevent/references/parameters.md +32 -0
  24. package/bundled-skills/monte-carlo-prevent/references/workflows.md +478 -0
  25. package/bundled-skills/monte-carlo-push-ingestion/SKILL.md +363 -0
  26. package/bundled-skills/monte-carlo-push-ingestion/references/anomaly-detection.md +87 -0
  27. package/bundled-skills/monte-carlo-push-ingestion/references/custom-lineage.md +203 -0
  28. package/bundled-skills/monte-carlo-push-ingestion/references/direct-http-api.md +207 -0
  29. package/bundled-skills/monte-carlo-push-ingestion/references/prerequisites.md +150 -0
  30. package/bundled-skills/monte-carlo-push-ingestion/references/push-lineage.md +160 -0
  31. package/bundled-skills/monte-carlo-push-ingestion/references/push-metadata.md +158 -0
  32. package/bundled-skills/monte-carlo-push-ingestion/references/push-query-logs.md +219 -0
  33. package/bundled-skills/monte-carlo-push-ingestion/references/validation.md +257 -0
  34. package/bundled-skills/monte-carlo-push-ingestion/scripts/sample_verify.py +357 -0
  35. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_and_push_lineage.py +70 -0
  36. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_and_push_metadata.py +65 -0
  37. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_and_push_query_logs.py +70 -0
  38. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_lineage.py +214 -0
  39. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_metadata.py +160 -0
  40. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_query_logs.py +164 -0
  41. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/push_lineage.py +198 -0
  42. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/push_metadata.py +193 -0
  43. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/push_query_logs.py +207 -0
  44. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/collect_and_push_metadata.py +71 -0
  45. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/collect_and_push_query_logs.py +64 -0
  46. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/collect_metadata.py +253 -0
  47. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/collect_query_logs.py +149 -0
  48. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/push_metadata.py +190 -0
  49. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/push_query_logs.py +208 -0
  50. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_and_push_lineage.py +83 -0
  51. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_and_push_metadata.py +77 -0
  52. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_and_push_query_logs.py +83 -0
  53. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_lineage.py +240 -0
  54. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_metadata.py +212 -0
  55. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_query_logs.py +204 -0
  56. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/push_lineage.py +192 -0
  57. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/push_metadata.py +178 -0
  58. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/push_query_logs.py +200 -0
  59. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_and_push_lineage.py +119 -0
  60. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_and_push_metadata.py +119 -0
  61. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_and_push_query_logs.py +117 -0
  62. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_lineage.py +265 -0
  63. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_metadata.py +313 -0
  64. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_query_logs.py +284 -0
  65. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/push_lineage.py +309 -0
  66. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/push_metadata.py +245 -0
  67. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/push_query_logs.py +255 -0
  68. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_and_push_lineage.py +78 -0
  69. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_and_push_metadata.py +80 -0
  70. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_and_push_query_logs.py +88 -0
  71. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_lineage.py +235 -0
  72. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_metadata.py +219 -0
  73. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_query_logs.py +239 -0
  74. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/push_lineage.py +178 -0
  75. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/push_metadata.py +178 -0
  76. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/push_query_logs.py +196 -0
  77. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_and_push_lineage.py +154 -0
  78. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_and_push_metadata.py +137 -0
  79. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_and_push_query_logs.py +137 -0
  80. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_lineage.py +349 -0
  81. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_metadata.py +329 -0
  82. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_query_logs.py +254 -0
  83. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/push_lineage.py +307 -0
  84. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/push_metadata.py +228 -0
  85. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/push_query_logs.py +248 -0
  86. package/bundled-skills/monte-carlo-push-ingestion/scripts/test_template_sdk_usage.py +340 -0
  87. package/bundled-skills/monte-carlo-validation-notebook/SKILL.md +685 -0
  88. package/bundled-skills/monte-carlo-validation-notebook/scripts/generate_notebook_url.py +141 -0
  89. package/bundled-skills/monte-carlo-validation-notebook/scripts/resolve_dbt_schema.py +161 -0
  90. package/package.json +1 -1
  91. package/skills_index.json +503 -61
@@ -0,0 +1,88 @@
1
+ """
2
+ Redshift — Query Log Collect & Push (combined)
3
+ ================================================
4
+ Collects completed query execution records from Redshift using sys_query_history
5
+ and sys_querytext, then pushes them to Monte Carlo for query-pattern analysis,
6
+ lineage derivation, and usage attribution.
7
+
8
+ This script imports and calls collect() from collect_query_logs and push() from
9
+ push_query_logs, running both in sequence.
10
+
11
+ Substitution points (search for "← SUBSTITUTE"):
12
+ - REDSHIFT_HOST / REDSHIFT_DB / REDSHIFT_USER / REDSHIFT_PASSWORD : connection
13
+ - LOOKBACK_HOURS : hours back from [now - LAG_HOURS] to collect (default 25)
14
+ - LOOKBACK_LAG_HOURS: lag behind now to avoid in-flight queries (default 1)
15
+ - BATCH_SIZE : number of query_ids to fetch texts for in one SQL call
16
+ - MAX_QUERIES : maximum query rows to process per run
17
+ - MCD_INGEST_ID / MCD_INGEST_TOKEN : Monte Carlo API credentials
18
+ - MCD_RESOURCE_UUID : UUID of the Redshift connection in Monte Carlo
19
+ - PUSH_BATCH_SIZE : number of entries per API call (default 250)
20
+
21
+ Prerequisites:
22
+ pip install psycopg2-binary pycarlo
23
+ """
24
+
25
+ from __future__ import annotations
26
+
27
+ import argparse
28
+ import logging
29
+ import os
30
+
31
+ from collect_query_logs import BATCH_SIZE, LOOKBACK_HOURS, LOOKBACK_LAG_HOURS, MAX_QUERIES, collect
32
+ from push_query_logs import DEFAULT_BATCH_SIZE, push
33
+
34
# Route timestamped INFO-and-above records through the root logger, and keep a
# module-scoped logger for this script's own progress messages.
logging.basicConfig(format="%(asctime)s %(levelname)s %(message)s", level=logging.INFO)
log = logging.getLogger(__name__)
36
+
37
+
38
def main() -> None:
    """Command-line entry point: collect Redshift query logs, then push them.

    Connection settings and Monte Carlo credentials default to environment
    variables; tuning options default to the constants imported from
    ``collect_query_logs`` and ``push_query_logs``. If any required value is
    still ``None`` after parsing, the run stops through ``parser.error``.
    The manifest path is handed to ``collect`` and then to ``push``.
    """
    cli = argparse.ArgumentParser(description="Collect and push Redshift query logs to Monte Carlo")

    # Redshift connection options, each backed by an environment variable.  # ← SUBSTITUTE
    for flag, env_name in (
        ("--host", "REDSHIFT_HOST"),
        ("--db", "REDSHIFT_DB"),
        ("--user", "REDSHIFT_USER"),
        ("--password", "REDSHIFT_PASSWORD"),
    ):
        cli.add_argument(flag, default=os.getenv(env_name))
    cli.add_argument("--port", type=int, default=int(os.getenv("REDSHIFT_PORT", "5439")))

    # Monte Carlo resource and ingestion credentials.
    for flag, env_name in (
        ("--resource-uuid", "MCD_RESOURCE_UUID"),
        ("--key-id", "MCD_INGEST_ID"),
        ("--key-token", "MCD_INGEST_TOKEN"),
    ):
        cli.add_argument(flag, default=os.getenv(env_name))

    # Integer tuning knobs for the collection window and batch sizes.
    for flag, fallback in (
        ("--lookback-hours", LOOKBACK_HOURS),
        ("--lookback-lag-hours", LOOKBACK_LAG_HOURS),
        ("--batch-size", BATCH_SIZE),
        ("--max-queries", MAX_QUERIES),
        ("--push-batch-size", DEFAULT_BATCH_SIZE),
    ):
        cli.add_argument(flag, type=int, default=fallback)

    cli.add_argument("--manifest", default="manifest_query_logs.json")
    opts = cli.parse_args()

    must_have = ("host", "db", "user", "password", "resource_uuid", "key_id", "key_token")
    absent = [name for name in must_have if getattr(opts, name) is None]
    if absent:
        cli.error(f"Missing required arguments/env vars: {absent}")

    log.info("Step 1: Collecting query logs …")
    collect(
        host=opts.host,
        db=opts.db,
        user=opts.user,
        password=opts.password,
        manifest_path=opts.manifest,
        port=opts.port,
        lookback_hours=opts.lookback_hours,
        lookback_lag_hours=opts.lookback_lag_hours,
        batch_size=opts.batch_size,
        max_queries=opts.max_queries,
    )

    log.info("Step 2: Pushing query logs to Monte Carlo …")
    push(
        manifest_path=opts.manifest,
        resource_uuid=opts.resource_uuid,
        key_id=opts.key_id,
        key_token=opts.key_token,
        batch_size=opts.push_batch_size,
    )

    log.info("Done — collect and push complete.")
85
+
86
+
87
+ if __name__ == "__main__":
88
+ main()
@@ -0,0 +1,235 @@
1
+ """
2
+ Redshift — Lineage Collection (collect-only)
3
+ ==============================================
4
+ Collects table-level lineage from Redshift by fetching recent successful query
5
+ history from sys_query_history + sys_querytext and parsing CREATE TABLE AS SELECT
6
+ (CTAS) and INSERT INTO SELECT patterns to derive source->destination relationships.
7
+
8
+ Writes a JSON manifest file that can be consumed by push_lineage.py.
9
+
10
+ Substitution points (search for "← SUBSTITUTE"):
11
+ - REDSHIFT_HOST / REDSHIFT_DB / REDSHIFT_USER / REDSHIFT_PASSWORD : connection
12
+ - LOOKBACK_HOURS : how far back to scan query history (default 24 h)
13
+
14
+ Prerequisites:
15
+ pip install psycopg2-binary
16
+ """
17
+
18
+ from __future__ import annotations
19
+
20
+ import argparse
21
+ import json
22
+ import logging
23
+ import os
24
+ import re
25
+ from datetime import datetime, timezone
26
+ from typing import Any
27
+
28
+ import psycopg2
29
+
30
# Timestamped INFO-level logging for the whole script.
logging.basicConfig(format="%(asctime)s %(levelname)s %(message)s", level=logging.INFO)
log = logging.getLogger(__name__)

# Label written into the manifest's "resource_type" field.
RESOURCE_TYPE = "redshift"
# Default scan window in hours; the LOOKBACK_HOURS environment variable overrides it.
LOOKBACK_HOURS: int = int(os.environ.get("LOOKBACK_HOURS", "24"))  # ← SUBSTITUTE
35
+
36
+
37
def _check_available_memory(min_gb: float = 2.0) -> None:
    """Log a warning when free physical memory is below ``min_gb`` gigabytes.

    The probe uses ``os.sysconf``. When that function does not exist, or the
    lookup raises ``ValueError`` or ``OSError``, the check is skipped without
    any output. Nothing is returned and nothing is raised.
    """
    if not hasattr(os, "sysconf"):
        return  # no sysconf on this platform (e.g. Windows) — skip the check
    try:
        free_bytes = os.sysconf("SC_PAGE_SIZE") * os.sysconf("SC_AVPHYS_PAGES")
    except (ValueError, OSError):
        return
    avail_gb = free_bytes / (1024 ** 3)
    if avail_gb < min_gb:
        log.warning(
            "Only %.1f GB of memory available (minimum recommended: %.1f GB). "
            "Consider reducing the collection scope or increasing available memory.",
            avail_gb,
            min_gb,
        )
55
+
56
+
57
# Flags shared by the two destination-detecting patterns: keywords match in any
# case, and "." spans newlines so multi-line statements are handled.
_DEST_FLAGS = re.IGNORECASE | re.DOTALL

# CTAS / view creation: CREATE [OR REPLACE] TABLE|VIEW <dest> [(cols)] AS SELECT.
# The named group "dest" holds the (possibly quoted, possibly dotted) target.
_CTAS_RE = re.compile(
    r"CREATE\s+(?:OR\s+REPLACE\s+)?(?:TABLE|VIEW)\s+(?P<dest>\"?[\w.\"]+\"?)\s*(?:\([^)]*\))?\s*AS\s+SELECT\b",
    flags=_DEST_FLAGS,
)

# INSERT INTO <dest> … SELECT; "dest" is captured the same way as above.
_INSERT_RE = re.compile(
    r"INSERT\s+INTO\s+(?P<dest>\"?[\w.\"]+\"?)\s.*?SELECT\b",
    flags=_DEST_FLAGS,
)

# Any dotted two- or three-part name (schema.table or database.schema.table).
# Groups 1-3 are the components with quotes removed; group 3 is None for
# two-part names.
_TABLE_REF_RE = re.compile(
    r'"?([\w]+)"?\."?([\w]+)"?(?:\."?([\w]+)"?)?',
    flags=re.IGNORECASE,
)
69
+
70
+
71
def _clean_name(name: str) -> str:
    """Return ``name`` without surrounding whitespace or double quotes."""
    # Whitespace is trimmed first so the quotes of a padded identifier such as
    # ' "orders" ' can be reached, then once more for blanks inside the quotes.
    return name.strip().strip('"').strip()


def _parse_ref(ref: str) -> tuple[str, str, str]:
    """Split a dotted table reference into ``(database, schema, table)``.

    * ``db.schema.table`` -> ``(db, schema, table)``
    * ``schema.table``    -> ``("", schema, table)``
    * ``table``           -> ``("", "", table)``

    Each component is passed through ``_clean_name``. When the reference has
    more than three components, the trailing three are used so that the last
    component is always reported as the table.
    """
    parts = [_clean_name(piece) for piece in ref.split(".")]
    if len(parts) >= 3:
        database, schema, table = parts[-3:]
        return database, schema, table
    if len(parts) == 2:
        return "", parts[0], parts[1]
    return "", "", parts[0]
83
+
84
+
85
def _dictfetch(cursor: Any, sql: str, params: tuple | None = None) -> list[dict[str, Any]]:
    """Execute ``sql`` on ``cursor`` and return all rows as column-name dicts.

    Args:
        cursor: DB-API cursor whose ``description`` entries expose ``.name``.
        sql: Statement to execute.
        params: Optional bind parameters, passed to ``cursor.execute`` as-is.

    Returns:
        One dict per result row, keyed by column name, in fetch order. An
        empty list when the statement produced no result set.
    """
    cursor.execute(sql, params)
    if cursor.description is None:
        # No result set (e.g. a DDL statement): there is nothing to fetch.
        return []
    cols = [d.name for d in cursor.description]
    rows: list[dict[str, Any]] = []
    # Pull the result in chunks of 1000 rows rather than one fetchall() call.
    while True:
        chunk = cursor.fetchmany(1000)
        if not chunk:
            break
        rows.extend(dict(zip(cols, row)) for row in chunk)
    return rows
95
+
96
+
97
def fetch_query_texts(cursor: Any, lookback_hours: int) -> list[str]:
    """Assemble full query texts from sys_query_history + sys_querytext.

    Args:
        cursor: Open DB-API cursor on the Redshift connection.
        lookback_hours: Size of the scan window, in hours before now.

    Returns:
        The concatenated text of each successful query in the window, with
        empty or NULL texts dropped. At most 50000 queries are returned.

    Raises:
        ValueError: If ``lookback_hours`` is negative.
    """
    hours_back = int(lookback_hours)
    if hours_back < 0:
        raise ValueError("lookback_hours must be non-negative")

    # The window is sent as a bind parameter instead of being formatted into the
    # SQL text, so the statement itself never depends on caller input.
    # NOTE(review): each sys_querytext chunk is cut to 200 characters below;
    # confirm that matches the chunk width of the view, otherwise long
    # statements lose text in the middle.
    rows = _dictfetch(
        cursor,
        """
        SELECT
            sq.query_id,
            LISTAGG(
                CASE WHEN LEN(st.text) <= 200 THEN st.text ELSE LEFT(st.text, 200) END,
                ''
            ) WITHIN GROUP (ORDER BY st.sequence) AS full_text
        FROM sys_query_history sq
        JOIN sys_querytext st ON sq.query_id = st.query_id
        WHERE sq.start_time >= DATEADD(hour, %s, GETDATE())
          AND sq.status = 'success'
        GROUP BY sq.query_id
        LIMIT 50000
        """,  # ← SUBSTITUTE: adjust LIMIT, or add user/database filters
        (-hours_back,),
    )
    return [r["full_text"] for r in rows if r.get("full_text")]
117
+
118
+
119
def parse_lineage_from_sql(sql_text: str) -> list[dict[str, Any]]:
    """Derive a table-level lineage event from one SQL statement.

    The destination is taken from the first match of ``_CTAS_RE`` or, failing
    that, ``_INSERT_RE``. Every other dotted name found by ``_TABLE_REF_RE`` is
    reported as a source, deduplicated in order of first appearance.

    Args:
        sql_text: Full text of a single statement.

    Returns:
        An empty list when no destination or no source is found; otherwise a
        one-element list holding ``{"sources": [...], "destination": {...}}``,
        where every entry has ``database``, ``schema`` and ``asset_name`` keys.
    """
    dest_match = _CTAS_RE.search(sql_text) or _INSERT_RE.search(sql_text)
    if not dest_match:
        return []

    dest_db, dest_schema, dest_table = _parse_ref(dest_match.group("dest"))
    if not dest_table:
        return []
    dest_key = (dest_db, dest_schema, dest_table)

    seen: set[str] = set()
    sources: list[dict[str, str]] = []
    for m in _TABLE_REF_RE.finditer(sql_text):
        # Group 3 is None for two-part names; groups 1 and 2 are always set.
        parts = [part for part in m.groups() if part]
        if parts[0].isdigit():
            # A decimal literal such as 1.5 also looks like "a.b" to the regex;
            # it is a number, not a table reference.
            continue
        ref = ".".join(parts)
        if ref in seen:
            continue
        db, schema, table = _parse_ref(ref)
        if not table or (db, schema, table) == dest_key:
            continue  # the destination itself is not one of its own sources
        seen.add(ref)
        sources.append({"database": db, "schema": schema, "asset_name": table})

    if not sources:
        return []

    return [
        {
            "sources": sources,
            "destination": {"database": dest_db, "schema": dest_schema, "asset_name": dest_table},
        }
    ]
161
+
162
+
163
def collect(
    host: str,
    db: str,
    user: str,
    password: str,
    manifest_path: str = "manifest_lineage.json",
    port: int = 5439,
    lookback_hours: int = LOOKBACK_HOURS,
) -> list[dict[str, Any]]:
    """Collect Redshift lineage, write it to a JSON manifest, and return it.

    Opens a psycopg2 connection (30 s connect timeout), reads recent query
    texts with ``fetch_query_texts``, closes the connection, then runs
    ``parse_lineage_from_sql`` over every text.

    Args:
        host, db, user, password, port: Redshift connection settings.
        manifest_path: File the manifest is written to (overwritten).
        lookback_hours: Scan window handed to ``fetch_query_texts``.

    Returns:
        The lineage events that were also written under the manifest's
        ``"events"`` key.
    """
    _check_available_memory()
    collected_at = datetime.now(timezone.utc).isoformat()

    connection = psycopg2.connect(
        host=host, port=port, dbname=db, user=user, password=password, connect_timeout=30,
    )
    try:
        with connection.cursor() as cur:
            statements = fetch_query_texts(cur, lookback_hours)
    finally:
        # The connection is released before any parsing starts.
        connection.close()

    log.info("Parsing lineage from %d query texts …", len(statements))
    events: list[dict[str, Any]] = [
        event for statement in statements for event in parse_lineage_from_sql(statement)
    ]
    event_total = len(events)
    log.info("Collected %d lineage events", event_total)

    with open(manifest_path, "w") as fh:
        json.dump(
            {
                "resource_type": RESOURCE_TYPE,
                "collected_at": collected_at,
                "lookback_hours": lookback_hours,
                "queries_scanned": len(statements),
                "lineage_event_count": event_total,
                "events": events,
            },
            fh,
            indent=2,
        )
    log.info("Manifest written to %s (%d events)", manifest_path, event_total)

    return events
205
+
206
+
207
def main() -> None:
    """Command-line entry point: collect Redshift lineage into a manifest file.

    Connection options default to the REDSHIFT_* environment variables. If
    host, db, user or password is still ``None`` after parsing, the run stops
    through ``parser.error``; otherwise ``collect`` is called with the parsed
    values.
    """
    cli = argparse.ArgumentParser(description="Collect Redshift lineage to a manifest file")

    # Connection options backed by environment variables.  # ← SUBSTITUTE
    for flag, env_name in (
        ("--host", "REDSHIFT_HOST"),
        ("--db", "REDSHIFT_DB"),
        ("--user", "REDSHIFT_USER"),
        ("--password", "REDSHIFT_PASSWORD"),
    ):
        cli.add_argument(flag, default=os.getenv(env_name))
    cli.add_argument("--port", type=int, default=int(os.getenv("REDSHIFT_PORT", "5439")))
    cli.add_argument("--lookback-hours", type=int, default=LOOKBACK_HOURS)
    cli.add_argument("--manifest", default="manifest_lineage.json")
    opts = cli.parse_args()

    absent = [name for name in ("host", "db", "user", "password") if getattr(opts, name) is None]
    if absent:
        cli.error(f"Missing required arguments/env vars: {absent}")

    collect(
        host=opts.host,
        db=opts.db,
        user=opts.user,
        password=opts.password,
        manifest_path=opts.manifest,
        port=opts.port,
        lookback_hours=opts.lookback_hours,
    )
232
+
233
+
234
+ if __name__ == "__main__":
235
+ main()
@@ -0,0 +1,219 @@
1
+ """
2
+ Redshift — Metadata Collection (collect-only)
3
+ ===============================================
4
+ Collects table schemas, row counts, and byte sizes from Amazon Redshift using
5
+ SVV system views, then writes a JSON manifest file that can be consumed by
6
+ push_metadata.py.
7
+
8
+ Substitution points (search for "← SUBSTITUTE"):
9
+ - REDSHIFT_HOST : Redshift cluster endpoint or serverless workgroup endpoint
10
+ - REDSHIFT_DB : database name to connect to
11
+ - REDSHIFT_USER : database user (or IAM role user)
12
+ - REDSHIFT_PASSWORD : database password
13
+ - DB_EXCLUSIONS : databases to skip
14
+ - SCHEMA_EXCLUSIONS : schemas to skip in every database
15
+
16
+ Prerequisites:
17
+ pip install psycopg2-binary
18
+ """
19
+
20
+ from __future__ import annotations
21
+
22
+ import argparse
23
+ import json
24
+ import logging
25
+ import os
26
+ from datetime import datetime, timezone
27
+ from typing import Any
28
+
29
+ import psycopg2
30
+ import psycopg2.extras
31
+
32
# Timestamped INFO-level logging for the whole script.
logging.basicConfig(format="%(asctime)s %(levelname)s %(message)s", level=logging.INFO)
log = logging.getLogger(__name__)

# Label written into the manifest's "resource_type" field.
RESOURCE_TYPE = "redshift"

# Database names dropped from the list returned by collect_databases().
DB_EXCLUSIONS: set[str] = {  # ← SUBSTITUTE: add internal databases
    "dev",
    "padb_harvest",
}

# Schema names filtered out of the table query in every database.
SCHEMA_EXCLUSIONS: set[str] = {  # ← SUBSTITUTE: add internal schemas
    "catalog_history",
    "information_schema",
    "pg_catalog",
    "pg_internal",
}
45
+
46
+
47
def _check_available_memory(min_gb: float = 2.0) -> None:
    """Log a warning when free physical memory is below ``min_gb`` gigabytes.

    The probe uses ``os.sysconf``. When that function does not exist, or the
    lookup raises ``ValueError`` or ``OSError``, the check is skipped without
    any output. Nothing is returned and nothing is raised.
    """
    if not hasattr(os, "sysconf"):
        return  # no sysconf on this platform (e.g. Windows) — skip the check
    try:
        free_bytes = os.sysconf("SC_PAGE_SIZE") * os.sysconf("SC_AVPHYS_PAGES")
    except (ValueError, OSError):
        return
    avail_gb = free_bytes / (1024 ** 3)
    if avail_gb < min_gb:
        log.warning(
            "Only %.1f GB of memory available (minimum recommended: %.1f GB). "
            "Consider reducing the collection scope or increasing available memory.",
            avail_gb,
            min_gb,
        )
65
+
66
+
67
def _dictfetch(cursor: Any, sql: str, params: tuple | None = None) -> list[dict[str, Any]]:
    """Execute ``sql`` on ``cursor`` and return all rows as column-name dicts.

    Args:
        cursor: DB-API cursor whose ``description`` entries expose ``.name``.
        sql: Statement to execute.
        params: Optional bind parameters, passed to ``cursor.execute`` as-is.

    Returns:
        One dict per result row, keyed by column name, in fetch order. An
        empty list when the statement produced no result set.
    """
    cursor.execute(sql, params)
    if cursor.description is None:
        # No result set (e.g. a DDL statement): there is nothing to fetch.
        return []
    cols = [d.name for d in cursor.description]
    rows: list[dict[str, Any]] = []
    # Pull the result in chunks of 1000 rows rather than one fetchall() call.
    while True:
        chunk = cursor.fetchmany(1000)
        if not chunk:
            break
        rows.extend(dict(zip(cols, row)) for row in chunk)
    return rows
77
+
78
+
79
def collect_databases(cursor: Any) -> list[str]:
    """Return database names from svv_redshift_databases, minus ``DB_EXCLUSIONS``.

    Names come back in the order the query yields them (sorted by name).
    """
    listing = _dictfetch(
        cursor,
        "SELECT database_name FROM svv_redshift_databases ORDER BY database_name",
    )
    kept: list[str] = []
    for entry in listing:
        name = entry["database_name"]
        if name not in DB_EXCLUSIONS:
            kept.append(name)
    return kept
85
+
86
+
87
def collect_tables(cursor: Any, db: str) -> list[dict[str, Any]]:
    """List the tables of database ``db`` with row counts and byte sizes.

    Reads svv_table_info and skips every schema named in ``SCHEMA_EXCLUSIONS``.

    Args:
        cursor: Open DB-API cursor on the Redshift connection.
        db: Database name to filter on.

    Returns:
        Dicts with the keys ``db``, ``schema``, ``table_name``, ``row_count``
        and ``byte_count``, ordered by schema and table.
    """
    # Exclusions are bound as parameters, and the NOT IN clause is left out
    # entirely when the set is empty ("NOT IN ()" is not valid SQL).
    excluded = sorted(SCHEMA_EXCLUSIONS)
    params: list[Any] = [db]
    schema_filter = ""
    if excluded:
        placeholders = ", ".join(["%s"] * len(excluded))
        schema_filter = f"AND schema NOT IN ({placeholders})"
        params.extend(excluded)

    # tbl_rows is the documented row-count column of SVV_TABLE_INFO. It is
    # NUMERIC, so it is cast to BIGINT to reach Python as a plain int that json
    # can serialize. "size" is in 1 MB blocks, hence the conversion to bytes.
    # NOTE(review): SVV_TABLE_INFO is documented as describing the connected
    # database — confirm rows are returned when ``db`` names a different one.
    return _dictfetch(
        cursor,
        f"""
        SELECT
            database AS db,
            schema,
            "table" AS table_name,
            CAST(tbl_rows AS BIGINT) AS row_count,
            size * 1024 * 1024 AS byte_count
        FROM svv_table_info
        WHERE database = %s
          {schema_filter}
        ORDER BY schema, "table"
        """,  # ← SUBSTITUTE: add additional WHERE clauses to narrow scope
        tuple(params),
    )
105
+
106
+
107
def collect_columns(cursor: Any, db: str, schema: str, table: str) -> list[dict[str, Any]]:
    """Return the columns of one table from svv_columns, in ordinal order.

    Each dict has the keys ``column_name``, ``data_type`` and ``comment``
    (the view's ``remarks`` column). The three identifiers are passed as bind
    parameters.
    """
    column_query = """
        SELECT column_name, data_type, remarks AS comment
        FROM svv_columns
        WHERE table_catalog = %s
          AND table_schema = %s
          AND table_name = %s
        ORDER BY ordinal_position
        """
    identifiers = (db, schema, table)
    return _dictfetch(cursor, column_query, identifiers)
120
+
121
+
122
def _int_or_none(value: Any) -> int | None:
    """Convert a numeric database value to ``int``; pass ``None`` through."""
    return None if value is None else int(value)


def _build_asset(database: str, table_row: dict[str, Any], columns: list[dict[str, Any]]) -> dict[str, Any]:
    """Combine one table row and its column rows into a manifest asset dict."""
    return {
        "asset_name": table_row["table_name"],
        "database": database,  # ← SUBSTITUTE: use database as top-level namespace
        "schema": table_row["schema"],
        "asset_type": "TABLE",
        "fields": [
            {
                "name": col["column_name"],
                "type": col["data_type"].upper(),
                "description": col.get("comment") or None,
            }
            for col in columns
        ],
        # NUMERIC columns arrive from psycopg2 as Decimal, which json cannot
        # serialize; plain ints keep the manifest writable.
        "row_count": _int_or_none(table_row.get("row_count")),
        "byte_count": _int_or_none(table_row.get("byte_count")),
    }


def collect(
    host: str,
    db: str,
    user: str,
    password: str,
    manifest_path: str = "manifest_metadata.json",
    port: int = 5439,
) -> list[dict[str, Any]]:
    """Connect to Redshift, collect metadata, write a JSON manifest, and return asset dicts.

    For every database returned by ``collect_databases`` the tables are listed
    with ``collect_tables``, and each table's columns are read with
    ``collect_columns``. The connection is closed before the manifest is
    written.

    Args:
        host, db, user, password, port: Redshift connection settings.
        manifest_path: File the manifest is written to (overwritten).

    Returns:
        The asset dicts that were also written under the manifest's
        ``"assets"`` key.
    """
    _check_available_memory()
    collected_at = datetime.now(timezone.utc).isoformat()
    assets: list[dict[str, Any]] = []

    conn = psycopg2.connect(
        host=host,  # ← SUBSTITUTE
        port=port,
        dbname=db,  # ← SUBSTITUTE
        user=user,  # ← SUBSTITUTE
        password=password,  # ← SUBSTITUTE
        connect_timeout=30,
    )
    try:
        with conn.cursor() as cursor:
            databases = collect_databases(cursor)
            log.info("Found databases: %s", databases)

            for database in databases:
                tables = collect_tables(cursor, database)
                log.info("Database %s — %d tables", database, len(tables))

                for t in tables:
                    schema = t["schema"]
                    table_name = t["table_name"]
                    columns = collect_columns(cursor, database, schema, table_name)
                    assets.append(_build_asset(database, t, columns))
                    log.info("Collected %s.%s.%s", database, schema, table_name)
    finally:
        conn.close()

    manifest = {
        "resource_type": RESOURCE_TYPE,
        "collected_at": collected_at,
        "asset_count": len(assets),
        "assets": assets,
    }
    with open(manifest_path, "w") as fh:
        json.dump(manifest, fh, indent=2)
    log.info("Manifest written to %s (%d assets)", manifest_path, len(assets))

    return assets
191
+
192
+
193
def main() -> None:
    """Command-line entry point: collect Redshift metadata into a manifest file.

    Connection options default to the REDSHIFT_* environment variables. If
    host, db, user or password is still ``None`` after parsing, the run stops
    through ``parser.error``; otherwise ``collect`` is called with the parsed
    values.
    """
    cli = argparse.ArgumentParser(description="Collect Redshift metadata to a manifest file")

    # Connection options backed by environment variables.  # ← SUBSTITUTE
    for flag, env_name in (
        ("--host", "REDSHIFT_HOST"),
        ("--db", "REDSHIFT_DB"),
        ("--user", "REDSHIFT_USER"),
        ("--password", "REDSHIFT_PASSWORD"),
    ):
        cli.add_argument(flag, default=os.getenv(env_name))
    cli.add_argument("--port", type=int, default=int(os.getenv("REDSHIFT_PORT", "5439")))
    cli.add_argument("--manifest", default="manifest_metadata.json")
    opts = cli.parse_args()

    absent = [name for name in ("host", "db", "user", "password") if getattr(opts, name) is None]
    if absent:
        cli.error(f"Missing required arguments/env vars: {absent}")

    collect(
        host=opts.host,
        db=opts.db,
        user=opts.user,
        password=opts.password,
        manifest_path=opts.manifest,
        port=opts.port,
    )
216
+
217
+
218
+ if __name__ == "__main__":
219
+ main()