opencode-skills-collection 2.0.0 → 2.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90) hide show
  1. package/bundled-skills/.antigravity-install-manifest.json +6 -1
  2. package/bundled-skills/docs/integrations/jetski-cortex.md +3 -3
  3. package/bundled-skills/docs/integrations/jetski-gemini-loader/README.md +1 -1
  4. package/bundled-skills/docs/maintainers/repo-growth-seo.md +3 -3
  5. package/bundled-skills/docs/maintainers/skills-update-guide.md +1 -1
  6. package/bundled-skills/docs/users/bundles.md +1 -1
  7. package/bundled-skills/docs/users/claude-code-skills.md +1 -1
  8. package/bundled-skills/docs/users/gemini-cli-skills.md +1 -1
  9. package/bundled-skills/docs/users/getting-started.md +1 -1
  10. package/bundled-skills/docs/users/kiro-integration.md +1 -1
  11. package/bundled-skills/docs/users/usage.md +4 -4
  12. package/bundled-skills/docs/users/visual-guide.md +4 -4
  13. package/bundled-skills/manage-skills/SKILL.md +187 -0
  14. package/bundled-skills/monte-carlo-monitor-creation/SKILL.md +222 -0
  15. package/bundled-skills/monte-carlo-monitor-creation/references/comparison-monitor.md +426 -0
  16. package/bundled-skills/monte-carlo-monitor-creation/references/custom-sql-monitor.md +207 -0
  17. package/bundled-skills/monte-carlo-monitor-creation/references/metric-monitor.md +292 -0
  18. package/bundled-skills/monte-carlo-monitor-creation/references/table-monitor.md +231 -0
  19. package/bundled-skills/monte-carlo-monitor-creation/references/validation-monitor.md +404 -0
  20. package/bundled-skills/monte-carlo-prevent/SKILL.md +252 -0
  21. package/bundled-skills/monte-carlo-prevent/references/TROUBLESHOOTING.md +23 -0
  22. package/bundled-skills/monte-carlo-prevent/references/parameters.md +32 -0
  23. package/bundled-skills/monte-carlo-prevent/references/workflows.md +478 -0
  24. package/bundled-skills/monte-carlo-push-ingestion/SKILL.md +363 -0
  25. package/bundled-skills/monte-carlo-push-ingestion/references/anomaly-detection.md +87 -0
  26. package/bundled-skills/monte-carlo-push-ingestion/references/custom-lineage.md +203 -0
  27. package/bundled-skills/monte-carlo-push-ingestion/references/direct-http-api.md +207 -0
  28. package/bundled-skills/monte-carlo-push-ingestion/references/prerequisites.md +150 -0
  29. package/bundled-skills/monte-carlo-push-ingestion/references/push-lineage.md +160 -0
  30. package/bundled-skills/monte-carlo-push-ingestion/references/push-metadata.md +158 -0
  31. package/bundled-skills/monte-carlo-push-ingestion/references/push-query-logs.md +219 -0
  32. package/bundled-skills/monte-carlo-push-ingestion/references/validation.md +257 -0
  33. package/bundled-skills/monte-carlo-push-ingestion/scripts/sample_verify.py +357 -0
  34. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_and_push_lineage.py +70 -0
  35. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_and_push_metadata.py +65 -0
  36. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_and_push_query_logs.py +70 -0
  37. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_lineage.py +214 -0
  38. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_metadata.py +160 -0
  39. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_query_logs.py +164 -0
  40. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/push_lineage.py +198 -0
  41. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/push_metadata.py +193 -0
  42. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/push_query_logs.py +207 -0
  43. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/collect_and_push_metadata.py +71 -0
  44. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/collect_and_push_query_logs.py +64 -0
  45. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/collect_metadata.py +253 -0
  46. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/collect_query_logs.py +149 -0
  47. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/push_metadata.py +190 -0
  48. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/push_query_logs.py +208 -0
  49. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_and_push_lineage.py +83 -0
  50. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_and_push_metadata.py +77 -0
  51. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_and_push_query_logs.py +83 -0
  52. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_lineage.py +240 -0
  53. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_metadata.py +212 -0
  54. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_query_logs.py +204 -0
  55. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/push_lineage.py +192 -0
  56. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/push_metadata.py +178 -0
  57. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/push_query_logs.py +200 -0
  58. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_and_push_lineage.py +119 -0
  59. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_and_push_metadata.py +119 -0
  60. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_and_push_query_logs.py +117 -0
  61. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_lineage.py +265 -0
  62. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_metadata.py +313 -0
  63. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_query_logs.py +284 -0
  64. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/push_lineage.py +309 -0
  65. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/push_metadata.py +245 -0
  66. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/push_query_logs.py +255 -0
  67. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_and_push_lineage.py +78 -0
  68. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_and_push_metadata.py +80 -0
  69. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_and_push_query_logs.py +88 -0
  70. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_lineage.py +235 -0
  71. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_metadata.py +219 -0
  72. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_query_logs.py +239 -0
  73. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/push_lineage.py +178 -0
  74. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/push_metadata.py +178 -0
  75. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/push_query_logs.py +196 -0
  76. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_and_push_lineage.py +154 -0
  77. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_and_push_metadata.py +137 -0
  78. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_and_push_query_logs.py +137 -0
  79. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_lineage.py +349 -0
  80. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_metadata.py +329 -0
  81. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_query_logs.py +254 -0
  82. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/push_lineage.py +307 -0
  83. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/push_metadata.py +228 -0
  84. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/push_query_logs.py +248 -0
  85. package/bundled-skills/monte-carlo-push-ingestion/scripts/test_template_sdk_usage.py +340 -0
  86. package/bundled-skills/monte-carlo-validation-notebook/SKILL.md +685 -0
  87. package/bundled-skills/monte-carlo-validation-notebook/scripts/generate_notebook_url.py +141 -0
  88. package/bundled-skills/monte-carlo-validation-notebook/scripts/resolve_dbt_schema.py +161 -0
  89. package/package.json +1 -1
  90. package/skills_index.json +503 -61
@@ -0,0 +1,363 @@
1
+ ---
2
+ name: monte-carlo-push-ingestion
3
+ description: "Expert guide for pushing metadata, lineage, and query logs to Monte Carlo from any data warehouse."
4
+ category: data
5
+ risk: safe
6
+ source: community
7
+ source_repo: monte-carlo-data/mc-agent-toolkit
8
+ source_type: community
9
+ date_added: "2026-04-08"
10
+ author: monte-carlo-data
11
+ tags: [data-observability, ingestion, monte-carlo, pycarlo, metadata]
12
+ tools: [claude, cursor, codex]
13
+ ---
14
+
15
+ # Monte Carlo Push Ingestion
16
+
17
+ You are an agent that helps customers collect metadata, lineage, and query logs from their
18
+ data warehouses and push that data to Monte Carlo via the push ingestion API. The push model
19
+ works with **any data source** — if the customer's warehouse does not have a ready-made
20
+ template, derive the appropriate collection queries from that warehouse's system catalog or
21
+ metadata APIs. The push format and pycarlo SDK calls are the same regardless of source.
22
+
23
+ Monte Carlo's push model lets customers send metadata, lineage, and query logs directly to
24
+ Monte Carlo instead of waiting for the pull collector to gather it. It fills gaps the pull
25
+ model cannot always cover — integrations that don't expose query history, custom lineage
26
+ between non-warehouse assets, or customers who already have this data and want to send it
27
+ directly.
28
+
29
+ Push data travels through the integration gateway → dedicated Kinesis streams → thin
30
+ adapter/normalizer code → the same downstream systems that power the pull model. The only
31
+ new infrastructure is the ingress layer; everything after it is shared.
32
+
33
+ ## MANDATORY — Always start from templates
34
+
35
+ When generating any push-ingestion script, you MUST:
36
+
37
+ 1. **Read the corresponding template** before writing any code. Templates live in this skill's
38
+ directory under `scripts/templates/<warehouse>/`. To find them, glob for
39
+ `**/*push-ingestion/scripts/templates/<warehouse>/*.py` — this works regardless of where the
40
+ skill is installed. Do NOT search from the current working directory alone.
41
+ 2. **Adapt the template** to the customer's needs — do not write pycarlo imports, model constructors,
42
+ or SDK method calls from memory.
43
+ 3. If no template exists for the target warehouse, read the **Snowflake template** as the canonical
44
+ reference and adapt only the warehouse-specific collection queries.
45
+
46
+ Template files follow this naming pattern:
47
+ - `collect_<flow>.py` — collection only (queries the warehouse, writes a JSON manifest)
48
+ - `push_<flow>.py` — push only (reads the manifest, sends to Monte Carlo)
49
+ - `collect_and_push_<flow>.py` — combined (imports from both, runs in sequence)
50
+
51
+ **After running any push script**, you MUST surface the `invocation_id`(s) returned by the API
52
+ to the user. The invocation ID is the only way to trace pushed data through downstream systems
53
+ and is required for validation. Never let a push complete without showing the user the
54
+ invocation IDs — they need them for `/mc-validate-metadata`, `/mc-validate-lineage`, and
55
+ debugging.
56
+
57
+ ## Canonical pycarlo API — authoritative reference
58
+
59
+ The following imports, classes, and method signatures are the **ONLY** correct pycarlo API for
60
+ push ingestion. If your training data suggests different names, **it is wrong**. Use exactly
61
+ what is listed here.
62
+
63
+ ### Imports and client setup
64
+
65
+ ```python
66
+ from pycarlo.core import Client, Session
67
+ from pycarlo.features.ingestion import IngestionService
68
+ from pycarlo.features.ingestion.models import (
69
+ # Metadata
70
+ RelationalAsset, AssetMetadata, AssetField, AssetVolume, AssetFreshness, Tag,
71
+ # Lineage
72
+ LineageEvent, LineageAssetRef, ColumnLineageField, ColumnLineageSourceField,
73
+ # Query logs
74
+ QueryLogEntry,
75
+ )
76
+
77
+ client = Client(session=Session(mcd_id=key_id, mcd_token=key_token, scope="Ingestion"))
78
+ service = IngestionService(mc_client=client)
79
+ ```
80
+
81
+ ### Method signatures
82
+
83
+ ```python
84
+ # Metadata
85
+ service.send_metadata(resource_uuid=..., resource_type=..., events=[RelationalAsset(...)])
86
+
87
+ # Lineage (table or column)
88
+ service.send_lineage(resource_uuid=..., resource_type=..., events=[LineageEvent(...)])
89
+
90
+ # Query logs — note: log_type, NOT resource_type
91
+ service.send_query_logs(resource_uuid=..., log_type=..., events=[QueryLogEntry(...)])
92
+
93
+ # Extract invocation ID from any response
94
+ service.extract_invocation_id(result)
95
+ ```
96
+
97
+ ### RelationalAsset structure (nested, NOT flat)
98
+
99
+ ```python
100
+ RelationalAsset(
101
+ type="TABLE", # ONLY "TABLE" or "VIEW" (uppercase) — normalize warehouse-native values
102
+ metadata=AssetMetadata(
103
+ name="my_table",
104
+ database="analytics",
105
+ schema="public",
106
+ description="optional description",
107
+ ),
108
+ fields=[
109
+ AssetField(name="id", type="INTEGER", description=None),
110
+ AssetField(name="amount", type="DECIMAL(10,2)"),
111
+ ],
112
+ volume=AssetVolume(row_count=1000000, byte_count=111111111), # optional
113
+ freshness=AssetFreshness(last_update_time="2026-03-12T14:30:00Z"), # optional
114
+ )
115
+ ```
116
+
117
+ ## Environment variable conventions
118
+
119
+ All generated scripts MUST use these exact variable names. Do NOT invent alternatives like
120
+ `MCD_KEY_ID`, `MC_TOKEN`, `MONTE_CARLO_KEY`, etc.
121
+
122
+ | Variable | Purpose | Used by |
123
+ |---|---|---|
124
+ | `MCD_INGEST_ID` | Ingestion key ID (scope=Ingestion) | push scripts |
125
+ | `MCD_INGEST_TOKEN` | Ingestion key secret | push scripts |
126
+ | `MCD_ID` | GraphQL API key ID | verification scripts |
127
+ | `MCD_TOKEN` | GraphQL API key secret | verification scripts |
128
+ | `MCD_RESOURCE_UUID` | Warehouse resource UUID | all scripts |
129
+
130
+ ## What this skill can build for you
131
+
132
+ Tell Claude your warehouse or data platform and Monte Carlo resource UUID and this skill will
133
+ generate a ready-to-run Python script that:
134
+ - Connects to your warehouse using the idiomatic driver for that platform
135
+ - Discovers databases, schemas, and tables
136
+ - Extracts the right columns — names, types, row counts, byte counts, last modified time, descriptions
137
+ - Builds the correct pycarlo `RelationalAsset`, `LineageEvent`, or `QueryLogEntry` objects
138
+ - Pushes to Monte Carlo and saves an output manifest with the `invocation_id` for tracing
139
+
140
+ Templates are available for common warehouses (Snowflake, BigQuery, BigQuery Iceberg,
141
+ Databricks, Redshift, Hive). For any other platform, Claude will derive the appropriate
142
+ collection queries from the warehouse's system catalog or metadata APIs and generate an
143
+ equivalent script.
144
+
145
+ ### Ready-to-run examples
146
+
147
+ Production-ready example scripts built from these templates are published in the
148
+ [mcd-public-resources](https://github.com/monte-carlo-data/mcd-public-resources) repo:
149
+
150
+ - **[BigQuery Iceberg (BigLake) tables](https://github.com/monte-carlo-data/mcd-public-resources/tree/main/examples/push-ingestion/bigquery/push-iceberg-tables)** —
151
+ metadata and query log collection for BigQuery Iceberg tables that are invisible to Monte
152
+ Carlo's standard pull collector (which uses `__TABLES__`). Includes a `--only-freshness-and-volume`
153
+ flag for fast periodic pushes that skip the schema/fields query — useful for hourly cron jobs
154
+ after the initial full metadata push.
155
+
156
+ ## Reference docs — when to load
157
+
158
+ | Reference file | Load when… |
159
+ |---|---|
160
+ | `references/prerequisites.md` | Customer is setting up for the first time, has auth errors, or needs help creating API keys |
161
+ | `references/push-metadata.md` | Building or debugging a metadata collection script |
162
+ | `references/push-lineage.md` | Building or debugging a lineage collection script |
163
+ | `references/push-query-logs.md` | Building or debugging a query log collection script |
164
+ | `references/custom-lineage.md` | Customer needs custom lineage nodes or edges via GraphQL |
165
+ | `references/validation.md` | Verifying pushed data, running GraphQL checks, or deleting push-ingested tables |
166
+ | `references/direct-http-api.md` | Customer wants to call push APIs directly via curl/HTTP without pycarlo |
167
+ | `references/anomaly-detection.md` | Customer asks why freshness or volume detectors aren't firing |
168
+
169
+ ## Prerequisites — read this first
170
+
171
+ → Load `references/prerequisites.md`
172
+
173
+ Two separate API keys are required. This is the most common setup stumbling block:
174
+ - **Ingestion key** (scope=Ingestion) — for pushing data
175
+ - **GraphQL API key** — for verification queries
176
+
177
+ Both use the same `x-mcd-id` / `x-mcd-token` headers but point to different endpoints.
178
+
179
+ ## What you can push
180
+
181
+ | Flow | pycarlo method | Push endpoint | Type field | Expiration |
182
+ |---|---|---|---|---|
183
+ | Table metadata | `send_metadata()` | `/ingest/v1/metadata` | `resource_type` (e.g. `"data-lake"`) | **Never expires** |
184
+ | Table lineage | `send_lineage()` | `/ingest/v1/lineage` | `resource_type` (same as metadata) | **Never expires** |
185
+ | Column lineage | `send_lineage()` (events include `fields`) | `/ingest/v1/lineage` | `resource_type` (same as metadata) | **Expires after 10 days** |
186
+ | Query logs | `send_query_logs()` | `/ingest/v1/querylogs` | **`log_type`** (not `resource_type`!) | Same as pulled |
187
+ | Custom lineage | GraphQL mutations | `api.getmontecarlo.com/graphql` | N/A — uses GraphQL API key | 7 days default; set `expireAt: "9999-12-31"` for permanent |
188
+
189
+ **Important**: Query logs use `log_type` instead of `resource_type`. This is the only push
190
+ endpoint where the field name differs. See `references/push-query-logs.md` for the full list
191
+ of supported `log_type` values.
192
+
193
+ The pycarlo SDK is optional — you can also call the push APIs directly via HTTP/curl. See
194
+ `references/direct-http-api.md` for examples.
195
+
196
+ Every push returns an `invocation_id` — save it. It is your primary debugging handle across
197
+ all downstream systems.
198
+
199
+ ## Step 1 — Generate your collection scripts
200
+
201
+ Ask Claude to build the script for your warehouse:
202
+
203
+ > "Build me a metadata collection script for Snowflake. My MC resource UUID is `abc-123`."
204
+
205
+ The script templates in `**/*push-ingestion/scripts/templates/` (Snowflake, BigQuery, BigQuery Iceberg, Databricks, Redshift, Hive)
206
+ are the **mandatory starting point** for script generation — they contain the correct pycarlo
207
+ imports, model constructors, and SDK calls. **They are not an exhaustive list.** If the
208
+ customer's warehouse is not listed, use the templates as a guide and determine the appropriate
209
+ queries or file-collection approach for their platform. For file-based sources (like Hive
210
+ Metastore logs), provide the command to retrieve the file, parse it, and transform it into the
211
+ format required by the push APIs. The push format and SDK calls are identical regardless of
212
+ source; only the collection queries change.
213
+
214
+ **Batching**: For large payloads, split events into batches. Use a batch size of **50 assets**
215
+ per push call. The pycarlo HTTP client has a hardcoded 10-second read timeout that cannot be
216
+ overridden (`Session` and `Client` do not accept a `timeout` parameter) — larger batches (200+)
217
+ will time out on warehouses with thousands of tables. The compressed request body must also not
218
+ exceed **1MB** (Kinesis limit). All push endpoints support batching.
219
+
220
+ **Push frequency**: Push at most **once per hour**. Sub-hourly pushes produce unpredictable
221
+ anomaly detector behavior because the training pipeline aggregates into hourly buckets.
222
+
223
+ **Per flow, see:**
224
+ - Metadata (schema + volume + freshness): `references/push-metadata.md`
225
+ - Table and column lineage: `references/push-lineage.md`
226
+ - Query logs: `references/push-query-logs.md`
227
+
228
+ ## Step 2 — Validate pushed data
229
+
230
+ After pushing, verify data is visible in Monte Carlo using the GraphQL API (GraphQL API key).
231
+
232
+ → `references/validation.md` — all verification queries (getTable, getMetricsV4,
233
+ getTableLineage, getDerivedTablesPartialLineage, getAggregatedQueries)
234
+
235
+ Timing expectations:
236
+ - **Metadata**: visible within a few minutes
237
+ - **Table lineage**: visible within seconds to a few minutes (fast direct path to Neo4j)
238
+ - **Column lineage**: a few minutes
239
+ - **Query logs**: at least **15-20 minutes** (async processing pipeline)
240
+
241
+ ## Step 3 — Anomaly detection (optional)
242
+
243
+ If you want Monte Carlo's freshness and volume detectors to fire on pushed data, you need to
244
+ push consistently over time — detectors require historical data to train.
245
+
246
+ → `references/anomaly-detection.md` — recommended push frequency, minimum samples,
247
+ training windows, and what to tell customers who ask why detectors aren't activating
248
+
249
+ ## Custom lineage nodes and edges
250
+
251
+ For non-warehouse assets (dbt models, Airflow DAGs, custom ETL pipelines) or cross-resource
252
+ lineage, use the GraphQL mutations directly:
253
+
254
+ → `references/custom-lineage.md` — `createOrUpdateLineageNode`, `createOrUpdateLineageEdge`,
255
+ `deleteLineageNode`, and the critical `expireAt: "9999-12-31"` rule
256
+
257
+ ## Deleting push-ingested tables
258
+
259
+ Push tables are excluded from the normal pull-based deletion flow (intentionally). To delete
260
+ them explicitly, use `deletePushIngestedTables` — covered in `references/validation.md`
261
+ under "Table management operations".
262
+
263
+ ## Available slash commands
264
+
265
+ Customers can invoke these explicitly instead of describing their intent in prose:
266
+
267
+ | Command | Purpose |
268
+ |---|---|
269
+ | `/mc-build-metadata-collector` | Generate a metadata collection script |
270
+ | `/mc-build-lineage-collector` | Generate a lineage collection script |
271
+ | `/mc-build-query-log-collector` | Generate a query log collection script |
272
+ | `/mc-validate-metadata` | Verify pushed metadata via the GraphQL API |
273
+ | `/mc-validate-lineage` | Verify pushed lineage via the GraphQL API |
274
+ | `/mc-validate-query-logs` | Verify pushed query logs via the GraphQL API |
275
+ | `/mc-create-lineage-node` | Create a custom lineage node |
276
+ | `/mc-create-lineage-edge` | Create a custom lineage edge |
277
+ | `/mc-delete-lineage-node` | Delete a custom lineage node |
278
+ | `/mc-delete-push-tables` | Delete push-ingested tables |
279
+
280
+ ## Debugging checkpoints
281
+
282
+ When pushed data isn't appearing, work through these five checkpoints in order:
283
+
284
+ 1. **Did the SDK return a `202` and an `invocation_id`?**
285
+ If not, the gateway rejected the request — check auth headers and `resource.uuid`.
286
+
287
+ 2. **Is the integration key the right type?**
288
+ Must be scope `Ingestion`, created via `montecarlo integrations create-key --scope Ingestion`.
289
+ A standard GraphQL API key will not work for push.
290
+
291
+ 3. **Is `resource.uuid` correct and authorized?**
292
+ The key can be scoped to specific warehouse UUIDs. If the UUID doesn't match, you get `403`.
293
+
294
+ 4. **Did the normalizer process it?**
295
+ Use the `invocation_id` to search CloudWatch logs for the relevant Lambda. For query logs,
296
+ check the `log_type` — Hive requires `"hive-s3"`, not `"hive"`.
297
+
298
+ 5. **Did the downstream system pick it up?**
299
+ - Metadata: query `getTable` in GraphQL
300
+ - Table lineage: check Neo4j within seconds–minutes (fast path via PushLineageProcessor)
301
+ - Query logs: wait at least 15-20 minutes; check `getAggregatedQueries`
302
+
303
+ ## Known gotchas
304
+
305
+ - **`log_type` vs `resource_type`**: metadata and lineage use `resource_type` (e.g. `"data-lake"`);
306
+ query logs use **`log_type`** — the only endpoint where the field name differs. Wrong value →
307
+ `Unsupported ingest query-log log_type` error.
308
+ - **`invocation_id` must be saved**: every output manifest should include it — it's your
309
+ only tracing handle once the request leaves the SDK.
310
+ - **Query log async delay**: at least 15-20 minutes. `getAggregatedQueries` will return 0 until
311
+ processing completes — this is expected, not a bug.
312
+ - **Custom lineage `expireAt` defaults to 7 days**: nodes vanish silently unless you set
313
+ `expireAt: "9999-12-31"` for permanent nodes.
314
+ - **Push tables are never auto-deleted**: the periodic cleanup job excludes them by default
315
+ (`exclude_push_tables=True`). Delete them explicitly via `deletePushIngestedTables` (max
316
+ 1,000 MCONs per call; also deletes lineage nodes and all edges touching those nodes).
317
+ - **Anomaly detectors need history**: pushing once is not enough. Freshness needs 7+ pushes
318
+ over ~2 weeks; volume needs 10–48 samples over ~42 days. Push at most once per hour.
319
+ - **Batching required for large payloads**: the compressed request body must not exceed 1MB.
320
+ Split large event lists into batches.
321
+ - **Column lineage expires after 10 days**: unlike table metadata and table lineage (which
322
+ never expire), column lineage has a 10-day TTL, same as pulled column lineage.
323
+ - **Quote SQL identifiers in warehouse queries**: database, schema, and table names must be
324
+ quoted to handle mixed-case or special characters. The quoting syntax varies by warehouse —
325
+ Snowflake and Redshift use double quotes (`"{db}"`), BigQuery/Databricks/Hive use backticks
326
+ (`` `{db}` ``). The templates already handle this correctly for each warehouse — follow the
327
+ same quoting pattern when adapting.
328
+
329
+ ## Memory safety
330
+
331
+ Generated scripts must include a startup memory check. The collection phase loads query history
332
+ rows into memory for parsing — on large warehouses with long lookback windows, this can exhaust
333
+ available RAM and cause the process to be silently killed (SIGKILL / exit 137) with no traceback.
334
+
335
+ Add this pattern near the top of every generated script, after imports:
336
+
337
+ ```python
338
+ import os
339
+
340
+ def _check_available_memory(min_gb: float = 2.0) -> None:
341
+ """Warn if available memory is below the threshold."""
342
+ try:
343
+ if hasattr(os, "sysconf"): # Linux / macOS
344
+ page_size = os.sysconf("SC_PAGE_SIZE")
345
+ avail_pages = os.sysconf("SC_AVPHYS_PAGES")
346
+ avail_gb = (page_size * avail_pages) / (1024 ** 3)
347
+ else:
348
+ return # Windows — skip check
349
+ except (ValueError, OSError):
350
+ return
351
+ if avail_gb < min_gb:
352
+ print(
353
+ f"WARNING: Only {avail_gb:.1f} GB of memory available "
354
+ f"(minimum recommended: {min_gb:.1f} GB). "
355
+ f"Consider reducing the lookback window or increasing available memory."
356
+ )
357
+ ```
358
+
359
+ Call `_check_available_memory()` before connecting to the warehouse.
360
+
361
+ Additionally, when fetching query history:
362
+ - Use `cursor.fetchmany(batch_size)` in a loop instead of `cursor.fetchall()` when possible
363
+ - For very large result sets, consider adding a LIMIT clause and processing in windows
@@ -0,0 +1,87 @@
1
+ # Anomaly Detection for Push-Ingested Data
2
+
3
+ Push volume and freshness data feeds the same anomaly detectors as the pull model.
4
+ The detectors don't activate immediately — they need enough historical data to learn
5
+ expected behavior before they can alert on deviations.
6
+
7
+ ## Recommended push frequency: hourly
8
+
9
+ - Push at most **once per hour** — pushing more frequently produces unpredictable detector
10
+ behavior because the training pipeline aggregates data into hourly buckets
11
+ - Push **consistently** — gaps of more than a few days delay activation or deactivate
12
+ previously-active detectors
13
+
14
+ ## Freshness detector
15
+
16
+ The freshness detector learns how often a table is updated and fires when it has not been
17
+ updated for longer than expected.
18
+
19
+ **What it trains on**: consecutive differences (`delta_sec`) between `last_update_time`
20
+ values across pushes. A push only counts if `last_update_time` actually changed.
21
+
22
+ **Requirements to activate:**
23
+ | Requirement | Value |
24
+ |---|---|
25
+ | Minimum samples | 7 pushes where `last_update_time` changed (or coverage ≥ 0.8 for slow tables) |
26
+ | Minimum coverage | 0.15 (= `median_update_secs × n_samples / 22 days`) |
27
+ | Training window | 35 days |
28
+ | Supported update cycle | 5 minutes – 7.7 days |
29
+ | Minimum table age | ~14 days on older warehouses |
30
+
31
+ **Deactivation triggers:**
32
+ - No push for **14 days** → `"no recent data"`
33
+ - Gap > 7 days in last 14 days, for fast tables (median update ≤ 26.4 hours) → `"gap of over a week in last 2 weeks"`
34
+
35
+ ## Volume detector (Volume Change + Unchanged Size)
36
+
37
+ Detects unexpected spikes/drops in row count or byte count.
38
+
39
+ **Requirements to activate:**
40
+ | Requirement | Value |
41
+ |---|---|
42
+ | Minimum samples (daily) | 10 |
43
+ | Minimum samples (subdaily, ~12x/day) | 48 |
44
+ | Minimum samples (weekly) | 5 |
45
+ | Minimum coverage | 0.30 (= `N × median_update_secs / 42 days`) |
46
+ | Training window | 42 days |
47
+ | Minimum table age | 5 days |
48
+ | Regularity check | 75th/25th percentile of update intervals ≥ 0.2 |
49
+
50
+ **Deactivation**: No hard gap limit, but coverage degrades as the 42-day window advances
51
+ without new data. Eventually drops below 0.3 and deactivates.
52
+
53
+ ## Summary table
54
+
55
+ | | Freshness | Volume Change / Unchanged Size |
56
+ |---|---|---|
57
+ | Recommended frequency | Hourly | Hourly |
58
+ | Maximum frequency | Once per hour | Once per hour |
59
+ | Training window | 35 days | 42 days |
60
+ | Minimum samples | 7 | 10 (daily) / 48 (subdaily) / 5 (weekly) |
61
+ | Minimum coverage | 0.15 | 0.30 |
62
+ | Hard deactivation gap | 14 days | No (coverage degrades) |
63
+ | Fast-table gap warning | 7 days in last 14 | N/A |
64
+
65
+ ## What to tell customers
66
+
67
+ When a customer asks "why isn't my anomaly detection working?":
68
+
69
+ 1. **Check detector status** in the MC UI or via GraphQL (`getTable.thresholds.freshness.status`).
70
+ A `"training"` status means not enough data yet. `"inactive"` means a deactivation
71
+ condition was hit — check the reason code.
72
+
73
+ 2. **Verify push frequency** — are they pushing about once per hour, and never more often? Both too-fast and
74
+ too-slow rates cause problems.
75
+
76
+ 3. **Verify that `last_update_time` changes** — for freshness to accumulate training samples,
77
+ each push must carry a *different* `last_update_time` than the previous one. If the table
78
+ hasn't actually updated, the push still arrives but doesn't advance the sample count.
79
+
80
+ 4. **Set realistic expectations** — freshness detectors need about 1–2 weeks of hourly pushes.
81
+ Volume detectors need 10+ days for daily tables, up to 42 days for subdaily tables.
82
+ Anomaly detection is not instant.
83
+
84
+ 5. **Don't pause pushes and then resume** — if a customer pauses pushes for a week and then
85
+ resumes, the freshness detector may deactivate. They should keep pushing even when the
86
+ table hasn't changed (just repeat the same `last_update_time`) to maintain coverage,
87
+ even though that specific push won't count as a new freshness sample.
@@ -0,0 +1,203 @@
1
+ # Custom Lineage Nodes and Edges
2
+
3
+ ## When to use this
4
+
5
+ The `send_lineage()` pycarlo method is the right choice for warehouse tables you own.
6
+ The **GraphQL mutations** in this document are for:
7
+ - Non-warehouse assets: dbt models, Airflow DAGs, Fivetran connectors, custom ETL jobs
8
+ - Connecting nodes across different MC resources (warehouses)
9
+ - One-off lineage corrections not tied to a collector run
10
+ - Fine-grained control over node properties, object types, and expiry
11
+
12
+ All mutations use the **GraphQL API key** (not the Ingestion key) and the endpoint
13
+ `https://api.getmontecarlo.com/graphql`.
14
+
15
+ ## Critical: expireAt
16
+
17
+ If you don't set `expireAt`, nodes and edges expire after **7 days** and vanish from the
18
+ lineage graph silently. For any node or edge that should persist:
19
+
20
+ ```
21
+ expireAt: "9999-12-31"
22
+ ```
23
+
24
+ This is the same value that `PushLineageProcessor` uses internally for all push-ingested
25
+ lineage. Forgetting this is the most common cause of "my lineage disappeared after a week".
26
+
27
+ ---
28
+
29
+ ## createOrUpdateLineageNode
30
+
31
+ Creates or updates a node in the lineage graph. If a node with the same
32
+ `objectType` + `objectId` + `resourceId` already exists, it is updated.
33
+
34
+ ```graphql
35
+ mutation CreateOrUpdateLineageNode(
36
+ $objectType: String!
37
+ $objectId: String!
38
+ $resourceId: UUID
39
+ $resourceName: String
40
+ $name: String
41
+ $properties: [ObjectPropertyInput]
42
+ $expireAt: DateTime
43
+ ) {
44
+ createOrUpdateLineageNode(
45
+ objectType: $objectType
46
+ objectId: $objectId
47
+ resourceId: $resourceId
48
+ resourceName: $resourceName
49
+ name: $name
50
+ properties: $properties
51
+ expireAt: $expireAt
52
+ ) {
53
+ node {
54
+ mcon
55
+ displayName
56
+ objectType
57
+ isCustom
58
+ expireAt
59
+ }
60
+ }
61
+ }
62
+ ```
63
+
64
+ **Variables:**
65
+ ```json
66
+ {
67
+ "objectType": "table",
68
+ "objectId": "analytics:analytics.orders",
69
+ "resourceId": "<warehouse-uuid>",
70
+ "name": "orders",
71
+ "expireAt": "9999-12-31"
72
+ }
73
+ ```
74
+
75
+ `objectType` can be any string — common values: `"table"`, `"view"`, `"report"`,
76
+ `"dashboard"`, `"job"`, `"model"`.
77
+
78
+ `objectId` should be a stable unique identifier for the asset within the resource.
79
+ For tables, use the `fullTableId` format: `database:schema.table`.
80
+
81
+ The returned `mcon` is the stable MC identifier for this node — save it if you plan to
82
+ reference it in edges or deletions.
83
+
84
+ ---
85
+
86
+ ## createOrUpdateLineageEdge
87
+
88
+ Creates or updates a directed edge: source → destination (default `edgeType`: `IS_DOWNSTREAM`).
89
+
90
+ ```graphql
91
+ mutation CreateOrUpdateLineageEdge(
92
+ $source: NodeInput!
93
+ $destination: NodeInput!
94
+ $expireAt: DateTime
95
+ $edgeType: EdgeType
96
+ ) {
97
+ createOrUpdateLineageEdge(
98
+ source: $source
99
+ destination: $destination
100
+ expireAt: $expireAt
101
+ edgeType: $edgeType
102
+ ) {
103
+ edge {
104
+ source { mcon displayName objectType }
105
+ destination { mcon displayName objectType }
106
+ isCustom
107
+ expireAt
108
+ }
109
+ }
110
+ }
111
+ ```
112
+
113
+ `NodeInput` shape:
114
+ ```json
115
+ {
116
+ "objectType": "table",
117
+ "objectId": "analytics:analytics.orders",
118
+ "resourceId": "<warehouse-uuid>"
119
+ }
120
+ ```
121
+
122
+ **Full example — dbt model → warehouse table:**
123
+ ```json
124
+ {
125
+ "source": {
126
+ "objectType": "model",
127
+ "objectId": "dbt://my_project/models/staging/stg_orders",
128
+ "resourceName": "dbt-production"
129
+ },
130
+ "destination": {
131
+ "objectType": "table",
132
+ "objectId": "analytics:analytics.orders",
133
+ "resourceId": "<snowflake-warehouse-uuid>"
134
+ },
135
+ "expireAt": "9999-12-31",
136
+ "edgeType": "IS_DOWNSTREAM"
137
+ }
138
+ ```
139
+
140
+ ---
141
+
142
+ ## deleteLineageNode
143
+
144
+ Deletes a node and **all its edges and objects**. This is irreversible.
145
+
146
+ ```graphql
147
+ mutation DeleteLineageNode($mcon: String!) {
148
+ deleteLineageNode(mcon: $mcon) {
149
+ objectsDeleted
150
+ nodesDeleted
151
+ edgesDeleted
152
+ }
153
+ }
154
+ ```
155
+
156
+ Get the MCON from the `createOrUpdateLineageNode` response, or look it up with a `getTable` query:
157
+ ```graphql
158
+ query {
159
+ getTable(fullTableId: "analytics:analytics.orders", dwId: "<warehouse-uuid>") {
160
+ mcon
161
+ }
162
+ }
163
+ ```
164
+
165
+ ---
166
+
167
+ ## Python helper for all three mutations
168
+
169
+ ```python
170
+ import requests
171
+
172
+ GRAPHQL_URL = "https://api.getmontecarlo.com/graphql"
173
+ HEADERS = {
174
+ "x-mcd-id": "<graphql-api-key-id>",
175
+ "x-mcd-token": "<graphql-api-key-secret>",
176
+ "Content-Type": "application/json",
177
+ }
178
+
179
+ def run_mutation(query: str, variables: dict) -> dict:
180
+ resp = requests.post(GRAPHQL_URL, json={"query": query, "variables": variables}, headers=HEADERS)
181
+ resp.raise_for_status()
182
+ data = resp.json()
183
+ if "errors" in data:
184
+ raise RuntimeError(data["errors"])
185
+ return data["data"]
186
+
187
+ # Example: create a permanent node
188
+ result = run_mutation(
189
+ """mutation($objectType: String!, $objectId: String!, $resourceId: UUID, $expireAt: DateTime) {
190
+ createOrUpdateLineageNode(objectType: $objectType, objectId: $objectId,
191
+ resourceId: $resourceId, expireAt: $expireAt) {
192
+ node { mcon displayName }
193
+ }
194
+ }""",
195
+ {
196
+ "objectType": "table",
197
+ "objectId": "analytics:analytics.orders",
198
+ "resourceId": "<warehouse-uuid>",
199
+ "expireAt": "9999-12-31",
200
+ }
201
+ )
202
+ print("MCON:", result["createOrUpdateLineageNode"]["node"]["mcon"])
203
+ ```