coalesce-transform-mcp 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +304 -0
  3. package/dist/cache-dir.d.ts +26 -0
  4. package/dist/cache-dir.js +106 -0
  5. package/dist/client.d.ts +25 -0
  6. package/dist/client.js +212 -0
  7. package/dist/coalesce/api/environments.d.ts +20 -0
  8. package/dist/coalesce/api/environments.js +15 -0
  9. package/dist/coalesce/api/git-accounts.d.ts +21 -0
  10. package/dist/coalesce/api/git-accounts.js +21 -0
  11. package/dist/coalesce/api/jobs.d.ts +25 -0
  12. package/dist/coalesce/api/jobs.js +21 -0
  13. package/dist/coalesce/api/nodes.d.ts +29 -0
  14. package/dist/coalesce/api/nodes.js +33 -0
  15. package/dist/coalesce/api/projects.d.ts +22 -0
  16. package/dist/coalesce/api/projects.js +25 -0
  17. package/dist/coalesce/api/runs.d.ts +19 -0
  18. package/dist/coalesce/api/runs.js +34 -0
  19. package/dist/coalesce/api/subgraphs.d.ts +20 -0
  20. package/dist/coalesce/api/subgraphs.js +17 -0
  21. package/dist/coalesce/api/users.d.ts +30 -0
  22. package/dist/coalesce/api/users.js +31 -0
  23. package/dist/coalesce/types.d.ts +298 -0
  24. package/dist/coalesce/types.js +746 -0
  25. package/dist/generated/.gitkeep +0 -0
  26. package/dist/generated/node-type-corpus.json +42656 -0
  27. package/dist/index.d.ts +2 -0
  28. package/dist/index.js +10 -0
  29. package/dist/mcp/cache.d.ts +3 -0
  30. package/dist/mcp/cache.js +137 -0
  31. package/dist/mcp/environments.d.ts +3 -0
  32. package/dist/mcp/environments.js +61 -0
  33. package/dist/mcp/git-accounts.d.ts +3 -0
  34. package/dist/mcp/git-accounts.js +70 -0
  35. package/dist/mcp/jobs.d.ts +3 -0
  36. package/dist/mcp/jobs.js +77 -0
  37. package/dist/mcp/node-type-corpus.d.ts +3 -0
  38. package/dist/mcp/node-type-corpus.js +173 -0
  39. package/dist/mcp/nodes.d.ts +3 -0
  40. package/dist/mcp/nodes.js +341 -0
  41. package/dist/mcp/pipelines.d.ts +3 -0
  42. package/dist/mcp/pipelines.js +342 -0
  43. package/dist/mcp/projects.d.ts +3 -0
  44. package/dist/mcp/projects.js +70 -0
  45. package/dist/mcp/repo-node-types.d.ts +135 -0
  46. package/dist/mcp/repo-node-types.js +387 -0
  47. package/dist/mcp/runs.d.ts +3 -0
  48. package/dist/mcp/runs.js +92 -0
  49. package/dist/mcp/subgraphs.d.ts +3 -0
  50. package/dist/mcp/subgraphs.js +60 -0
  51. package/dist/mcp/users.d.ts +3 -0
  52. package/dist/mcp/users.js +107 -0
  53. package/dist/prompts/index.d.ts +2 -0
  54. package/dist/prompts/index.js +58 -0
  55. package/dist/resources/context/aggregation-patterns.md +145 -0
  56. package/dist/resources/context/data-engineering-principles.md +183 -0
  57. package/dist/resources/context/hydrated-metadata.md +92 -0
  58. package/dist/resources/context/id-discovery.md +64 -0
  59. package/dist/resources/context/intelligent-node-configuration.md +162 -0
  60. package/dist/resources/context/node-creation-decision-tree.md +156 -0
  61. package/dist/resources/context/node-operations.md +316 -0
  62. package/dist/resources/context/node-payloads.md +114 -0
  63. package/dist/resources/context/node-type-corpus.md +166 -0
  64. package/dist/resources/context/node-type-selection-guide.md +96 -0
  65. package/dist/resources/context/overview.md +135 -0
  66. package/dist/resources/context/pipeline-workflows.md +355 -0
  67. package/dist/resources/context/run-operations.md +55 -0
  68. package/dist/resources/context/sql-bigquery.md +41 -0
  69. package/dist/resources/context/sql-databricks.md +40 -0
  70. package/dist/resources/context/sql-platform-selection.md +70 -0
  71. package/dist/resources/context/sql-snowflake.md +43 -0
  72. package/dist/resources/context/storage-mappings.md +49 -0
  73. package/dist/resources/context/tool-usage.md +98 -0
  74. package/dist/resources/index.d.ts +5 -0
  75. package/dist/resources/index.js +254 -0
  76. package/dist/schemas/node-payloads.d.ts +5019 -0
  77. package/dist/schemas/node-payloads.js +147 -0
  78. package/dist/server.d.ts +7 -0
  79. package/dist/server.js +63 -0
  80. package/dist/services/cache/snapshots.d.ts +108 -0
  81. package/dist/services/cache/snapshots.js +275 -0
  82. package/dist/services/config/context-analyzer.d.ts +14 -0
  83. package/dist/services/config/context-analyzer.js +76 -0
  84. package/dist/services/config/field-classifier.d.ts +23 -0
  85. package/dist/services/config/field-classifier.js +47 -0
  86. package/dist/services/config/intelligent.d.ts +55 -0
  87. package/dist/services/config/intelligent.js +306 -0
  88. package/dist/services/config/rules.d.ts +6 -0
  89. package/dist/services/config/rules.js +44 -0
  90. package/dist/services/config/schema-resolver.d.ts +18 -0
  91. package/dist/services/config/schema-resolver.js +80 -0
  92. package/dist/services/corpus/loader.d.ts +56 -0
  93. package/dist/services/corpus/loader.js +25 -0
  94. package/dist/services/corpus/search.d.ts +49 -0
  95. package/dist/services/corpus/search.js +69 -0
  96. package/dist/services/corpus/templates.d.ts +4 -0
  97. package/dist/services/corpus/templates.js +11 -0
  98. package/dist/services/pipelines/execution.d.ts +20 -0
  99. package/dist/services/pipelines/execution.js +290 -0
  100. package/dist/services/pipelines/node-type-intent.d.ts +96 -0
  101. package/dist/services/pipelines/node-type-intent.js +356 -0
  102. package/dist/services/pipelines/node-type-selection.d.ts +66 -0
  103. package/dist/services/pipelines/node-type-selection.js +758 -0
  104. package/dist/services/pipelines/planning.d.ts +543 -0
  105. package/dist/services/pipelines/planning.js +1839 -0
  106. package/dist/services/policies/sql-override.d.ts +7 -0
  107. package/dist/services/policies/sql-override.js +109 -0
  108. package/dist/services/repo/operations.d.ts +6 -0
  109. package/dist/services/repo/operations.js +10 -0
  110. package/dist/services/repo/parser.d.ts +70 -0
  111. package/dist/services/repo/parser.js +365 -0
  112. package/dist/services/repo/path.d.ts +2 -0
  113. package/dist/services/repo/path.js +58 -0
  114. package/dist/services/templates/nodes.d.ts +50 -0
  115. package/dist/services/templates/nodes.js +336 -0
  116. package/dist/services/workspace/analysis.d.ts +56 -0
  117. package/dist/services/workspace/analysis.js +151 -0
  118. package/dist/services/workspace/mutations.d.ts +150 -0
  119. package/dist/services/workspace/mutations.js +1718 -0
  120. package/dist/utils.d.ts +5 -0
  121. package/dist/utils.js +7 -0
  122. package/dist/workflows/get-environment-overview.d.ts +9 -0
  123. package/dist/workflows/get-environment-overview.js +23 -0
  124. package/dist/workflows/get-run-details.d.ts +10 -0
  125. package/dist/workflows/get-run-details.js +28 -0
  126. package/dist/workflows/progress.d.ts +20 -0
  127. package/dist/workflows/progress.js +54 -0
  128. package/dist/workflows/retry-and-wait.d.ts +13 -0
  129. package/dist/workflows/retry-and-wait.js +139 -0
  130. package/dist/workflows/run-and-wait.d.ts +13 -0
  131. package/dist/workflows/run-and-wait.js +141 -0
  132. package/dist/workflows/run-status.d.ts +10 -0
  133. package/dist/workflows/run-status.js +27 -0
  134. package/package.json +34 -0
@@ -0,0 +1,58 @@
1
/**
 * Static catalogue of the guidance prompts this module registers.
 *
 * Each entry carries the prompt's registration name, the title/description
 * passed as its metadata, and the single block of instruction text the
 * prompt handler returns. Entries are registered in array order.
 */
const PROMPT_DEFINITIONS = [
    {
        name: "coalesce-start-here",
        title: "Coalesce Start Here",
        description: "Discover projects, workspaces, environments, jobs, and node IDs before calling mutating tools.",
        text: "Start with discovery before mutation. Use list-projects(includeWorkspaces=true) or get-project(includeWorkspaces=true) to resolve workspace IDs, list-environments for environment IDs, list-jobs for job IDs, and list-workspace-nodes or get-workspace-node before editing node bodies. Read coalesce://context/id-discovery and coalesce://context/tool-usage for the detailed lookup patterns.",
    },
    {
        name: "safe-pipeline-planning",
        title: "Safe Pipeline Planning",
        description: "Planner-first pipeline workflow, including review and approval before any workspace mutation.",
        text: "Always call plan-pipeline before create-pipeline-from-plan or create-pipeline-from-sql. If the planner returns status needs_clarification, stop and address openQuestions and warnings first. If it returns status ready, present the planned nodes, exact nodeType values, transforms, and filters to the user and wait for explicit approval before creating anything. Review coalesce://context/pipeline-workflows and coalesce://context/tool-usage for the mandatory planner-first sequence.",
    },
    {
        name: "run-operations-guide",
        title: "Run Operations Guide",
        description: "Choose the right run helper and interpret run statuses, results, warnings, and timeouts correctly.",
        text: "Use run-and-wait when the user wants a final outcome in one call, retry-and-wait for immediate reruns of failed runs, run-status for live scheduler polling, and get-run-details when you need metadata plus results together. Treat waitingToRun and running as non-terminal, and completed, failed, and canceled as terminal. Inspect validation, warning, resultsError, incomplete, and timedOut fields before reporting success. See coalesce://context/run-operations for the full lifecycle.",
    },
    {
        name: "large-result-handling",
        title: "Large Result Handling",
        description: "Use cache tools and coalesce://cache resource URIs when payloads are too large to return inline.",
        text: "Large JSON responses may be returned as cache metadata with a coalesce://cache/... resource URI instead of full inline payloads. Read the referenced resource rather than assuming the JSON is embedded in the tool result. When you know a large snapshot is needed, prefer explicit cache tools like cache-workspace-nodes, cache-environment-nodes, cache-runs, or cache-org-users so the artifact can be reused. See coalesce://context/tool-usage for paging and cache-handling guidance.",
    },
];

/**
 * Build the handler for a prompt that answers with one user-role text message.
 *
 * Centralising the envelope here keeps every prompt's response shape
 * identical. A fresh result object is built on every invocation, so callers
 * may mutate what they receive without affecting later calls.
 *
 * @param {string} text - Instruction text placed in the message content.
 * @returns {() => Promise<{messages: Array<{role: string, content: {type: string, text: string}}>}>}
 */
function singleUserTextPrompt(text) {
    return async () => ({
        messages: [
            {
                role: "user",
                content: {
                    type: "text",
                    text,
                },
            },
        ],
    });
}

/**
 * Register every guidance prompt from PROMPT_DEFINITIONS on the given server.
 *
 * For each definition this calls
 * `server.registerPrompt(name, { title, description }, handler)`, in
 * catalogue order. The handler takes no arguments and resolves to a single
 * user-role text message.
 *
 * @param {{registerPrompt: Function}} server - Object exposing `registerPrompt`
 *   (presumably an MCP server instance; only `registerPrompt` is used here).
 * @returns {void}
 */
export function registerPrompts(server) {
    for (const { name, title, description, text } of PROMPT_DEFINITIONS) {
        server.registerPrompt(name, { title, description }, singleUserTextPrompt(text));
    }
}
@@ -0,0 +1,145 @@
1
+ # Aggregation and GROUP BY Patterns
2
+
3
+ ## Automatic JOIN ON Generation
4
+
5
+ When creating or converting multi-predecessor nodes, the system analyzes common columns between predecessors and generates JOIN ON clauses.
6
+
7
+ 1. **Column Analysis**: Compares columns from each predecessor pair
8
+ 2. **Normalization**: Case-insensitive column name matching
9
+ 3. **SQL Generation**: Produces FROM/JOIN/ON clauses
10
+
11
+ Example: predecessors ORDERS (ORDER_ID, CUSTOMER_ID) and CUSTOMERS (CUSTOMER_ID, CUSTOMER_NAME) produce:
12
+
13
+ ```sql
14
+ FROM "ORDERS"
15
+ INNER JOIN "CUSTOMERS"
16
+ ON "ORDERS"."CUSTOMER_ID" = "CUSTOMERS"."CUSTOMER_ID"
17
+ ```
18
+
19
+ ## Automatic Datatype Inference
20
+
21
+ The system infers datatypes from transform expressions:
22
+
23
+ | Transform Pattern | Inferred Datatype |
24
+ |-------------------|-------------------|
25
+ | `COUNT(...)` | `NUMBER` |
26
+ | `SUM(...)` | `NUMBER(38,4)` |
27
+ | `AVG(...)` | `NUMBER(38,4)` |
28
+ | `MIN/MAX(..._TS)` | `TIMESTAMP_NTZ(9)` |
29
+ | `MIN/MAX(..._DATE)` | `DATE` |
30
+ | `DATEDIFF(...)` | `NUMBER` |
31
+ | `CURRENT_DATE` | `DATE` |
32
+ | `CURRENT_TIMESTAMP` | `TIMESTAMP_NTZ(9)` |
33
+ | `ROW_NUMBER()` | `NUMBER` |
34
+ | `CONCAT(...)` | `VARCHAR` |
35
+
36
+ ## GROUP BY Analysis and Validation
37
+
38
+ The system automatically detects aggregate functions, identifies non-aggregate columns needing GROUP BY, validates coverage, and generates the GROUP BY clause.
39
+
40
+ ### Detection Rules
41
+
42
+ **Aggregate functions** (column goes into aggregate list):
43
+ `COUNT`, `SUM`, `AVG`, `MIN`, `MAX`, `STDDEV`, `VARIANCE`, `LISTAGG`, `ARRAY_AGG`
44
+
45
+ **Window functions** (column goes into aggregate list):
46
+ `ROW_NUMBER`, `RANK`, `DENSE_RANK`, `LEAD`, `LAG`, `FIRST_VALUE`, `LAST_VALUE`
47
+
48
+ **Non-aggregate columns** (must be in GROUP BY):
49
+ Simple references (`"TABLE"."COLUMN"`), expressions without aggregation (`UPPER("TABLE"."NAME")`)
50
+
51
+ ### Validation
52
+
53
+ The `validation.valid` flag indicates whether GROUP BY is correct. When `valid: false`, the query has aggregate functions but is missing one or more GROUP BY columns. Every non-aggregate column must be included in GROUP BY.
54
+
55
+ ## Common Patterns
56
+
57
+ ### Customer Lifetime Metrics
58
+
59
+ ```javascript
60
+ {
61
+ groupByColumns: ['"ORDERS"."CUSTOMER_ID"'],
62
+ aggregates: [
63
+ { name: "TOTAL_ORDERS", function: "COUNT", expression: 'DISTINCT "ORDERS"."ORDER_ID"' },
64
+ { name: "LIFETIME_VALUE", function: "SUM", expression: '"ORDERS"."ORDER_TOTAL"' },
65
+ { name: "AVG_ORDER_VALUE", function: "AVG", expression: '"ORDERS"."ORDER_TOTAL"' },
66
+ { name: "FIRST_ORDER_DATE", function: "MIN", expression: '"ORDERS"."ORDER_TS"' },
67
+ { name: "LAST_ORDER_DATE", function: "MAX", expression: '"ORDERS"."ORDER_TS"' },
68
+ { name: "DAYS_SINCE_LAST", function: "DATEDIFF", expression: 'day, MAX("ORDERS"."ORDER_TS"), CURRENT_DATE()' }
69
+ ]
70
+ }
71
+ ```
72
+
73
+ ### Daily Sales Summary
74
+
75
+ ```javascript
76
+ {
77
+ groupByColumns: [
78
+ 'DATE_TRUNC(\'day\', "ORDERS"."ORDER_TS")',
79
+ '"ORDERS"."LOCATION_ID"'
80
+ ],
81
+ aggregates: [
82
+ { name: "DAILY_ORDERS", function: "COUNT", expression: 'DISTINCT "ORDERS"."ORDER_ID"' },
83
+ { name: "DAILY_REVENUE", function: "SUM", expression: '"ORDER_DETAIL"."LINE_TOTAL"' },
84
+ { name: "AVG_ORDER_SIZE", function: "AVG", expression: '"ORDER_DETAIL"."QUANTITY"' }
85
+ ]
86
+ }
87
+ ```
88
+
89
+ ### Product Category Performance
90
+
91
+ ```javascript
92
+ {
93
+ groupByColumns: ['"PRODUCTS"."CATEGORY"', '"PRODUCTS"."SUBCATEGORY"'],
94
+ aggregates: [
95
+ { name: "TOTAL_SALES", function: "SUM", expression: '"ORDERS"."AMOUNT"' },
96
+ { name: "UNITS_SOLD", function: "SUM", expression: '"ORDERS"."QUANTITY"' },
97
+ { name: "UNIQUE_CUSTOMERS", function: "COUNT", expression: 'DISTINCT "ORDERS"."CUSTOMER_ID"' },
98
+ { name: "AVG_PRICE", function: "AVG", expression: '"ORDERS"."UNIT_PRICE"' }
99
+ ]
100
+ }
101
+ ```
102
+
103
+ ## groupByColumns is Analysis Data Only
104
+
105
+ **CRITICAL**: The `groupByColumns` field returned by `convert-join-to-aggregation` is for analysis only. It must NEVER be included in node metadata sent to the Coalesce API.
106
+
107
+ The Coalesce API rejects it with: `"request/body must NOT have additional properties"`
108
+
109
+ Our tools (`convert-join-to-aggregation`, `replace-workspace-node-columns`, `update-workspace-node`) automatically strip `groupByColumns` from metadata. But if you call `set-workspace-node` with a body containing `groupByColumns` in metadata, you'll get an error.
110
+
111
+ ## Automatic Config Completion
112
+
113
+ `convert-join-to-aggregation` automatically completes config fields after transformation:
114
+
115
+ ### Column-Level Attributes
116
+
117
+ - `isBusinessKey: true` on GROUP BY columns (dimensions)
118
+ - `isChangeTracking: true` on aggregate columns (measures)
119
+
120
+ These are column-level attributes set directly on each column object. See `coalesce://context/node-operations` for details on columnSelector attributes.
121
+
122
+ ### Node-Type-Aware Config
123
+
124
+ Automatically set based on context:
125
+
126
+ - `selectDistinct: false` (incompatible with aggregates)
127
+ - `truncateBefore: false` (table materialization default)
128
+ - `insertStrategy`: based on multi-source detection
129
+
130
+ See `coalesce://context/intelligent-node-configuration` for complete details.
131
+
132
+ ## Tips
133
+
134
+ 1. Always use fully-qualified column names: `"TABLE"."COLUMN"`
135
+ 2. Check `validation.valid` for GROUP BY correctness
136
+ 3. Use `maintainJoins: true` for automatic JOIN ON generation
137
+ 4. Let datatype inference work — don't manually specify unless needed
138
+ 5. Review `joinSQL.fullSQL` for the generated SQL
139
+ 6. Never include groupByColumns in metadata sent to the API
140
+
141
+ ## Related Resources
142
+
143
+ - `coalesce://context/pipeline-workflows` — using aggregation in pipelines
144
+ - `coalesce://context/node-operations` — column-level attributes, config fields
145
+ - `coalesce://context/intelligent-node-configuration` — config completion details
@@ -0,0 +1,183 @@
1
+ # Data Engineering Principles for Coalesce
2
+
3
+ ## How to Use This Guide
4
+
5
+ When to consult:
6
+ - Before creating workspace nodes that need architecture decisions
7
+ - When evaluating existing workspace structure or methodology
8
+ - When choosing materialization strategies
9
+
10
+ Application pattern:
11
+ - If the user already specified the exact node type, use it and skip analysis.
12
+ - Use `analyze-workspace-patterns` for a compact inline profile of workspace conventions.
13
+ - Use `cache-workspace-nodes` when the full node list should bypass chat context or be reused.
14
+ - If workspace differs from recommendations, inform user with rationale.
15
+ - If workspace aligns, proceed with existing pattern.
16
+
17
+ For node type selection by pipeline layer, see `coalesce://context/pipeline-workflows`.
18
+
19
+ ## Platform Awareness
20
+
21
+ Coalesce supports multiple data platforms. Materialization strategies, cost models, and features differ significantly. Determine the platform before recommending materialization.
22
+
23
+ To detect: check `coalesce://context/sql-platform-selection`, inspect existing node configurations, or ask.
24
+
25
+ ### Snowflake
26
+
27
+ - Compute model: Credit-based virtual warehouses, billed per second (with a 60-second minimum each time a warehouse starts or resumes)
28
+ - Key features: Transient tables (no Fail-safe, lower cost for staging), Dynamic Tables (declarative, auto-refreshing), Streams/Tasks (CDC), materialized views
29
+ - Staging best practice: Transient tables for bronze/silver layers
30
+ - Incremental: MERGE with high-water mark, Streams for CDC
31
+ - Config indicators: `insertStrategy`, `truncateBefore`, `materializationType`
32
+
33
+ ### Databricks
34
+
35
+ - Compute model: DBU-based billing, Photon engine
36
+ - Key features: Delta format (time travel, OPTIMIZE, ZORDER), Delta Live Tables (DLT), streaming tables, Unity Catalog
37
+ - Staging best practice: Delta tables with OPTIMIZE for large staging
38
+ - Incremental: Delta MERGE, APPLY CHANGES (DLT), streaming tables
39
+ - Note: Views are logical views over Delta tables; materialized views are less commonly used than on Snowflake or BigQuery
40
+
41
+ ### BigQuery
42
+
43
+ - Compute model: Per-bytes-scanned (on-demand) or slot-based (reservations). Views are expensive because each query rescans underlying data.
44
+ - Key features: Partitioned tables, clustered tables, materialized views (auto-refreshing), table snapshots, expiration policies
45
+ - Staging best practice: Partitioned tables with expiration; clustering on high-cardinality filter columns
46
+ - Incremental: MERGE with partition pruning, streaming inserts
47
+ - IMPORTANT: Avoid views for repeatedly queried or large-scan nodes. Prefer materialized views or tables.
48
+
49
+ ## Workspace Pattern Analysis
50
+
51
+ ### Package Detection
52
+
53
+ Scan node types for package prefixes:
54
+ - `base-nodes:::*` -> base-nodes package observed
55
+ - `custom-package:::*` -> custom package observed
56
+ - No prefix -> built-in type
57
+
58
+ Presence indicates observed usage, not exhaustive inventory. Absence doesn't prove a package is unavailable.
59
+
60
+ Package categories:
61
+ - **base-nodes**: Enhanced Stage, View, Dimension, Fact, Work
62
+ - **Specialized**: Data Vault, Kimball extensions, semantic layers
63
+ - **Platform-specific**: Databricks DLT, BigQuery advanced, Dynamic Tables, Streams/Tasks
64
+ - **Data quality**: Incremental loading, test filtering, validation
65
+ - **Built-in**: Stage, View, Dimension, Fact, persistentStage (no prefix)
66
+
67
+ ### DAG Topology Analysis
68
+
69
+ | Layer | DAG Signature | Typical Names |
70
+ |-------|--------------|---------------|
71
+ | Bronze/Landing | 0 predecessors or source predecessors | RAW_*, SRC_*, LANDING_* |
72
+ | Silver/Staging | 1-2 predecessors from bronze | STG_*, STAGE_*, CLEAN_* |
73
+ | Intermediate | Mid-pipeline (has predecessors AND consumers) | INT_*, WORK_*, TRANSFORM_* |
74
+ | Gold/Mart | Multiple predecessors, few/no consumers | DIM_*, FACT_*, FCT_*, MART_* |
75
+
76
+ ### Methodology Detection
77
+
78
+ **Kimball**: DIM_*/FACT_* separation, facts with 3+ dimension predecessors, star/snowflake topology. SCD indicators: EFFECTIVE_FROM, EFFECTIVE_TO, IS_CURRENT columns.
79
+
80
+ **Data Vault 2.0**: Hub (few columns, business key + metadata, many downstream satellites), Satellite (many columns, single hub predecessor, HASH_DIFF), Link (multiple hub predecessors). May use specialized packages from github.com/coalesceio.
81
+
82
+ **dbt-Style**: stg_ -> int_ -> fct_/dim_ naming, heavy View usage in intermediate layer, selective materialization.
83
+
84
+ **Mixed/Unclear**: Default to staging -> mart pattern. Don't force methodology on established workspaces.
85
+
86
+ ## Materialization Strategies
87
+
88
+ ### Table (Full Refresh)
89
+
90
+ When: Data changes significantly each run, staging/bronze layer, dimensions with low update frequency.
91
+ Trade-offs: Simple, predictable, consistent. Higher compute for large datasets.
92
+
93
+ - Snowflake: `truncateBefore: true`, `materializationType: "table"`. Transient tables for staging.
94
+ - BigQuery: Can stay cost-effective even for large datasets, because on-demand billing is driven by the bytes the refresh query scans, not by the number of rows written.
95
+ - Databricks: Delta tables with ACID guarantees.
96
+
97
+ ### View
98
+
99
+ When: Intermediate transforms, low query frequency, always-fresh data, simple transforms without aggregation.
100
+
101
+ IMPORTANT: `View` node types can ONLY materialize as views. Cannot convert to tables. For aggregations or frequently queried nodes, use `Dimension`, `Fact`, `Stage`, or `Work`.
102
+
103
+ - Snowflake: Each query consumes compute. Acceptable for low-frequency.
104
+ - BigQuery: Expensive — every query rescans and bills per bytes. Avoid for frequent queries or large tables.
105
+ - Databricks: Benefits from Delta caching but recomputes on each query.
106
+
107
+ ### Incremental (Merge/Append)
108
+
109
+ When: Large datasets, time-series/event data, fact tables with high volume, staging with clear update patterns.
110
+ Trade-offs: Efficient, faster runs. More complex, risk of drift.
111
+
112
+ - Snowflake: MERGE with high-water mark, `insertStrategy: "MERGE"`. Streams/Tasks for CDC.
113
+ - BigQuery: MERGE with partition pruning (always partition incremental tables).
114
+ - Databricks: Delta MERGE with schema evolution. APPLY CHANGES in DLT.
115
+
116
+ ### Dynamic / Auto-Refreshing
117
+
118
+ - Snowflake Dynamic Tables: Declarative SQL, auto-refresh based on lag target
119
+ - BigQuery Materialized Views: Auto-refresh with smart tuning (single-table aggregations)
120
+ - Databricks DLT: Declarative pipeline definitions with automatic refresh
121
+
122
+ Check for platform-specific packages (Dynamic-Table-Nodes, databricks-DLT).
123
+
124
+ ### Layer-Specific Defaults
125
+
126
+ - Bronze -> Tables (preserve raw; Snowflake: transient; BigQuery: partitioned with expiration)
127
+ - Silver -> Tables for small, incremental for large (BigQuery: always partition)
128
+ - Intermediate -> Views (BigQuery: materialize if queried by multiple downstream nodes)
129
+ - Gold Dimensions -> Tables (small, need persistence)
130
+ - Gold Facts -> Incremental tables (large, time-series)
131
+ - Metrics -> Tables via `Dimension` or `Fact`; consider materialized views for single-table aggregations
132
+
133
+ ## Dependency Management
134
+
135
+ ### Healthy DAG Patterns
136
+
137
+ - **Fan-out** (1 -> many): One staging feeds multiple downstream. Promotes reusability.
138
+ - **Fan-in** (many -> 1): Multiple sources join into one. Sweet spot: 2-4 predecessors.
139
+ - **Linear chains** (1 -> 1 -> 1): Acceptable at 3-5 steps if each adds clear value.
140
+
141
+ ### Problematic Patterns
142
+
143
+ - **Excessive fan-in** (>5 predecessors): Break into intermediate nodes.
144
+ - **Deep chains** (>6 steps): Consolidate or use views.
145
+ - **Circular dependencies**: Break by extracting shared logic upstream.
146
+ - **Cross-layer skips**: Gold reading Bronze directly. Route through proper layers.
147
+
148
+ ### Warnings to Surface
149
+
150
+ - predecessorNodeIDs.length > 5 -> warn about excessive fan-in
151
+ - 7th node in linear chain -> warn about deep chain
152
+ - Cross-layer skip -> warn about layer violation
153
+
154
+ ## Package Recommendations
155
+
156
+ ### Base Nodes
157
+
158
+ If the workspace has NO base-nodes types: "The base-nodes package offers enhanced Stage, View, Dimension, Fact, and Work node types. Install via Build Settings > Packages."
159
+ If workspace HAS base-nodes types: use base-nodes versions by default.
160
+
161
+ ### Common Packages
162
+
163
+ | Package | When to Recommend | Node Types |
164
+ |---------|------------------|------------|
165
+ | Incremental-Nodes | Large fact tables, "incremental"/"delta" mentions | Incremental Load, Test Passed/Failed Records, Looped Load |
166
+ | Dynamic Tables | Auto-refreshing aggregations (Snowflake) | Dynamic Table Work, Dynamic Table Dimension |
167
+ | Materialized Views | Simple single-table aggregations (Snowflake) | Materialized View |
168
+ | Streams/Tasks | CDC, event-driven (Snowflake) | Stream, Task |
169
+ | DLT | Declarative pipelines (Databricks) | DLT nodes |
170
+
171
+ ### Recommendation Logic
172
+
173
+ 1. Match existing workspace patterns first
174
+ 2. Fresh workspace with built-in types -> soft recommend base-nodes
175
+ 3. Specialized need -> point to specific package, remind "Build Settings > Packages to install"
176
+ 4. Never assume installed: "If you have the package..." not "Use the node type"
177
+ 5. Don't push packages when built-in types work fine
178
+
179
+ ## Related Resources
180
+
181
+ - `coalesce://context/pipeline-workflows` — node type selection by layer, pipeline building
182
+ - `coalesce://context/node-type-corpus` — node type discovery and corpus search
183
+ - `coalesce://context/sql-platform-selection` — platform detection
@@ -0,0 +1,92 @@
1
+ # Hydrated Metadata
2
+
3
+ Use this resource when the user wants to provide or edit raw node `metadata`, `config`, or `storageLocations`.
4
+
5
+ ## What This Covers
6
+
7
+ Coalesce hydrated node bodies commonly include:
8
+ - `metadata.columns`
9
+ - `sources`
10
+ - `config`
11
+ - `storageLocations`
12
+
13
+ The exact structure varies by node type and configuration.
14
+
15
+ ## Practical Summary
16
+
17
+ ### `metadata.columns`
18
+
19
+ Columns map to the Mapping Grid in Coalesce.
20
+
21
+ Common fields on a column include:
22
+ - `id`
23
+ - `name`
24
+ - `dataType`
25
+ - `description`
26
+ - `nullable`
27
+ - `defaultValue`
28
+ - `tests`
29
+ - `transform`
30
+
31
+ Columns can also contain lineage-related source information.
32
+
33
+ ### `sources`
34
+
35
+ Hydrated source metadata can include:
36
+ - source `name`
37
+ - source `columns`
38
+ - source `join`
39
+ - `dependencies`
40
+
41
+ This is useful for multisource and join-oriented nodes.
42
+
43
+ ### `config`
44
+
45
+ Hydrated config can contain both simple scalar fields and nested structures.
46
+
47
+ Examples include:
48
+ - `preSQL`
49
+ - `postSQL`
50
+ - `insertStrategy`
51
+ - booleans
52
+ - dropdown values
53
+ - nested tabular config items
54
+ - config entries that themselves reference column-like objects
55
+
56
+ Treat config as node-type-specific. Preserve unknown keys unless the user intends to replace them.
57
+
58
+ ### `storageLocations`
59
+
60
+ Hydrated storage locations are arrays of objects with fields such as:
61
+ - `database`
62
+ - `schema`
63
+ - `name`
64
+
65
+ Do not assume storage location names can be normalized for SQL style. They must match the actual Coalesce objects.
66
+
67
+ ## Editing Rules
68
+
69
+ - Prefer `update-workspace-node` for partial changes.
70
+ - Treat arrays as full-replacement fields (see `coalesce://context/node-payloads` for array safety details).
71
+ - Preserve existing hydrated structures you are not intentionally changing.
72
+ - When working from scratch, provide `metadata.columns` explicitly if the user expects a configured node.
73
+
74
+ ## When To Use Raw Hydrated Input
75
+
76
+ Use raw hydrated structures when:
77
+ - the user already knows the exact payload shape they want
78
+ - the node type has custom nested config
79
+ - you need to preserve advanced lineage or source structures
80
+
81
+ If the user only wants a normal create/update flow, prefer the higher-level helpers and simpler fields first.
82
+
83
+ ## Official Reference
84
+
85
+ See the Coalesce documentation for the fuller field inventory:
86
+ - [Hydrated Metadata](https://docs.coalesce.io/docs/build-your-pipeline/user-defined-nodes/hydrated-metadata)
87
+ - [Hydrated Metadata Reference](https://docs.coalesce.io/docs/build-your-pipeline/user-defined-nodes/hydrated-metadata-reference)
88
+
89
+ ## Related Resources
90
+
91
+ - `coalesce://context/node-payloads`
92
+ - `coalesce://context/storage-mappings`
@@ -0,0 +1,64 @@
1
+ # ID Discovery
2
+
3
+ Use this resource when the user knows names but not Coalesce IDs.
4
+
5
+ ## Core Rule
6
+
7
+ Prefer list/get discovery tools over guessing IDs from URLs or names.
8
+
9
+ ## Common ID Lookups
10
+
11
+ ### Project IDs
12
+
13
+ - Use `list-projects` to browse projects.
14
+ - Use `get-project` when you already know the `projectID`.
15
+
16
+ ### Workspace IDs
17
+
18
+ - Workspace IDs are nested under projects.
19
+ - Use:
20
+ - `list-projects({ includeWorkspaces: true })`
21
+ - `get-project({ projectID, includeWorkspaces: true })`
22
+
23
+ Do not assume workspace IDs are visible unless `includeWorkspaces` was requested.
24
+
25
+ ### Job IDs
26
+
27
+ - Jobs are nested under projects and workspaces.
28
+ - Use:
29
+ - `list-projects({ includeJobs: true, includeWorkspaces: true })`
30
+ - `get-project({ projectID, includeJobs: true, includeWorkspaces: true })`
31
+
32
+ If the user gives a job name, resolve it to a job ID before calling `start-run`.
33
+
34
+ ### Environment IDs
35
+
36
+ - Use `list-environments` to discover environments by name.
37
+ - Use `get-environment` only after you already know the `environmentID`.
38
+
39
+ ### Node IDs
40
+
41
+ - Use `list-workspace-nodes` when working in a workspace.
42
+ - Use `list-environment-nodes` when working against an environment.
43
+ - Use `get-workspace-node` or `get-environment-node` only after you know the node ID.
44
+
45
+ ### Run IDs and Run Counters
46
+
47
+ - Use `list-runs` to discover recent runs when needed.
48
+ - For run ID format details (runCounter vs UUID) and operational usage, see `coalesce://context/run-operations`.
49
+
50
+ ### Org IDs
51
+
52
+ - For org ID requirements in run operations (cancel-run), see `coalesce://context/run-operations`.
53
+
54
+ ## Good Defaults
55
+
56
+ 1. Discover by name with a list tool.
57
+ 2. Resolve the exact ID.
58
+ 3. Use the matching get/mutate tool with that ID.
59
+
60
+ ## Related Resources
61
+
62
+ - `coalesce://context/tool-usage`
63
+ - `coalesce://context/run-operations`
64
+ - `coalesce://context/sql-platform-selection`