coalesce-transform-mcp 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +304 -0
- package/dist/cache-dir.d.ts +26 -0
- package/dist/cache-dir.js +106 -0
- package/dist/client.d.ts +25 -0
- package/dist/client.js +212 -0
- package/dist/coalesce/api/environments.d.ts +20 -0
- package/dist/coalesce/api/environments.js +15 -0
- package/dist/coalesce/api/git-accounts.d.ts +21 -0
- package/dist/coalesce/api/git-accounts.js +21 -0
- package/dist/coalesce/api/jobs.d.ts +25 -0
- package/dist/coalesce/api/jobs.js +21 -0
- package/dist/coalesce/api/nodes.d.ts +29 -0
- package/dist/coalesce/api/nodes.js +33 -0
- package/dist/coalesce/api/projects.d.ts +22 -0
- package/dist/coalesce/api/projects.js +25 -0
- package/dist/coalesce/api/runs.d.ts +19 -0
- package/dist/coalesce/api/runs.js +34 -0
- package/dist/coalesce/api/subgraphs.d.ts +20 -0
- package/dist/coalesce/api/subgraphs.js +17 -0
- package/dist/coalesce/api/users.d.ts +30 -0
- package/dist/coalesce/api/users.js +31 -0
- package/dist/coalesce/types.d.ts +298 -0
- package/dist/coalesce/types.js +746 -0
- package/dist/generated/.gitkeep +0 -0
- package/dist/generated/node-type-corpus.json +42656 -0
- package/dist/index.d.ts +2 -0
- package/dist/index.js +10 -0
- package/dist/mcp/cache.d.ts +3 -0
- package/dist/mcp/cache.js +137 -0
- package/dist/mcp/environments.d.ts +3 -0
- package/dist/mcp/environments.js +61 -0
- package/dist/mcp/git-accounts.d.ts +3 -0
- package/dist/mcp/git-accounts.js +70 -0
- package/dist/mcp/jobs.d.ts +3 -0
- package/dist/mcp/jobs.js +77 -0
- package/dist/mcp/node-type-corpus.d.ts +3 -0
- package/dist/mcp/node-type-corpus.js +173 -0
- package/dist/mcp/nodes.d.ts +3 -0
- package/dist/mcp/nodes.js +341 -0
- package/dist/mcp/pipelines.d.ts +3 -0
- package/dist/mcp/pipelines.js +342 -0
- package/dist/mcp/projects.d.ts +3 -0
- package/dist/mcp/projects.js +70 -0
- package/dist/mcp/repo-node-types.d.ts +135 -0
- package/dist/mcp/repo-node-types.js +387 -0
- package/dist/mcp/runs.d.ts +3 -0
- package/dist/mcp/runs.js +92 -0
- package/dist/mcp/subgraphs.d.ts +3 -0
- package/dist/mcp/subgraphs.js +60 -0
- package/dist/mcp/users.d.ts +3 -0
- package/dist/mcp/users.js +107 -0
- package/dist/prompts/index.d.ts +2 -0
- package/dist/prompts/index.js +58 -0
- package/dist/resources/context/aggregation-patterns.md +145 -0
- package/dist/resources/context/data-engineering-principles.md +183 -0
- package/dist/resources/context/hydrated-metadata.md +92 -0
- package/dist/resources/context/id-discovery.md +64 -0
- package/dist/resources/context/intelligent-node-configuration.md +162 -0
- package/dist/resources/context/node-creation-decision-tree.md +156 -0
- package/dist/resources/context/node-operations.md +316 -0
- package/dist/resources/context/node-payloads.md +114 -0
- package/dist/resources/context/node-type-corpus.md +166 -0
- package/dist/resources/context/node-type-selection-guide.md +96 -0
- package/dist/resources/context/overview.md +135 -0
- package/dist/resources/context/pipeline-workflows.md +355 -0
- package/dist/resources/context/run-operations.md +55 -0
- package/dist/resources/context/sql-bigquery.md +41 -0
- package/dist/resources/context/sql-databricks.md +40 -0
- package/dist/resources/context/sql-platform-selection.md +70 -0
- package/dist/resources/context/sql-snowflake.md +43 -0
- package/dist/resources/context/storage-mappings.md +49 -0
- package/dist/resources/context/tool-usage.md +98 -0
- package/dist/resources/index.d.ts +5 -0
- package/dist/resources/index.js +254 -0
- package/dist/schemas/node-payloads.d.ts +5019 -0
- package/dist/schemas/node-payloads.js +147 -0
- package/dist/server.d.ts +7 -0
- package/dist/server.js +63 -0
- package/dist/services/cache/snapshots.d.ts +108 -0
- package/dist/services/cache/snapshots.js +275 -0
- package/dist/services/config/context-analyzer.d.ts +14 -0
- package/dist/services/config/context-analyzer.js +76 -0
- package/dist/services/config/field-classifier.d.ts +23 -0
- package/dist/services/config/field-classifier.js +47 -0
- package/dist/services/config/intelligent.d.ts +55 -0
- package/dist/services/config/intelligent.js +306 -0
- package/dist/services/config/rules.d.ts +6 -0
- package/dist/services/config/rules.js +44 -0
- package/dist/services/config/schema-resolver.d.ts +18 -0
- package/dist/services/config/schema-resolver.js +80 -0
- package/dist/services/corpus/loader.d.ts +56 -0
- package/dist/services/corpus/loader.js +25 -0
- package/dist/services/corpus/search.d.ts +49 -0
- package/dist/services/corpus/search.js +69 -0
- package/dist/services/corpus/templates.d.ts +4 -0
- package/dist/services/corpus/templates.js +11 -0
- package/dist/services/pipelines/execution.d.ts +20 -0
- package/dist/services/pipelines/execution.js +290 -0
- package/dist/services/pipelines/node-type-intent.d.ts +96 -0
- package/dist/services/pipelines/node-type-intent.js +356 -0
- package/dist/services/pipelines/node-type-selection.d.ts +66 -0
- package/dist/services/pipelines/node-type-selection.js +758 -0
- package/dist/services/pipelines/planning.d.ts +543 -0
- package/dist/services/pipelines/planning.js +1839 -0
- package/dist/services/policies/sql-override.d.ts +7 -0
- package/dist/services/policies/sql-override.js +109 -0
- package/dist/services/repo/operations.d.ts +6 -0
- package/dist/services/repo/operations.js +10 -0
- package/dist/services/repo/parser.d.ts +70 -0
- package/dist/services/repo/parser.js +365 -0
- package/dist/services/repo/path.d.ts +2 -0
- package/dist/services/repo/path.js +58 -0
- package/dist/services/templates/nodes.d.ts +50 -0
- package/dist/services/templates/nodes.js +336 -0
- package/dist/services/workspace/analysis.d.ts +56 -0
- package/dist/services/workspace/analysis.js +151 -0
- package/dist/services/workspace/mutations.d.ts +150 -0
- package/dist/services/workspace/mutations.js +1718 -0
- package/dist/utils.d.ts +5 -0
- package/dist/utils.js +7 -0
- package/dist/workflows/get-environment-overview.d.ts +9 -0
- package/dist/workflows/get-environment-overview.js +23 -0
- package/dist/workflows/get-run-details.d.ts +10 -0
- package/dist/workflows/get-run-details.js +28 -0
- package/dist/workflows/progress.d.ts +20 -0
- package/dist/workflows/progress.js +54 -0
- package/dist/workflows/retry-and-wait.d.ts +13 -0
- package/dist/workflows/retry-and-wait.js +139 -0
- package/dist/workflows/run-and-wait.d.ts +13 -0
- package/dist/workflows/run-and-wait.js +141 -0
- package/dist/workflows/run-status.d.ts +10 -0
- package/dist/workflows/run-status.js +27 -0
- package/package.json +34 -0
|
@@ -0,0 +1,1839 @@
|
|
|
1
|
+
import { z } from "zod";
|
|
2
|
+
import { randomUUID } from "node:crypto";
|
|
3
|
+
import { validatePathSegment } from "../../coalesce/types.js";
|
|
4
|
+
import { getWorkspaceNode, listWorkspaceNodes, } from "../../coalesce/api/nodes.js";
|
|
5
|
+
import { listWorkspaceNodeTypes } from "../workspace/mutations.js";
|
|
6
|
+
import { isPlainObject } from "../../utils.js";
|
|
7
|
+
import { NodeConfigInputSchema } from "../../schemas/node-payloads.js";
|
|
8
|
+
import { selectPipelineNodeType, } from "./node-type-selection.js";
|
|
9
|
+
/**
 * Validation schema for one projected item of a planned node's SELECT list.
 *
 * The object is strict: keys other than the ones listed here are rejected.
 * `reason` is the only key that may be omitted; the `source*` fields and
 * `outputName` must be present but may be `null`.
 */
const PlannedSelectItemSchema = z.strictObject({
    expression: z.string(),
    outputName: z.string().nullable(),
    sourceNodeAlias: z.string().nullable(),
    sourceNodeName: z.string().nullable(),
    sourceNodeID: z.string().nullable(),
    sourceColumnName: z.string().nullable(),
    // "column" for a direct column reference, "expression" for everything else.
    kind: z.enum(["column", "expression"]),
    supported: z.boolean(),
    reason: z.string().optional(),
});
|
|
22
|
+
/**
 * Validation schema for a single node of a pipeline plan.
 *
 * Every nested object is strict (unknown keys are rejected). Only
 * `nodeTypeFamily` and `templateDefaults` may be omitted; `nodeTypeFamily`
 * may additionally be `null`.
 */
const PlannedPipelineNodeSchema = z.strictObject({
    planNodeID: z.string(),
    name: z.string(),
    nodeType: z.string(),
    nodeTypeFamily: z
        .enum([
            "stage",
            "persistent-stage",
            "view",
            "work",
            "dimension",
            "fact",
            "hub",
            "satellite",
            "link",
            "unknown",
        ])
        .nullish(),
    predecessorNodeIDs: z.array(z.string()),
    predecessorPlanNodeIDs: z.array(z.string()),
    predecessorNodeNames: z.array(z.string()),
    description: z.string().nullable(),
    sql: z.string().nullable(),
    selectItems: z.array(PlannedSelectItemSchema),
    outputColumnNames: z.array(z.string()),
    configOverrides: NodeConfigInputSchema,
    // One entry per upstream reference; alias and nodeID may be null.
    sourceRefs: z.array(z.strictObject({
        locationName: z.string(),
        nodeName: z.string(),
        alias: z.string().nullable(),
        nodeID: z.string().nullable(),
    })),
    joinCondition: z.string().nullable(),
    // The location object itself is required, but each of its keys is optional.
    location: z.strictObject({
        locationName: z.string().optional(),
        database: z.string().optional(),
        schema: z.string().optional(),
    }),
    requiresFullSetNode: z.boolean(),
    templateDefaults: z
        .strictObject({
            inferredTopLevelFields: z.record(z.unknown()),
            inferredConfig: NodeConfigInputSchema,
        })
        .optional(),
});
|
|
76
|
+
/**
 * Validation schema for a complete pipeline plan (plan format version 1).
 *
 * The object is strict: unknown top-level keys are rejected.
 * `nodeTypeSelection`, `cteNodeSummary` and `STOP_AND_CONFIRM` may be omitted.
 */
export const PipelinePlanSchema = z.strictObject({
    version: z.literal(1),
    intent: z.enum(["sql", "goal"]),
    status: z.enum(["ready", "needs_clarification"]),
    workspaceID: z.string(),
    platform: z.string().nullable(),
    goal: z.string().nullable(),
    sql: z.string().nullable(),
    nodes: z.array(PlannedPipelineNodeSchema),
    assumptions: z.array(z.string()),
    openQuestions: z.array(z.string()),
    warnings: z.array(z.string()),
    supportedNodeTypes: z.array(z.string()),
    nodeTypeSelection: z.record(z.unknown()).optional(),
    cteNodeSummary: z.array(z.record(z.unknown())).optional(),
    STOP_AND_CONFIRM: z.string().optional(),
});
|
|
95
|
+
/** Page size requested on every call when listing workspace nodes. */
const WORKSPACE_NODE_PAGE_LIMIT = 200;

/**
 * Baseline stage configuration: empty pre/post SQL hooks and tests enabled.
 * The object is exported as-is (not frozen).
 */
export const DEFAULT_STAGE_CONFIG = {
    postSQL: "",
    preSQL: "",
    testsEnabled: true,
};
|
|
101
|
+
/**
 * Canonicalises a SQL identifier so two spellings can be compared.
 *
 * Surrounding whitespace is dropped, a leading `"`, backtick or `[` and a
 * trailing `"`, backtick or `]` are each removed if present (they do not need
 * to pair up), and the remainder is upper-cased.
 *
 * @param {string} identifier - Raw identifier text.
 * @returns {string} The unquoted, upper-cased identifier.
 */
export function normalizeSqlIdentifier(identifier) {
    const withoutPadding = identifier.trim();
    const withoutQuotes = withoutPadding.replace(/^["`[]|["`\]]$/g, "");
    return withoutQuotes.toUpperCase();
}
|
|
104
|
+
/**
 * Produces a deep copy of a JSON-serialisable value by round-tripping it
 * through JSON text. Anything JSON cannot represent (e.g. `undefined`
 * properties, functions) is dropped or converted exactly as
 * `JSON.stringify` does.
 *
 * @param {*} value - JSON-serialisable value to copy.
 * @returns {*} A structurally equal copy sharing no references with `value`.
 */
export function deepClone(value) {
    const serialized = JSON.stringify(value);
    return JSON.parse(serialized);
}
|
|
107
|
+
/**
 * Collapses every run of whitespace into a single space and strips leading
 * and trailing whitespace.
 *
 * @param {string} value - Text to tidy.
 * @returns {string} The single-spaced, trimmed text.
 */
export function normalizeWhitespace(value) {
    const singleSpaced = value.replace(/\s+/g, " ");
    return singleSpaced.trim();
}
|
|
110
|
+
/**
 * Tells whether `char` can be part of an unquoted identifier
 * (ASCII letter, digit, `_` or `$`).
 *
 * @param {string | undefined} char - A character, or `undefined` when the
 *   caller indexed past either end of a string.
 * @returns {boolean} `false` for empty/missing input.
 */
function isIdentifierChar(char) {
    if (!char) {
        return false;
    }
    return /[A-Za-z0-9_$]/.test(char);
}
|
|
113
|
+
/**
 * Removes one pair of matching identifier quotes (`"..."`, backticks or
 * `[...]`) from a trimmed identifier. Text that is not wrapped in a
 * matching pair is returned trimmed but otherwise unchanged.
 *
 * @param {string} identifier - Possibly quoted identifier.
 * @returns {string} The identifier without its surrounding quotes.
 */
function stripIdentifierQuotes(identifier) {
    const candidate = identifier.trim();
    const quotePairs = [
        ['"', '"'],
        ["`", "`"],
        ["[", "]"],
    ];
    const isWrapped = quotePairs.some(([opener, closer]) => candidate.startsWith(opener) && candidate.endsWith(closer));
    return isWrapped ? candidate.slice(1, -1) : candidate;
}
|
|
122
|
+
/**
 * Finds the first top-level occurrence of `keyword` in `sql`.
 *
 * "Top-level" means outside parentheses, outside quoted text ('...', "...",
 * `...`, [...]) and outside SQL comments (`-- ...` to end of line and
 * block comments). Comments are skipped so that a keyword mentioned in a
 * comment is not reported, and so that an apostrophe inside a comment does not
 * open a string literal that would hide the rest of the statement.
 *
 * The match is case-insensitive and must not be adjacent to an identifier
 * character on either side (so `from` does not match inside `fromage`).
 *
 * @param {string} sql - SQL text to scan.
 * @param {string} keyword - Keyword to look for.
 * @param {number} [startIndex=0] - Index at which scanning begins; the scan
 *   assumes this position is itself at top level.
 * @returns {number} Index of the keyword's first character, or -1.
 */
function findTopLevelKeywordIndex(sql, keyword, startIndex = 0) {
    const lowerKeyword = keyword.toLowerCase();
    const closerByOpener = new Map([
        ["'", "'"],
        ['"', '"'],
        ["`", "`"],
        ["[", "]"],
    ]);
    let parenDepth = 0;
    // Closing character of the quoted region being skipped, or null.
    let pendingCloser = null;
    for (let index = startIndex; index < sql.length; index += 1) {
        const char = sql[index];
        const next = sql[index + 1];
        if (pendingCloser !== null) {
            if (char === pendingCloser) {
                if (pendingCloser === "'" && next === "'") {
                    // '' inside a string literal is an escaped quote.
                    index += 1;
                }
                else {
                    pendingCloser = null;
                }
            }
            continue;
        }
        if (char === "-" && next === "-") {
            // Line comment: resume on the newline (the loop steps past it).
            const lineEnd = sql.indexOf("\n", index + 2);
            if (lineEnd < 0) {
                return -1;
            }
            index = lineEnd;
            continue;
        }
        if (char === "/" && next === "*") {
            // Block comment: resume on the closing "/" of the terminator.
            const commentEnd = sql.indexOf("*/", index + 2);
            if (commentEnd < 0) {
                return -1;
            }
            index = commentEnd + 1;
            continue;
        }
        const closer = closerByOpener.get(char);
        if (closer !== undefined) {
            pendingCloser = closer;
            continue;
        }
        if (char === "(") {
            parenDepth += 1;
            continue;
        }
        if (char === ")" && parenDepth > 0) {
            parenDepth -= 1;
            continue;
        }
        if (parenDepth !== 0) {
            continue;
        }
        const keywordEnd = index + lowerKeyword.length;
        if (sql.slice(index, keywordEnd).toLowerCase() === lowerKeyword &&
            !isIdentifierChar(sql[index - 1]) &&
            !isIdentifierChar(sql[keywordEnd])) {
            return index;
        }
    }
    return -1;
}
|
|
194
|
+
/**
 * Splits `value` on `delimiter`, ignoring delimiters that appear inside
 * parentheses or inside quoted text ('...', "...", `...`, [...]).
 *
 * Every piece is trimmed. A piece produced by a delimiter is kept even when
 * empty; the final piece is only kept when it is non-empty after trimming.
 *
 * @param {string} value - Text to split.
 * @param {string} delimiter - Single character to split on.
 * @returns {string[]} The trimmed pieces in order.
 */
function splitTopLevel(value, delimiter) {
    const closerByOpener = new Map([
        ["'", "'"],
        ['"', '"'],
        ["`", "`"],
        ["[", "]"],
    ]);
    const segments = [];
    let buffer = "";
    let depth = 0;
    // Closing character of the quoted region being copied, or null.
    let pendingCloser = null;
    for (let position = 0; position < value.length; position += 1) {
        const ch = value[position];
        if (pendingCloser !== null) {
            buffer += ch;
            if (ch === pendingCloser) {
                const following = value[position + 1];
                if (pendingCloser === "'" && following === "'") {
                    // '' is an escaped quote: copy both and stay in the literal.
                    buffer += following;
                    position += 1;
                }
                else {
                    pendingCloser = null;
                }
            }
            continue;
        }
        const closer = closerByOpener.get(ch);
        if (closer !== undefined) {
            pendingCloser = closer;
        }
        else if (ch === "(") {
            depth += 1;
        }
        else if (ch === ")" && depth > 0) {
            depth -= 1;
        }
        else if (ch === delimiter && depth === 0) {
            segments.push(buffer.trim());
            buffer = "";
            continue;
        }
        buffer += ch;
    }
    const tail = buffer.trim();
    if (tail.length > 0) {
        segments.push(tail);
    }
    return segments;
}
|
|
279
|
+
/**
 * Returns the text between the first top-level SELECT keyword and the first
 * top-level FROM keyword that follows it, trimmed.
 *
 * @param {string} sql - SQL statement.
 * @returns {string | null} The projection list, or `null` when either
 *   keyword cannot be found at top level.
 */
function extractSelectClause(sql) {
    const selectAt = findTopLevelKeywordIndex(sql, "select");
    // 6 === "select".length: the projection list starts right after the keyword.
    const listStart = selectAt + 6;
    const fromAt = selectAt < 0 ? -1 : findTopLevelKeywordIndex(sql, "from", listStart);
    return fromAt < 0 ? null : sql.slice(listStart, fromAt).trim();
}
|
|
290
|
+
/**
 * Returns everything from the first top-level FROM keyword (inclusive) to the
 * end of the statement, trimmed and with trailing semicolons removed.
 *
 * @param {string} sql - SQL statement.
 * @returns {string | null} The FROM clause and whatever follows it, or
 *   `null` when no top-level SELECT ... FROM pair is found.
 */
function extractFromClause(sql) {
    const selectAt = findTopLevelKeywordIndex(sql, "select");
    // 6 === "select".length: only look for FROM after the SELECT keyword.
    const fromAt = selectAt < 0 ? -1 : findTopLevelKeywordIndex(sql, "from", selectAt + 6);
    if (fromAt < 0) {
        return null;
    }
    const remainder = sql.slice(fromAt).trim();
    return remainder.replace(/;+\s*$/u, "");
}
|
|
304
|
+
/**
 * Extracts every `{{ ref('LOCATION', 'NODE') }}` call from `sql`, together
 * with the table alias that follows it (with or without `AS`).
 *
 * The alias part of the pattern accepts any identifier-shaped word, so the
 * word after an un-aliased ref (`WHERE`, `JOIN`, `ON`, ...) would otherwise be
 * captured as its alias. Unquoted captures that are SQL clause keywords are
 * therefore discarded; quoted captures are always kept.
 *
 * @param {string} sql - SQL text containing ref() templates.
 * @returns {{locationName: string, nodeName: string, alias: (string|null), nodeID: null}[]}
 *   One entry per ref() call in order of appearance; `nodeID` is always
 *   `null` here.
 */
function parseRefCalls(sql) {
    // Words that can directly follow a table reference and are never an alias.
    const clauseKeywords = new Set([
        "AS", "SELECT", "FROM", "WHERE", "JOIN", "INNER", "LEFT", "RIGHT",
        "FULL", "CROSS", "OUTER", "NATURAL", "LATERAL", "ON", "USING",
        "GROUP", "ORDER", "HAVING", "QUALIFY", "WINDOW", "LIMIT", "OFFSET",
        "FETCH", "UNION", "INTERSECT", "EXCEPT", "MINUS", "PIVOT", "UNPIVOT",
        "SAMPLE", "TABLESAMPLE",
    ]);
    // Groups: 2 = location name, 4 = node name, 5 = optional alias (maybe quoted).
    const pattern = /\{\{\s*ref\(\s*(['"])([^'"]+)\1\s*,\s*(['"])([^'"]+)\3\s*\)\s*\}\}\s*(?:(?:AS)\s+)?([A-Za-z_][\w$]*|"[^"]+"|`[^`]+`|\[[^\]]+\])?/gi;
    const refs = [];
    for (const match of sql.matchAll(pattern)) {
        const rawAlias = match[5];
        const isRealAlias = Boolean(rawAlias) && !clauseKeywords.has(rawAlias.toUpperCase());
        refs.push({
            locationName: match[2] ?? "",
            nodeName: match[4] ?? "",
            alias: isRealAlias ? stripIdentifierQuotes(rawAlias) : null,
            nodeID: null,
        });
    }
    return refs;
}
|
|
317
|
+
/**
 * Splits one SELECT-list item into its expression and its output alias.
 *
 * Recognised forms:
 *  - `<expression> AS <alias>` (case-insensitive);
 *  - `<expression> <alias>`, only when the expression contains `.` or `(`,
 *    the trailing word is not an expression-terminating keyword
 *    (`END`, `NULL`, ...), and the expression does not stop on an operator or
 *    an operand-expecting keyword (otherwise the trailing word is an operand,
 *    as in `a.price * qty` or `a.x IS NULL`).
 *
 * Items may span several lines (e.g. a formatted CASE expression), so the
 * expression part is matched across newlines.
 *
 * @param {string} rawItem - One item of the projection list.
 * @returns {{expression: string, outputName: (string|null)}} The trimmed
 *   expression and the unquoted alias, or `outputName: null` when the item
 *   carries no alias (in which case `expression` is the whole trimmed item).
 */
function splitExpressionAlias(rawItem) {
    // Trailing words that finish an expression rather than naming it.
    const nonAliasWords = new Set(["END", "NULL", "TRUE", "FALSE", "ASC", "DESC"]);
    // Expression text that stops on an operator / keyword still needs an operand.
    const danglingOperator = /(?:[-+*\/%|&^=<>!~,.:(]|(?:^|\s)(?:AND|OR|NOT|IS|IN|LIKE|ILIKE|RLIKE|REGEXP|BETWEEN|WHEN|THEN|ELSE|CASE|OVER|ESCAPE|COLLATE))$/i;
    const asMatch = rawItem.match(/^([\s\S]*?)(?:\s+AS\s+)([A-Za-z_][\w$]*|"[^"]+"|`[^`]+`|\[[^\]]+\])$/i);
    if (asMatch) {
        return {
            expression: asMatch[1]?.trim() ?? rawItem.trim(),
            outputName: stripIdentifierQuotes(asMatch[2] ?? ""),
        };
    }
    const bareAliasMatch = rawItem.match(/^([\s\S]*?)(?:\s+)([A-Za-z_][\w$]*|"[^"]+"|`[^`]+`|\[[^\]]+\])$/);
    if (bareAliasMatch) {
        const candidateExpression = bareAliasMatch[1]?.trim() ?? rawItem.trim();
        const candidateAlias = bareAliasMatch[2] ?? "";
        const looksLikeExpression = candidateExpression.includes(".") || candidateExpression.includes("(");
        const aliasIsKeyword = nonAliasWords.has(candidateAlias.toUpperCase());
        if (looksLikeExpression && !aliasIsKeyword && !danglingOperator.test(candidateExpression)) {
            return {
                expression: candidateExpression,
                outputName: stripIdentifierQuotes(candidateAlias),
            };
        }
    }
    return {
        expression: rawItem.trim(),
        outputName: null,
    };
}
|
|
340
|
+
/**
 * Recognises an expression that is nothing but a column reference, either
 * `column` or `alias.column`, where each part may be quoted with `"..."`,
 * backticks or `[...]`.
 *
 * @param {string} expression - Expression text of a select item.
 * @returns {{sourceNodeAlias: (string|null), sourceColumnName: string} | null}
 *   The unquoted parts, or `null` when the expression is `*` or anything other
 *   than a plain column reference.
 */
function parseDirectColumnExpression(expression) {
    const candidate = expression.trim();
    if (candidate === "*") {
        return null;
    }
    const match = candidate.match(/^(?:(?<alias>"[^"]+"|`[^`]+`|\[[^\]]+\]|[A-Za-z_][\w$]*)\s*\.\s*)?(?<column>"[^"]+"|`[^`]+`|\[[^\]]+\]|[A-Za-z_][\w$]*)$/u);
    const column = match?.groups?.column;
    if (!column) {
        return null;
    }
    const qualifier = match.groups.alias;
    return {
        sourceNodeAlias: qualifier ? stripIdentifierQuotes(qualifier) : null,
        sourceColumnName: stripIdentifierQuotes(column),
    };
}
|
|
356
|
+
/**
 * Recognises a wildcard projection: a bare `*` or `alias.*` (the alias may
 * be quoted with `"..."`, backticks or `[...]`).
 *
 * @param {string} expression - Expression text of a select item.
 * @returns {{sourceNodeAlias: (string|null)} | null} `sourceNodeAlias` is
 *   `null` for a bare `*`, the unquoted alias for `alias.*`; the result is
 *   `null` when the expression is not a wildcard.
 */
function parseWildcardExpression(expression) {
    const candidate = expression.trim();
    if (candidate === "*") {
        return { sourceNodeAlias: null };
    }
    const match = candidate.match(/^(?<alias>"[^"]+"|`[^`]+`|\[[^\]]+\]|[A-Za-z_][\w$]*)\s*\.\s*\*$/u);
    const qualifier = match?.groups?.alias;
    return qualifier ? { sourceNodeAlias: stripIdentifierQuotes(qualifier) } : null;
}
|
|
369
|
+
/**
 * Renders a list of values as one comma-separated string.
 *
 * @param {Array} values - Values to list.
 * @returns {string} The values joined with ", ".
 */
function listToQuestion(values) {
    const separator = ", ";
    return values.join(separator);
}
|
|
372
|
+
/**
 * Parses the top-level projection list of `sql` into planned select items,
 * resolving each item's source against the supplied predecessor refs.
 *
 * Each raw select item yields exactly one entry:
 *  - a wildcard (`*` / `alias.*`): supported when its source ref can be
 *    determined; its `sourceColumnName` is "*" (expansion happens later,
 *    once predecessor nodes have been fetched);
 *  - a computed expression: supported only when it carries an alias;
 *  - a direct column: supported when its alias matches a ref, or when it is
 *    unqualified and exactly one ref exists.
 * Unsupported entries carry a `reason`.
 *
 * @param {string} sql - SQL statement to analyse.
 * @param {{locationName: string, nodeName: string, alias: (string|null), nodeID: (string|null)}[]} refs
 *   Predecessor refs found in the statement.
 * @returns {{refs: Array, selectItems: Array<object>, warnings: string[]}}
 *   The same `refs` array, the parsed items and any warnings.
 */
function parseSqlSelectItems(sql, refs) {
    const warnings = [];
    // Refs are addressable by alias, or by node name when they have none.
    const refsByAlias = new Map(refs.map((ref) => [normalizeSqlIdentifier(ref.alias ?? ref.nodeName), ref]));
    const selectClause = extractSelectClause(sql);
    if (!selectClause) {
        return {
            refs,
            selectItems: [],
            warnings: ["Could not find a top-level SELECT ... FROM clause in the SQL."],
        };
    }
    const findRef = (alias) => refsByAlias.get(normalizeSqlIdentifier(alias)) ?? null;
    // Source fields of an item that is not tied to any predecessor.
    const noSource = {
        outputName: null,
        sourceNodeAlias: null,
        sourceNodeName: null,
        sourceNodeID: null,
        sourceColumnName: null,
    };
    const describeWildcard = (expression, wildcard) => {
        const alias = wildcard.sourceNodeAlias;
        if (alias === null && refs.length !== 1) {
            return {
                expression,
                ...noSource,
                kind: "expression",
                supported: false,
                reason: "Unqualified * is only supported when exactly one predecessor ref is present.",
            };
        }
        const ref = alias === null ? refs[0] ?? null : findRef(alias);
        if (!ref) {
            return {
                expression,
                ...noSource,
                sourceNodeAlias: alias,
                kind: "expression",
                supported: false,
                reason: "Wildcard source alias could not be resolved to a predecessor ref.",
            };
        }
        // Wildcards are expanded later after predecessor nodes are fetched.
        return {
            expression,
            outputName: null,
            sourceNodeAlias: alias ?? ref.alias ?? ref.nodeName,
            sourceNodeName: ref.nodeName,
            sourceNodeID: ref.nodeID,
            sourceColumnName: "*",
            kind: "expression",
            supported: true,
        };
    };
    const describeComputed = (expression, outputName) => {
        if (outputName === null) {
            return {
                expression,
                ...noSource,
                kind: "expression",
                supported: false,
                reason: "Computed expressions require an alias (e.g., CASE ... END AS column_name)",
            };
        }
        return {
            expression,
            ...noSource,
            outputName,
            kind: "expression",
            supported: true,
        };
    };
    const describeColumn = (expression, outputName, directColumn) => {
        const { sourceNodeAlias, sourceColumnName } = directColumn;
        let ref;
        if (sourceNodeAlias !== null) {
            ref = findRef(sourceNodeAlias);
        }
        else {
            // An unqualified column is only unambiguous with a single predecessor.
            ref = refs.length === 1 ? refs[0] ?? null : null;
        }
        if (!ref) {
            return {
                expression,
                outputName: outputName ?? sourceColumnName,
                sourceNodeAlias,
                sourceNodeName: null,
                sourceNodeID: null,
                sourceColumnName,
                kind: "column",
                supported: false,
                reason: sourceNodeAlias === null
                    ? "Unqualified columns are only supported when exactly one predecessor ref is present."
                    : `The source alias ${sourceNodeAlias} did not match a predecessor ref.`,
            };
        }
        return {
            expression,
            outputName: outputName ?? sourceColumnName,
            sourceNodeAlias: sourceNodeAlias ?? ref.alias ?? ref.nodeName,
            sourceNodeName: ref.nodeName,
            sourceNodeID: ref.nodeID,
            sourceColumnName,
            kind: "column",
            supported: true,
        };
    };
    const selectItems = splitTopLevel(selectClause, ",").map((rawItem) => {
        const { expression, outputName } = splitExpressionAlias(rawItem);
        const wildcard = parseWildcardExpression(expression);
        if (wildcard) {
            return describeWildcard(expression, wildcard);
        }
        const directColumn = parseDirectColumnExpression(expression);
        return directColumn
            ? describeColumn(expression, outputName, directColumn)
            : describeComputed(expression, outputName);
    });
    if (selectItems.length === 0) {
        warnings.push("The SQL SELECT clause did not produce any supported projected columns.");
    }
    return { refs, selectItems, warnings };
}
|
|
504
|
+
/**
 * Fetches every node of a workspace by following the list endpoint's
 * `next` cursor until no further cursor is returned.
 *
 * Entries that are not objects with string `id` and `name` are skipped.
 *
 * @param {object} client - API client passed through to `listWorkspaceNodes`.
 * @param {string} workspaceID - Workspace whose nodes are listed.
 * @returns {Promise<{id: string, name: string, nodeType: (string|null), locationName: (string|null)}[]>}
 * @throws {Error} When a page is not an object, or when the API hands back a
 *   cursor that was already returned by an earlier page (guards against an
 *   endless loop).
 */
async function listAllWorkspaceNodes(client, workspaceID) {
    const collected = [];
    const cursorsSeen = new Set();
    let cursor;
    do {
        const request = {
            workspaceID,
            limit: WORKSPACE_NODE_PAGE_LIMIT,
            orderBy: "id",
        };
        if (cursor) {
            request.startingFrom = cursor;
        }
        const page = await listWorkspaceNodes(client, request);
        if (!isPlainObject(page)) {
            throw new Error("Workspace node list response was not an object");
        }
        const rows = Array.isArray(page.data) ? page.data : [];
        for (const row of rows) {
            const isUsable = isPlainObject(row) && typeof row.id === "string" && typeof row.name === "string";
            if (!isUsable) {
                continue;
            }
            collected.push({
                id: row.id,
                name: row.name,
                nodeType: typeof row.nodeType === "string" ? row.nodeType : null,
                locationName: typeof row.locationName === "string" ? row.locationName : null,
            });
        }
        // Only a non-blank string counts as a continuation cursor.
        const hasCursor = typeof page.next === "string" && page.next.trim().length > 0;
        cursor = hasCursor ? page.next : undefined;
        if (cursor !== undefined) {
            if (cursorsSeen.has(cursor)) {
                throw new Error(`Workspace node pagination repeated cursor ${cursor}`);
            }
            cursorsSeen.add(cursor);
        }
    } while (cursor);
    return collected;
}
|
|
546
|
+
/**
 * Reads a node's `locationName`.
 *
 * @param {object} node - Workspace node (or list entry).
 * @returns {string | null} The location name as stored (not trimmed), or
 *   `null` when it is missing, not a string, or blank.
 */
function getNodeLocationName(node) {
    const { locationName } = node;
    const isUsable = typeof locationName === "string" && locationName.trim().length > 0;
    return isUsable ? locationName : null;
}
|
|
552
|
+
/**
 * Resolves each parsed SQL ref to a workspace node ID and loads the matching
 * predecessor nodes.
 *
 * Resolution per ref (names and locations are compared via
 * `normalizeSqlIdentifier`):
 *  1. no node with that name: open question;
 *  2. exactly one listed node with the same name and location: resolved;
 *  3. several such nodes: open question;
 *  4. a single same-named node (location not confirmed by the listing):
 *     tentatively resolved, verified in the second pass;
 *  5. several same-named nodes: each is fetched and its location compared.
 *
 * Second pass: every resolved ref's node is fetched; if the fetched node
 * reports a different location the ref is un-resolved again, otherwise the
 * node is stored under its ID.
 *
 * Note: `ref.nodeID` is written on the caller's ref objects.
 *
 * @param {object} client - API client.
 * @param {string} workspaceID - Workspace to search.
 * @param {{locationName: string, nodeName: string, alias: (string|null), nodeID: (string|null)}[]} refs
 * @returns {Promise<{refs: Array, openQuestions: string[], warnings: string[], predecessorNodes: Object<string, object>}>}
 */
async function resolveSqlRefsToWorkspaceNodes(client, workspaceID, refs) {
    const warnings = [];
    const openQuestions = [];
    const predecessorNodes = {};
    const outcome = { refs, openQuestions, warnings, predecessorNodes };
    if (refs.length === 0) {
        openQuestions.push("Which upstream Coalesce node(s) should this pipeline build from? Add {{ ref('LOCATION', 'NODE') }} references to the SQL or provide sourceNodeIDs.");
        return outcome;
    }
    const workspaceNodes = await listAllWorkspaceNodes(client, workspaceID);
    // Index the listing by normalized node name; one name may map to several nodes.
    const candidatesByName = new Map();
    for (const workspaceNode of workspaceNodes) {
        const key = normalizeSqlIdentifier(workspaceNode.name);
        if (!candidatesByName.has(key)) {
            candidatesByName.set(key, []);
        }
        candidatesByName.get(key).push(workspaceNode);
    }
    const isRequestedLocation = (locationName, ref) => normalizeSqlIdentifier(locationName) === normalizeSqlIdentifier(ref.locationName);
    const ambiguousMessage = (ref) => `Multiple workspace nodes matched the SQL ref ${ref.locationName}.${ref.nodeName}. Resolve the exact node before creation.`;
    for (const ref of refs) {
        const candidates = candidatesByName.get(normalizeSqlIdentifier(ref.nodeName)) ?? [];
        if (candidates.length === 0) {
            openQuestions.push(`Could not resolve the SQL ref ${ref.locationName}.${ref.nodeName} to a workspace node ID in workspace ${workspaceID}.`);
            continue;
        }
        const listedInLocation = candidates.filter((entry) => entry.locationName && isRequestedLocation(entry.locationName, ref));
        if (listedInLocation.length === 1) {
            ref.nodeID = listedInLocation[0]?.id ?? null;
            continue;
        }
        if (listedInLocation.length > 1) {
            openQuestions.push(ambiguousMessage(ref));
            continue;
        }
        if (candidates.length === 1) {
            // Single name match; its location is checked in the second pass.
            ref.nodeID = candidates[0]?.id ?? null;
            continue;
        }
        // Several same-named nodes and the listing could not tell them apart:
        // fetch each one and compare the location it reports.
        const fetched = await Promise.all(candidates.map(async (match) => {
            const node = await getWorkspaceNode(client, {
                workspaceID,
                nodeID: match.id,
            });
            return {
                match,
                node: isPlainObject(node) ? node : null,
            };
        }));
        const confirmed = fetched.filter(({ node }) => {
            if (!node) {
                return false;
            }
            const locationName = getNodeLocationName(node);
            return Boolean(locationName) && isRequestedLocation(locationName, ref);
        });
        if (confirmed.length === 1) {
            ref.nodeID = confirmed[0]?.match.id ?? null;
        }
        else if (confirmed.length > 1) {
            openQuestions.push(ambiguousMessage(ref));
        }
        else {
            openQuestions.push(`Workspace nodes named ${ref.nodeName} were found, but none matched the requested location ${ref.locationName}.`);
        }
    }
    for (const ref of refs) {
        if (!ref.nodeID) {
            continue;
        }
        const predecessor = await getWorkspaceNode(client, {
            workspaceID,
            nodeID: ref.nodeID,
        });
        if (!isPlainObject(predecessor)) {
            warnings.push(`Resolved predecessor ${ref.nodeName} did not return an object body.`);
            continue;
        }
        const actualLocation = getNodeLocationName(predecessor);
        const isWrongLocation = actualLocation && !isRequestedLocation(actualLocation, ref);
        if (isWrongLocation) {
            ref.nodeID = null;
            openQuestions.push(`Resolved node ${ref.nodeName} is in location ${actualLocation}, not the requested location ${ref.locationName}.`);
            continue;
        }
        predecessorNodes[ref.nodeID] = predecessor;
    }
    return outcome;
}
|
|
637
|
+
/**
 * Lists the column names found under `node.metadata.columns`.
 *
 * @param {object} node - Workspace node body.
 * @returns {string[]} Names of all column entries that are plain objects
 *   with a string `name`, in order; empty when `metadata` is not a plain
 *   object or `columns` is not an array.
 */
export function getColumnNamesFromNode(node) {
    const columns = isPlainObject(node.metadata) ? node.metadata.columns : undefined;
    if (!Array.isArray(columns)) {
        return [];
    }
    return columns
        .filter((column) => isPlainObject(column) && typeof column.name === "string")
        .map((column) => column.name);
}
|
|
649
|
+
/**
 * Build one supported "column" select item for every column name that
 * `getColumnNamesFromNode` reports for the given source node.
 *
 * @param {string} sourceNodeID - ID stored on each item as `sourceNodeID`.
 * @param {string} sourceNodeName - Used both as the alias and as the node name.
 * @param {object} node - Source node body the column names are read from.
 * @returns {object[]} Select items, one per column, in column order.
 */
function buildSelectItemsFromSourceNode(sourceNodeID, sourceNodeName, node) {
    const items = [];
    for (const columnName of getColumnNamesFromNode(node)) {
        items.push({
            expression: `${sourceNodeName}.${columnName}`,
            outputName: columnName,
            sourceNodeAlias: sourceNodeName,
            sourceNodeName,
            sourceNodeID,
            sourceColumnName: columnName,
            kind: "column",
            supported: true,
        });
    }
    return items;
}
|
|
661
|
+
/**
 * Fetch each requested source node, one request at a time, and describe it
 * as a source ref.
 *
 * A node is skipped (with an open question) when its body is not a plain
 * object or it has no non-blank string name. A node without a location name
 * is still recorded, using "UNKNOWN_LOCATION", and also raises an open question.
 *
 * @param {object} client - Passed through to `getWorkspaceNode`.
 * @param {string} workspaceID - Workspace that owns the nodes.
 * @param {Iterable<string>} sourceNodeIDs - Node IDs to fetch, in order.
 * @returns {Promise<{sourceRefs: object[], predecessorNodes: object, openQuestions: string[], warnings: string[]}>}
 *   `warnings` is always returned empty by this function.
 */
async function getSourceNodesByID(client, workspaceID, sourceNodeIDs) {
    const result = {
        sourceRefs: [],
        predecessorNodes: {},
        openQuestions: [],
        warnings: [],
    };
    for (const sourceNodeID of sourceNodeIDs) {
        const node = await getWorkspaceNode(client, { workspaceID, nodeID: sourceNodeID });
        if (!isPlainObject(node)) {
            result.openQuestions.push(`Could not read source node ${sourceNodeID} in workspace ${workspaceID}.`);
            continue;
        }
        const hasUsableName = typeof node.name === "string" && node.name.trim().length !== 0;
        if (!hasUsableName) {
            result.openQuestions.push(`Source node ${sourceNodeID} does not have a usable name.`);
            continue;
        }
        const locationName = getNodeLocationName(node);
        if (!locationName) {
            // The node is still usable; the caller is only asked to clarify the location.
            result.openQuestions.push(`Source node ${node.name} does not expose locationName. Clarify the Coalesce location before generating ref() SQL for this pipeline.`);
        }
        result.predecessorNodes[sourceNodeID] = node;
        result.sourceRefs.push({
            locationName: locationName ?? "UNKNOWN_LOCATION",
            nodeName: node.name,
            alias: node.name,
            nodeID: sourceNodeID,
        });
    }
    return result;
}
|
|
698
|
+
/**
 * Replace supported wildcard select items (`sourceColumnName === "*"`) with
 * one item per column of the predecessor node they refer to.
 *
 * Non-wildcard items and items already marked unsupported are passed through
 * untouched. A wildcard that cannot be expanded is kept but marked
 * `supported: false` with a `reason`.
 *
 * @param {object[]} selectItems - Parsed select items.
 * @param {object[]} refs - Source refs; matched by `nodeID`, or by alias/node name.
 * @param {object} predecessorNodes - Node bodies keyed by node ID.
 * @returns {object[]} The select list with wildcards expanded in place.
 */
function expandWildcardSelectItems(selectItems, refs, predecessorNodes) {
    const markUnsupported = (item, reason) => ({ ...item, supported: false, reason });
    // Prefer an explicit node ID; otherwise match the wildcard's alias against each ref.
    const resolveRef = (item) => {
        if (item.sourceNodeID) {
            return refs.find((candidate) => candidate.nodeID === item.sourceNodeID) ?? null;
        }
        return (refs.find((candidate) => normalizeSqlIdentifier(candidate.alias ?? candidate.nodeName) ===
            normalizeSqlIdentifier(item.sourceNodeAlias ?? "")) ?? null);
    };
    return selectItems.flatMap((item) => {
        const isExpandableWildcard = item.sourceColumnName === "*" && item.supported;
        if (!isExpandableWildcard) {
            return [item];
        }
        const ref = resolveRef(item);
        if (!ref?.nodeID) {
            return [markUnsupported(item, "Wildcard source could not be resolved to a concrete predecessor node.")];
        }
        const predecessor = predecessorNodes[ref.nodeID];
        if (!predecessor) {
            return [markUnsupported(item, "Wildcard source predecessor body was not available for column expansion.")];
        }
        const columnNames = getColumnNamesFromNode(predecessor);
        if (columnNames.length === 0) {
            return [markUnsupported(item, "Wildcard source predecessor has no columns to expand.")];
        }
        const hasAlias = Boolean(item.sourceNodeAlias && item.sourceNodeAlias.length > 0);
        return columnNames.map((columnName) => ({
            expression: hasAlias ? `${item.sourceNodeAlias}.${columnName}` : columnName,
            outputName: columnName,
            sourceNodeAlias: item.sourceNodeAlias,
            sourceNodeName: item.sourceNodeName,
            sourceNodeID: ref.nodeID,
            sourceColumnName: columnName,
            kind: "column",
            supported: true,
        }));
    });
}
|
|
752
|
+
/**
 * Pick the name prefix for a generated node.
 *
 * A non-blank `shortName` wins: it is trimmed, upper-cased, and every run of
 * characters outside A-Z / 0-9 becomes a single underscore. Otherwise the
 * prefix is looked up from the node type family, defaulting to "NODE".
 *
 * @param {string|null|undefined} nodeTypeFamily - e.g. "stage", "dimension".
 * @param {string|null|undefined} shortName - Optional explicit short name.
 * @returns {string} The prefix to use.
 */
function buildDefaultNodePrefix(nodeTypeFamily, shortName) {
    const cleanedShortName = shortName ? shortName.trim() : "";
    if (cleanedShortName.length > 0) {
        return cleanedShortName.toUpperCase().replace(/[^A-Z0-9]+/g, "_");
    }
    // A Map (not an object literal) so inherited keys such as "toString" never match.
    const familyPrefixes = new Map([
        ["stage", "STG"],
        ["persistent-stage", "PSTG"],
        ["view", "VW"],
        ["work", "WRK"],
        ["dimension", "DIM"],
        ["fact", "FACT"],
        ["hub", "HUB"],
        ["satellite", "SAT"],
        ["link", "LNK"],
    ]);
    return familyPrefixes.get(nodeTypeFamily) ?? "NODE";
}
|
|
779
|
+
/**
 * Choose a name for the node being planned.
 *
 * A non-blank `targetName` is returned trimmed. Otherwise the name is
 * `<prefix>_<first ref's node name>` upper-cased, where a leading layer
 * prefix (SRC, STG, DIM, FACT, FCT, INT, WORK, VW) is removed from the ref
 * name first. With no refs the name is `<prefix>_NEW_PIPELINE`.
 *
 * The layer prefix is only removed when it is followed by `_` or `-`, so
 * ordinary words that merely start with those letters (INTEREST, FACTORY,
 * WORKFLOW, DIMENSIONS) are left intact. If stripping would leave nothing,
 * the original ref name is used so the result never ends in a bare `_`.
 *
 * @param {string|null|undefined} targetName - Explicit name, if the caller gave one.
 * @param {object[]} refs - Source refs; only `refs[0].nodeName` is used.
 * @param {string|null} nodeTypeFamily - Forwarded to `buildDefaultNodePrefix`.
 * @param {string|null} shortName - Forwarded to `buildDefaultNodePrefix`.
 * @returns {string} The node name.
 */
function buildDefaultNodeName(targetName, refs, nodeTypeFamily, shortName) {
    if (targetName && targetName.trim().length > 0) {
        return targetName.trim();
    }
    const prefix = buildDefaultNodePrefix(nodeTypeFamily, shortName);
    const firstRef = refs[0];
    if (!firstRef) {
        return `${prefix}_NEW_PIPELINE`;
    }
    const stripped = firstRef.nodeName.replace(/^(?:SRC|STG|DIM|FACT|FCT|INT|WORK|VW)[_-]/i, "");
    const baseName = stripped.length > 0 ? stripped : firstRef.nodeName;
    return `${prefix}_${baseName}`.toUpperCase().replace(/__+/g, "_");
}
|
|
791
|
+
/**
 * Tell whether a requested node type appears among the observed ones.
 *
 * Two types match when they are identical strings, or when their bare IDs
 * match. The bare ID of "Package:::ID" is the segment right after the first
 * ":::"; a type without ":::" is its own bare ID.
 *
 * @param {string} requestedNodeType - Type the plan wants to use.
 * @param {string[]} observedNodeTypes - Types seen in the workspace.
 * @returns {boolean} True when at least one observed type matches.
 */
function matchesObservedNodeType(requestedNodeType, observedNodeTypes) {
    const SEPARATOR = ":::";
    const bareID = (nodeType) => {
        if (!nodeType.includes(SEPARATOR)) {
            return nodeType;
        }
        return nodeType.split(SEPARATOR)[1] ?? nodeType;
    };
    const requestedID = bareID(requestedNodeType);
    return observedNodeTypes.some((observed) => observed === requestedNodeType || bareID(observed) === requestedID);
}
|
|
803
|
+
/**
 * Fetch the node types observed in a workspace, best-effort.
 *
 * Any failure while fetching or reading the result is turned into an empty
 * inventory carrying a single warning, so planning can continue.
 *
 * @param {object} client - Passed through to `listWorkspaceNodeTypes`.
 * @param {string} workspaceID - Workspace to inspect.
 * @returns {Promise<{nodeTypes: string[], counts: object, total: number, warnings: string[]}>}
 */
async function getWorkspaceNodeTypeInventory(client, workspaceID) {
    try {
        // Destructuring stays inside the try so a null/undefined result is also handled below.
        const { nodeTypes, counts, total } = await listWorkspaceNodeTypes(client, { workspaceID });
        return {
            nodeTypes: nodeTypes ?? [],
            counts: counts ?? {},
            total: total ?? 0,
            warnings: [],
        };
    }
    catch {
        const fetchFailureWarning = `Observed workspace node types could not be fetched for workspace ${workspaceID}. Use list-workspace-node-types or cache-workspace-nodes to inspect current workspace usage and confirm installation before execution.`;
        return {
            nodeTypes: [],
            counts: {},
            total: 0,
            warnings: [fetchFailureWarning],
        };
    }
}
|
|
825
|
+
/**
 * Compare the node types a plan wants to use with those observed in the
 * workspace, and downgrade the plan when any of them has not been observed.
 *
 * Mutates `plan`: inventory warnings are always appended; when unobserved
 * types exist a further warning is appended and `plan.status` becomes
 * "needs_clarification". Nothing is checked when `inventory.total` is 0.
 *
 * @param {object} plan - Plan to update (`warnings`, `nodes`, `status`).
 * @param {{nodeTypes: string[], total: number, warnings: string[]}} inventory
 * @param {string|null|undefined} requestedNodeType - Extra type to check, if non-blank.
 * @returns {void}
 */
function applyWorkspaceNodeTypeValidation(plan, inventory, requestedNodeType) {
    plan.warnings.push(...inventory.warnings);
    if (inventory.total === 0) {
        return;
    }
    // A Set keeps first-seen order while dropping duplicates.
    const typesToCheck = new Set();
    for (const node of plan.nodes ?? []) {
        const { nodeType } = node;
        if (typeof nodeType === "string" && nodeType.length > 0) {
            typesToCheck.add(nodeType);
        }
    }
    if (requestedNodeType && requestedNodeType.trim().length > 0) {
        typesToCheck.add(requestedNodeType);
    }
    const missingTypes = [...typesToCheck].filter((nodeType) => !matchesObservedNodeType(nodeType, inventory.nodeTypes));
    if (missingTypes.length === 0) {
        return;
    }
    plan.warnings.push(`The following node types were not observed in current workspace nodes: ${missingTypes.join(", ")}. This observation is based on existing nodes, not a true installed-type registry. Confirm installation in Coalesce before creating nodes of these types.`);
    plan.status = "needs_clarification";
}
|
|
842
|
+
/**
 * Assemble a single-node pipeline plan from parsed SQL.
 *
 * The plan is "ready" only when the selected node type is auto-executable
 * (or none was selected), at least one predecessor node ID was resolved, at
 * least one supported output column exists, nothing in the select list is
 * unsupported, the parser raised no warnings, and no open questions remain.
 * Otherwise its status is "needs_clarification".
 *
 * Note: this function appends to the caller's `warnings` array.
 *
 * @param {object} params - Planner input (workspaceID, sql, selectedNodeType,
 *   nodeTypeSelection, targetName, targetNodeType, goal, description,
 *   configOverrides, location).
 * @param {object} parseResult - Parser output with `selectItems`, `refs`, `warnings`.
 * @param {object} predecessorNodes - Node bodies keyed by node ID, used for wildcard expansion.
 * @param {string[]} openQuestions - Questions gathered so far (copied, not mutated).
 * @param {string[]} warnings - Warnings gathered so far (mutated).
 * @returns {object} The plan object.
 */
function buildPlanFromSql(params, parseResult, predecessorNodes, openQuestions, warnings) {
    const selected = params.selectedNodeType;
    const nodeType = selected?.nodeType ?? params.targetNodeType ?? "Stage";
    const planOpenQuestions = Array.from(openQuestions);

    // Record how the node type was chosen and what still needs confirming.
    if (!selected) {
        warnings.push(`No ranked node type candidate was available, so planning fell back to ${nodeType}.`);
    }
    else if (!selected.autoExecutable) {
        warnings.push(`Planner selected node type ${nodeType}, but it likely needs additional semantic configuration before automatic creation.`);
        if (selected.semanticSignals.length > 0) {
            planOpenQuestions.push(`Confirm the required configuration for ${nodeType}: ${selected.semanticSignals.join(", ")}.`);
        }
        if (selected.missingDefaultFields.length > 0) {
            planOpenQuestions.push(`Provide values for ${nodeType} config fields without defaults: ${selected.missingDefaultFields.join(", ")}.`);
        }
    }

    // Expand wildcards, then surface every projection that could not be supported.
    const expandedSelectItems = expandWildcardSelectItems(parseResult.selectItems, parseResult.refs, predecessorNodes);
    const unsupportedItems = expandedSelectItems.filter((item) => !item.supported);
    for (const item of unsupportedItems) {
        const detail = item.reason ? item.reason : "unsupported SQL projection in v1";
        warnings.push(`${item.expression}: ${detail}`);
    }
    const supportedOutputColumnCount = expandedSelectItems.reduce((count, item) => (item.supported && item.outputName ? count + 1 : count), 0);

    const parserMissedSelect = parseResult.warnings.some((warning) => warning.includes("Could not find a top-level SELECT ... FROM clause"));
    if (parserMissedSelect) {
        planOpenQuestions.push("Provide a top-level SELECT ... FROM query using direct column projections before creating this pipeline.");
    }
    else if (supportedOutputColumnCount === 0) {
        planOpenQuestions.push("Specify at least one supported projected column before creating this pipeline.");
    }

    const predecessorNodeIDs = parseResult.refs.flatMap((ref) => (ref.nodeID ? [ref.nodeID] : []));
    const predecessorNodeNames = parseResult.refs.map((ref) => ref.nodeName);
    const ready = (selected?.autoExecutable ?? true) &&
        predecessorNodeIDs.length > 0 &&
        supportedOutputColumnCount > 0 &&
        unsupportedItems.length === 0 &&
        parseResult.warnings.length === 0 &&
        planOpenQuestions.length === 0;
    const name = buildDefaultNodeName(params.targetName, parseResult.refs, selected?.family ?? null, selected?.shortName ?? null);

    const planNode = {
        planNodeID: "node-1",
        name,
        nodeType,
        nodeTypeFamily: selected?.family ?? null,
        predecessorNodeIDs,
        predecessorPlanNodeIDs: [],
        predecessorNodeNames,
        description: params.description ?? null,
        sql: params.sql,
        selectItems: expandedSelectItems,
        outputColumnNames: expandedSelectItems.flatMap((item) => (item.outputName ? [item.outputName] : [])),
        configOverrides: params.configOverrides ? deepClone(params.configOverrides) : {},
        sourceRefs: parseResult.refs.map(({ locationName, nodeName, alias, nodeID }) => ({
            locationName,
            nodeName,
            alias,
            nodeID,
        })),
        joinCondition: extractFromClause(params.sql),
        location: params.location ?? {},
        requiresFullSetNode: true,
        // templateDefaults is only present on the node when the selected type supplies it.
        ...(selected?.templateDefaults ? { templateDefaults: selected.templateDefaults } : {}),
    };

    return {
        version: 1,
        intent: "sql",
        status: ready ? "ready" : "needs_clarification",
        workspaceID: params.workspaceID,
        platform: null,
        goal: params.goal ?? null,
        sql: params.sql,
        nodes: [planNode],
        assumptions: [
            `Planner ${params.nodeTypeSelection.strategy} selected ${nodeType} from repo/workspace candidates.`,
            "The generated plan uses create-workspace-node-from-predecessor followed by set-workspace-node when the selected type is projection-capable.",
        ],
        openQuestions: planOpenQuestions,
        warnings: [...parseResult.warnings, ...warnings],
        supportedNodeTypes: params.nodeTypeSelection.supportedNodeTypes.length > 0
            ? params.nodeTypeSelection.supportedNodeTypes
            : [nodeType],
        nodeTypeSelection: params.nodeTypeSelection,
    };
}
|
|
931
|
+
/**
 * Extract CTEs with their bodies from SQL.
 * Handles nested parentheses to find each CTE body.
 *
 * A CTE header is `WITH name AS (`, `WITH RECURSIVE name AS (`, or
 * `, name AS (` (whitespace after the comma is optional). Names may be bare,
 * double-quoted, or backtick-quoted, and are reported upper-cased.
 *
 * @param {string} sql - Full SQL statement.
 * @returns {object[]} One entry per CTE header found, in source order, with
 *   `name`, `body`, `columns`, `whereClause`, `sourceTable`, `hasGroupBy`
 *   and `hasJoin`. Empty when the statement does not start with WITH. A CTE
 *   whose parentheses never close is reported with an empty body.
 */
function extractCtes(sql) {
    const trimmed = sql.trim();
    if (!/^with\b/i.test(trimmed)) {
        return [];
    }
    const ctes = [];
    // Match CTE header: WITH [RECURSIVE] name AS (  or  , name AS (
    const pattern = /(?:^with(?:\s+recursive)?\s+|,\s*)([A-Za-z_][\w$]*|"[^"]+"|`[^`]+`)\s+AS\s*\(/gi;
    for (const match of trimmed.matchAll(pattern)) {
        const rawName = match[1] ? stripIdentifierQuotes(match[1]) : null;
        if (!rawName) {
            continue;
        }
        const name = rawName.toUpperCase();
        // The CTE body starts right after the opening '(' of "AS (".
        const bodyStart = match.index + match[0].length;
        const body = extractParenBody(trimmed, bodyStart);
        if (!body) {
            ctes.push({ name, body: "", columns: [], whereClause: null, sourceTable: null, hasGroupBy: false, hasJoin: false });
            continue;
        }
        ctes.push({
            name,
            body,
            columns: parseCteColumns(body),
            whereClause: extractCteWhereClause(body),
            sourceTable: extractCteSourceTable(body),
            hasGroupBy: /\bGROUP\s+BY\b/i.test(body),
            hasJoin: /\bJOIN\b/i.test(body),
        });
    }
    return ctes;
}
|
|
967
|
+
/**
 * Extract the body between balanced parentheses.
 * `startIndex` should be the position right after the opening '('.
 *
 * Parentheses are ignored inside single-quoted strings, double-quoted or
 * backtick-quoted identifiers, `--` line comments and block comments.
 * A doubled quote inside a quoted region ('it''s') is read as two adjacent
 * regions, which yields the same result.
 *
 * @param {string} sql - Text to scan.
 * @param {number} startIndex - Index of the first character inside the parentheses.
 * @returns {string|null} The trimmed text up to the matching ')', or null
 *   when the parenthesis is never closed.
 */
function extractParenBody(sql, startIndex) {
    let depth = 1;
    let i = startIndex;
    while (i < sql.length) {
        const ch = sql[i];
        if (ch === "'" || ch === '"' || ch === "`") {
            // Jump to the matching quote; an unterminated quote means no balanced body.
            const closing = sql.indexOf(ch, i + 1);
            if (closing < 0) {
                return null;
            }
            i = closing + 1;
            continue;
        }
        if (ch === "-" && sql[i + 1] === "-") {
            // Line comment: nothing up to the end of the line counts.
            const lineEnd = sql.indexOf("\n", i + 2);
            if (lineEnd < 0) {
                return null;
            }
            i = lineEnd + 1;
            continue;
        }
        if (ch === "/" && sql[i + 1] === "*") {
            const commentEnd = sql.indexOf("*/", i + 2);
            if (commentEnd < 0) {
                return null;
            }
            i = commentEnd + 2;
            continue;
        }
        if (ch === "(") {
            depth++;
        }
        else if (ch === ")") {
            depth--;
            if (depth === 0) {
                return sql.slice(startIndex, i).trim();
            }
        }
        i++;
    }
    return null;
}
|
|
998
|
+
/**
 * Turn the SELECT list of a CTE body into output columns, flagging which
 * ones need a transform.
 *
 * When the list is a lone `*`, the columns come from the subquery in the
 * FROM clause (recursively); if there is no such subquery the result is empty.
 * Wildcard items inside a longer list, and items with no usable column name,
 * are left out.
 *
 * @param {string} body - SQL inside the CTE's parentheses.
 * @returns {{outputName: string, expression: string, isTransform: boolean}[]}
 *   Columns in SELECT order; `outputName` is upper-cased.
 */
function parseCteColumns(body) {
    const selectClause = extractSelectClause(body);
    if (!selectClause) {
        return [];
    }
    const rawItems = splitTopLevel(selectClause, ",");
    const isLoneStar = rawItems.length === 1 && rawItems[0].trim() === "*";
    if (isLoneStar) {
        // "SELECT * FROM (subquery)": the real column list lives in the subquery.
        const subqueryBody = extractSubqueryFromFrom(body);
        return subqueryBody ? parseCteColumns(subqueryBody) : [];
    }
    return rawItems.flatMap((rawItem) => {
        const { expression, outputName } = splitExpressionAlias(rawItem);
        const trimmedExpr = expression.trim();
        if (trimmedExpr === "*" || trimmedExpr.endsWith(".*")) {
            return [];
        }
        const bareColName = extractBareColumnName(trimmedExpr)?.toUpperCase() ?? null;
        const colName = outputName?.toUpperCase() ?? bareColName;
        if (!colName) {
            return [];
        }
        // A rename (alias differs from the source column) counts as a transform so
        // that preserveColumnLinkage can match by the NEW name and carry the
        // expression into sources[*].transform.
        const isRename = outputName !== null && bareColName !== null && outputName.toUpperCase() !== bareColName;
        const isTransform = !isSimpleColumnRef(trimmedExpr) || isRename;
        return [{ outputName: colName, expression: trimmedExpr, isTransform }];
    });
}
|
|
1041
|
+
/**
 * Return the SQL of the subquery that directly follows the top-level FROM.
 *
 * @param {string} sql - Query to inspect.
 * @returns {string|null} The text inside `FROM ( ... )`, or null when there
 *   is no top-level FROM or it is not immediately followed by '('.
 */
function extractSubqueryFromFrom(sql) {
    const fromIndex = findTopLevelKeywordIndex(sql, "from");
    if (fromIndex >= 0) {
        const afterFrom = sql.slice(fromIndex + "from".length).trimStart();
        if (afterFrom.startsWith("(")) {
            // Index 1 is the first character after the opening parenthesis.
            return extractParenBody(afterFrom, 1);
        }
    }
    return null;
}
|
|
1054
|
+
/**
 * Decide whether an expression is a plain column reference, i.e. one that
 * needs no transform.
 *
 * Accepted shapes: `col`, `"col"`, `table.col`, `table."col"`, `"table"."col"`.
 * Anything else, including references with more than one dot, is not simple.
 *
 * @param {string} expr - Expression text; surrounding whitespace is ignored.
 * @returns {boolean} True for a bare or singly-qualified identifier.
 */
function isSimpleColumnRef(expr) {
    // One identifier: bare (letters, digits, _ and $) or double-quoted.
    const identifier = String.raw`(?:[A-Za-z_][\w$]*|"[^"]+")`;
    const simpleRef = new RegExp(`^${identifier}(?:\\.${identifier})?$`);
    return simpleRef.test(expr.trim());
}
|
|
1062
|
+
/**
 * Pull the trailing identifier out of an expression such as `table.col` or
 * `col`, with any identifier quotes removed.
 *
 * @param {string} expr - Expression text; surrounding whitespace is ignored.
 * @returns {string|null} The unquoted identifier the expression ends with,
 *   or null when it does not end with an identifier.
 */
function extractBareColumnName(expr) {
    const trailingIdentifier = /(?:.*\.)?([A-Za-z_][\w$]*|"[^"]+")$/;
    const captured = trailingIdentifier.exec(expr.trim())?.[1];
    return captured ? stripIdentifierQuotes(captured) : null;
}
|
|
1071
|
+
/**
 * Extract WHERE clause from a CTE body (ignoring subqueries).
 *
 * Only the top level of the body is searched: text inside parentheses and
 * inside quoted regions ('...', "...", `...`) is never read as a keyword.
 * The clause runs from the first top-level WHERE to the first top-level
 * GROUP BY, ORDER BY, HAVING, LIMIT or QUALIFY, or to the end of the body.
 * This keeps subqueries and string literals inside the clause intact.
 *
 * @param {string} body - SQL inside the CTE's parentheses.
 * @returns {string|null} The trimmed condition text (without the WHERE
 *   keyword), or null when there is no top-level WHERE or it is empty.
 */
function extractCteWhereClause(body) {
    // Build a same-length copy in which nested and quoted text is blanked out,
    // so keyword positions found in the copy map straight back onto `body`.
    let topLevel = "";
    let depth = 0;
    let i = 0;
    while (i < body.length) {
        const ch = body[i];
        if (ch === "'" || ch === '"' || ch === "`") {
            const closing = body.indexOf(ch, i + 1);
            const end = closing < 0 ? body.length : closing + 1;
            topLevel += " ".repeat(end - i);
            i = end;
            continue;
        }
        if (ch === "(") {
            depth++;
            topLevel += " ";
        }
        else if (ch === ")") {
            if (depth > 0) {
                depth--;
            }
            topLevel += " ";
        }
        else {
            topLevel += depth > 0 ? " " : ch;
        }
        i++;
    }
    const whereMatch = /(?:^|\s)WHERE(?=\s|$)/i.exec(topLevel);
    if (!whereMatch) {
        return null;
    }
    const clauseStart = whereMatch.index + whereMatch[0].length;
    // Extract until GROUP BY, ORDER BY, HAVING, LIMIT, QUALIFY, or end.
    const terminatorOffset = topLevel
        .slice(clauseStart)
        .search(/\b(?:GROUP\s+BY|ORDER\s+BY|HAVING|LIMIT|QUALIFY)\b/i);
    const clauseEnd = terminatorOffset >= 0 ? clauseStart + terminatorOffset : body.length;
    const clause = body.slice(clauseStart, clauseEnd).trim();
    return clause || null;
}
|
|
1098
|
+
/** Upper-case names of the SQL functions that are treated as aggregates. */
const AGGREGATE_FUNCTIONS = new Set([
    // Basic aggregates
    "COUNT",
    "SUM",
    "AVG",
    "MIN",
    "MAX",
    // Collection and statistical aggregates
    "LISTAGG",
    "ARRAY_AGG",
    "MEDIAN",
    "MODE",
    "STDDEV",
    "VARIANCE",
    "ANY_VALUE",
    // Conditional aggregates
    "COUNT_IF",
    "SUM_IF",
    "AVG_IF",
    // Approximate aggregates
    "APPROX_COUNT_DISTINCT",
    "HLL",
]);
/**
 * Check a function name against the aggregate list, ignoring case.
 *
 * @param {string} name - Function name as written in the SQL.
 * @returns {boolean} True when the name is a known aggregate function.
 */
function isAggregateFn(name) {
    const normalizedName = name.toUpperCase();
    return AGGREGATE_FUNCTIONS.has(normalizedName);
}
|
|
1108
|
+
/**
 * Extract the main source table from a CTE body's FROM clause.
 *
 * The top-level FROM is preferred, so a `FROM` that is part of a function
 * call in the select list (`EXTRACT(YEAR FROM d)`, `TRIM(x FROM y)`) or that
 * sits inside a string literal is not mistaken for the source. When the top
 * level yields no table name (for example `FROM (subquery)`), the first
 * `FROM <name>` anywhere in the body is used instead.
 *
 * @param {string} body - SQL inside the CTE's parentheses.
 * @returns {string|null} Upper-cased, possibly dot-qualified table name, or
 *   null when no `FROM <identifier>` is found.
 */
function extractCteSourceTable(body) {
    const tablePattern = /\bFROM\s+([A-Za-z_][\w$.]*(?:\.[A-Za-z_][\w$]*)*)/i;
    // Copy of the body with nested text blanked and quoted text replaced by quote
    // characters; the parentheses themselves are kept so that "FROM (" cannot
    // accidentally match whatever follows the subquery.
    let topLevel = "";
    let depth = 0;
    let i = 0;
    while (i < body.length) {
        const ch = body[i];
        if (ch === "'" || ch === '"' || ch === "`") {
            const closing = body.indexOf(ch, i + 1);
            const end = closing < 0 ? body.length : closing + 1;
            topLevel += "'".repeat(end - i);
            i = end;
            continue;
        }
        if (ch === "(") {
            depth++;
            topLevel += ch;
        }
        else if (ch === ")") {
            if (depth > 0) {
                depth--;
            }
            topLevel += ch;
        }
        else {
            topLevel += depth > 0 ? " " : ch;
        }
        i++;
    }
    const match = tablePattern.exec(topLevel) ?? tablePattern.exec(body);
    return match?.[1]?.toUpperCase() ?? null;
}
|
|
1115
|
+
/**
 * Name the pattern a CTE follows, which decides the node type used for it.
 * GROUP BY takes precedence over JOIN.
 *
 * @param {{hasGroupBy: boolean, hasJoin: boolean}} cte - Parsed CTE.
 * @returns {"aggregation"|"multiSource"|"staging"} The pattern name.
 */
function classifyCtePattern(cte) {
    return cte.hasGroupBy ? "aggregation" : cte.hasJoin ? "multiSource" : "staging";
}
|
|
1125
|
+
/**
 * Render the instruction text for one CTE: a heading followed by bullet
 * lines naming the node type, source, creation approach, transforms,
 * passthrough columns, columns to keep, WHERE filter and JOIN handling.
 * Bullets whose data is missing or empty are left out.
 *
 * @param {object} cte - Parsed CTE (`name`, `sourceTable`, `columns`,
 *   `whereClause`, `hasGroupBy`, `hasJoin`).
 * @param {string} nodeType - Node type chosen for this CTE.
 * @returns {string} The lines joined with "\n".
 */
function buildCteNodeInstruction(cte, nodeType) {
    const namesOf = (cols) => cols.map((c) => c.outputName).join(", ");
    const transformCols = cte.columns.filter((c) => c.isTransform);
    const passthroughCols = cte.columns.filter((c) => !c.isTransform);
    const hasColumns = cte.columns.length > 0;

    // Aggregations get their own hint; otherwise the hint appears only when columns exist.
    let creationHint = null;
    if (cte.hasGroupBy) {
        creationHint = `- AGGREGATION NODE: pass groupByColumns + aggregates directly to create-workspace-node-from-predecessor (single call)`;
    }
    else if (hasColumns) {
        creationHint = `- Pass columns array + whereCondition directly to create-workspace-node-from-predecessor (single call)`;
    }

    const sections = [
        `## ${cte.name}`,
        `- nodeType: "${nodeType}"`,
        ...(cte.sourceTable ? [`- source: ${cte.sourceTable}`] : []),
        ...(creationHint !== null ? [creationHint] : []),
        ...(transformCols.length > 0
            ? [`- Column transforms:`, ...transformCols.map((col) => ` - ${col.outputName}: ${col.expression}`)]
            : []),
        ...(passthroughCols.length > 0 ? [`- Passthrough columns: ${namesOf(passthroughCols)}`] : []),
        ...(hasColumns ? [`- ONLY keep these ${cte.columns.length} columns: ${namesOf(cte.columns)}`] : []),
        ...(cte.whereClause
            ? [`- WHERE filter (pass as whereCondition — do NOT construct {{ ref() }}): ${cte.whereClause}`]
            : []),
        ...(cte.hasJoin ? [`- Has JOIN — use apply-join-condition or update-workspace-node for join setup`] : []),
    ];
    return sections.join("\n");
}
|
|
1164
|
+
/**
 * When the user's SQL contains CTEs, return a plan that instructs the agent
 * to break each CTE into a separate Coalesce node using the declarative tools.
 * CTEs are not supported in Coalesce — each CTE should be its own node.
 *
 * The plan includes per-CTE structured data: column transforms, WHERE clauses,
 * source tables, and which columns to keep/remove. Its status is always
 * "needs_clarification" and its `nodes` list is empty; the per-CTE data is
 * carried in `cteNodeSummary` and in the open questions.
 *
 * A CTE depends on another one only when the other CTE's name occurs in its
 * body as a whole identifier, so a CTE named ORDERS is not reported as a
 * dependency of a body that only mentions ORDERS_ENRICHED.
 *
 * @param {object} params - Planner input; `workspaceID`, `goal` and `sql` are read.
 * @param {object[]} ctes - CTEs as produced by `extractCtes`.
 * @param {{staging: object, multiSource: object, aggregation: object}} nodeTypeSelections
 *   Node type selection per CTE pattern; multi-source and aggregation fall
 *   back to the staging type, which itself falls back to "Stage".
 * @returns {object} The plan object.
 */
function buildCtePlan(params, ctes, nodeTypeSelections) {
    const stagingType = nodeTypeSelections.staging.selectedNodeType ?? "Stage";
    const multiSourceType = nodeTypeSelections.multiSource.selectedNodeType ?? stagingType;
    const aggregationType = nodeTypeSelections.aggregation.selectedNodeType ?? stagingType;
    const typeMap = {
        staging: stagingType,
        multiSource: multiSourceType,
        aggregation: aggregationType,
    };
    // Whole-identifier test: the name must not be glued to other identifier characters.
    const escapeRegExp = (text) => text.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
    const mentionsName = (upperBody, name) => new RegExp(`(?<![\\w$])${escapeRegExp(name)}(?![\\w$])`).test(upperBody);
    // Classify each CTE and resolve its dependencies once; both outputs below reuse this.
    const analyzed = ctes.map((cte) => {
        const pattern = classifyCtePattern(cte);
        const upperBody = cte.body.toUpperCase();
        const dependsOn = ctes
            .filter((other) => other.name !== cte.name && mentionsName(upperBody, other.name))
            .map((other) => other.name);
        return { cte, pattern, nodeType: typeMap[pattern], dependsOn };
    });
    // Build per-CTE instructions
    const cteInstructions = analyzed.map(({ cte, nodeType }) => buildCteNodeInstruction(cte, nodeType));
    // Detect if any CTE references another CTE (pipeline dependency)
    const cteNameSet = new Set(ctes.map((c) => c.name));
    const cteDependencies = analyzed
        .filter(({ dependsOn }) => dependsOn.length > 0)
        .map(({ cte, dependsOn }) => `${cte.name} depends on: ${dependsOn.join(", ")}`);
    // Detect the final SELECT after all CTEs
    const finalSelectNote = extractFinalSelectFromCteQuery(params.sql ?? "", cteNameSet);
    const allTransformCount = ctes.reduce((sum, cte) => sum + cte.columns.filter((c) => c.isTransform).length, 0);
    const allFilterCount = ctes.filter((c) => c.whereClause).length;
    // Build structured per-CTE summary for easy agent consumption
    // Includes columnsParam / groupByColumnsParam / aggregatesParam for single-call creation
    const cteNodeSummary = analyzed.map(({ cte, pattern, nodeType, dependsOn }) => {
        const transforms = cte.columns.filter((c) => c.isTransform);
        const summary = {
            name: cte.name,
            nodeType,
            pattern,
            sourceTable: cte.sourceTable,
            columnCount: cte.columns.length,
            transforms: transforms.map((c) => ({ column: c.outputName, expression: c.expression })),
            passthroughColumns: cte.columns.filter((c) => !c.isTransform).map((c) => c.outputName),
            whereFilter: cte.whereClause,
            hasGroupBy: cte.hasGroupBy,
            hasJoin: cte.hasJoin,
            dependsOn,
        };
        // Add structured params for single-call creation
        if (cte.hasGroupBy && cte.columns.length > 0) {
            // GROUP BY CTEs: split columns into group-by (passthrough) and aggregates (transforms with agg functions)
            const groupByCols = [];
            const aggCols = [];
            for (const col of cte.columns) {
                // NOTE(review): this pattern also accepts compound expressions such as
                // "SUM(a) / SUM(b)" and would split them incorrectly — confirm whether
                // such expressions need separate handling.
                const aggMatch = col.expression.match(/^(\w+)\s*\((.*)\)$/s);
                if (col.isTransform && aggMatch && isAggregateFn(aggMatch[1])) {
                    aggCols.push({
                        name: col.outputName,
                        function: aggMatch[1].toUpperCase(),
                        expression: aggMatch[2].trim(),
                    });
                }
                else {
                    // Non-aggregate columns in a GROUP BY CTE are the GROUP BY dimensions
                    groupByCols.push(col.expression);
                }
            }
            if (groupByCols.length > 0 && aggCols.length > 0) {
                summary.groupByColumnsParam = groupByCols;
                summary.aggregatesParam = aggCols;
            }
        }
        else if (cte.columns.length > 0 && !cte.hasJoin) {
            // Only set columnsParam for single-source CTEs where expressions can be passed directly.
            // Multi-source JOIN CTEs have SQL aliases (soh.*, sl.*) that don't map to Coalesce node names —
            // the agent must translate these to "NODE_NAME"."COLUMN" format.
            summary.columnsParam = cte.columns.map((c) => ({
                name: c.outputName,
                ...(c.isTransform ? { transform: c.expression } : {}),
            }));
        }
        return summary;
    });
    return {
        version: 1,
        intent: "sql",
        status: "needs_clarification",
        STOP_AND_CONFIRM: `STOP. Present the pipeline summary to the user in a table format and ask for confirmation BEFORE creating any nodes. For EACH node in cteNodeSummary, display: name, the EXACT nodeType string (e.g. "Coalesce-Base-Node-Types:::Stage"), pattern, transforms, and whereFilter. Use the cteNodeSummary array — do NOT paraphrase or simplify the nodeType values. Do NOT proceed until the user explicitly approves.`,
        workspaceID: params.workspaceID,
        platform: null,
        goal: params.goal ?? null,
        sql: params.sql ?? null,
        nodes: [],
        cteNodeSummary,
        assumptions: [
            `Parsed ${ctes.length} CTEs with ${allTransformCount} column transforms and ${allFilterCount} WHERE filters.`,
            `Staging and aggregation CTEs: 1 call per node. Multi-source JOIN CTEs: 2 calls (create + apply-join-condition).`,
        ],
        openQuestions: [
            `STOP: Present this pipeline summary to the user and ask "Should I proceed with creating these ${ctes.length} nodes?" Do NOT create nodes until the user confirms.`,
            `This SQL uses CTEs (WITH ... AS), which Coalesce does not support as a single node. Each CTE must become a separate node.`,
            `--- PER-CTE INSTRUCTIONS ---\n\n${cteInstructions.join("\n\n")}`,
            ...(cteDependencies.length > 0
                ? [`CTE dependencies (create in order):\n${cteDependencies.map((d) => ` - ${d}`).join("\n")}`]
                : []),
            ...(finalSelectNote ? [finalSelectNote] : []),
            `Node type guidance (do NOT use list-workspace-node-types):\n` +
                `- Staging CTEs (single-source): nodeType "${stagingType}"\n` +
                `- Join/transform CTEs (multi-source): nodeType "${multiSourceType}"\n` +
                `- Aggregation CTEs (GROUP BY): nodeType "${aggregationType}"`,
            `Workflow per CTE:\n` +
                `create-workspace-node-from-predecessor accepts columns, whereCondition, groupByColumns, and aggregates directly:\n` +
                `- For staging/transform CTEs (single-source): 1 call — pass columns (from cteNodeSummary.columnsParam) + whereCondition\n` +
                `- For GROUP BY CTEs: 1 call — pass groupByColumns (from cteNodeSummary.groupByColumnsParam) + aggregates (from cteNodeSummary.aggregatesParam)\n` +
                `- For multi-source JOIN CTEs: 2 calls — first create-workspace-node-from-predecessor with columns + whereCondition, then apply-join-condition to set up FROM/JOIN/ON\n` +
                `- Do NOT construct {{ ref() }} syntax — the FROM clause and joins are auto-generated\n` +
                `- Pass repoPath to each call for automatic config completion`,
        ],
        warnings: [
            `SQL contains ${ctes.length} CTEs: ${ctes.map((c) => c.name).join(", ")}. Each must be a separate Coalesce node.` +
                (allTransformCount > 0 ? ` ${allTransformCount} column transforms detected.` : ``),
        ],
        supportedNodeTypes: nodeTypeSelections.staging.supportedNodeTypes.length > 0
            ? nodeTypeSelections.staging.supportedNodeTypes
            : [stagingType],
        nodeTypeSelection: nodeTypeSelections.staging,
    };
}
|
|
1304
|
+
/**
 * Return the index of the last character of the quoted literal, quoted
 * identifier, `--` line comment or block comment that starts at `start`,
 * or -1 when no such span starts there. Unterminated spans run to the end
 * of the text. A doubled quote character inside a quoted span is treated as
 * an escaped quote.
 */
function findSqlNonCodeSpanEnd(text, start) {
    const ch = text[start];
    if (ch === "'" || ch === '"') {
        let j = start + 1;
        while (j < text.length) {
            if (text[j] === ch) {
                if (text[j + 1] !== ch) {
                    return j;
                }
                j++; // doubled quote: step over the escape pair
            }
            j++;
        }
        return text.length - 1;
    }
    if (ch === "-" && text[start + 1] === "-") {
        const eol = text.indexOf("\n", start);
        return eol < 0 ? text.length - 1 : eol;
    }
    if (ch === "/" && text[start + 1] === "*") {
        const close = text.indexOf("*/", start + 2);
        return close < 0 ? text.length - 1 : close + 1;
    }
    return -1;
}
/**
 * Extract information about the final SELECT after all CTEs.
 *
 * Scans the trimmed SQL for the last SELECT keyword that sits at parenthesis
 * depth 0 (i.e. outside every CTE body / subquery) and reports which of the
 * given CTE names that trailing statement mentions.
 *
 * @param {string} sql - Full SQL text, including the CTE definitions.
 * @param {Iterable<string>} cteNames - Names of the CTEs declared in `sql`.
 * @returns {string | null} A guidance note about the final SELECT, or null
 *   when no top-level SELECT is found or it references none of the CTEs.
 */
function extractFinalSelectFromCteQuery(sql, cteNames) {
    const KEYWORD = "SELECT";
    const PRECEDING_BOUNDARY = /[\s,)]/;
    const WORD_CHAR = /[A-Za-z0-9_$]/;
    const trimmed = sql.trim();
    // Find last top-level SELECT
    let lastSelectIdx = -1;
    let depth = 0;
    for (let i = 0; i < trimmed.length; i++) {
        // Parentheses and keywords inside literals or comments are not structure.
        const spanEnd = findSqlNonCodeSpanEnd(trimmed, i);
        if (spanEnd >= 0) {
            i = spanEnd;
            continue;
        }
        const ch = trimmed[i];
        if (ch === "(") {
            depth++;
            continue;
        }
        if (ch === ")") {
            depth--;
            continue;
        }
        if (depth !== 0) {
            continue;
        }
        // Compare a six-character window instead of upper-casing the whole
        // string, so the index always refers to `trimmed` itself.
        if (trimmed.slice(i, i + KEYWORD.length).toUpperCase() !== KEYWORD) {
            continue;
        }
        const before = i > 0 ? trimmed[i - 1] : " ";
        const after = trimmed[i + KEYWORD.length];
        // Require a boundary on both sides so identifiers such as
        // `selected` or `select_flag` are not mistaken for the keyword.
        const endsKeyword = after === undefined || !WORD_CHAR.test(after);
        if (PRECEDING_BOUNDARY.test(before) && endsKeyword) {
            lastSelectIdx = i;
        }
    }
    if (lastSelectIdx < 0)
        return null;
    const finalSelect = trimmed.slice(lastSelectIdx).trim();
    // Check which CTEs the final SELECT references. Names are escaped so that
    // regex metacharacters in a name are matched literally instead of throwing.
    const referencedCtes = [...cteNames].filter((name) => {
        const escaped = String(name).replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
        return new RegExp(`\\b${escaped}\\b`, "i").test(finalSelect);
    });
    if (referencedCtes.length === 0)
        return null;
    // Check if the final SELECT is just `SELECT * FROM single_cte` — redundant
    const selectStarFromOne = referencedCtes.length === 1 &&
        /^SELECT\s+\*\s+FROM\s+\w+\s*;?\s*$/i.test(finalSelect);
    if (selectStarFromOne) {
        return (`Final SELECT is just \`SELECT * FROM ${referencedCtes[0]}\` — this is redundant. ` +
            `The last CTE node (${referencedCtes[0]}) already represents the final output. ` +
            `Do NOT create an additional node for this.`);
    }
    return (`Final output query references: ${referencedCtes.join(", ")}. ` +
        `Create a final node with these as predecessors. ` +
        `The final SELECT is:\n${finalSelect.slice(0, 500)}${finalSelect.length > 500 ? "..." : ""}`);
}
|
|
1346
|
+
/**
 * Record the follow-ups needed when the selected node type candidate is not
 * flagged as auto-executable: one warning, plus an open question for its
 * semantic signals and one for its config fields without defaults (each only
 * when non-empty). Does nothing when there is no candidate or it is
 * auto-executable. Mutates the `warnings` and `openQuestions` arrays.
 */
function appendManualConfigurationFollowUps(selectedCandidate, selectedNodeType, warnings, openQuestions) {
    if (!selectedCandidate || selectedCandidate.autoExecutable) {
        return;
    }
    warnings.push(`Planner selected node type ${selectedNodeType}, but it likely needs additional semantic configuration before automatic creation.`);
    if (selectedCandidate.semanticSignals.length > 0) {
        openQuestions.push(`Confirm the required configuration for ${selectedNodeType}: ${selectedCandidate.semanticSignals.join(", ")}.`);
    }
    if (selectedCandidate.missingDefaultFields.length > 0) {
        openQuestions.push(`Provide values for ${selectedNodeType} config fields without defaults: ${selectedCandidate.missingDefaultFields.join(", ")}.`);
    }
}
/**
 * Build a pipeline plan for a workspace from SQL, from source node IDs, or
 * from a goal alone.
 *
 * Paths, in order of precedence:
 *  1. `params.sql` is non-blank and contains CTEs: a CTE decomposition plan
 *     from `buildCtePlan`, with node types selected separately for the
 *     staging, multi-source and aggregation layers.
 *  2. `params.sql` is non-blank without CTEs: refs are parsed and resolved to
 *     workspace nodes, then `buildPlanFromSql` produces the plan.
 *  3. `params.sourceNodeIDs` is non-empty: exactly one resolved source yields
 *     a single pass-through node plan; any other count yields a
 *     "needs_clarification" scaffold with no select items.
 *  4. Otherwise: a "needs_clarification" plan with no nodes.
 *
 * Every returned plan is passed through `applyWorkspaceNodeTypeValidation`.
 *
 * @param client - API client handed to the workspace lookup helpers.
 * @param params - Planning request. Fields read here: workspaceID, sql, goal,
 *   sourceNodeIDs, targetName, targetNodeType, description, configOverrides,
 *   repoPath, locationName, database, schema.
 * @returns {Promise<object>} The plan object.
 */
export async function planPipeline(client, params) {
    // Only location fields that were actually supplied are carried into the plan.
    const location = {
        ...(params.locationName ? { locationName: params.locationName } : {}),
        ...(params.database ? { database: params.database } : {}),
        ...(params.schema ? { schema: params.schema } : {}),
    };
    const workspaceNodeTypeInventory = await getWorkspaceNodeTypeInventory(client, params.workspaceID);
    if (params.sql && params.sql.trim().length > 0) {
        // Detect CTEs — Coalesce does not support CTEs. Each CTE should be a separate node.
        const ctes = extractCtes(params.sql);
        if (ctes.length > 0) {
            // Evaluate each layer pattern independently.
            // Goals explicitly mention "batch ETL CTE decomposition" so that specialized
            // patterns (Dynamic Tables, Incremental, etc.) are properly excluded by the scorer.
            const sharedContext = {
                workspaceNodeTypes: workspaceNodeTypeInventory.nodeTypes,
                workspaceNodeTypeCounts: workspaceNodeTypeInventory.counts,
                repoPath: params.repoPath,
            };
            const userGoal = params.goal ? ` for ${params.goal}` : "";
            const stagingSelection = selectPipelineNodeType({
                ...sharedContext,
                explicitNodeType: params.targetNodeType,
                goal: `batch ETL CTE decomposition — staging layer${userGoal}. Use Stage or Work node type.`,
                sourceCount: 1,
                hasJoin: false,
                hasGroupBy: false,
            });
            const multiSourceSelection = selectPipelineNodeType({
                ...sharedContext,
                explicitNodeType: params.targetNodeType,
                goal: `batch ETL CTE decomposition — join transform${userGoal}. Use Stage, Work, or View node type.`,
                // NOTE(review): 3 looks like a representative multi-source count rather than a measured one — confirm.
                sourceCount: 3,
                hasJoin: true,
                hasGroupBy: false,
            });
            const aggregationSelection = selectPipelineNodeType({
                ...sharedContext,
                explicitNodeType: params.targetNodeType,
                goal: `batch ETL CTE decomposition — aggregation transform${userGoal}. Use Stage or Work node type.`,
                sourceCount: 1,
                hasJoin: false,
                hasGroupBy: true,
            });
            const ctePlan = buildCtePlan(params, ctes, {
                staging: stagingSelection.selection,
                multiSource: multiSourceSelection.selection,
                aggregation: aggregationSelection.selection,
            });
            applyWorkspaceNodeTypeValidation(ctePlan, workspaceNodeTypeInventory, params.targetNodeType);
            return ctePlan;
        }
        const parseResult = parseSqlSelectItems(params.sql, parseRefCalls(params.sql));
        const { refs, predecessorNodes, openQuestions, warnings, } = await resolveSqlRefsToWorkspaceNodes(client, params.workspaceID, parseResult.refs);
        const selectionResult = selectPipelineNodeType({
            explicitNodeType: params.targetNodeType,
            goal: params.goal,
            targetName: params.targetName,
            sql: params.sql,
            sourceCount: refs.length,
            workspaceNodeTypes: workspaceNodeTypeInventory.nodeTypes,
            workspaceNodeTypeCounts: workspaceNodeTypeInventory.counts,
            repoPath: params.repoPath,
        });
        const plan = buildPlanFromSql({
            workspaceID: params.workspaceID,
            goal: params.goal,
            sql: params.sql,
            targetName: params.targetName,
            description: params.description,
            targetNodeType: params.targetNodeType,
            configOverrides: params.configOverrides,
            nodeTypeSelection: selectionResult.selection,
            selectedNodeType: selectionResult.selectedCandidate,
            location,
        }, { ...parseResult, refs }, predecessorNodes, openQuestions, [...warnings, ...selectionResult.warnings]);
        applyWorkspaceNodeTypeValidation(plan, workspaceNodeTypeInventory, params.targetNodeType);
        return plan;
    }
    if (params.sourceNodeIDs && params.sourceNodeIDs.length > 0) {
        const { sourceRefs, predecessorNodes, openQuestions, warnings, } = await getSourceNodesByID(client, params.workspaceID, params.sourceNodeIDs);
        const multiSource = sourceRefs.length > 1;
        const singleSource = sourceRefs.length === 1;
        const selectionResult = selectPipelineNodeType({
            explicitNodeType: params.targetNodeType,
            goal: params.goal,
            targetName: params.targetName,
            sourceCount: sourceRefs.length,
            workspaceNodeTypes: workspaceNodeTypeInventory.nodeTypes,
            workspaceNodeTypeCounts: workspaceNodeTypeInventory.counts,
            repoPath: params.repoPath,
        });
        // Fall back to the caller's explicit type, then to "Stage", when no candidate was selected.
        const selectedNodeType = selectionResult.selectedCandidate?.nodeType ??
            params.targetNodeType ??
            "Stage";
        if (singleSource) {
            const sourceRef = sourceRefs[0];
            const predecessor = predecessorNodes[sourceRef.nodeID];
            const selectItems = buildSelectItemsFromSourceNode(sourceRef.nodeID, sourceRef.alias ?? sourceRef.nodeName, predecessor);
            // Ready only when the type can be created automatically, nothing is
            // unresolved, and the source produced at least one column to project.
            const ready = (selectionResult.selectedCandidate?.autoExecutable ?? true) &&
                openQuestions.length === 0 &&
                selectItems.length > 0;
            const planWarnings = [...warnings, ...selectionResult.warnings];
            const planOpenQuestions = [...openQuestions];
            appendManualConfigurationFollowUps(selectionResult.selectedCandidate, selectedNodeType, planWarnings, planOpenQuestions);
            const plan = {
                version: 1,
                intent: "goal",
                status: ready ? "ready" : "needs_clarification",
                workspaceID: params.workspaceID,
                platform: null,
                goal: params.goal ?? null,
                sql: null,
                nodes: [
                    {
                        planNodeID: "node-1",
                        name: buildDefaultNodeName(params.targetName, [
                            {
                                locationName: sourceRef.locationName,
                                nodeName: sourceRef.nodeName,
                                alias: sourceRef.alias,
                                nodeID: sourceRef.nodeID,
                            },
                        ], selectionResult.selectedCandidate?.family ?? null, selectionResult.selectedCandidate?.shortName ?? null),
                        nodeType: selectedNodeType,
                        nodeTypeFamily: selectionResult.selectedCandidate?.family ?? null,
                        predecessorNodeIDs: [sourceRef.nodeID],
                        predecessorPlanNodeIDs: [],
                        predecessorNodeNames: [sourceRef.nodeName],
                        description: params.description ?? null,
                        sql: null,
                        selectItems,
                        outputColumnNames: selectItems.flatMap((item) => item.outputName ? [item.outputName] : []),
                        configOverrides: params.configOverrides
                            ? deepClone(params.configOverrides)
                            : {},
                        sourceRefs,
                        joinCondition: `FROM {{ ref('${sourceRef.locationName}', '${sourceRef.nodeName}') }} "${sourceRef.alias ?? sourceRef.nodeName}"`,
                        location,
                        requiresFullSetNode: true,
                        ...(selectionResult.selectedCandidate?.templateDefaults
                            ? { templateDefaults: selectionResult.selectedCandidate.templateDefaults }
                            : {}),
                    },
                ],
                assumptions: [
                    `Planner ${selectionResult.selection.strategy} selected ${selectedNodeType} from repo/workspace candidates.`,
                    "Goal-driven planning uses a pass-through projection from the supplied source node IDs when the selected type is projection-capable.",
                    "Review the generated plan before execution if the goal implies filters, joins, or computed columns.",
                ],
                openQuestions: planOpenQuestions,
                warnings: planWarnings,
                supportedNodeTypes: selectionResult.selection.supportedNodeTypes.length > 0
                    ? selectionResult.selection.supportedNodeTypes
                    : [selectedNodeType],
                nodeTypeSelection: selectionResult.selection,
            };
            applyWorkspaceNodeTypeValidation(plan, workspaceNodeTypeInventory, params.targetNodeType);
            return plan;
        }
        // Reached for any resolved source count other than one; the join
        // question is only asked when there is more than one source.
        const multiSourceWarnings = [...warnings, ...selectionResult.warnings];
        const multiSourceOpenQuestions = [
            ...openQuestions,
            ...(multiSource
                ? [
                    `How should these sources be joined or filtered: ${sourceRefs
                        .map((ref) => ref.nodeName)
                        .join(", ")}?`,
                ]
                : []),
        ];
        // Same follow-ups as the single-source path, including the question
        // about config fields that have no defaults.
        appendManualConfigurationFollowUps(selectionResult.selectedCandidate, selectedNodeType, multiSourceWarnings, multiSourceOpenQuestions);
        const plan = {
            version: 1,
            intent: "goal",
            status: "needs_clarification",
            workspaceID: params.workspaceID,
            platform: null,
            goal: params.goal ?? null,
            sql: null,
            nodes: [
                {
                    planNodeID: "node-1",
                    name: params.targetName ??
                        `${buildDefaultNodePrefix(selectionResult.selectedCandidate?.family ?? null, selectionResult.selectedCandidate?.shortName ?? null)}_MULTI_SOURCE`,
                    nodeType: selectedNodeType,
                    nodeTypeFamily: selectionResult.selectedCandidate?.family ?? null,
                    predecessorNodeIDs: sourceRefs.flatMap((ref) => ref.nodeID ? [ref.nodeID] : []),
                    predecessorPlanNodeIDs: [],
                    predecessorNodeNames: sourceRefs.map((ref) => ref.nodeName),
                    description: params.description ?? null,
                    sql: null,
                    selectItems: [],
                    outputColumnNames: [],
                    configOverrides: params.configOverrides
                        ? deepClone(params.configOverrides)
                        : {},
                    sourceRefs,
                    joinCondition: null,
                    location,
                    requiresFullSetNode: true,
                    ...(selectionResult.selectedCandidate?.templateDefaults
                        ? { templateDefaults: selectionResult.selectedCandidate.templateDefaults }
                        : {}),
                },
            ],
            assumptions: [
                `Planner ${selectionResult.selection.strategy} selected ${selectedNodeType} from repo/workspace candidates.`,
                "Goal-based planning can scaffold a multisource request, but it does not infer joins automatically.",
            ],
            openQuestions: multiSourceOpenQuestions,
            warnings: multiSourceWarnings,
            supportedNodeTypes: selectionResult.selection.supportedNodeTypes.length > 0
                ? selectionResult.selection.supportedNodeTypes
                : [selectedNodeType],
            nodeTypeSelection: selectionResult.selection,
        };
        applyWorkspaceNodeTypeValidation(plan, workspaceNodeTypeInventory, params.targetNodeType);
        return plan;
    }
    // Neither SQL nor source node IDs: return questions instead of a node graph.
    const openQuestions = [];
    if (!params.goal || params.goal.trim().length === 0) {
        openQuestions.push("What pipeline should be built, and what should it produce?");
    }
    if (!params.sourceNodeIDs || params.sourceNodeIDs.length === 0) {
        openQuestions.push("Which upstream Coalesce node IDs should this pipeline build from?");
    }
    const selectionResult = selectPipelineNodeType({
        explicitNodeType: params.targetNodeType,
        goal: params.goal,
        targetName: params.targetName,
        sourceCount: 0,
        workspaceNodeTypes: workspaceNodeTypeInventory.nodeTypes,
        workspaceNodeTypeCounts: workspaceNodeTypeInventory.counts,
        repoPath: params.repoPath,
    });
    const plan = {
        version: 1,
        intent: "goal",
        status: "needs_clarification",
        workspaceID: params.workspaceID,
        platform: null,
        goal: params.goal ?? null,
        sql: null,
        nodes: [],
        assumptions: [
            selectionResult.selectedCandidate
                ? `Planner ${selectionResult.selection.strategy} would prefer ${selectionResult.selectedCandidate.nodeType} for this goal once sources are confirmed.`
                : "Planner could not rank a preferred node type because no repo-backed or observed workspace candidates were available.",
            "Goal-only planning currently returns clarification questions rather than inferred node graphs.",
        ],
        openQuestions,
        warnings: [...selectionResult.warnings],
        supportedNodeTypes: selectionResult.selection.supportedNodeTypes.length > 0
            ? selectionResult.selection.supportedNodeTypes
            : selectionResult.selectedCandidate
                ? [selectionResult.selectedCandidate.nodeType]
                : ["Stage"],
        nodeTypeSelection: selectionResult.selection,
    };
    applyWorkspaceNodeTypeValidation(plan, workspaceNodeTypeInventory, params.targetNodeType);
    return plan;
}
|
|
1622
|
+
/**
 * List the plain-object entries of `node.metadata.columns`.
 *
 * @param node - Node body whose metadata may carry a `columns` array.
 * @returns {object[]} The column entries that are plain objects; an empty
 *   array when metadata is not a plain object or `columns` is not an array.
 */
export function getNodeColumnArray(node) {
    if (!isPlainObject(node.metadata)) {
        return [];
    }
    const { columns } = node.metadata;
    return Array.isArray(columns) ? columns.filter(isPlainObject) : [];
}
|
|
1629
|
+
/**
 * Collect the distinct node IDs referenced by a column's sources.
 *
 * Walks `column.sources[*].columnReferences[*].nodeID`, ignoring sources and
 * references that are not plain objects and node IDs that are not strings.
 *
 * @param column - Column entry that may carry a `sources` array.
 * @returns {string[]} Unique node IDs in first-seen order; empty when
 *   `column.sources` is not an array.
 */
export function getColumnSourceNodeIDs(column) {
    const { sources } = column;
    if (!Array.isArray(sources)) {
        return [];
    }
    const references = sources
        .filter((source) => isPlainObject(source) && Array.isArray(source.columnReferences))
        .flatMap((source) => source.columnReferences);
    const nodeIDs = references
        .filter((reference) => isPlainObject(reference) && typeof reference.nodeID === "string")
        .map((reference) => reference.nodeID);
    return [...new Set(nodeIDs)];
}
|
|
1646
|
+
/**
 * Find the node column that a select item maps onto and return a deep copy.
 *
 * A column matches when its normalized name equals the normalized
 * `selectItem.sourceColumnName` and, if the select item names a source node,
 * that node ID is among the column's source node IDs.
 *
 * @param node - Node body whose columns are searched.
 * @param selectItem - Select item with `sourceColumnName` and optional `sourceNodeID`.
 * @returns A deep clone of the first matching column, or null when none matches.
 */
export function findMatchingBaseColumn(node, selectItem) {
    const wantedName = normalizeSqlIdentifier(selectItem.sourceColumnName ?? "");
    const { sourceNodeID } = selectItem;
    const match = getNodeColumnArray(node).find((candidate) => {
        if (typeof candidate.name !== "string") {
            return false;
        }
        if (normalizeSqlIdentifier(candidate.name) !== wantedName) {
            return false;
        }
        // Without a source node on the select item, the name match alone decides.
        return !sourceNodeID || getColumnSourceNodeIDs(candidate).includes(sourceNodeID);
    });
    return match ? deepClone(match) : null;
}
|
|
1663
|
+
/**
 * Return a copy of `node` whose source-mapping entries carry `newName`.
 *
 * An entry is renamed when its `name` equals the node's current non-blank
 * name. When the node has no usable name, the entry is renamed only if it is
 * the sole source-mapping entry. Non-object entries are passed through.
 *
 * @param node - Node body; returned unchanged (same reference) when it has no
 *   plain-object metadata with a `sourceMapping` array.
 * @param newName - Name to write onto the matching entries.
 * @returns The node, or a shallow copy with rewritten `metadata.sourceMapping`.
 */
export function renameSourceMappingEntries(node, newName) {
    const metadata = isPlainObject(node.metadata) ? node.metadata : undefined;
    if (!Array.isArray(metadata?.sourceMapping)) {
        return node;
    }
    const hasUsableName = typeof node.name === "string" && node.name.trim().length > 0;
    const renameSoleEntry = !hasUsableName && metadata.sourceMapping.length === 1;
    const renameEntry = (entry) => {
        if (!isPlainObject(entry)) {
            return entry;
        }
        const matches = hasUsableName ? entry.name === node.name : renameSoleEntry;
        return matches ? { ...entry, name: newName } : entry;
    };
    return {
        ...node,
        metadata: {
            ...metadata,
            sourceMapping: metadata.sourceMapping.map((entry) => renameEntry(entry)),
        },
    };
}
|
|
1691
|
+
/**
 * Build the single-entry `sourceMapping` array for a planned node.
 *
 * The entry starts from the node's first existing plain-object mapping entry
 * (if any) and overrides `aliases`, `dependencies`, `name`, the
 * `customSQL.customSQL` text (reset to ""), and `join.joinCondition`.
 * Existing `noLinkRefs` are kept when they are an array.
 *
 * @param currentNode - Node body whose existing source mapping seeds the entry.
 * @param nodePlan - Plan node supplying `sourceRefs`, `joinCondition` and `name`.
 * @returns {object[]} An array containing exactly one source-mapping entry.
 */
export function buildStageSourceMappingFromPlan(currentNode, nodePlan) {
    const { sourceRefs } = nodePlan;
    const priorMapping = isPlainObject(currentNode.metadata)
        ? currentNode.metadata.sourceMapping
        : undefined;
    const priorEntry = Array.isArray(priorMapping) ? priorMapping.find(isPlainObject) : undefined;
    const seed = isPlainObject(priorEntry) ? priorEntry : {};
    // With several sources every resolved ref gets an alias entry; with a
    // single source only an explicitly aliased ref does.
    const aliasEveryRef = sourceRefs.length > 1;
    const aliases = {};
    for (const { nodeID, alias, nodeName } of sourceRefs) {
        if (nodeID && (aliasEveryRef || alias)) {
            aliases[alias ?? nodeName] = nodeID;
        }
    }
    const dependencies = sourceRefs.map(({ locationName, nodeName }) => ({ locationName, nodeName }));
    const entry = {
        ...seed,
        aliases,
        customSQL: {
            ...(isPlainObject(seed.customSQL) ? seed.customSQL : {}),
            customSQL: "",
        },
        dependencies,
        join: {
            ...(isPlainObject(seed.join) ? seed.join : {}),
            joinCondition: nodePlan.joinCondition ?? "",
        },
        name: nodePlan.name,
        noLinkRefs: Array.isArray(seed.noLinkRefs) ? seed.noLinkRefs : [],
    };
    return [entry];
}
|
|
1733
|
+
/**
 * Produce the node body to save for a planned stage node.
 *
 * Works on a deep clone of `currentNode`: applies the planned name,
 * description (when not null) and location fields, layers config as
 * defaults < existing config < plan overrides, replaces the columns with one
 * renamed copy per planned select item, and rebuilds the source mapping.
 *
 * @param currentNode - Node body as created from its predecessor(s).
 * @param nodePlan - Plan node describing the desired result.
 * @returns The updated node body.
 * @throws {Error} When a planned select item has no matching base column.
 */
function buildStageNodeBodyFromPlan(currentNode, nodePlan) {
    const draft = deepClone(currentNode);
    draft.name = nodePlan.name;
    if (nodePlan.description !== null) {
        draft.description = nodePlan.description;
    }
    if (Object.keys(nodePlan.location).length > 0) {
        Object.assign(draft, nodePlan.location);
    }
    const existingConfig = isPlainObject(draft.config) ? draft.config : {};
    draft.config = {
        ...DEFAULT_STAGE_CONFIG,
        ...existingConfig,
        ...nodePlan.configOverrides,
    };
    const plannedColumns = nodePlan.selectItems.map((selectItem) => {
        const column = findMatchingBaseColumn(draft, selectItem);
        if (!column) {
            throw new Error(`Could not map planned output column ${selectItem.outputName ?? selectItem.expression} onto the created predecessor-based node body.`);
        }
        column.name = selectItem.outputName ?? column.name;
        // Each planned column gets fresh identifiers rather than reusing the
        // ones copied from the matched base column.
        if (isPlainObject(column.columnReference)) {
            column.columnReference = {
                ...column.columnReference,
                columnCounter: randomUUID(),
            };
        }
        if (typeof column.columnID === "string") {
            column.columnID = randomUUID();
        }
        return column;
    });
    draft.metadata = {
        ...(isPlainObject(draft.metadata) ? draft.metadata : {}),
        columns: plannedColumns,
        // Evaluated against the draft's previous metadata, before this assignment lands.
        sourceMapping: buildStageSourceMappingFromPlan(draft, nodePlan),
    };
    return renameSourceMappingEntries(draft, nodePlan.name);
}
|
|
1775
|
+
/**
 * Return the column names of a saved node, as reported by
 * `getColumnNamesFromNode`.
 */
function getSavedNodeColumnNames(node) {
    const columnNames = getColumnNamesFromNode(node);
    return columnNames;
}
|
|
1778
|
+
/**
 * Compare a saved node against its plan and report, check by check, whether
 * the save matches.
 *
 * Checks: node name, output column names (normalized, order-sensitive),
 * source-mapping dependency node names, join condition (whitespace
 * normalized) and whether every planned predecessor node ID is referenced by
 * at least one saved column.
 *
 * @param node - Saved node body.
 * @param nodePlan - Plan node the saved body should satisfy.
 * @returns {object} Per-check booleans together with the expected and actual values.
 */
function validateSavedStageNode(node, nodePlan) {
    const actualColumnNames = getSavedNodeColumnNames(node);
    const expectedColumnNames = nodePlan.outputColumnNames;
    const actualNormalized = actualColumnNames.map(normalizeSqlIdentifier);
    const expectedNormalized = expectedColumnNames.map(normalizeSqlIdentifier);
    // Only the first plain-object source-mapping entry is inspected.
    const mappingList = isPlainObject(node.metadata) ? node.metadata.sourceMapping : undefined;
    const mappingEntry = Array.isArray(mappingList) ? mappingList.find(isPlainObject) : undefined;
    const actualDependencyNodeNames = [];
    if (isPlainObject(mappingEntry) && Array.isArray(mappingEntry.dependencies)) {
        for (const dependency of mappingEntry.dependencies) {
            if (isPlainObject(dependency) && typeof dependency.nodeName === "string") {
                actualDependencyNodeNames.push(dependency.nodeName);
            }
        }
    }
    let actualJoin = "";
    if (isPlainObject(mappingEntry) &&
        isPlainObject(mappingEntry.join) &&
        typeof mappingEntry.join.joinCondition === "string") {
        actualJoin = normalizeWhitespace(mappingEntry.join.joinCondition);
    }
    const referencedIDs = new Set(getNodeColumnArray(node).flatMap((column) => getColumnSourceNodeIDs(column)));
    const columnsMatch = expectedNormalized.length === actualNormalized.length &&
        expectedNormalized.every((name, position) => actualNormalized[position] === name);
    const dependenciesMatch = actualDependencyNodeNames.length === nodePlan.sourceRefs.length &&
        nodePlan.sourceRefs.every((ref) => actualDependencyNodeNames.includes(ref.nodeName));
    // A null planned join is satisfied by an empty saved join; otherwise the
    // whitespace-normalized texts must be equal.
    const joinMatches = (nodePlan.joinCondition === null && actualJoin.length === 0) ||
        actualJoin === normalizeWhitespace(nodePlan.joinCondition ?? "");
    return {
        nodeNameSatisfied: node.name === nodePlan.name,
        expectedColumnCount: expectedColumnNames.length,
        actualColumnCount: actualColumnNames.length,
        outputColumnsSatisfied: columnsMatch,
        expectedColumnNames,
        actualColumnNames,
        sourceMappingDependenciesSatisfied: dependenciesMatch,
        expectedDependencyNodeNames: nodePlan.sourceRefs.map((ref) => ref.nodeName),
        actualDependencyNodeNames,
        joinConditionSatisfied: joinMatches,
        expectedJoinCondition: nodePlan.joinCondition,
        actualJoinCondition: actualJoin.length > 0 ? actualJoin : null,
        predecessorCoverageSatisfied: nodePlan.predecessorNodeIDs.every((nodeID) => referencedIDs.has(nodeID)),
        predecessorNodeIDs: nodePlan.predecessorNodeIDs,
        referencedPredecessorNodeIDs: Array.from(referencedIDs),
    };
}
|
|
1824
|
+
/**
 * Delete one node from a workspace through the API client.
 *
 * Both identifiers pass through `validatePathSegment` before being placed in
 * the request path. Resolves once the client's delete call settles.
 */
async function deleteWorkspaceNode(client, workspaceID, nodeID) {
    const workspaceSegment = validatePathSegment(workspaceID, "workspaceID");
    const nodeSegment = validatePathSegment(nodeID, "nodeID");
    const path = `/api/v1/workspaces/${workspaceSegment}/nodes/${nodeSegment}`;
    await client.delete(path);
}
|
|
1827
|
+
/**
 * Best-effort removal of nodes created during a pipeline run.
 *
 * Deletes each distinct node ID one at a time, in reverse of the order given.
 * A failed deletion does not stop the rollback; its node ID is collected
 * instead.
 *
 * @returns {Promise<string[]>} Node IDs whose deletion failed.
 */
async function rollbackCreatedPipelineNodes(client, workspaceID, nodeIDs) {
    const distinctNodeIDs = [...new Set(nodeIDs)];
    const failedNodeIDs = [];
    for (let index = distinctNodeIDs.length - 1; index >= 0; index -= 1) {
        const nodeID = distinctNodeIDs[index];
        try {
            await deleteWorkspaceNode(client, workspaceID, nodeID);
        }
        catch {
            // Deliberately tolerant: record the failure and keep rolling back.
            failedNodeIDs.push(nodeID);
        }
    }
    return failedNodeIDs;
}
|