goldenpipe 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +157 -0
- package/dist/cli.cjs +1055 -0
- package/dist/cli.cjs.map +1 -0
- package/dist/cli.d.cts +1 -0
- package/dist/cli.d.ts +1 -0
- package/dist/cli.js +1053 -0
- package/dist/cli.js.map +1 -0
- package/dist/core/index.cjs +898 -0
- package/dist/core/index.cjs.map +1 -0
- package/dist/core/index.d.cts +439 -0
- package/dist/core/index.d.ts +439 -0
- package/dist/core/index.js +861 -0
- package/dist/core/index.js.map +1 -0
- package/dist/index.cjs +898 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +2 -0
- package/dist/index.d.ts +2 -0
- package/dist/index.js +861 -0
- package/dist/index.js.map +1 -0
- package/dist/node/index.cjs +1081 -0
- package/dist/node/index.cjs.map +1 -0
- package/dist/node/index.d.cts +43 -0
- package/dist/node/index.d.ts +43 -0
- package/dist/node/index.js +1039 -0
- package/dist/node/index.js.map +1 -0
- package/package.json +90 -0
package/dist/cli.js
ADDED
|
@@ -0,0 +1,1053 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
import { writeFileSync, readFileSync } from 'fs';
|
|
3
|
+
import { join } from 'path';
|
|
4
|
+
import { Command } from 'commander';
|
|
5
|
+
import { TabularData, scanData, severityLabel } from 'goldencheck/core';
|
|
6
|
+
import { TransformEngine } from 'goldenflow/core';
|
|
7
|
+
import { makeConfig, dedupe, makeMatchkeyConfig, makeMatchkeyField, makeBlockingConfig } from 'goldenmatch/core';
|
|
8
|
+
|
|
9
|
+
// src/core/models.ts
|
|
10
|
+
/**
 * Outcome of a single pipeline stage.
 * Frozen so the shared lookup table cannot be modified at runtime.
 */
var StageStatus = Object.freeze({
  SUCCESS: "success",
  SKIPPED: "skipped",
  FAILED: "failed"
});
/**
 * Overall outcome of a pipeline run.
 * Frozen so the shared lookup table cannot be modified at runtime.
 */
var PipeStatus = Object.freeze({
  SUCCESS: "success",
  PARTIAL: "partial",
  FAILED: "failed"
});
|
|
20
|
+
/**
 * Build a pipeline context, substituting an empty default for every field
 * the caller left out (or passed as null/undefined).
 *
 * @param {object} [input] - Partial context; may be omitted entirely.
 * @returns {object} Context with `df`, `artifacts`, `metadata`, `timing`,
 *   `reasoning` and `stageConfig` always present. `df` defaults to `null`,
 *   every other field to a fresh empty object.
 */
function makePipeContext(input) {
  const seed = input ?? {};
  const orEmptyObject = (value) => value ?? {};
  return {
    df: seed.df ?? null,
    artifacts: orEmptyObject(seed.artifacts),
    metadata: orEmptyObject(seed.metadata),
    timing: orEmptyObject(seed.timing),
    reasoning: orEmptyObject(seed.reasoning),
    stageConfig: orEmptyObject(seed.stageConfig)
  };
}
|
|
30
|
+
/**
 * Normalise a stage reference into a full stage spec.
 *
 * @param {string|object} input - Either the name of the stage to use, or a
 *   partial spec object with at least `use`.
 * @returns {object} Spec with `use`, `needs` (default `[]`), `onError`
 *   (default "continue") and `config` (default `{}`); `name` and `skipIf`
 *   are only present when the input defined them.
 */
function makeStageSpec(input) {
  // A bare string is shorthand for "use this stage with all defaults".
  const raw = typeof input === "string" ? { use: input } : input;
  const spec = {};
  if (raw.name !== void 0) {
    spec.name = raw.name;
  }
  spec.use = raw.use;
  spec.needs = raw.needs ?? [];
  if (raw.skipIf !== void 0) {
    spec.skipIf = raw.skipIf;
  }
  spec.onError = raw.onError ?? "continue";
  spec.config = raw.config ?? {};
  return spec;
}
|
|
43
|
+
/**
 * Build a pipeline config from caller input.
 *
 * @param {object} input - Must carry `pipeline` and `stages`; `source`,
 *   `output` and `decisions` are optional.
 * @returns {object} Config where `source`/`output` appear only when they
 *   were defined on the input and `decisions` defaults to an empty array.
 */
function makePipelineConfig(input) {
  const cfg = { pipeline: input.pipeline };
  if (input.source !== void 0) {
    cfg.source = input.source;
  }
  if (input.output !== void 0) {
    cfg.output = input.output;
  }
  cfg.stages = input.stages;
  cfg.decisions = input.decisions ?? [];
  return cfg;
}
|
|
52
|
+
|
|
53
|
+
// src/core/columnContext.ts
|
|
54
|
+
/**
 * Semantic column types assigned during column classification.
 * Frozen so the shared lookup table cannot be modified at runtime.
 */
var ColumnType = Object.freeze({
  NAME: "name",
  EMAIL: "email",
  PHONE: "phone",
  DATE: "date",
  GEO: "geo",
  ADDRESS: "address",
  ZIP: "zip",
  IDENTIFIER: "identifier",
  NUMERIC: "numeric",
  STRING: "string"
});
/**
 * Relative cardinality band of a column. UNSET is the initial value;
 * SKIP marks columns excluded from banding.
 * Frozen so the shared lookup table cannot be modified at runtime.
 */
var CardinalityBand = Object.freeze({
  UNSET: "",
  LOW: "low",
  MID: "mid",
  HIGH: "high",
  SKIP: "skip"
});
/** Floor applied whenever a column's confidence is lowered. */
var MIN_CONFIDENCE = 0.3;
/** Types whose detection alone counts as a signal that the column identifies a record. */
var IDENTIFIER_TYPES = /* @__PURE__ */ new Set([
  ColumnType.NAME,
  ColumnType.EMAIL,
  ColumnType.PHONE
]);
/** Types that are never flagged as identifier columns. */
var NEVER_IDENTIFIER_TYPES = /* @__PURE__ */ new Set([
  ColumnType.DATE,
  ColumnType.NUMERIC,
  ColumnType.IDENTIFIER
]);
|
|
83
|
+
/**
 * Build and validate a column context.
 *
 * @param {object} input - Must carry a non-empty `name`; every other field
 *   is optional and falls back to a default (type STRING, nullRate 0,
 *   cardinality 0, isIdentifier false, empty lists, confidence 0.5,
 *   band UNSET).
 * @returns {object} The validated context.
 * @throws {Error} When `name` is empty, `nullRate` or `confidence` is
 *   outside [0, 1], or `cardinality` is negative or NaN.
 */
function makeColumnContext(input) {
  const ctx = {
    name: input.name,
    inferredType: input.inferredType ?? ColumnType.STRING,
    nullRate: input.nullRate ?? 0,
    cardinality: input.cardinality ?? 0,
    isIdentifier: input.isIdentifier ?? false,
    transformsApplied: input.transformsApplied ?? [],
    findings: input.findings ?? [],
    confidence: input.confidence ?? 0.5,
    cardinalityBand: input.cardinalityBand ?? CardinalityBand.UNSET
  };
  if (!ctx.name) {
    throw new Error("ColumnContext.name must be non-empty");
  }
  // The range checks are written in negated form so that NaN is rejected too.
  if (!(ctx.nullRate >= 0 && ctx.nullRate <= 1)) {
    throw new Error(`nullRate must be in [0, 1], got ${ctx.nullRate}`);
  }
  if (!(ctx.cardinality >= 0)) {
    throw new Error(`cardinality must be >= 0, got ${ctx.cardinality}`);
  }
  if (!(ctx.confidence >= 0 && ctx.confidence <= 1)) {
    throw new Error(`confidence must be in [0, 1], got ${ctx.confidence}`);
  }
  return ctx;
}
|
|
109
|
+
// Case-insensitive column-name heuristics, one pattern per semantic type.
// None of them uses the `g`/`y` flag, so `.test()` keeps no state between calls.
const NAME_PATTERNS = /(^name$|first.?name|last.?name|full.?name|fname|lname|surname|given.?name|middle)/i;
const EMAIL_PATTERNS = /(email|e.?mail|email.?addr)/i;
const PHONE_PATTERNS = /(phone|tel|mobile|fax|cell)/i;
const ZIP_PATTERNS = /(zip|postal|postcode|zip.?code)/i;
const ADDRESS_PATTERNS = /(address|street|addr|line.?1|line.?2)/i;
const GEO_PATTERNS = /(city|^state$|state.?cd|^country$|province|region|county)/i;
const DATE_PATTERNS = /(date|_dt$|_date$|registr|created|updated|birth.?d|dob)/i;
const ID_PATTERNS = /(^id$|^key$|^code$|^sku$|_id$|_key$)/i;
|
|
117
|
+
/**
 * Guess a column's semantic type from its name alone.
 *
 * The patterns are tried in a fixed priority order and the first match wins,
 * so a name matching several patterns resolves to the earliest entry.
 *
 * @param {string} colName - Column name to classify.
 * @returns {string|null} A ColumnType value, or null when no pattern matches.
 */
function classifyByName(colName) {
  const rulesByPriority = [
    [DATE_PATTERNS, ColumnType.DATE],
    [EMAIL_PATTERNS, ColumnType.EMAIL],
    [ZIP_PATTERNS, ColumnType.ZIP],
    [GEO_PATTERNS, ColumnType.GEO],
    [ADDRESS_PATTERNS, ColumnType.ADDRESS],
    [PHONE_PATTERNS, ColumnType.PHONE],
    [NAME_PATTERNS, ColumnType.NAME],
    [ID_PATTERNS, ColumnType.IDENTIFIER]
  ];
  for (const [pattern, type] of rulesByPriority) {
    if (pattern.test(colName)) {
      return type;
    }
  }
  return null;
}
|
|
128
|
+
/**
 * Map a raw dtype label onto a coarse ColumnType.
 *
 * @param {string} rawType - Dtype label; matched case-insensitively by substring.
 * @returns {string} NUMERIC for labels containing "int" or "float", DATE for
 *   labels containing "date" or "time", STRING for everything else
 *   (boolean labels included).
 */
function normalizeDtype(rawType) {
  const label = rawType.toLowerCase().trim();
  const mentions = (...needles) => needles.some((needle) => label.includes(needle));
  if (mentions("int", "float")) {
    return ColumnType.NUMERIC;
  }
  if (mentions("date", "time")) {
    return ColumnType.DATE;
  }
  // Booleans have no dedicated type and share the STRING fallback.
  return ColumnType.STRING;
}
|
|
135
|
+
/**
 * Assign a cardinality band to every context, in place.
 *
 * NUMERIC and DATE columns are excluded from the statistics and marked SKIP.
 * The remaining columns are split at the first and third quartile of their
 * cardinalities: at or below Q1 is LOW, at or above Q3 is HIGH, otherwise MID.
 * When fewer than three columns remain, nothing is modified at all.
 *
 * @param {object[]} contexts - Column contexts; `cardinalityBand` is mutated.
 * @returns {void}
 */
function computeCardinalityBands(contexts) {
  const isExcluded = (c) => c.inferredType === ColumnType.NUMERIC || c.inferredType === ColumnType.DATE;
  const sortedCounts = contexts.filter((c) => !isExcluded(c)).map((c) => c.cardinality);
  const total = sortedCounts.length;
  if (total < 3) {
    return;
  }
  sortedCounts.sort((a, b) => a - b);
  const lowerCut = sortedCounts[Math.floor(total / 4)];
  const upperCut = sortedCounts[Math.floor(3 * total / 4)];
  for (const ctx of contexts) {
    let band;
    if (isExcluded(ctx)) {
      band = CardinalityBand.SKIP;
    } else if (ctx.cardinality <= lowerCut) {
      band = CardinalityBand.LOW;
    } else if (ctx.cardinality >= upperCut) {
      band = CardinalityBand.HIGH;
    } else {
      band = CardinalityBand.MID;
    }
    ctx.cardinalityBand = band;
  }
}
|
|
160
|
+
/**
 * Refine `isIdentifier` and `confidence` of each context, in place, by
 * combining the type signal with the cardinality band.
 *
 * - Types in NEVER_IDENTIFIER_TYPES are forced to non-identifier and left
 *   otherwise untouched.
 * - Types in IDENTIFIER_TYPES: MID confirms (+0.15), HIGH confirms (+0.05),
 *   LOW rejects (-0.2, floored at MIN_CONFIDENCE).
 * - Other types: MID promotes plain STRING columns to identifier with
 *   confidence 0.5; LOW and HIGH reject.
 * - Bands UNSET and SKIP change nothing.
 * - Finally, identifiers with a null rate above 0.3 lose 0.1 confidence
 *   (floored at MIN_CONFIDENCE).
 *
 * @param {object[]} contexts - Column contexts to mutate.
 * @returns {void}
 */
function applyCardinalitySignal(contexts) {
  const raise = (ctx, delta) => {
    ctx.confidence = Math.min(ctx.confidence + delta, 1);
  };
  const lower = (ctx, delta) => {
    ctx.confidence = Math.max(ctx.confidence - delta, MIN_CONFIDENCE);
  };
  for (const ctx of contexts) {
    if (NEVER_IDENTIFIER_TYPES.has(ctx.inferredType)) {
      ctx.isIdentifier = false;
      continue;
    }
    if (IDENTIFIER_TYPES.has(ctx.inferredType)) {
      switch (ctx.cardinalityBand) {
        case CardinalityBand.MID:
          ctx.isIdentifier = true;
          raise(ctx, 0.15);
          break;
        case CardinalityBand.LOW:
          ctx.isIdentifier = false;
          lower(ctx, 0.2);
          break;
        case CardinalityBand.HIGH:
          ctx.isIdentifier = true;
          raise(ctx, 0.05);
          break;
        default:
          break;
      }
    } else {
      switch (ctx.cardinalityBand) {
        case CardinalityBand.MID:
          if (ctx.inferredType === ColumnType.STRING) {
            ctx.isIdentifier = true;
            ctx.confidence = 0.5;
          }
          break;
        case CardinalityBand.LOW:
        case CardinalityBand.HIGH:
          ctx.isIdentifier = false;
          break;
        default:
          break;
      }
    }
    if (ctx.nullRate > 0.3 && ctx.isIdentifier) {
      lower(ctx, 0.1);
    }
  }
}
|
|
192
|
+
/**
 * Build column contexts from scan output.
 *
 * Each profile is classified by column name first, falling back to its raw
 * dtype. Cardinality bands and the identifier signal are then applied across
 * all contexts, and every finding that names a known column is attached to
 * that column as "<check>: <message>" with the message cut to 80 characters.
 *
 * @param {Iterable<object>} findings - Findings with `column`, `check`, `message`.
 * @param {object[]} columnProfiles - Profiles with `name`, `inferredType`,
 *   `nullPct`, `uniqueCount`.
 * @returns {object[]} One context per distinct profile name; empty when no
 *   profiles are given.
 */
function buildContextsFromCheck(findings, columnProfiles) {
  if (!columnProfiles || columnProfiles.length === 0) {
    return [];
  }
  const byName = /* @__PURE__ */ new Map();
  for (const profile of columnProfiles) {
    const semanticType = classifyByName(profile.name) || normalizeDtype(profile.inferredType ?? "string");
    byName.set(
      profile.name,
      makeColumnContext({
        name: profile.name,
        inferredType: semanticType,
        nullRate: profile.nullPct ?? 0,
        cardinality: profile.uniqueCount ?? 0,
        isIdentifier: IDENTIFIER_TYPES.has(semanticType),
        // A recognised type starts with higher confidence than the STRING fallback.
        confidence: semanticType !== ColumnType.STRING ? 0.8 : 0.4
      })
    );
  }
  const contextList = Array.from(byName.values());
  computeCardinalityBands(contextList);
  applyCardinalitySignal(contextList);
  for (const finding of findings) {
    const column = finding.column;
    if (!column || !byName.has(column)) {
      continue;
    }
    const target = byName.get(column);
    const label = finding.check ?? "";
    const summary = String(finding.message ?? "").slice(0, 80);
    target.findings.push(`${label}: ${summary}`);
  }
  return contextList;
}
|
|
225
|
+
/**
 * Fold transform records into the matching column contexts, in place.
 *
 * A transform is appended to `transformsApplied` when it affected at least
 * one row. Any transform whose name contains "date" (case-insensitive)
 * reclassifies the column as DATE, clears `isIdentifier` and sets confidence
 * to 0.95, whether or not rows were affected.
 *
 * @param {object[]} contexts - Column contexts to mutate.
 * @param {Iterable<object>|null|undefined} records - Records with `column`,
 *   `transform`, `affectedRows`; a falsy value makes this a no-op.
 * @returns {void}
 */
function enrichContextsFromFlow(contexts, records) {
  if (!records) {
    return;
  }
  const byName = /* @__PURE__ */ new Map();
  for (const c of contexts) {
    byName.set(c.name, c);
  }
  for (const { column, transform, affectedRows } of records) {
    if (!column || !byName.has(column)) {
      continue;
    }
    // Both updates below need a transform name; skip records without one.
    if (!transform) {
      continue;
    }
    const ctx = byName.get(column);
    if ((affectedRows ?? 0) > 0) {
      ctx.transformsApplied.push(transform);
    }
    if (transform.toLowerCase().includes("date")) {
      ctx.inferredType = ColumnType.DATE;
      ctx.isIdentifier = false;
      ctx.confidence = 0.95;
    }
  }
}
|
|
244
|
+
/**
 * Count the distinct values of one column, ignoring null, undefined and "".
 *
 * @param {Iterable<object>} rows - Row objects.
 * @param {string} col - Column key to read from each row.
 * @returns {number} Number of distinct non-blank values.
 */
function distinctNonNull(rows, col) {
  const values = /* @__PURE__ */ new Set();
  for (const row of rows) {
    values.add(row[col]);
  }
  // Drop the blank markers afterwards instead of testing each cell.
  values.delete(null);
  values.delete(void 0);
  values.delete("");
  return values.size;
}
|
|
253
|
+
/**
 * Fraction of rows whose value for `col` is null, undefined or "".
 *
 * @param {object[]} rows - Row objects.
 * @param {string} col - Column key to read from each row.
 * @returns {number} Value in [0, 1]; an empty row list counts as fully null (1).
 */
function nullRateOf(rows, col) {
  const total = rows.length;
  if (total === 0) {
    return 1;
  }
  const isBlank = (cell) => cell === null || cell === void 0 || cell === "";
  let blanks = 0;
  for (const row of rows) {
    if (isBlank(row[col])) {
      blanks++;
    }
  }
  return blanks / total;
}
|
|
262
|
+
|
|
263
|
+
// src/core/engine/registry.ts
|
|
264
|
+
/** Name-indexed collection of pipeline stages. */
var StageRegistry = class {
  stages = /* @__PURE__ */ new Map();
  /** Store a stage under `stage.info.name`, replacing any stage already registered under that name. */
  register(stage2) {
    const { name } = stage2.info;
    this.stages.set(name, stage2);
  }
  /** Look up a stage by name; throws an Error when the name is unknown. */
  get(name) {
    if (!this.stages.has(name)) {
      throw new Error(`Stage '${name}' not found in registry`);
    }
    return this.stages.get(name);
  }
  /** Whether a stage is registered under this name. */
  has(name) {
    return this.stages.has(name);
  }
  /** Plain object mapping every registered name to that stage's `info`. */
  listAll() {
    const infoByName = {};
    this.stages.forEach((registered, name) => {
      infoByName[name] = registered.info;
    });
    return infoByName;
  }
};
|
|
291
|
+
|
|
292
|
+
// src/core/engine/resolver.ts
|
|
293
|
+
/**
 * Raised when a stage consumes an artifact that no earlier stage produces.
 */
var WiringError = class extends Error {
  /**
   * @param {string} message - Human-readable description of the wiring problem.
   * @param {object} [options] - Standard Error options (e.g. `{ cause }`),
   *   forwarded unchanged to the Error constructor.
   */
  constructor(message, options) {
    super(message, options);
    this.name = "WiringError";
  }
};
|
|
299
|
+
var Resolver = {
  /**
   * Turn a pipeline config into an ordered execution plan.
   *
   * When the registry has a `load` stage it is placed first and its products
   * seed the set of available artifacts; otherwise `df` is assumed to be
   * available. Each configured stage is then checked in order: everything it
   * consumes must already have been produced by an earlier stage.
   *
   * @param {object} config - Pipeline config; `config.stages` is iterated.
   * @param {object} registry - Stage registry providing `has` and `get`.
   * @returns {{stages: object[]}} Plan entries of `{ name, stage, spec, config }`.
   * @throws {WiringError} When a stage consumes an artifact nothing before it produces.
   * @throws {Error} Whatever `registry.get` throws for an unknown stage.
   */
  resolve(config, registry) {
    const ordered = [];
    const produced = /* @__PURE__ */ new Set();
    const markProduced = (stageObj) => {
      for (const artifact of stageObj.info.produces) {
        produced.add(artifact);
      }
    };
    if (!registry.has("load")) {
      produced.add("df");
    } else {
      const loader = registry.get("load");
      ordered.push({
        name: "load",
        stage: loader,
        spec: makeStageSpec("load"),
        config: {}
      });
      markProduced(loader);
    }
    for (const rawSpec of config.stages) {
      const spec = makeStageSpec(rawSpec);
      const stageObj = registry.get(spec.use);
      const name = spec.name ?? stageObj.info.name;
      for (const dep of stageObj.info.consumes) {
        if (produced.has(dep)) {
          continue;
        }
        throw new WiringError(
          `Stage '${name}' consumes '${dep}' but no prior stage produces it. Available: ${[...produced].sort().join(", ")}`
        );
      }
      ordered.push({ name, stage: stageObj, spec, config: spec.config });
      markProduced(stageObj);
    }
    return { stages: ordered };
  }
};
|
|
337
|
+
|
|
338
|
+
// src/core/engine/router.ts
|
|
339
|
+
var Router = {
  /**
   * Apply a stage's decision (skip / abort / insert) to the stages still
   * queued and return the new queue.
   *
   * `decision.reason` is recorded in `ctx.reasoning._router`; for an abort it
   * is prefixed with "ABORT: ". A decision may omit `skip` and/or `insert`;
   * both are treated as empty lists in that case.
   *
   * @param {object} decision - `{ abort?, reason?, skip?, insert? }`.
   * @param {object[]} remaining - Planned stages not yet executed.
   * @param {object} ctx - Pipeline context; `ctx.reasoning` is mutated.
   * @param {object} registry - Registry used to resolve inserted stage names.
   * @returns {object[]} `[]` on abort; otherwise `remaining` minus skipped
   *   stages, with inserted stages placed at the front.
   * @throws {Error} Whatever `registry.get` throws for an unknown inserted stage.
   */
  apply(decision, remaining, ctx, registry) {
    if (decision.abort) {
      // Never record the literal text "undefined" when no reason was supplied.
      ctx.reasoning["_router"] = `ABORT: ${decision.reason || "no reason given"}`;
      return [];
    }
    if (decision.reason) {
      ctx.reasoning["_router"] = decision.reason;
    }
    const skip = decision.skip ?? [];
    const insert = decision.insert ?? [];
    let next = remaining;
    if (skip.length > 0) {
      const skipSet = new Set(skip);
      next = next.filter((s) => !skipSet.has(s.name));
    }
    if (insert.length > 0) {
      const inserted = [];
      for (const name of insert) {
        const stageObj = registry.get(name);
        inserted.push({
          name,
          stage: stageObj,
          spec: makeStageSpec(name),
          config: {}
        });
      }
      next = [...inserted, ...next];
    }
    return next;
  }
};
|
|
374
|
+
|
|
375
|
+
// src/core/engine/runner.ts
|
|
376
|
+
/**
 * Container-aware emptiness test, used to decide whether an artifact counts
 * as missing.
 *
 * Falsy: null, undefined, false, 0, "", empty arrays, empty Map/Set, empty
 * typed arrays / DataViews, and plain objects without enumerable own keys.
 * Every other value is truthy. In particular, non-plain objects (Date, class
 * instances, ...) are truthy even when they expose no enumerable own keys,
 * because their state may live in private fields or on the prototype.
 *
 * @param {*} value - Value to test.
 * @returns {boolean} True when the value is considered empty/missing.
 */
function isFalsy(value) {
  if (value === null || value === void 0) return true;
  if (value === false || value === 0 || value === "") return true;
  if (Array.isArray(value)) return value.length === 0;
  if (value instanceof Map || value instanceof Set) return value.size === 0;
  if (ArrayBuffer.isView(value)) return value.byteLength === 0;
  if (typeof value === "object") {
    const proto = Object.getPrototypeOf(value);
    const isPlain = proto === Object.prototype || proto === null;
    // Only plain dictionaries can be judged by their key count.
    return isPlain && Object.keys(value).length === 0;
  }
  return false;
}
|
|
384
|
+
/** Executes the stages of an execution plan one after another. */
var Runner = class {
  registry;
  /** @param {object} registry - Stage registry handed to the Router for inserted stages. */
  constructor(registry) {
    this.registry = registry;
  }
  /**
   * Run every planned stage against the context.
   *
   * For each stage: skip it when its `skipIf` artifact is missing/falsy;
   * otherwise set `ctx.stageConfig`, call `validate` then `run`, record the
   * elapsed seconds in `ctx.timing`, and let the Router rewrite the remaining
   * queue when the result carries a decision. A thrown error is recorded as a
   * FAILED result; the loop stops only when the stage's `onError` is "abort".
   *
   * @param {{stages: object[]}} plan - Plan to execute (not mutated).
   * @param {object} ctx - Pipeline context, mutated by the run.
   * @returns {Promise<object>} Results keyed by stage name.
   */
  async run(plan, ctx) {
    const outcomes = {};
    const secondsSince = (t0) => (performance.now() - t0) / 1e3;
    let queue = [...plan.stages];
    while (queue.length > 0) {
      const current = queue.shift();
      const { name, spec, stage } = current;
      if (spec.skipIf && isFalsy(ctx.artifacts[spec.skipIf])) {
        outcomes[name] = { status: StageStatus.SKIPPED };
        ctx.reasoning[name] = `Skipped: artifact '${spec.skipIf}' is missing/falsy`;
        continue;
      }
      const startedAt = performance.now();
      try {
        ctx.stageConfig = current.config;
        await stage.validate(ctx);
        const result = await stage.run(ctx);
        ctx.timing[name] = secondsSince(startedAt);
        outcomes[name] = result;
        if (result.decision != null) {
          queue = Router.apply(result.decision, queue, ctx, this.registry);
        }
      } catch (e) {
        ctx.timing[name] = secondsSince(startedAt);
        const message = e instanceof Error ? e.message : String(e);
        outcomes[name] = { status: StageStatus.FAILED, error: message };
        ctx.reasoning[name] = `Failed: ${message}`;
        if (spec.onError === "abort") {
          break;
        }
      }
    }
    return outcomes;
  }
};
|
|
426
|
+
|
|
427
|
+
// src/core/engine/reporter.ts
|
|
428
|
+
var Reporter = {
  /**
   * Summarise a finished run.
   *
   * Overall status considers only stages that were not skipped: FAILED when
   * all of them failed, SUCCESS when all succeeded (or none ran), PARTIAL
   * otherwise. `errors` lists "<stage>: <error>" for failed stages that carry
   * a non-empty error; `skipped` lists the names of skipped stages.
   *
   * @param {object} ctx - Pipeline context after the run.
   * @param {object} stages - Per-stage results keyed by stage name.
   * @returns {object} Report holding status, source, inputRows, the stage
   *   results, and shallow copies of artifacts, reasoning and timing.
   */
  build(ctx, stages) {
    const errors = [];
    const skipped = [];
    const ranStatuses = [];
    for (const [name, result] of Object.entries(stages)) {
      if (result.status === StageStatus.SKIPPED) {
        skipped.push(name);
        continue;
      }
      ranStatuses.push(result.status);
      if (result.status === StageStatus.FAILED && result.error) {
        errors.push(`${name}: ${result.error}`);
      }
    }
    const allAre = (expected) => ranStatuses.every((s) => s === expected);
    let status;
    if (ranStatuses.length > 0 && allAre(StageStatus.FAILED)) {
      status = PipeStatus.FAILED;
    } else if (allAre(StageStatus.SUCCESS)) {
      // Also covers the case where nothing ran: every() is true on an empty list.
      status = PipeStatus.SUCCESS;
    } else {
      status = PipeStatus.PARTIAL;
    }
    const meta = ctx.metadata;
    const source = typeof meta["source"] === "string" ? meta["source"] : "";
    const inputRows = typeof meta["input_rows"] === "number" ? meta["input_rows"] : 0;
    return {
      status,
      source,
      inputRows,
      stages,
      artifacts: { ...ctx.artifacts },
      skipped,
      errors,
      reasoning: { ...ctx.reasoning },
      timing: { ...ctx.timing }
    };
  }
};
|
|
457
|
+
|
|
458
|
+
// src/core/adapters/load.ts
|
|
459
|
+
/**
 * Built-in `load` stage: declares that `df` is produced and otherwise does
 * nothing; validation always passes and running always reports success.
 */
var LoadStage = {
  info: { name: "load", produces: ["df"], consumes: [] },
  // Nothing to validate.
  validate(_ctx) {},
  run: async (_ctx) => ({ status: StageStatus.SUCCESS }),
  rollback: null
};
|
|
468
|
+
/**
 * Reduce a scan finding to the four fields kept in the pipeline artifacts.
 *
 * @param {object} f - Finding with `severity`, `check`, `column`, `message`.
 * @returns {{severity: string, check: *, column: *, message: *}} Copy whose
 *   severity is the lower-cased label returned by `severityLabel`.
 */
function normalizeFinding(f) {
  const severity = severityLabel(f.severity).toLowerCase();
  const { check, column, message } = f;
  return { severity, check, column, message };
}
|
|
476
|
+
/**
 * Project a column profile onto the four fields used to build column contexts.
 *
 * @param {object} cp - Column profile.
 * @returns {{name: *, inferredType: *, nullPct: *, uniqueCount: *}} New
 *   object holding only those fields (undefined where the profile lacks them).
 */
function toColumnProfileLike(cp) {
  const { name, inferredType, nullPct, uniqueCount } = cp;
  return { name, inferredType, nullPct, uniqueCount };
}
|
|
484
|
+
/**
 * Stage `goldencheck.scan`: scans the current rows and stores the normalised
 * findings, the profile and the derived column contexts as artifacts.
 */
var ScanStage = {
  info: { name: "goldencheck.scan", produces: ["findings", "profile"], consumes: ["df"] },
  /** Requires rows in the context; throws when `ctx.df` is null. */
  validate(ctx) {
    if (ctx.df === null) {
      throw new Error("ScanStage: no df in context");
    }
  },
  /**
   * Run the scan. A non-empty `ctx.stageConfig` is forwarded to `scanData`
   * as options; an empty one means "use the defaults".
   */
  async run(ctx) {
    const table = new TabularData(ctx.df ?? []);
    const overrides = ctx.stageConfig;
    const hasOverrides = overrides && Object.keys(overrides).length > 0;
    const scan = scanData(table, hasOverrides ? overrides : void 0);
    const findings = scan.findings.map(normalizeFinding);
    const columnProfiles = scan.profile.columns;
    ctx.artifacts["findings"] = findings;
    ctx.artifacts["profile"] = scan.profile;
    // Column contexts are best-effort: any failure while deriving them
    // degrades to an empty list instead of failing the stage.
    let columnContexts;
    try {
      const profileLikes = columnProfiles.map(toColumnProfileLike);
      const findingLikes = findings.map(({ column, check, message }) => ({ column, check, message }));
      columnContexts = buildContextsFromCheck(findingLikes, profileLikes);
    } catch {
      columnContexts = [];
    }
    ctx.artifacts["column_contexts"] = columnContexts;
    return { status: StageStatus.SUCCESS };
  },
  rollback: null
};
|
|
516
|
+
/**
 * Stage `goldenflow.transform`: runs the transform engine over the current
 * rows, replaces `ctx.df` with a copy of the transformed rows and stores the
 * manifest as an artifact.
 */
var TransformStage = {
  info: { name: "goldenflow.transform", produces: ["df", "manifest"], consumes: ["df"] },
  /** Requires rows in the context; throws when `ctx.df` is null. */
  validate(ctx) {
    if (ctx.df === null) {
      throw new Error("TransformStage: no df in context");
    }
  },
  /**
   * Run the transform. A non-empty `ctx.stageConfig` is forwarded to the
   * engine constructor; an empty one means "use the defaults".
   */
  async run(ctx) {
    const inputRows = ctx.df ?? [];
    const overrides = ctx.stageConfig;
    const engineConfig = overrides && Object.keys(overrides).length > 0 ? overrides : void 0;
    const outcome = new TransformEngine(engineConfig).transformDf(inputRows);
    ctx.df = [...outcome.rows];
    ctx.artifacts["manifest"] = outcome.manifest;
    const contexts = ctx.artifacts["column_contexts"];
    if (Array.isArray(contexts)) {
      try {
        const records = outcome.manifest.records.map(({ column, transform, affectedRows }) => ({
          column,
          transform,
          affectedRows
        }));
        enrichContextsFromFlow(contexts, records);
      } catch {
        // Best-effort enrichment: a failure here is deliberately ignored so
        // the transform result itself is still reported as a success.
      }
    }
    return { status: StageStatus.SUCCESS };
  },
  rollback: null
};
|
|
547
|
+
/**
 * Copy rows with every cell converted to a string.
 *
 * @param {object[]} rows - Row objects; not mutated.
 * @returns {object[]} New rows where null/undefined cells become "" and all
 *   other cells become `String(value)`.
 */
function castRowsToString(rows) {
  const toText = (cell) => (cell === null || cell === void 0 ? "" : String(cell));
  return rows.map((row) => {
    const textRow = {};
    Object.keys(row).forEach((key) => {
      textRow[key] = toText(row[key]);
    });
    return textRow;
  });
}
|
|
556
|
+
/**
 * Stage `goldenmatch.dedupe`: stringifies the rows, runs deduplication and
 * stores clusters, golden records and related outputs as artifacts.
 */
var DedupeStage = {
  info: { name: "goldenmatch.dedupe", produces: ["clusters", "golden"], consumes: ["df"] },
  /** Requires rows in the context; throws when `ctx.df` is null. */
  validate(ctx) {
    if (ctx.df === null) {
      throw new Error("DedupeStage: no df in context");
    }
  },
  /**
   * Run deduplication. Config precedence: a non-empty `ctx.stageConfig`
   * wins; otherwise a config is derived from the `column_contexts` artifact
   * when that is a non-empty array; otherwise `dedupe` runs without a config.
   */
  async run(ctx) {
    const rows = castRowsToString(ctx.df ?? []);
    ctx.df = rows;
    const overrides = ctx.stageConfig;
    const hasOverrides = overrides && Object.keys(overrides).length > 0;
    let config = null;
    if (hasOverrides) {
      config = makeConfig(overrides);
    } else {
      const contexts = ctx.artifacts["column_contexts"];
      if (Array.isArray(contexts) && contexts.length > 0) {
        config = buildConfigFromContexts(contexts, rows);
      }
    }
    const result = await (config === null ? dedupe(rows) : dedupe(rows, { config }));
    Object.assign(ctx.artifacts, {
      clusters: result.clusters,
      golden: result.goldenRecords,
      unique: result.unique,
      dupes: result.dupes,
      match_stats: result.stats,
      scored_pairs: result.scoredPairs
    });
    const matchkeys = config?.matchkeys;
    if (matchkeys && matchkeys.length > 0) {
      // Only the first matchkey's name is recorded.
      ctx.artifacts["matchkey_used"] = matchkeys[0].name;
    }
    return { status: StageStatus.SUCCESS };
  },
  rollback: null
};
|
|
591
|
+
/**
 * Derive a dedupe config (matchkeys + blocking) from column contexts.
 *
 * Matchkeys:
 *   1. one exact matchkey per EMAIL column;
 *   2. one weighted "fuzzy_names" matchkey over all NAME identifier columns;
 *   3. only when neither exists, a weighted "fuzzy_fallback" over up to three
 *      STRING/NAME columns (with rows present, each needs at least
 *      max(10, 5% of rows) distinct values).
 * Without any matchkey the result is null.
 *
 * Blocking:
 *   - the geo partner is the GEO column with at most 20% blanks and the
 *     fewest distinct values (earliest wins ties);
 *   - anchored on the first NAME identifier column whose name contains
 *     "last", else on the first NAME identifier column;
 *   - else on geo plus the first field of the first weighted matchkey;
 *   - else auto-suggested blocking.
 *
 * @param {object[]} contexts - Column contexts.
 * @param {object[]} rows - Stringified rows used for cardinality/null statistics.
 * @returns {object|null} Result of `makeConfig`, or null when no matchkey could be built.
 */
function buildConfigFromContexts(contexts, rows) {
  // Fresh array per use so no two configs share a transforms list.
  const cleanup = () => ["lowercase", "strip"];
  const fuzzyField = (col) => makeMatchkeyField({
    field: col.name,
    scorer: "jaro_winkler",
    weight: 1,
    transforms: cleanup()
  });
  const weightedKey = (name, fields) => makeMatchkeyConfig({
    name,
    type: "weighted",
    threshold: 0.85,
    fields
  });
  const hasType = (c, type) => c.inferredType === type;

  const nameCols = contexts.filter((c) => hasType(c, ColumnType.NAME) && c.isIdentifier);
  const emailCols = contexts.filter((c) => hasType(c, ColumnType.EMAIL));
  const geoCols = contexts.filter((c) => hasType(c, ColumnType.GEO));

  const matchkeys = emailCols.map(
    (col) => makeMatchkeyConfig({
      name: `exact_${col.name}`,
      type: "exact",
      fields: [makeMatchkeyField({ field: col.name, transforms: cleanup(), scorer: "exact" })]
    })
  );
  if (nameCols.length > 0) {
    matchkeys.push(weightedKey("fuzzy_names", nameCols.map((col) => fuzzyField(col))));
  }
  if (matchkeys.length === 0) {
    let candidates = contexts.filter(
      (c) => hasType(c, ColumnType.STRING) || hasType(c, ColumnType.NAME)
    );
    if (rows.length > 0) {
      const minCardinality = Math.max(10, Math.floor(rows.length * 0.05));
      candidates = candidates.filter((c) => distinctNonNull(rows, c.name) >= minCardinality);
    }
    const fallbackFields = candidates.slice(0, 3).map((col) => fuzzyField(col));
    if (fallbackFields.length > 0) {
      matchkeys.push(weightedKey("fuzzy_fallback", fallbackFields));
    }
  }
  if (matchkeys.length === 0) {
    return null;
  }

  // Pick the usable geo column with the fewest distinct values.
  let bestGeo = null;
  if (geoCols.length > 0 && rows.length > 0) {
    const maxNullRate = 0.2;
    let found = false;
    let fewestDistinct = 0;
    for (const geo of geoCols) {
      if (!(nullRateOf(rows, geo.name) <= maxNullRate)) {
        continue;
      }
      const distinct = distinctNonNull(rows, geo.name);
      // Strict "<" keeps the earliest column on ties.
      if (!found || distinct < fewestDistinct) {
        found = true;
        fewestDistinct = distinct;
        bestGeo = geo.name;
      }
    }
  }

  const makeBlocking = (primaryFields, recallName, withGeo = false) => {
    const primaryPass = { fields: primaryFields, transforms: ["lowercase", "strip"] };
    const passes = [primaryPass];
    if (withGeo) {
      passes.push({ fields: primaryFields, transforms: ["lowercase", "substring:0:3"] });
    }
    passes.push({ fields: [recallName], transforms: ["lowercase", "soundex"] });
    return makeBlockingConfig({
      strategy: "multi_pass",
      keys: [primaryPass],
      passes,
      maxBlockSize: 500,
      skipOversized: true
    });
  };

  let blocking = null;
  const lastNameCols = nameCols.filter((c) => c.name.toLowerCase().includes("last"));
  const anchorCol = lastNameCols.length > 0 ? lastNameCols[0] : nameCols[0];
  if (nameCols.length > 0) {
    const anchorName = anchorCol.name;
    if (bestGeo) {
      blocking = makeBlocking([bestGeo, anchorName], anchorName, true);
    } else if (lastNameCols.length > 0) {
      blocking = makeBlocking([anchorName], anchorName);
    } else {
      blocking = makeBlockingConfig({
        strategy: "static",
        keys: [{ fields: [anchorName], transforms: ["lowercase", "soundex"] }],
        maxBlockSize: 500,
        skipOversized: true
      });
    }
  }
  if (!blocking && bestGeo && matchkeys.length > 0) {
    const firstWeighted = matchkeys.filter((mk) => mk.type === "weighted")[0];
    if (firstWeighted && firstWeighted.fields.length > 0) {
      const anchor = firstWeighted.fields[0].field;
      blocking = makeBlocking([bestGeo, anchor], anchor, true);
    }
  }
  if (!blocking) {
    blocking = makeBlockingConfig({ keys: [], autoSuggest: true });
  }
  return makeConfig({ matchkeys, blocking });
}
|
|
716
|
+
|
|
717
|
+
// src/core/adapters/index.ts
|
|
718
|
+
/**
 * Create a registry pre-populated with the built-in stages
 * (load, scan, transform, dedupe).
 *
 * @returns {StageRegistry} A new registry instance.
 */
function buildDefaultRegistry() {
  const registry = new StageRegistry();
  for (const builtin of [LoadStage, ScanStage, TransformStage, DedupeStage]) {
    registry.register(builtin);
  }
  return registry;
}
|
|
726
|
+
|
|
727
|
+
// src/core/pipeline.ts
|
|
728
|
+
// Stage names in the order the auto-generated pipeline runs them.
const DEFAULT_STAGE_ORDER = ["goldencheck.scan", "goldenflow.transform", "goldenmatch.dedupe"];
|
|
733
|
+
/** Entry point that resolves a config into a plan, runs it and builds the report. */
var Pipeline = class {
  config;
  registry;
  /**
   * @param {object} [options] - Optional `config` (pipeline config) and
   *   `registry`; the registry defaults to the built-in stages.
   */
  constructor(options) {
    const { config, registry } = options ?? {};
    this.config = config;
    this.registry = registry ?? buildDefaultRegistry();
  }
  /**
   * Run the pipeline over a list of rows.
   *
   * @param {object[]} rows - Input rows; copied, not mutated directly.
   * @param {string} [source="<rows>"] - Label recorded as the data source.
   * @returns {Promise<object>} The run report. A plan that cannot be resolved
   *   yields a FAILED report with a single "Pipeline resolution failed" error
   *   instead of throwing.
   */
  async run(rows, source = "<rows>") {
    const ctx = makePipeContext({
      df: [...rows],
      metadata: { source, input_rows: rows.length }
    });
    const config = this.config ?? this.autoConfig();
    const resolutionFailure = (message) => ({
      status: PipeStatus.FAILED,
      source,
      inputRows: rows.length,
      stages: {},
      artifacts: {},
      skipped: [],
      errors: [`Pipeline resolution failed: ${message}`],
      reasoning: {},
      timing: {}
    });
    let plan;
    try {
      plan = Resolver.resolve(config, this.registry);
    } catch (e) {
      return resolutionFailure(e instanceof Error ? e.message : String(e));
    }
    const stages = await new Runner(this.registry).run(plan, ctx);
    return Reporter.build(ctx, stages);
  }
  /** Build the default check→flow→dedupe config from whichever of those stages are registered. */
  autoConfig() {
    const available = this.registry.listAll();
    const stages = [];
    for (const name of DEFAULT_STAGE_ORDER) {
      if (name in available) {
        stages.push(makeStageSpec(name));
      }
    }
    return makePipelineConfig({ pipeline: "auto", stages });
  }
};
|
|
777
|
+
/**
 * Parse CSV text into an array of row objects keyed by the header record.
 *
 * The first record supplies the column names. Records consisting of a single
 * empty field (blank lines) are skipped. Missing trailing values become "",
 * and values beyond the header count are ignored.
 *
 * @param {string} content Raw CSV text.
 * @returns {Array<Object<string, string>>} One object per data record.
 */
function parseCsv(content) {
  const [headers, ...dataRecords] = parseRecords(content);
  if (headers === undefined) return [];

  const isBlank = (values) => values.length === 1 && values[0] === "";

  return dataRecords
    .filter((values) => !isBlank(values))
    .map((values) => {
      const row = {};
      headers.forEach((header, col) => {
        row[header] = col < values.length ? values[col] : "";
      });
      return row;
    });
}
|
|
793
|
+
/**
 * Read a CSV file synchronously as UTF-8 text and parse it with parseCsv().
 *
 * @param {string} path File path to read.
 * @returns {Array<Object<string, string>>} Parsed rows.
 * @throws Whatever readFileSync throws when the file cannot be read.
 */
function readCsv(path) {
  return parseCsv(readFileSync(path, "utf8"));
}
|
|
797
|
+
/**
 * Split CSV text into records, each an array of raw string fields.
 *
 * Handles double-quoted fields (with `""` as an escaped quote, and embedded
 * commas/newlines), and accepts `\n`, `\r\n`, or a lone `\r` as record
 * terminators. A leading byte-order mark (U+FEFF) is skipped so it does not
 * end up glued to the first field. An unterminated quote is not an error:
 * the rest of the input becomes part of that field.
 *
 * @param {string} content Raw CSV text.
 * @returns {string[][]} Records in input order; `[]` for empty input.
 */
function parseRecords(content) {
  const records = [];
  let field = "";
  let record = [];
  let inQuotes = false;
  const n = content.length;
  // Files saved as "UTF-8 with BOM" decode with U+FEFF as the first
  // character; without this skip the first header would be "\uFEFFname".
  let i = n > 0 && content.charCodeAt(0) === 0xfeff ? 1 : 0;

  const pushField = () => {
    record.push(field);
    field = "";
  };
  const pushRecord = () => {
    pushField();
    records.push(record);
    record = [];
  };

  while (i < n) {
    const ch = content[i];

    if (inQuotes) {
      if (ch !== '"') {
        field += ch;
        i += 1;
      } else if (content[i + 1] === '"') {
        // Doubled quote inside a quoted field is a literal quote.
        field += '"';
        i += 2;
      } else {
        inQuotes = false;
        i += 1;
      }
      continue;
    }

    switch (ch) {
      case '"':
        inQuotes = true;
        break;
      case ",":
        pushField();
        break;
      case "\r":
        // Treat CRLF as one terminator by stepping over the LF as well.
        if (content[i + 1] === "\n") i += 1;
        pushRecord();
        break;
      case "\n":
        pushRecord();
        break;
      default:
        field += ch;
    }
    i += 1;
  }

  // Flush a final record that was not terminated by a newline.
  if (field !== "" || record.length > 0) {
    pushRecord();
  }
  return records;
}
|
|
859
|
+
/**
 * Load a pipeline config from a YAML file and normalize it.
 *
 * The `yaml` package is imported lazily so it is only needed when a YAML
 * config is actually used.
 *
 * @param {string} path Path of the YAML config file.
 * @returns {Promise<object>} The config produced by normalizeConfig().
 * @throws {Error} If `yaml` cannot be imported, if the file does not exist
 *   ("Config file not found: ..."), if it cannot be read for another reason
 *   ("Failed to read config file ..."), or if parsing/normalization fails.
 */
async function loadConfig(path) {
  let parse;
  try {
    ({ parse } = await import('yaml'));
  } catch (cause) {
    throw new Error(
      "YAML support requires the optional `yaml` peer dependency. Run: npm install yaml",
      { cause }
    );
  }

  let content;
  try {
    content = readFileSync(path, "utf8");
  } catch (cause) {
    // Only a missing path is "not found"; permission errors, directories,
    // etc. get an accurate message instead of a misleading one.
    if (cause !== null && typeof cause === "object" && cause.code === "ENOENT") {
      throw new Error(`Config file not found: ${path}`, { cause });
    }
    const detail = cause instanceof Error ? cause.message : String(cause);
    throw new Error(`Failed to read config file ${path}: ${detail}`, { cause });
  }

  return normalizeConfig(parse(content));
}
|
|
878
|
+
/**
 * Validate a parsed config document and convert it to a pipeline config.
 *
 * Top-level keys read: `pipeline` (required string), `source`, `output`
 * (strings), `stages` (list, defaults to empty), `decisions` (list).
 * Each stage is either a string or a mapping with `use` (or `name` as a
 * fallback), plus optional `name`, `needs`, `skip_if`/`skipIf`,
 * `on_error`/`onError` ("abort" | "continue"), and `config`. When both the
 * snake_case and camelCase spelling are present, the camelCase one wins.
 *
 * @param {unknown} raw Parsed YAML/JSON value.
 * @returns {object} Result of makePipelineConfig().
 * @throws {Error} If the document or any stage entry is malformed.
 */
function normalizeConfig(raw) {
  if (raw === null || typeof raw !== "object") {
    throw new Error("Pipeline config must be a mapping");
  }
  const obj = raw;
  if (typeof obj["pipeline"] !== "string") {
    throw new Error("Pipeline config must have a string 'pipeline' field");
  }
  const rawStages = obj["stages"] ?? [];
  if (!Array.isArray(rawStages)) {
    throw new Error(`'stages' must be a list, got: ${typeof rawStages}`);
  }

  const stages = [];
  for (const s of rawStages) {
    if (typeof s === "string") {
      stages.push(makeStageSpec(s));
      continue;
    }
    if (s === null || typeof s !== "object") {
      throw new Error(`Invalid stage spec: ${JSON.stringify(s)}`);
    }
    const so = s;
    if (!("use" in so) && !("name" in so)) {
      throw new Error(`Stage spec must have 'use' field: ${JSON.stringify(s)}`);
    }
    const use = so["use"] ?? so["name"];
    // A present-but-empty key (e.g. `use:` in YAML) or a non-string value
    // would otherwise reach makeStageSpec as undefined / a wrong type.
    if (typeof use !== "string") {
      throw new Error(`Stage spec 'use' must be a string: ${JSON.stringify(s)}`);
    }
    stages.push(
      makeStageSpec({
        use,
        ...typeof so["name"] === "string" ? { name: so["name"] } : {},
        ...Array.isArray(so["needs"]) ? { needs: so["needs"] } : {},
        ...typeof so["skip_if"] === "string" ? { skipIf: so["skip_if"] } : {},
        ...typeof so["skipIf"] === "string" ? { skipIf: so["skipIf"] } : {},
        ...so["on_error"] === "abort" || so["on_error"] === "continue" ? { onError: so["on_error"] } : {},
        ...so["onError"] === "abort" || so["onError"] === "continue" ? { onError: so["onError"] } : {},
        ...so["config"] !== null && typeof so["config"] === "object" ? { config: so["config"] } : {}
      })
    );
  }

  return makePipelineConfig({
    pipeline: obj["pipeline"],
    ...typeof obj["source"] === "string" ? { source: obj["source"] } : {},
    ...typeof obj["output"] === "string" ? { output: obj["output"] } : {},
    stages,
    ...Array.isArray(obj["decisions"]) ? { decisions: obj["decisions"] } : {}
  });
}
|
|
924
|
+
|
|
925
|
+
// src/node/run.ts
|
|
926
|
+
/**
 * Read a CSV file and run a pipeline over its rows.
 *
 * A failure to read/parse the CSV is returned as a FAILED result rather than
 * thrown. Errors from loading a config file path are not caught here and
 * reject the returned promise.
 *
 * @param {string} source Path of the CSV file.
 * @param {object} [options]
 * @param {string|object} [options.config] Path to a YAML config (loaded via
 *   loadConfig) or an already-built config object.
 * @returns {Promise<object>} The pipeline result.
 */
async function run(source, options) {
  let rows;
  try {
    rows = readCsv(source);
  } catch (err) {
    const reason = err instanceof Error ? err.message : String(err);
    return {
      status: PipeStatus.FAILED,
      source,
      inputRows: 0,
      stages: {},
      artifacts: {},
      skipped: [],
      errors: [`Failed to load data: ${reason}`],
      reasoning: {},
      timing: {}
    };
  }

  const requested = options?.config;
  // A string is a config file path; anything else is passed through as-is.
  const config = typeof requested === "string" ? await loadConfig(requested) : requested;

  const pipelineOptions = config !== void 0 ? { config } : {};
  return new Pipeline(pipelineOptions).run(rows, source);
}
|
|
954
|
+
|
|
955
|
+
// src/cli.ts
|
|
956
|
+
// Version string handed to commander's .version() on the root command.
var VERSION = "0.1.0";
|
|
957
|
+
/**
 * Write a human-readable summary of a pipeline result to stdout.
 *
 * Prints a per-stage table, a one-line summary, and any errors. With
 * `verbose`, also prints reasoning entries (keys starting with "_" are
 * hidden) and per-stage timing formatted to two decimals.
 *
 * @param {object} result Pipeline result (source, status, inputRows, stages,
 *   errors, reasoning, timing).
 * @param {boolean} verbose Whether to include reasoning and timing.
 */
function printResult(result, verbose) {
  const emit = (text) => process.stdout.write(text);

  emit(`GoldenPipe: ${result.source}\n`);
  emit("Stage Status Details\n");
  Object.entries(result.stages).forEach(([name, sr]) => {
    const details = sr.error ?? "";
    emit(`${name.padEnd(25)} ${sr.status.padEnd(10)} ${details}\n`);
  });
  emit(`\n${result.status.toUpperCase()} | ${result.inputRows} rows | ${result.source}\n`);

  if (result.errors.length > 0) {
    emit("\nErrors:\n");
    result.errors.forEach((e) => {
      emit(` - ${e}\n`);
    });
  }

  if (!verbose) return;

  const visibleReasoning = Object.entries(result.reasoning).filter(
    ([key]) => !key.startsWith("_")
  );
  if (visibleReasoning.length > 0) {
    emit("\nReasoning:\n");
    visibleReasoning.forEach(([key, value]) => {
      emit(` ${key}: ${value}\n`);
    });
  }

  const timings = Object.entries(result.timing);
  if (timings.length > 0) {
    emit("\nTiming:\n");
    timings.forEach(([key, seconds]) => {
      emit(` ${key}: ${seconds.toFixed(2)}s\n`);
    });
  }
}
|
|
991
|
+
// Root CLI command; subcommands are attached to it below.
var program = new Command()
  .name("goldenpipe-js")
  .description("GoldenPipe: Golden Suite orchestrator (TypeScript)")
  .version(VERSION);
|
|
992
|
+
// `run <source>`: execute a pipeline on a CSV file and print the result.
// Sets a non-zero exit code when the result status is "failed".
program
  .command("run <source>")
  .description("Run a pipeline on a CSV data file")
  .option("-c, --config <path>", "Pipeline YAML config")
  .option("-v, --verbose", "Show reasoning and timing")
  .action(async (source, opts) => {
    const runOptions = opts.config !== void 0 ? { config: opts.config } : {};
    const result = await run(source, runOptions);
    printResult(result, opts.verbose ?? false);
    if (result.status === "failed") {
      process.exitCode = 1;
    }
  });
|
|
997
|
+
// `stages`: print every stage in the default registry, sorted by name,
// with the artifacts it produces and consumes, followed by a count.
program
  .command("stages")
  .description("List all registered stages")
  .action(() => {
    const all = buildDefaultRegistry().listAll();
    // Sort on the stage name explicitly. A comparator-less sort() would
    // stringify each [name, info] pair ("name,[object Object]") and compare
    // those strings, which mis-orders names containing characters that sort
    // before ",".
    const byName = ([a], [b]) => (a < b ? -1 : a > b ? 1 : 0);
    const entries = Object.entries(all).sort(byName);

    process.stdout.write("Registered stages\n");
    process.stdout.write("Name Produces Consumes\n");
    for (const [name, info] of entries) {
      const produces = info.produces.join(", ").padEnd(20);
      const consumes = info.consumes.join(", ");
      process.stdout.write(`${name.padEnd(26)} ${produces} ${consumes}\n`);
    }
    process.stdout.write(`\n${Object.keys(all).length} stage(s) found\n`);
  });
|
|
1012
|
+
// `validate`: load a config and resolve it against the default registry
// without running anything. Prints the resolved stage names on success;
// on failure writes the error to stderr and sets a non-zero exit code.
program
  .command("validate")
  .description("Dry-run wiring validation without executing")
  .requiredOption("-c, --config <path>", "Pipeline YAML config")
  .action(async (opts) => {
    try {
      const cfg = await loadConfig(opts.config);
      const registry = buildDefaultRegistry();
      const { stages } = Resolver.resolve(cfg, registry);
      process.stdout.write(`Valid -- ${stages.length} stages resolved\n`);
      stages.forEach((stage) => {
        process.stdout.write(` ${stage.name}\n`);
      });
    } catch (err) {
      let text;
      if (err instanceof WiringError) {
        text = `Wiring Error: ${err.message}\n`;
      } else {
        text = `Error: ${err instanceof Error ? err.message : String(err)}\n`;
      }
      process.stderr.write(text);
      process.exitCode = 1;
    }
  });
|
|
1033
|
+
// `init`: write a starter goldenpipe.yml listing every stage in the default
// registry (sorted by name, excluding one named "load") into the given dir.
program
  .command("init")
  .description("Generate a starter goldenpipe.yml from registered stages")
  .option("-d, --dir <dir>", "Directory to create config in", ".")
  .action((opts) => {
    const registered = buildDefaultRegistry().listAll();
    const stageLines = Object.keys(registered)
      .sort()
      .filter((name) => name !== "load")
      .map((name) => ` - ${name}`);
    const lines = ["pipeline: my-pipeline", "stages:", ...stageLines];

    const out = join(opts.dir, "goldenpipe.yml");
    writeFileSync(out, lines.join("\n") + "\n");
    process.stdout.write(`Created ${out}\n`);
  });
|
|
1046
|
+
// Entry point: parse argv and run the matched command. Any rejection is
// reported on stderr and turned into a non-zero exit code.
program.parseAsync(process.argv).catch((err) => {
  const text = err instanceof Error ? err.message : String(err);
  process.stderr.write(`${text}\n`);
  process.exitCode = 1;
});
|
|
1052
|
+
//# sourceMappingURL=cli.js.map
|
|
1053
|
+
//# sourceMappingURL=cli.js.map
|