goldenpipe 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,861 @@
1
+ import { TabularData, scanData, severityLabel } from 'goldencheck/core';
2
+ import { TransformEngine } from 'goldenflow/core';
3
+ import { makeConfig, dedupe, makeMatchkeyConfig, makeMatchkeyField, makeBlockingConfig } from 'goldenmatch/core';
4
+
5
+ // src/core/models.ts
6
/** Status values recorded for an individual stage result. */
const StageStatus = {
  SUCCESS: 'success',
  SKIPPED: 'skipped',
  FAILED: 'failed',
};

/** Status values reported for a pipeline run as a whole. */
const PipeStatus = {
  SUCCESS: 'success',
  PARTIAL: 'partial',
  FAILED: 'failed',
};
16
/**
 * Build a complete decision object, substituting defaults for any field that
 * is missing (or null) on the input.
 *
 * @param {object} [input] - Partial decision; may be omitted or null.
 * @returns {{skip: Array, abort: boolean, insert: Array, reason: string}}
 *   `skip`/`insert` default to empty arrays, `abort` to false, `reason` to "".
 */
function makeDecision(input) {
  const partial = input ?? {};
  const decision = {};
  decision.skip = partial.skip ?? [];
  decision.abort = partial.abort ?? false;
  decision.insert = partial.insert ?? [];
  decision.reason = partial.reason ?? "";
  return decision;
}
24
/**
 * Build a pipe context, substituting defaults for any field that is missing
 * (or null) on the input.
 *
 * @param {object} [input] - Partial context; may be omitted or null.
 * @returns {object} Context with `df` (default null) and the object-valued
 *   fields `artifacts`, `metadata`, `timing`, `reasoning`, `stageConfig`
 *   (each defaulting to a fresh empty object).
 */
function makePipeContext(input) {
  const partial = input ?? {};
  const context = { df: partial.df ?? null };
  context.artifacts = partial.artifacts ?? {};
  context.metadata = partial.metadata ?? {};
  context.timing = partial.timing ?? {};
  context.reasoning = partial.reasoning ?? {};
  context.stageConfig = partial.stageConfig ?? {};
  return context;
}
34
/**
 * Normalize a stage reference into a full stage spec.
 *
 * @param {string|object} input - Either the stage name to `use`, or a partial
 *   spec object with `use` and optional `name`, `needs`, `skipIf`, `onError`,
 *   `config`.
 * @returns {object} Spec with `use`, `needs` (default []), `onError`
 *   (default "continue") and `config` (default {}); `name` and `skipIf` are
 *   present only when the input defined them.
 */
function makeStageSpec(input) {
  if (typeof input === "string") {
    return { use: input, needs: [], onError: "continue", config: {} };
  }

  const spec = {};
  // Optional keys are added only when defined so they never appear as
  // explicit `undefined` properties.
  if (input.name !== undefined) {
    spec.name = input.name;
  }
  spec.use = input.use;
  spec.needs = input.needs ?? [];
  if (input.skipIf !== undefined) {
    spec.skipIf = input.skipIf;
  }
  spec.onError = input.onError ?? "continue";
  spec.config = input.config ?? {};
  return spec;
}
47
/**
 * Build a pipeline config from a partial description.
 *
 * @param {object} input - Must carry `pipeline` and `stages`; `source`,
 *   `output` and `decisions` are optional.
 * @returns {object} Config with `pipeline`, `stages` and `decisions`
 *   (default []); `source` and `output` are present only when defined.
 */
function makePipelineConfig(input) {
  const config = { pipeline: input.pipeline };
  if (input.source !== undefined) {
    config.source = input.source;
  }
  if (input.output !== undefined) {
    config.output = input.output;
  }
  config.stages = input.stages;
  config.decisions = input.decisions ?? [];
  return config;
}
56
/**
 * Wrap a plain function as a stage object.
 *
 * @param {object} info - Stage descriptor stored as-is on the result.
 * @param {Function} fn - Called with the context each time the stage runs.
 * @returns {object} Stage with `info`, a no-op `validate`, an async `run`
 *   that resolves to whatever `fn(ctx)` returns, and `rollback: null`.
 */
function stage(info, fn) {
  const validate = (_ctx) => {};
  // Kept async so a synchronous throw from `fn` surfaces as a rejection.
  const run = async (ctx) => fn(ctx);
  return { info, validate, run, rollback: null };
}
67
+
68
+ // src/core/columnContext.ts
69
/** Semantic column types assigned to columns during classification. */
const ColumnType = {
  NAME: 'name',
  EMAIL: 'email',
  PHONE: 'phone',
  DATE: 'date',
  GEO: 'geo',
  ADDRESS: 'address',
  ZIP: 'zip',
  IDENTIFIER: 'identifier',
  NUMERIC: 'numeric',
  STRING: 'string',
  DESCRIPTION: 'description',
};

/** Cardinality band labels; UNSET is the empty string. */
const CardinalityBand = {
  UNSET: '',
  LOW: 'low',
  MID: 'mid',
  HIGH: 'high',
  SKIP: 'skip',
};

/** Floor used when a column's confidence is reduced. */
const MIN_CONFIDENCE = 0.3;

/** Column types treated as carrying an identifying signal. */
const IDENTIFIER_TYPES = /* @__PURE__ */ new Set([
  ColumnType.NAME,
  ColumnType.EMAIL,
  ColumnType.PHONE,
]);

/** Column types that are always forced to `isIdentifier = false`. */
const NEVER_IDENTIFIER_TYPES = /* @__PURE__ */ new Set([
  ColumnType.DATE,
  ColumnType.NUMERIC,
  ColumnType.IDENTIFIER,
]);
100
/**
 * Build and validate a column context.
 *
 * Missing fields get defaults: `inferredType` → ColumnType.STRING,
 * `nullRate` → 0, `cardinality` → 0, `isIdentifier` → false,
 * `transformsApplied`/`findings` → [], `confidence` → 0.5,
 * `cardinalityBand` → CardinalityBand.UNSET.
 *
 * The `transformsApplied` and `findings` arrays are copied, so later pushes
 * onto the context never mutate arrays owned by the caller.
 *
 * @param {object} input - Partial column context; `name` is required.
 * @returns {object} The validated column context.
 * @throws {Error} When `name` is empty, `nullRate` or `confidence` is outside
 *   [0, 1], or `cardinality` is negative. NaN is rejected for all three
 *   numeric fields.
 */
function makeColumnContext(input) {
  const ctx = {
    name: input.name,
    inferredType: input.inferredType ?? ColumnType.STRING,
    nullRate: input.nullRate ?? 0,
    cardinality: input.cardinality ?? 0,
    isIdentifier: input.isIdentifier ?? false,
    transformsApplied: [...(input.transformsApplied ?? [])],
    findings: [...(input.findings ?? [])],
    confidence: input.confidence ?? 0.5,
    cardinalityBand: input.cardinalityBand ?? CardinalityBand.UNSET
  };
  if (!ctx.name) {
    throw new Error("ColumnContext.name must be non-empty");
  }
  // All range checks use the negated form so NaN fails them as well.
  if (!(ctx.nullRate >= 0 && ctx.nullRate <= 1)) {
    throw new Error(`nullRate must be in [0, 1], got ${ctx.nullRate}`);
  }
  if (!(ctx.cardinality >= 0)) {
    throw new Error(`cardinality must be >= 0, got ${ctx.cardinality}`);
  }
  if (!(ctx.confidence >= 0 && ctx.confidence <= 1)) {
    throw new Error(`confidence must be in [0, 1], got ${ctx.confidence}`);
  }
  return ctx;
}
126
const NAME_PATTERNS = /(^name$|first.?name|last.?name|full.?name|fname|lname|surname|given.?name|middle)/i;
const EMAIL_PATTERNS = /(email|e.?mail|email.?addr)/i;
const PHONE_PATTERNS = /(phone|tel|mobile|fax|cell)/i;
const ZIP_PATTERNS = /(zip|postal|postcode|zip.?code)/i;
const ADDRESS_PATTERNS = /(address|street|addr|line.?1|line.?2)/i;
const GEO_PATTERNS = /(city|^state$|state.?cd|^country$|province|region|county)/i;
const DATE_PATTERNS = /(date|_dt$|_date$|registr|created|updated|birth.?d|dob)/i;
const ID_PATTERNS = /(^id$|^key$|^code$|^sku$|_id$|_key$)/i;

/**
 * Classify a column by its name alone.
 *
 * Patterns are tried in a fixed priority order (date, email, zip, geo,
 * address, phone, name, identifier) and the first match wins, so a name
 * matching several patterns gets the earliest type in that order.
 *
 * @param {string} colName - Column name to classify.
 * @returns {string|null} The matching ColumnType value, or null if no
 *   pattern matches.
 */
function classifyByName(colName) {
  // [pattern, ColumnType key] pairs, listed in priority order.
  const rules = [
    [DATE_PATTERNS, "DATE"],
    [EMAIL_PATTERNS, "EMAIL"],
    [ZIP_PATTERNS, "ZIP"],
    [GEO_PATTERNS, "GEO"],
    [ADDRESS_PATTERNS, "ADDRESS"],
    [PHONE_PATTERNS, "PHONE"],
    [NAME_PATTERNS, "NAME"],
    [ID_PATTERNS, "IDENTIFIER"],
  ];
  for (const [pattern, typeKey] of rules) {
    if (pattern.test(colName)) {
      return ColumnType[typeKey];
    }
  }
  return null;
}
145
/**
 * Map a raw dtype string onto a ColumnType by substring matching.
 *
 * The input is lower-cased and trimmed first. Anything containing "int" or
 * "float" is NUMERIC; otherwise anything containing "date" or "time" is DATE;
 * everything else, booleans included, is STRING.
 *
 * @param {string} rawType - Raw dtype label.
 * @returns {string} ColumnType.NUMERIC, ColumnType.DATE or ColumnType.STRING.
 */
function normalizeDtype(rawType) {
  const normalized = rawType.toLowerCase().trim();
  const mentions = (...needles) => needles.some((needle) => normalized.includes(needle));

  if (mentions("int", "float")) {
    return ColumnType.NUMERIC;
  }
  if (mentions("date", "time")) {
    return ColumnType.DATE;
  }
  // Booleans have no dedicated type and fall through to STRING.
  return ColumnType.STRING;
}
152
/**
 * Assign a cardinality band to every context, in place.
 *
 * Quartile cut-offs are taken from the sorted cardinalities of the columns
 * that are neither NUMERIC nor DATE. If fewer than three such columns exist
 * the function returns without touching any context. Otherwise NUMERIC/DATE
 * columns get SKIP, columns at or below the lower cut-off get LOW, columns at
 * or above the upper cut-off get HIGH, and the rest get MID.
 *
 * @param {object[]} contexts - Column contexts; `cardinalityBand` is mutated.
 * @returns {void}
 */
function computeCardinalityBands(contexts) {
  const isExcluded = (c) =>
    c.inferredType === ColumnType.NUMERIC || c.inferredType === ColumnType.DATE;

  const sorted = [];
  for (const c of contexts) {
    if (!isExcluded(c)) {
      sorted.push(c.cardinality);
    }
  }
  if (sorted.length < 3) {
    return;
  }
  sorted.sort((a, b) => a - b);

  const lowerCut = sorted[Math.floor(sorted.length / 4)];
  const upperCut = sorted[Math.floor(3 * sorted.length / 4)];

  const bandFor = (c) => {
    if (isExcluded(c)) return CardinalityBand.SKIP;
    // LOW is tested first, so it wins when both cut-offs coincide.
    if (c.cardinality <= lowerCut) return CardinalityBand.LOW;
    if (c.cardinality >= upperCut) return CardinalityBand.HIGH;
    return CardinalityBand.MID;
  };

  for (const c of contexts) {
    c.cardinalityBand = bandFor(c);
  }
}
177
/**
 * Refine `isIdentifier` and `confidence` on each context, in place, from its
 * inferred type and cardinality band.
 *
 * - Types in NEVER_IDENTIFIER_TYPES are forced to non-identifier and nothing
 *   else is applied to them.
 * - Types in IDENTIFIER_TYPES: MID → identifier, +0.15; LOW → not an
 *   identifier, -0.2; HIGH → identifier, +0.05.
 * - Other types: MID promotes only STRING columns (confidence set to 0.5);
 *   LOW and HIGH mark the column as non-identifier.
 * - Any other band (unset / skip) leaves the column untouched.
 * - Finally, identifiers with a null rate above 0.3 lose 0.1 confidence.
 *
 * Confidence is capped at 1 and never reduced below MIN_CONFIDENCE here.
 *
 * @param {object[]} contexts - Column contexts to mutate.
 * @returns {void}
 */
function applyCardinalitySignal(contexts) {
  const raise = (c, delta) => {
    c.confidence = Math.min(c.confidence + delta, 1);
  };
  const lower = (c, delta) => {
    c.confidence = Math.max(c.confidence - delta, MIN_CONFIDENCE);
  };

  for (const c of contexts) {
    if (NEVER_IDENTIFIER_TYPES.has(c.inferredType)) {
      c.isIdentifier = false;
      continue;
    }

    const typeSuggestsIdentifier = IDENTIFIER_TYPES.has(c.inferredType);
    switch (c.cardinalityBand) {
      case CardinalityBand.MID:
        if (typeSuggestsIdentifier) {
          c.isIdentifier = true;
          raise(c, 0.15);
        } else if (c.inferredType === ColumnType.STRING) {
          c.isIdentifier = true;
          c.confidence = 0.5;
        }
        break;
      case CardinalityBand.LOW:
        c.isIdentifier = false;
        if (typeSuggestsIdentifier) {
          lower(c, 0.2);
        }
        break;
      case CardinalityBand.HIGH:
        c.isIdentifier = typeSuggestsIdentifier;
        if (typeSuggestsIdentifier) {
          raise(c, 0.05);
        }
        break;
      default:
        // No band information: leave the column as it is.
        break;
    }

    if (c.nullRate > 0.3 && c.isIdentifier) {
      lower(c, 0.1);
    }
  }
}
209
/**
 * Build column contexts from scan output.
 *
 * One context is created per column profile (a later profile with the same
 * name replaces the earlier one). The semantic type comes from the column
 * name when a name pattern matches, otherwise from the profile's raw dtype.
 * Cardinality bands and the cardinality signal are then applied, and each
 * finding that names a known column is appended to that column's `findings`
 * as "<check>: <message>", with the message truncated to 80 characters.
 *
 * @param {Iterable<object>} findings - Findings with `column`, `check`, `message`.
 * @param {object[]} columnProfiles - Profiles with `name`, `inferredType`,
 *   `nullPct`, `uniqueCount`.
 * @returns {object[]} Column contexts; empty when there are no profiles.
 */
function buildContextsFromCheck(findings, columnProfiles) {
  if (!columnProfiles || columnProfiles.length === 0) {
    return [];
  }

  const byName = /* @__PURE__ */ new Map();
  for (const profile of columnProfiles) {
    // The name-based guess wins; the raw dtype is only the fallback.
    const semanticType =
      classifyByName(profile.name) || normalizeDtype(profile.inferredType ?? "string");
    byName.set(
      profile.name,
      makeColumnContext({
        name: profile.name,
        inferredType: semanticType,
        nullRate: profile.nullPct ?? 0,
        cardinality: profile.uniqueCount ?? 0,
        isIdentifier: IDENTIFIER_TYPES.has(semanticType),
        confidence: semanticType !== ColumnType.STRING ? 0.8 : 0.4
      })
    );
  }

  const contextList = Array.from(byName.values());
  computeCardinalityBands(contextList);
  applyCardinalitySignal(contextList);

  for (const finding of findings) {
    const target = finding.column ? byName.get(finding.column) : undefined;
    if (!target) {
      continue;
    }
    const label = finding.check ?? "";
    const summary = String(finding.message ?? "").slice(0, 80);
    target.findings.push(`${label}: ${summary}`);
  }
  return contextList;
}
242
/**
 * Fold transform records back into the column contexts, in place.
 *
 * For each record naming a known column: the transform name is appended to
 * `transformsApplied` when it affected at least one row, and any transform
 * whose name contains "date" (case-insensitive) reclassifies the column as a
 * DATE, non-identifier, with confidence 0.95 — whether or not rows were
 * affected.
 *
 * @param {object[]} contexts - Column contexts to mutate.
 * @param {Iterable<object>|null|undefined} records - Records with `column`,
 *   `transform`, `affectedRows`; a falsy value makes this a no-op.
 * @returns {void}
 */
function enrichContextsFromFlow(contexts, records) {
  if (!records) {
    return;
  }

  const byName = new Map();
  for (const c of contexts) {
    byName.set(c.name, c);
  }

  for (const { column, transform, affectedRows } of records) {
    if (!column || !byName.has(column)) {
      continue;
    }
    const target = byName.get(column);
    const touchedRows = (affectedRows ?? 0) > 0;
    if (touchedRows && transform) {
      target.transformsApplied.push(transform);
    }
    const isDateTransform = Boolean(transform) && transform.toLowerCase().includes("date");
    if (isDateTransform) {
      target.inferredType = ColumnType.DATE;
      target.isIdentifier = false;
      target.confidence = 0.95;
    }
  }
}
261
/**
 * Count the distinct values of one column, ignoring null, undefined and "".
 *
 * @param {Iterable<object>} rows - Row objects.
 * @param {string} col - Column key to read from each row.
 * @returns {number} Number of distinct remaining values (Set semantics).
 */
function distinctNonNull(rows, col) {
  const distinct = /* @__PURE__ */ new Set();
  for (const row of rows) {
    const value = row[col];
    const isBlank = value === null || value === undefined || value === "";
    if (!isBlank) {
      distinct.add(value);
    }
  }
  return distinct.size;
}
270
/**
 * Fraction of rows whose value for `col` is null, undefined or "".
 *
 * @param {object[]} rows - Row objects.
 * @param {string} col - Column key to read from each row.
 * @returns {number} A value in [0, 1]; an empty row list reports 1.
 */
function nullRateOf(rows, col) {
  const total = rows.length;
  if (total === 0) {
    return 1;
  }
  let blanks = 0;
  for (const row of rows) {
    const value = row[col];
    blanks += value === null || value === undefined || value === "" ? 1 : 0;
  }
  return blanks / total;
}
279
+
280
+ // src/core/engine/registry.ts
281
/** Name-keyed collection of stage objects. */
class StageRegistry {
  stages = /* @__PURE__ */ new Map();

  /** Register a stage under its `info.name`, replacing any previous entry. */
  register(stage2) {
    const { name } = stage2.info;
    this.stages.set(name, stage2);
  }

  /** Retrieve a stage by name. Throws if not found. */
  get(name) {
    const found = this.stages.get(name);
    if (found !== undefined) {
      return found;
    }
    throw new Error(`Stage '${name}' not found in registry`);
  }

  /** True when a stage with this name is registered. */
  has(name) {
    return this.stages.has(name);
  }

  /** Return `{ name: StageInfo }` for all registered stages. */
  listAll() {
    const infoByName = {};
    this.stages.forEach((registered, name) => {
      infoByName[name] = registered.info;
    });
    return infoByName;
  }
}
308
/**
 * Create a fresh StageRegistry with nothing registered in it.
 *
 * @returns {StageRegistry} A new, empty registry on every call.
 */
function defaultRegistry() {
  return new StageRegistry();
}
312
+
313
+ // src/core/engine/resolver.ts
314
/** Thrown when a stage consumes an artifact that no earlier stage produces. */
class WiringError extends Error {
  constructor(message) {
    super(message);
    this.name = "WiringError";
  }
}
320
const Resolver = {
  /**
   * Turn a pipeline config plus a registry into an ordered execution plan.
   *
   * When the registry has a `load` stage it is placed first and its
   * `produces` seed the set of available artifacts; otherwise `df` is assumed
   * to be available. Each configured stage is then looked up by `spec.use`,
   * checked against the artifacts produced so far, and appended to the plan
   * under `spec.name` (or the stage's own name).
   *
   * @param {object} config - Pipeline config; `config.stages` is iterated.
   * @param {object} registry - Object exposing `has(name)` and `get(name)`.
   * @returns {{stages: object[]}} Plan entries of `{ name, stage, spec, config }`.
   * @throws {WiringError} When a stage consumes an artifact that is not yet
   *   available. Errors thrown by `registry.get` propagate unchanged.
   */
  resolve(config, registry) {
    const planned = [];
    const produced = /* @__PURE__ */ new Set();
    const publish = (stageObj) => {
      for (const artifact of stageObj.info.produces) {
        produced.add(artifact);
      }
    };

    if (!registry.has("load")) {
      // Without a loader the caller is expected to supply `df` directly.
      produced.add("df");
    } else {
      const loader = registry.get("load");
      planned.push({ name: "load", stage: loader, spec: makeStageSpec("load"), config: {} });
      publish(loader);
    }

    for (const rawSpec of config.stages) {
      const spec = makeStageSpec(rawSpec);
      const stageObj = registry.get(spec.use);
      const name = spec.name ?? stageObj.info.name;

      for (const needed of stageObj.info.consumes) {
        if (produced.has(needed)) {
          continue;
        }
        const available = Array.from(produced).sort().join(", ");
        throw new WiringError(
          `Stage '${name}' consumes '${needed}' but no prior stage produces it. Available: ${available}`
        );
      }

      planned.push({ name, stage: stageObj, spec, config: spec.config });
      publish(stageObj);
    }

    return { stages: planned };
  }
};
358
+
359
+ // src/core/engine/router.ts
360
const Router = {
  /**
   * Apply a decision (skip / abort / insert) to the remaining stages and
   * return the new remaining list. The input list is never mutated.
   *
   * Missing `skip`, `insert` or `reason` fields are treated as empty, so a
   * partial decision such as `{ skip: ["x"] }` is accepted as-is.
   *
   * - `abort` empties the plan and records `ABORT: <reason>` in
   *   `ctx.reasoning._router`.
   * - Otherwise a non-empty `reason` is recorded verbatim, stages whose plan
   *   name is listed in `skip` are dropped, and stages listed in `insert`
   *   are looked up in the registry and placed at the front of what remains.
   *
   * @param {object} decision - Decision returned by a stage.
   * @param {object[]} remaining - Planned stages not yet executed.
   * @param {object} ctx - Pipe context; `ctx.reasoning` is written to.
   * @param {object} registry - Exposes `get(name)`; used only for inserts and
   *   expected to throw when a stage is unknown.
   * @returns {object[]} The stages still to run.
   */
  apply(decision, remaining, ctx, registry) {
    const reason = decision.reason ?? "";
    const skip = decision.skip ?? [];
    const insert = decision.insert ?? [];

    if (decision.abort) {
      ctx.reasoning["_router"] = `ABORT: ${reason}`;
      return [];
    }
    if (reason) {
      ctx.reasoning["_router"] = reason;
    }

    let next = remaining;
    if (skip.length > 0) {
      const skipSet = new Set(skip);
      next = next.filter((s) => !skipSet.has(s.name));
    }
    if (insert.length > 0) {
      const inserted = [];
      for (const name of insert) {
        inserted.push({
          name,
          stage: registry.get(name),
          spec: makeStageSpec(name),
          config: {}
        });
      }
      // Inserted stages run before everything that was already queued.
      next = [...inserted, ...next];
    }
    return next;
  }
};
395
+
396
+ // src/core/engine/runner.ts
397
/**
 * Emptiness-aware falsiness check.
 *
 * True for null, undefined, false, 0, "", empty arrays, empty Maps/Sets and
 * objects with no own enumerable keys. Everything else — including NaN,
 * functions, bigints and symbols — is reported as not falsy.
 *
 * @param {*} value - Value to test.
 * @returns {boolean}
 */
function isFalsy(value) {
  if (value == null) {
    return true;
  }
  switch (typeof value) {
    case "boolean":
      return value === false;
    case "number":
      return value === 0;
    case "string":
      return value === "";
    case "object":
      if (Array.isArray(value)) {
        return value.length === 0;
      }
      if (value instanceof Map || value instanceof Set) {
        return value.size === 0;
      }
      return Object.keys(value).length === 0;
    default:
      return false;
  }
}
405
/** Executes a resolved plan stage by stage against a pipe context. */
class Runner {
  registry;

  constructor(registry) {
    this.registry = registry;
  }

  /**
   * Execute an execution plan against a pipe context.
   *
   * Stages run in order. A stage whose `spec.skipIf` names a falsy artifact is
   * recorded as SKIPPED without running. Otherwise `ctx.stageConfig` is set,
   * `validate` and `run` are awaited, elapsed seconds go into `ctx.timing`,
   * and a non-null `result.decision` is routed through `Router.apply` to
   * rewrite the remaining stages. A thrown error is recorded as a FAILED
   * result; the loop stops only if that stage's `onError` is "abort".
   *
   * @param {{stages: Iterable<object>}} plan - Resolved plan (not mutated).
   * @param {object} ctx - Pipe context; artifacts, timing, reasoning and
   *   stageConfig are read and/or written.
   * @returns {Promise<object>} Stage results keyed by planned stage name.
   */
  async run(plan, ctx) {
    const outcomes = {};
    let queue = Array.from(plan.stages);

    while (queue.length > 0) {
      const current = queue.shift();
      const { name, spec, stage: impl } = current;

      const gate = spec.skipIf;
      if (gate && isFalsy(ctx.artifacts[gate])) {
        outcomes[name] = { status: StageStatus.SKIPPED };
        ctx.reasoning[name] = `Skipped: artifact '${gate}' is missing/falsy`;
        continue;
      }

      const startedAt = performance.now();
      const recordElapsed = () => {
        ctx.timing[name] = (performance.now() - startedAt) / 1e3;
      };

      try {
        ctx.stageConfig = current.config;
        await impl.validate(ctx);
        const result = await impl.run(ctx);
        recordElapsed();
        outcomes[name] = result;
        if (result.decision != null) {
          queue = Router.apply(result.decision, queue, ctx, this.registry);
        }
      } catch (err) {
        // Also reached when routing fails, in which case the stage's own
        // result is replaced by the failure record.
        recordElapsed();
        const message = err instanceof Error ? err.message : String(err);
        outcomes[name] = { status: StageStatus.FAILED, error: message };
        ctx.reasoning[name] = `Failed: ${message}`;
        if (spec.onError === "abort") {
          break;
        }
      }
    }
    return outcomes;
  }
}
447
+
448
+ // src/core/engine/reporter.ts
449
const Reporter = {
  /**
   * Summarize a finished run into a report object.
   *
   * Overall status ignores skipped stages: FAILED when every remaining stage
   * failed, SUCCESS when every remaining stage succeeded (or none remain),
   * PARTIAL otherwise. `errors` lists "<name>: <error>" for failed stages
   * that carry an error; `skipped` lists the names of skipped stages.
   *
   * @param {object} ctx - Pipe context; `metadata.source` / `metadata.input_rows`
   *   are used only when they are a string / number respectively.
   * @param {object} stages - Stage results keyed by stage name.
   * @returns {object} Report with shallow copies of artifacts, reasoning and
   *   timing.
   */
  build(ctx, stages) {
    const errors = [];
    const skipped = [];
    let executed = 0;
    let failed = 0;
    let succeeded = 0;

    for (const [name, result] of Object.entries(stages)) {
      if (result.status === StageStatus.SKIPPED) {
        skipped.push(name);
        continue;
      }
      executed += 1;
      if (result.status === StageStatus.FAILED) {
        failed += 1;
        if (result.error) {
          errors.push(`${name}: ${result.error}`);
        }
      } else if (result.status === StageStatus.SUCCESS) {
        succeeded += 1;
      }
    }

    let status = PipeStatus.PARTIAL;
    if (executed === 0 || succeeded === executed) {
      status = PipeStatus.SUCCESS;
    } else if (failed === executed) {
      status = PipeStatus.FAILED;
    }

    const { source, input_rows: inputRows } = ctx.metadata;
    return {
      status,
      source: typeof source === "string" ? source : "",
      inputRows: typeof inputRows === "number" ? inputRows : 0,
      stages,
      artifacts: { ...ctx.artifacts },
      skipped,
      errors,
      reasoning: { ...ctx.reasoning },
      timing: { ...ctx.timing }
    };
  }
};
478
+
479
+ // src/core/decisions.ts
480
/**
 * Read the `findings` artifact from a context.
 *
 * @param {object} ctx - Pipe context.
 * @returns {Array|null} The findings array, or null when the artifact is
 *   missing, not an array, or empty.
 */
function findingsOf(ctx) {
  const candidate = ctx.artifacts["findings"];
  const usable = Array.isArray(candidate) && candidate.length > 0;
  return usable ? candidate : null;
}
485
/**
 * Decision hook: abort when any finding has severity "critical".
 *
 * @param {object} ctx - Pipe context; reads `ctx.artifacts.findings`.
 * @returns {object|null} An abort decision with reason
 *   "Critical findings detected", or null when there are no findings or none
 *   is critical.
 */
function severityGate(ctx) {
  const findings = ctx.artifacts["findings"];
  const isCritical = (f) => f.severity === "critical";
  // `some` is false for an empty array, so that case yields null as well.
  if (!Array.isArray(findings) || !findings.some(isCritical)) {
    return null;
  }
  return makeDecision({ abort: true, reason: "Critical findings detected" });
}
494
/**
 * Decision hook: when any finding comes from the "pii_detection" check, swap
 * the "goldenmatch.dedupe" stage for "goldenmatch.dedupe_pprl".
 *
 * @param {object} ctx - Pipe context; reads `ctx.artifacts.findings`.
 * @returns {object|null} A skip + insert decision, or null when there are no
 *   findings or none of them is a PII finding.
 */
function piiRouter(ctx) {
  const findings = ctx.artifacts["findings"];
  const isPii = (f) => f.check === "pii_detection";
  // `some` is false for an empty array, so that case yields null as well.
  if (!Array.isArray(findings) || !findings.some(isPii)) {
    return null;
  }
  return makeDecision({
    skip: ["goldenmatch.dedupe"],
    insert: ["goldenmatch.dedupe_pprl"],
    reason: "PII detected, routing to PPRL matching"
  });
}
507
/**
 * Decision hook: skip "goldenmatch.dedupe" when the input has fewer than two
 * rows.
 *
 * @param {object} ctx - Pipe context; `ctx.metadata.input_rows` is used when
 *   it is a number and treated as 0 otherwise.
 * @returns {object|null} A skip decision, or null when there are at least
 *   two rows.
 */
function rowCountGate(ctx) {
  const reported = ctx.metadata["input_rows"];
  const rowCount = typeof reported === "number" ? reported : 0;
  if (!(rowCount < 2)) {
    return null;
  }
  return makeDecision({
    skip: ["goldenmatch.dedupe"],
    reason: `Only ${rowCount} row(s), skipping deduplication`
  });
}
517
+
518
+ // src/core/adapters/load.ts
519
/**
 * Built-in "load" stage. It declares that it produces `df` but does no work:
 * `validate` is a no-op and `run` reports success without touching the
 * context.
 */
const LoadStage = {
  info: { name: "load", produces: ["df"], consumes: [] },

  validate(_ctx) {},

  async run(_ctx) {
    const outcome = { status: StageStatus.SUCCESS };
    return outcome;
  },

  rollback: null
};
528
/**
 * Reduce a scan finding to the four fields the pipeline uses, with the
 * severity converted to a lower-cased label via `severityLabel`.
 *
 * @param {object} f - Finding with `severity`, `check`, `column`, `message`.
 * @returns {{severity: string, check: *, column: *, message: *}}
 */
function normalizeFinding(f) {
  const severity = severityLabel(f.severity).toLowerCase();
  const { check, column, message } = f;
  return { severity, check, column, message };
}
536
/**
 * Pick the profile fields used for context building.
 *
 * @param {object} cp - Column profile.
 * @returns {{name: *, inferredType: *, nullPct: *, uniqueCount: *}} A new
 *   object holding only those four fields.
 */
function toColumnProfileLike(cp) {
  const { name, inferredType, nullPct, uniqueCount } = cp;
  return { name, inferredType, nullPct, uniqueCount };
}
544
/**
 * "goldencheck.scan" stage: scans `ctx.df` with `scanData` and publishes the
 * `findings` and `profile` artifacts, plus derived `column_contexts`.
 */
const ScanStage = {
  info: { name: "goldencheck.scan", produces: ["findings", "profile"], consumes: ["df"] },

  /** Throws when the context carries no rows (`df === null`). */
  validate(ctx) {
    if (ctx.df !== null) {
      return;
    }
    throw new Error("ScanStage: no df in context");
  },

  /**
   * Run the scan. A non-empty `ctx.stageConfig` is passed to `scanData` as
   * options; an empty one is passed as `undefined`.
   *
   * @param {object} ctx - Pipe context; `ctx.artifacts` is written to.
   * @returns {Promise<{status: string}>} Always a success result.
   */
  async run(ctx) {
    const table = new TabularData(ctx.df ?? []);
    const overrides = ctx.stageConfig;
    const hasOverrides = overrides && Object.keys(overrides).length > 0;
    const scan = scanData(table, hasOverrides ? overrides : undefined);

    const findings = scan.findings.map(normalizeFinding);
    const columnProfiles = scan.profile.columns;
    ctx.artifacts["findings"] = findings;
    ctx.artifacts["profile"] = scan.profile;

    // Context building is best-effort: any failure falls back to an empty
    // list instead of failing the stage.
    let columnContexts;
    try {
      const profileLikes = columnProfiles.map(toColumnProfileLike);
      const findingLikes = findings.map(({ column, check, message }) => ({ column, check, message }));
      columnContexts = buildContextsFromCheck(findingLikes, profileLikes);
    } catch {
      columnContexts = [];
    }
    ctx.artifacts["column_contexts"] = columnContexts;

    return { status: StageStatus.SUCCESS };
  },

  rollback: null
};
576
/**
 * "goldenflow.transform" stage: runs `TransformEngine` over `ctx.df`,
 * replaces `ctx.df` with a copy of the transformed rows and publishes the
 * `manifest` artifact.
 */
const TransformStage = {
  info: { name: "goldenflow.transform", produces: ["df", "manifest"], consumes: ["df"] },

  /** Throws when the context carries no rows (`df === null`). */
  validate(ctx) {
    if (ctx.df !== null) {
      return;
    }
    throw new Error("TransformStage: no df in context");
  },

  /**
   * Run the transform. A non-empty `ctx.stageConfig` is passed to the engine
   * constructor; an empty one is passed as `undefined`.
   *
   * @param {object} ctx - Pipe context; `df` and `artifacts` are written to.
   * @returns {Promise<{status: string}>} Always a success result.
   */
  async run(ctx) {
    const input = ctx.df ?? [];
    const overrides = ctx.stageConfig;
    const hasOverrides = overrides && Object.keys(overrides).length > 0;
    const engine = new TransformEngine(hasOverrides ? overrides : undefined);
    const outcome = engine.transformDf(input);

    ctx.df = Array.from(outcome.rows);
    ctx.artifacts["manifest"] = outcome.manifest;

    const contexts = ctx.artifacts["column_contexts"];
    if (Array.isArray(contexts)) {
      // Enrichment is best-effort: errors are deliberately discarded so a
      // malformed manifest cannot fail the stage.
      try {
        const records = outcome.manifest.records.map(({ column, transform, affectedRows }) => ({
          column,
          transform,
          affectedRows
        }));
        enrichContextsFromFlow(contexts, records);
      } catch {
      }
    }

    return { status: StageStatus.SUCCESS };
  },

  rollback: null
};
607
/**
 * Return copies of the rows with every value converted to a string.
 *
 * null and undefined become ""; everything else goes through `String()`.
 * The input rows are not modified.
 *
 * @param {object[]} rows - Row objects.
 * @returns {object[]} New row objects with the same keys and string values.
 */
function castRowsToString(rows) {
  const stringify = (value) => (value === null || value === undefined ? "" : String(value));
  return rows.map((row) => {
    const converted = {};
    for (const key of Object.keys(row)) {
      converted[key] = stringify(row[key]);
    }
    return converted;
  });
}
616
/**
 * "goldenmatch.dedupe" stage: string-casts `ctx.df`, runs `dedupe` on it and
 * publishes the clustering artifacts.
 */
const DedupeStage = {
  info: { name: "goldenmatch.dedupe", produces: ["clusters", "golden"], consumes: ["df"] },

  /** Throws when the context carries no rows (`df === null`). */
  validate(ctx) {
    if (ctx.df !== null) {
      return;
    }
    throw new Error("DedupeStage: no df in context");
  },

  /**
   * Run deduplication.
   *
   * Config resolution: a non-empty `ctx.stageConfig` is passed through
   * `makeConfig`; otherwise a config is derived from the `column_contexts`
   * artifact when that is a non-empty array; otherwise `dedupe` is called
   * with no config at all.
   *
   * @param {object} ctx - Pipe context; `df` is replaced by the string-cast
   *   rows and `artifacts` is written to.
   * @returns {Promise<{status: string}>} Always a success result.
   */
  async run(ctx) {
    const rows = castRowsToString(ctx.df ?? []);
    ctx.df = rows;

    const explicit = ctx.stageConfig;
    const contexts = ctx.artifacts["column_contexts"];
    let config = null;
    if (explicit && Object.keys(explicit).length > 0) {
      config = makeConfig(explicit);
    } else if (Array.isArray(contexts) && contexts.length > 0) {
      // May still be null when no usable matchkey can be derived.
      config = buildConfigFromContexts(contexts, rows);
    }

    const result = config === null ? await dedupe(rows) : await dedupe(rows, { config });

    const published = {
      clusters: result.clusters,
      golden: result.goldenRecords,
      unique: result.unique,
      dupes: result.dupes,
      match_stats: result.stats,
      scored_pairs: result.scoredPairs
    };
    for (const [key, value] of Object.entries(published)) {
      ctx.artifacts[key] = value;
    }

    const matchkeys = config?.matchkeys;
    if (matchkeys && matchkeys.length > 0) {
      ctx.artifacts["matchkey_used"] = matchkeys[0].name;
    }
    return { status: StageStatus.SUCCESS };
  },

  rollback: null
};
651
/**
 * Derive a matching config (matchkeys + blocking) from column contexts.
 *
 * Matchkeys:
 * - one exact matchkey per EMAIL column;
 * - one weighted "fuzzy_names" matchkey over all NAME columns flagged as
 *   identifiers;
 * - if neither exists, a weighted "fuzzy_fallback" over up to three
 *   STRING/NAME columns. When rows are given, a column qualifies only if its
 *   distinct non-null count is at least max(10, 5% of the row count).
 * Returns null when no matchkey can be built.
 *
 * Blocking:
 * - the GEO column with the fewest distinct values, among those whose null
 *   rate is at most 0.2, is used as a geo key when rows are given;
 * - a name column containing "last" is preferred as the anchor, else the
 *   first identifier name column;
 * - with no name column but a geo key, the first field of the first weighted
 *   matchkey becomes the anchor;
 * - otherwise blocking falls back to `{ keys: [], autoSuggest: true }`.
 *
 * @param {object[]} contexts - Column contexts (`name`, `inferredType`,
 *   `isIdentifier`).
 * @param {object[]} rows - Data rows used for the cardinality/null checks.
 * @returns {object|null} Result of `makeConfig({ matchkeys, blocking })`, or
 *   null.
 */
function buildConfigFromContexts(contexts, rows) {
  const ofType = (type) => contexts.filter((c) => c.inferredType === type);
  const nameCols = ofType(ColumnType.NAME).filter((c) => c.isIdentifier);
  const emailCols = ofType(ColumnType.EMAIL);
  const geoCols = ofType(ColumnType.GEO);

  const fuzzyField = (col) =>
    makeMatchkeyField({
      field: col.name,
      scorer: "jaro_winkler",
      weight: 1,
      transforms: ["lowercase", "strip"]
    });
  const weightedKey = (name, fields) =>
    makeMatchkeyConfig({ name, type: "weighted", threshold: 0.85, fields });

  // --- matchkeys ---------------------------------------------------------
  const matchkeys = emailCols.map((col) =>
    makeMatchkeyConfig({
      name: `exact_${col.name}`,
      type: "exact",
      fields: [makeMatchkeyField({ field: col.name, transforms: ["lowercase", "strip"], scorer: "exact" })]
    })
  );

  if (nameCols.length > 0) {
    matchkeys.push(weightedKey("fuzzy_names", nameCols.map((col) => fuzzyField(col))));
  }

  if (matchkeys.length === 0) {
    const minCardinality = Math.max(10, Math.floor(rows.length * 0.05));
    const candidates = contexts.filter((c) => {
      const isTextual = c.inferredType === ColumnType.STRING || c.inferredType === ColumnType.NAME;
      if (!isTextual) {
        return false;
      }
      // Without rows there is nothing to measure, so every textual column qualifies.
      return rows.length === 0 || distinctNonNull(rows, c.name) >= minCardinality;
    });
    const fallbackFields = candidates.slice(0, 3).map((col) => fuzzyField(col));
    if (fallbackFields.length > 0) {
      matchkeys.push(weightedKey("fuzzy_fallback", fallbackFields));
    }
  }

  if (matchkeys.length === 0) {
    return null;
  }

  // --- geo blocking key ----------------------------------------------------
  let bestGeo = null;
  if (geoCols.length > 0 && rows.length > 0) {
    const maxNullRate = 0.2;
    const ranked = geoCols
      .filter((g) => nullRateOf(rows, g.name) <= maxNullRate)
      .map((g) => [g.name, distinctNonNull(rows, g.name)]);
    if (ranked.length > 0) {
      // Lowest cardinality first; the sort is stable, so ties keep column order.
      ranked.sort((a, b) => a[1] - b[1]);
      bestGeo = ranked[0][0];
    }
  }

  const multiPass = (primaryFields, recallName, withGeo = false) => {
    const passes = [{ fields: primaryFields, transforms: ["lowercase", "strip"] }];
    if (withGeo) {
      passes.push({ fields: primaryFields, transforms: ["lowercase", "substring:0:3"] });
    }
    passes.push({ fields: [recallName], transforms: ["lowercase", "soundex"] });
    return makeBlockingConfig({
      strategy: "multi_pass",
      keys: [passes[0]],
      passes,
      maxBlockSize: 500,
      skipOversized: true
    });
  };

  // --- blocking ----------------------------------------------------------
  let blocking = null;
  const lastNameCol = nameCols.find((c) => c.name.toLowerCase().includes("last"));
  const anchorCol = lastNameCol ?? nameCols[0];
  if (anchorCol !== undefined) {
    const anchorName = anchorCol.name;
    if (bestGeo) {
      blocking = multiPass([bestGeo, anchorName], anchorName, true);
    } else if (lastNameCol !== undefined) {
      blocking = multiPass([anchorName], anchorName);
    } else {
      blocking = makeBlockingConfig({
        strategy: "static",
        keys: [{ fields: [anchorName], transforms: ["lowercase", "soundex"] }],
        maxBlockSize: 500,
        skipOversized: true
      });
    }
  }

  if (!blocking && bestGeo && matchkeys.length > 0) {
    const firstWeighted = matchkeys.find((mk) => mk.type === "weighted");
    if (firstWeighted && firstWeighted.fields.length > 0) {
      const anchor = firstWeighted.fields[0].field;
      blocking = multiPass([bestGeo, anchor], anchor, true);
    }
  }

  if (!blocking) {
    blocking = makeBlockingConfig({ keys: [], autoSuggest: true });
  }
  return makeConfig({ matchkeys, blocking });
}
776
+
777
+ // src/core/adapters/index.ts
778
/**
 * Create a registry pre-populated with the built-in stages, registered in
 * the order load, scan, transform, dedupe.
 *
 * @returns {StageRegistry} A new registry on every call.
 */
function buildDefaultRegistry() {
  const registry = new StageRegistry();
  const builtins = [LoadStage, ScanStage, TransformStage, DedupeStage];
  for (const builtin of builtins) {
    registry.register(builtin);
  }
  return registry;
}
786
+
787
+ // src/core/pipeline.ts
788
/** Stage order used when no explicit config is supplied. */
const DEFAULT_STAGE_ORDER = [
  "goldencheck.scan",
  "goldenflow.transform",
  "goldenmatch.dedupe",
];

/** Runs a configured (or auto-configured) pipeline over an array of rows. */
class Pipeline {
  config;
  registry;

  /**
   * @param {{config?: object, registry?: object}} [options] - Without a
   *   registry, one is built with `buildDefaultRegistry()`.
   */
  constructor(options) {
    this.config = options?.config;
    this.registry = options?.registry ?? buildDefaultRegistry();
  }

  /**
   * Run the pipeline on an array of rows.
   *
   * A failure while resolving the plan does not throw: it is returned as a
   * FAILED report whose single error is prefixed with
   * "Pipeline resolution failed: ".
   *
   * @param {object[]} rows - Input rows; copied before use.
   * @param {string} [source="<rows>"] - Label recorded in the report.
   * @returns {Promise<object>} The run report.
   */
  async run(rows, source = "<rows>") {
    const inputRows = rows.length;
    const ctx = makePipeContext({
      df: [...rows],
      metadata: { source, input_rows: inputRows }
    });
    const config = this.config ?? this.autoConfig();

    let plan;
    try {
      plan = Resolver.resolve(config, this.registry);
    } catch (err) {
      const detail = err instanceof Error ? err.message : String(err);
      return {
        status: PipeStatus.FAILED,
        source,
        inputRows,
        stages: {},
        artifacts: {},
        skipped: [],
        errors: [`Pipeline resolution failed: ${detail}`],
        reasoning: {},
        timing: {}
      };
    }

    const stageResults = await new Runner(this.registry).run(plan, ctx);
    return Reporter.build(ctx, stageResults);
  }

  /** Build the default check→flow→dedupe config from the available stages. */
  autoConfig() {
    const available = this.registry.listAll();
    const stages = [];
    for (const name of DEFAULT_STAGE_ORDER) {
      if (name in available) {
        stages.push(makeStageSpec(name));
      }
    }
    return makePipelineConfig({ pipeline: "auto", stages });
  }
}
837
/**
 * Convenience wrapper: build a Pipeline and run it over `rows`.
 *
 * @param {object[]} rows - Input rows.
 * @param {object} [config] - Pipeline config; when undefined, the Pipeline is
 *   constructed with no config.
 * @param {string} [source="<rows>"] - Label forwarded to `Pipeline.run`.
 * @returns {Promise<object>} Whatever `Pipeline.run` resolves to.
 */
async function runDf(rows, config, source = "<rows>") {
  const options = {};
  if (config !== undefined) {
    options.config = config;
  }
  return new Pipeline(options).run(rows, source);
}
841
/**
 * Run an ad-hoc list of stage objects over `rows`.
 *
 * The stages are registered in a private registry and executed in the order
 * given, each under its own `info.name`. Any plan entry named "load" is
 * removed before execution.
 *
 * @param {object[]} stages - Stage objects (`info`, `validate`, `run`).
 * @param {object[]} rows - Input rows; copied before use.
 * @returns {Promise<object>} The run report built by `Reporter.build`.
 */
async function runStages(stages, rows) {
  const registry = new StageRegistry();
  for (const candidate of stages) {
    registry.register(candidate);
  }

  const stageSpecs = stages.map((candidate) => makeStageSpec(candidate.info.name));
  const config = makePipelineConfig({ pipeline: "programmatic", stages: stageSpecs });
  const ctx = makePipeContext({
    df: [...rows],
    metadata: { source: "<programmatic>", input_rows: rows.length }
  });

  const plan = Resolver.resolve(config, registry);
  // Drop every plan entry named "load" before running.
  plan.stages = plan.stages.filter((entry) => entry.name !== "load");

  const results = await new Runner(registry).run(plan, ctx);
  return Reporter.build(ctx, results);
}
858
+
859
+ export { CardinalityBand, ColumnType, DedupeStage, LoadStage, MIN_CONFIDENCE, PipeStatus, Pipeline, Reporter, Resolver, Router, Runner, ScanStage, StageRegistry, StageStatus, TransformStage, WiringError, buildConfigFromContexts, buildContextsFromCheck, buildDefaultRegistry, classifyByName, defaultRegistry, distinctNonNull, enrichContextsFromFlow, makeColumnContext, makeDecision, makePipeContext, makePipelineConfig, makeStageSpec, normalizeDtype, nullRateOf, piiRouter, rowCountGate, runDf, runStages, severityGate, stage };
860
+ //# sourceMappingURL=index.js.map
861
+ //# sourceMappingURL=index.js.map