goldenmatch 0.1.0 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +54 -2
- package/dist/cli.cjs +371 -14
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +371 -14
- package/dist/cli.js.map +1 -1
- package/dist/core/index.cjs +948 -213
- package/dist/core/index.cjs.map +1 -1
- package/dist/core/index.d.cts +10 -59
- package/dist/core/index.d.ts +10 -59
- package/dist/core/index.js +944 -214
- package/dist/core/index.js.map +1 -1
- package/dist/index.cjs +948 -213
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +2 -2
- package/dist/index.d.ts +2 -2
- package/dist/index.js +944 -214
- package/dist/index.js.map +1 -1
- package/dist/node/backends/score-worker.cjs.map +1 -1
- package/dist/node/backends/score-worker.d.cts +1 -1
- package/dist/node/backends/score-worker.d.ts +1 -1
- package/dist/node/backends/score-worker.js.map +1 -1
- package/dist/node/index.cjs +967 -213
- package/dist/node/index.cjs.map +1 -1
- package/dist/node/index.d.cts +3 -3
- package/dist/node/index.d.ts +3 -3
- package/dist/node/index.js +964 -214
- package/dist/node/index.js.map +1 -1
- package/dist/types-C-JSr4mQ.d.cts +557 -0
- package/dist/types-C-JSr4mQ.d.ts +557 -0
- package/examples/README.md +2 -0
- package/examples/strictModeParity.ts +78 -0
- package/examples/verificationInspection.ts +99 -0
- package/package.json +1 -1
- package/src/core/autoconfig.ts +133 -10
- package/src/core/autoconfigVerify.ts +907 -0
- package/src/core/domain.ts +9 -0
- package/src/core/index.ts +25 -0
- package/src/core/pipeline.ts +74 -2
- package/src/core/profiler.ts +122 -12
- package/src/core/types.ts +18 -0
- package/tests/parity/autoconfig-verify-fixtures.json +1147 -0
- package/tests/parity/autoconfigVerify-parity.test.ts +114 -0
- package/tests/smoke.test.ts +7 -0
- package/tests/unit/autoconfig-classifier.test.ts +59 -0
- package/tests/unit/autoconfig-multi-name.test.ts +33 -0
- package/tests/unit/autoconfig-preflight-integration.test.ts +37 -0
- package/tests/unit/autoconfig-weight-cap.test.ts +18 -0
- package/tests/unit/autoconfig-year.test.ts +49 -0
- package/tests/unit/autoconfig.test.ts +29 -8
- package/tests/unit/autoconfigVerify-invariants.test.ts +202 -0
- package/tests/unit/autoconfigVerify-postflight.test.ts +238 -0
- package/tests/unit/autoconfigVerify-preflight.test.ts +410 -0
- package/tests/unit/autoconfigVerify-types.test.ts +117 -0
- package/tests/unit/pprl-protocol.test.ts +1 -1
- package/tests/unit/profiler-confidence.test.ts +28 -0
- package/dist/types-DhUdX5Rc.d.cts +0 -304
- package/dist/types-DhUdX5Rc.d.ts +0 -304
package/README.md
CHANGED
|
@@ -9,7 +9,7 @@ npm install goldenmatch
|
|
|
9
9
|
[](https://www.npmjs.com/package/goldenmatch)
|
|
10
10
|
[](https://nodejs.org/)
|
|
11
11
|
[](https://github.com/benzsevern/goldenmatch/blob/main/LICENSE)
|
|
12
|
-
[](https://github.com/benzsevern/goldenmatch/tree/main/packages/goldenmatch-js/tests)
|
|
13
13
|
|
|
14
14
|
---
|
|
15
15
|
|
|
@@ -18,7 +18,7 @@ npm install goldenmatch
|
|
|
18
18
|
- **Edge-safe core** — the matching engine runs in browsers, Workers, Vercel Edge Runtime, Deno
|
|
19
19
|
- **Pure TypeScript** — no native dependencies required; peer deps unlock performance (hnswlib, ONNX, piscina)
|
|
20
20
|
- **Feature parity with Python goldenmatch** — same scorers, same clustering, same YAML configs
|
|
21
|
-
- **
|
|
21
|
+
- **590 tests, strict TypeScript** — `noUncheckedIndexedAccess`, `exactOptionalPropertyTypes`
|
|
22
22
|
|
|
23
23
|
## Quick Start
|
|
24
24
|
|
|
@@ -45,6 +45,58 @@ for (const record of result.goldenRecords) {
|
|
|
45
45
|
}
|
|
46
46
|
```
|
|
47
47
|
|
|
48
|
+
## Auto-Config Verification (v0.3)
|
|
49
|
+
|
|
50
|
+
Auto-generated configs are now checked both before the pipeline runs and after
|
|
51
|
+
scoring finishes, so you get actionable diagnostics instead of silent failures
|
|
52
|
+
on edge-case data.
|
|
53
|
+
|
|
54
|
+
### Preflight — six static checks
|
|
55
|
+
|
|
56
|
+
When you call `autoConfigureRows(rows)`, the returned config ships with a
|
|
57
|
+
`_preflightReport` summarising six config-time checks:
|
|
58
|
+
|
|
59
|
+
1. **missing_column** — matchkey/blocking references a column not in the data
|
|
60
|
+
2. **cardinality_high** — a column is near-unique (poor blocking signal)
|
|
61
|
+
3. **cardinality_low** — a column has too few distinct values to discriminate
|
|
62
|
+
4. **block_size** — a blocking key would produce oversized blocks
|
|
63
|
+
5. **remote_asset** — a scorer requires a model download (gated offline)
|
|
64
|
+
6. **weight_confidence** — a weighted matchkey's weights look unbalanced
|
|
65
|
+
|
|
66
|
+
Many findings trigger **auto-repairs** (field dropped, scorer swapped,
|
|
67
|
+
weight clamped). Unrepairable errors (`hasErrors === true`) raise
|
|
68
|
+
`ConfigValidationError` with the full report attached.
|
|
69
|
+
|
|
70
|
+
```ts
|
|
71
|
+
import { autoConfigureRows, ConfigValidationError } from "goldenmatch";
|
|
72
|
+
|
|
73
|
+
const cfg = autoConfigureRows(rows);
|
|
74
|
+
for (const f of cfg._preflightReport!.findings) {
|
|
75
|
+
console.log(`[${f.severity}] ${f.check}/${f.subject}: ${f.message}`);
|
|
76
|
+
}
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
Defaults are **offline-safe**: remote-asset scorers (cross-encoder, remote
|
|
80
|
+
embeddings) are dropped unless you opt in with `allowRemoteAssets: true`.
|
|
81
|
+
|
|
82
|
+
### Postflight — four runtime signals
|
|
83
|
+
|
|
84
|
+
Inside `dedupe()` / `match()`, after scoring but before clustering, the
|
|
85
|
+
pipeline computes four signals attached as `result.postflightReport`:
|
|
86
|
+
|
|
87
|
+
1. **scoreHistogram** — 100-bin pair-score distribution
|
|
88
|
+
2. **blockSizePercentiles** + **preliminaryClusterSizes** — p50/p95/p99/max
|
|
89
|
+
3. **thresholdOverlapPct** — fraction of pairs near the current threshold
|
|
90
|
+
4. **oversizedClusters** — components above size limit, with bottleneck pair
|
|
91
|
+
|
|
92
|
+
If the score distribution is clearly bimodal, postflight proposes a
|
|
93
|
+
threshold adjustment. In **strict mode** (`autoConfigureRows(rows, { strict: true })`
|
|
94
|
+
or manual `_strictAutoconfig: true`) the signals are still emitted but the
|
|
95
|
+
threshold is never touched — use this for reproducible CI pipelines.
|
|
96
|
+
|
|
97
|
+
See `examples/verificationInspection.ts` and `examples/strictModeParity.ts`
|
|
98
|
+
for runnable demos.
|
|
99
|
+
|
|
48
100
|
## Three entrypoints
|
|
49
101
|
|
|
50
102
|
```typescript
|
package/dist/cli.cjs
CHANGED
|
@@ -2492,6 +2492,261 @@ var init_golden = __esm({
|
|
|
2492
2492
|
];
|
|
2493
2493
|
}
|
|
2494
2494
|
});
|
|
2495
|
+
// Lazy initializer registered for "src/core/domain.ts".
// NOTE(review): `__esm` is defined outside this view; presumably the bundler's
// run-once module-init helper — confirm against the bundle prelude.
// The lower-cased brand Set built below is not assigned to any binding here;
// the expression is kept so that evaluating the initializer does the same work.
var init_domain = __esm({
  "src/core/domain.ts"() {
    const brandNames = [
      "apple",
      "samsung",
      "sony",
      "lg",
      "dell",
      "hp",
      "lenovo",
      "asus",
      "acer",
      "microsoft",
      "google",
      "amazon",
      "bose",
      "canon",
      "nikon",
      "panasonic",
      "philips",
      "toshiba"
    ];
    new Set(brandNames.map((brand) => brand.toLowerCase()));
  }
});
|
|
2521
|
+
|
|
2522
|
+
// src/core/autoconfigVerify.ts
|
|
2523
|
+
/**
 * Bucket pair scores into a 100-bin histogram and test it for a bimodal shape.
 *
 * Steps:
 *  1. Each score is mapped to bin `floor(score * 100)`, clamped to [0, 99].
 *     Scores that are not numbers (NaN / undefined) are skipped so they cannot
 *     add a stray "NaN" property to the `counts` array.
 *  2. Counts are smoothed with a 5-bin moving average (window truncated at the
 *     array edges).
 *  3. A peak is an interior bin that is strictly higher than its left
 *     neighbour, at least as high as its right neighbour, and at least
 *     `max(0.3 * tallest, 2 * mean)` of the smoothed series.
 *  4. The distribution is bimodal only when the first and last peaks are more
 *     than 10 bins apart and the lowest smoothed bin between them is below half
 *     the height of the shorter of those two peaks.
 *
 * @param {Iterable<{score: number}>} pairScores Scored pairs; only `score` is read.
 * @returns {{histogram: {bins: number[], counts: number[]}, valleyLocation: (number|null), isBimodal: boolean}}
 *   `bins` holds the 101 bin edges (0, 0.01, ..., 1); `valleyLocation` is the
 *   valley bin index divided by 100, or null when not bimodal.
 */
function signalScoreHistogram(pairScores) {
  const BIN_COUNT = 100;
  const bins = Array.from({ length: BIN_COUNT + 1 }, (_, i) => i / BIN_COUNT);
  const counts = new Array(BIN_COUNT).fill(0);
  for (const { score } of pairScores) {
    const scaled = Math.floor(score * BIN_COUNT);
    // NaN survives Math.min/Math.max and would index the array by "NaN".
    if (Number.isNaN(scaled)) continue;
    counts[Math.min(BIN_COUNT - 1, Math.max(0, scaled))] += 1;
  }

  // Every exit returns the same histogram; only the valley verdict differs.
  const verdict = (valleyLocation) => ({
    histogram: { bins, counts },
    valleyLocation,
    isBimodal: valleyLocation !== null
  });

  const smoothed = counts.map((_, i) => {
    const window = counts.slice(Math.max(0, i - 2), i + 3);
    const total = window.reduce((a, b) => a + b, 0);
    return total / Math.max(1, window.length);
  });

  const tallest = Math.max(...smoothed, 0);
  const mean = smoothed.reduce((a, b) => a + b, 0) / Math.max(1, smoothed.length);
  const minHeight = Math.max(tallest * 0.3, mean * 2);

  const peaks = [];
  for (let i = 1; i < smoothed.length - 1; i++) {
    const here = smoothed[i];
    if (here >= minHeight && here > smoothed[i - 1] && here >= smoothed[i + 1]) {
      peaks.push(i);
    }
  }
  if (peaks.length < 2) return verdict(null);

  const first = peaks[0];
  const last = peaks[peaks.length - 1];
  // Peaks this close together are treated as one mode.
  if (last - first <= 10) return verdict(null);

  // Lowest smoothed bin between the outermost peaks (earliest one on ties).
  let valleyIdx = first;
  let valleyVal = smoothed[first];
  for (let i = first + 1; i < last; i++) {
    if (smoothed[i] < valleyVal) {
      valleyVal = smoothed[i];
      valleyIdx = i;
    }
  }

  const depthRatio = valleyVal / Math.min(smoothed[first], smoothed[last]);
  if (depthRatio >= 0.5) return verdict(null);
  return verdict(valleyIdx / 100);
}
|
|
2590
|
+
/**
 * Return the `threshold` of the first matchkey whose `type` is "weighted".
 *
 * @param {{matchkeys?: Array<{type: string, threshold?: number}>}} config
 * @returns {(number|null|undefined)} The matchkey's `threshold` property as-is,
 *   or null when there is no weighted matchkey (or no `matchkeys` at all).
 */
function getFirstWeightedThreshold(config) {
  const matchkeys = config.matchkeys ?? [];
  for (const { type, threshold } of matchkeys) {
    if (type === "weighted") {
      return threshold;
    }
  }
  return null;
}
|
|
2596
|
+
/**
 * Blocking-recall signal. No computation is performed here: the function takes
 * no input and always reports the fixed marker string.
 *
 * @returns {string} Always "deferred".
 */
function signalBlockingRecall() {
  const status = "deferred";
  return status;
}
|
|
2599
|
+
/**
 * Estimate block-size percentiles for the configured blocking keys.
 *
 * Only the first 10,000 rows are examined. For every blocking key with at
 * least one field, rows are grouped by the tuple of their field values
 * (null/undefined become the "\0" sentinel, everything else is `String(v)`),
 * and each group's size is collected. Sizes from all keys are pooled before
 * percentiles are taken.
 *
 * The group key is the JSON encoding of the value tuple rather than a plain
 * concatenation, so that ("ab", "c") and ("a", "bc") stay in different blocks.
 *
 * @param {Array<Object>} rows Input records, indexed by field name.
 * @param {{blocking?: {keys?: Array<{fields: string[]}>}}} config
 * @returns {{p50: number, p95: number, p99: number, max: number}} All zeros when
 *   there is no blocking config, no keys, no rows, or no key with fields.
 */
function signalBlockSizePercentiles(rows, config) {
  const keys = config.blocking?.keys;
  if (!Array.isArray(keys) || keys.length === 0 || rows.length === 0) {
    return { p50: 0, p95: 0, p99: 0, max: 0 };
  }

  const sample = rows.slice(0, Math.min(rows.length, 1e4));
  const sizes = [];
  for (const key of keys) {
    if (key.fields.length === 0) continue;
    const groupSizes = new Map();
    for (const row of sample) {
      const parts = key.fields.map((field) => {
        const value = row[field];
        return value === null || value === void 0 ? "\0" : String(value);
      });
      // JSON array encoding is unambiguous across field boundaries.
      const groupKey = JSON.stringify(parts);
      groupSizes.set(groupKey, (groupSizes.get(groupKey) ?? 0) + 1);
    }
    sizes.push(...groupSizes.values());
  }

  sizes.sort((a, b) => a - b);
  const pct = (q) =>
    sizes.length === 0
      ? 0
      : sizes[Math.min(sizes.length - 1, Math.floor(sizes.length * q))] ?? 0;
  return {
    p50: pct(0.5),
    p95: pct(0.95),
    p99: pct(0.99),
    max: sizes.length === 0 ? 0 : sizes[sizes.length - 1]
  };
}
|
|
2628
|
+
/**
 * Fraction of scored pairs whose score lies within ±0.02 of `threshold`
 * (both ends inclusive).
 *
 * @param {Array<{score: number}>} pairScores Scored pairs; only `score` is read.
 * @param {number} threshold Centre of the band.
 * @returns {number} Value in [0, 1]; 0 when `pairScores` is empty.
 */
function signalThresholdOverlap(pairScores, threshold) {
  const total = pairScores.length;
  if (total === 0) {
    return 0;
  }
  const lower = threshold - 0.02;
  const upper = threshold + 0.02;
  const nearThreshold = pairScores.reduce(
    (hits, { score }) => (score >= lower && score <= upper ? hits + 1 : hits),
    0
  );
  return nearThreshold / total;
}
|
|
2638
|
+
/**
 * Preview cluster sizes by taking connected components over the pairs whose
 * score is at or above `threshold`.
 *
 * Components are built with a union-find over the ids that appear in the
 * qualifying pairs, so ids that never appear in such a pair are not counted.
 * For each component larger than `maxClusterSize`, the lowest-scoring
 * qualifying pair inside it is reported as the "bottleneck" (first such pair
 * on ties), as `[min(idA, idB), max(idA, idB)]`.
 *
 * The bottleneck search is a single pass over the qualifying pairs keyed by
 * component root, instead of one full rescan per oversized component.
 *
 * @param {Array<{idA: number, idB: number, score: number}>} pairScores
 * @param {number} threshold Minimum score for a pair to link two ids.
 * @param {number} [maxClusterSize=100] Components strictly larger than this are
 *   reported in `oversized`.
 * @returns {{percentiles: {p50: number, p95: number, p99: number, max: number, count: number},
 *            oversized: Array<{clusterId: number, size: number, bottleneckPair: number[]}>}}
 *   `clusterId` is a running index over the oversized components only;
 *   `bottleneckPair` is `[-1, -1]` when no pair scored below Infinity.
 */
function signalClusterSizes(pairScores, threshold, maxClusterSize = 100) {
  const linked = pairScores.filter((p) => p.score >= threshold);

  // Union-find with iterative path compression.
  const parent = new Map();
  const find = (x) => {
    let root = x;
    while ((parent.get(root) ?? root) !== root) root = parent.get(root);
    let node = x;
    while ((parent.get(node) ?? node) !== root) {
      const next = parent.get(node) ?? node;
      parent.set(node, root);
      node = next;
    }
    return root;
  };

  for (const { idA, idB } of linked) {
    if (!parent.has(idA)) parent.set(idA, idA);
    if (!parent.has(idB)) parent.set(idB, idB);
    const rootA = find(idA);
    const rootB = find(idB);
    if (rootA !== rootB) parent.set(rootA, rootB);
  }

  const sizeByRoot = new Map();
  for (const id of parent.keys()) {
    const root = find(id);
    sizeByRoot.set(root, (sizeByRoot.get(root) ?? 0) + 1);
  }

  const sizes = Array.from(sizeByRoot.values()).sort((a, b) => a - b);
  const pct = (q) =>
    sizes.length === 0
      ? 0
      : sizes[Math.min(sizes.length - 1, Math.floor(sizes.length * q))] ?? 0;
  const percentiles = {
    p50: pct(0.5),
    p95: pct(0.95),
    p99: pct(0.99),
    max: sizes.length === 0 ? 0 : sizes[sizes.length - 1],
    count: sizes.length
  };

  // Seed one slot per oversized component, in sizeByRoot order, so clusterId
  // numbering follows that order.
  const weakestByRoot = new Map();
  for (const [root, size] of sizeByRoot) {
    if (size > maxClusterSize) weakestByRoot.set(root, null);
  }

  if (weakestByRoot.size > 0) {
    // Both endpoints of a linked pair share a root, so idA's root is enough.
    for (const pair of linked) {
      const root = find(pair.idA);
      if (!weakestByRoot.has(root)) continue;
      const current = weakestByRoot.get(root);
      const currentMin = current === null ? Infinity : current.score;
      if (pair.score < currentMin) weakestByRoot.set(root, pair);
    }
  }

  const oversized = [];
  let clusterId = 0;
  for (const [root, weakest] of weakestByRoot) {
    const bottleneckPair =
      weakest === null
        ? [-1, -1]
        : [Math.min(weakest.idA, weakest.idB), Math.max(weakest.idA, weakest.idB)];
    oversized.push({ clusterId: clusterId++, size: sizeByRoot.get(root), bottleneckPair });
  }

  return { percentiles, oversized };
}
|
|
2699
|
+
/**
 * Compute the post-scoring signals for a run and derive threshold adjustments
 * and advisories from them.
 *
 * The working threshold is `options.currentThreshold`, else the first weighted
 * matchkey's threshold, else 0.7.
 *
 * - When the score histogram is bimodal with a valley, a "threshold"
 *   adjustment towards the valley is proposed, but only if
 *   `config._strictAutoconfig` is falsy and the valley is more than 0.05 away
 *   from the working threshold.
 * - Otherwise a "unimodal" advisory is recorded.
 * - When more than 20% of pairs fall in the threshold band and
 *   `config.llmScorer.enabled` is not `true`, an LLM-calibration advisory is
 *   recorded.
 *
 * @param {Array<Object>} rows Records passed through to the block-size signal.
 * @param {Object} config Pipeline config (reads `matchkeys`, `blocking`,
 *   `_strictAutoconfig`, `llmScorer`).
 * @param {{pairScores: Array<{idA: *, idB: *, score: number}>, currentThreshold?: number}} options
 * @returns {{signals: Object, adjustments: Array<Object>, advisories: string[]}}
 */
function postflight(rows, config, options) {
  const { pairScores } = options;
  const currentThreshold =
    options.currentThreshold ?? getFirstWeightedThreshold(config) ?? 0.7;

  const { histogram, valleyLocation, isBimodal } = signalScoreHistogram(pairScores);
  const adjustments = [];
  const advisories = [];

  const hasValley = isBimodal && valleyLocation !== null;
  if (!hasValley) {
    advisories.push(
      "score distribution is unimodal; threshold cannot be auto-set"
    );
  } else if (
    !config._strictAutoconfig &&
    Math.abs(valleyLocation - currentThreshold) > 0.05
  ) {
    adjustments.push({
      field: "threshold",
      fromValue: currentThreshold,
      toValue: valleyLocation,
      reason: "histogram valley location differs from current threshold",
      signal: "scoreHistogram"
    });
  }

  const { percentiles, oversized } = signalClusterSizes(pairScores, currentThreshold);
  const thresholdOverlapPct = signalThresholdOverlap(pairScores, currentThreshold);
  const llmEnabled = config.llmScorer?.enabled === true;
  if (thresholdOverlapPct > 0.2 && !llmEnabled) {
    advisories.push(
      `${(thresholdOverlapPct * 100).toFixed(1)}% of pairs within threshold \xB10.02 \u2014 consider enabling LLM auto mode for calibration`
    );
  }

  const blockSizePercentiles = signalBlockSizePercentiles(rows, config);
  return {
    signals: {
      scoreHistogram: histogram,
      blockingRecall: signalBlockingRecall(),
      blockSizePercentiles,
      thresholdOverlapPct,
      totalPairsScored: pairScores.length,
      currentThreshold,
      preliminaryClusterSizes: percentiles,
      oversizedClusters: oversized
    },
    adjustments,
    advisories
  };
}
|
|
2745
|
+
// Lazy initializer registered for "src/core/autoconfigVerify.ts"; its only
// action is to run the domain module's initializer first.
// NOTE(review): `__esm` is defined outside this view — presumably the bundler's
// run-once module-init helper; confirm against the bundle prelude.
var init_autoconfigVerify = __esm({
  "src/core/autoconfigVerify.ts": function () {
    init_domain();
  }
});
|
|
2495
2750
|
|
|
2496
2751
|
// src/core/pipeline.ts
|
|
2497
2752
|
function buildSourceLookup(rows) {
|
|
@@ -2521,6 +2776,38 @@ function assignClusterIds(rows, clusters) {
|
|
|
2521
2776
|
return cid !== void 0 ? { ...row, __cluster_id__: cid } : row;
|
|
2522
2777
|
});
|
|
2523
2778
|
}
|
|
2779
|
+
/**
 * Structural check for a preflight report: a non-null object that has a
 * `findings` property holding an array.
 *
 * @param {*} v Candidate value.
 * @returns {boolean}
 */
function isPreflightReport(v) {
  if (v === null || typeof v !== "object") {
    return false;
  }
  if (!("findings" in v)) {
    return false;
  }
  return Array.isArray(v.findings);
}
|
|
2782
|
+
/**
 * Run postflight verification on scored pairs and apply any threshold
 * adjustment it proposes.
 *
 * Postflight only runs when `config._preflightReport` looks like a preflight
 * report; otherwise the pairs are returned untouched with `report: undefined`.
 * When `config._strictAutoconfig` is exactly `true`, the report is returned
 * but the pairs are never filtered. Otherwise every "threshold" adjustment
 * drops pairs scoring below its `toValue`, and an advisory is appended to the
 * report if that leaves no pairs at all.
 *
 * @param {Array<Object>} rows Records forwarded to `postflight`.
 * @param {Object} config Pipeline config.
 * @param {Array<{idA: *, idB: *, score: number}>} pairScores Scored pairs.
 * @returns {{pairScores: Array<Object>, report: (Object|undefined)}}
 */
function applyPostflight(rows, config, pairScores) {
  if (!isPreflightReport(config._preflightReport)) {
    return { pairScores, report: void 0 };
  }

  // Hand postflight only the three fields it needs.
  const slimPairs = pairScores.map(({ idA, idB, score }) => ({ idA, idB, score }));
  const report = postflight(rows, config, { pairScores: slimPairs });

  if (config._strictAutoconfig === true) {
    return { pairScores, report };
  }

  let kept = pairScores;
  const thresholdAdjustments = report.adjustments.filter(
    (adj) => adj.field === "threshold"
  );
  for (const { toValue: newThreshold } of thresholdAdjustments) {
    const before = kept.length;
    kept = kept.filter((pair) => pair.score >= newThreshold);
    const droppedEverything = before > 0 && kept.length === 0;
    if (droppedEverything) {
      report.advisories.push(
        `threshold adjustment to ${newThreshold.toFixed(3)} dropped all ${before} pairs`
      );
    }
  }
  return { pairScores: kept, report };
}
|
|
2524
2811
|
function runDedupePipeline(rows, config, options) {
|
|
2525
2812
|
if (rows.length === 0) {
|
|
2526
2813
|
return _emptyDedupeResult(config);
|
|
@@ -2570,8 +2857,13 @@ function runDedupePipeline(rows, config, options) {
|
|
|
2570
2857
|
}
|
|
2571
2858
|
}
|
|
2572
2859
|
}
|
|
2860
|
+
const { pairScores: finalPairs, report: postflightReport } = applyPostflight(
|
|
2861
|
+
processed,
|
|
2862
|
+
config,
|
|
2863
|
+
allPairs
|
|
2864
|
+
);
|
|
2573
2865
|
const allIds = collectRowIds(processed);
|
|
2574
|
-
const pairTuples =
|
|
2866
|
+
const pairTuples = finalPairs.map((p) => [
|
|
2575
2867
|
p.idA,
|
|
2576
2868
|
p.idB,
|
|
2577
2869
|
p.score
|
|
@@ -2640,8 +2932,9 @@ function runDedupePipeline(rows, config, options) {
|
|
|
2640
2932
|
dupes,
|
|
2641
2933
|
unique,
|
|
2642
2934
|
stats,
|
|
2643
|
-
scoredPairs:
|
|
2644
|
-
config
|
|
2935
|
+
scoredPairs: finalPairs,
|
|
2936
|
+
config,
|
|
2937
|
+
...postflightReport !== void 0 ? { postflightReport } : {}
|
|
2645
2938
|
};
|
|
2646
2939
|
}
|
|
2647
2940
|
function runMatchPipeline(targetRows, referenceRows, config) {
|
|
@@ -2695,7 +2988,8 @@ function runMatchPipeline(targetRows, referenceRows, config) {
|
|
|
2695
2988
|
matchedCount: matched.length,
|
|
2696
2989
|
unmatchedCount: unmatched.length,
|
|
2697
2990
|
matchRate: targetRows.length > 0 ? matched.length / targetRows.length : 0
|
|
2698
|
-
}
|
|
2991
|
+
},
|
|
2992
|
+
...result.postflightReport !== void 0 ? { postflightReport: result.postflightReport } : {}
|
|
2699
2993
|
};
|
|
2700
2994
|
}
|
|
2701
2995
|
function _emptyDedupeResult(config) {
|
|
@@ -2724,6 +3018,7 @@ var init_pipeline = __esm({
|
|
|
2724
3018
|
init_scorer();
|
|
2725
3019
|
init_cluster();
|
|
2726
3020
|
init_golden();
|
|
3021
|
+
init_autoconfigVerify();
|
|
2727
3022
|
}
|
|
2728
3023
|
});
|
|
2729
3024
|
|
|
@@ -3567,6 +3862,12 @@ var init_explain = __esm({
|
|
|
3567
3862
|
});
|
|
3568
3863
|
|
|
3569
3864
|
// src/core/profiler.ts
|
|
3865
|
+
/**
 * Decide whether a string value reads as a calendar year.
 *
 * A trailing ".0", ".00", ... is stripped first, then the text is converted
 * with `Number()`; the value qualifies when the result is an integer between
 * 1900 and 2100 inclusive.
 *
 * @param {string} v Raw cell text.
 * @returns {boolean}
 */
function isYearValue(v) {
  const withoutZeroFraction = v.replace(/\.0+$/, "");
  const year = Number(withoutZeroFraction);
  // Number.isInteger is already false for NaN and ±Infinity.
  return Number.isInteger(year) && year >= 1900 && year <= 2100;
}
|
|
3570
3871
|
function toStringOrNull(value) {
|
|
3571
3872
|
if (value === null || value === void 0) return null;
|
|
3572
3873
|
if (typeof value === "string") {
|
|
@@ -3575,10 +3876,22 @@ function toStringOrNull(value) {
|
|
|
3575
3876
|
}
|
|
3576
3877
|
return String(value);
|
|
3577
3878
|
}
|
|
3578
|
-
function
|
|
3579
|
-
if (values.length === 0) return "text";
|
|
3580
|
-
const n = values.length;
|
|
3879
|
+
/**
 * Guess a column's semantic type from its name alone.
 *
 * The lower-cased name is tested against an ordered list of patterns and the
 * first match wins, so earlier rules shadow later ones (e.g. a name matching
 * both the email and the name rule is reported as "email").
 *
 * @param {string} columnName Column header.
 * @returns {(string|null)} One of "email", "phone", "zip", "year", "date",
 *   "geo", "id", "name", or null when no pattern matches.
 */
function guessTypeByName(columnName) {
  const lowered = columnName.toLowerCase();
  // Order matters: rules are evaluated top to bottom.
  const rules = [
    [/email|e_mail|e-mail/i, "email"],
    [/phone|tel(?!e)|mobile|cell/i, "phone"],
    [/zip|postal|postcode/i, "zip"],
    [YEAR_NAME_RE, "year"],
    [/date|created|modified|updated|_at$|birth|dob/i, "date"],
    [/^(city|state|county|country|region|province)/i, "geo"],
    [/city_desc|state_cd|country_code|state_code/i, "geo"],
    [/^id$|_id$|uuid|guid/i, "id"],
    [/name|first|last|full_name|surname/i, "name"]
  ];
  for (const [pattern, type] of rules) {
    if (pattern.test(lowered)) {
      return type;
    }
  }
  return null;
}
|
|
3892
|
+
function guessTypeByData(values) {
|
|
3893
|
+
if (values.length === 0) return null;
|
|
3894
|
+
const n = values.length;
|
|
3582
3895
|
const emailCount = values.reduce(
|
|
3583
3896
|
(acc, v) => acc + (EMAIL_VALUE_RE.test(v) ? 1 : 0),
|
|
3584
3897
|
0
|
|
@@ -3597,14 +3910,16 @@ function guessType(values, columnName) {
|
|
|
3597
3910
|
0
|
|
3598
3911
|
);
|
|
3599
3912
|
if (zipCount / n > 0.6) return "zip";
|
|
3913
|
+
let yearCount = 0;
|
|
3914
|
+
for (const v of values) {
|
|
3915
|
+
if (isYearValue(v)) yearCount++;
|
|
3916
|
+
}
|
|
3917
|
+
if (yearCount / n >= 0.95) return "year";
|
|
3600
3918
|
let dateCount = 0;
|
|
3601
3919
|
for (const v of values) {
|
|
3602
3920
|
if (DATE_VALUE_RES.some((re) => re.test(v))) dateCount++;
|
|
3603
3921
|
}
|
|
3604
3922
|
if (dateCount / n > 0.6) return "date";
|
|
3605
|
-
if (/^(city|state|county|country|region|province)/i.test(lname)) return "geo";
|
|
3606
|
-
if (/city_desc|state_cd|country_code|state_code/i.test(lname)) return "geo";
|
|
3607
|
-
if (/^id$|_id$|uuid|guid/i.test(lname)) return "id";
|
|
3608
3923
|
const nameCount = values.reduce(
|
|
3609
3924
|
(acc, v) => acc + (NAME_VALUE_RE.test(v) ? 1 : 0),
|
|
3610
3925
|
0
|
|
@@ -3615,7 +3930,44 @@ function guessType(values, columnName) {
|
|
|
3615
3930
|
if (/^-?\d+(\.\d+)?$/.test(v)) numericCount++;
|
|
3616
3931
|
}
|
|
3617
3932
|
if (numericCount / n > 0.8) return "numeric";
|
|
3618
|
-
|
|
3933
|
+
let totalLen = 0;
|
|
3934
|
+
let delimRows = 0;
|
|
3935
|
+
let totalDelims = 0;
|
|
3936
|
+
for (const v of values) {
|
|
3937
|
+
totalLen += v.length;
|
|
3938
|
+
const commas = (v.match(/,/g) ?? []).length;
|
|
3939
|
+
const semis = (v.match(/;/g) ?? []).length;
|
|
3940
|
+
const count = commas + semis;
|
|
3941
|
+
if (count > 0) {
|
|
3942
|
+
delimRows++;
|
|
3943
|
+
totalDelims += count;
|
|
3944
|
+
}
|
|
3945
|
+
}
|
|
3946
|
+
const avgLen = totalLen / n;
|
|
3947
|
+
const delimFraction = delimRows / n;
|
|
3948
|
+
const avgDelimsPerDelimRow = delimRows > 0 ? totalDelims / delimRows : 0;
|
|
3949
|
+
if (avgLen > 30 && delimFraction >= 0.7 && avgDelimsPerDelimRow >= 2) {
|
|
3950
|
+
return "multi_name";
|
|
3951
|
+
}
|
|
3952
|
+
return null;
|
|
3953
|
+
}
|
|
3954
|
+
/**
 * Combine the name-based and data-based type guesses into one type plus a
 * confidence score.
 *
 * - no values, or neither guess available: "text" at 0.3
 * - both guesses agree: that type at 0.9
 * - both present but different: the name guess wins when it is one of
 *   date/year/geo/id/email/zip/name, otherwise the data guess wins; 0.7
 * - only one guess available: that guess at 0.7
 *
 * @param {string[]} values Non-null sample values for the column.
 * @param {string} columnName Column header.
 * @returns {{type: string, confidence: number}}
 */
function guessTypeAndConfidence(values, columnName) {
  if (values.length === 0) {
    return { type: "text", confidence: 0.3 };
  }

  const byName = guessTypeByName(columnName);
  const byData = guessTypeByData(values);

  if (byName === null && byData === null) {
    return { type: "text", confidence: 0.3 };
  }
  if (byData === null) {
    return { type: byName, confidence: 0.7 };
  }
  if (byName === null) {
    return { type: byData, confidence: 0.7 };
  }
  if (byName === byData) {
    return { type: byName, confidence: 0.9 };
  }

  // Disagreement: these name-derived types take precedence over the data guess.
  const nameWins = ["date", "year", "geo", "id", "email", "zip", "name"].includes(byName);
  return {
    type: nameWins ? byName : byData,
    confidence: 0.7
  };
}
|
|
3620
3972
|
function profileColumn(name, rawValues) {
|
|
3621
3973
|
const totalCount = rawValues.length;
|
|
@@ -3643,7 +3995,10 @@ function profileColumn(name, rawValues) {
|
|
|
3643
3995
|
if (sampleValues.length >= 5) break;
|
|
3644
3996
|
}
|
|
3645
3997
|
const sampleForType = nonNull.length > 500 ? nonNull.slice(0, 500) : nonNull;
|
|
3646
|
-
const inferredType =
|
|
3998
|
+
const { type: inferredType, confidence } = guessTypeAndConfidence(
|
|
3999
|
+
sampleForType,
|
|
4000
|
+
name
|
|
4001
|
+
);
|
|
3647
4002
|
return {
|
|
3648
4003
|
name,
|
|
3649
4004
|
nullRate,
|
|
@@ -3654,7 +4009,8 @@ function profileColumn(name, rawValues) {
|
|
|
3654
4009
|
inferredType,
|
|
3655
4010
|
avgLength,
|
|
3656
4011
|
maxLength: maxLen,
|
|
3657
|
-
sampleValues
|
|
4012
|
+
sampleValues,
|
|
4013
|
+
confidence
|
|
3658
4014
|
};
|
|
3659
4015
|
}
|
|
3660
4016
|
function profileRows(rows) {
|
|
@@ -3682,7 +4038,7 @@ function profileRows(rows) {
|
|
|
3682
4038
|
byName
|
|
3683
4039
|
};
|
|
3684
4040
|
}
|
|
3685
|
-
var EMAIL_VALUE_RE, PHONE_STRIP_RE, DATE_VALUE_RES, ZIP_VALUE_RE, NAME_VALUE_RE;
|
|
4041
|
+
var EMAIL_VALUE_RE, PHONE_STRIP_RE, DATE_VALUE_RES, ZIP_VALUE_RE, NAME_VALUE_RE, YEAR_NAME_RE;
|
|
3686
4042
|
var init_profiler = __esm({
|
|
3687
4043
|
"src/core/profiler.ts"() {
|
|
3688
4044
|
EMAIL_VALUE_RE = /^[^\s@]+@[^\s@]+\.[^\s@]+$/;
|
|
@@ -3694,6 +4050,7 @@ var init_profiler = __esm({
|
|
|
3694
4050
|
];
|
|
3695
4051
|
ZIP_VALUE_RE = /^\d{5}(-?\d{4})?$/;
|
|
3696
4052
|
NAME_VALUE_RE = /^[A-Za-z][A-Za-z \-']{0,28}[A-Za-z]$|^[A-Za-z]{2,3}$/;
|
|
4053
|
+
YEAR_NAME_RE = /(^|_)(year|yr)(_|$)/i;
|
|
3697
4054
|
}
|
|
3698
4055
|
});
|
|
3699
4056
|
|