goldenmatch 0.6.0 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +50 -0
- package/dist/cli.cjs +93 -4
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +93 -4
- package/dist/cli.js.map +1 -1
- package/dist/core/index.cjs +236 -5
- package/dist/core/index.cjs.map +1 -1
- package/dist/core/index.d.cts +220 -146
- package/dist/core/index.d.ts +220 -146
- package/dist/core/index.js +232 -6
- package/dist/core/index.js.map +1 -1
- package/dist/index.cjs +236 -5
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +2 -2
- package/dist/index.d.ts +2 -2
- package/dist/index.js +232 -6
- package/dist/index.js.map +1 -1
- package/dist/node/backends/score-worker.cjs +36 -1
- package/dist/node/backends/score-worker.cjs.map +1 -1
- package/dist/node/backends/score-worker.d.cts +1 -1
- package/dist/node/backends/score-worker.d.ts +1 -1
- package/dist/node/backends/score-worker.js +36 -1
- package/dist/node/backends/score-worker.js.map +1 -1
- package/dist/node/index.cjs +236 -5
- package/dist/node/index.cjs.map +1 -1
- package/dist/node/index.d.cts +3 -3
- package/dist/node/index.d.ts +3 -3
- package/dist/node/index.js +232 -6
- package/dist/node/index.js.map +1 -1
- package/dist/{types-DlzBTOit.d.cts → types-9yDagamh.d.cts} +33 -1
- package/dist/{types-DlzBTOit.d.ts → types-9yDagamh.d.ts} +33 -1
- package/package.json +1 -1
- package/src/core/autoconfigController.ts +14 -1
- package/src/core/autoconfigNegativeEvidence.ts +337 -0
- package/src/core/index.ts +12 -0
- package/src/core/pipeline.ts +7 -0
- package/src/core/scorer.ts +16 -1
- package/src/core/types.ts +61 -1
- package/tests/parity/negative-evidence-fixtures.json +1034 -0
- package/tests/parity/negativeEvidence.parity.test.ts +183 -0
- package/tests/unit/autoconfigNegativeEvidence.test.ts +282 -0
- package/tests/unit/autoconfigRules.negativeEvidence.test.ts +109 -0
- package/tests/unit/scorer.negativeEvidence.test.ts +72 -0
- package/tests/unit/scorer.pathY.test.ts +69 -0
- package/tests/unit/types.negativeEvidence.test.ts +80 -0
package/CHANGELOG.md
CHANGED
|
@@ -4,6 +4,56 @@ All notable changes to goldenmatch-js are documented in this file.
|
|
|
4
4
|
|
|
5
5
|
Format follows [Keep a Changelog](https://keepachangelog.com/). Versioning follows [Semantic Versioning](https://semver.org/) (strict after v1.0.0).
|
|
6
6
|
|
|
7
|
+
## [0.7.0] - 2026-05-10
|
|
8
|
+
|
|
9
|
+
Negative-evidence parity with Python `goldenmatch` v1.11 + v1.12 (Path Y).
|
|
10
|
+
Python v1.12 lifted DQbench T3 from 53.8% F1 to 85.5% (+31.7 pp) by applying
|
|
11
|
+
negative evidence (NE) as a post-filter on exact matchkeys directly; this release ports that
|
|
12
|
+
machinery to the TS runtime.
|
|
13
|
+
|
|
14
|
+
### Added
|
|
15
|
+
|
|
16
|
+
- `NegativeEvidenceField` interface and `makeNegativeEvidenceField` factory
|
|
17
|
+
in `src/core/types.ts` (defaults: `threshold=0.5`, `penalty=0.5`).
|
|
18
|
+
`MatchkeyConfig` variants (`ExactMatchkey`, `WeightedMatchkey`,
|
|
19
|
+
`ProbabilisticMatchkey`) now accept optional `negativeEvidence`.
|
|
20
|
+
`ExactMatchkey` also gains optional `threshold` so Path Y can stamp the
|
|
21
|
+
default 0.5 cutoff when NE is added without a user-set threshold.
|
|
22
|
+
- `src/core/autoconfigNegativeEvidence.ts`:
|
|
23
|
+
- `applyNegativeEvidence(mk, rowA, rowB)` — per-pair penalty sum.
|
|
24
|
+
- `applyNegativeEvidenceToExactPairs(pairs, mk, allRows)` — v1.12 Path Y
|
|
25
|
+
post-filter for `findExactMatches` output.
|
|
26
|
+
- `promoteNegativeEvidence(config, rows, columnPriors)` — eager rule
|
|
27
|
+
that walks both weighted AND exact matchkeys (v1.12 change). The
|
|
28
|
+
`_is_exact_matchkey_field` anchor gate is skipped on the exact branch.
|
|
29
|
+
- `pickScorerForColumn(colName, colType?)` — name-keyed scorer dispatch
|
|
30
|
+
matching Python `_pick_scorer_for_column` (`email→token_sort`,
|
|
31
|
+
`phone→exact+digits_only`, `address→token_sort`, otherwise
|
|
32
|
+
`ensemble`).
|
|
33
|
+
|
|
34
|
+
### Changed
|
|
35
|
+
|
|
36
|
+
- `findFuzzyMatches` (`src/core/scorer.ts`) — applies NE penalty after
|
|
37
|
+
weighted-sum aggregation, before the threshold compare. No-op when the
|
|
38
|
+
matchkey has no `negativeEvidence`.
|
|
39
|
+
- `pipeline.ts` — after `findExactMatches`, calls
|
|
40
|
+
`applyNegativeEvidenceToExactPairs` when the exact matchkey has NE set.
|
|
41
|
+
Mirrors Python v1.12 post-filter design; `findExactMatches`'s signature
|
|
42
|
+
is unchanged.
|
|
43
|
+
- `AutoConfigController.run()` — eager `promoteNegativeEvidence` pass runs
|
|
44
|
+
once on the full row set (not the sample) before the iteration loop,
|
|
45
|
+
matching Python's `auto_configure_df` pre-iteration pass.
|
|
46
|
+
|
|
47
|
+
### Tested
|
|
48
|
+
|
|
49
|
+
- 19 new unit tests across `types.negativeEvidence`, `autoconfigNegativeEvidence`,
|
|
50
|
+
`scorer.negativeEvidence`, `scorer.pathY`, and `autoconfigRules.negativeEvidence`.
|
|
51
|
+
- 6 new Python-parity fixtures in
|
|
52
|
+
`tests/parity/negative-evidence-fixtures.json` covering
|
|
53
|
+
clustered-email-different-surname, clustered-phone-different-name,
|
|
54
|
+
dense-population promotion, sparse no-op, blocking-field skip, and
|
|
55
|
+
idempotency. All 6 green vs Python `promote_negative_evidence`.
|
|
56
|
+
|
|
7
57
|
## [0.6.0] - 2026-05-10
|
|
8
58
|
|
|
9
59
|
Indicator-aware refit parity with Python `goldenmatch` v1.9 + v1.10.
|
package/dist/cli.cjs
CHANGED
|
@@ -264,7 +264,14 @@ function makeMatchkeyConfig(partial) {
|
|
|
264
264
|
const type = partial.type ?? "weighted";
|
|
265
265
|
const fields = partial.fields ?? [];
|
|
266
266
|
if (type === "exact") {
|
|
267
|
-
|
|
267
|
+
const out2 = {
|
|
268
|
+
name: partial.name,
|
|
269
|
+
type: "exact",
|
|
270
|
+
fields,
|
|
271
|
+
...partial.negativeEvidence !== void 0 ? { negativeEvidence: partial.negativeEvidence } : {},
|
|
272
|
+
...partial.threshold !== void 0 ? { threshold: partial.threshold } : {}
|
|
273
|
+
};
|
|
274
|
+
return out2;
|
|
268
275
|
}
|
|
269
276
|
if (type === "probabilistic") {
|
|
270
277
|
const out2 = {
|
|
@@ -275,7 +282,8 @@ function makeMatchkeyConfig(partial) {
|
|
|
275
282
|
...partial.emIterations !== void 0 ? { emIterations: partial.emIterations } : {},
|
|
276
283
|
...partial.convergenceThreshold !== void 0 ? { convergenceThreshold: partial.convergenceThreshold } : {},
|
|
277
284
|
...partial.linkThreshold !== void 0 ? { linkThreshold: partial.linkThreshold } : {},
|
|
278
|
-
...partial.reviewThreshold !== void 0 ? { reviewThreshold: partial.reviewThreshold } : {}
|
|
285
|
+
...partial.reviewThreshold !== void 0 ? { reviewThreshold: partial.reviewThreshold } : {},
|
|
286
|
+
...partial.negativeEvidence !== void 0 ? { negativeEvidence: partial.negativeEvidence } : {}
|
|
279
287
|
};
|
|
280
288
|
return out2;
|
|
281
289
|
}
|
|
@@ -287,7 +295,8 @@ function makeMatchkeyConfig(partial) {
|
|
|
287
295
|
...partial.autoThreshold !== void 0 ? { autoThreshold: partial.autoThreshold } : {},
|
|
288
296
|
...partial.rerank !== void 0 ? { rerank: partial.rerank } : {},
|
|
289
297
|
...partial.rerankModel !== void 0 ? { rerankModel: partial.rerankModel } : {},
|
|
290
|
-
...partial.rerankBand !== void 0 ? { rerankBand: partial.rerankBand } : {}
|
|
298
|
+
...partial.rerankBand !== void 0 ? { rerankBand: partial.rerankBand } : {},
|
|
299
|
+
...partial.negativeEvidence !== void 0 ? { negativeEvidence: partial.negativeEvidence } : {}
|
|
291
300
|
};
|
|
292
301
|
return out;
|
|
293
302
|
}
|
|
@@ -1748,6 +1757,75 @@ var init_cluster = __esm({
|
|
|
1748
1757
|
}
|
|
1749
1758
|
});
|
|
1750
1759
|
|
|
1760
|
+
// src/core/autoconfigNegativeEvidence.ts
|
|
1761
|
+
/**
 * Sum the negative-evidence penalties that apply to a single pair of rows.
 *
 * For every negative-evidence field configured on the matchkey, both row
 * values are stringified, transformed and scored against each other. A field
 * contributes its `penalty` when the similarity falls strictly below the
 * field's `threshold`.
 *
 * @param {object} matchkey - Matchkey config; its negative-evidence list is read via `getNegativeEvidence`.
 * @param {object} rowA - First row, indexed by column name.
 * @param {object} rowB - Second row, indexed by column name.
 * @returns {number} Total penalty for the pair; 0 when no negative evidence is configured.
 */
function applyNegativeEvidence(matchkey, rowA, rowB) {
  const evidenceFields = getNegativeEvidence(matchkey);
  if (!evidenceFields || evidenceFields.length === 0) {
    return 0;
  }

  let penaltySum = 0;
  for (const evidence of evidenceFields) {
    const column = evidence.field;
    const leftRaw = asString(rowA[column]);
    const rightRaw = asString(rowB[column]);
    const left = applyTransforms(leftRaw, evidence.transforms);
    const right = applyTransforms(rightRaw, evidence.transforms);

    let similarity;
    try {
      similarity = scoreField(left, right, evidence.scorer);
    } catch {
      // A scorer that throws is treated as "no evidence" for this field.
      continue;
    }

    // A null similarity means the field could not be scored, so it adds no penalty.
    const disagrees = similarity !== null && similarity < evidence.threshold;
    if (disagrees) {
      penaltySum += evidence.penalty;
    }
  }
  return penaltySum;
}
|
|
1781
|
+
/**
 * Post-filter exact-match pairs using the matchkey's negative evidence.
 *
 * Each pair starts from a score of 1, has its negative-evidence penalty
 * subtracted (floored at 0), and is kept only when the resulting score is at
 * least the threshold returned by `exactThresholdForNe`. Kept pairs are
 * shallow copies carrying the adjusted `score`.
 *
 * @param {Iterable<object>} pairs - Candidate pairs with `idA` and `idB` row ids.
 * @param {object} matchkey - Matchkey config that may carry negative evidence.
 * @param {Iterable<object>} allRows - Rows keyed by their `__row_id__` property.
 * @returns {object[]} A new array: a plain copy of `pairs` when no negative
 *   evidence is configured, otherwise the surviving, re-scored pairs.
 */
function applyNegativeEvidenceToExactPairs(pairs, matchkey, allRows) {
  const evidenceFields = getNegativeEvidence(matchkey);
  if (!evidenceFields || evidenceFields.length === 0) {
    return [...pairs];
  }

  const cutoff = exactThresholdForNe(matchkey);

  // Index rows by id so each pair resolves in constant time.
  const rowsById = /* @__PURE__ */ new Map();
  for (const row of allRows) {
    const rowId = row["__row_id__"];
    if (rowId !== void 0) {
      rowsById.set(rowId, row);
    }
  }

  const survivors = [];
  for (const pair of pairs) {
    const first = rowsById.get(pair.idA);
    const second = rowsById.get(pair.idB);
    // Pairs that reference a row missing from the index are dropped.
    if (first !== void 0 && second !== void 0) {
      const deduction = applyNegativeEvidence(matchkey, first, second);
      const adjustedScore = Math.max(0, 1 - deduction);
      if (adjustedScore >= cutoff) {
        survivors.push({ ...pair, score: adjustedScore });
      }
    }
  }
  return survivors;
}
|
|
1803
|
+
/**
 * Read the negative-evidence list from a matchkey config.
 *
 * @param {object} mk - Matchkey config with a `type` discriminator.
 * @returns {Array|undefined} `mk.negativeEvidence` for "weighted", "exact" and
 *   "probabilistic" matchkeys (possibly undefined when unset); undefined for
 *   any other type.
 */
function getNegativeEvidence(mk) {
  switch (mk.type) {
    case "weighted":
    case "exact":
    case "probabilistic":
      return mk.negativeEvidence;
    default:
      return void 0;
  }
}
|
|
1810
|
+
/**
 * Resolve the score cutoff used when negative evidence is applied to pairs.
 *
 * For the known matchkey types ("exact", "weighted", "probabilistic") the
 * matchkey's own `threshold` is used when set; otherwise the cutoff falls back
 * to 0.5. Unknown types always get 0.5, regardless of any `threshold` property.
 *
 * The weighted branch previously returned `mk.threshold` unguarded. An unset
 * threshold would then reach the caller's `finalScore >= threshold` check as
 * `undefined`, which is false for every score and silently drops all pairs.
 * It now uses the same fallback as the other branches.
 *
 * @param {object} mk - Matchkey config with a `type` discriminator and optional `threshold`.
 * @returns {number} The cutoff a penalised score must reach to be kept.
 */
function exactThresholdForNe(mk) {
  const fallbackThreshold = 0.5;
  switch (mk.type) {
    case "exact":
    case "weighted":
    case "probabilistic":
      // `??` rather than `||` so an explicit threshold of 0 is honoured.
      return mk.threshold ?? fallbackThreshold;
    default:
      return fallbackThreshold;
  }
}
|
|
1821
|
+
// Module initializer for src/core/autoconfigNegativeEvidence.ts, created by
// the bundle's `__esm` helper (defined outside this hunk; presumably the
// bundler's run-once lazy-init wrapper — confirm against the bundle prelude).
// NOTE(review): init_scorer also calls init_autoconfigNegativeEvidence, so the
// two modules initialize each other; keep this a hoisted `var` binding.
var init_autoconfigNegativeEvidence = __esm({
  "src/core/autoconfigNegativeEvidence.ts"() {
    // Run the initializers for the types, transforms and scorer modules.
    init_types();
    init_transforms();
    init_scorer();
  }
});
|
|
1828
|
+
|
|
1751
1829
|
// src/core/scorer.ts
|
|
1752
1830
|
function asString(v) {
|
|
1753
1831
|
if (v === null || v === void 0) return null;
|
|
@@ -2214,10 +2292,16 @@ function findFuzzyMatches(rows, mk, excludePairs, preScoredPairs) {
|
|
|
2214
2292
|
}
|
|
2215
2293
|
}
|
|
2216
2294
|
}
|
|
2295
|
+
const ne = mk.type === "weighted" ? mk.negativeEvidence : void 0;
|
|
2296
|
+
const neActive = ne !== void 0 && ne.length > 0;
|
|
2217
2297
|
const results = [];
|
|
2218
2298
|
for (let i = 0; i < n; i++) {
|
|
2219
2299
|
for (let j = i + 1; j < n; j++) {
|
|
2220
|
-
|
|
2300
|
+
let score = combined[i][j];
|
|
2301
|
+
if (neActive) {
|
|
2302
|
+
const penalty = applyNegativeEvidence(mk, rows[i], rows[j]);
|
|
2303
|
+
score = Math.max(0, score - penalty);
|
|
2304
|
+
}
|
|
2221
2305
|
if (score < threshold) continue;
|
|
2222
2306
|
const idA = Math.min(rowIds[i], rowIds[j]);
|
|
2223
2307
|
const idB = Math.max(rowIds[i], rowIds[j]);
|
|
@@ -2274,6 +2358,7 @@ var init_scorer = __esm({
|
|
|
2274
2358
|
init_types();
|
|
2275
2359
|
init_cluster();
|
|
2276
2360
|
init_transforms();
|
|
2361
|
+
init_autoconfigNegativeEvidence();
|
|
2277
2362
|
}
|
|
2278
2363
|
});
|
|
2279
2364
|
|
|
@@ -3214,6 +3299,9 @@ async function runDedupePipeline(rows, config, options) {
|
|
|
3214
3299
|
for (const mk of matchkeys) {
|
|
3215
3300
|
if (mk.type === "exact") {
|
|
3216
3301
|
let pairs = findExactMatches(processed, mk);
|
|
3302
|
+
if (mk.negativeEvidence !== void 0 && mk.negativeEvidence.length > 0) {
|
|
3303
|
+
pairs = applyNegativeEvidenceToExactPairs(pairs, mk, processed);
|
|
3304
|
+
}
|
|
3217
3305
|
if (acrossFilesOnly) {
|
|
3218
3306
|
pairs = pairs.filter((p) => {
|
|
3219
3307
|
const srcA = sourceLookup.get(p.idA);
|
|
@@ -3408,6 +3496,7 @@ var init_pipeline = __esm({
|
|
|
3408
3496
|
init_standardize();
|
|
3409
3497
|
init_blocker();
|
|
3410
3498
|
init_scorer();
|
|
3499
|
+
init_autoconfigNegativeEvidence();
|
|
3411
3500
|
init_cluster();
|
|
3412
3501
|
init_golden();
|
|
3413
3502
|
init_autoconfigVerify();
|