goldenmatch 0.6.0 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. package/CHANGELOG.md +50 -0
  2. package/dist/cli.cjs +93 -4
  3. package/dist/cli.cjs.map +1 -1
  4. package/dist/cli.js +93 -4
  5. package/dist/cli.js.map +1 -1
  6. package/dist/core/index.cjs +236 -5
  7. package/dist/core/index.cjs.map +1 -1
  8. package/dist/core/index.d.cts +220 -146
  9. package/dist/core/index.d.ts +220 -146
  10. package/dist/core/index.js +232 -6
  11. package/dist/core/index.js.map +1 -1
  12. package/dist/index.cjs +236 -5
  13. package/dist/index.cjs.map +1 -1
  14. package/dist/index.d.cts +2 -2
  15. package/dist/index.d.ts +2 -2
  16. package/dist/index.js +232 -6
  17. package/dist/index.js.map +1 -1
  18. package/dist/node/backends/score-worker.cjs +36 -1
  19. package/dist/node/backends/score-worker.cjs.map +1 -1
  20. package/dist/node/backends/score-worker.d.cts +1 -1
  21. package/dist/node/backends/score-worker.d.ts +1 -1
  22. package/dist/node/backends/score-worker.js +36 -1
  23. package/dist/node/backends/score-worker.js.map +1 -1
  24. package/dist/node/index.cjs +236 -5
  25. package/dist/node/index.cjs.map +1 -1
  26. package/dist/node/index.d.cts +3 -3
  27. package/dist/node/index.d.ts +3 -3
  28. package/dist/node/index.js +232 -6
  29. package/dist/node/index.js.map +1 -1
  30. package/dist/{types-DlzBTOit.d.cts → types-9yDagamh.d.cts} +33 -1
  31. package/dist/{types-DlzBTOit.d.ts → types-9yDagamh.d.ts} +33 -1
  32. package/package.json +1 -1
  33. package/src/core/autoconfigController.ts +14 -1
  34. package/src/core/autoconfigNegativeEvidence.ts +337 -0
  35. package/src/core/index.ts +12 -0
  36. package/src/core/pipeline.ts +7 -0
  37. package/src/core/scorer.ts +16 -1
  38. package/src/core/types.ts +61 -1
  39. package/tests/parity/negative-evidence-fixtures.json +1034 -0
  40. package/tests/parity/negativeEvidence.parity.test.ts +183 -0
  41. package/tests/unit/autoconfigNegativeEvidence.test.ts +282 -0
  42. package/tests/unit/autoconfigRules.negativeEvidence.test.ts +109 -0
  43. package/tests/unit/scorer.negativeEvidence.test.ts +72 -0
  44. package/tests/unit/scorer.pathY.test.ts +69 -0
  45. package/tests/unit/types.negativeEvidence.test.ts +80 -0
package/CHANGELOG.md CHANGED
@@ -4,6 +4,56 @@ All notable changes to goldenmatch-js are documented in this file.
4
4
 
5
5
  Format follows [Keep a Changelog](https://keepachangelog.com/). Versioning follows [Semantic Versioning](https://semver.org/) (strict after v1.0.0).
6
6
 
7
+ ## [0.7.0] - 2026-05-10
8
+
9
+ Negative-evidence parity with Python `goldenmatch` v1.11 + v1.12 (Path Y).
10
+ Python v1.12 lifted DQbench T3 from 53.8% F1 to 85.5% (+31.7 pp) by applying
11
+ NE as a post-filter on exact matchkeys directly; this release ports that
12
+ machinery to the TS runtime.
13
+
14
+ ### Added
15
+
16
+ - `NegativeEvidenceField` interface and `makeNegativeEvidenceField` factory
17
+ in `src/core/types.ts` (defaults: `threshold=0.5`, `penalty=0.5`).
18
+ `MatchkeyConfig` variants (`ExactMatchkey`, `WeightedMatchkey`,
19
+ `ProbabilisticMatchkey`) now accept optional `negativeEvidence`.
20
+ `ExactMatchkey` also gains optional `threshold` so Path Y can stamp the
21
+ default 0.5 cutoff when NE is added without a user-set threshold.
22
+ - `src/core/autoconfigNegativeEvidence.ts`:
23
+ - `applyNegativeEvidence(mk, rowA, rowB)` — per-pair penalty sum.
24
+ - `applyNegativeEvidenceToExactPairs(pairs, mk, allRows)` — v1.12 Path Y
25
+ post-filter for `findExactMatches` output.
26
+ - `promoteNegativeEvidence(config, rows, columnPriors)` — eager rule
27
+ that walks both weighted AND exact matchkeys (v1.12 change). The
28
+ `_is_exact_matchkey_field` anchor gate is skipped on the exact branch.
29
+ - `pickScorerForColumn(colName, colType?)` — name-keyed scorer dispatch
30
+ matching Python `_pick_scorer_for_column` (`email→token_sort`,
31
+ `phone→exact+digits_only`, `address→token_sort`, otherwise
32
+ `ensemble`).
33
+
34
+ ### Changed
35
+
36
+ - `findFuzzyMatches` (`src/core/scorer.ts`) — applies NE penalty after
37
+ weighted-sum aggregation, before the threshold compare. No-op when the
38
+ matchkey has no `negativeEvidence`.
39
+ - `pipeline.ts` — after `findExactMatches`, calls
40
+ `applyNegativeEvidenceToExactPairs` when the exact matchkey has NE set.
41
+ Mirrors Python v1.12 post-filter design; `findExactMatches`'s signature
42
+ is unchanged.
43
+ - `AutoConfigController.run()` — eager `promoteNegativeEvidence` pass runs
44
+ once on the full row set (not the sample) before the iteration loop,
45
+ matching Python's `auto_configure_df` pre-iteration pass.
46
+
47
+ ### Tested
48
+
49
+ - 19 new unit tests across `types.negativeEvidence`, `autoconfigNegativeEvidence`,
50
+ `scorer.negativeEvidence`, `scorer.pathY`, and `autoconfigRules.negativeEvidence`.
51
+ - 6 new Python-parity fixtures in
52
+ `tests/parity/negative-evidence-fixtures.json` covering
53
+ clustered-email-different-surname, clustered-phone-different-name,
54
+ dense-population promotion, sparse no-op, blocking-field skip, and
55
+ idempotency. All 6 green vs Python `promote_negative_evidence`.
56
+
7
57
  ## [0.6.0] - 2026-05-10
8
58
 
9
59
  Indicator-aware refit parity with Python `goldenmatch` v1.9 + v1.10.
package/dist/cli.cjs CHANGED
@@ -264,7 +264,14 @@ function makeMatchkeyConfig(partial) {
264
264
  const type = partial.type ?? "weighted";
265
265
  const fields = partial.fields ?? [];
266
266
  if (type === "exact") {
267
- return { name: partial.name, type: "exact", fields };
267
+ const out2 = {
268
+ name: partial.name,
269
+ type: "exact",
270
+ fields,
271
+ ...partial.negativeEvidence !== void 0 ? { negativeEvidence: partial.negativeEvidence } : {},
272
+ ...partial.threshold !== void 0 ? { threshold: partial.threshold } : {}
273
+ };
274
+ return out2;
268
275
  }
269
276
  if (type === "probabilistic") {
270
277
  const out2 = {
@@ -275,7 +282,8 @@ function makeMatchkeyConfig(partial) {
275
282
  ...partial.emIterations !== void 0 ? { emIterations: partial.emIterations } : {},
276
283
  ...partial.convergenceThreshold !== void 0 ? { convergenceThreshold: partial.convergenceThreshold } : {},
277
284
  ...partial.linkThreshold !== void 0 ? { linkThreshold: partial.linkThreshold } : {},
278
- ...partial.reviewThreshold !== void 0 ? { reviewThreshold: partial.reviewThreshold } : {}
285
+ ...partial.reviewThreshold !== void 0 ? { reviewThreshold: partial.reviewThreshold } : {},
286
+ ...partial.negativeEvidence !== void 0 ? { negativeEvidence: partial.negativeEvidence } : {}
279
287
  };
280
288
  return out2;
281
289
  }
@@ -287,7 +295,8 @@ function makeMatchkeyConfig(partial) {
287
295
  ...partial.autoThreshold !== void 0 ? { autoThreshold: partial.autoThreshold } : {},
288
296
  ...partial.rerank !== void 0 ? { rerank: partial.rerank } : {},
289
297
  ...partial.rerankModel !== void 0 ? { rerankModel: partial.rerankModel } : {},
290
- ...partial.rerankBand !== void 0 ? { rerankBand: partial.rerankBand } : {}
298
+ ...partial.rerankBand !== void 0 ? { rerankBand: partial.rerankBand } : {},
299
+ ...partial.negativeEvidence !== void 0 ? { negativeEvidence: partial.negativeEvidence } : {}
291
300
  };
292
301
  return out;
293
302
  }
@@ -1748,6 +1757,75 @@ var init_cluster = __esm({
1748
1757
  }
1749
1758
  });
1750
1759
 
1760
+ // src/core/autoconfigNegativeEvidence.ts
1761
+ function applyNegativeEvidence(matchkey, rowA, rowB) {
1762
+ const ne = getNegativeEvidence(matchkey);
1763
+ if (!ne || ne.length === 0) return 0;
1764
+ let total = 0;
1765
+ for (const f of ne) {
1766
+ const rawA = asString(rowA[f.field]);
1767
+ const rawB = asString(rowB[f.field]);
1768
+ const valA = applyTransforms(rawA, f.transforms);
1769
+ const valB = applyTransforms(rawB, f.transforms);
1770
+ let sim;
1771
+ try {
1772
+ sim = scoreField(valA, valB, f.scorer);
1773
+ } catch {
1774
+ continue;
1775
+ }
1776
+ if (sim === null) continue;
1777
+ if (sim < f.threshold) total += f.penalty;
1778
+ }
1779
+ return total;
1780
+ }
1781
+ function applyNegativeEvidenceToExactPairs(pairs, matchkey, allRows) {
1782
+ const ne = getNegativeEvidence(matchkey);
1783
+ if (!ne || ne.length === 0) return [...pairs];
1784
+ const threshold = exactThresholdForNe(matchkey);
1785
+ const lookup = /* @__PURE__ */ new Map();
1786
+ for (const r of allRows) {
1787
+ const rid = r["__row_id__"];
1788
+ if (rid !== void 0) lookup.set(rid, r);
1789
+ }
1790
+ const out = [];
1791
+ for (const p of pairs) {
1792
+ const rowA = lookup.get(p.idA);
1793
+ const rowB = lookup.get(p.idB);
1794
+ if (rowA === void 0 || rowB === void 0) continue;
1795
+ const penalty = applyNegativeEvidence(matchkey, rowA, rowB);
1796
+ const finalScore = Math.max(0, 1 - penalty);
1797
+ if (finalScore >= threshold) {
1798
+ out.push({ ...p, score: finalScore });
1799
+ }
1800
+ }
1801
+ return out;
1802
+ }
1803
+ function getNegativeEvidence(mk) {
1804
+ if (mk.type === "weighted") return mk.negativeEvidence;
1805
+ if (mk.type === "exact") return mk.negativeEvidence;
1806
+ if (mk.type === "probabilistic")
1807
+ return mk.negativeEvidence;
1808
+ return void 0;
1809
+ }
1810
+ function exactThresholdForNe(mk) {
1811
+ if (mk.type === "exact") {
1812
+ const t = mk.threshold;
1813
+ return t ?? 0.5;
1814
+ }
1815
+ if (mk.type === "weighted") return mk.threshold;
1816
+ if (mk.type === "probabilistic") {
1817
+ return mk.threshold ?? 0.5;
1818
+ }
1819
+ return 0.5;
1820
+ }
1821
+ var init_autoconfigNegativeEvidence = __esm({
1822
+ "src/core/autoconfigNegativeEvidence.ts"() {
1823
+ init_types();
1824
+ init_transforms();
1825
+ init_scorer();
1826
+ }
1827
+ });
1828
+
1751
1829
  // src/core/scorer.ts
1752
1830
  function asString(v) {
1753
1831
  if (v === null || v === void 0) return null;
@@ -2214,10 +2292,16 @@ function findFuzzyMatches(rows, mk, excludePairs, preScoredPairs) {
2214
2292
  }
2215
2293
  }
2216
2294
  }
2295
+ const ne = mk.type === "weighted" ? mk.negativeEvidence : void 0;
2296
+ const neActive = ne !== void 0 && ne.length > 0;
2217
2297
  const results = [];
2218
2298
  for (let i = 0; i < n; i++) {
2219
2299
  for (let j = i + 1; j < n; j++) {
2220
- const score = combined[i][j];
2300
+ let score = combined[i][j];
2301
+ if (neActive) {
2302
+ const penalty = applyNegativeEvidence(mk, rows[i], rows[j]);
2303
+ score = Math.max(0, score - penalty);
2304
+ }
2221
2305
  if (score < threshold) continue;
2222
2306
  const idA = Math.min(rowIds[i], rowIds[j]);
2223
2307
  const idB = Math.max(rowIds[i], rowIds[j]);
@@ -2274,6 +2358,7 @@ var init_scorer = __esm({
2274
2358
  init_types();
2275
2359
  init_cluster();
2276
2360
  init_transforms();
2361
+ init_autoconfigNegativeEvidence();
2277
2362
  }
2278
2363
  });
2279
2364
 
@@ -3214,6 +3299,9 @@ async function runDedupePipeline(rows, config, options) {
3214
3299
  for (const mk of matchkeys) {
3215
3300
  if (mk.type === "exact") {
3216
3301
  let pairs = findExactMatches(processed, mk);
3302
+ if (mk.negativeEvidence !== void 0 && mk.negativeEvidence.length > 0) {
3303
+ pairs = applyNegativeEvidenceToExactPairs(pairs, mk, processed);
3304
+ }
3217
3305
  if (acrossFilesOnly) {
3218
3306
  pairs = pairs.filter((p) => {
3219
3307
  const srcA = sourceLookup.get(p.idA);
@@ -3408,6 +3496,7 @@ var init_pipeline = __esm({
3408
3496
  init_standardize();
3409
3497
  init_blocker();
3410
3498
  init_scorer();
3499
+ init_autoconfigNegativeEvidence();
3411
3500
  init_cluster();
3412
3501
  init_golden();
3413
3502
  init_autoconfigVerify();