goldenmatch 0.5.0 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. package/CHANGELOG.md +104 -0
  2. package/dist/cli.cjs +93 -4
  3. package/dist/cli.cjs.map +1 -1
  4. package/dist/cli.js +93 -4
  5. package/dist/cli.js.map +1 -1
  6. package/dist/core/index.cjs +1108 -151
  7. package/dist/core/index.cjs.map +1 -1
  8. package/dist/core/index.d.cts +267 -148
  9. package/dist/core/index.d.ts +267 -148
  10. package/dist/core/index.js +1104 -152
  11. package/dist/core/index.js.map +1 -1
  12. package/dist/index.cjs +1108 -151
  13. package/dist/index.cjs.map +1 -1
  14. package/dist/index.d.cts +2 -2
  15. package/dist/index.d.ts +2 -2
  16. package/dist/index.js +1104 -152
  17. package/dist/index.js.map +1 -1
  18. package/dist/node/backends/score-worker.cjs +36 -1
  19. package/dist/node/backends/score-worker.cjs.map +1 -1
  20. package/dist/node/backends/score-worker.d.cts +1 -1
  21. package/dist/node/backends/score-worker.d.ts +1 -1
  22. package/dist/node/backends/score-worker.js +36 -1
  23. package/dist/node/backends/score-worker.js.map +1 -1
  24. package/dist/node/index.cjs +1108 -151
  25. package/dist/node/index.cjs.map +1 -1
  26. package/dist/node/index.d.cts +3 -3
  27. package/dist/node/index.d.ts +3 -3
  28. package/dist/node/index.js +1104 -152
  29. package/dist/node/index.js.map +1 -1
  30. package/dist/{types-DlzBTOit.d.cts → types-9yDagamh.d.cts} +33 -1
  31. package/dist/{types-DlzBTOit.d.ts → types-9yDagamh.d.ts} +33 -1
  32. package/package.json +1 -1
  33. package/src/core/autoconfig.ts +185 -166
  34. package/src/core/autoconfigController.ts +20 -2
  35. package/src/core/autoconfigNegativeEvidence.ts +337 -0
  36. package/src/core/autoconfigPolicy.ts +13 -1
  37. package/src/core/autoconfigRules.ts +437 -0
  38. package/src/core/index.ts +12 -0
  39. package/src/core/indicators.ts +491 -0
  40. package/src/core/pipeline.ts +7 -0
  41. package/src/core/scorer.ts +16 -1
  42. package/src/core/types.ts +61 -1
  43. package/tests/parity/controller-stoppoint-fixtures.json +246 -0
  44. package/tests/parity/controller-stoppoint.parity.test.ts +120 -45
  45. package/tests/parity/indicators-fixtures.json +542 -0
  46. package/tests/parity/indicators.parity.test.ts +116 -0
  47. package/tests/parity/negative-evidence-fixtures.json +1034 -0
  48. package/tests/parity/negativeEvidence.parity.test.ts +183 -0
  49. package/tests/unit/autoconfig-classifier.test.ts +3 -0
  50. package/tests/unit/autoconfig.test.ts +11 -3
  51. package/tests/unit/autoconfigNegativeEvidence.test.ts +282 -0
  52. package/tests/unit/autoconfigRules.indicators.test.ts +291 -0
  53. package/tests/unit/autoconfigRules.negativeEvidence.test.ts +109 -0
  54. package/tests/unit/indicators.test.ts +195 -0
  55. package/tests/unit/scorer.negativeEvidence.test.ts +72 -0
  56. package/tests/unit/scorer.pathY.test.ts +69 -0
  57. package/tests/unit/types.negativeEvidence.test.ts +80 -0
package/CHANGELOG.md CHANGED
@@ -4,6 +4,110 @@ All notable changes to goldenmatch-js are documented in this file.
4
4
 
5
5
  Format follows [Keep a Changelog](https://keepachangelog.com/). Versioning follows [Semantic Versioning](https://semver.org/) (strict after v1.0.0).
6
6
 
7
+ ## [0.7.0] - 2026-05-10
8
+
9
+ Negative-evidence parity with Python `goldenmatch` v1.11 + v1.12 (Path Y).
10
+ Python v1.12 lifted DQbench T3 from 53.8% F1 to 85.5% (+31.7 pp) by applying
11
+ NE as a post-filter on exact matchkeys directly; this release ports that
12
+ machinery to the TS runtime.
13
+
14
+ ### Added
15
+
16
+ - `NegativeEvidenceField` interface and `makeNegativeEvidenceField` factory
17
+ in `src/core/types.ts` (defaults: `threshold=0.5`, `penalty=0.5`).
18
+ `MatchkeyConfig` variants (`ExactMatchkey`, `WeightedMatchkey`,
19
+ `ProbabilisticMatchkey`) now accept optional `negativeEvidence`.
20
+ `ExactMatchkey` also gains optional `threshold` so Path Y can stamp the
21
+ default 0.5 cutoff when NE is added without a user-set threshold.
22
+ - `src/core/autoconfigNegativeEvidence.ts`:
23
+ - `applyNegativeEvidence(mk, rowA, rowB)` — per-pair penalty sum.
24
+ - `applyNegativeEvidenceToExactPairs(pairs, mk, allRows)` — v1.12 Path Y
25
+ post-filter for `findExactMatches` output.
26
+ - `promoteNegativeEvidence(config, rows, columnPriors)` — eager rule
27
+ that walks both weighted AND exact matchkeys (v1.12 change). The
28
+ `_is_exact_matchkey_field` anchor gate is skipped on the exact branch.
29
+ - `pickScorerForColumn(colName, colType?)` — name-keyed scorer dispatch
30
+ matching Python `_pick_scorer_for_column` (`email→token_sort`,
31
+ `phone→exact+digits_only`, `address→token_sort`, otherwise
32
+ `ensemble`).
33
+
34
+ ### Changed
35
+
36
+ - `findFuzzyMatches` (`src/core/scorer.ts`) — applies NE penalty after
37
+ weighted-sum aggregation, before the threshold compare. No-op when the
38
+ matchkey has no `negativeEvidence`.
39
+ - `pipeline.ts` — after `findExactMatches`, calls
40
+ `applyNegativeEvidenceToExactPairs` when the exact matchkey has NE set.
41
+ Mirrors Python v1.12 post-filter design; `findExactMatches`'s signature
42
+ is unchanged.
43
+ - `AutoConfigController.run()` — eager `promoteNegativeEvidence` pass runs
44
+ once on the full row set (not the sample) before the iteration loop,
45
+ matching Python's `auto_configure_df` pre-iteration pass.
46
+
47
+ ### Tested
48
+
49
+ - 19 new unit tests across `types.negativeEvidence`, `autoconfigNegativeEvidence`,
50
+ `scorer.negativeEvidence`, `scorer.pathY`, and `autoconfigRules.negativeEvidence`.
51
+ - 6 new Python-parity fixtures in
52
+ `tests/parity/negative-evidence-fixtures.json` covering
53
+ clustered-email-different-surname, clustered-phone-different-name,
54
+ dense-population promotion, sparse no-op, blocking-field skip, and
55
+ idempotency. All 6 green vs Python `promote_negative_evidence`.
56
+
57
+ ## [0.6.0] - 2026-05-10
58
+
59
+ Indicator-aware refit parity with Python `goldenmatch` v1.9 + v1.10.
60
+
61
+ ### Added
62
+
63
+ - `IndicatorContext` memoization layer (`src/core/indicators.ts`) and 5 pure
64
+ complexity indicators ported from Python `core/indicators.py`:
65
+ `computeColumnPriors`, `estimateSparseMatchSignal`,
66
+ `computeCorruptionScore`, `estimateFullPopHits`,
67
+ `computeCrossBlockingOverlap`, plus `computeIdentityCollisionSignal`
68
+ used by the collision-aware refit rule.
69
+ - 7 new indicator-aware refit rules in `autoconfigRules.ts`:
70
+ `ruleUniformHeavyBlocking`, `ruleBlockingFieldNullHeavy`,
71
+ `ruleRecallGapSuspected`, `ruleCollisionSignalTooHigh`,
72
+ `ruleSparseMatchExpand`, `ruleCrossBlockingDisagreement`,
73
+ `ruleCorruptionNormalize`.
74
+ - `DEFAULT_RULES_V1_10` — 14-rule list mirroring Python's `DEFAULT_RULES`
75
+ order. The legacy `DEFAULT_RULES_V1_7_V1_8` 7-rule list is still exported
76
+ for callers that opt into base-only behavior.
77
+ - `RuleContext.indicators` optional field carries the per-iteration
78
+ `IndicatorContext`; rules that need indicator signals are silent no-ops
79
+ when callers run the legacy v1.7/v1.8 rule list.
80
+ - `RefitPolicy.propose(profile, current, history, indicators?)` — fourth
81
+ positional argument (back-compat: defaults to `null`).
82
+
83
+ ### Changed
84
+
85
+ - `autoConfigureRows` rewrite: matchkey naming now matches Python
86
+ (`fuzzy_match` for weighted, `exact_<col>` for exact). Scorer selection
87
+ follows Python's `_SCORER_MAP` (e.g. `name → ensemble`,
88
+ `email → exact`). Adaptive threshold uses Python's formula plus the
89
+ post-build data-quality adjustment (avg_null > 0.15 → −0.05;
90
+ avg_len < 5 → +0.05).
91
+ - `buildBlocking` aligned with Python: prefers high-cardinality
92
+ exact-eligible columns (email/phone/zip/identifier/year) for static
93
+ blocking, falls back to multi-pass name blocking
94
+ (`soundex` + `substring:0:5` + `token_sort + substring:0:8`).
95
+ - Controller provisions a fresh `IndicatorContext` per iteration and
96
+ threads it into `policy.propose()` for v1.10 rule consumption.
97
+
98
+ ### Parity status
99
+
100
+ - Controller stoppoint parity: 6/6 datasets pass shape-level assertions,
101
+ 2/6 (`dirty_people`, `mixed_blocking`) byte-equal on the normalized
102
+ committed config. The remaining 4 diverge because Python's iteration
103
+ path hits a `ModuleNotFoundError` on subsequent iterations and falls
104
+ back to a virtual v0 entry (out-of-scope to replicate in TS).
105
+ - Indicators parity: 8/8 fixture datasets pass at 4-decimal tolerance
106
+ on the 5 indicators. Identity-collision signal is unit-tested only —
107
+ the TS pure-JS token-sort approximation diverges numerically from
108
+ Python's `rapidfuzz.token_sort_ratio` at sub-rule precision, but the
109
+ rule-firing boundary (rate > 0.75) is preserved.
110
+
7
111
  ## [0.5.0] - 2026-05-10
8
112
 
9
113
  Auto-config controller parity with Python `goldenmatch` v1.7 + v1.8.
package/dist/cli.cjs CHANGED
@@ -264,7 +264,14 @@ function makeMatchkeyConfig(partial) {
264
264
  const type = partial.type ?? "weighted";
265
265
  const fields = partial.fields ?? [];
266
266
  if (type === "exact") {
267
- return { name: partial.name, type: "exact", fields };
267
+ const out2 = {
268
+ name: partial.name,
269
+ type: "exact",
270
+ fields,
271
+ ...partial.negativeEvidence !== void 0 ? { negativeEvidence: partial.negativeEvidence } : {},
272
+ ...partial.threshold !== void 0 ? { threshold: partial.threshold } : {}
273
+ };
274
+ return out2;
268
275
  }
269
276
  if (type === "probabilistic") {
270
277
  const out2 = {
@@ -275,7 +282,8 @@ function makeMatchkeyConfig(partial) {
275
282
  ...partial.emIterations !== void 0 ? { emIterations: partial.emIterations } : {},
276
283
  ...partial.convergenceThreshold !== void 0 ? { convergenceThreshold: partial.convergenceThreshold } : {},
277
284
  ...partial.linkThreshold !== void 0 ? { linkThreshold: partial.linkThreshold } : {},
278
- ...partial.reviewThreshold !== void 0 ? { reviewThreshold: partial.reviewThreshold } : {}
285
+ ...partial.reviewThreshold !== void 0 ? { reviewThreshold: partial.reviewThreshold } : {},
286
+ ...partial.negativeEvidence !== void 0 ? { negativeEvidence: partial.negativeEvidence } : {}
279
287
  };
280
288
  return out2;
281
289
  }
@@ -287,7 +295,8 @@ function makeMatchkeyConfig(partial) {
287
295
  ...partial.autoThreshold !== void 0 ? { autoThreshold: partial.autoThreshold } : {},
288
296
  ...partial.rerank !== void 0 ? { rerank: partial.rerank } : {},
289
297
  ...partial.rerankModel !== void 0 ? { rerankModel: partial.rerankModel } : {},
290
- ...partial.rerankBand !== void 0 ? { rerankBand: partial.rerankBand } : {}
298
+ ...partial.rerankBand !== void 0 ? { rerankBand: partial.rerankBand } : {},
299
+ ...partial.negativeEvidence !== void 0 ? { negativeEvidence: partial.negativeEvidence } : {}
291
300
  };
292
301
  return out;
293
302
  }
@@ -1748,6 +1757,75 @@ var init_cluster = __esm({
1748
1757
  }
1749
1758
  });
1750
1759
 
1760
+ // src/core/autoconfigNegativeEvidence.ts
1761
+ function applyNegativeEvidence(matchkey, rowA, rowB) {
1762
+ const ne = getNegativeEvidence(matchkey);
1763
+ if (!ne || ne.length === 0) return 0;
1764
+ let total = 0;
1765
+ for (const f of ne) {
1766
+ const rawA = asString(rowA[f.field]);
1767
+ const rawB = asString(rowB[f.field]);
1768
+ const valA = applyTransforms(rawA, f.transforms);
1769
+ const valB = applyTransforms(rawB, f.transforms);
1770
+ let sim;
1771
+ try {
1772
+ sim = scoreField(valA, valB, f.scorer);
1773
+ } catch {
1774
+ continue;
1775
+ }
1776
+ if (sim === null) continue;
1777
+ if (sim < f.threshold) total += f.penalty;
1778
+ }
1779
+ return total;
1780
+ }
1781
+ function applyNegativeEvidenceToExactPairs(pairs, matchkey, allRows) {
1782
+ const ne = getNegativeEvidence(matchkey);
1783
+ if (!ne || ne.length === 0) return [...pairs];
1784
+ const threshold = exactThresholdForNe(matchkey);
1785
+ const lookup = /* @__PURE__ */ new Map();
1786
+ for (const r of allRows) {
1787
+ const rid = r["__row_id__"];
1788
+ if (rid !== void 0) lookup.set(rid, r);
1789
+ }
1790
+ const out = [];
1791
+ for (const p of pairs) {
1792
+ const rowA = lookup.get(p.idA);
1793
+ const rowB = lookup.get(p.idB);
1794
+ if (rowA === void 0 || rowB === void 0) continue;
1795
+ const penalty = applyNegativeEvidence(matchkey, rowA, rowB);
1796
+ const finalScore = Math.max(0, 1 - penalty);
1797
+ if (finalScore >= threshold) {
1798
+ out.push({ ...p, score: finalScore });
1799
+ }
1800
+ }
1801
+ return out;
1802
+ }
1803
+ function getNegativeEvidence(mk) {
1804
+ if (mk.type === "weighted") return mk.negativeEvidence;
1805
+ if (mk.type === "exact") return mk.negativeEvidence;
1806
+ if (mk.type === "probabilistic")
1807
+ return mk.negativeEvidence;
1808
+ return void 0;
1809
+ }
1810
+ function exactThresholdForNe(mk) {
1811
+ if (mk.type === "exact") {
1812
+ const t = mk.threshold;
1813
+ return t ?? 0.5;
1814
+ }
1815
+ if (mk.type === "weighted") return mk.threshold;
1816
+ if (mk.type === "probabilistic") {
1817
+ return mk.threshold ?? 0.5;
1818
+ }
1819
+ return 0.5;
1820
+ }
1821
+ var init_autoconfigNegativeEvidence = __esm({
1822
+ "src/core/autoconfigNegativeEvidence.ts"() {
1823
+ init_types();
1824
+ init_transforms();
1825
+ init_scorer();
1826
+ }
1827
+ });
1828
+
1751
1829
  // src/core/scorer.ts
1752
1830
  function asString(v) {
1753
1831
  if (v === null || v === void 0) return null;
@@ -2214,10 +2292,16 @@ function findFuzzyMatches(rows, mk, excludePairs, preScoredPairs) {
2214
2292
  }
2215
2293
  }
2216
2294
  }
2295
+ const ne = mk.type === "weighted" ? mk.negativeEvidence : void 0;
2296
+ const neActive = ne !== void 0 && ne.length > 0;
2217
2297
  const results = [];
2218
2298
  for (let i = 0; i < n; i++) {
2219
2299
  for (let j = i + 1; j < n; j++) {
2220
- const score = combined[i][j];
2300
+ let score = combined[i][j];
2301
+ if (neActive) {
2302
+ const penalty = applyNegativeEvidence(mk, rows[i], rows[j]);
2303
+ score = Math.max(0, score - penalty);
2304
+ }
2221
2305
  if (score < threshold) continue;
2222
2306
  const idA = Math.min(rowIds[i], rowIds[j]);
2223
2307
  const idB = Math.max(rowIds[i], rowIds[j]);
@@ -2274,6 +2358,7 @@ var init_scorer = __esm({
2274
2358
  init_types();
2275
2359
  init_cluster();
2276
2360
  init_transforms();
2361
+ init_autoconfigNegativeEvidence();
2277
2362
  }
2278
2363
  });
2279
2364
 
@@ -3214,6 +3299,9 @@ async function runDedupePipeline(rows, config, options) {
3214
3299
  for (const mk of matchkeys) {
3215
3300
  if (mk.type === "exact") {
3216
3301
  let pairs = findExactMatches(processed, mk);
3302
+ if (mk.negativeEvidence !== void 0 && mk.negativeEvidence.length > 0) {
3303
+ pairs = applyNegativeEvidenceToExactPairs(pairs, mk, processed);
3304
+ }
3217
3305
  if (acrossFilesOnly) {
3218
3306
  pairs = pairs.filter((p) => {
3219
3307
  const srcA = sourceLookup.get(p.idA);
@@ -3408,6 +3496,7 @@ var init_pipeline = __esm({
3408
3496
  init_standardize();
3409
3497
  init_blocker();
3410
3498
  init_scorer();
3499
+ init_autoconfigNegativeEvidence();
3411
3500
  init_cluster();
3412
3501
  init_golden();
3413
3502
  init_autoconfigVerify();