npm - goldenmatch - Versions diffs - 0.5.0 → 0.7.0 - Mend

goldenmatch 0.5.0 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (57)

package/CHANGELOG.md +104 -0
package/dist/cli.cjs +93 -4
package/dist/cli.cjs.map +1 -1
package/dist/cli.js +93 -4
package/dist/cli.js.map +1 -1
package/dist/core/index.cjs +1108 -151
package/dist/core/index.cjs.map +1 -1
package/dist/core/index.d.cts +267 -148
package/dist/core/index.d.ts +267 -148
package/dist/core/index.js +1104 -152
package/dist/core/index.js.map +1 -1
package/dist/index.cjs +1108 -151
package/dist/index.cjs.map +1 -1
package/dist/index.d.cts +2 -2
package/dist/index.d.ts +2 -2
package/dist/index.js +1104 -152
package/dist/index.js.map +1 -1
package/dist/node/backends/score-worker.cjs +36 -1
package/dist/node/backends/score-worker.cjs.map +1 -1
package/dist/node/backends/score-worker.d.cts +1 -1
package/dist/node/backends/score-worker.d.ts +1 -1
package/dist/node/backends/score-worker.js +36 -1
package/dist/node/backends/score-worker.js.map +1 -1
package/dist/node/index.cjs +1108 -151
package/dist/node/index.cjs.map +1 -1
package/dist/node/index.d.cts +3 -3
package/dist/node/index.d.ts +3 -3
package/dist/node/index.js +1104 -152
package/dist/node/index.js.map +1 -1
package/dist/{types-DlzBTOit.d.cts → types-9yDagamh.d.cts} +33 -1
package/dist/{types-DlzBTOit.d.ts → types-9yDagamh.d.ts} +33 -1
package/package.json +1 -1
package/src/core/autoconfig.ts +185 -166
package/src/core/autoconfigController.ts +20 -2
package/src/core/autoconfigNegativeEvidence.ts +337 -0
package/src/core/autoconfigPolicy.ts +13 -1
package/src/core/autoconfigRules.ts +437 -0
package/src/core/index.ts +12 -0
package/src/core/indicators.ts +491 -0
package/src/core/pipeline.ts +7 -0
package/src/core/scorer.ts +16 -1
package/src/core/types.ts +61 -1
package/tests/parity/controller-stoppoint-fixtures.json +246 -0
package/tests/parity/controller-stoppoint.parity.test.ts +120 -45
package/tests/parity/indicators-fixtures.json +542 -0
package/tests/parity/indicators.parity.test.ts +116 -0
package/tests/parity/negative-evidence-fixtures.json +1034 -0
package/tests/parity/negativeEvidence.parity.test.ts +183 -0
package/tests/unit/autoconfig-classifier.test.ts +3 -0
package/tests/unit/autoconfig.test.ts +11 -3
package/tests/unit/autoconfigNegativeEvidence.test.ts +282 -0
package/tests/unit/autoconfigRules.indicators.test.ts +291 -0
package/tests/unit/autoconfigRules.negativeEvidence.test.ts +109 -0
package/tests/unit/indicators.test.ts +195 -0
package/tests/unit/scorer.negativeEvidence.test.ts +72 -0
package/tests/unit/scorer.pathY.test.ts +69 -0
package/tests/unit/types.negativeEvidence.test.ts +80 -0

package/CHANGELOG.md CHANGED

@@ -4,6 +4,110 @@ All notable changes to goldenmatch-js are documented in this file.
 Format follows [Keep a Changelog](https://keepachangelog.com/). Versioning follows [Semantic Versioning](https://semver.org/) (strict after v1.0.0).
+## [0.7.0] - 2026-05-10
+Negative-evidence parity with Python `goldenmatch` v1.11 + v1.12 (Path Y).
+Python v1.12 lifted DQbench T3 from 53.8% F1 to 85.5% (+31.7 pp) by applying
+NE as a post-filter on exact matchkeys directly; this release ports that
+machinery to the TS runtime.
+### Added
+- `NegativeEvidenceField` interface and `makeNegativeEvidenceField` factory
+  in `src/core/types.ts` (defaults: `threshold=0.5`, `penalty=0.5`).
+  `MatchkeyConfig` variants (`ExactMatchkey`, `WeightedMatchkey`,
+  `ProbabilisticMatchkey`) now accept optional `negativeEvidence`.
+  `ExactMatchkey` also gains optional `threshold` so Path Y can stamp the
+  default 0.5 cutoff when NE is added without a user-set threshold.
+- `src/core/autoconfigNegativeEvidence.ts`:
+  - `applyNegativeEvidence(mk, rowA, rowB)` — per-pair penalty sum.
+  - `applyNegativeEvidenceToExactPairs(pairs, mk, allRows)` — v1.12 Path Y
+    post-filter for `findExactMatches` output.
+  - `promoteNegativeEvidence(config, rows, columnPriors)` — eager rule
+    that walks both weighted AND exact matchkeys (v1.12 change). The
+    `_is_exact_matchkey_field` anchor gate is skipped on the exact branch.
+  - `pickScorerForColumn(colName, colType?)` — name-keyed scorer dispatch
+    matching Python `_pick_scorer_for_column` (`email→token_sort`,
+    `phone→exact+digits_only`, `address→token_sort`, otherwise
+    `ensemble`).
+### Changed
+- `findFuzzyMatches` (`src/core/scorer.ts`) — applies NE penalty after
+  weighted-sum aggregation, before the threshold compare. No-op when the
+  matchkey has no `negativeEvidence`.
+- `pipeline.ts` — after `findExactMatches`, calls
+  `applyNegativeEvidenceToExactPairs` when the exact matchkey has NE set.
+  Mirrors Python v1.12 post-filter design; `findExactMatches`'s signature
+  is unchanged.
+- `AutoConfigController.run()` — eager `promoteNegativeEvidence` pass runs
+  once on the full row set (not the sample) before the iteration loop,
+  matching Python's `auto_configure_df` pre-iteration pass.
+### Tested
+- 19 new unit tests across `types.negativeEvidence`, `autoconfigNegativeEvidence`,
+  `scorer.negativeEvidence`, `scorer.pathY`, and `autoconfigRules.negativeEvidence`.
+- 6 new Python-parity fixtures in
+  `tests/parity/negative-evidence-fixtures.json` covering
+  clustered-email-different-surname, clustered-phone-different-name,
+  dense-population promotion, sparse no-op, blocking-field skip, and
+  idempotency. All 6 green vs Python `promote_negative_evidence`.
+## [0.6.0] - 2026-05-10
+Indicator-aware refit parity with Python `goldenmatch` v1.9 + v1.10.
+### Added
+- `IndicatorContext` memoization layer (`src/core/indicators.ts`) and 5 pure
+  complexity indicators ported from Python `core/indicators.py`:
+  `computeColumnPriors`, `estimateSparseMatchSignal`,
+  `computeCorruptionScore`, `estimateFullPopHits`,
+  `computeCrossBlockingOverlap`, plus `computeIdentityCollisionSignal`
+  used by the collision-aware refit rule.
+- 7 new indicator-aware refit rules in `autoconfigRules.ts`:
+  `ruleUniformHeavyBlocking`, `ruleBlockingFieldNullHeavy`,
+  `ruleRecallGapSuspected`, `ruleCollisionSignalTooHigh`,
+  `ruleSparseMatchExpand`, `ruleCrossBlockingDisagreement`,
+  `ruleCorruptionNormalize`.
+- `DEFAULT_RULES_V1_10` — 14-rule list mirroring Python's `DEFAULT_RULES`
+  order. The legacy `DEFAULT_RULES_V1_7_V1_8` 7-rule list is still exported
+  for callers that opt into base-only behavior.
+- `RuleContext.indicators` optional field carries the per-iteration
+  `IndicatorContext`; rules that need indicator signals are silent no-ops
+  when callers run the legacy v1.7/v1.8 rule list.
+- `RefitPolicy.propose(profile, current, history, indicators?)` — fourth
+  positional argument (back-compat: defaults to `null`).
+### Changed
+- `autoConfigureRows` rewrite: matchkey naming now matches Python
+  (`fuzzy_match` for weighted, `exact_<col>` for exact). Scorer selection
+  follows Python's `_SCORER_MAP` (e.g. `name → ensemble`,
+  `email → exact`). Adaptive threshold uses Python's formula plus the
+  post-build data-quality adjustment (avg_null > 0.15 → −0.05;
+  avg_len < 5 → +0.05).
+- `buildBlocking` aligned with Python: prefers high-cardinality
+  exact-eligible columns (email/phone/zip/identifier/year) for static
+  blocking, falls back to multi-pass name blocking
+  (`soundex` + `substring:0:5` + `token_sort + substring:0:8`).
+- Controller provisions a fresh `IndicatorContext` per iteration and
+  threads it into `policy.propose()` for v1.10 rule consumption.
+### Parity status
+- Controller stoppoint parity: 6/6 datasets pass shape-level assertions,
+  2/6 (`dirty_people`, `mixed_blocking`) byte-equal on the normalized
+  committed config. The remaining 4 diverge because Python's iteration
+  path hits a `ModuleNotFoundError` on subsequent iterations and falls
+  back to a virtual v0 entry (out-of-scope to replicate in TS).
+- Indicators parity: 8/8 fixture datasets pass at 4-decimal tolerance
+  on the 5 indicators. Identity-collision signal is unit-tested only —
+  the TS pure-JS token-sort approximation diverges numerically from
+  Python's `rapidfuzz.token_sort_ratio` at sub-rule precision, but the
+  rule-firing boundary (rate > 0.75) is preserved.
 ## [0.5.0] - 2026-05-10
 Auto-config controller parity with Python `goldenmatch` v1.7 + v1.8.

package/dist/cli.cjs CHANGED

@@ -264,7 +264,14 @@ function makeMatchkeyConfig(partial) {
   const type = partial.type ?? "weighted";
   const fields = partial.fields ?? [];
   if (type === "exact") {
-    return { name: partial.name, type: "exact", fields };
+    const out2 = {
+      name: partial.name,
+      type: "exact",
+      fields,
+      ...partial.negativeEvidence !== void 0 ? { negativeEvidence: partial.negativeEvidence } : {},
+      ...partial.threshold !== void 0 ? { threshold: partial.threshold } : {}
+    };
+    return out2;
   }
   if (type === "probabilistic") {
     const out2 = {
@@ -275,7 +282,8 @@ function makeMatchkeyConfig(partial) {
       ...partial.emIterations !== void 0 ? { emIterations: partial.emIterations } : {},
       ...partial.convergenceThreshold !== void 0 ? { convergenceThreshold: partial.convergenceThreshold } : {},
       ...partial.linkThreshold !== void 0 ? { linkThreshold: partial.linkThreshold } : {},
-      ...partial.reviewThreshold !== void 0 ? { reviewThreshold: partial.reviewThreshold } : {}
+      ...partial.reviewThreshold !== void 0 ? { reviewThreshold: partial.reviewThreshold } : {},
+      ...partial.negativeEvidence !== void 0 ? { negativeEvidence: partial.negativeEvidence } : {}
     };
     return out2;
   }
@@ -287,7 +295,8 @@ function makeMatchkeyConfig(partial) {
     ...partial.autoThreshold !== void 0 ? { autoThreshold: partial.autoThreshold } : {},
     ...partial.rerank !== void 0 ? { rerank: partial.rerank } : {},
     ...partial.rerankModel !== void 0 ? { rerankModel: partial.rerankModel } : {},
-    ...partial.rerankBand !== void 0 ? { rerankBand: partial.rerankBand } : {}
+    ...partial.rerankBand !== void 0 ? { rerankBand: partial.rerankBand } : {},
+    ...partial.negativeEvidence !== void 0 ? { negativeEvidence: partial.negativeEvidence } : {}
   };
   return out;
 }
@@ -1748,6 +1757,75 @@ var init_cluster = __esm({
   }
 });
+// src/core/autoconfigNegativeEvidence.ts
+function applyNegativeEvidence(matchkey, rowA, rowB) {
+  const ne = getNegativeEvidence(matchkey);
+  if (!ne || ne.length === 0) return 0;
+  let total = 0;
+  for (const f of ne) {
+    const rawA = asString(rowA[f.field]);
+    const rawB = asString(rowB[f.field]);
+    const valA = applyTransforms(rawA, f.transforms);
+    const valB = applyTransforms(rawB, f.transforms);
+    let sim;
+    try {
+      sim = scoreField(valA, valB, f.scorer);
+    } catch {
+      continue;
+    }
+    if (sim === null) continue;
+    if (sim < f.threshold) total += f.penalty;
+  }
+  return total;
+}
+function applyNegativeEvidenceToExactPairs(pairs, matchkey, allRows) {
+  const ne = getNegativeEvidence(matchkey);
+  if (!ne || ne.length === 0) return [...pairs];
+  const threshold = exactThresholdForNe(matchkey);
+  const lookup = /* @__PURE__ */ new Map();
+  for (const r of allRows) {
+    const rid = r["__row_id__"];
+    if (rid !== void 0) lookup.set(rid, r);
+  }
+  const out = [];
+  for (const p of pairs) {
+    const rowA = lookup.get(p.idA);
+    const rowB = lookup.get(p.idB);
+    if (rowA === void 0 || rowB === void 0) continue;
+    const penalty = applyNegativeEvidence(matchkey, rowA, rowB);
+    const finalScore = Math.max(0, 1 - penalty);
+    if (finalScore >= threshold) {
+      out.push({ ...p, score: finalScore });
+    }
+  }
+  return out;
+}
+function getNegativeEvidence(mk) {
+  if (mk.type === "weighted") return mk.negativeEvidence;
+  if (mk.type === "exact") return mk.negativeEvidence;
+  if (mk.type === "probabilistic")
+    return mk.negativeEvidence;
+  return void 0;
+}
+function exactThresholdForNe(mk) {
+  if (mk.type === "exact") {
+    const t = mk.threshold;
+    return t ?? 0.5;
+  }
+  if (mk.type === "weighted") return mk.threshold;
+  if (mk.type === "probabilistic") {
+    return mk.threshold ?? 0.5;
+  }
+  return 0.5;
+}
+var init_autoconfigNegativeEvidence = __esm({
+  "src/core/autoconfigNegativeEvidence.ts"() {
+    init_types();
+    init_transforms();
+    init_scorer();
+  }
+});
 // src/core/scorer.ts
 function asString(v) {
   if (v === null || v === void 0) return null;
@@ -2214,10 +2292,16 @@ function findFuzzyMatches(rows, mk, excludePairs, preScoredPairs) {
       }
     }
   }
+  const ne = mk.type === "weighted" ? mk.negativeEvidence : void 0;
+  const neActive = ne !== void 0 && ne.length > 0;
   const results = [];
   for (let i = 0; i < n; i++) {
     for (let j = i + 1; j < n; j++) {
-      const score = combined[i][j];
+      let score = combined[i][j];
+      if (neActive) {
+        const penalty = applyNegativeEvidence(mk, rows[i], rows[j]);
+        score = Math.max(0, score - penalty);
+      }
       if (score < threshold) continue;
       const idA = Math.min(rowIds[i], rowIds[j]);
       const idB = Math.max(rowIds[i], rowIds[j]);
@@ -2274,6 +2358,7 @@ var init_scorer = __esm({
     init_types();
     init_cluster();
     init_transforms();
+    init_autoconfigNegativeEvidence();
   }
 });
@@ -3214,6 +3299,9 @@ async function runDedupePipeline(rows, config, options) {
   for (const mk of matchkeys) {
     if (mk.type === "exact") {
       let pairs = findExactMatches(processed, mk);
+      if (mk.negativeEvidence !== void 0 && mk.negativeEvidence.length > 0) {
+        pairs = applyNegativeEvidenceToExactPairs(pairs, mk, processed);
+      }
       if (acrossFilesOnly) {
         pairs = pairs.filter((p) => {
           const srcA = sourceLookup.get(p.idA);
@@ -3408,6 +3496,7 @@ var init_pipeline = __esm({
     init_standardize();
     init_blocker();
     init_scorer();
+    init_autoconfigNegativeEvidence();
     init_cluster();
     init_golden();
     init_autoconfigVerify();