goldenmatch 0.6.0 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. package/CHANGELOG.md +50 -0
  2. package/CLAUDE.md +68 -0
  3. package/dist/cli.cjs +93 -4
  4. package/dist/cli.cjs.map +1 -1
  5. package/dist/cli.js +93 -4
  6. package/dist/cli.js.map +1 -1
  7. package/dist/core/index.cjs +505 -5
  8. package/dist/core/index.cjs.map +1 -1
  9. package/dist/core/index.d.cts +407 -146
  10. package/dist/core/index.d.ts +407 -146
  11. package/dist/core/index.js +493 -6
  12. package/dist/core/index.js.map +1 -1
  13. package/dist/index.cjs +505 -5
  14. package/dist/index.cjs.map +1 -1
  15. package/dist/index.d.cts +2 -2
  16. package/dist/index.d.ts +2 -2
  17. package/dist/index.js +493 -6
  18. package/dist/index.js.map +1 -1
  19. package/dist/node/backends/score-worker.cjs +36 -1
  20. package/dist/node/backends/score-worker.cjs.map +1 -1
  21. package/dist/node/backends/score-worker.d.cts +1 -1
  22. package/dist/node/backends/score-worker.d.ts +1 -1
  23. package/dist/node/backends/score-worker.js +36 -1
  24. package/dist/node/backends/score-worker.js.map +1 -1
  25. package/dist/node/index.cjs +505 -5
  26. package/dist/node/index.cjs.map +1 -1
  27. package/dist/node/index.d.cts +3 -3
  28. package/dist/node/index.d.ts +3 -3
  29. package/dist/node/index.js +493 -6
  30. package/dist/node/index.js.map +1 -1
  31. package/dist/{types-DlzBTOit.d.cts → types-9yDagamh.d.cts} +33 -1
  32. package/dist/{types-DlzBTOit.d.ts → types-9yDagamh.d.ts} +33 -1
  33. package/package.json +8 -2
  34. package/src/core/autoconfigController.ts +14 -1
  35. package/src/core/autoconfigNegativeEvidence.ts +337 -0
  36. package/src/core/identity/in-memory-store.ts +193 -0
  37. package/src/core/identity/index.ts +19 -0
  38. package/src/core/identity/new-entity-id.ts +49 -0
  39. package/src/core/identity/query.ts +142 -0
  40. package/src/core/identity/types.ts +136 -0
  41. package/src/core/index.ts +13 -0
  42. package/src/core/pipeline.ts +7 -0
  43. package/src/core/scorer.ts +16 -1
  44. package/src/core/types.ts +61 -1
  45. package/tests/identity/in-memory-store.test.ts +220 -0
  46. package/tests/parity/negative-evidence-fixtures.json +1034 -0
  47. package/tests/parity/negativeEvidence.parity.test.ts +183 -0
  48. package/tests/unit/autoconfigNegativeEvidence.test.ts +282 -0
  49. package/tests/unit/autoconfigRules.negativeEvidence.test.ts +109 -0
  50. package/tests/unit/scorer.negativeEvidence.test.ts +72 -0
  51. package/tests/unit/scorer.pathY.test.ts +69 -0
  52. package/tests/unit/types.negativeEvidence.test.ts +80 -0
package/CHANGELOG.md CHANGED
@@ -4,6 +4,56 @@ All notable changes to goldenmatch-js are documented in this file.
4
4
 
5
5
  Format follows [Keep a Changelog](https://keepachangelog.com/). Versioning follows [Semantic Versioning](https://semver.org/) (strict after v1.0.0).
6
6
 
7
+ ## [0.7.0] - 2026-05-10
8
+
9
+ Negative-evidence parity with Python `goldenmatch` v1.11 + v1.12 (Path Y).
10
+ Python v1.12 lifted DQbench T3 from 53.8% F1 to 85.5% (+31.7 pp) by applying
11
+ NE as a post-filter on exact matchkeys directly; this release ports that
12
+ machinery to the TS runtime.
13
+
14
+ ### Added
15
+
16
+ - `NegativeEvidenceField` interface and `makeNegativeEvidenceField` factory
17
+ in `src/core/types.ts` (defaults: `threshold=0.5`, `penalty=0.5`).
18
+ `MatchkeyConfig` variants (`ExactMatchkey`, `WeightedMatchkey`,
19
+ `ProbabilisticMatchkey`) now accept optional `negativeEvidence`.
20
+ `ExactMatchkey` also gains optional `threshold` so Path Y can stamp the
21
+ default 0.5 cutoff when NE is added without a user-set threshold.
22
+ - `src/core/autoconfigNegativeEvidence.ts`:
23
+ - `applyNegativeEvidence(mk, rowA, rowB)` — per-pair penalty sum.
24
+ - `applyNegativeEvidenceToExactPairs(pairs, mk, allRows)` — v1.12 Path Y
25
+ post-filter for `findExactMatches` output.
26
+ - `promoteNegativeEvidence(config, rows, columnPriors)` — eager rule
27
+ that walks both weighted AND exact matchkeys (v1.12 change). The
28
+ `_is_exact_matchkey_field` anchor gate is skipped on the exact branch.
29
+ - `pickScorerForColumn(colName, colType?)` — name-keyed scorer dispatch
30
+ matching Python `_pick_scorer_for_column` (`email→token_sort`,
31
+ `phone→exact+digits_only`, `address→token_sort`, otherwise
32
+ `ensemble`).
33
+
34
+ ### Changed
35
+
36
+ - `findFuzzyMatches` (`src/core/scorer.ts`) — applies NE penalty after
37
+ weighted-sum aggregation, before the threshold compare. No-op when the
38
+ matchkey has no `negativeEvidence`.
39
+ - `pipeline.ts` — after `findExactMatches`, calls
40
+ `applyNegativeEvidenceToExactPairs` when the exact matchkey has NE set.
41
+ Mirrors Python v1.12 post-filter design; `findExactMatches`'s signature
42
+ is unchanged.
43
+ - `AutoConfigController.run()` — eager `promoteNegativeEvidence` pass runs
44
+ once on the full row set (not the sample) before the iteration loop,
45
+ matching Python's `auto_configure_df` pre-iteration pass.
46
+
47
+ ### Tested
48
+
49
+ - 19 new unit tests across `types.negativeEvidence`, `autoconfigNegativeEvidence`,
50
+ `scorer.negativeEvidence`, `scorer.pathY`, and `autoconfigRules.negativeEvidence`.
51
+ - 6 new Python-parity fixtures in
52
+ `tests/parity/negative-evidence-fixtures.json` covering
53
+ clustered-email-different-surname, clustered-phone-different-name,
54
+ dense-population promotion, sparse no-op, blocking-field skip, and
55
+ idempotency. All 6 green vs Python `promote_negative_evidence`.
56
+
7
57
  ## [0.6.0] - 2026-05-10
8
58
 
9
59
  Indicator-aware refit parity with Python `goldenmatch` v1.9 + v1.10.
package/CLAUDE.md ADDED
@@ -0,0 +1,68 @@
1
+ # goldenmatch (TypeScript)
2
+
3
+ npm package `goldenmatch`. Three-wave parity port of the Python sibling at `packages/python/goldenmatch/`. Currently at **v0.7.0** (parity with Python v1.12).
4
+
5
+ ## Wave history
6
+ | npm | Python parity | Headline |
7
+ |-----|---------------|----------|
8
+ | 0.4.0 | v1.6.0 | Learning Memory + scorer ground truth |
9
+ | 0.5.0 | v1.7 + v1.8 | AutoConfigController, ComplexityProfile, RunHistory, StopReason telemetry |
10
+ | 0.6.0 | v1.9 + v1.10 | 5 complexity indicators + indicator-aware refit rules; scorer selection aligned with Python |
11
+ | 0.7.0 | v1.11 + v1.12 | NegativeEvidenceField + Path Y (exact-MK post-filter) |
12
+
13
+ Each wave's spec/plan: `docs/superpowers/specs/2026-05-10-ts-parity-arc-design.md` + per-wave plans.
14
+
15
+ ## Commands
16
+ ```bash
17
+ cd packages/typescript/goldenmatch
18
+ pnpm --filter goldenmatch test # vitest (841 tests at v0.7.0)
19
+ pnpm --filter goldenmatch typecheck # tsc --noEmit (strict)
20
+ pnpm --filter goldenmatch build # tsup (5 entry points)
21
+ npx vitest run tests/parity/ # parity-only suite
22
+ ```
23
+
24
+ ## Edge-safety rule
25
+ `src/core/**` MUST NOT import `node:*`. Node-only code lives in `src/node/`. Memory backed by SQLite is `src/node/memory/`; the edge-safe interface is `src/core/memory/`. This is enforced by build separation, not by lint — verify when adding new imports.
26
+
27
+ ## Strict TS
28
+ `noUncheckedIndexedAccess` + `exactOptionalPropertyTypes`. Idioms:
29
+ - Bounded-loop indices: use `arr[i]!` after a length check.
30
+ - Optional props: `...(x !== undefined ? { field: x } : {})` — never spread `undefined`.
31
+ - Optional peer deps (sqlite, sentence-transformers): `await import("pkg-name" as string)` — the `as string` cast prevents tsup from resolving at build time.
32
+
33
+ ## Parity contract
34
+ - **Scorer output:** 4-decimal tolerance vs Python (`tests/parity/scorer-ground-truth.test.ts`).
35
+ - **Hash bytes:** SHA-256 truncated to 16 hex via Web Crypto. UTF-8 mandatory. Hash input = values joined by `|` (NOT `<col>=<val>`). `__row_id__` excluded from `record_hash` so corrections survive row reordering.
36
+ - **Cross-language fixtures:** committed under `tests/parity/fixtures/`. Regen via `packages/python/goldenmatch/tests/parity/memory/gen_memory_fixtures.py --rebuild-db` and the wave-specific emitters in `packages/python/goldenmatch/scripts/emit_ts_parity_fixtures.py`. Determinism clamp: pinned UUIDs, pinned `created_at` (no `datetime.now()`).
37
+ - **Negative-evidence parity** (v0.7.0): 6 fixture datasets exercising Path Y filtering on exact MKs + weighted-MK NE. They live in `tests/parity/negative-evidence-fixtures.json`.
38
+ - **Controller parity** (v0.5.0): structural-only on 4 of 6 fixtures, byte-equal on 2. The 4 divergent fixtures hit a Python-side `ModuleNotFoundError` on polars/sklearn — an import wart that TS doesn't replicate.
39
+
40
+ ## Public API surface (v0.7.0)
41
+ - `dedupeFile`, `dedupe`, `matchFile`, `match` — all return Promises.
42
+ - `autoConfigureRows` (sync, single-pass) and `autoConfigureRowsIterate` (Promise, full controller).
43
+ - `AutoConfigController`, `RunHistory`, `ComplexityProfile`, `HealthVerdict`, `StopReason`.
44
+ - `NegativeEvidenceField`, `applyNegativeEvidence`, `applyNegativeEvidenceToExactPairs`, `promoteNegativeEvidence`.
45
+ - Memory mirror: `getMemory`, `addCorrection`, `learn`, `memoryStats`.
46
+ - MCP tool count: 24 (19 base + 5 memory). Description literal at `src/node/mcp/server.ts:6` — keep in sync via the existing regex test.
47
+
48
+ ## Build outputs
49
+ - tsup with 5 entry points: `index`, `core/index`, `node/index`, `cli`, `node/backends/score-worker` (piscina worker).
50
+ - Build artifacts go to `dist/` (gitignored).
51
+ - Test count discipline: bump when adding parity datasets so future audits can diff.
52
+
53
+ ## Config-types invariants
54
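+ - **No `make*` factory functions** for the config types listed below — test fixtures use full literals. (Factories do exist elsewhere, e.g. `makeMatchkeyConfig` and `makeNegativeEvidenceField`.) Required fields: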
55
+ - `MatchkeyField`: `field` + `transforms` + `scorer` + `weight`
56
+ - `BlockingKeyConfig`: `fields` + `transforms`
57
+ - `BlockingConfig`: `strategy` + `keys` + `maxBlockSize` + `skipOversized`
58
+ - **Scorer names are snake_case** (same as Python): `token_sort`, `record_embedding`, `soundex_match`, `ensemble`, `exact`, `jaro_winkler`, `levenshtein`.
59
+ - **`DOMAIN_EXTRACTED_COLS`** (in `src/core/domain.ts`) has only 3 entries (`__brand__`, `__model__`, `__version__`); Python's has 12. Don't assume parity when porting domain features.
60
+
61
+ ## Vitest gotchas
62
+ - Default timeout is 5s. Heavier integration tests (PPRL multi-level, postflight end-to-end) need `{ timeout: 15000 }`. Concurrent load in CI has pushed these tests past the default before (it cost a release: v0.3.0 → v0.3.1).
63
+
64
+ ## Publish workflow
65
+ - `.github/workflows/publish-goldenmatch-js.yml` at monorepo root. Triggers on `goldenmatch-js-v*` tag or `workflow_dispatch` with `ref` input.
66
+ - Tag MUST point at a commit that has the workflow file, otherwise the trigger doesn't fire (root CLAUDE.md "Workflow trigger ordering" gotcha).
67
+ - Uses `NPM_TOKEN` secret. Trusted publishing not configured.
68
+ - The tag-version-must-match-package.json check (in the workflow) means you cannot tag multiple versions at the same commit. Each release commit has its own version bump and tag.
package/dist/cli.cjs CHANGED
@@ -264,7 +264,14 @@ function makeMatchkeyConfig(partial) {
264
264
  const type = partial.type ?? "weighted";
265
265
  const fields = partial.fields ?? [];
266
266
  if (type === "exact") {
267
- return { name: partial.name, type: "exact", fields };
267
+ const out2 = {
268
+ name: partial.name,
269
+ type: "exact",
270
+ fields,
271
+ ...partial.negativeEvidence !== void 0 ? { negativeEvidence: partial.negativeEvidence } : {},
272
+ ...partial.threshold !== void 0 ? { threshold: partial.threshold } : {}
273
+ };
274
+ return out2;
268
275
  }
269
276
  if (type === "probabilistic") {
270
277
  const out2 = {
@@ -275,7 +282,8 @@ function makeMatchkeyConfig(partial) {
275
282
  ...partial.emIterations !== void 0 ? { emIterations: partial.emIterations } : {},
276
283
  ...partial.convergenceThreshold !== void 0 ? { convergenceThreshold: partial.convergenceThreshold } : {},
277
284
  ...partial.linkThreshold !== void 0 ? { linkThreshold: partial.linkThreshold } : {},
278
- ...partial.reviewThreshold !== void 0 ? { reviewThreshold: partial.reviewThreshold } : {}
285
+ ...partial.reviewThreshold !== void 0 ? { reviewThreshold: partial.reviewThreshold } : {},
286
+ ...partial.negativeEvidence !== void 0 ? { negativeEvidence: partial.negativeEvidence } : {}
279
287
  };
280
288
  return out2;
281
289
  }
@@ -287,7 +295,8 @@ function makeMatchkeyConfig(partial) {
287
295
  ...partial.autoThreshold !== void 0 ? { autoThreshold: partial.autoThreshold } : {},
288
296
  ...partial.rerank !== void 0 ? { rerank: partial.rerank } : {},
289
297
  ...partial.rerankModel !== void 0 ? { rerankModel: partial.rerankModel } : {},
290
- ...partial.rerankBand !== void 0 ? { rerankBand: partial.rerankBand } : {}
298
+ ...partial.rerankBand !== void 0 ? { rerankBand: partial.rerankBand } : {},
299
+ ...partial.negativeEvidence !== void 0 ? { negativeEvidence: partial.negativeEvidence } : {}
291
300
  };
292
301
  return out;
293
302
  }
@@ -1748,6 +1757,75 @@ var init_cluster = __esm({
1748
1757
  }
1749
1758
  });
1750
1759
 
1760
+ // src/core/autoconfigNegativeEvidence.ts
1761
+ function applyNegativeEvidence(matchkey, rowA, rowB) {
1762
+ const ne = getNegativeEvidence(matchkey);
1763
+ if (!ne || ne.length === 0) return 0;
1764
+ let total = 0;
1765
+ for (const f of ne) {
1766
+ const rawA = asString(rowA[f.field]);
1767
+ const rawB = asString(rowB[f.field]);
1768
+ const valA = applyTransforms(rawA, f.transforms);
1769
+ const valB = applyTransforms(rawB, f.transforms);
1770
+ let sim;
1771
+ try {
1772
+ sim = scoreField(valA, valB, f.scorer);
1773
+ } catch {
1774
+ continue;
1775
+ }
1776
+ if (sim === null) continue;
1777
+ if (sim < f.threshold) total += f.penalty;
1778
+ }
1779
+ return total;
1780
+ }
1781
+ function applyNegativeEvidenceToExactPairs(pairs, matchkey, allRows) {
1782
+ const ne = getNegativeEvidence(matchkey);
1783
+ if (!ne || ne.length === 0) return [...pairs];
1784
+ const threshold = exactThresholdForNe(matchkey);
1785
+ const lookup = /* @__PURE__ */ new Map();
1786
+ for (const r of allRows) {
1787
+ const rid = r["__row_id__"];
1788
+ if (rid !== void 0) lookup.set(rid, r);
1789
+ }
1790
+ const out = [];
1791
+ for (const p of pairs) {
1792
+ const rowA = lookup.get(p.idA);
1793
+ const rowB = lookup.get(p.idB);
1794
+ if (rowA === void 0 || rowB === void 0) continue;
1795
+ const penalty = applyNegativeEvidence(matchkey, rowA, rowB);
1796
+ const finalScore = Math.max(0, 1 - penalty);
1797
+ if (finalScore >= threshold) {
1798
+ out.push({ ...p, score: finalScore });
1799
+ }
1800
+ }
1801
+ return out;
1802
+ }
1803
+ function getNegativeEvidence(mk) {
1804
+ if (mk.type === "weighted") return mk.negativeEvidence;
1805
+ if (mk.type === "exact") return mk.negativeEvidence;
1806
+ if (mk.type === "probabilistic")
1807
+ return mk.negativeEvidence;
1808
+ return void 0;
1809
+ }
1810
+ function exactThresholdForNe(mk) {
1811
+ if (mk.type === "exact") {
1812
+ const t = mk.threshold;
1813
+ return t ?? 0.5;
1814
+ }
1815
+ if (mk.type === "weighted") return mk.threshold;
1816
+ if (mk.type === "probabilistic") {
1817
+ return mk.threshold ?? 0.5;
1818
+ }
1819
+ return 0.5;
1820
+ }
1821
+ var init_autoconfigNegativeEvidence = __esm({
1822
+ "src/core/autoconfigNegativeEvidence.ts"() {
1823
+ init_types();
1824
+ init_transforms();
1825
+ init_scorer();
1826
+ }
1827
+ });
1828
+
1751
1829
  // src/core/scorer.ts
1752
1830
  function asString(v) {
1753
1831
  if (v === null || v === void 0) return null;
@@ -2214,10 +2292,16 @@ function findFuzzyMatches(rows, mk, excludePairs, preScoredPairs) {
2214
2292
  }
2215
2293
  }
2216
2294
  }
2295
+ const ne = mk.type === "weighted" ? mk.negativeEvidence : void 0;
2296
+ const neActive = ne !== void 0 && ne.length > 0;
2217
2297
  const results = [];
2218
2298
  for (let i = 0; i < n; i++) {
2219
2299
  for (let j = i + 1; j < n; j++) {
2220
- const score = combined[i][j];
2300
+ let score = combined[i][j];
2301
+ if (neActive) {
2302
+ const penalty = applyNegativeEvidence(mk, rows[i], rows[j]);
2303
+ score = Math.max(0, score - penalty);
2304
+ }
2221
2305
  if (score < threshold) continue;
2222
2306
  const idA = Math.min(rowIds[i], rowIds[j]);
2223
2307
  const idB = Math.max(rowIds[i], rowIds[j]);
@@ -2274,6 +2358,7 @@ var init_scorer = __esm({
2274
2358
  init_types();
2275
2359
  init_cluster();
2276
2360
  init_transforms();
2361
+ init_autoconfigNegativeEvidence();
2277
2362
  }
2278
2363
  });
2279
2364
 
@@ -3214,6 +3299,9 @@ async function runDedupePipeline(rows, config, options) {
3214
3299
  for (const mk of matchkeys) {
3215
3300
  if (mk.type === "exact") {
3216
3301
  let pairs = findExactMatches(processed, mk);
3302
+ if (mk.negativeEvidence !== void 0 && mk.negativeEvidence.length > 0) {
3303
+ pairs = applyNegativeEvidenceToExactPairs(pairs, mk, processed);
3304
+ }
3217
3305
  if (acrossFilesOnly) {
3218
3306
  pairs = pairs.filter((p) => {
3219
3307
  const srcA = sourceLookup.get(p.idA);
@@ -3408,6 +3496,7 @@ var init_pipeline = __esm({
3408
3496
  init_standardize();
3409
3497
  init_blocker();
3410
3498
  init_scorer();
3499
+ init_autoconfigNegativeEvidence();
3411
3500
  init_cluster();
3412
3501
  init_golden();
3413
3502
  init_autoconfigVerify();