npm - goldenmatch - Versions diffs - 0.6.0 → 0.8.0 - Mend

goldenmatch 0.6.0 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (52) hide show

package/CHANGELOG.md +50 -0
package/CLAUDE.md +68 -0
package/dist/cli.cjs +93 -4
package/dist/cli.cjs.map +1 -1
package/dist/cli.js +93 -4
package/dist/cli.js.map +1 -1
package/dist/core/index.cjs +505 -5
package/dist/core/index.cjs.map +1 -1
package/dist/core/index.d.cts +407 -146
package/dist/core/index.d.ts +407 -146
package/dist/core/index.js +493 -6
package/dist/core/index.js.map +1 -1
package/dist/index.cjs +505 -5
package/dist/index.cjs.map +1 -1
package/dist/index.d.cts +2 -2
package/dist/index.d.ts +2 -2
package/dist/index.js +493 -6
package/dist/index.js.map +1 -1
package/dist/node/backends/score-worker.cjs +36 -1
package/dist/node/backends/score-worker.cjs.map +1 -1
package/dist/node/backends/score-worker.d.cts +1 -1
package/dist/node/backends/score-worker.d.ts +1 -1
package/dist/node/backends/score-worker.js +36 -1
package/dist/node/backends/score-worker.js.map +1 -1
package/dist/node/index.cjs +505 -5
package/dist/node/index.cjs.map +1 -1
package/dist/node/index.d.cts +3 -3
package/dist/node/index.d.ts +3 -3
package/dist/node/index.js +493 -6
package/dist/node/index.js.map +1 -1
package/dist/{types-DlzBTOit.d.cts → types-9yDagamh.d.cts} +33 -1
package/dist/{types-DlzBTOit.d.ts → types-9yDagamh.d.ts} +33 -1
package/package.json +8 -2
package/src/core/autoconfigController.ts +14 -1
package/src/core/autoconfigNegativeEvidence.ts +337 -0
package/src/core/identity/in-memory-store.ts +193 -0
package/src/core/identity/index.ts +19 -0
package/src/core/identity/new-entity-id.ts +49 -0
package/src/core/identity/query.ts +142 -0
package/src/core/identity/types.ts +136 -0
package/src/core/index.ts +13 -0
package/src/core/pipeline.ts +7 -0
package/src/core/scorer.ts +16 -1
package/src/core/types.ts +61 -1
package/tests/identity/in-memory-store.test.ts +220 -0
package/tests/parity/negative-evidence-fixtures.json +1034 -0
package/tests/parity/negativeEvidence.parity.test.ts +183 -0
package/tests/unit/autoconfigNegativeEvidence.test.ts +282 -0
package/tests/unit/autoconfigRules.negativeEvidence.test.ts +109 -0
package/tests/unit/scorer.negativeEvidence.test.ts +72 -0
package/tests/unit/scorer.pathY.test.ts +69 -0
package/tests/unit/types.negativeEvidence.test.ts +80 -0

package/CHANGELOG.md CHANGED Viewed

@@ -4,6 +4,56 @@ All notable changes to goldenmatch-js are documented in this file.
 Format follows [Keep a Changelog](https://keepachangelog.com/). Versioning follows [Semantic Versioning](https://semver.org/) (strict after v1.0.0).
+## [0.7.0] - 2026-05-10
+Negative-evidence parity with Python `goldenmatch` v1.11 + v1.12 (Path Y).
+Python v1.12 lifted DQbench T3 from 53.8% F1 to 85.5% (+31.7 pp) by applying
+NE as a post-filter on exact matchkeys directly; this release ports that
+machinery to the TS runtime.
+### Added
+- `NegativeEvidenceField` interface and `makeNegativeEvidenceField` factory
+  in `src/core/types.ts` (defaults: `threshold=0.5`, `penalty=0.5`).
+  `MatchkeyConfig` variants (`ExactMatchkey`, `WeightedMatchkey`,
+  `ProbabilisticMatchkey`) now accept optional `negativeEvidence`.
+  `ExactMatchkey` also gains optional `threshold` so Path Y can stamp the
+  default 0.5 cutoff when NE is added without a user-set threshold.
+- `src/core/autoconfigNegativeEvidence.ts`:
+  - `applyNegativeEvidence(mk, rowA, rowB)` — per-pair penalty sum.
+  - `applyNegativeEvidenceToExactPairs(pairs, mk, allRows)` — v1.12 Path Y
+    post-filter for `findExactMatches` output.
+  - `promoteNegativeEvidence(config, rows, columnPriors)` — eager rule
+    that walks both weighted AND exact matchkeys (v1.12 change). The
+    `_is_exact_matchkey_field` anchor gate is skipped on the exact branch.
+  - `pickScorerForColumn(colName, colType?)` — name-keyed scorer dispatch
+    matching Python `_pick_scorer_for_column` (`email→token_sort`,
+    `phone→exact+digits_only`, `address→token_sort`, otherwise
+    `ensemble`).
+### Changed
+- `findFuzzyMatches` (`src/core/scorer.ts`) — applies NE penalty after
+  weighted-sum aggregation, before the threshold compare. No-op when the
+  matchkey has no `negativeEvidence`.
+- `pipeline.ts` — after `findExactMatches`, calls
+  `applyNegativeEvidenceToExactPairs` when the exact matchkey has NE set.
+  Mirrors Python v1.12 post-filter design; `findExactMatches`'s signature
+  is unchanged.
+- `AutoConfigController.run()` — eager `promoteNegativeEvidence` pass runs
+  once on the full row set (not the sample) before the iteration loop,
+  matching Python's `auto_configure_df` pre-iteration pass.
+### Tested
+- 19 new unit tests across `types.negativeEvidence`, `autoconfigNegativeEvidence`,
+  `scorer.negativeEvidence`, `scorer.pathY`, and `autoconfigRules.negativeEvidence`.
+- 6 new Python-parity fixtures in
+  `tests/parity/negative-evidence-fixtures.json` covering
+  clustered-email-different-surname, clustered-phone-different-name,
+  dense-population promotion, sparse no-op, blocking-field skip, and
+  idempotency. All 6 green vs Python `promote_negative_evidence`.
 ## [0.6.0] - 2026-05-10
 Indicator-aware refit parity with Python `goldenmatch` v1.9 + v1.10.

package/CLAUDE.md ADDED Viewed

@@ -0,0 +1,68 @@
+# goldenmatch (TypeScript)
+npm package `goldenmatch`. Three-wave parity port of the Python sibling at `packages/python/goldenmatch/`. Currently at **v0.7.0** (parity with Python v1.12).
+## Wave history
+| npm | Python parity | Headline |
+|-----|---------------|----------|
+| 0.4.0 | v1.6.0 | Learning Memory + scorer ground truth |
+| 0.5.0 | v1.7 + v1.8 | AutoConfigController, ComplexityProfile, RunHistory, StopReason telemetry |
+| 0.6.0 | v1.9 + v1.10 | 5 complexity indicators + indicator-aware refit rules; scorer selection aligned with Python |
+| 0.7.0 | v1.11 + v1.12 | NegativeEvidenceField + Path Y (exact-MK post-filter) |
+Each wave's spec/plan: `docs/superpowers/specs/2026-05-10-ts-parity-arc-design.md` + per-wave plans.
+## Commands
+```bash
+cd packages/typescript/goldenmatch
+pnpm --filter goldenmatch test      # vitest (841 tests at v0.7.0)
+pnpm --filter goldenmatch typecheck # tsc --noEmit (strict)
+pnpm --filter goldenmatch build     # tsup (5 entry points)
+npx vitest run tests/parity/        # parity-only suite
+```
+## Edge-safety rule
+`src/core/**` MUST NOT import `node:*`. Node-only code lives in `src/node/`. Memory backed by SQLite is `src/node/memory/`; the edge-safe interface is `src/core/memory/`. This is enforced by build separation, not by lint — verify when adding new imports.
+## Strict TS
+`noUncheckedIndexedAccess` + `exactOptionalPropertyTypes`. Idioms:
+- Bounded-loop indices: use `arr[i]!` after a length check.
+- Optional props: `...(x !== undefined ? { field: x } : {})` — never spread `undefined`.
+- Optional peer deps (sqlite, sentence-transformers): `await import("pkg-name" as string)` — the `as string` cast prevents tsup from resolving at build time.
+## Parity contract
+- **Scorer output:** 4-decimal tolerance vs Python (`tests/parity/scorer-ground-truth.test.ts`).
+- **Hash bytes:** SHA-256 truncated to 16 hex via Web Crypto. UTF-8 mandatory. Hash input = values joined by `|` (NOT `<col>=<val>`). `__row_id__` excluded from `record_hash` so corrections survive row reordering.
+- **Cross-language fixtures:** committed under `tests/parity/fixtures/`. Regen via `packages/python/goldenmatch/tests/parity/memory/gen_memory_fixtures.py --rebuild-db` and the wave-specific emitters in `packages/python/goldenmatch/scripts/emit_ts_parity_fixtures.py`. Determinism clamp: pinned UUIDs, pinned `created_at` (no `datetime.now()`).
+- **Negative-evidence parity** (v0.7.0): 6 fixture datasets exercising Path Y filtering on exact MKs + weighted-MK NE. Live in `tests/parity/negative-evidence-fixtures.json`.
+- **Controller parity** (v0.5.0): structural-only on 4 of 6 fixtures, byte-equal on 2. The 4 divergent fixtures hit a Python-side `ModuleNotFoundError` on polars/sklearn; TS doesn't replicate that import wart.
+## Public API surface (v0.7.0)
+- `dedupeFile`, `dedupe`, `matchFile`, `match` — all return Promises.
+- `autoConfigureRows` (sync, single-pass) and `autoConfigureRowsIterate` (Promise, full controller).
+- `AutoConfigController`, `RunHistory`, `ComplexityProfile`, `HealthVerdict`, `StopReason`.
+- `NegativeEvidenceField`, `applyNegativeEvidence`, `applyNegativeEvidenceToExactPairs`, `promoteNegativeEvidence`.
+- Memory mirror: `getMemory`, `addCorrection`, `learn`, `memoryStats`.
+- MCP tool count: 24 (19 base + 5 memory). Description literal at `src/node/mcp/server.ts:6` — keep in sync via the existing regex test.
+## Build outputs
+- tsup with 5 entry points: `index`, `core/index`, `node/index`, `cli`, `node/backends/score-worker` (piscina worker).
+- Build artifacts to `dist/` (gitignored).
+- Test count discipline: bump the documented test count when adding parity datasets so future audits can diff.
+## Config-types invariants
+- **No `make*` factory functions** for config types — test fixtures use full literals. Required fields:
+  - `MatchkeyField`: `field` + `transforms` + `scorer` + `weight`
+  - `BlockingKeyConfig`: `fields` + `transforms`
+  - `BlockingConfig`: `strategy` + `keys` + `maxBlockSize` + `skipOversized`
+- **Scorer names are snake_case** (same as Python): `token_sort`, `record_embedding`, `soundex_match`, `ensemble`, `exact`, `jaro_winkler`, `levenshtein`.
+- **`DOMAIN_EXTRACTED_COLS`** (in `src/core/domain.ts`) has only 3 entries (`__brand__`, `__model__`, `__version__`); Python's has 12. Don't assume parity when porting domain features.
+## Vitest gotchas
+- Default timeout is 5s. Heavier integration tests (PPRL multi-level, postflight end-to-end) need `{ timeout: 15000 }`. Concurrent load in CI has tripped the default timeout before (it cost a release: v0.3.0 → v0.3.1).
+## Publish workflow
+- `.github/workflows/publish-goldenmatch-js.yml` at monorepo root. Triggers on `goldenmatch-js-v*` tag or `workflow_dispatch` with `ref` input.
+- Tag MUST point at a commit that has the workflow file, otherwise the trigger doesn't fire (root CLAUDE.md "Workflow trigger ordering" gotcha).
+- Uses the `NPM_TOKEN` secret. Trusted publishing is not configured.
+- The tag-version-must-match-package.json check (in the workflow) means you cannot tag multiple versions at the same commit. Each release commit has its own version bump and tag.

package/dist/cli.cjs CHANGED Viewed

@@ -264,7 +264,14 @@ function makeMatchkeyConfig(partial) {
   const type = partial.type ?? "weighted";
   const fields = partial.fields ?? [];
   if (type === "exact") {
-    return { name: partial.name, type: "exact", fields };
+    const out2 = {
+      name: partial.name,
+      type: "exact",
+      fields,
+      ...partial.negativeEvidence !== void 0 ? { negativeEvidence: partial.negativeEvidence } : {},
+      ...partial.threshold !== void 0 ? { threshold: partial.threshold } : {}
+    };
+    return out2;
   }
   if (type === "probabilistic") {
     const out2 = {
@@ -275,7 +282,8 @@ function makeMatchkeyConfig(partial) {
       ...partial.emIterations !== void 0 ? { emIterations: partial.emIterations } : {},
       ...partial.convergenceThreshold !== void 0 ? { convergenceThreshold: partial.convergenceThreshold } : {},
       ...partial.linkThreshold !== void 0 ? { linkThreshold: partial.linkThreshold } : {},
-      ...partial.reviewThreshold !== void 0 ? { reviewThreshold: partial.reviewThreshold } : {}
+      ...partial.reviewThreshold !== void 0 ? { reviewThreshold: partial.reviewThreshold } : {},
+      ...partial.negativeEvidence !== void 0 ? { negativeEvidence: partial.negativeEvidence } : {}
     };
     return out2;
   }
@@ -287,7 +295,8 @@ function makeMatchkeyConfig(partial) {
     ...partial.autoThreshold !== void 0 ? { autoThreshold: partial.autoThreshold } : {},
     ...partial.rerank !== void 0 ? { rerank: partial.rerank } : {},
     ...partial.rerankModel !== void 0 ? { rerankModel: partial.rerankModel } : {},
-    ...partial.rerankBand !== void 0 ? { rerankBand: partial.rerankBand } : {}
+    ...partial.rerankBand !== void 0 ? { rerankBand: partial.rerankBand } : {},
+    ...partial.negativeEvidence !== void 0 ? { negativeEvidence: partial.negativeEvidence } : {}
   };
   return out;
 }
@@ -1748,6 +1757,75 @@ var init_cluster = __esm({
   }
 });
+// src/core/autoconfigNegativeEvidence.ts
+function applyNegativeEvidence(matchkey, rowA, rowB) {
+  const ne = getNegativeEvidence(matchkey);
+  if (!ne || ne.length === 0) return 0;
+  let total = 0;
+  for (const f of ne) {
+    const rawA = asString(rowA[f.field]);
+    const rawB = asString(rowB[f.field]);
+    const valA = applyTransforms(rawA, f.transforms);
+    const valB = applyTransforms(rawB, f.transforms);
+    let sim;
+    try {
+      sim = scoreField(valA, valB, f.scorer);
+    } catch {
+      continue;
+    }
+    if (sim === null) continue;
+    if (sim < f.threshold) total += f.penalty;
+  }
+  return total;
+}
+function applyNegativeEvidenceToExactPairs(pairs, matchkey, allRows) {
+  const ne = getNegativeEvidence(matchkey);
+  if (!ne || ne.length === 0) return [...pairs];
+  const threshold = exactThresholdForNe(matchkey);
+  const lookup = /* @__PURE__ */ new Map();
+  for (const r of allRows) {
+    const rid = r["__row_id__"];
+    if (rid !== void 0) lookup.set(rid, r);
+  }
+  const out = [];
+  for (const p of pairs) {
+    const rowA = lookup.get(p.idA);
+    const rowB = lookup.get(p.idB);
+    if (rowA === void 0 || rowB === void 0) continue;
+    const penalty = applyNegativeEvidence(matchkey, rowA, rowB);
+    const finalScore = Math.max(0, 1 - penalty);
+    if (finalScore >= threshold) {
+      out.push({ ...p, score: finalScore });
+    }
+  }
+  return out;
+}
+function getNegativeEvidence(mk) {
+  if (mk.type === "weighted") return mk.negativeEvidence;
+  if (mk.type === "exact") return mk.negativeEvidence;
+  if (mk.type === "probabilistic")
+    return mk.negativeEvidence;
+  return void 0;
+}
+function exactThresholdForNe(mk) {
+  if (mk.type === "exact") {
+    const t = mk.threshold;
+    return t ?? 0.5;
+  }
+  if (mk.type === "weighted") return mk.threshold;
+  if (mk.type === "probabilistic") {
+    return mk.threshold ?? 0.5;
+  }
+  return 0.5;
+}
+var init_autoconfigNegativeEvidence = __esm({
+  "src/core/autoconfigNegativeEvidence.ts"() {
+    init_types();
+    init_transforms();
+    init_scorer();
+  }
+});
 // src/core/scorer.ts
 function asString(v) {
   if (v === null || v === void 0) return null;
@@ -2214,10 +2292,16 @@ function findFuzzyMatches(rows, mk, excludePairs, preScoredPairs) {
       }
     }
   }
+  const ne = mk.type === "weighted" ? mk.negativeEvidence : void 0;
+  const neActive = ne !== void 0 && ne.length > 0;
   const results = [];
   for (let i = 0; i < n; i++) {
     for (let j = i + 1; j < n; j++) {
-      const score = combined[i][j];
+      let score = combined[i][j];
+      if (neActive) {
+        const penalty = applyNegativeEvidence(mk, rows[i], rows[j]);
+        score = Math.max(0, score - penalty);
+      }
       if (score < threshold) continue;
       const idA = Math.min(rowIds[i], rowIds[j]);
       const idB = Math.max(rowIds[i], rowIds[j]);
@@ -2274,6 +2358,7 @@ var init_scorer = __esm({
     init_types();
     init_cluster();
     init_transforms();
+    init_autoconfigNegativeEvidence();
   }
 });
@@ -3214,6 +3299,9 @@ async function runDedupePipeline(rows, config, options) {
   for (const mk of matchkeys) {
     if (mk.type === "exact") {
       let pairs = findExactMatches(processed, mk);
+      if (mk.negativeEvidence !== void 0 && mk.negativeEvidence.length > 0) {
+        pairs = applyNegativeEvidenceToExactPairs(pairs, mk, processed);
+      }
       if (acrossFilesOnly) {
         pairs = pairs.filter((p) => {
           const srcA = sourceLookup.get(p.idA);
@@ -3408,6 +3496,7 @@ var init_pipeline = __esm({
     init_standardize();
     init_blocker();
     init_scorer();
+    init_autoconfigNegativeEvidence();
     init_cluster();
     init_golden();
     init_autoconfigVerify();