goldenmatch 0.7.0 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CLAUDE.md ADDED
@@ -0,0 +1,68 @@
1
+ # goldenmatch (TypeScript)
2
+
3
+ npm package `goldenmatch`. Three-wave parity port of the Python sibling at `packages/python/goldenmatch/`. Currently at **v0.7.0** (parity with Python v1.12).
4
+
5
+ ## Wave history
6
+ | npm | Python parity | Headline |
7
+ |-----|---------------|----------|
8
+ | 0.4.0 | v1.6.0 | Learning Memory + scorer ground truth |
9
+ | 0.5.0 | v1.7 + v1.8 | AutoConfigController, ComplexityProfile, RunHistory, StopReason telemetry |
10
+ | 0.6.0 | v1.9 + v1.10 | 5 complexity indicators + indicator-aware refit rules; scorer selection aligned with Python |
11
+ | 0.7.0 | v1.11 + v1.12 | NegativeEvidenceField + Path Y (exact-MK post-filter) |
12
+
13
+ Each wave's spec/plan: `docs/superpowers/specs/2026-05-10-ts-parity-arc-design.md` + per-wave plans.
14
+
15
+ ## Commands
16
+ ```bash
17
+ cd packages/typescript/goldenmatch
18
+ pnpm --filter goldenmatch test # vitest (841 tests at v0.7.0)
19
+ pnpm --filter goldenmatch typecheck # tsc --noEmit (strict)
20
+ pnpm --filter goldenmatch build # tsup (5 entry points)
21
+ npx vitest run tests/parity/ # parity-only suite
22
+ ```
23
+
24
+ ## Edge-safety rule
25
+ `src/core/**` MUST NOT import `node:*`. Node-only code lives in `src/node/`. The SQLite-backed memory implementation lives in `src/node/memory/`; its edge-safe interface lives in `src/core/memory/`. This is enforced by build separation, not by lint — verify manually when adding new imports.
26
+
27
+ ## Strict TS
28
+ `noUncheckedIndexedAccess` + `exactOptionalPropertyTypes`. Idioms:
29
+ - Bounded-loop indices: use `arr[i]!` after a length check.
30
+ - Optional props: `...(x !== undefined ? { field: x } : {})` — never spread `undefined`.
31
+ - Optional peer deps (sqlite, sentence-transformers): `await import("pkg-name" as string)` — the `as string` cast prevents tsup from resolving at build time.
32
+
33
+ ## Parity contract
34
+ - **Scorer output:** 4-decimal tolerance vs Python (`tests/parity/scorer-ground-truth.test.ts`).
35
+ - **Hash bytes:** SHA-256 truncated to 16 hex via Web Crypto. UTF-8 mandatory. Hash input = values joined by `|` (NOT `<col>=<val>`). `__row_id__` excluded from `record_hash` so corrections survive row reordering.
36
+ - **Cross-language fixtures:** committed under `tests/parity/fixtures/`. Regen via `packages/python/goldenmatch/tests/parity/memory/gen_memory_fixtures.py --rebuild-db` and the wave-specific emitters in `packages/python/goldenmatch/scripts/emit_ts_parity_fixtures.py`. Determinism clamp: pinned UUIDs, pinned `created_at` (no `datetime.now()`).
37
+ - **Negative-evidence parity** (v0.7.0): 6 fixture datasets exercising Path Y filtering on exact MKs + weighted-MK NE. Live in `tests/parity/negative-evidence-fixtures.json`.
38
+ - **Controller parity** (v0.5.0): structural-only on 4 of 6 fixtures, byte-equal on 2. Python-side `ModuleNotFoundError` on polars/sklearn in the divergent 4 — TS doesn't replicate that import wart.
39
+
40
+ ## Public API surface (v0.7.0)
41
+ - `dedupeFile`, `dedupe`, `matchFile`, `match` — all return Promises.
42
+ - `autoConfigureRows` (sync, single-pass) and `autoConfigureRowsIterate` (Promise, full controller).
43
+ - `AutoConfigController`, `RunHistory`, `ComplexityProfile`, `HealthVerdict`, `StopReason`.
44
+ - `NegativeEvidenceField`, `applyNegativeEvidence`, `applyNegativeEvidenceToExactPairs`, `promoteNegativeEvidence`.
45
+ - Memory mirror: `getMemory`, `addCorrection`, `learn`, `memoryStats`.
46
+ - MCP tool count: 24 (19 base + 5 memory). Description literal at `src/node/mcp/server.ts:6` — keep in sync via the existing regex test.
47
+
48
+ ## Build outputs
49
+ - tsup with 5 entry points: `index`, `core/index`, `node/index`, `cli`, `node/backends/score-worker` (piscina worker).
50
+ - Build artifacts to `dist/` (gitignored).
51
+ - Test count discipline: bump when adding parity datasets so future audits can diff.
52
+
53
+ ## Config-types invariants
54
+ - **No `make*` factory functions** for config types — test fixtures use full literals. Required fields:
55
+ - `MatchkeyField`: `field` + `transforms` + `scorer` + `weight`
56
+ - `BlockingKeyConfig`: `fields` + `transforms`
57
+ - `BlockingConfig`: `strategy` + `keys` + `maxBlockSize` + `skipOversized`
58
+ - **Scorer names are snake_case** (same as Python): `token_sort`, `record_embedding`, `soundex_match`, `ensemble`, `exact`, `jaro_winkler`, `levenshtein`.
59
+ - **`DOMAIN_EXTRACTED_COLS`** (in `src/core/domain.ts`) has only 3 entries (`__brand__`, `__model__`, `__version__`); Python's has 12. Don't assume parity when porting domain features.
60
+
61
+ ## Vitest gotchas
62
+ - Default timeout 5s. Heavier integration tests (PPRL multi-level, postflight end-to-end) need `{ timeout: 15000 }`. CI concurrent load has bitten this (cost a release: v0.3.0 → v0.3.1).
63
+
64
+ ## Publish workflow
65
+ - `.github/workflows/publish-goldenmatch-js.yml` at monorepo root. Triggers on `goldenmatch-js-v*` tag or `workflow_dispatch` with `ref` input.
66
+ - Tag MUST point at a commit that has the workflow file, otherwise the trigger doesn't fire (root CLAUDE.md "Workflow trigger ordering" gotcha).
67
+ - Uses `NPM_TOKEN` secret. Trusted publishing not configured.
68
+ - The tag-version-must-match-package.json check (in the workflow) means you cannot tag multiple versions at the same commit. Each release commit has its own version bump and tag.
@@ -11661,6 +11661,267 @@ init_corrections();
11661
11661
  init_learner();
11662
11662
  init_hash();
11663
11663
 
11664
// src/core/identity/types.ts

/**
 * Canonicalize a record-id pair so that the lexicographically smaller id
 * comes first. Guarantees (a, b) and (b, a) map to the same tuple, which
 * is what edge deduplication relies on.
 */
function canonRecordPair(a, b) {
  if (b < a) {
    return [b, a];
  }
  return [a, b];
}
11668
+
11669
// src/core/identity/new-entity-id.ts

/**
 * Generate a UUIDv7-style identifier: a 48-bit millisecond timestamp,
 * version nibble 7 plus 12 random bits in the high half, and the RFC 4122
 * variant bits (0b10) plus 62 random bits in the low half.
 */
function newEntityId() {
  const millis = BigInt(Date.now()) & ((1n << 48n) - 1n);
  const rand = randomBytes(10);
  // rand_a: 12 bits taken from the first two random bytes.
  const randA = (BigInt(rand[0] & 0x0f) << 8n) | BigInt(rand[1]);
  // rand_b: fold the remaining 8 bytes into a 62-bit value.
  let randB = 0n;
  for (const byte of rand.subarray(2)) {
    randB = (randB << 8n) | BigInt(byte);
  }
  randB &= (1n << 62n) - 1n;
  const high = (millis << 16n) | (0x7n << 12n) | randA;
  const low = (0b10n << 62n) | randB;
  return formatUuid(high, low);
}

/**
 * Fill n bytes with randomness. Prefers the Web Crypto API; falls back to
 * Math.random, which is NOT cryptographically secure — acceptable here
 * only because these ids are identifiers, not secrets.
 */
function randomBytes(n) {
  const out = new Uint8Array(n);
  if (typeof crypto !== "undefined" && typeof crypto.getRandomValues === "function") {
    crypto.getRandomValues(out);
    return out;
  }
  for (let i = 0; i < n; i++) {
    out[i] = Math.floor(Math.random() * 256);
  }
  return out;
}

/** Render two 64-bit BigInt halves as a dashed 8-4-4-4-12 hex UUID string. */
function formatUuid(high, low) {
  const top = high.toString(16).padStart(16, "0");
  const bottom = low.toString(16).padStart(16, "0");
  return [
    top.slice(0, 8),
    top.slice(8, 12),
    top.slice(12, 16),
    bottom.slice(0, 4),
    bottom.slice(4, 16)
  ].join("-");
}
11697
+
11698
// src/core/identity/in-memory-store.ts

/**
 * Identity store backed entirely by in-process collections — no
 * persistence, no node:* imports, so it is safe for edge runtimes and
 * tests. Methods are async only to satisfy the shared store interface;
 * every operation completes synchronously. Reads hand out shallow copies
 * so callers cannot mutate internal state through returned objects.
 */
var InMemoryIdentityStore = class {
  identities = new Map(); // entityId -> identity node
  records = new Map(); // recordId -> record row
  edges = []; // append-only edge list (deduped in addEdge)
  events = []; // append-only event log
  aliases = new Map(); // "<alias>|<kind>|<dataset>" -> alias row
  nextEdgeId = 1;
  nextEventId = 1;
  /** Insert or replace an identity node; createdAt of an existing node is preserved. */
  async upsertIdentity(node) {
    const existing = this.identities.get(node.entityId);
    if (existing) {
      this.identities.set(node.entityId, {
        ...node,
        createdAt: existing.createdAt,
        updatedAt: node.updatedAt
      });
    } else {
      this.identities.set(node.entityId, { ...node });
    }
  }
  /** Return a copy of the identity node, or null when unknown. */
  async getIdentity(entityId) {
    const n = this.identities.get(entityId);
    return n ? { ...n } : null;
  }
  /**
   * List identities, optionally filtered by dataset/status, newest first
   * (by updatedAt), paged via offset/limit (defaults 0/100).
   */
  async listIdentities(opts = {}) {
    const all = Array.from(this.identities.values()).filter((n) => opts.dataset === void 0 || n.dataset === opts.dataset).filter((n) => opts.status === void 0 || n.status === opts.status).sort((a, b) => b.updatedAt.getTime() - a.updatedAt.getTime());
    const offset = opts.offset ?? 0;
    const limit = opts.limit ?? 100;
    return all.slice(offset, offset + limit).map((n) => ({ ...n }));
  }
  /** Count identities, optionally restricted to one dataset. */
  async countIdentities(dataset) {
    if (dataset === void 0) return this.identities.size;
    let n = 0;
    for (const node of this.identities.values()) {
      if (node.dataset === dataset) n++;
    }
    return n;
  }
  /**
   * Mark an identity as retired; when mergedInto is given the status becomes
   * "merged_into" and the pointer is recorded. No-op for unknown ids.
   */
  async retireIdentity(entityId, mergedInto) {
    const node = this.identities.get(entityId);
    if (!node) return;
    const next = {
      ...node,
      status: mergedInto ? "merged_into" : "retired",
      mergedInto: mergedInto ?? null,
      updatedAt: new Date()
    };
    this.identities.set(entityId, next);
  }
  /** Insert or replace a record row; firstSeenAt of an existing row is preserved. */
  async upsertRecord(rec) {
    const existing = this.records.get(rec.recordId);
    if (existing) {
      this.records.set(rec.recordId, {
        ...rec,
        firstSeenAt: existing.firstSeenAt,
        lastSeenAt: rec.lastSeenAt
      });
    } else {
      this.records.set(rec.recordId, { ...rec });
    }
  }
  /** Return a copy of the record row, or null when unknown. */
  async getRecord(recordId) {
    const r = this.records.get(recordId);
    return r ? { ...r } : null;
  }
  /** All records assigned to an entity, oldest firstSeenAt first. */
  async getRecordsForEntity(entityId) {
    return Array.from(this.records.values()).filter((r) => r.entityId === entityId).sort((a, b) => a.firstSeenAt.getTime() - b.firstSeenAt.getTime()).map((r) => ({ ...r }));
  }
  /** Entity id currently holding a record, or null. */
  async findEntityByRecord(recordId) {
    return this.records.get(recordId)?.entityId ?? null;
  }
  /** Batch recordId -> entityId lookup; unknown record ids are omitted. */
  async lookupEntityIds(recordIds) {
    const out = new Map();
    for (const rid of recordIds) {
      const eid = this.records.get(rid)?.entityId;
      if (eid) out.set(rid, eid);
    }
    return out;
  }
  /**
   * Append an edge, canonicalizing the record pair. An edge with the same
   * entity, pair, and runName is deduplicated: the existing edgeId is
   * returned instead of inserting a duplicate.
   */
  async addEdge(edge) {
    const [a, b] = canonRecordPair(edge.recordAId, edge.recordBId);
    const runKey = edge.runName ?? "";
    for (const e of this.edges) {
      if (e.entityId === edge.entityId && e.recordAId === a && e.recordBId === b && (e.runName ?? "") === runKey) {
        return e.edgeId;
      }
    }
    const stored = {
      ...edge,
      recordAId: a,
      recordBId: b,
      edgeId: this.nextEdgeId++
    };
    this.edges.push(stored);
    return stored.edgeId;
  }
  /** All edges for an entity, oldest recordedAt first. */
  async edgesForEntity(entityId) {
    return this.edges.filter((e) => e.entityId === entityId).sort((a, b) => a.recordedAt.getTime() - b.recordedAt.getTime()).map((e) => ({ ...e }));
  }
  /** All "conflicts_with" edges (optionally per dataset), newest first. */
  async findConflicts(dataset) {
    return this.edges.filter((e) => e.kind === "conflicts_with").filter((e) => dataset === void 0 || e.dataset === dataset).sort((a, b) => b.recordedAt.getTime() - a.recordedAt.getTime()).map((e) => ({ ...e }));
  }
  /** Append an event and return its assigned monotonically increasing id. */
  async emitEvent(event) {
    const stored = { ...event, eventId: this.nextEventId++ };
    this.events.push(stored);
    return stored.eventId;
  }
  /**
   * Events for an entity in eventId order.
   * NOTE: a falsy limit (undefined or 0) returns ALL events — preserved
   * intentionally for interface compatibility.
   */
  async history(entityId, limit) {
    const filtered = this.events.filter((e) => e.entityId === entityId).sort((a, b) => (a.eventId ?? 0) - (b.eventId ?? 0));
    return (limit ? filtered.slice(0, limit) : filtered).map((e) => ({ ...e }));
  }
  /** Whether an event of the given kind was already emitted for this entity+run. */
  async hasRunEvent(entityId, runName, kind) {
    return this.events.some(
      (e) => e.entityId === entityId && e.runName === runName && e.kind === kind
    );
  }
  /** Store an alias row keyed by alias/kind/dataset. */
  async addAlias(alias) {
    this.aliases.set(`${alias.alias}|${alias.kind}|${alias.dataset ?? ""}`, { ...alias });
  }
  /**
   * Resolve an alias of the given kind to an entity id (first match wins,
   * dataset is ignored), or null.
   * FIX: compare the stored alias row's fields directly instead of
   * re-parsing the composite map key with split("|") — the old approach
   * could never resolve an alias whose value itself contained "|".
   */
  async resolveAlias(alias, kind = "external_id") {
    for (const row of this.aliases.values()) {
      if (row.alias === alias && row.kind === kind) return row.entityId;
    }
    return null;
  }
  /** No resources to release for the in-memory backend. */
  async close() {
  }
};
11828
+
11829
// src/core/identity/query.ts

/**
 * Assemble the full view of one entity: its node plus all records, edges,
 * and up to `eventLimit` history events, fetched concurrently.
 * Returns null when the entity does not exist.
 */
async function getEntity(store, entityId, eventLimit = 100) {
  const node = await store.getIdentity(entityId);
  if (node == null) {
    return null;
  }
  const related = await Promise.all([
    store.getRecordsForEntity(entityId),
    store.edgesForEntity(entityId),
    store.history(entityId, eventLimit)
  ]);
  return { node, records: related[0], edges: related[1], events: related[2] };
}

/**
 * Look up the entity that owns a record and return its full view via
 * getEntity, or null when the record is unassigned.
 */
async function findByRecord(store, recordId) {
  const entityId = await store.findEntityByRecord(recordId);
  if (!entityId) {
    return null;
  }
  return getEntity(store, entityId);
}

/** Thin pass-through to the store's identity listing (filter/paging in opts). */
async function listEntities(store, opts = {}) {
  const page = await store.listIdentities(opts);
  return page;
}
11848
/**
 * Manually merge one entity into another: every record of absorbEntityId is
 * reassigned to keepEntityId, the absorbed entity is retired with a
 * merged_into pointer, and a manual_merge event is written to both sides.
 * Throws when either entity is missing or the winner is not active.
 * Returns { keep, absorbed, at } with an ISO timestamp.
 */
async function manualMerge(store, keepEntityId, absorbEntityId, opts = {}) {
  const winner = await store.getIdentity(keepEntityId);
  const loser = await store.getIdentity(absorbEntityId);
  if (!winner || !loser) {
    throw new Error("Both entity_ids must exist");
  }
  if (winner.status !== "active") {
    throw new Error("Winner must be active");
  }
  const now = new Date();
  // Move every record of the loser onto the winner, refreshing lastSeenAt.
  const absorbedRecords = await store.getRecordsForEntity(absorbEntityId);
  for (const record of absorbedRecords) {
    await store.upsertRecord({ ...record, entityId: keepEntityId, lastSeenAt: now });
  }
  await store.retireIdentity(absorbEntityId, keepEntityId);
  const runName = opts.runName ?? "manual";
  const reason = opts.reason ?? null;
  // Audit trail: one event on the surviving entity, one on the absorbed one.
  await store.emitEvent({
    eventId: null,
    entityId: keepEntityId,
    kind: "manual_merge",
    payload: { absorbed: absorbEntityId, reason },
    runName,
    dataset: winner.dataset,
    recordedAt: now
  });
  await store.emitEvent({
    eventId: null,
    entityId: absorbEntityId,
    kind: "manual_merge",
    payload: { merged_into: keepEntityId, reason },
    runName,
    dataset: loser.dataset,
    recordedAt: now
  });
  return { keep: keepEntityId, absorbed: absorbEntityId, at: now.toISOString() };
}
11880
/**
 * Manually split records out of an entity into a brand-new active entity in
 * the same dataset. Records not found or not currently assigned to the
 * parent are silently skipped. Emits a manual_split event on both the
 * parent and the new entity. Throws when the parent entity does not exist
 * or recordIds is empty. Returns { newEntityId, moved, at }.
 */
async function manualSplit(store, entityId, recordIds, opts = {}) {
  const parent = await store.getIdentity(entityId);
  if (!parent) {
    throw new Error(`Entity ${entityId} not found`);
  }
  if (recordIds.length === 0) {
    throw new Error("recordIds must be non-empty");
  }
  const now = new Date();
  const childId = newEntityId();
  // Fresh active entity in the parent's dataset; golden record and
  // confidence start empty.
  await store.upsertIdentity({
    entityId: childId,
    status: "active",
    mergedInto: null,
    goldenRecord: null,
    confidence: null,
    dataset: parent.dataset,
    createdAt: now,
    updatedAt: now
  });
  const moved = [];
  for (const rid of recordIds) {
    const rec = await store.getRecord(rid);
    // Only move records that exist and still belong to the parent.
    if (!rec || rec.entityId !== entityId) continue;
    await store.upsertRecord({ ...rec, entityId: childId, lastSeenAt: now });
    moved.push(rid);
  }
  const runName = opts.runName ?? "manual";
  const reason = opts.reason ?? null;
  // Audit trail on both sides of the split.
  await store.emitEvent({
    eventId: null,
    entityId,
    kind: "manual_split",
    payload: { split_to: childId, records: moved, reason },
    runName,
    dataset: parent.dataset,
    recordedAt: now
  });
  await store.emitEvent({
    eventId: null,
    entityId: childId,
    kind: "manual_split",
    payload: { split_from: entityId, records: moved, reason },
    runName,
    dataset: parent.dataset,
    recordedAt: now
  });
  return { newEntityId: childId, moved, at: now.toISOString() };
}
11924
+
11664
11925
  // src/core/pprl/protocol.ts
11665
11926
  init_transforms();
11666
11927
  init_scorer();
@@ -11757,6 +12018,7 @@ exports.BudgetTracker = BudgetTracker;
11757
12018
  exports.CrossEncoderHttpError = CrossEncoderHttpError;
11758
12019
  exports.CrossEncoderModel = CrossEncoderModel;
11759
12020
  exports.HIGH_TRUST_SOURCES = HIGH_TRUST_SOURCES;
12021
+ exports.InMemoryIdentityStore = InMemoryIdentityStore;
11760
12022
  exports.InMemoryStore = InMemoryStore;
11761
12023
  exports.ReviewQueue = ReviewQueue;
11762
12024
  exports.StreamProcessor = StreamProcessor;
@@ -11792,6 +12054,7 @@ exports.buildLineage = buildLineage;
11792
12054
  exports.buildMst = buildMst;
11793
12055
  exports.buildMultiPassBlocks = buildMultiPassBlocks;
11794
12056
  exports.buildStaticBlocks = buildStaticBlocks;
12057
+ exports.canonRecordPair = canonRecordPair;
11795
12058
  exports.compareClusters = compareClusters;
11796
12059
  exports.complexityHealth = complexityHealth;
11797
12060
  exports.computeClusterConfidence = computeClusterConfidence;
@@ -11819,12 +12082,14 @@ exports.evaluatePairs = evaluatePairs;
11819
12082
  exports.explainCluster = explainCluster;
11820
12083
  exports.explainPair = explainPair;
11821
12084
  exports.extractFeatures = extractFeatures;
12085
+ exports.findByRecord = findByRecord;
11822
12086
  exports.findExactMatches = findExactMatches;
11823
12087
  exports.findExactMatchesOne = findExactMatchesOne;
11824
12088
  exports.findFuzzyMatches = findFuzzyMatches;
11825
12089
  exports.gatePairs = gatePairs;
11826
12090
  exports.getClusterPairScores = getClusterPairScores;
11827
12091
  exports.getEmbedder = getEmbedder;
12092
+ exports.getEntity = getEntity;
11828
12093
  exports.getLastControllerRun = getLastControllerRun;
11829
12094
  exports.getMatchkeys = getMatchkeys;
11830
12095
  exports.indelDistance = indelDistance;
@@ -11838,6 +12103,7 @@ exports.levenshteinDistance = levenshteinDistance;
11838
12103
  exports.levenshteinSimilarity = levenshteinSimilarity;
11839
12104
  exports.lineageFromJson = lineageFromJson;
11840
12105
  exports.lineageToJson = lineageToJson;
12106
+ exports.listEntities = listEntities;
11841
12107
  exports.llmClusterPairs = llmClusterPairs;
11842
12108
  exports.llmExplainPair = llmExplainPair;
11843
12109
  exports.llmScorePairs = llmScorePairs;
@@ -11859,10 +12125,13 @@ exports.makePreflightReport = makePreflightReport;
11859
12125
  exports.makeProfileMeta = makeProfileMeta;
11860
12126
  exports.makeScoredPair = makeScoredPair;
11861
12127
  exports.makeScoringProfile = makeScoringProfile;
12128
+ exports.manualMerge = manualMerge;
12129
+ exports.manualSplit = manualSplit;
11862
12130
  exports.match = match;
11863
12131
  exports.matchOne = matchOne;
11864
12132
  exports.mergeField = mergeField;
11865
12133
  exports.metaphone = metaphone;
12134
+ exports.newEntityId = newEntityId;
11866
12135
  exports.normalizedSignalVector = normalizedSignalVector;
11867
12136
  exports.pairKey = pairKey;
11868
12137
  exports.parseConfig = parseConfig;