goldenmatch 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +140 -0
- package/dist/cli.cjs +6079 -0
- package/dist/cli.cjs.map +1 -0
- package/dist/cli.d.cts +1 -0
- package/dist/cli.d.ts +1 -0
- package/dist/cli.js +6076 -0
- package/dist/cli.js.map +1 -0
- package/dist/core/index.cjs +8449 -0
- package/dist/core/index.cjs.map +1 -0
- package/dist/core/index.d.cts +1972 -0
- package/dist/core/index.d.ts +1972 -0
- package/dist/core/index.js +8318 -0
- package/dist/core/index.js.map +1 -0
- package/dist/index.cjs +8449 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +2 -0
- package/dist/index.d.ts +2 -0
- package/dist/index.js +8318 -0
- package/dist/index.js.map +1 -0
- package/dist/node/backends/score-worker.cjs +934 -0
- package/dist/node/backends/score-worker.cjs.map +1 -0
- package/dist/node/backends/score-worker.d.cts +14 -0
- package/dist/node/backends/score-worker.d.ts +14 -0
- package/dist/node/backends/score-worker.js +932 -0
- package/dist/node/backends/score-worker.js.map +1 -0
- package/dist/node/index.cjs +11430 -0
- package/dist/node/index.cjs.map +1 -0
- package/dist/node/index.d.cts +554 -0
- package/dist/node/index.d.ts +554 -0
- package/dist/node/index.js +11277 -0
- package/dist/node/index.js.map +1 -0
- package/dist/types-DhUdX5Rc.d.cts +304 -0
- package/dist/types-DhUdX5Rc.d.ts +304 -0
- package/examples/01-basic-dedupe.ts +60 -0
- package/examples/02-match-two-datasets.ts +48 -0
- package/examples/03-csv-file-pipeline.ts +62 -0
- package/examples/04-string-scoring.ts +63 -0
- package/examples/05-custom-config.ts +94 -0
- package/examples/06-probabilistic-fs.ts +72 -0
- package/examples/07-pprl-privacy.ts +76 -0
- package/examples/08-streaming.ts +79 -0
- package/examples/09-llm-scorer.ts +79 -0
- package/examples/10-explain.ts +60 -0
- package/examples/11-evaluate.ts +61 -0
- package/examples/README.md +53 -0
- package/package.json +66 -0
- package/src/cli.ts +372 -0
- package/src/core/ann-blocker.ts +593 -0
- package/src/core/api.ts +220 -0
- package/src/core/autoconfig.ts +363 -0
- package/src/core/autofix.ts +102 -0
- package/src/core/blocker.ts +655 -0
- package/src/core/cluster.ts +699 -0
- package/src/core/compare-clusters.ts +176 -0
- package/src/core/config/loader.ts +869 -0
- package/src/core/cross-encoder.ts +614 -0
- package/src/core/data.ts +430 -0
- package/src/core/domain.ts +277 -0
- package/src/core/embedder.ts +562 -0
- package/src/core/evaluate.ts +156 -0
- package/src/core/explain.ts +352 -0
- package/src/core/golden.ts +524 -0
- package/src/core/graph-er.ts +371 -0
- package/src/core/index.ts +314 -0
- package/src/core/ingest.ts +112 -0
- package/src/core/learned-blocking.ts +305 -0
- package/src/core/lineage.ts +221 -0
- package/src/core/llm/budget.ts +258 -0
- package/src/core/llm/cluster.ts +542 -0
- package/src/core/llm/scorer.ts +396 -0
- package/src/core/match-one.ts +95 -0
- package/src/core/matchkey.ts +97 -0
- package/src/core/memory/corrections.ts +179 -0
- package/src/core/memory/learner.ts +218 -0
- package/src/core/memory/store.ts +114 -0
- package/src/core/pipeline.ts +366 -0
- package/src/core/pprl/protocol.ts +216 -0
- package/src/core/probabilistic.ts +511 -0
- package/src/core/profiler.ts +212 -0
- package/src/core/quality.ts +197 -0
- package/src/core/review-queue.ts +177 -0
- package/src/core/scorer.ts +855 -0
- package/src/core/sensitivity.ts +196 -0
- package/src/core/standardize.ts +279 -0
- package/src/core/streaming.ts +128 -0
- package/src/core/transforms.ts +599 -0
- package/src/core/types.ts +570 -0
- package/src/core/validate.ts +243 -0
- package/src/index.ts +8 -0
- package/src/node/a2a/server.ts +470 -0
- package/src/node/api/server.ts +412 -0
- package/src/node/backends/duckdb.ts +130 -0
- package/src/node/backends/score-worker.ts +41 -0
- package/src/node/backends/workers.ts +212 -0
- package/src/node/config-file.ts +66 -0
- package/src/node/connectors/base.ts +57 -0
- package/src/node/connectors/bigquery.ts +61 -0
- package/src/node/connectors/databricks.ts +69 -0
- package/src/node/connectors/file.ts +350 -0
- package/src/node/connectors/hubspot.ts +62 -0
- package/src/node/connectors/index.ts +43 -0
- package/src/node/connectors/salesforce.ts +93 -0
- package/src/node/connectors/snowflake.ts +73 -0
- package/src/node/db/postgres.ts +173 -0
- package/src/node/db/sync.ts +103 -0
- package/src/node/dedupe-file.ts +156 -0
- package/src/node/index.ts +89 -0
- package/src/node/mcp/server.ts +940 -0
- package/src/node/tui/app.ts +756 -0
- package/src/node/tui/index.ts +6 -0
- package/src/node/tui/widgets.ts +128 -0
- package/tests/parity/scorer-ground-truth.test.ts +118 -0
- package/tests/smoke.test.ts +46 -0
- package/tests/unit/a2a-server.test.ts +175 -0
- package/tests/unit/ann-blocker.test.ts +117 -0
- package/tests/unit/api-server.test.ts +239 -0
- package/tests/unit/api.test.ts +77 -0
- package/tests/unit/autoconfig.test.ts +103 -0
- package/tests/unit/autofix.test.ts +71 -0
- package/tests/unit/blocker.test.ts +164 -0
- package/tests/unit/buildBlocksAsync.test.ts +63 -0
- package/tests/unit/cluster.test.ts +213 -0
- package/tests/unit/compare-clusters.test.ts +42 -0
- package/tests/unit/config-loader.test.ts +301 -0
- package/tests/unit/connectors-base.test.ts +48 -0
- package/tests/unit/cross-encoder-model.test.ts +198 -0
- package/tests/unit/cross-encoder.test.ts +173 -0
- package/tests/unit/db-connectors.test.ts +37 -0
- package/tests/unit/domain.test.ts +80 -0
- package/tests/unit/embedder.test.ts +151 -0
- package/tests/unit/evaluate.test.ts +85 -0
- package/tests/unit/explain.test.ts +73 -0
- package/tests/unit/golden.test.ts +97 -0
- package/tests/unit/graph-er.test.ts +173 -0
- package/tests/unit/hnsw-ann.test.ts +283 -0
- package/tests/unit/hubspot-connector.test.ts +118 -0
- package/tests/unit/ingest.test.ts +97 -0
- package/tests/unit/learned-blocking.test.ts +134 -0
- package/tests/unit/lineage.test.ts +135 -0
- package/tests/unit/match-one.test.ts +129 -0
- package/tests/unit/matchkey.test.ts +97 -0
- package/tests/unit/mcp-server.test.ts +183 -0
- package/tests/unit/memory.test.ts +119 -0
- package/tests/unit/pipeline.test.ts +118 -0
- package/tests/unit/pprl-protocol.test.ts +381 -0
- package/tests/unit/probabilistic.test.ts +494 -0
- package/tests/unit/profiler.test.ts +68 -0
- package/tests/unit/review-queue.test.ts +68 -0
- package/tests/unit/salesforce-connector.test.ts +148 -0
- package/tests/unit/scorer.test.ts +301 -0
- package/tests/unit/sensitivity.test.ts +154 -0
- package/tests/unit/standardize.test.ts +84 -0
- package/tests/unit/streaming.test.ts +82 -0
- package/tests/unit/transforms.test.ts +208 -0
- package/tests/unit/tui-widgets.test.ts +42 -0
- package/tests/unit/tui.test.ts +24 -0
- package/tests/unit/validate.test.ts +145 -0
- package/tests/unit/workers-parallel.test.ts +99 -0
- package/tests/unit/workers.test.ts +74 -0
- package/tsconfig.json +25 -0
- package/tsup.config.ts +37 -0
- package/vitest.config.ts +11 -0
|
@@ -0,0 +1,511 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* probabilistic.ts — Fellegi-Sunter probabilistic matching with EM-trained
|
|
3
|
+
* parameters. Ports `goldenmatch/core/probabilistic.py` (discrete path).
|
|
4
|
+
*
|
|
5
|
+
* Implements:
|
|
6
|
+
* - Comparison vectors (2/3/N-level field agreements)
|
|
7
|
+
* - Splink-style EM: u estimated from random pairs (fixed), m trained via EM
|
|
8
|
+
* - Blocking fields get fixed neutral priors
|
|
9
|
+
* - Match weights as log2(m/u) log-likelihood ratios, normalized to [0,1]
|
|
10
|
+
*
|
|
11
|
+
* Edge-safe: no `node:` imports, no numpy. Uses typed arrays where helpful.
|
|
12
|
+
*/
|
|
13
|
+
|
|
14
|
+
import type { Row, MatchkeyConfig, MatchkeyField, ScoredPair } from "./types.js";
|
|
15
|
+
import { makeScoredPair } from "./types.js";
|
|
16
|
+
import { scoreField, asString } from "./scorer.js";
|
|
17
|
+
import { applyTransforms } from "./transforms.js";
|
|
18
|
+
|
|
19
|
+
// ---------------------------------------------------------------------------
|
|
20
|
+
// Types
|
|
21
|
+
// ---------------------------------------------------------------------------
|
|
22
|
+
|
|
23
|
+
/**
 * Optional tuning knobs for `trainEM`. Every member may be omitted.
 */
export interface EMOptions {
  /**
   * Upper bound on EM iterations. When omitted, `trainEM` uses the
   * probabilistic matchkey's `emIterations`, and failing that 20.
   */
  readonly maxIterations?: number;
  /**
   * EM stops once the largest absolute change in any trained m-probability
   * drops below this value. When omitted, `trainEM` uses the probabilistic
   * matchkey's `convergenceThreshold`, and failing that 0.001.
   */
  readonly convergence?: number;
  /**
   * Names of fields that were used for blocking. `trainEM` does not train
   * these; they receive a fixed neutral u and fixed linear match weights.
   */
  readonly blockingFields?: readonly string[];
  /** Seed for the deterministic pair-sampling RNG (`trainEM` defaults to 42). */
  readonly seed?: number;
  /**
   * Requested number of random pairs to sample (`trainEM` defaults to 10000
   * and caps the value it actually uses at 5000).
   */
  readonly nSamplePairs?: number;
}
|
|
30
|
+
|
|
31
|
+
/**
 * Trained Fellegi-Sunter parameters. Every per-field table is keyed by field
 * name and indexed by comparison level (0 = lowest agreement).
 */
export interface EMResult {
  /** m-probabilities: P(level | pair is a match), one array per field. */
  readonly m: Readonly<Record<string, readonly number[]>>;
  /** u-probabilities: P(level | pair is a non-match), one array per field. */
  readonly u: Readonly<Record<string, readonly number[]>>;
  /** Scoring weights per field and level, log2(m / u). */
  readonly matchWeights: Readonly<Record<string, readonly number[]>>;
  /** Estimated share of matching pairs in the sampled population. */
  readonly proportionMatched: number;
  /** Number of EM iterations that were executed. */
  readonly iterations: number;
  /** Whether EM met the convergence threshold before running out of iterations. */
  readonly converged: boolean;
}
|
|
43
|
+
|
|
44
|
+
// ---------------------------------------------------------------------------
|
|
45
|
+
// Deterministic RNG (xorshift32) — avoids relying on Math.random's seedability
|
|
46
|
+
// ---------------------------------------------------------------------------
|
|
47
|
+
|
|
48
|
+
/**
 * Create a deterministic xorshift32 generator.
 *
 * @param seed Truncated to a 32-bit integer; a seed that truncates to 0 is
 *   replaced by 1 because xorshift can never leave the all-zero state.
 * @returns A function producing the next pseudo-random number in [0, 1).
 */
function makeRng(seed: number): () => number {
  const TWO_POW_32 = 4294967296;
  const truncated = seed | 0;
  let state = truncated === 0 ? 1 : truncated;

  return function next(): number {
    state = state ^ (state << 13);
    state = state ^ (state >>> 17);
    state = state ^ (state << 5);
    // Reinterpret as unsigned and scale by 2^32 (not 2^32 - 1) so 1.0 is unreachable.
    return (state >>> 0) / TWO_POW_32;
  };
}
|
|
58
|
+
|
|
59
|
+
// ---------------------------------------------------------------------------
|
|
60
|
+
// Field levels helper
|
|
61
|
+
// ---------------------------------------------------------------------------
|
|
62
|
+
|
|
63
|
+
/** Number of comparison levels configured for a field; 2 when `levels` is unset. */
function fieldLevels(f: MatchkeyField): number {
  const { levels } = f;
  if (levels === undefined || levels === null) return 2;
  return levels;
}
|
|
66
|
+
|
|
67
|
+
/** Similarity cutoff for partial agreement on a field; 0.7 when `partialThreshold` is unset. */
function fieldPartialThreshold(f: MatchkeyField): number {
  const { partialThreshold } = f;
  if (partialThreshold === undefined || partialThreshold === null) return 0.7;
  return partialThreshold;
}
|
|
70
|
+
|
|
71
|
+
// ---------------------------------------------------------------------------
|
|
72
|
+
// Public: buildComparisonVector
|
|
73
|
+
// ---------------------------------------------------------------------------
|
|
74
|
+
|
|
75
|
+
/**
 * Compare two rows field by field and return one integer agreement level per
 * entry of `fields`, in the same order.
 *
 * Each field's values are stringified, run through the field's transforms
 * (when it has any) and scored with the field's scorer. The score is then
 * bucketed:
 *   - null score: level 0
 *   - levels=2:   1 when score >= partial threshold, else 0
 *   - levels=3:   2 when score >= 0.95, 1 when score >= partial threshold, else 0
 *   - levels=N:   the largest k in 1..N-1 with score >= k/N, or 0 when none
 *
 * @param rowA First row.
 * @param rowB Second row.
 * @param fields Field specifications to compare on.
 * @returns Agreement levels, one per field.
 */
export function buildComparisonVector(
  rowA: Row,
  rowB: Row,
  fields: readonly MatchkeyField[],
): readonly number[] {
  return fields.map((spec): number => {
    let left = asString(rowA[spec.field]);
    let right = asString(rowB[spec.field]);
    if (spec.transforms.length > 0) {
      left = applyTransforms(left, spec.transforms);
      right = applyTransforms(right, spec.transforms);
    }

    const similarity = scoreField(left, right, spec.scorer);
    const levelCount = fieldLevels(spec);
    const partialCut = fieldPartialThreshold(spec);

    // A scorer that cannot produce a score counts as disagreement.
    if (similarity === null) return 0;

    switch (levelCount) {
      case 2:
        return similarity >= partialCut ? 1 : 0;
      case 3:
        if (similarity >= 0.95) return 2;
        return similarity >= partialCut ? 1 : 0;
      default: {
        // Evenly spaced thresholds k/N; keep the highest one the score reaches.
        let bucket = 0;
        for (let step = 1; step < levelCount; step++) {
          if (similarity >= step / levelCount) bucket = step;
        }
        return bucket;
      }
    }
  });
}
|
|
119
|
+
|
|
120
|
+
// ---------------------------------------------------------------------------
|
|
121
|
+
// Random-pair sampling (used for u estimation)
|
|
122
|
+
// ---------------------------------------------------------------------------
|
|
123
|
+
|
|
124
|
+
/**
 * Pick up to `nPairs` distinct pairs of row ids.
 *
 * Only rows whose `__row_id__` is a number take part. When the number of
 * possible pairs does not exceed `nPairs`, every pair is returned in
 * enumeration order. Otherwise pairs are drawn with `rand`, de-duplicated, and
 * returned as `[smallerId, largerId]`; drawing gives up after `nPairs * 10`
 * attempts, so fewer than `nPairs` pairs may come back.
 *
 * @param rows Candidate rows.
 * @param nPairs Maximum number of pairs wanted.
 * @param rand Source of numbers in [0, 1).
 * @returns The selected id pairs (empty when fewer than two usable rows exist).
 */
function samplePairs(
  rows: readonly Row[],
  nPairs: number,
  rand: () => number,
): Array<readonly [number, number]> {
  const ids: number[] = [];
  rows.forEach((row) => {
    const candidate = row["__row_id__"];
    if (typeof candidate === "number") ids.push(candidate);
  });

  const count = ids.length;
  if (count < 2) return [];

  const picked: Array<readonly [number, number]> = [];

  // Small population: enumerate every pair instead of sampling.
  const totalCombos = (count * (count - 1)) / 2;
  if (totalCombos <= nPairs) {
    for (let first = 0; first < count; first++) {
      for (let second = first + 1; second < count; second++) {
        picked.push([ids[first]!, ids[second]!] as const);
      }
    }
    return picked;
  }

  const seenKeys = new Set<string>();
  const attemptBudget = nPairs * 10;
  for (
    let attempt = 0;
    picked.length < nPairs && attempt < attemptBudget;
    attempt++
  ) {
    const i = Math.floor(rand() * count);
    const drawn = Math.floor(rand() * count);
    // Never pair a row with itself: nudge a colliding draw to the next index.
    const j = drawn === i ? (drawn + 1) % count : drawn;

    const idI = ids[i]!;
    const idJ = ids[j]!;
    const low = Math.min(idI, idJ);
    const high = Math.max(idI, idJ);
    const key = `${low}:${high}`;
    if (!seenKeys.has(key)) {
      seenKeys.add(key);
      picked.push([low, high] as const);
    }
  }
  return picked;
}
|
|
165
|
+
|
|
166
|
+
/**
 * Build one comparison vector per id pair.
 *
 * @param pairs Row-id pairs to compare.
 * @param rowById Lookup from row id to row; an id that is missing from the map
 *   is compared as an empty row.
 * @param fields Field specifications passed through to `buildComparisonVector`.
 * @returns A mutable matrix with one row of levels per pair, in pair order.
 */
function buildComparisonMatrix(
  pairs: ReadonlyArray<readonly [number, number]>,
  rowById: ReadonlyMap<number, Row>,
  fields: readonly MatchkeyField[],
): number[][] {
  return pairs.map(([idA, idB]) => {
    const left = rowById.get(idA) ?? {};
    const right = rowById.get(idB) ?? {};
    // Copy so callers own a mutable array rather than the readonly vector.
    return Array.from(buildComparisonVector(left, right, fields));
  });
}
|
|
180
|
+
|
|
181
|
+
// ---------------------------------------------------------------------------
|
|
182
|
+
// Public: trainEM
|
|
183
|
+
// ---------------------------------------------------------------------------
|
|
184
|
+
|
|
185
|
+
/**
 * Train Fellegi-Sunter parameters with a Splink-style EM procedure.
 *
 * Steps:
 *   1. Sample random row pairs and tally per-field level frequencies to
 *      estimate u = P(level | non-match). u stays fixed afterwards.
 *   2. Initialise m = P(level | match) with an exponential prior (level k gets
 *      mass proportional to 2^k) and refine it with EM on the same sample.
 *   3. Fields listed in `options.blockingFields` are not trained: they get a
 *      fixed neutral u, keep their prior m, and receive linear weights running
 *      from -3 (lowest level) to +3 (highest level) instead of log2(m/u).
 *
 * Returns `fallbackResult(mk)` when the matchkey has no fields or fewer than
 * 10 pairs could be sampled.
 *
 * @param rows Rows to sample from; only rows with a numeric `__row_id__` are used.
 * @param mk Matchkey configuration. `emIterations` and `convergenceThreshold`
 *   are read only when `mk.type === "probabilistic"`.
 * @param options Overrides; see `EMOptions`. Explicit options win over matchkey
 *   settings, which win over the built-in defaults.
 * @returns Trained m/u tables, match weights, estimated match proportion and
 *   iteration/convergence info.
 */
export function trainEM(
  rows: readonly Row[],
  mk: MatchkeyConfig,
  options?: EMOptions,
): EMResult {
  // Probabilistic-only parameters; other matchkey variants fall through to defaults.
  const emIterations =
    mk.type === "probabilistic" ? mk.emIterations : undefined;
  const convergenceThreshold =
    mk.type === "probabilistic" ? mk.convergenceThreshold : undefined;
  const maxIterations = options?.maxIterations ?? emIterations ?? 20;
  const convergence = options?.convergence ?? convergenceThreshold ?? 0.001;
  const blockingFields = new Set(options?.blockingFields ?? []);
  const seed = options?.seed ?? 42;
  const nSamplePairs = options?.nSamplePairs ?? 10000;

  const fields = mk.fields;
  if (fields.length === 0) return fallbackResult(mk);

  const rand = makeRng(seed);
  const rowById = new Map<number, Row>();
  for (const r of rows) {
    const id = r["__row_id__"];
    if (typeof id === "number") rowById.set(id, r);
  }

  // Step 1: u from random pairs (the sample is capped at 5000 pairs).
  const sampleForU = samplePairs(rows, Math.min(nSamplePairs, 5000), rand);
  if (sampleForU.length < 10) return fallbackResult(mk);
  const uMatrix = buildComparisonMatrix(sampleForU, rowById, fields);

  const u: Record<string, number[]> = {};
  fields.forEach((f, j) => {
    const n = fieldLevels(f);

    if (blockingFields.has(f.field)) {
      // Blocking fields get a fixed neutral u instead of the sampled estimate.
      if (n === 2) {
        u[f.field] = [0.5, 0.5];
      } else if (n >= 3) {
        u[f.field] = [
          0.34,
          0.33,
          ...new Array<number>(n - 2).fill(0.33 / Math.max(1, n - 2)),
        ];
      } else {
        // A single-level field has only one possible outcome. Building the
        // array with `new Array(n - 2)` here would throw a RangeError.
        u[f.field] = n >= 1 ? [1] : [];
      }
      return;
    }

    const counts = new Array<number>(n).fill(0);
    for (const row of uMatrix) {
      const lvl = row[j]!;
      if (lvl >= 0 && lvl < n) counts[lvl]! += 1;
    }
    // 1e-6 additive smoothing keeps every level's probability strictly positive.
    const total = counts.reduce((a, b) => a + b, 0) + n * 1e-6;
    u[f.field] = counts.map((c) => (c + 1e-6) / total);
  });

  // Step 2: m priors (exponential: highest level gets most mass).
  const m: Record<string, number[]> = {};
  for (const f of fields) {
    const n = fieldLevels(f);
    const raw: number[] = [];
    for (let k = 0; k < n; k++) raw.push(2 ** k);
    const sum = raw.reduce((a, b) => a + b, 0);
    m[f.field] = raw.map((r) => r / sum);
  }

  // Use the same random-pair matrix for EM. In Python, blocked pairs are
  // preferred when available; we don't have blocks in this entry point, so
  // we train on the random sample (the fallback path).
  const compMatrix = uMatrix;
  const nPairs = compMatrix.length;

  let pMatch = 0.02; // initial guess for P(match)
  let converged = false;
  let iterations = 0;

  for (let iter = 0; iter < maxIterations; iter++) {
    iterations = iter + 1;
    const oldM: Record<string, number[]> = {};
    for (const k of Object.keys(m)) oldM[k] = [...m[k]!];

    // E-step: posterior P(match | comparison vector) per pair, in log space.
    // Blocking fields take part here with their prior m and neutral u.
    const posteriors = new Float64Array(nPairs);
    for (let i = 0; i < nPairs; i++) {
      let logM = Math.log(Math.max(pMatch, 1e-10));
      let logU = Math.log(Math.max(1 - pMatch, 1e-10));
      for (let j = 0; j < fields.length; j++) {
        const f = fields[j]!;
        const level = compMatrix[i]![j]!;
        const mProb = Math.max(m[f.field]![level] ?? 1e-10, 1e-10);
        const uProb = Math.max(u[f.field]![level] ?? 1e-10, 1e-10);
        logM += Math.log(mProb);
        logU += Math.log(uProb);
      }
      // Subtract the larger log before exponentiating to avoid underflow.
      const maxLog = Math.max(logM, logU);
      const em = Math.exp(logM - maxLog);
      const eu = Math.exp(logU - maxLog);
      posteriors[i] = em / (em + eu);
    }

    // M-step (m only; u is fixed).
    let totalMatch = 0;
    for (let i = 0; i < nPairs; i++) totalMatch += posteriors[i]!;
    pMatch = Math.max(totalMatch / nPairs, 1e-6);

    for (let j = 0; j < fields.length; j++) {
      const f = fields[j]!;
      if (blockingFields.has(f.field)) continue;
      const n = fieldLevels(f);
      const newM = new Array<number>(n).fill(0);
      for (let i = 0; i < nPairs; i++) {
        const level = compMatrix[i]![j]!;
        if (level >= 0 && level < n) newM[level]! += posteriors[i]!;
      }
      const denom = totalMatch + n * 1e-6;
      for (let k = 0; k < n; k++) {
        newM[k] = (newM[k]! + 1e-6) / denom;
      }
      m[f.field] = newM;
    }

    // Convergence: largest absolute change of any trained m-probability.
    let maxDelta = 0;
    for (const f of fields) {
      if (blockingFields.has(f.field)) continue;
      const n = fieldLevels(f);
      for (let k = 0; k < n; k++) {
        const d = Math.abs(m[f.field]![k]! - oldM[f.field]![k]!);
        if (d > maxDelta) maxDelta = d;
      }
    }
    if (maxDelta < convergence) {
      converged = true;
      break;
    }
  }

  // Match weights = log2(m/u), with fixed linear weights for blocking fields.
  const matchWeights: Record<string, number[]> = {};
  for (const f of fields) {
    const n = fieldLevels(f);
    const w: number[] = [];
    if (blockingFields.has(f.field)) {
      for (let k = 0; k < n; k++) {
        w.push(n > 1 ? -3.0 + (6.0 * k) / (n - 1) : 3.0);
      }
    } else {
      for (let k = 0; k < n; k++) {
        const mVal = Math.max(m[f.field]![k]!, 1e-10);
        const uVal = Math.max(u[f.field]![k]!, 1e-10);
        w.push(Math.log2(mVal / uVal));
      }
    }
    matchWeights[f.field] = w;
  }

  return {
    m,
    u,
    matchWeights,
    proportionMatched: pMatch,
    iterations,
    converged,
  };
}
|
|
354
|
+
|
|
355
|
+
// ---------------------------------------------------------------------------
|
|
356
|
+
// Public: scoreProbabilistic
|
|
357
|
+
// ---------------------------------------------------------------------------
|
|
358
|
+
|
|
359
|
+
/** Optional settings for `scoreProbabilistic`. */
export interface ProbScoreOptions {
  /**
   * Pairs to skip, keyed as `"<smaller row id>:<larger row id>"`.
   */
  readonly excludePairs?: ReadonlySet<string>;
  /**
   * Minimum normalized score a pair must reach to be returned. When omitted,
   * `scoreProbabilistic` uses the probabilistic matchkey's `linkThreshold`,
   * and failing that 0.5.
   */
  readonly threshold?: number;
}
|
|
363
|
+
|
|
364
|
+
/**
 * Score every pair of rows in a block with Fellegi-Sunter match weights.
 *
 * For each pair the per-field weights selected by the comparison vector are
 * summed, then mapped onto [0, 1] using the smallest and largest weight totals
 * the trained model can produce. If that range is not positive, every pair
 * scores 0.5. Pairs scoring below the threshold are dropped; surviving scores
 * are rounded to four decimals.
 *
 * @param rows Rows of one block; rows without a numeric `__row_id__` are ignored.
 * @param mk Matchkey configuration (`linkThreshold` is read only when
 *   `mk.type === "probabilistic"`).
 * @param em Trained parameters; only `matchWeights` is used.
 * @param options Optional exclusions and threshold override.
 * @returns Scored pairs at or above the threshold, ids ordered smaller-first.
 */
export function scoreProbabilistic(
  rows: readonly Row[],
  mk: MatchkeyConfig,
  em: EMResult,
  options?: ProbScoreOptions,
): ScoredPair[] {
  const fields = mk.fields;
  if (fields.length === 0) return [];

  const skipped = options?.excludePairs ?? new Set<string>();
  const configuredCutoff =
    mk.type === "probabilistic" ? mk.linkThreshold : undefined;
  const cutoff = options?.threshold ?? configuredCutoff ?? 0.5;

  // Extremes of the achievable weight total, used for normalization.
  let highest = 0;
  let lowest = 0;
  for (const spec of fields) {
    const weights = em.matchWeights[spec.field];
    if (!weights || weights.length === 0) continue;
    highest += Math.max(...weights);
    lowest += Math.min(...weights);
  }
  const span = highest - lowest;

  const candidates: Array<{ id: number; row: Row }> = [];
  for (const row of rows) {
    const id = row["__row_id__"];
    if (typeof id === "number") candidates.push({ id, row });
  }

  const scored: ScoredPair[] = [];
  for (let i = 0; i < candidates.length; i++) {
    const first = candidates[i]!;
    for (let j = i + 1; j < candidates.length; j++) {
      const second = candidates[j]!;
      const lowId = Math.min(first.id, second.id);
      const highId = Math.max(first.id, second.id);
      if (skipped.has(`${lowId}:${highId}`)) continue;

      const levels = buildComparisonVector(first.row, second.row, fields);

      let weightSum = 0;
      fields.forEach((spec, k) => {
        const level = levels[k]!;
        const weights = em.matchWeights[spec.field];
        if (!weights) return;
        weightSum += weights[level] ?? 0;
      });

      const score = span > 0 ? (weightSum - lowest) / span : 0.5;
      if (score >= cutoff) {
        scored.push(
          makeScoredPair(lowId, highId, Math.round(score * 10000) / 10000),
        );
      }
    }
  }
  return scored;
}
|
|
435
|
+
|
|
436
|
+
// ---------------------------------------------------------------------------
|
|
437
|
+
// Public: scoreProbabilisticPair (single-pair variant for match_one use)
|
|
438
|
+
// ---------------------------------------------------------------------------
|
|
439
|
+
|
|
440
|
+
/**
 * Score a single pair of rows with Fellegi-Sunter match weights.
 *
 * The summed per-field weights are mapped onto [0, 1] using the smallest and
 * largest totals the trained model can produce. Unlike `scoreProbabilistic`,
 * the result is neither rounded nor filtered by a threshold.
 *
 * @param rowA First row.
 * @param rowB Second row.
 * @param mk Matchkey configuration supplying the fields to compare.
 * @param em Trained parameters; only `matchWeights` is used.
 * @returns The normalized score, or 0.5 when there are no fields or the
 *   achievable weight range is zero or negative.
 */
export function scoreProbabilisticPair(
  rowA: Row,
  rowB: Row,
  mk: MatchkeyConfig,
  em: EMResult,
): number {
  const NEUTRAL_SCORE = 0.5;
  const fields = mk.fields;
  if (fields.length === 0) return NEUTRAL_SCORE;

  // Extremes of the achievable weight total, used for normalization.
  let highest = 0;
  let lowest = 0;
  for (const spec of fields) {
    const weights = em.matchWeights[spec.field];
    if (!weights || weights.length === 0) continue;
    highest += Math.max(...weights);
    lowest += Math.min(...weights);
  }
  const span = highest - lowest;
  if (span <= 0) return NEUTRAL_SCORE;

  const levels = buildComparisonVector(rowA, rowB, fields);
  let weightSum = 0;
  fields.forEach((spec, k) => {
    const level = levels[k]!;
    const weights = em.matchWeights[spec.field];
    if (!weights) return;
    weightSum += weights[level] ?? 0;
  });
  return (weightSum - lowest) / span;
}
|
|
471
|
+
|
|
472
|
+
// ---------------------------------------------------------------------------
|
|
473
|
+
// Fallback result for tiny datasets
|
|
474
|
+
// ---------------------------------------------------------------------------
|
|
475
|
+
|
|
476
|
+
/**
 * Build untrained default parameters for when EM cannot run.
 *
 * Two- and three-level fields get fixed m/u priors that favour agreement for
 * matches and disagreement for non-matches, with weights log2(m/u). Any other
 * level count gets uniform m and u and all-zero weights.
 *
 * @param mk Matchkey configuration whose fields need parameters.
 * @returns A result with `proportionMatched` 0.05, `iterations` 0 and
 *   `converged` false.
 */
function fallbackResult(mk: MatchkeyConfig): EMResult {
  const mByField: Record<string, number[]> = {};
  const uByField: Record<string, number[]> = {};
  const weightsByField: Record<string, number[]> = {};

  for (const spec of mk.fields) {
    const levelCount = fieldLevels(spec);
    let mVals: number[];
    let uVals: number[];
    let weights: number[];

    switch (levelCount) {
      case 2:
        mVals = [0.1, 0.9];
        uVals = [0.9, 0.1];
        weights = [Math.log2(0.1 / 0.9), Math.log2(0.9 / 0.1)];
        break;
      case 3:
        mVals = [0.05, 0.15, 0.8];
        uVals = [0.8, 0.15, 0.05];
        weights = [
          Math.log2(0.05 / 0.8),
          Math.log2(0.15 / 0.15),
          Math.log2(0.8 / 0.05),
        ];
        break;
      default:
        // Uniform fallback: every level is equally likely, so weights are all zero.
        mVals = new Array<number>(levelCount).fill(1 / levelCount);
        uVals = new Array<number>(levelCount).fill(1 / levelCount);
        weights = new Array<number>(levelCount).fill(0);
        break;
    }

    mByField[spec.field] = mVals;
    uByField[spec.field] = uVals;
    weightsByField[spec.field] = weights;
  }

  return {
    m: mByField,
    u: uByField,
    matchWeights: weightsByField,
    proportionMatched: 0.05,
    iterations: 0,
    converged: false,
  };
}
|