goldenmatch 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +140 -0
- package/dist/cli.cjs +6079 -0
- package/dist/cli.cjs.map +1 -0
- package/dist/cli.d.cts +1 -0
- package/dist/cli.d.ts +1 -0
- package/dist/cli.js +6076 -0
- package/dist/cli.js.map +1 -0
- package/dist/core/index.cjs +8449 -0
- package/dist/core/index.cjs.map +1 -0
- package/dist/core/index.d.cts +1972 -0
- package/dist/core/index.d.ts +1972 -0
- package/dist/core/index.js +8318 -0
- package/dist/core/index.js.map +1 -0
- package/dist/index.cjs +8449 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +2 -0
- package/dist/index.d.ts +2 -0
- package/dist/index.js +8318 -0
- package/dist/index.js.map +1 -0
- package/dist/node/backends/score-worker.cjs +934 -0
- package/dist/node/backends/score-worker.cjs.map +1 -0
- package/dist/node/backends/score-worker.d.cts +14 -0
- package/dist/node/backends/score-worker.d.ts +14 -0
- package/dist/node/backends/score-worker.js +932 -0
- package/dist/node/backends/score-worker.js.map +1 -0
- package/dist/node/index.cjs +11430 -0
- package/dist/node/index.cjs.map +1 -0
- package/dist/node/index.d.cts +554 -0
- package/dist/node/index.d.ts +554 -0
- package/dist/node/index.js +11277 -0
- package/dist/node/index.js.map +1 -0
- package/dist/types-DhUdX5Rc.d.cts +304 -0
- package/dist/types-DhUdX5Rc.d.ts +304 -0
- package/examples/01-basic-dedupe.ts +60 -0
- package/examples/02-match-two-datasets.ts +48 -0
- package/examples/03-csv-file-pipeline.ts +62 -0
- package/examples/04-string-scoring.ts +63 -0
- package/examples/05-custom-config.ts +94 -0
- package/examples/06-probabilistic-fs.ts +72 -0
- package/examples/07-pprl-privacy.ts +76 -0
- package/examples/08-streaming.ts +79 -0
- package/examples/09-llm-scorer.ts +79 -0
- package/examples/10-explain.ts +60 -0
- package/examples/11-evaluate.ts +61 -0
- package/examples/README.md +53 -0
- package/package.json +66 -0
- package/src/cli.ts +372 -0
- package/src/core/ann-blocker.ts +593 -0
- package/src/core/api.ts +220 -0
- package/src/core/autoconfig.ts +363 -0
- package/src/core/autofix.ts +102 -0
- package/src/core/blocker.ts +655 -0
- package/src/core/cluster.ts +699 -0
- package/src/core/compare-clusters.ts +176 -0
- package/src/core/config/loader.ts +869 -0
- package/src/core/cross-encoder.ts +614 -0
- package/src/core/data.ts +430 -0
- package/src/core/domain.ts +277 -0
- package/src/core/embedder.ts +562 -0
- package/src/core/evaluate.ts +156 -0
- package/src/core/explain.ts +352 -0
- package/src/core/golden.ts +524 -0
- package/src/core/graph-er.ts +371 -0
- package/src/core/index.ts +314 -0
- package/src/core/ingest.ts +112 -0
- package/src/core/learned-blocking.ts +305 -0
- package/src/core/lineage.ts +221 -0
- package/src/core/llm/budget.ts +258 -0
- package/src/core/llm/cluster.ts +542 -0
- package/src/core/llm/scorer.ts +396 -0
- package/src/core/match-one.ts +95 -0
- package/src/core/matchkey.ts +97 -0
- package/src/core/memory/corrections.ts +179 -0
- package/src/core/memory/learner.ts +218 -0
- package/src/core/memory/store.ts +114 -0
- package/src/core/pipeline.ts +366 -0
- package/src/core/pprl/protocol.ts +216 -0
- package/src/core/probabilistic.ts +511 -0
- package/src/core/profiler.ts +212 -0
- package/src/core/quality.ts +197 -0
- package/src/core/review-queue.ts +177 -0
- package/src/core/scorer.ts +855 -0
- package/src/core/sensitivity.ts +196 -0
- package/src/core/standardize.ts +279 -0
- package/src/core/streaming.ts +128 -0
- package/src/core/transforms.ts +599 -0
- package/src/core/types.ts +570 -0
- package/src/core/validate.ts +243 -0
- package/src/index.ts +8 -0
- package/src/node/a2a/server.ts +470 -0
- package/src/node/api/server.ts +412 -0
- package/src/node/backends/duckdb.ts +130 -0
- package/src/node/backends/score-worker.ts +41 -0
- package/src/node/backends/workers.ts +212 -0
- package/src/node/config-file.ts +66 -0
- package/src/node/connectors/base.ts +57 -0
- package/src/node/connectors/bigquery.ts +61 -0
- package/src/node/connectors/databricks.ts +69 -0
- package/src/node/connectors/file.ts +350 -0
- package/src/node/connectors/hubspot.ts +62 -0
- package/src/node/connectors/index.ts +43 -0
- package/src/node/connectors/salesforce.ts +93 -0
- package/src/node/connectors/snowflake.ts +73 -0
- package/src/node/db/postgres.ts +173 -0
- package/src/node/db/sync.ts +103 -0
- package/src/node/dedupe-file.ts +156 -0
- package/src/node/index.ts +89 -0
- package/src/node/mcp/server.ts +940 -0
- package/src/node/tui/app.ts +756 -0
- package/src/node/tui/index.ts +6 -0
- package/src/node/tui/widgets.ts +128 -0
- package/tests/parity/scorer-ground-truth.test.ts +118 -0
- package/tests/smoke.test.ts +46 -0
- package/tests/unit/a2a-server.test.ts +175 -0
- package/tests/unit/ann-blocker.test.ts +117 -0
- package/tests/unit/api-server.test.ts +239 -0
- package/tests/unit/api.test.ts +77 -0
- package/tests/unit/autoconfig.test.ts +103 -0
- package/tests/unit/autofix.test.ts +71 -0
- package/tests/unit/blocker.test.ts +164 -0
- package/tests/unit/buildBlocksAsync.test.ts +63 -0
- package/tests/unit/cluster.test.ts +213 -0
- package/tests/unit/compare-clusters.test.ts +42 -0
- package/tests/unit/config-loader.test.ts +301 -0
- package/tests/unit/connectors-base.test.ts +48 -0
- package/tests/unit/cross-encoder-model.test.ts +198 -0
- package/tests/unit/cross-encoder.test.ts +173 -0
- package/tests/unit/db-connectors.test.ts +37 -0
- package/tests/unit/domain.test.ts +80 -0
- package/tests/unit/embedder.test.ts +151 -0
- package/tests/unit/evaluate.test.ts +85 -0
- package/tests/unit/explain.test.ts +73 -0
- package/tests/unit/golden.test.ts +97 -0
- package/tests/unit/graph-er.test.ts +173 -0
- package/tests/unit/hnsw-ann.test.ts +283 -0
- package/tests/unit/hubspot-connector.test.ts +118 -0
- package/tests/unit/ingest.test.ts +97 -0
- package/tests/unit/learned-blocking.test.ts +134 -0
- package/tests/unit/lineage.test.ts +135 -0
- package/tests/unit/match-one.test.ts +129 -0
- package/tests/unit/matchkey.test.ts +97 -0
- package/tests/unit/mcp-server.test.ts +183 -0
- package/tests/unit/memory.test.ts +119 -0
- package/tests/unit/pipeline.test.ts +118 -0
- package/tests/unit/pprl-protocol.test.ts +381 -0
- package/tests/unit/probabilistic.test.ts +494 -0
- package/tests/unit/profiler.test.ts +68 -0
- package/tests/unit/review-queue.test.ts +68 -0
- package/tests/unit/salesforce-connector.test.ts +148 -0
- package/tests/unit/scorer.test.ts +301 -0
- package/tests/unit/sensitivity.test.ts +154 -0
- package/tests/unit/standardize.test.ts +84 -0
- package/tests/unit/streaming.test.ts +82 -0
- package/tests/unit/transforms.test.ts +208 -0
- package/tests/unit/tui-widgets.test.ts +42 -0
- package/tests/unit/tui.test.ts +24 -0
- package/tests/unit/validate.test.ts +145 -0
- package/tests/unit/workers-parallel.test.ts +99 -0
- package/tests/unit/workers.test.ts +74 -0
- package/tsconfig.json +25 -0
- package/tsup.config.ts +37 -0
- package/vitest.config.ts +11 -0
|
@@ -0,0 +1,855 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* scorer.ts — Fuzzy scoring module for GoldenMatch.
|
|
3
|
+
* Edge-safe: no Node.js imports, pure TypeScript only.
|
|
4
|
+
*
|
|
5
|
+
* Ports goldenmatch/core/scorer.py. The Python version uses `rapidfuzz`
|
|
6
|
+
* for vectorized NxN scoring. Here we implement all algorithms in pure TS.
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
import type {
|
|
10
|
+
Row,
|
|
11
|
+
MatchkeyField,
|
|
12
|
+
MatchkeyConfig,
|
|
13
|
+
PairKey,
|
|
14
|
+
ScoredPair,
|
|
15
|
+
BlockResult,
|
|
16
|
+
} from "./types.js";
|
|
17
|
+
import { makeScoredPair } from "./types.js";
|
|
18
|
+
import { pairKey } from "./cluster.js";
|
|
19
|
+
import { applyTransforms, soundex } from "./transforms.js";
|
|
20
|
+
|
|
21
|
+
// ---------------------------------------------------------------------------
|
|
22
|
+
// Helper: coerce unknown to string | null
|
|
23
|
+
// ---------------------------------------------------------------------------
|
|
24
|
+
|
|
25
|
+
/**
 * Coerce an arbitrary value to a string.
 *
 * `null` and `undefined` become `null`, strings pass through untouched, and
 * every other value is converted with `String(...)`.
 */
export function asString(v: unknown): string | null {
  // `== null` deliberately matches both null and undefined.
  if (v == null) return null;
  return typeof v === "string" ? v : String(v);
}
|
|
31
|
+
|
|
32
|
+
// ---------------------------------------------------------------------------
|
|
33
|
+
// Scoring algorithms — pure TS
|
|
34
|
+
// ---------------------------------------------------------------------------
|
|
35
|
+
|
|
36
|
+
/**
 * Jaro similarity of two strings, in [0, 1].
 *
 * Two characters "match" when they are equal and their positions differ by at
 * most `floor(max(lenA, lenB) / 2) - 1` (clamped at 0). With `m` matches and
 * `t` matched characters that appear in a different order:
 *
 *   jaro = (m / lenA + m / lenB + (m - t / 2) / m) / 3
 *
 * Identical strings (including two empty strings) score 1; if exactly one
 * side is empty, or nothing matches, the score is 0.
 */
export function jaro(a: string, b: string): number {
  if (a === b) return 1.0;

  const sizeA = a.length;
  const sizeB = b.length;
  if (sizeA === 0 || sizeB === 0) return 0.0;

  const reach = Math.max(0, Math.floor(Math.max(sizeA, sizeB) / 2) - 1);

  // 1 marks a position that has already been paired with the other string.
  const usedA = new Uint8Array(sizeA);
  const usedB = new Uint8Array(sizeB);

  // Pass 1: greedily pair each character of `a` with the first free, equal
  // character of `b` inside the window.
  let common = 0;
  for (let i = 0; i < sizeA; i++) {
    const from = Math.max(0, i - reach);
    const to = Math.min(sizeB - 1, i + reach);
    for (let j = from; j <= to; j++) {
      if (usedB[j] === 0 && a[i] === b[j]) {
        usedA[i] = 1;
        usedB[j] = 1;
        common++;
        break;
      }
    }
  }
  if (common === 0) return 0.0;

  // Pass 2: walk the paired characters of both strings in order and count
  // the positions where they disagree.
  let outOfOrder = 0;
  let cursor = 0;
  for (let i = 0; i < sizeA; i++) {
    if (usedA[i] !== 0) {
      while (usedB[cursor] === 0) cursor++;
      if (a[i] !== b[cursor]) outOfOrder++;
      cursor++;
    }
  }

  return (common / sizeA + common / sizeB + (common - outOfOrder / 2) / common) / 3;
}
|
|
84
|
+
|
|
85
|
+
/**
 * Jaro-Winkler similarity, in [0, 1].
 *
 * Starts from the Jaro similarity and rewards a shared prefix of up to
 * 4 characters with scaling factor 0.1:
 *
 *   jw = jaro + prefixLen * 0.1 * (1 - jaro)
 *
 * Following Winkler's definition, the prefix bonus is applied only when the
 * Jaro similarity is above the 0.7 boost threshold. Without that guard,
 * clearly dissimilar strings that merely share their first few characters
 * get an inflated score.
 *
 * @param a first string
 * @param b second string
 * @returns the plain Jaro similarity when it is <= 0.7, otherwise the
 *          prefix-boosted similarity.
 */
export function jaroWinkler(a: string, b: string): number {
  const boostThreshold = 0.7;
  const prefixWeight = 0.1;
  const maxPrefix = 4;

  const base = jaro(a, b);
  // Also covers base === 0: nothing to boost.
  if (base <= boostThreshold) return base;

  // Length of the common prefix, capped at maxPrefix characters.
  const limit = Math.min(maxPrefix, a.length, b.length);
  let prefix = 0;
  while (prefix < limit && a[prefix] === b[prefix]) prefix++;

  return base + prefix * prefixWeight * (1 - base);
}
|
|
103
|
+
|
|
104
|
+
/**
 * Levenshtein edit distance: the minimum number of single-character
 * insertions, deletions and substitutions needed to turn `a` into `b`.
 *
 * Classic dynamic programme that keeps only two rows of the table.
 */
export function levenshteinDistance(a: string, b: string): number {
  const rows = a.length;
  const cols = b.length;
  if (rows === 0) return cols;
  if (cols === 0) return rows;

  // `above` is the finished row i-1, `current` is row i under construction.
  let above = new Uint32Array(cols + 1);
  let current = new Uint32Array(cols + 1);
  for (let j = 0; j <= cols; j++) above[j] = j;

  for (let i = 1; i <= rows; i++) {
    const ch = a[i - 1];
    current[0] = i;
    for (let j = 1; j <= cols; j++) {
      const remove = above[j]! + 1;
      const insert = current[j - 1]! + 1;
      const substitute = above[j - 1]! + (ch === b[j - 1] ? 0 : 1);
      current[j] = Math.min(remove, insert, substitute);
    }
    // Recycle the buffers instead of allocating a new row.
    const spare = above;
    above = current;
    current = spare;
  }

  return above[cols]!;
}
|
|
135
|
+
|
|
136
|
+
/**
 * Levenshtein distance normalized to a similarity:
 * `1 - distance / max(lenA, lenB)`. Identical strings, including two empty
 * strings, score 1.
 */
export function levenshteinSimilarity(a: string, b: string): number {
  const longest = Math.max(a.length, b.length);
  if (a === b || longest === 0) return 1.0;
  return 1 - levenshteinDistance(a, b) / longest;
}
|
|
145
|
+
|
|
146
|
+
/**
 * Indel (insert/delete only) edit distance.
 *
 * Same dynamic programme as Levenshtein, except that substitution is not an
 * available move: replacing a character costs one deletion plus one
 * insertion (2). The file describes this as the metric behind rapidfuzz's
 * Indel ratio and `token_sort_ratio`.
 */
export function indelDistance(a: string, b: string): number {
  if (a === b) return 0;

  const rows = a.length;
  const cols = b.length;
  if (rows === 0) return cols;
  if (cols === 0) return rows;

  let above = new Uint32Array(cols + 1);
  let current = new Uint32Array(cols + 1);
  for (let j = 0; j <= cols; j++) above[j] = j;

  for (let i = 1; i <= rows; i++) {
    const code = a.charCodeAt(i - 1);
    current[0] = i;
    for (let j = 1; j <= cols; j++) {
      current[j] =
        code === b.charCodeAt(j - 1)
          ? above[j - 1]! // equal characters: carry the diagonal, no cost
          : Math.min(above[j]!, current[j - 1]!) + 1; // delete or insert
    }
    const spare = above;
    above = current;
    current = spare;
  }

  return above[cols]!;
}
|
|
177
|
+
|
|
178
|
+
/**
 * Indel distance normalized to a similarity:
 * `1 - d_indel / (len_a + len_b)`. Two empty strings score 1.
 */
export function indelSimilarity(a: string, b: string): number {
  const combined = a.length + b.length;
  return combined === 0 ? 1.0 : 1 - indelDistance(a, b) / combined;
}
|
|
187
|
+
|
|
188
|
+
/**
 * Token sort ratio, modelled on `rapidfuzz.fuzz.token_sort_ratio`.
 *
 * Steps:
 *   1. Lowercase both strings.
 *   2. Replace every character that is not a Unicode letter, combining mark,
 *      digit or whitespace with a space.
 *   3. Split on whitespace, drop empties, sort tokens, rejoin with a single
 *      space.
 *   4. Compare via Indel normalized similarity (NOT Levenshtein).
 *
 * Step 2 is Unicode-aware on purpose. An ASCII-only class (`[^a-z0-9\s]`)
 * erases non-Latin text entirely, so any two strings written in such a
 * script both normalize to "" and score a perfect 1.0; it also breaks
 * accented words into fragments ("müller" -> "m", "ller"). ASCII input is
 * normalized exactly as before.
 *
 * Example: ("John Smith", "Smith Johnson") normalizes to "john smith" vs
 * "johnson smith"; the Indel distance is 3 over a combined length of 23,
 * giving 20/23 ~= 0.8696.
 *
 * @returns similarity in [0, 1]; 1.0 when both normalized strings are equal
 *          (including both empty).
 */
export function tokenSortRatio(a: string, b: string): number {
  const normalize = (s: string): string =>
    s
      .toLowerCase()
      // \p{M} keeps combining marks attached to their base letter, so
      // decomposed accents do not split a word into separate tokens.
      .replace(/[^\p{L}\p{M}\p{N}\s]/gu, " ")
      .trim()
      .split(/\s+/)
      .filter(Boolean)
      .sort()
      .join(" ");
  return indelSimilarity(normalize(a), normalize(b));
}
|
|
211
|
+
|
|
212
|
+
/**
 * Binary phonetic comparison: 1.0 when both strings produce the same
 * soundex code, otherwise 0.0.
 */
export function soundexMatch(a: string, b: string): number {
  const codeA = soundex(a);
  const codeB = soundex(b);
  if (codeA === codeB) return 1.0;
  return 0.0;
}
|
|
218
|
+
|
|
219
|
+
// ---------------------------------------------------------------------------
|
|
220
|
+
// Bloom filter / PPRL scorers
|
|
221
|
+
// ---------------------------------------------------------------------------
|
|
222
|
+
|
|
223
|
+
/**
 * Decode a hex string into bytes, two hex digits per byte.
 *
 * A trailing unpaired digit is ignored. A pair that `parseInt` cannot read
 * yields NaN, which the typed array stores as 0.
 */
function hexToBytes(hex: string): Uint8Array {
  const byteCount = hex.length >>> 1;
  const out = new Uint8Array(byteCount);
  for (let idx = 0, offset = 0; idx < byteCount; idx++, offset += 2) {
    out[idx] = parseInt(hex.slice(offset, offset + 2), 16);
  }
  return out;
}
|
|
232
|
+
|
|
233
|
+
/** Total number of 1-bits across every byte of the array. */
function popcount(bytes: Uint8Array): number {
  let total = 0;
  for (let idx = 0; idx < bytes.length; idx++) {
    // Peel off the lowest bit until the byte is exhausted.
    for (let rest = bytes[idx]!; rest !== 0; rest >>>= 1) {
      total += rest & 1;
    }
  }
  return total;
}
|
|
246
|
+
|
|
247
|
+
/**
 * Number of 1-bits in the bitwise AND of two byte arrays. Only the
 * overlapping prefix (the shorter length) is examined.
 */
function popcountAnd(a: Uint8Array, b: Uint8Array): number {
  const shared = Math.min(a.length, b.length);
  let total = 0;
  for (let idx = 0; idx < shared; idx++) {
    for (let bits = a[idx]! & b[idx]!; bits !== 0; bits >>>= 1) {
      total += bits & 1;
    }
  }
  return total;
}
|
|
260
|
+
|
|
261
|
+
/**
 * Number of 1-bits in the bitwise OR of two byte arrays. The shorter array
 * is treated as zero-padded up to the longer length.
 */
function popcountOr(a: Uint8Array, b: Uint8Array): number {
  const span = Math.max(a.length, b.length);
  let total = 0;
  for (let idx = 0; idx < span; idx++) {
    for (let bits = (a[idx] ?? 0) | (b[idx] ?? 0); bits !== 0; bits >>>= 1) {
      total += bits & 1;
    }
  }
  return total;
}
|
|
274
|
+
|
|
275
|
+
/**
 * Dice coefficient of two hex-encoded bloom filters:
 * `2 * |A AND B| / (|A| + |B|)`, where `|X|` is the number of set bits.
 * Returns 0 when neither filter has any bit set.
 */
export function diceCoefficient(a: string, b: string): number {
  const left = hexToBytes(a);
  const right = hexToBytes(b);
  const denominator = popcount(left) + popcount(right);
  if (denominator === 0) return 0.0;
  return (2 * popcountAnd(left, right)) / denominator;
}
|
|
289
|
+
|
|
290
|
+
/**
 * Jaccard similarity of two hex-encoded bloom filters:
 * set bits of (A AND B) divided by set bits of (A OR B).
 * Returns 0 when the union has no bits set.
 */
export function jaccardSimilarity(a: string, b: string): number {
  const left = hexToBytes(a);
  const right = hexToBytes(b);
  const unionBits = popcountOr(left, right);
  return unionBits === 0 ? 0.0 : popcountAnd(left, right) / unionBits;
}
|
|
302
|
+
|
|
303
|
+
// ---------------------------------------------------------------------------
|
|
304
|
+
// Ensemble scorer
|
|
305
|
+
// ---------------------------------------------------------------------------
|
|
306
|
+
|
|
307
|
+
/**
 * Ensemble scorer: the best of three signals for one pair of strings —
 * Jaro-Winkler, token sort ratio, and the soundex match scaled by 0.8.
 */
export function ensembleScore(a: string, b: string): number {
  return Math.max(
    jaroWinkler(a, b),
    tokenSortRatio(a, b),
    soundexMatch(a, b) * 0.8,
  );
}
|
|
317
|
+
|
|
318
|
+
// ---------------------------------------------------------------------------
|
|
319
|
+
// Public: scoreField
|
|
320
|
+
// ---------------------------------------------------------------------------
|
|
321
|
+
|
|
322
|
+
/**
 * Score two field values with the scorer identified by `scorer`.
 *
 * @returns `null` when either value is `null`; otherwise the scorer's
 *          result.
 * @throws Error when `scorer` is not one of the names handled here.
 */
export function scoreField(
  valA: string | null,
  valB: string | null,
  scorer: string,
): number | null {
  if (valA === null || valB === null) return null;

  if (scorer === "exact") return valA === valB ? 1.0 : 0.0;
  if (scorer === "jaro_winkler") return jaroWinkler(valA, valB);
  if (scorer === "levenshtein") return levenshteinSimilarity(valA, valB);
  if (scorer === "token_sort") return tokenSortRatio(valA, valB);
  if (scorer === "soundex_match") return soundexMatch(valA, valB);
  if (scorer === "dice") return diceCoefficient(valA, valB);
  if (scorer === "jaccard") return jaccardSimilarity(valA, valB);
  if (scorer === "ensemble") return ensembleScore(valA, valB);

  throw new Error(`Unknown scorer: ${JSON.stringify(scorer)}`);
}
|
|
354
|
+
|
|
355
|
+
// ---------------------------------------------------------------------------
|
|
356
|
+
// Public: scorePair
|
|
357
|
+
// ---------------------------------------------------------------------------
|
|
358
|
+
|
|
359
|
+
/**
 * Weighted average of per-field scores for a pair of rows.
 *
 * Each field's values are stringified, run through the field's transforms
 * and scored. Fields whose score is `null` contribute neither to the
 * numerator nor to the weight total. If no field contributes, the result
 * is 0.
 */
export function scorePair(
  rowA: Row,
  rowB: Row,
  fields: readonly MatchkeyField[],
): number {
  let numerator = 0;
  let denominator = 0;

  for (const spec of fields) {
    const left = applyTransforms(asString(rowA[spec.field]), spec.transforms);
    const right = applyTransforms(asString(rowB[spec.field]), spec.transforms);
    const score = scoreField(left, right, spec.scorer);
    if (score === null) continue;
    numerator += score * spec.weight;
    denominator += spec.weight;
  }

  if (denominator === 0) return 0;
  return numerator / denominator;
}
|
|
381
|
+
|
|
382
|
+
// ---------------------------------------------------------------------------
|
|
383
|
+
// NxN score matrix
|
|
384
|
+
// ---------------------------------------------------------------------------
|
|
385
|
+
|
|
386
|
+
/**
 * Pairwise score matrix for a list of values under one scorer.
 *
 * The result is symmetric with a zero diagonal; a pair whose score is
 * `null` (either value is null) is recorded as 0.
 */
export function scoreMatrix(
  values: (string | null)[],
  scorerName: string,
): number[][] {
  const size = values.length;
  const grid: number[][] = Array.from({ length: size }, () => new Array<number>(size).fill(0));

  // Only the upper triangle is computed; each score is mirrored below.
  for (let r = 0; r < size; r++) {
    for (let c = r + 1; c < size; c++) {
      const score = scoreField(values[r]!, values[c]!, scorerName);
      const cell = score === null ? 0 : score;
      grid[r]![c] = cell;
      grid[c]![r] = cell;
    }
  }
  return grid;
}
|
|
405
|
+
|
|
406
|
+
// ---------------------------------------------------------------------------
|
|
407
|
+
// Exact score matrix (hash-based grouping, O(n))
|
|
408
|
+
// ---------------------------------------------------------------------------
|
|
409
|
+
|
|
410
|
+
/**
 * Pairwise exact-match matrix: 1.0 where two different positions hold the
 * same non-null value, 0 everywhere else (including the diagonal).
 *
 * Positions are bucketed by value first, so only equal values are paired.
 */
function exactScoreMatrix(values: (string | null)[]): number[][] {
  const size = values.length;
  const grid: number[][] = Array.from({ length: size }, () => new Array<number>(size).fill(0));

  // value -> positions holding that value (null/undefined are skipped)
  const buckets = new Map<string, number[]>();
  for (let idx = 0; idx < size; idx++) {
    const value = values[idx];
    if (value == null) continue;
    const bucket = buckets.get(value);
    if (bucket === undefined) {
      buckets.set(value, [idx]);
    } else {
      bucket.push(idx);
    }
  }

  buckets.forEach((positions) => {
    for (let p = 0; p < positions.length; p++) {
      const row = positions[p]!;
      for (let q = p + 1; q < positions.length; q++) {
        const col = positions[q]!;
        grid[row]![col] = 1.0;
        grid[col]![row] = 1.0;
      }
    }
  });

  return grid;
}
|
|
438
|
+
|
|
439
|
+
/**
 * Pairwise soundex matrix: each non-null value is replaced by its soundex
 * code, then positions with equal codes score 1.0 via the exact matrix.
 */
function soundexScoreMatrix(values: (string | null)[]): number[][] {
  const toCode = (value: string | null): string | null =>
    value === null ? null : soundex(value);
  return exactScoreMatrix(values.map((value) => toCode(value)));
}
|
|
444
|
+
|
|
445
|
+
/**
 * Pairwise ensemble matrix: for every pair, the maximum of Jaro-Winkler,
 * token sort ratio, and the soundex match scaled by 0.8.
 *
 * A pair with a null on either side gets no Jaro-Winkler or token-sort
 * contribution; its cell is whatever the soundex matrix holds for it
 * (scaled by 0.8).
 */
function ensembleScoreMatrix(values: (string | null)[]): number[][] {
  const size = values.length;
  const filled = values.map((v) => v ?? "");
  const phonetic = soundexScoreMatrix(values);
  const grid: number[][] = Array.from({ length: size }, () => new Array<number>(size).fill(0));

  for (let r = 0; r < size; r++) {
    for (let c = r + 1; c < size; c++) {
      let winkler = 0;
      let tokenSort = 0;
      if (values[r] !== null && values[c] !== null) {
        winkler = jaroWinkler(filled[r]!, filled[c]!);
        tokenSort = tokenSortRatio(filled[r]!, filled[c]!);
      }
      const best = Math.max(winkler, tokenSort, phonetic[r]![c]! * 0.8);
      grid[r]![c] = best;
      grid[c]![r] = best;
    }
  }
  return grid;
}
|
|
473
|
+
|
|
474
|
+
/**
 * Pairwise null mask: cell (i, j) is true when value i or value j is null,
 * so the whole row and column of a null value are flagged.
 */
function buildNullMask(values: (string | null)[]): boolean[][] {
  const size = values.length;
  return Array.from({ length: size }, (_row, r) =>
    Array.from({ length: size }, (_col, c) => values[r] === null || values[c] === null),
  );
}
|
|
490
|
+
|
|
491
|
+
/**
 * Choose the matrix builder for a scorer name: dedicated builders for
 * "exact", "soundex_match" and "ensemble", the generic pairwise
 * `scoreMatrix` for everything else.
 */
function buildScoreMatrix(values: (string | null)[], scorerName: string): number[][] {
  if (scorerName === "exact") return exactScoreMatrix(values);
  if (scorerName === "soundex_match") return soundexScoreMatrix(values);
  if (scorerName === "ensemble") return ensembleScoreMatrix(values);
  return scoreMatrix(values, scorerName);
}
|
|
506
|
+
|
|
507
|
+
// ---------------------------------------------------------------------------
|
|
508
|
+
// Get transformed values for a field from block rows
|
|
509
|
+
// ---------------------------------------------------------------------------
|
|
510
|
+
|
|
511
|
+
/**
 * For each row, read the column named by `field.field`, stringify it, and
 * run it through the field's transforms.
 */
function getTransformedValues(
  rows: readonly Row[],
  field: MatchkeyField,
): (string | null)[] {
  return rows.map((row) => applyTransforms(asString(row[field.field]), field.transforms));
}
|
|
520
|
+
|
|
521
|
+
// ---------------------------------------------------------------------------
|
|
522
|
+
// Public: findExactMatches
|
|
523
|
+
// ---------------------------------------------------------------------------
|
|
524
|
+
|
|
525
|
+
/**
 * Exact matching by composite key.
 *
 * For every row, the transformed values of all matchkey fields are joined
 * into one key. Rows that share a key are paired with score 1.0. A row is
 * left out entirely if any of its transformed fields is null.
 *
 * Row ids are read from each row's `__row_id__` field.
 */
export function findExactMatches(
  rows: readonly Row[],
  mk: MatchkeyConfig,
): ScoredPair[] {
  if (rows.length < 2) return [];

  // composite key -> ids of the rows that produced it
  const idsByKey = new Map<string, number[]>();

  for (let idx = 0; idx < rows.length; idx++) {
    const row = rows[idx]!;
    const id = row["__row_id__"] as number;

    const parts: (string | null)[] = [];
    let complete = true;
    for (const spec of mk.fields) {
      const value = applyTransforms(asString(row[spec.field]), spec.transforms);
      if (value === null) {
        // Nulls never match, so this row cannot join any group.
        complete = false;
        break;
      }
      parts.push(value);
    }
    if (!complete) continue;

    // NUL separator between field values.
    const compositeKey = parts.join("\x00");
    const bucket = idsByKey.get(compositeKey);
    if (bucket === undefined) {
      idsByKey.set(compositeKey, [id]);
    } else {
      bucket.push(id);
    }
  }

  // Every unordered pair inside a group is a match.
  const matches: ScoredPair[] = [];
  idsByKey.forEach((ids) => {
    for (let p = 0; p < ids.length - 1; p++) {
      for (let q = p + 1; q < ids.length; q++) {
        matches.push(makeScoredPair(ids[p]!, ids[q]!, 1.0));
      }
    }
  });
  return matches;
}
|
|
579
|
+
|
|
580
|
+
// ---------------------------------------------------------------------------
|
|
581
|
+
// Public: findFuzzyMatches
|
|
582
|
+
// ---------------------------------------------------------------------------
|
|
583
|
+
|
|
584
|
+
/**
 * Find fuzzy matches within a block of rows by scoring every unordered pair.
 *
 * A pair's score is the weighted average of its per-field scores. For a given
 * field, a pair flagged by `buildNullMask` contributes to neither the
 * numerator nor the denominator, so that field is ignored for the pair
 * rather than counted as a mismatch.
 *
 * Early termination:
 * - Cheap fields (`exact`, `soundex_match`) are scored first.
 * - A pair is ruled out when it could not reach the threshold even with a
 *   perfect score on every fuzzy field.
 * - After each fuzzy field, scoring stops as soon as no surviving pair can
 *   still reach the threshold.
 *
 * Fields whose scorer is `record_embedding` are not scored here (their weight
 * only participates in the "all weights are zero" guard).
 *
 * Memory: the matrix path allocates flat `n * n` typed arrays (two for cheap
 * fields, three more when fuzzy fields exist), where `n = rows.length`.
 *
 * @param rows - Rows of a single block. Each row must have a `__row_id__` field.
 * @param mk - Matchkey configuration. The threshold is `1.0` for type
 *   `"exact"`, otherwise `mk.threshold` with a default of `0.85`.
 * @param excludePairs - Pair keys (as produced by `pairKey`) to leave out of
 *   the result. Only read, never mutated.
 * @param preScoredPairs - When provided, `rows` is not scored at all; these
 *   pairs are only filtered by threshold and `excludePairs`.
 * @returns Pairs that are not below the threshold, built with
 *   `makeScoredPair` with the smaller row id first. Matrix-path results are
 *   in upper-triangle order of `rows`; fast-path results keep input order.
 */
export function findFuzzyMatches(
  rows: readonly Row[],
  mk: MatchkeyConfig,
  excludePairs?: ReadonlySet<PairKey>,
  preScoredPairs?: readonly ScoredPair[],
): ScoredPair[] {
  // findFuzzyMatches only runs for weighted/probabilistic matchkeys
  // (exact is handled via findExactMatches). Exact has no threshold.
  const threshold = mk.type === "exact" ? 1.0 : (mk.threshold ?? 0.85);
  const results: ScoredPair[] = [];

  // Shared tail of both paths: canonicalize the id order, honor the exclusion
  // set, and record the pair. Only called for pairs that passed the threshold.
  const emit = (rawA: number, rawB: number, score: number): void => {
    const idA = Math.min(rawA, rawB);
    const idB = Math.max(rawA, rawB);
    const key = pairKey(idA, idB);
    if (excludePairs !== undefined && excludePairs.has(key)) return;
    results.push(makeScoredPair(idA, idB, score));
  };

  // Fast path: pre-scored pairs (from ANN blocking)
  if (preScoredPairs !== undefined) {
    for (const p of preScoredPairs) {
      // Written as "skip if below" (not "keep if >=") so that a NaN score is
      // treated exactly as before.
      if (p.score < threshold) continue;
      emit(p.idA, p.idB, p.score);
    }
    return results;
  }

  const n = rows.length;
  if (n < 2) return results;

  const rowIds = rows.map((r) => r["__row_id__"] as number);

  // Separate cheap (exact + soundex) from expensive (fuzzy) fields
  const cheapFields = mk.fields.filter(
    (f) => f.scorer === "exact" || f.scorer === "soundex_match",
  );
  const fuzzyFields = mk.fields.filter(
    (f) =>
      f.scorer !== "exact" &&
      f.scorer !== "soundex_match" &&
      f.scorer !== "record_embedding",
  );

  const totalWeight = mk.fields.reduce((sum, f) => sum + f.weight, 0);
  if (totalWeight === 0) return results;

  // All per-pair state lives in flat row-major buffers indexed by `i * n + j`.
  // Only the upper triangle (i < j) is ever written or read.
  const cells = n * n;

  // Phase 1: score cheap fields.
  // cheapNum[k] = sum(fieldScore * weight) over non-null cheap fields
  // cheapDen[k] = sum(weight) over non-null cheap fields
  const cheapNum = new Float64Array(cells);
  const cheapDen = new Float64Array(cells);

  for (const f of cheapFields) {
    const values = getTransformedValues(rows, f);
    const nullMask = buildNullMask(values);
    const scores =
      f.scorer === "exact"
        ? exactScoreMatrix(values)
        : soundexScoreMatrix(values);
    const weight = f.weight;

    for (let i = 0; i < n; i++) {
      const maskRow = nullMask[i]!;
      const scoreRow = scores[i]!;
      const base = i * n;
      for (let j = i + 1; j < n; j++) {
        if (maskRow[j]!) continue;
        const k = base + j;
        cheapNum[k]! += scoreRow[j]! * weight;
        cheapDen[k]! += weight;
      }
    }
  }

  // Fuzzy-side buffers are only needed when there is at least one fuzzy field.
  const hasFuzzy = fuzzyFields.length > 0;
  const fuzzyNum = new Float64Array(hasFuzzy ? cells : 0);
  const fuzzyDen = new Float64Array(hasFuzzy ? cells : 0);
  // ruledOut[k] === 1 marks a pair that cannot reach the threshold.
  const ruledOut = new Uint8Array(hasFuzzy ? cells : 0);

  if (hasFuzzy) {
    // Phase 2: rule out pairs that cannot reach the threshold even if every
    // fuzzy field scored a perfect 1.0.
    const fuzzyTotalWeight = fuzzyFields.reduce((sum, f) => sum + f.weight, 0);

    for (let i = 0; i < n; i++) {
      const base = i * n;
      for (let j = i + 1; j < n; j++) {
        const k = base + j;
        const maxNum = cheapNum[k]! + fuzzyTotalWeight;
        const maxDen = cheapDen[k]! + fuzzyTotalWeight;
        const maxPossible = maxDen > 0 ? maxNum / maxDen : 0;
        if (maxPossible < threshold) ruledOut[k] = 1;
      }
    }

    // Phase 3: score fuzzy fields one at a time, stopping early when possible.
    for (let fIdx = 0; fIdx < fuzzyFields.length; fIdx++) {
      const f = fuzzyFields[fIdx]!;
      const values = getTransformedValues(rows, f);
      const nullMask = buildNullMask(values);
      const scores = buildScoreMatrix(values, f.scorer);
      const weight = f.weight;

      for (let i = 0; i < n; i++) {
        const maskRow = nullMask[i]!;
        const scoreRow = scores[i]!;
        const base = i * n;
        for (let j = i + 1; j < n; j++) {
          if (maskRow[j]!) continue;
          const k = base + j;
          fuzzyNum[k]! += scoreRow[j]! * weight;
          fuzzyDen[k]! += weight;
        }
      }

      // Weight of the fuzzy fields that have not been scored yet.
      const remainingWeight = fuzzyFields
        .slice(fIdx + 1)
        .reduce((sum, ff) => sum + ff.weight, 0);

      if (remainingWeight > 0) {
        // Look for any surviving pair that could still reach the threshold
        // if all remaining fields scored 1.0; one is enough to keep going.
        let anyCanReach = false;
        scan: for (let i = 0; i < n; i++) {
          const base = i * n;
          for (let j = i + 1; j < n; j++) {
            const k = base + j;
            if (ruledOut[k] !== 0) continue;
            const totalNum = cheapNum[k]! + fuzzyNum[k]! + remainingWeight;
            const totalDen = cheapDen[k]! + fuzzyDen[k]! + remainingWeight;
            const bestPossible = totalDen > 0 ? totalNum / totalDen : 0;
            if (bestPossible >= threshold) {
              anyCanReach = true;
              break scan;
            }
          }
        }
        if (!anyCanReach) break; // No pair can reach threshold — skip remaining fields
      }
    }
  }

  // Combine cheap + fuzzy and extract upper-triangle pairs that pass.
  for (let i = 0; i < n; i++) {
    const base = i * n;
    for (let j = i + 1; j < n; j++) {
      const k = base + j;
      // Ruled-out pairs and pairs with no scorable field score 0.
      let score = 0;
      if (!hasFuzzy) {
        const den = cheapDen[k]!;
        if (den > 0) score = cheapNum[k]! / den;
      } else if (ruledOut[k] === 0) {
        const den = cheapDen[k]! + fuzzyDen[k]!;
        if (den > 0) score = (cheapNum[k]! + fuzzyNum[k]!) / den;
      }
      if (score < threshold) continue;
      emit(rowIds[i]!, rowIds[j]!, score);
    }
  }
  return results;
}
|
|
770
|
+
|
|
771
|
+
// ---------------------------------------------------------------------------
|
|
772
|
+
// Public: scoreBlocksSequential
|
|
773
|
+
// ---------------------------------------------------------------------------
|
|
774
|
+
|
|
775
|
+
/**
 * Optional pair filters applied by `scoreBlocksSequential` after a block has
 * been scored. Every field is optional; omitting all of them disables
 * filtering.
 */
export interface ScoreBlocksOptions {
  /**
   * When `true`, keep only pairs whose two rows come from different sources.
   * Has an effect only if `sourceLookup` is also supplied. Defaults to `false`.
   */
  readonly acrossFilesOnly?: boolean;
  /**
   * Maps a row's `__row_id__` to the name of the source it came from.
   * Consulted only when `acrossFilesOnly` is `true`.
   */
  readonly sourceLookup?: ReadonlyMap<number, string>;
  /**
   * Row ids of the target dataset (match mode). When supplied, a pair is kept
   * only if exactly one of its two ids is in this set.
   */
  readonly targetIds?: ReadonlySet<number>;
}
|
|
783
|
+
|
|
784
|
+
/**
 * Score all blocks one after another and collect the matching pairs.
 *
 * Blocks are processed strictly in order on the calling thread. Pairs
 * accepted from earlier blocks are added to `matchedPairs`, so later blocks
 * do not report them again. Callers that want concurrency (for example web
 * workers) can partition `blocks` themselves and call this per partition.
 *
 * Filtering, applied per block:
 * - With `acrossFilesOnly` and a `sourceLookup`, a block is skipped unless
 *   its rows resolve to at least two distinct sources, and a pair is dropped
 *   when both ids resolve to the same source (including both being absent
 *   from the lookup).
 * - With `targetIds`, a pair is kept only if exactly one of its ids is in
 *   the set.
 *
 * @param blocks - Blocks to score; each supplies `rows` and, optionally,
 *   `preScoredPairs`.
 * @param mk - Matchkey configuration forwarded to `findFuzzyMatches`.
 * @param matchedPairs - Keys of pairs that are already matched. Used as the
 *   exclusion set and **mutated in place**: the key of every returned pair is
 *   added. Pairs removed by the filters above are not added.
 * @param options - Optional cross-source / target filters.
 * @returns All accepted pairs, in block order.
 */
export function scoreBlocksSequential(
  blocks: readonly BlockResult[],
  mk: MatchkeyConfig,
  matchedPairs: Set<PairKey>,
  options?: ScoreBlocksOptions,
): ScoredPair[] {
  const targetIds = options?.targetIds;
  // Cross-source filtering needs both the flag and the lookup; fold the two
  // conditions into a single "lookup or undefined" value.
  const crossSources =
    (options?.acrossFilesOnly ?? false) ? options?.sourceLookup : undefined;

  const allPairs: ScoredPair[] = [];

  for (const block of blocks) {
    // For cross-file mode, skip blocks whose rows all come from one source:
    // they cannot produce a cross-source pair. Stop at the second distinct
    // source instead of collecting every source name.
    if (crossSources !== undefined) {
      let firstSource: string | undefined;
      let spansSources = false;
      for (const row of block.rows) {
        const src = crossSources.get(row["__row_id__"] as number);
        if (src === undefined) continue;
        if (firstSource === undefined) {
          firstSource = src;
        } else if (src !== firstSource) {
          spansSources = true;
          break;
        }
      }
      if (!spansSources) continue;
    }

    // The live set is passed directly. findFuzzyMatches only reads it during
    // this synchronous call, and it is mutated only afterwards (below), so a
    // per-block copy would cost O(|matchedPairs|) without changing results.
    const pairs = findFuzzyMatches(
      block.rows,
      mk,
      matchedPairs,
      block.preScoredPairs,
    );

    for (const p of pairs) {
      // Cross-file filter: drop pairs whose ids resolve to the same source.
      if (
        crossSources !== undefined &&
        crossSources.get(p.idA) === crossSources.get(p.idB)
      ) {
        continue;
      }
      // Target/ref cross filter for match mode: exactly one side is a target.
      if (
        targetIds !== undefined &&
        targetIds.has(p.idA) === targetIds.has(p.idB)
      ) {
        continue;
      }
      allPairs.push(p);
      matchedPairs.add(pairKey(p.idA, p.idB));
    }
  }

  return allPairs;
}
|
|
849
|
+
|
|
850
|
+
// ---------------------------------------------------------------------------
|
|
851
|
+
// Utility: canonicalize pair key
|
|
852
|
+
// ---------------------------------------------------------------------------
|
|
853
|
+
|
|
854
|
+
// Re-export pairKey from cluster.ts — single canonical source of truth.
|
|
855
|
+
export { pairKey };
|