goldenmatch 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162) hide show
  1. package/README.md +140 -0
  2. package/dist/cli.cjs +6079 -0
  3. package/dist/cli.cjs.map +1 -0
  4. package/dist/cli.d.cts +1 -0
  5. package/dist/cli.d.ts +1 -0
  6. package/dist/cli.js +6076 -0
  7. package/dist/cli.js.map +1 -0
  8. package/dist/core/index.cjs +8449 -0
  9. package/dist/core/index.cjs.map +1 -0
  10. package/dist/core/index.d.cts +1972 -0
  11. package/dist/core/index.d.ts +1972 -0
  12. package/dist/core/index.js +8318 -0
  13. package/dist/core/index.js.map +1 -0
  14. package/dist/index.cjs +8449 -0
  15. package/dist/index.cjs.map +1 -0
  16. package/dist/index.d.cts +2 -0
  17. package/dist/index.d.ts +2 -0
  18. package/dist/index.js +8318 -0
  19. package/dist/index.js.map +1 -0
  20. package/dist/node/backends/score-worker.cjs +934 -0
  21. package/dist/node/backends/score-worker.cjs.map +1 -0
  22. package/dist/node/backends/score-worker.d.cts +14 -0
  23. package/dist/node/backends/score-worker.d.ts +14 -0
  24. package/dist/node/backends/score-worker.js +932 -0
  25. package/dist/node/backends/score-worker.js.map +1 -0
  26. package/dist/node/index.cjs +11430 -0
  27. package/dist/node/index.cjs.map +1 -0
  28. package/dist/node/index.d.cts +554 -0
  29. package/dist/node/index.d.ts +554 -0
  30. package/dist/node/index.js +11277 -0
  31. package/dist/node/index.js.map +1 -0
  32. package/dist/types-DhUdX5Rc.d.cts +304 -0
  33. package/dist/types-DhUdX5Rc.d.ts +304 -0
  34. package/examples/01-basic-dedupe.ts +60 -0
  35. package/examples/02-match-two-datasets.ts +48 -0
  36. package/examples/03-csv-file-pipeline.ts +62 -0
  37. package/examples/04-string-scoring.ts +63 -0
  38. package/examples/05-custom-config.ts +94 -0
  39. package/examples/06-probabilistic-fs.ts +72 -0
  40. package/examples/07-pprl-privacy.ts +76 -0
  41. package/examples/08-streaming.ts +79 -0
  42. package/examples/09-llm-scorer.ts +79 -0
  43. package/examples/10-explain.ts +60 -0
  44. package/examples/11-evaluate.ts +61 -0
  45. package/examples/README.md +53 -0
  46. package/package.json +66 -0
  47. package/src/cli.ts +372 -0
  48. package/src/core/ann-blocker.ts +593 -0
  49. package/src/core/api.ts +220 -0
  50. package/src/core/autoconfig.ts +363 -0
  51. package/src/core/autofix.ts +102 -0
  52. package/src/core/blocker.ts +655 -0
  53. package/src/core/cluster.ts +699 -0
  54. package/src/core/compare-clusters.ts +176 -0
  55. package/src/core/config/loader.ts +869 -0
  56. package/src/core/cross-encoder.ts +614 -0
  57. package/src/core/data.ts +430 -0
  58. package/src/core/domain.ts +277 -0
  59. package/src/core/embedder.ts +562 -0
  60. package/src/core/evaluate.ts +156 -0
  61. package/src/core/explain.ts +352 -0
  62. package/src/core/golden.ts +524 -0
  63. package/src/core/graph-er.ts +371 -0
  64. package/src/core/index.ts +314 -0
  65. package/src/core/ingest.ts +112 -0
  66. package/src/core/learned-blocking.ts +305 -0
  67. package/src/core/lineage.ts +221 -0
  68. package/src/core/llm/budget.ts +258 -0
  69. package/src/core/llm/cluster.ts +542 -0
  70. package/src/core/llm/scorer.ts +396 -0
  71. package/src/core/match-one.ts +95 -0
  72. package/src/core/matchkey.ts +97 -0
  73. package/src/core/memory/corrections.ts +179 -0
  74. package/src/core/memory/learner.ts +218 -0
  75. package/src/core/memory/store.ts +114 -0
  76. package/src/core/pipeline.ts +366 -0
  77. package/src/core/pprl/protocol.ts +216 -0
  78. package/src/core/probabilistic.ts +511 -0
  79. package/src/core/profiler.ts +212 -0
  80. package/src/core/quality.ts +197 -0
  81. package/src/core/review-queue.ts +177 -0
  82. package/src/core/scorer.ts +855 -0
  83. package/src/core/sensitivity.ts +196 -0
  84. package/src/core/standardize.ts +279 -0
  85. package/src/core/streaming.ts +128 -0
  86. package/src/core/transforms.ts +599 -0
  87. package/src/core/types.ts +570 -0
  88. package/src/core/validate.ts +243 -0
  89. package/src/index.ts +8 -0
  90. package/src/node/a2a/server.ts +470 -0
  91. package/src/node/api/server.ts +412 -0
  92. package/src/node/backends/duckdb.ts +130 -0
  93. package/src/node/backends/score-worker.ts +41 -0
  94. package/src/node/backends/workers.ts +212 -0
  95. package/src/node/config-file.ts +66 -0
  96. package/src/node/connectors/base.ts +57 -0
  97. package/src/node/connectors/bigquery.ts +61 -0
  98. package/src/node/connectors/databricks.ts +69 -0
  99. package/src/node/connectors/file.ts +350 -0
  100. package/src/node/connectors/hubspot.ts +62 -0
  101. package/src/node/connectors/index.ts +43 -0
  102. package/src/node/connectors/salesforce.ts +93 -0
  103. package/src/node/connectors/snowflake.ts +73 -0
  104. package/src/node/db/postgres.ts +173 -0
  105. package/src/node/db/sync.ts +103 -0
  106. package/src/node/dedupe-file.ts +156 -0
  107. package/src/node/index.ts +89 -0
  108. package/src/node/mcp/server.ts +940 -0
  109. package/src/node/tui/app.ts +756 -0
  110. package/src/node/tui/index.ts +6 -0
  111. package/src/node/tui/widgets.ts +128 -0
  112. package/tests/parity/scorer-ground-truth.test.ts +118 -0
  113. package/tests/smoke.test.ts +46 -0
  114. package/tests/unit/a2a-server.test.ts +175 -0
  115. package/tests/unit/ann-blocker.test.ts +117 -0
  116. package/tests/unit/api-server.test.ts +239 -0
  117. package/tests/unit/api.test.ts +77 -0
  118. package/tests/unit/autoconfig.test.ts +103 -0
  119. package/tests/unit/autofix.test.ts +71 -0
  120. package/tests/unit/blocker.test.ts +164 -0
  121. package/tests/unit/buildBlocksAsync.test.ts +63 -0
  122. package/tests/unit/cluster.test.ts +213 -0
  123. package/tests/unit/compare-clusters.test.ts +42 -0
  124. package/tests/unit/config-loader.test.ts +301 -0
  125. package/tests/unit/connectors-base.test.ts +48 -0
  126. package/tests/unit/cross-encoder-model.test.ts +198 -0
  127. package/tests/unit/cross-encoder.test.ts +173 -0
  128. package/tests/unit/db-connectors.test.ts +37 -0
  129. package/tests/unit/domain.test.ts +80 -0
  130. package/tests/unit/embedder.test.ts +151 -0
  131. package/tests/unit/evaluate.test.ts +85 -0
  132. package/tests/unit/explain.test.ts +73 -0
  133. package/tests/unit/golden.test.ts +97 -0
  134. package/tests/unit/graph-er.test.ts +173 -0
  135. package/tests/unit/hnsw-ann.test.ts +283 -0
  136. package/tests/unit/hubspot-connector.test.ts +118 -0
  137. package/tests/unit/ingest.test.ts +97 -0
  138. package/tests/unit/learned-blocking.test.ts +134 -0
  139. package/tests/unit/lineage.test.ts +135 -0
  140. package/tests/unit/match-one.test.ts +129 -0
  141. package/tests/unit/matchkey.test.ts +97 -0
  142. package/tests/unit/mcp-server.test.ts +183 -0
  143. package/tests/unit/memory.test.ts +119 -0
  144. package/tests/unit/pipeline.test.ts +118 -0
  145. package/tests/unit/pprl-protocol.test.ts +381 -0
  146. package/tests/unit/probabilistic.test.ts +494 -0
  147. package/tests/unit/profiler.test.ts +68 -0
  148. package/tests/unit/review-queue.test.ts +68 -0
  149. package/tests/unit/salesforce-connector.test.ts +148 -0
  150. package/tests/unit/scorer.test.ts +301 -0
  151. package/tests/unit/sensitivity.test.ts +154 -0
  152. package/tests/unit/standardize.test.ts +84 -0
  153. package/tests/unit/streaming.test.ts +82 -0
  154. package/tests/unit/transforms.test.ts +208 -0
  155. package/tests/unit/tui-widgets.test.ts +42 -0
  156. package/tests/unit/tui.test.ts +24 -0
  157. package/tests/unit/validate.test.ts +145 -0
  158. package/tests/unit/workers-parallel.test.ts +99 -0
  159. package/tests/unit/workers.test.ts +74 -0
  160. package/tsconfig.json +25 -0
  161. package/tsup.config.ts +37 -0
  162. package/vitest.config.ts +11 -0
@@ -0,0 +1,934 @@
1
+ 'use strict';
2
+
3
+ // src/core/types.ts
4
/**
 * Build a scored pair whose ids are stored in canonical order.
 *
 * @param {*} a - One record id.
 * @param {*} b - The other record id.
 * @param {number} score - Similarity score attached to the pair.
 * @returns {{idA: *, idB: *, score: number}} Pair with the smaller id
 *   (by `<`) in `idA` and the other in `idB`.
 */
function makeScoredPair(a, b, score) {
  if (a < b) {
    return { idA: a, idB: b, score };
  }
  return { idA: b, idB: a, score };
}
9
+
10
+ // src/core/cluster.ts
11
/**
 * Build an order-independent string key for a pair of ids.
 *
 * @param {*} a - One id.
 * @param {*} b - The other id.
 * @returns {string} `"<lo>:<hi>"`, where `lo` is the smaller id by `<`.
 */
function pairKey(a, b) {
  const [first, second] = a < b ? [a, b] : [b, a];
  return `${first}:${second}`;
}
16
+
17
+ // src/core/transforms.ts
18
/**
 * Apply a single named transform to a field value.
 *
 * Transforms with arguments (`substring:…`, `qgram:…`, `bloom_filter…`)
 * are delegated to their dedicated helpers. Unknown transform names
 * return the value unchanged.
 *
 * @param {string|null} value - Input value; `null` short-circuits to `null`.
 * @param {string} transform - Transform name, optionally with `:`-separated arguments.
 * @returns {string|null} The transformed value.
 */
function applyTransform(value, transform) {
  if (value === null) {
    return null;
  }

  // Parameterised transforms are recognised by prefix.
  if (transform.startsWith("substring:")) {
    return applySubstring(value, transform);
  }
  if (transform.startsWith("qgram:")) {
    return applyQgram(value, transform);
  }
  if (transform.startsWith("bloom_filter")) {
    return applyBloomFilter(value, transform);
  }

  const tokens = () => value.trim().split(/\s+/);

  if (transform === "lowercase") return value.toLowerCase();
  if (transform === "uppercase") return value.toUpperCase();
  if (transform === "strip") return value.trim();
  if (transform === "strip_all") return value.replace(/\s+/g, "");
  if (transform === "digits_only") return value.replace(/\D/g, "");
  if (transform === "alpha_only") return value.replace(/[^a-zA-Z]/g, "");
  if (transform === "normalize_whitespace") return value.trim().replace(/\s+/g, " ");
  if (transform === "token_sort") return tokens().sort().join(" ");
  if (transform === "first_token") return tokens()[0] ?? "";
  if (transform === "last_token") {
    const parts = tokens();
    return parts[parts.length - 1] ?? "";
  }
  if (transform === "soundex") return soundex(value);
  if (transform === "metaphone") return metaphone(value);

  // Unrecognised transform names are a no-op.
  return value;
}
54
/**
 * Run a sequence of transforms over a value, left to right.
 *
 * @param {string|null} value - Starting value.
 * @param {Iterable<string>} transforms - Transform names, applied in order.
 * @returns {string|null} The final value, or `null` as soon as any step
 *   produces `null`. With no transforms the input is returned unchanged.
 */
function applyTransforms(value, transforms) {
  let current = value;
  for (const name of transforms) {
    const next = applyTransform(current, name);
    if (next === null) {
      return null;
    }
    current = next;
  }
  return current;
}
62
/**
 * Apply a `substring:<start>[:<end>]` transform.
 *
 * @param {string} value - Input string.
 * @param {string} transform - Transform spec; `start` defaults to 0 and
 *   `end` is optional. Both are parsed as base-10 integers and passed
 *   to `String.prototype.slice`.
 * @returns {string} The selected slice of `value`.
 */
function applySubstring(value, transform) {
  const [, startText = "0", endText] = transform.split(":");
  const start = parseInt(startText, 10);
  const end = endText === undefined ? undefined : parseInt(endText, 10);
  return value.slice(start, end);
}
68
/**
 * Apply a `qgram:<n>` transform: split the value into overlapping
 * n-character grams, sort them, and join them with single spaces.
 *
 * The value is returned unchanged when `n` is not a positive integer
 * or when the value is shorter than `n`. A non-numeric `n` is treated
 * as invalid rather than being allowed to collapse every value to the
 * empty string.
 *
 * @param {string} value - Input string.
 * @param {string} transform - Transform spec; `n` defaults to 2.
 * @returns {string} Space-separated sorted q-grams, or `value` itself.
 */
function applyQgram(value, transform) {
  const n = Number.parseInt(transform.split(":")[1] ?? "2", 10);
  // NaN fails every comparison, so it must be rejected explicitly.
  if (Number.isNaN(n) || n <= 0 || value.length < n) {
    return value;
  }
  const grams = [];
  for (let i = 0; i + n <= value.length; i++) {
    grams.push(value.slice(i, i + n));
  }
  return grams.sort().join(" ");
}
78
// Soundex digit for each coded consonant. Letters absent from the map
// (vowels, H, W, Y) have no digit.
var SOUNDEX_MAP = Object.fromEntries(
  [
    ["BFPV", "1"],
    ["CGJKQSXZ", "2"],
    ["DT", "3"],
    ["L", "4"],
    ["MN", "5"],
    ["R", "6"]
  ].flatMap(([letters, digit]) => Array.from(letters, (letter) => [letter, digit]))
);
98
/**
 * Compute the 4-character Soundex code of a string.
 *
 * Non A-Z characters are discarded after upper-casing. The first letter
 * is kept verbatim, following consonants contribute their digit unless
 * it repeats the previous digit, and the result is zero-padded.
 * H and W do not break a run of equal digits; other uncoded letters do.
 *
 * @param {string} value - Input text.
 * @returns {string} Four-character code; `"0000"` when no letters remain.
 */
function soundex(value) {
  const letters = value.toUpperCase().replace(/[^A-Z]/g, "");
  if (letters === "") {
    return "0000";
  }

  const initial = letters[0];
  let encoded = initial;
  let previousDigit = SOUNDEX_MAP[initial] ?? "0";

  for (const letter of letters.slice(1)) {
    if (encoded.length >= 4) {
      break;
    }
    const digit = SOUNDEX_MAP[letter];
    if (!digit) {
      // Vowels (and Y) reset the run; H and W are transparent.
      if (letter !== "H" && letter !== "W") {
        previousDigit = "0";
      }
      continue;
    }
    if (digit !== previousDigit) {
      encoded += digit;
      previousDigit = digit;
    }
  }

  return encoded.padEnd(4, "0");
}
118
/**
 * Compute a simplified Metaphone code (at most 4 characters).
 *
 * The input is upper-cased and stripped to A-Z. A leading AE/GN/KN/PN/WR
 * loses its first letter and a trailing MB loses its B. Letters are then
 * encoded left to right until the word ends or the code reaches 4
 * characters; a letter equal to its predecessor is skipped unless it is C.
 *
 * Note: `"AEIOU".includes("")` and `"IEY".includes("")` are both true,
 * so the H, W/Y and DG rules treat "no neighbouring letter" as a match.
 * This mirrors the original code exactly and is preserved deliberately.
 *
 * @param {string} value - Input text.
 * @returns {string} The phonetic code; `""` when no letters remain.
 */
function metaphone(value) {
  const VOWELS = "AEIOU";
  const isSoftener = (c) => c === "I" || c === "E" || c === "Y";

  let letters = value.toUpperCase().replace(/[^A-Z]/g, "");
  if (letters === "") {
    return "";
  }

  if (["AE", "GN", "KN", "PN", "WR"].some((prefix) => letters.startsWith(prefix))) {
    letters = letters.slice(1);
  }
  if (letters.endsWith("MB")) {
    letters = letters.slice(0, -1);
  }

  let encoded = "";
  let pos = 0;
  while (pos < letters.length && encoded.length < 4) {
    const cur = letters[pos];
    const following = letters[pos + 1] ?? "";
    const afterNext = letters[pos + 2] ?? "";
    const previous = pos > 0 ? letters[pos - 1] : "";
    // Number of letters consumed by this step (digraphs consume more).
    let advance = 1;

    if (cur !== previous || cur === "C") {
      switch (cur) {
        case "A":
        case "E":
        case "I":
        case "O":
        case "U":
          // Vowels are only kept when they start the word.
          if (pos === 0) encoded += cur;
          break;

        case "B":
        case "F":
        case "J":
        case "L":
        case "M":
        case "N":
        case "R":
          encoded += cur;
          break;

        case "C":
          encoded += isSoftener(following) ? "S" : "K";
          break;

        case "D":
          encoded += following === "G" && "IEY".includes(afterNext) ? "J" : "T";
          break;

        case "G":
          if (following === "H" && pos + 2 < letters.length && !VOWELS.includes(afterNext)) {
            // GH before a non-vowel: both letters are skipped.
            advance = 2;
          } else if (pos > 0 && following === "N") {
            // Non-initial G before N is silent.
          } else if (previous === "G") {
            // Second G of a pair is silent.
          } else {
            encoded += isSoftener(following) ? "J" : "K";
          }
          break;

        case "H":
          if (VOWELS.includes(following) && !VOWELS.includes(previous)) {
            encoded += "H";
          }
          break;

        case "K":
          if (previous !== "C") encoded += "K";
          break;

        case "P":
          if (following === "H") {
            encoded += "F";
            advance = 2;
          } else {
            encoded += "P";
          }
          break;

        case "Q":
          encoded += "K";
          break;

        case "S":
          if (following === "H" || (following === "I" && (afterNext === "O" || afterNext === "A"))) {
            encoded += "X";
            advance = 2;
          } else if (following === "C" && afterNext === "H") {
            encoded += "SK";
            advance = 3;
          } else {
            encoded += "S";
          }
          break;

        case "T":
          if (following === "H") {
            encoded += "0";
            advance = 2;
          } else if (following === "I" && (afterNext === "O" || afterNext === "A")) {
            encoded += "X";
          } else {
            encoded += "T";
          }
          break;

        case "V":
          encoded += "F";
          break;

        case "W":
        case "Y":
          if (VOWELS.includes(following)) encoded += cur;
          break;

        case "X":
          encoded += "KS";
          break;

        case "Z":
          encoded += "S";
          break;
      }
    }

    pos += advance;
  }

  // Two-character emissions (KS, SK) can overshoot the limit.
  return encoded.slice(0, 4);
}
254
// Named Bloom-filter security levels: filter size in bits, number of
// hash functions (k), n-gram length, whether hashing is keyed (HMAC),
// and whether short inputs are padded with a derived salt ("balanced").
var BLOOM_PRESETS = {
  standard: {
    size: 512,
    k: 20,
    ngram: 2,
    hmac: false,
    balanced: false
  },
  high: {
    size: 1024,
    k: 30,
    ngram: 2,
    hmac: true,
    balanced: false
  },
  paranoid: {
    size: 2048,
    k: 40,
    ngram: 3,
    hmac: true,
    balanced: true
  }
};

// Parameters used when a bloom_filter transform names no preset.
var BLOOM_DEFAULTS = {
  size: 1024,
  k: 20,
  ngram: 2
};

// HMAC key used by keyed presets when the transform supplies none.
var BLOOM_DEFAULT_HMAC_KEY = "default_field_key";
261
/**
 * Encode a value as a hex-encoded Bloom filter over its character n-grams.
 *
 * Accepted transform forms:
 *   - `bloom_filter`                       → BLOOM_DEFAULTS, unkeyed
 *   - `bloom_filter:<preset>[:<hmacKey>]`  → a BLOOM_PRESETS entry
 *   - `bloom_filter:<ngram>:<k>:<size>[:<hmacKey>]` → explicit parameters
 *
 * The value is lower-cased and trimmed, right-padded with `_` up to the
 * n-gram length, and (for "balanced" presets) inputs shorter than 8
 * characters get an 8-hex-digit salt derived from their own SHA-256.
 * Each n-gram sets `k` bits chosen by SHA-256 (or HMAC-SHA-256 when keyed).
 *
 * Fixes relative to the previous version:
 *   - preset lookup uses an own-property check, so names inherited from
 *     Object.prototype (e.g. `constructor`) are no longer taken as presets;
 *   - the byte buffer is sized with `ceil(size / 8)`, so bit positions in
 *     the last partial byte are kept when `size` is not a multiple of 8.
 *
 * @param {string} value - Input string.
 * @param {string} transform - Transform spec starting with `bloom_filter`.
 * @returns {string} Lower-case hex encoding of the filter bytes.
 */
function applyBloomFilter(value, transform) {
  let ngramSize = BLOOM_DEFAULTS.ngram;
  let numHashes = BLOOM_DEFAULTS.k;
  let filterSize = BLOOM_DEFAULTS.size;
  let hmacKey = null;
  let balanced = false;

  if (transform !== "bloom_filter") {
    const parts = transform.split(":");
    const maybeLevel = parts[1];
    const isPreset =
      Boolean(maybeLevel) && Object.prototype.hasOwnProperty.call(BLOOM_PRESETS, maybeLevel);
    if (isPreset) {
      const preset = BLOOM_PRESETS[maybeLevel];
      ngramSize = preset.ngram;
      numHashes = preset.k;
      filterSize = preset.size;
      balanced = preset.balanced;
      if (preset.hmac) {
        hmacKey = parts[2] && parts[2].length > 0 ? parts[2] : BLOOM_DEFAULT_HMAC_KEY;
      }
    } else {
      ngramSize = parseInt(parts[1] ?? String(BLOOM_DEFAULTS.ngram), 10);
      numHashes = parseInt(parts[2] ?? String(BLOOM_DEFAULTS.k), 10);
      filterSize = parseInt(parts[3] ?? String(BLOOM_DEFAULTS.size), 10);
      if (parts.length > 4 && parts[4].length > 0) {
        hmacKey = parts[4];
      }
    }
  }

  // Round up so every bit position in [0, filterSize) has a backing byte.
  const bits = new Uint8Array(Math.ceil(filterSize / 8));

  let padded = value.toLowerCase().trim();
  if (padded.length < ngramSize) {
    padded = padded.padEnd(ngramSize, "_");
  }
  if (balanced && padded.length < 8) {
    padded += sha256Hex(padded).slice(0, 8);
  }

  for (let i = 0; i <= padded.length - ngramSize; i++) {
    const ngram = padded.slice(i, i + ngramSize);
    for (let k = 0; k < numHashes; k++) {
      const hex = hmacKey ? hmacSha256Hex(`${hmacKey}:${k}`, ngram) : sha256Hex(`${k}:${ngram}`);
      const bitPos = modHexBigInt(hex, filterSize);
      bits[bitPos >> 3] |= 1 << (bitPos & 7);
    }
  }
  return hexEncode(bits);
}
311
/**
 * Reduce an arbitrarily long hexadecimal number modulo a small integer.
 *
 * @param {string} hex - Hex digits without a `0x` prefix.
 * @param {number} modulus - Integer modulus.
 * @returns {number} `hex mod modulus` as a plain number.
 */
function modHexBigInt(hex, modulus) {
  return Number(BigInt(`0x${hex}`) % BigInt(modulus));
}
317
/**
 * Encode a byte sequence as a lower-case hexadecimal string.
 *
 * @param {ArrayLike<number>} bytes - Byte values.
 * @returns {string} Two hex digits per byte, concatenated.
 */
function hexEncode(bytes) {
  let encoded = "";
  for (let idx = 0; idx < bytes.length; idx++) {
    encoded += bytes[idx].toString(16).padStart(2, "0");
  }
  return encoded;
}
324
// The 64 SHA-256 round constants, one per compression round.
var K256 = Uint32Array.of(
  1116352408, 1899447441, 3049323471, 3921009573,
  961987163, 1508970993, 2453635748, 2870763221,
  3624381080, 310598401, 607225278, 1426881987,
  1925078388, 2162078206, 2614888103, 3248222580,
  3835390401, 4022224774, 264347078, 604807628,
  770255983, 1249150122, 1555081692, 1996064986,
  2554220882, 2821834349, 2952996808, 3210313671,
  3336571891, 3584528711, 113926993, 338241895,
  666307205, 773529912, 1294757372, 1396182291,
  1695183700, 1986661051, 2177026350, 2456956037,
  2730485921, 2820302411, 3259730800, 3345764771,
  3516065817, 3600352804, 4094571909, 275423344,
  430227734, 506948616, 659060556, 883997877,
  958139571, 1322822218, 1537002063, 1747873779,
  1955562222, 2024104815, 2227730452, 2361852424,
  2428436474, 2756734187, 3204031479, 3329325298
);
390
/**
 * Rotate a 32-bit unsigned integer right by `n` bits.
 *
 * @param {number} x - 32-bit value.
 * @param {number} n - Rotation distance in bits.
 * @returns {number} The rotated value as an unsigned 32-bit number.
 */
function rotr32(x, n) {
  const low = x >>> n;
  const wrapped = x << (32 - n);
  return (low | wrapped) >>> 0;
}
393
/**
 * Encode a string as UTF-8 bytes.
 *
 * @param {string} input - Text to encode.
 * @returns {Uint8Array} The UTF-8 encoding of `input`.
 */
function utf8Encode(input) {
  const encoder = new TextEncoder();
  return encoder.encode(input);
}
396
/**
 * Compute the SHA-256 digest of a byte sequence.
 *
 * The message is copied into a buffer padded to a multiple of 64 bytes
 * (0x80 marker, zero fill, 64-bit big-endian bit length) and processed
 * one 64-byte block at a time.
 *
 * @param {Uint8Array} msg - Message bytes.
 * @returns {Uint8Array} The 32-byte digest.
 */
function sha256Bytes(msg) {
  // Initial hash state.
  const state = new Uint32Array([
    1779033703, 3144134277, 1013904242, 2773480762,
    1359893119, 2600822924, 528734635, 1541459225
  ]);

  const byteLength = msg.length;
  // Room for the message, the 0x80 marker and the 8-byte length,
  // rounded up to a whole number of 64-byte blocks.
  const paddedLength = ((byteLength + 9 + 63) >> 6) << 6;
  const buffer = new Uint8Array(paddedLength);
  buffer.set(msg);
  buffer[byteLength] = 128;

  const view = new DataView(buffer.buffer);
  const totalBits = byteLength * 8;
  view.setUint32(paddedLength - 8, Math.floor(totalBits / 4294967296), false);
  view.setUint32(paddedLength - 4, totalBits >>> 0, false);

  const schedule = new Uint32Array(64);
  for (let base = 0; base < paddedLength; base += 64) {
    // Message schedule: 16 words from the block, 48 derived words.
    for (let t = 0; t < 64; t++) {
      if (t < 16) {
        schedule[t] = view.getUint32(base + t * 4, false);
        continue;
      }
      const x = schedule[t - 15];
      const y = schedule[t - 2];
      const sigma0 = rotr32(x, 7) ^ rotr32(x, 18) ^ (x >>> 3);
      const sigma1 = rotr32(y, 17) ^ rotr32(y, 19) ^ (y >>> 10);
      schedule[t] = (schedule[t - 16] + sigma0 + schedule[t - 7] + sigma1) >>> 0;
    }

    let [a, b, c, d, e, f, g, h] = state;
    for (let t = 0; t < 64; t++) {
      const bigSigma1 = rotr32(e, 6) ^ rotr32(e, 11) ^ rotr32(e, 25);
      const choose = (e & f) ^ (~e & g);
      const temp1 = (h + bigSigma1 + choose + K256[t] + schedule[t]) >>> 0;
      const bigSigma0 = rotr32(a, 2) ^ rotr32(a, 13) ^ rotr32(a, 22);
      const majority = (a & b) ^ (a & c) ^ (b & c);
      const temp2 = (bigSigma0 + majority) >>> 0;
      h = g;
      g = f;
      f = e;
      e = (d + temp1) >>> 0;
      d = c;
      c = b;
      b = a;
      a = (temp1 + temp2) >>> 0;
    }

    // Fold the working variables back into the running state.
    [a, b, c, d, e, f, g, h].forEach((word, idx) => {
      state[idx] = (state[idx] + word) >>> 0;
    });
  }

  const digest = new Uint8Array(32);
  const digestView = new DataView(digest.buffer);
  state.forEach((word, idx) => {
    digestView.setUint32(idx * 4, word, false);
  });
  return digest;
}
462
/**
 * SHA-256 of a string's UTF-8 encoding, as lower-case hex.
 *
 * @param {string} input - Text to hash.
 * @returns {string} Hex-encoded digest.
 */
function sha256Hex(input) {
  const messageBytes = utf8Encode(input);
  const digest = sha256Bytes(messageBytes);
  return hexEncode(digest);
}
465
/**
 * HMAC-SHA-256 of a message under a string key, as lower-case hex.
 *
 * Key and message are UTF-8 encoded. Keys longer than the 64-byte block
 * are first replaced by their SHA-256 digest; shorter keys are
 * zero-padded to the block size.
 *
 * @param {string} key - Secret key.
 * @param {string} msg - Message to authenticate.
 * @returns {string} Hex-encoded 32-byte MAC.
 */
function hmacSha256Hex(key, msg) {
  const BLOCK_SIZE = 64;

  const rawKey = utf8Encode(key);
  const effectiveKey = rawKey.length > BLOCK_SIZE ? sha256Bytes(rawKey) : rawKey;
  const payload = utf8Encode(msg);

  // inner = (key XOR ipad) || message
  const innerInput = new Uint8Array(BLOCK_SIZE + payload.length);
  for (let i = 0; i < BLOCK_SIZE; i++) {
    innerInput[i] = (effectiveKey[i] ?? 0) ^ 54;
  }
  innerInput.set(payload, BLOCK_SIZE);
  const innerDigest = sha256Bytes(innerInput);

  // outer = (key XOR opad) || H(inner)
  const outerInput = new Uint8Array(BLOCK_SIZE + innerDigest.length);
  for (let i = 0; i < BLOCK_SIZE; i++) {
    outerInput[i] = (effectiveKey[i] ?? 0) ^ 92;
  }
  outerInput.set(innerDigest, BLOCK_SIZE);

  return hexEncode(sha256Bytes(outerInput));
}
489
+
490
+ // src/core/scorer.ts
491
/**
 * Coerce a raw cell value to a string, keeping missing values as `null`.
 *
 * @param {*} v - Raw value.
 * @returns {string|null} `null` for `null`/`undefined`, the value itself
 *   when already a string, otherwise `String(v)`.
 */
function asString(v) {
  if (v == null) {
    return null;
  }
  return typeof v === "string" ? v : String(v);
}
496
/**
 * Jaro similarity of two strings.
 *
 * Characters match when equal and no further apart than
 * `floor(max(|a|, |b|) / 2) - 1` positions (never negative). Each
 * character of `b` can be matched at most once.
 *
 * @param {string} a - First string.
 * @param {string} b - Second string.
 * @returns {number} Similarity in [0, 1]; 1 for identical strings, 0 when
 *   either string is empty (and they differ) or nothing matches.
 */
function jaro(a, b) {
  if (a === b) {
    return 1;
  }
  const lenA = a.length;
  const lenB = b.length;
  if (lenA === 0 || lenB === 0) {
    return 0;
  }

  const reach = Math.max(0, Math.floor(Math.max(lenA, lenB) / 2) - 1);
  const usedA = new Uint8Array(lenA);
  const usedB = new Uint8Array(lenB);
  let matches = 0;

  for (let i = 0; i < lenA; i++) {
    const last = Math.min(lenB - 1, i + reach);
    let j = Math.max(0, i - reach);
    // Advance to the first unused, equal character inside the window.
    while (j <= last && (usedB[j] !== 0 || a[i] !== b[j])) {
      j++;
    }
    if (j <= last) {
      usedA[i] = 1;
      usedB[j] = 1;
      matches++;
    }
  }
  if (matches === 0) {
    return 0;
  }

  // Walk both match sequences in order and count positions that disagree.
  let transpositions = 0;
  let cursor = 0;
  for (let i = 0; i < lenA; i++) {
    if (usedA[i] === 0) {
      continue;
    }
    while (usedB[cursor] === 0) {
      cursor++;
    }
    if (a[i] !== b[cursor]) {
      transpositions++;
    }
    cursor++;
  }

  return (matches / lenA + matches / lenB + (matches - transpositions / 2) / matches) / 3;
}
527
/**
 * Jaro-Winkler similarity: Jaro similarity boosted by the length of the
 * common prefix (at most 4 characters, scaling factor 0.1).
 *
 * @param {string} a - First string.
 * @param {string} b - Second string.
 * @returns {number} Similarity in [0, 1]; 0 when the Jaro similarity is 0.
 */
function jaroWinkler(a, b) {
  const base = jaro(a, b);
  if (base === 0) {
    return 0;
  }

  const limit = Math.min(4, a.length, b.length);
  let prefix = 0;
  while (prefix < limit && a[prefix] === b[prefix]) {
    prefix++;
  }

  return base + prefix * 0.1 * (1 - base);
}
538
/**
 * Levenshtein edit distance (insert, delete, substitute — each cost 1),
 * computed with two rolling rows.
 *
 * @param {string} a - First string.
 * @param {string} b - Second string.
 * @returns {number} Minimum number of edits turning `a` into `b`.
 */
function levenshteinDistance(a, b) {
  const rows = a.length;
  const cols = b.length;
  if (rows === 0) return cols;
  if (cols === 0) return rows;

  // `above` holds distances for the previous prefix of `a`.
  let above = Uint32Array.from({ length: cols + 1 }, (_, j) => j);
  let current = new Uint32Array(cols + 1);

  for (let i = 1; i <= rows; i++) {
    const charA = a[i - 1];
    current[0] = i;
    for (let j = 1; j <= cols; j++) {
      const deletion = above[j] + 1;
      const insertion = current[j - 1] + 1;
      const substitution = above[j - 1] + (charA === b[j - 1] ? 0 : 1);
      current[j] = Math.min(deletion, insertion, substitution);
    }
    [above, current] = [current, above];
  }
  return above[cols];
}
563
/**
 * Levenshtein distance normalised to a similarity in [0, 1] by the
 * length of the longer string.
 *
 * @param {string} a - First string.
 * @param {string} b - Second string.
 * @returns {number} `1 - distance / max(|a|, |b|)`; 1 for identical strings.
 */
function levenshteinSimilarity(a, b) {
  if (a === b) {
    return 1;
  }
  const longest = Math.max(a.length, b.length);
  return longest === 0 ? 1 : 1 - levenshteinDistance(a, b) / longest;
}
569
/**
 * Indel distance: the minimum number of single-character insertions and
 * deletions (no substitutions) needed to turn `a` into `b`.
 *
 * @param {string} a - First string.
 * @param {string} b - Second string.
 * @returns {number} The insert/delete edit distance.
 */
function indelDistance(a, b) {
  if (a === b) return 0;
  const rows = a.length;
  const cols = b.length;
  if (rows === 0) return cols;
  if (cols === 0) return rows;

  let above = Uint32Array.from({ length: cols + 1 }, (_, j) => j);
  let current = new Uint32Array(cols + 1);

  for (let i = 1; i <= rows; i++) {
    const codeA = a.charCodeAt(i - 1);
    current[0] = i;
    for (let j = 1; j <= cols; j++) {
      current[j] =
        codeA === b.charCodeAt(j - 1)
          ? above[j - 1]
          : Math.min(above[j] + 1, current[j - 1] + 1);
    }
    [above, current] = [current, above];
  }
  return above[cols];
}
591
/**
 * Indel distance normalised to a similarity in [0, 1] by the combined
 * length of both strings.
 *
 * @param {string} a - First string.
 * @param {string} b - Second string.
 * @returns {number} `1 - distance / (|a| + |b|)`; 1 when both are empty.
 */
function indelSimilarity(a, b) {
  const combinedLength = a.length + b.length;
  return combinedLength === 0 ? 1 : 1 - indelDistance(a, b) / combinedLength;
}
596
/**
 * Token-order-insensitive similarity.
 *
 * Each string is lower-cased, characters other than a-z, 0-9 and
 * whitespace become spaces, and the remaining tokens are sorted and
 * re-joined with single spaces; the two results are compared with
 * `indelSimilarity`.
 *
 * @param {string} a - First string.
 * @param {string} b - Second string.
 * @returns {number} Indel similarity of the canonicalised strings.
 */
function tokenSortRatio(a, b) {
  const canonical = (text) => {
    const cleaned = text.toLowerCase().replace(/[^a-z0-9\s]/g, " ").trim();
    const tokens = cleaned.split(/\s+/).filter(Boolean);
    tokens.sort();
    return tokens.join(" ");
  };
  return indelSimilarity(canonical(a), canonical(b));
}
600
/**
 * Binary phonetic match on Soundex codes.
 *
 * @param {string} a - First string.
 * @param {string} b - Second string.
 * @returns {number} 1 when both strings share a Soundex code, else 0.
 */
function soundexMatch(a, b) {
  const codeA = soundex(a);
  const codeB = soundex(b);
  if (codeA === codeB) {
    return 1;
  }
  return 0;
}
603
/**
 * Decode a hexadecimal string into bytes.
 *
 * A trailing unpaired digit is ignored, and a pair that does not parse
 * as hex decodes to 0 (NaN stored into a Uint8Array becomes 0).
 *
 * @param {string} hex - Hex digits, two per byte.
 * @returns {Uint8Array} Decoded bytes (`floor(hex.length / 2)` of them).
 */
function hexToBytes(hex) {
  const byteCount = hex.length >>> 1;
  return Uint8Array.from({ length: byteCount }, (_, idx) =>
    parseInt(hex.slice(idx * 2, idx * 2 + 2), 16)
  );
}
611
/**
 * Count the set bits across a byte sequence.
 *
 * @param {ArrayLike<number>} bytes - Byte values.
 * @returns {number} Total number of 1 bits.
 */
function popcount(bytes) {
  let total = 0;
  for (let idx = 0; idx < bytes.length; idx++) {
    // Clearing the lowest set bit once per iteration counts the bits.
    for (let rest = bytes[idx]; rest !== 0; rest &= rest - 1) {
      total++;
    }
  }
  return total;
}
622
/**
 * Count the bits set in both byte sequences (population count of the
 * bitwise AND), over the length of the shorter sequence.
 *
 * @param {ArrayLike<number>} a - First byte sequence.
 * @param {ArrayLike<number>} b - Second byte sequence.
 * @returns {number} Number of bit positions set in both.
 */
function popcountAnd(a, b) {
  const shared = Math.min(a.length, b.length);
  let total = 0;
  for (let idx = 0; idx < shared; idx++) {
    for (let rest = a[idx] & b[idx]; rest !== 0; rest &= rest - 1) {
      total++;
    }
  }
  return total;
}
634
/**
 * Count the bits set in either byte sequence (population count of the
 * bitwise OR). The shorter sequence is treated as zero-extended.
 *
 * @param {ArrayLike<number>} a - First byte sequence.
 * @param {ArrayLike<number>} b - Second byte sequence.
 * @returns {number} Number of bit positions set in at least one input.
 */
function popcountOr(a, b) {
  const span = Math.max(a.length, b.length);
  let total = 0;
  for (let idx = 0; idx < span; idx++) {
    const left = a[idx] ?? 0;
    const right = b[idx] ?? 0;
    for (let rest = left | right; rest !== 0; rest &= rest - 1) {
      total++;
    }
  }
  return total;
}
646
/**
 * Dice coefficient of two hex-encoded bit vectors:
 * `2 * |A ∩ B| / (|A| + |B|)`.
 *
 * @param {string} a - First bit vector, hex encoded.
 * @param {string} b - Second bit vector, hex encoded.
 * @returns {number} Coefficient in [0, 1]; 0 when neither has a set bit.
 */
function diceCoefficient(a, b) {
  const [left, right] = [hexToBytes(a), hexToBytes(b)];
  const combinedBits = popcount(left) + popcount(right);
  if (combinedBits === 0) {
    return 0;
  }
  return 2 * popcountAnd(left, right) / combinedBits;
}
656
/**
 * Jaccard similarity of two hex-encoded bit vectors: `|A ∩ B| / |A ∪ B|`.
 *
 * @param {string} a - First bit vector, hex encoded.
 * @param {string} b - Second bit vector, hex encoded.
 * @returns {number} Similarity in [0, 1]; 0 when neither has a set bit.
 */
function jaccardSimilarity(a, b) {
  const left = hexToBytes(a);
  const right = hexToBytes(b);
  const shared = popcountAnd(left, right);
  const either = popcountOr(left, right);
  return either === 0 ? 0 : shared / either;
}
664
/**
 * Ensemble similarity: the best of Jaro-Winkler, token-sort ratio, and
 * a Soundex match discounted to 0.8.
 *
 * @param {string} a - First string.
 * @param {string} b - Second string.
 * @returns {number} The maximum of the three component scores.
 */
function ensembleScore(a, b) {
  const candidates = [
    jaroWinkler(a, b),
    tokenSortRatio(a, b),
    soundexMatch(a, b) * 0.8
  ];
  return Math.max(...candidates);
}
670
/**
 * Score one pair of field values with the named scorer.
 *
 * @param {string|null} valA - First value.
 * @param {string|null} valB - Second value.
 * @param {string} scorer - One of `exact`, `jaro_winkler`, `levenshtein`,
 *   `token_sort`, `soundex_match`, `dice`, `jaccard`, `ensemble`.
 * @returns {number|null} The similarity, or `null` when either value is
 *   `null` (checked before the scorer name is examined).
 * @throws {Error} When `scorer` is not a recognised name.
 */
function scoreField(valA, valB, scorer) {
  if (valA === null || valB === null) {
    return null;
  }

  if (scorer === "exact") return valA === valB ? 1 : 0;
  if (scorer === "jaro_winkler") return jaroWinkler(valA, valB);
  if (scorer === "levenshtein") return levenshteinSimilarity(valA, valB);
  if (scorer === "token_sort") return tokenSortRatio(valA, valB);
  if (scorer === "soundex_match") return soundexMatch(valA, valB);
  if (scorer === "dice") return diceCoefficient(valA, valB);
  if (scorer === "jaccard") return jaccardSimilarity(valA, valB);
  if (scorer === "ensemble") return ensembleScore(valA, valB);

  throw new Error(`Unknown scorer: ${JSON.stringify(scorer)}`);
}
693
/**
 * Build a symmetric pairwise score matrix by calling `scoreField` on
 * every unordered pair of values.
 *
 * @param {Array<string|null>} values - Field values, one per row.
 * @param {string} scorerName - Scorer name passed to `scoreField`.
 * @returns {number[][]} n×n matrix; the diagonal and pairs that score
 *   `null` are 0.
 */
function scoreMatrix(values, scorerName) {
  const size = values.length;
  const matrix = Array.from({ length: size }, () => new Array(size).fill(0));
  for (let row = 0; row < size; row++) {
    for (let col = row + 1; col < size; col++) {
      const score = scoreField(values[row], values[col], scorerName) ?? 0;
      matrix[row][col] = score;
      matrix[col][row] = score;
    }
  }
  return matrix;
}
705
/**
 * Build a symmetric 0/1 matrix marking pairs of rows with equal values.
 *
 * Rows are grouped by value with a Map, so only rows that actually share
 * a value are paired. `null`/`undefined` values never match anything.
 *
 * @param {Array<*>} values - Field values, one per row.
 * @returns {number[][]} n×n matrix with 1 where two distinct rows hold
 *   the same value and 0 elsewhere (including the diagonal).
 */
function exactScoreMatrix(values) {
  const size = values.length;
  const matrix = Array.from({ length: size }, () => new Array(size).fill(0));

  const rowsByValue = new Map();
  for (let idx = 0; idx < size; idx++) {
    const v = values[idx];
    if (v == null) {
      continue;
    }
    const bucket = rowsByValue.get(v);
    if (bucket === undefined) {
      rowsByValue.set(v, [idx]);
    } else {
      bucket.push(idx);
    }
  }

  for (const members of rowsByValue.values()) {
    for (let p = 0; p < members.length; p++) {
      for (let q = p + 1; q < members.length; q++) {
        const first = members[p];
        const second = members[q];
        matrix[first][second] = 1;
        matrix[second][first] = 1;
      }
    }
  }
  return matrix;
}
732
/**
 * Build a symmetric 0/1 matrix marking pairs of rows whose values share
 * a Soundex code. `null` values stay `null` and therefore never match.
 *
 * @param {Array<string|null>} values - Field values, one per row.
 * @returns {number[][]} Exact-match matrix over the Soundex codes.
 */
function soundexScoreMatrix(values) {
  const codes = values.map((v) => {
    if (v === null) {
      return null;
    }
    return soundex(v);
  });
  return exactScoreMatrix(codes);
}
736
/**
 * Build a symmetric pairwise matrix of ensemble scores: for each pair,
 * the maximum of Jaro-Winkler, token-sort ratio, and 0.8 × the Soundex
 * match taken from `soundexScoreMatrix`.
 *
 * Pairs where either value is `null` skip the two string scorers (they
 * contribute 0), so only the Soundex matrix entry can be non-zero there.
 *
 * @param {Array<string|null>} values - Field values, one per row.
 * @returns {number[][]} n×n matrix with a zero diagonal.
 */
function ensembleScoreMatrix(values) {
  const size = values.length;
  const soundexScores = soundexScoreMatrix(values);
  const combined = Array.from({ length: size }, () => new Array(size).fill(0));

  for (let row = 0; row < size; row++) {
    for (let col = row + 1; col < size; col++) {
      let jw = 0;
      let ts = 0;
      if (values[row] !== null && values[col] !== null) {
        // `undefined` is scored as an empty string, matching the null-only skip above.
        const left = values[row] ?? "";
        const right = values[col] ?? "";
        jw = jaroWinkler(left, right);
        ts = tokenSortRatio(left, right);
      }
      const best = Math.max(jw, ts, soundexScores[row][col] * 0.8);
      combined[row][col] = best;
      combined[col][row] = best;
    }
  }
  return combined;
}
761
/**
 * Build a boolean matrix flagging every pair that involves a `null` value.
 *
 * @param {Array<*>} values - Field values, one per row.
 * @returns {boolean[][]} n×n matrix where entry [i][j] is `true` when
 *   `values[i]` or `values[j]` is `null` (strictly; `undefined` is not
 *   treated as missing here).
 */
function buildNullMask(values) {
  const size = values.length;
  const isMissing = (idx) => values[idx] === null;
  return Array.from({ length: size }, (_, row) =>
    Array.from({ length: size }, (_, col) => isMissing(row) || isMissing(col))
  );
}
774
/**
 * Build the pairwise score matrix for a scorer, using the specialised
 * builders for `exact`, `soundex_match` and `ensemble` and the generic
 * per-pair `scoreMatrix` for everything else.
 *
 * @param {Array<string|null>} values - Field values, one per row.
 * @param {string} scorerName - Scorer name.
 * @returns {number[][]} n×n score matrix.
 */
function buildScoreMatrix(values, scorerName) {
  if (scorerName === "exact") {
    return exactScoreMatrix(values);
  }
  if (scorerName === "soundex_match") {
    return soundexScoreMatrix(values);
  }
  if (scorerName === "ensemble") {
    return ensembleScoreMatrix(values);
  }
  return scoreMatrix(values, scorerName);
}
786
/**
 * Extract one field from every row, coerce it to a string (or `null`)
 * and run the field's transform chain over it.
 *
 * @param {Array<Object>} rows - Records keyed by column name.
 * @param {{field: string, transforms: Iterable<string>}} field - Field
 *   descriptor naming the column and its transforms.
 * @returns {Array<string|null>} One transformed value per row.
 */
function getTransformedValues(rows, field) {
  const transformRow = (row) => {
    const text = asString(row[field.field]);
    return applyTransforms(text, field.transforms);
  };
  return rows.map((row) => transformRow(row));
}
792
/**
 * Find all row pairs whose weighted field score reaches the match key's threshold.
 *
 * Two modes:
 *  - `preScoredPairs` supplied: `rows` is ignored and the given pairs are only
 *    filtered by threshold and by `excludePairs`.
 *  - otherwise: every `i < j` pair of `rows` is scored. Fields with the
 *    "exact" / "soundex_match" scorers are scored first; all remaining fields
 *    except "record_embedding" are scored afterwards via `buildScoreMatrix`.
 *    A pair's score is `sum(score * weight) / sum(weight)` over the fields
 *    where neither side is `null`; a pair with no usable field scores 0.
 *
 * Compared with the previous revision this version produces the same pairs in
 * the same order but avoids needless O(n^2) work: no `impossible` matrix when
 * there are no fuzzy fields, no intermediate `combined` matrix, no per-field
 * n x n null mask, and no mirroring into the (never read) lower triangle.
 *
 * @param {Array<Object>} rows - Block rows; each carries its id under `"__row_id__"`.
 * @param {{type?: string, threshold?: number, fields: Array<{field: string, scorer: string, weight: number}>}} mk
 *   Match key. `type === "exact"` forces a threshold of 1; otherwise
 *   `threshold` is used, defaulting to 0.85.
 * @param {{has: function(*): boolean}} [excludePairs] - Pair keys (as produced by `pairKey`) to skip.
 * @param {Iterable<{idA: number, idB: number, score: number}>} [preScoredPairs] - Already scored pairs.
 * @returns {Array<*>} Values built by `makeScoredPair(minId, maxId, score)`.
 */
function findFuzzyMatches(rows, mk, excludePairs, preScoredPairs) {
  const threshold = mk.type === "exact" ? 1 : mk.threshold ?? 0.85;
  const results = [];

  // Shared tail of both modes: threshold filter, id ordering, exclusion, emit.
  const emit = (rawIdA, rawIdB, score) => {
    if (score < threshold) return;
    const idA = Math.min(rawIdA, rawIdB);
    const idB = Math.max(rawIdA, rawIdB);
    const key = pairKey(idA, idB);
    if (excludePairs !== void 0 && excludePairs.has(key)) return;
    results.push(makeScoredPair(idA, idB, score));
  };

  if (preScoredPairs !== void 0) {
    for (const p of preScoredPairs) {
      emit(p.idA, p.idB, p.score);
    }
    return results;
  }

  const n = rows.length;
  if (n < 2) return results;

  // Every field's weight counts here, including fields that are not scored below.
  const totalWeight = mk.fields.reduce((sum, f) => sum + f.weight, 0);
  if (totalWeight === 0) return results;

  const rowIds = rows.map((r) => r["__row_id__"]);
  const isCheap = (f) => f.scorer === "exact" || f.scorer === "soundex_match";
  const cheapFields = mk.fields.filter(isCheap);
  const fuzzyFields = mk.fields.filter(
    (f) => !isCheap(f) && f.scorer !== "record_embedding"
  );

  const newMatrix = (fill) => Array.from({ length: n }, () => new Array(n).fill(fill));
  const ratio = (num, den) => (den > 0 ? num / den : 0);

  // Add one field's weighted scores to the upper triangle of the accumulators,
  // skipping any pair in which either value is null.
  const accumulate = (field, buildScores, numerator, denominator) => {
    const values = getTransformedValues(rows, field);
    const scores = buildScores(values);
    for (let i = 0; i < n; i++) {
      if (values[i] === null) continue;
      for (let j = i + 1; j < n; j++) {
        if (values[j] === null) continue;
        numerator[i][j] += scores[i][j] * field.weight;
        denominator[i][j] += field.weight;
      }
    }
  };

  const cheapNumerator = newMatrix(0);
  const cheapDenominator = newMatrix(0);
  for (const f of cheapFields) {
    const buildScores = f.scorer === "exact" ? exactScoreMatrix : soundexScoreMatrix;
    accumulate(f, buildScores, cheapNumerator, cheapDenominator);
  }

  let scoreOf;
  if (fuzzyFields.length === 0) {
    scoreOf = (i, j) => ratio(cheapNumerator[i][j], cheapDenominator[i][j]);
  } else {
    const fuzzyTotalWeight = fuzzyFields.reduce((sum, f) => sum + f.weight, 0);

    // A pair is impossible when even perfect fuzzy scores cannot lift it to the threshold.
    const impossible = newMatrix(false);
    for (let i = 0; i < n; i++) {
      for (let j = i + 1; j < n; j++) {
        const maxPossible = ratio(
          cheapNumerator[i][j] + fuzzyTotalWeight,
          cheapDenominator[i][j] + fuzzyTotalWeight
        );
        if (maxPossible < threshold) impossible[i][j] = true;
      }
    }

    // Cheap and fuzzy sums are kept separate so the floating-point addition
    // order of the final score stays (cheap) + (fuzzy).
    const fuzzyNumerator = newMatrix(0);
    const fuzzyDenominator = newMatrix(0);

    // True when some still-possible pair could reach the threshold if every
    // not-yet-scored fuzzy field (total weight `remainingWeight`) scored 1.
    const anyPairCanReach = (remainingWeight) => {
      for (let i = 0; i < n; i++) {
        for (let j = i + 1; j < n; j++) {
          if (impossible[i][j]) continue;
          const bestPossible = ratio(
            cheapNumerator[i][j] + fuzzyNumerator[i][j] + remainingWeight,
            cheapDenominator[i][j] + fuzzyDenominator[i][j] + remainingWeight
          );
          if (bestPossible >= threshold) return true;
        }
      }
      return false;
    };

    for (let fIdx = 0; fIdx < fuzzyFields.length; fIdx++) {
      const f = fuzzyFields[fIdx];
      accumulate(f, (values) => buildScoreMatrix(values, f.scorer), fuzzyNumerator, fuzzyDenominator);
      const remainingWeight = fuzzyFields
        .slice(fIdx + 1)
        .reduce((sum, ff) => sum + ff.weight, 0);
      // Stop scoring further fields once no pair can reach the threshold any more.
      if (remainingWeight > 0 && !anyPairCanReach(remainingWeight)) break;
    }

    scoreOf = (i, j) =>
      impossible[i][j]
        ? 0
        : ratio(
            cheapNumerator[i][j] + fuzzyNumerator[i][j],
            cheapDenominator[i][j] + fuzzyDenominator[i][j]
          );
  }

  for (let i = 0; i < n; i++) {
    for (let j = i + 1; j < n; j++) {
      emit(rowIds[i], rowIds[j], scoreOf(i, j));
    }
  }
  return results;
}
919
+
920
+ // src/node/backends/score-worker.ts
921
/**
 * Worker entry point: score a single block and return the matching pairs.
 *
 * @param {{block: {rows: Array<Object>, preScoredPairs?: *}, mk: Object, matchedPairs?: Iterable<*>}} input
 *   `matchedPairs` is copied into a fresh `Set` and handed to
 *   `findFuzzyMatches` as its exclusion set.
 * @returns {{pairs: *}} The result of `findFuzzyMatches`, wrapped under `pairs`.
 */
function scoreWorker(input) {
  const { block, mk, matchedPairs } = input;
  const alreadyMatched = new Set(matchedPairs);
  return {
    pairs: findFuzzyMatches(block.rows, mk, alreadyMatched, block.preScoredPairs),
  };
}
931
+
932
+ module.exports = scoreWorker; // CommonJS export: the scoreWorker function itself is the module's entire export value.
933
+ //# sourceMappingURL=score-worker.cjs.map
934
+ //# sourceMappingURL=score-worker.cjs.map