goldenmatch 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162) hide show
  1. package/README.md +140 -0
  2. package/dist/cli.cjs +6079 -0
  3. package/dist/cli.cjs.map +1 -0
  4. package/dist/cli.d.cts +1 -0
  5. package/dist/cli.d.ts +1 -0
  6. package/dist/cli.js +6076 -0
  7. package/dist/cli.js.map +1 -0
  8. package/dist/core/index.cjs +8449 -0
  9. package/dist/core/index.cjs.map +1 -0
  10. package/dist/core/index.d.cts +1972 -0
  11. package/dist/core/index.d.ts +1972 -0
  12. package/dist/core/index.js +8318 -0
  13. package/dist/core/index.js.map +1 -0
  14. package/dist/index.cjs +8449 -0
  15. package/dist/index.cjs.map +1 -0
  16. package/dist/index.d.cts +2 -0
  17. package/dist/index.d.ts +2 -0
  18. package/dist/index.js +8318 -0
  19. package/dist/index.js.map +1 -0
  20. package/dist/node/backends/score-worker.cjs +934 -0
  21. package/dist/node/backends/score-worker.cjs.map +1 -0
  22. package/dist/node/backends/score-worker.d.cts +14 -0
  23. package/dist/node/backends/score-worker.d.ts +14 -0
  24. package/dist/node/backends/score-worker.js +932 -0
  25. package/dist/node/backends/score-worker.js.map +1 -0
  26. package/dist/node/index.cjs +11430 -0
  27. package/dist/node/index.cjs.map +1 -0
  28. package/dist/node/index.d.cts +554 -0
  29. package/dist/node/index.d.ts +554 -0
  30. package/dist/node/index.js +11277 -0
  31. package/dist/node/index.js.map +1 -0
  32. package/dist/types-DhUdX5Rc.d.cts +304 -0
  33. package/dist/types-DhUdX5Rc.d.ts +304 -0
  34. package/examples/01-basic-dedupe.ts +60 -0
  35. package/examples/02-match-two-datasets.ts +48 -0
  36. package/examples/03-csv-file-pipeline.ts +62 -0
  37. package/examples/04-string-scoring.ts +63 -0
  38. package/examples/05-custom-config.ts +94 -0
  39. package/examples/06-probabilistic-fs.ts +72 -0
  40. package/examples/07-pprl-privacy.ts +76 -0
  41. package/examples/08-streaming.ts +79 -0
  42. package/examples/09-llm-scorer.ts +79 -0
  43. package/examples/10-explain.ts +60 -0
  44. package/examples/11-evaluate.ts +61 -0
  45. package/examples/README.md +53 -0
  46. package/package.json +66 -0
  47. package/src/cli.ts +372 -0
  48. package/src/core/ann-blocker.ts +593 -0
  49. package/src/core/api.ts +220 -0
  50. package/src/core/autoconfig.ts +363 -0
  51. package/src/core/autofix.ts +102 -0
  52. package/src/core/blocker.ts +655 -0
  53. package/src/core/cluster.ts +699 -0
  54. package/src/core/compare-clusters.ts +176 -0
  55. package/src/core/config/loader.ts +869 -0
  56. package/src/core/cross-encoder.ts +614 -0
  57. package/src/core/data.ts +430 -0
  58. package/src/core/domain.ts +277 -0
  59. package/src/core/embedder.ts +562 -0
  60. package/src/core/evaluate.ts +156 -0
  61. package/src/core/explain.ts +352 -0
  62. package/src/core/golden.ts +524 -0
  63. package/src/core/graph-er.ts +371 -0
  64. package/src/core/index.ts +314 -0
  65. package/src/core/ingest.ts +112 -0
  66. package/src/core/learned-blocking.ts +305 -0
  67. package/src/core/lineage.ts +221 -0
  68. package/src/core/llm/budget.ts +258 -0
  69. package/src/core/llm/cluster.ts +542 -0
  70. package/src/core/llm/scorer.ts +396 -0
  71. package/src/core/match-one.ts +95 -0
  72. package/src/core/matchkey.ts +97 -0
  73. package/src/core/memory/corrections.ts +179 -0
  74. package/src/core/memory/learner.ts +218 -0
  75. package/src/core/memory/store.ts +114 -0
  76. package/src/core/pipeline.ts +366 -0
  77. package/src/core/pprl/protocol.ts +216 -0
  78. package/src/core/probabilistic.ts +511 -0
  79. package/src/core/profiler.ts +212 -0
  80. package/src/core/quality.ts +197 -0
  81. package/src/core/review-queue.ts +177 -0
  82. package/src/core/scorer.ts +855 -0
  83. package/src/core/sensitivity.ts +196 -0
  84. package/src/core/standardize.ts +279 -0
  85. package/src/core/streaming.ts +128 -0
  86. package/src/core/transforms.ts +599 -0
  87. package/src/core/types.ts +570 -0
  88. package/src/core/validate.ts +243 -0
  89. package/src/index.ts +8 -0
  90. package/src/node/a2a/server.ts +470 -0
  91. package/src/node/api/server.ts +412 -0
  92. package/src/node/backends/duckdb.ts +130 -0
  93. package/src/node/backends/score-worker.ts +41 -0
  94. package/src/node/backends/workers.ts +212 -0
  95. package/src/node/config-file.ts +66 -0
  96. package/src/node/connectors/base.ts +57 -0
  97. package/src/node/connectors/bigquery.ts +61 -0
  98. package/src/node/connectors/databricks.ts +69 -0
  99. package/src/node/connectors/file.ts +350 -0
  100. package/src/node/connectors/hubspot.ts +62 -0
  101. package/src/node/connectors/index.ts +43 -0
  102. package/src/node/connectors/salesforce.ts +93 -0
  103. package/src/node/connectors/snowflake.ts +73 -0
  104. package/src/node/db/postgres.ts +173 -0
  105. package/src/node/db/sync.ts +103 -0
  106. package/src/node/dedupe-file.ts +156 -0
  107. package/src/node/index.ts +89 -0
  108. package/src/node/mcp/server.ts +940 -0
  109. package/src/node/tui/app.ts +756 -0
  110. package/src/node/tui/index.ts +6 -0
  111. package/src/node/tui/widgets.ts +128 -0
  112. package/tests/parity/scorer-ground-truth.test.ts +118 -0
  113. package/tests/smoke.test.ts +46 -0
  114. package/tests/unit/a2a-server.test.ts +175 -0
  115. package/tests/unit/ann-blocker.test.ts +117 -0
  116. package/tests/unit/api-server.test.ts +239 -0
  117. package/tests/unit/api.test.ts +77 -0
  118. package/tests/unit/autoconfig.test.ts +103 -0
  119. package/tests/unit/autofix.test.ts +71 -0
  120. package/tests/unit/blocker.test.ts +164 -0
  121. package/tests/unit/buildBlocksAsync.test.ts +63 -0
  122. package/tests/unit/cluster.test.ts +213 -0
  123. package/tests/unit/compare-clusters.test.ts +42 -0
  124. package/tests/unit/config-loader.test.ts +301 -0
  125. package/tests/unit/connectors-base.test.ts +48 -0
  126. package/tests/unit/cross-encoder-model.test.ts +198 -0
  127. package/tests/unit/cross-encoder.test.ts +173 -0
  128. package/tests/unit/db-connectors.test.ts +37 -0
  129. package/tests/unit/domain.test.ts +80 -0
  130. package/tests/unit/embedder.test.ts +151 -0
  131. package/tests/unit/evaluate.test.ts +85 -0
  132. package/tests/unit/explain.test.ts +73 -0
  133. package/tests/unit/golden.test.ts +97 -0
  134. package/tests/unit/graph-er.test.ts +173 -0
  135. package/tests/unit/hnsw-ann.test.ts +283 -0
  136. package/tests/unit/hubspot-connector.test.ts +118 -0
  137. package/tests/unit/ingest.test.ts +97 -0
  138. package/tests/unit/learned-blocking.test.ts +134 -0
  139. package/tests/unit/lineage.test.ts +135 -0
  140. package/tests/unit/match-one.test.ts +129 -0
  141. package/tests/unit/matchkey.test.ts +97 -0
  142. package/tests/unit/mcp-server.test.ts +183 -0
  143. package/tests/unit/memory.test.ts +119 -0
  144. package/tests/unit/pipeline.test.ts +118 -0
  145. package/tests/unit/pprl-protocol.test.ts +381 -0
  146. package/tests/unit/probabilistic.test.ts +494 -0
  147. package/tests/unit/profiler.test.ts +68 -0
  148. package/tests/unit/review-queue.test.ts +68 -0
  149. package/tests/unit/salesforce-connector.test.ts +148 -0
  150. package/tests/unit/scorer.test.ts +301 -0
  151. package/tests/unit/sensitivity.test.ts +154 -0
  152. package/tests/unit/standardize.test.ts +84 -0
  153. package/tests/unit/streaming.test.ts +82 -0
  154. package/tests/unit/transforms.test.ts +208 -0
  155. package/tests/unit/tui-widgets.test.ts +42 -0
  156. package/tests/unit/tui.test.ts +24 -0
  157. package/tests/unit/validate.test.ts +145 -0
  158. package/tests/unit/workers-parallel.test.ts +99 -0
  159. package/tests/unit/workers.test.ts +74 -0
  160. package/tsconfig.json +25 -0
  161. package/tsup.config.ts +37 -0
  162. package/vitest.config.ts +11 -0
@@ -0,0 +1,655 @@
1
+ /**
2
+ * blocker.ts — Groups records into blocks for pairwise comparison.
3
+ *
4
+ * Edge-safe: no Node.js imports. Pure TypeScript only.
5
+ *
6
+ * Ports `goldenmatch/core/blocker.py`.
7
+ */
8
+
9
+ import type {
10
+ BlockingConfig,
11
+ BlockingKeyConfig,
12
+ BlockResult,
13
+ Row,
14
+ SortKeyField,
15
+ } from "./types.js";
16
+ import { applyTransforms } from "./transforms.js";
17
+ import { buildANNBlocks, buildANNPairBlocks } from "./ann-blocker.js";
18
+
19
+ // ---------------------------------------------------------------------------
20
+ // Internal helpers
21
+ // ---------------------------------------------------------------------------
22
+
23
/**
 * Derives the composite blocking key for a single row.
 *
 * Each field named in `keyConfig.fields` is read from the row, stringified,
 * passed through `keyConfig.transforms` (only when at least one transform is
 * configured) and the resulting segments are joined with "||".
 *
 * @param row - Record to derive the key from.
 * @param keyConfig - Fields and transforms that define the key.
 * @returns The joined key, or `null` as soon as a field value is
 *   null/undefined or the transform chain yields null/undefined.
 */
function buildBlockKey(
  row: Row,
  keyConfig: BlockingKeyConfig,
): string | null {
  const segments: string[] = [];

  for (const fieldName of keyConfig.fields) {
    const value = row[fieldName];
    if (value === null || value === undefined) return null;

    let segment: string | null | undefined = String(value);
    if (keyConfig.transforms.length > 0) {
      segment = applyTransforms(segment, keyConfig.transforms);
      // A transform may reject the value; such a row has no key.
      if (segment === null || segment === undefined) return null;
    }
    segments.push(segment);
  }

  return segments.join("||");
}
49
+
50
/**
 * Derives the composite sort key for a row from a list of sort-key fields.
 *
 * For each entry the value of `column` is read, stringified and, when the
 * entry carries transforms, passed through them. Segments are joined
 * with "||".
 *
 * @param row - Record to derive the key from.
 * @param sortKeyFields - Ordered sort-key field definitions.
 * @returns The joined key, or `null` when any column value is
 *   null/undefined or a transform chain yields null/undefined.
 */
function buildSortKey(
  row: Row,
  sortKeyFields: readonly SortKeyField[],
): string | null {
  const pieces: string[] = [];

  for (const { column, transforms } of sortKeyFields) {
    const cell = row[column];
    if (cell === null || cell === undefined) return null;

    const text = String(cell);
    if (transforms.length === 0) {
      pieces.push(text);
      continue;
    }

    const transformed = applyTransforms(text, transforms);
    if (transformed === null || transformed === undefined) return null;
    pieces.push(transformed);
  }

  return pieces.join("||");
}
73
+
74
+ // ---------------------------------------------------------------------------
75
+ // Static blocking
76
+ // ---------------------------------------------------------------------------
77
+
78
/**
 * Hash-style blocking: for every key in `config.keys`, bucket the rows by
 * their block key and emit one block per bucket.
 *
 * Rows whose key is `null` are left out. Buckets with fewer than two rows
 * are dropped because they cannot produce a pair. A bucket larger than
 * `config.maxBlockSize` is dropped when `config.skipOversized` is set and
 * emitted unchanged otherwise.
 *
 * @param rows - Records to block.
 * @param config - Blocking configuration (`keys`, `maxBlockSize`,
 *   `skipOversized` are read here).
 * @returns Blocks tagged with strategy `"static"` and depth 0, in key order
 *   and then in first-seen bucket order.
 */
export function buildStaticBlocks(
  rows: readonly Row[],
  config: BlockingConfig,
): BlockResult[] {
  const blocks: BlockResult[] = [];
  if (rows.length < 2) return blocks;

  for (const keyConfig of config.keys) {
    // Bucket rows by their key for this key definition.
    const buckets = new Map<string, Row[]>();
    for (const row of rows) {
      const blockKey = buildBlockKey(row, keyConfig);
      if (blockKey === null) continue;

      const bucket = buckets.get(blockKey);
      if (bucket) {
        bucket.push(row);
      } else {
        buckets.set(blockKey, [row]);
      }
    }

    buckets.forEach((members, blockKey) => {
      if (members.length < 2) return;

      // Oversized buckets are only discarded on request; otherwise they are
      // passed through and the caller sees the large size.
      const oversized = members.length > config.maxBlockSize;
      if (oversized && config.skipOversized) return;

      blocks.push({
        blockKey,
        rows: members,
        strategy: "static",
        depth: 0,
      });
    });
  }

  return blocks;
}
126
+
127
+ // ---------------------------------------------------------------------------
128
+ // Multi-pass blocking
129
+ // ---------------------------------------------------------------------------
130
+
131
/**
 * Multi-pass blocking: runs one static blocking pass per entry of
 * `config.passes` and merges the results.
 *
 * Each pass is executed by `buildStaticBlocks` with a copy of `config`
 * whose `keys` holds only that pass's key. A block is kept only the first
 * time its `blockKey` string is seen, so every key text appears once in
 * the output.
 *
 * NOTE(review): deduplication compares the key text only. Two passes over
 * different fields that happen to produce the same text would have the
 * later block dropped — confirm this is intended.
 *
 * @param rows - Records to block.
 * @param config - Blocking configuration; `passes` lists the per-pass keys.
 * @returns Merged blocks re-tagged with strategy `"multi_pass"`; empty when
 *   there are fewer than two rows or no passes.
 */
export function buildMultiPassBlocks(
  rows: readonly Row[],
  config: BlockingConfig,
): BlockResult[] {
  if (rows.length < 2) return [];

  const merged: BlockResult[] = [];
  const emittedKeys = new Set<string>();

  for (const passKey of config.passes ?? []) {
    // Each pass is a plain static run restricted to this one key.
    const singlePass: BlockingConfig = {
      ...config,
      strategy: "static",
      keys: [passKey],
    };

    for (const candidate of buildStaticBlocks(rows, singlePass)) {
      if (emittedKeys.has(candidate.blockKey)) continue;
      emittedKeys.add(candidate.blockKey);
      merged.push({ ...candidate, strategy: "multi_pass" });
    }
  }

  return merged;
}
172
+
173
+ // ---------------------------------------------------------------------------
174
+ // Sorted neighborhood blocking
175
+ // ---------------------------------------------------------------------------
176
+
177
/**
 * Sorted-neighborhood blocking: sorts rows by a composite sort key and
 * slides a fixed-size window over the sorted sequence, emitting one block
 * per window position.
 *
 * Rows whose sort key is `null` are left out. When the number of keyed rows
 * does not exceed the window, a single block containing all of them is
 * returned.
 *
 * The window size defaults to 10. It is floored to an integer so that a
 * fractional value cannot shorten the loop bound and silently drop the last
 * window(s). A window smaller than 2 (or NaN) can never contain a pair, so
 * no blocks are produced for it.
 *
 * @param rows - Records to block.
 * @param config - Blocking configuration; `sortKey` is required and
 *   `windowSize` is optional.
 * @returns Blocks keyed `sorted_window_<i>` with strategy
 *   `"sorted_neighborhood"` and depth 0.
 * @throws Error when `config.sortKey` is missing or empty.
 */
export function buildSortedNeighborhoodBlocks(
  rows: readonly Row[],
  config: BlockingConfig,
): BlockResult[] {
  if (rows.length < 2) return [];

  const sortKeyFields = config.sortKey;
  if (!sortKeyFields || sortKeyFields.length === 0) {
    throw new Error(
      "sorted_neighborhood strategy requires sortKey configuration.",
    );
  }

  // Normalize the window: integer, and at least two rows wide.
  // The negated comparison also rejects NaN.
  const windowSize = Math.floor(config.windowSize ?? 10);
  if (!(windowSize >= 2)) return [];

  // Pair every row with its sort key, dropping rows without one.
  const keyed: Array<{ key: string; row: Row }> = [];
  for (const row of rows) {
    const key = buildSortKey(row, sortKeyFields);
    if (key !== null) {
      keyed.push({ key, row });
    }
  }

  // Plain code-unit ordering of the key strings.
  keyed.sort((a, b) => {
    if (a.key < b.key) return -1;
    if (a.key > b.key) return 1;
    return 0;
  });

  const n = keyed.length;
  if (n < 2) return [];

  if (n <= windowSize) {
    // Everything fits in one window.
    return [
      {
        blockKey: "sorted_window_0",
        rows: keyed.map((k) => k.row),
        strategy: "sorted_neighborhood",
        depth: 0,
      },
    ];
  }

  // One block per window start position 0 .. n - windowSize.
  const results: BlockResult[] = [];
  for (let i = 0; i <= n - windowSize; i++) {
    results.push({
      blockKey: `sorted_window_${i}`,
      rows: keyed.slice(i, i + windowSize).map((k) => k.row),
      strategy: "sorted_neighborhood",
      depth: 0,
    });
  }

  return results;
}
243
+
244
+ // ---------------------------------------------------------------------------
245
+ // Auto-split oversized block
246
+ // ---------------------------------------------------------------------------
247
+
248
/**
 * Zero-config fallback for an oversized block: picks one column and splits
 * the block by that column's values.
 *
 * Candidate columns are the keys of the first row that do not start with
 * "__". Each candidate is scored by its number of "useful" groups, i.e.
 * distinct non-null values shared by at least two rows. Ties are broken in
 * favour of the column with more distinct values, provided its average
 * group size does not exceed `maxBlockSize`.
 *
 * Rows whose value in the chosen column is null/undefined are excluded, and
 * they are also excluded from the useful-group score, so a mostly-null
 * column cannot win on the strength of a group that would be discarded
 * anyway. Nulls are tracked separately from real values, so a literal
 * string that happens to equal an internal marker is grouped like any other
 * value.
 *
 * @param blockRows - Rows of the oversized block.
 * @param maxBlockSize - Target size, used only for the tie-break described
 *   above; resulting groups may still exceed it.
 * @param parentKey - Key of the block being split.
 * @returns Sub-blocks keyed `<parentKey>||<value>` with strategy
 *   `"adaptive"` and depth 1. When no column yields a group of two or more
 *   rows (or there are no candidate columns), the original block is returned
 *   as a single depth-1 block. Fewer than two rows yields an empty array.
 */
export function autoSplitBlock(
  blockRows: readonly Row[],
  maxBlockSize: number,
  parentKey: string,
): BlockResult[] {
  if (blockRows.length < 2) return [];

  const sampleRow = blockRows[0];
  if (!sampleRow) return [];

  // The block handed back unchanged when no useful split exists.
  const unsplit = (): BlockResult[] => [
    {
      blockKey: parentKey,
      rows: blockRows,
      strategy: "adaptive",
      depth: 1,
      parentKey,
    },
  ];

  // Internal bookkeeping columns are prefixed with "__".
  const candidates = Object.keys(sampleRow).filter(
    (c) => !c.startsWith("__"),
  );
  if (candidates.length === 0) return unsplit();

  let bestCol = candidates[0]!;
  let bestUsefulGroups = 0;
  let bestNunique = 0;

  for (const col of candidates) {
    const counts = new Map<string, number>();
    let sawNull = false;
    for (const row of blockRows) {
      const val = row[col];
      if (val === null || val === undefined) {
        sawNull = true;
        continue;
      }
      const key = String(val);
      counts.set(key, (counts.get(key) ?? 0) + 1);
    }

    // Null still counts as one distinct value for cardinality purposes...
    const nunique = counts.size + (sawNull ? 1 : 0);
    // ...but never as a useful group, because null rows are not emitted.
    let usefulGroups = 0;
    for (const count of counts.values()) {
      if (count >= 2) usefulGroups++;
    }

    const avgGroup =
      nunique > 0 ? blockRows.length / nunique : blockRows.length;

    if (
      usefulGroups > bestUsefulGroups ||
      (usefulGroups === bestUsefulGroups &&
        avgGroup <= maxBlockSize &&
        nunique > bestNunique)
    ) {
      bestUsefulGroups = usefulGroups;
      bestNunique = nunique;
      bestCol = col;
    }
  }

  // Partition by the winning column, leaving null rows out.
  const splitGroups = new Map<string, Row[]>();
  for (const row of blockRows) {
    const val = row[bestCol];
    if (val === null || val === undefined) continue;
    const key = String(val);
    const group = splitGroups.get(key);
    if (group) {
      group.push(row);
    } else {
      splitGroups.set(key, [row]);
    }
  }

  const results: BlockResult[] = [];
  for (const [key, group] of splitGroups) {
    if (group.length < 2) continue; // a singleton cannot form a pair
    results.push({
      blockKey: `${parentKey}||${key}`,
      rows: group,
      strategy: "adaptive",
      depth: 1,
      parentKey,
    });
  }

  return results.length === 0 ? unsplit() : results;
}
358
+
359
+ // ---------------------------------------------------------------------------
360
+ // Adaptive blocking (static + auto-split for oversized)
361
+ // ---------------------------------------------------------------------------
362
+
363
/**
 * Adaptive blocking: starts from the static blocks and breaks up the ones
 * that exceed `config.maxBlockSize`.
 *
 * For an oversized primary block:
 * - if `config.subBlockKeys` is non-empty, it is recursively sub-blocked
 *   with those keys;
 * - otherwise, unless `config.skipOversized` is set, it is auto-split by a
 *   column chosen by `autoSplitBlock`;
 * - otherwise it is passed through unchanged.
 *
 * Blocks within the size limit are passed through unchanged.
 *
 * Note: the primary blocks come from `buildStaticBlocks` with the same
 * config, which already drops oversized blocks when `skipOversized` is set,
 * so in that configuration no oversized block reaches the branches above.
 *
 * @param rows - Records to block.
 * @param config - Blocking configuration.
 * @returns The resulting blocks in primary-block order.
 */
export function buildAdaptiveBlocks(
  rows: readonly Row[],
  config: BlockingConfig,
): BlockResult[] {
  if (rows.length < 2) return [];

  const primaryBlocks = buildStaticBlocks(rows, config);
  const configuredSubKeys = config.subBlockKeys ?? [];
  const limit = config.maxBlockSize;

  const out: BlockResult[] = [];
  for (const primary of primaryBlocks) {
    const oversized = primary.rows.length > limit;

    if (!oversized) {
      out.push(primary);
      continue;
    }

    if (configuredSubKeys.length > 0) {
      // Configured keys take precedence over the automatic split.
      out.push(
        ...subBlock(primary.rows, configuredSubKeys, limit, 1, primary.blockKey),
      );
    } else if (!config.skipOversized) {
      out.push(...autoSplitBlock(primary.rows, limit, primary.blockKey));
    } else {
      out.push(primary);
    }
  }

  return out;
}
409
+
410
/**
 * Recursively partitions an oversized block with the configured sub-block
 * keys, consuming one key per recursion level.
 *
 * Rows are grouped by the first key in `subBlockKeys`; rows without a key
 * and groups of fewer than two rows are dropped. A group that is still
 * larger than `maxBlockSize` is partitioned again with the remaining keys
 * as long as keys remain and `depth` is below 3; otherwise it is emitted
 * as-is.
 *
 * @param blockRows - Rows of the block to partition.
 * @param subBlockKeys - Keys still available, in order of use.
 * @param maxBlockSize - Size above which a group is partitioned further.
 * @param depth - Current recursion level (callers start at 1).
 * @param parentKey - Key of the originating primary block; propagated
 *   unchanged through every level.
 * @returns Blocks with strategy `"adaptive"`, keyed by their sub-block key.
 *   When called with `depth > 3` or with no keys, the input is returned as
 *   one block keyed `parentKey`.
 */
function subBlock(
  blockRows: readonly Row[],
  subBlockKeys: readonly BlockingKeyConfig[],
  maxBlockSize: number,
  depth: number,
  parentKey: string,
): BlockResult[] {
  const keysExhausted = subBlockKeys.length === 0;
  if (depth > 3 || keysExhausted) {
    return [
      { blockKey: parentKey, rows: blockRows, strategy: "adaptive", depth, parentKey },
    ];
  }

  const activeKey = subBlockKeys[0]!;
  const laterKeys = subBlockKeys.slice(1);
  // Further descent needs both an unused key and headroom in depth.
  const canDescend = laterKeys.length > 0 && depth < 3;

  const partitions = new Map<string, Row[]>();
  for (const row of blockRows) {
    const subKey = buildBlockKey(row, activeKey);
    if (subKey === null) continue;

    const members = partitions.get(subKey);
    if (members) {
      members.push(row);
    } else {
      partitions.set(subKey, [row]);
    }
  }

  const out: BlockResult[] = [];
  partitions.forEach((members, subKey) => {
    if (members.length < 2) return;

    if (members.length > maxBlockSize && canDescend) {
      out.push(
        ...subBlock(members, laterKeys, maxBlockSize, depth + 1, parentKey),
      );
      return;
    }

    out.push({
      blockKey: subKey,
      rows: members,
      strategy: "adaptive",
      depth,
      parentKey,
    });
  });

  return out;
}
478
+
479
+ // ---------------------------------------------------------------------------
480
+ // Best blocking key selection
481
+ // ---------------------------------------------------------------------------
482
+
483
/**
 * Chooses one blocking key out of several candidates.
 *
 * For every candidate the rows are bucketed by block key. Candidates whose
 * coverage (fraction of rows with a non-null key) is below 50% are ignored.
 * Among the rest, the candidate with the smallest largest-bucket wins. When
 * two candidates tie on that size, the one producing at least as many
 * distinct buckets is preferred; candidates tied on both measures resolve
 * to the later one in `keys`.
 *
 * @param rows - Records used to evaluate the candidates.
 * @param keys - Candidate key definitions; must not be empty.
 * @param maxBlockSize - Currently not consulted by the selection; retained
 *   so existing callers keep working.
 * @returns The selected key. A single candidate is returned directly, and
 *   the first candidate is returned when there are no rows or when no
 *   candidate reaches 50% coverage.
 * @throws Error when `keys` is empty.
 */
export function selectBestBlockingKey(
  rows: readonly Row[],
  keys: readonly BlockingKeyConfig[],
  maxBlockSize: number = 5000,
): BlockingKeyConfig {
  if (keys.length === 0) {
    throw new Error("selectBestBlockingKey requires at least one key.");
  }
  if (keys.length === 1) return keys[0]!;

  const total = rows.length;
  if (total === 0) return keys[0]!;

  let bestKey: BlockingKeyConfig = keys[0]!;
  let bestMaxSize = Infinity;
  let bestGroupCount = 0;

  for (const keyConfig of keys) {
    const groupSizes = new Map<string, number>();
    let nonNull = 0;

    for (const row of rows) {
      const key = buildBlockKey(row, keyConfig);
      if (key !== null) {
        nonNull++;
        groupSizes.set(key, (groupSizes.get(key) ?? 0) + 1);
      }
    }

    // Keys that leave most rows unblocked are not worth considering.
    const coverage = nonNull / total;
    if (coverage < 0.5) continue;

    let maxSize = 0;
    for (const size of groupSizes.values()) {
      if (size > maxSize) maxSize = size;
    }

    // Primary criterion: smallest worst-case bucket. On a tie, a key must
    // partition at least as finely as the incumbent to replace it.
    const groupCount = groupSizes.size;
    if (
      maxSize < bestMaxSize ||
      (maxSize === bestMaxSize && groupCount >= bestGroupCount)
    ) {
      bestMaxSize = maxSize;
      bestGroupCount = groupCount;
      bestKey = keyConfig;
    }
  }

  return bestKey;
}
538
+
539
+ // ---------------------------------------------------------------------------
540
+ // Main entry point
541
+ // ---------------------------------------------------------------------------
542
+
543
/**
 * Synchronous entry point: builds blocks from rows according to
 * `config.strategy`.
 *
 * - `"static"` -- hash-based grouping on blocking keys
 * - `"multi_pass"` -- multiple passes with deduplication by block key
 * - `"sorted_neighborhood"` -- sliding window over sorted data
 * - `"adaptive"` -- static blocks, with oversized blocks split further
 * - `"ann"`, `"ann_pairs"` -- not available synchronously; these are
 *   handled by `buildBlocksAsync`, and this function throws for them
 * - `"canopy"`, `"learned"` -- not implemented; this function throws
 *
 * When `config.autoSelect` is true and more than one key is configured,
 * the best key is chosen with `selectBestBlockingKey` first and blocking
 * proceeds with that single key.
 *
 * @param rows - Records to block.
 * @param config - Blocking configuration.
 * @returns The blocks produced by the selected strategy; empty when there
 *   are fewer than two rows.
 * @throws Error for the strategies listed above as unavailable, and for an
 *   unrecognized strategy value.
 */
export function buildBlocks(
  rows: readonly Row[],
  config: BlockingConfig,
): BlockResult[] {
  if (rows.length < 2) return [];

  // Narrow the key list to the best candidate when auto-selection is on.
  let effectiveConfig = config;
  if (config.autoSelect && config.keys.length > 1) {
    const bestKey = selectBestBlockingKey(
      rows,
      config.keys,
      config.maxBlockSize,
    );
    effectiveConfig = {
      ...config,
      keys: [bestKey],
      autoSelect: false,
    };
  }

  switch (effectiveConfig.strategy) {
    case "static":
      return buildStaticBlocks(rows, effectiveConfig);

    case "multi_pass":
      return buildMultiPassBlocks(rows, effectiveConfig);

    case "sorted_neighborhood":
      return buildSortedNeighborhoodBlocks(rows, effectiveConfig);

    case "adaptive":
      return buildAdaptiveBlocks(rows, effectiveConfig);

    case "ann":
    case "ann_pairs":
      // These strategies are implemented, but only on the async path.
      throw new Error(
        `ANN blocking strategy "${effectiveConfig.strategy}" is not supported by the synchronous buildBlocks(); ` +
          "use buildBlocksAsync() instead.",
      );

    case "canopy":
      throw new Error(
        'Canopy blocking strategy is not yet implemented in the TypeScript port. ' +
          "It requires TF-IDF vectorization.",
      );

    case "learned":
      throw new Error(
        'Learned blocking strategy is not yet implemented in the TypeScript port. ' +
          "It requires predicate learning from training pairs.",
      );

    default: {
      // Exhaustive check -- if a new strategy is added to the union type
      // but not handled here, this will cause a compile-time error.
      const _exhaustive: never = effectiveConfig.strategy;
      throw new Error(`Unknown blocking strategy: ${String(_exhaustive)}`);
    }
  }
}
617
+
618
+ // ---------------------------------------------------------------------------
619
+ // Async entry point — required for ANN strategies that fetch embeddings.
620
+ // ---------------------------------------------------------------------------
621
+
622
/**
 * Asynchronous entry point for blocking.
 *
 * The `"ann"` and `"ann_pairs"` strategies are routed to the async helpers
 * imported from `./ann-blocker.js`; every other strategy is handed to the
 * synchronous `buildBlocks`.
 *
 * Only the ANN options that are actually set on the config (`annTopK`,
 * `annModel`, and for `"ann"` also `maxBlockSize`) are forwarded, so unset
 * options are omitted rather than passed as `undefined`.
 *
 * @param rows - Records to block.
 * @param config - Blocking configuration.
 * @returns A promise of the resulting blocks; resolves to an empty array
 *   when there are fewer than two rows.
 * @throws Error (as a rejection) when an ANN strategy is requested without
 *   `annColumn`.
 */
export async function buildBlocksAsync(
  rows: readonly Row[],
  config: BlockingConfig,
): Promise<BlockResult[]> {
  if (rows.length < 2) return [];

  switch (config.strategy) {
    case "ann": {
      const column = config.annColumn;
      if (!column) {
        throw new Error('"ann" strategy requires `annColumn` in BlockingConfig.');
      }
      const blocks = await buildANNBlocks(rows, column, {
        ...(config.annTopK === undefined ? {} : { topK: config.annTopK }),
        ...(config.annModel === undefined ? {} : { model: config.annModel }),
        ...(config.maxBlockSize === undefined
          ? {}
          : { maxBlockSize: config.maxBlockSize }),
      });
      return blocks;
    }

    case "ann_pairs": {
      const column = config.annColumn;
      if (!column) {
        throw new Error('"ann_pairs" strategy requires `annColumn` in BlockingConfig.');
      }
      const blocks = await buildANNPairBlocks(rows, column, {
        ...(config.annTopK === undefined ? {} : { topK: config.annTopK }),
        ...(config.annModel === undefined ? {} : { model: config.annModel }),
      });
      return blocks;
    }

    default:
      // Everything else has a synchronous implementation.
      return buildBlocks(rows, config);
  }
}