goldenmatch 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +140 -0
- package/dist/cli.cjs +6079 -0
- package/dist/cli.cjs.map +1 -0
- package/dist/cli.d.cts +1 -0
- package/dist/cli.d.ts +1 -0
- package/dist/cli.js +6076 -0
- package/dist/cli.js.map +1 -0
- package/dist/core/index.cjs +8449 -0
- package/dist/core/index.cjs.map +1 -0
- package/dist/core/index.d.cts +1972 -0
- package/dist/core/index.d.ts +1972 -0
- package/dist/core/index.js +8318 -0
- package/dist/core/index.js.map +1 -0
- package/dist/index.cjs +8449 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +2 -0
- package/dist/index.d.ts +2 -0
- package/dist/index.js +8318 -0
- package/dist/index.js.map +1 -0
- package/dist/node/backends/score-worker.cjs +934 -0
- package/dist/node/backends/score-worker.cjs.map +1 -0
- package/dist/node/backends/score-worker.d.cts +14 -0
- package/dist/node/backends/score-worker.d.ts +14 -0
- package/dist/node/backends/score-worker.js +932 -0
- package/dist/node/backends/score-worker.js.map +1 -0
- package/dist/node/index.cjs +11430 -0
- package/dist/node/index.cjs.map +1 -0
- package/dist/node/index.d.cts +554 -0
- package/dist/node/index.d.ts +554 -0
- package/dist/node/index.js +11277 -0
- package/dist/node/index.js.map +1 -0
- package/dist/types-DhUdX5Rc.d.cts +304 -0
- package/dist/types-DhUdX5Rc.d.ts +304 -0
- package/examples/01-basic-dedupe.ts +60 -0
- package/examples/02-match-two-datasets.ts +48 -0
- package/examples/03-csv-file-pipeline.ts +62 -0
- package/examples/04-string-scoring.ts +63 -0
- package/examples/05-custom-config.ts +94 -0
- package/examples/06-probabilistic-fs.ts +72 -0
- package/examples/07-pprl-privacy.ts +76 -0
- package/examples/08-streaming.ts +79 -0
- package/examples/09-llm-scorer.ts +79 -0
- package/examples/10-explain.ts +60 -0
- package/examples/11-evaluate.ts +61 -0
- package/examples/README.md +53 -0
- package/package.json +66 -0
- package/src/cli.ts +372 -0
- package/src/core/ann-blocker.ts +593 -0
- package/src/core/api.ts +220 -0
- package/src/core/autoconfig.ts +363 -0
- package/src/core/autofix.ts +102 -0
- package/src/core/blocker.ts +655 -0
- package/src/core/cluster.ts +699 -0
- package/src/core/compare-clusters.ts +176 -0
- package/src/core/config/loader.ts +869 -0
- package/src/core/cross-encoder.ts +614 -0
- package/src/core/data.ts +430 -0
- package/src/core/domain.ts +277 -0
- package/src/core/embedder.ts +562 -0
- package/src/core/evaluate.ts +156 -0
- package/src/core/explain.ts +352 -0
- package/src/core/golden.ts +524 -0
- package/src/core/graph-er.ts +371 -0
- package/src/core/index.ts +314 -0
- package/src/core/ingest.ts +112 -0
- package/src/core/learned-blocking.ts +305 -0
- package/src/core/lineage.ts +221 -0
- package/src/core/llm/budget.ts +258 -0
- package/src/core/llm/cluster.ts +542 -0
- package/src/core/llm/scorer.ts +396 -0
- package/src/core/match-one.ts +95 -0
- package/src/core/matchkey.ts +97 -0
- package/src/core/memory/corrections.ts +179 -0
- package/src/core/memory/learner.ts +218 -0
- package/src/core/memory/store.ts +114 -0
- package/src/core/pipeline.ts +366 -0
- package/src/core/pprl/protocol.ts +216 -0
- package/src/core/probabilistic.ts +511 -0
- package/src/core/profiler.ts +212 -0
- package/src/core/quality.ts +197 -0
- package/src/core/review-queue.ts +177 -0
- package/src/core/scorer.ts +855 -0
- package/src/core/sensitivity.ts +196 -0
- package/src/core/standardize.ts +279 -0
- package/src/core/streaming.ts +128 -0
- package/src/core/transforms.ts +599 -0
- package/src/core/types.ts +570 -0
- package/src/core/validate.ts +243 -0
- package/src/index.ts +8 -0
- package/src/node/a2a/server.ts +470 -0
- package/src/node/api/server.ts +412 -0
- package/src/node/backends/duckdb.ts +130 -0
- package/src/node/backends/score-worker.ts +41 -0
- package/src/node/backends/workers.ts +212 -0
- package/src/node/config-file.ts +66 -0
- package/src/node/connectors/base.ts +57 -0
- package/src/node/connectors/bigquery.ts +61 -0
- package/src/node/connectors/databricks.ts +69 -0
- package/src/node/connectors/file.ts +350 -0
- package/src/node/connectors/hubspot.ts +62 -0
- package/src/node/connectors/index.ts +43 -0
- package/src/node/connectors/salesforce.ts +93 -0
- package/src/node/connectors/snowflake.ts +73 -0
- package/src/node/db/postgres.ts +173 -0
- package/src/node/db/sync.ts +103 -0
- package/src/node/dedupe-file.ts +156 -0
- package/src/node/index.ts +89 -0
- package/src/node/mcp/server.ts +940 -0
- package/src/node/tui/app.ts +756 -0
- package/src/node/tui/index.ts +6 -0
- package/src/node/tui/widgets.ts +128 -0
- package/tests/parity/scorer-ground-truth.test.ts +118 -0
- package/tests/smoke.test.ts +46 -0
- package/tests/unit/a2a-server.test.ts +175 -0
- package/tests/unit/ann-blocker.test.ts +117 -0
- package/tests/unit/api-server.test.ts +239 -0
- package/tests/unit/api.test.ts +77 -0
- package/tests/unit/autoconfig.test.ts +103 -0
- package/tests/unit/autofix.test.ts +71 -0
- package/tests/unit/blocker.test.ts +164 -0
- package/tests/unit/buildBlocksAsync.test.ts +63 -0
- package/tests/unit/cluster.test.ts +213 -0
- package/tests/unit/compare-clusters.test.ts +42 -0
- package/tests/unit/config-loader.test.ts +301 -0
- package/tests/unit/connectors-base.test.ts +48 -0
- package/tests/unit/cross-encoder-model.test.ts +198 -0
- package/tests/unit/cross-encoder.test.ts +173 -0
- package/tests/unit/db-connectors.test.ts +37 -0
- package/tests/unit/domain.test.ts +80 -0
- package/tests/unit/embedder.test.ts +151 -0
- package/tests/unit/evaluate.test.ts +85 -0
- package/tests/unit/explain.test.ts +73 -0
- package/tests/unit/golden.test.ts +97 -0
- package/tests/unit/graph-er.test.ts +173 -0
- package/tests/unit/hnsw-ann.test.ts +283 -0
- package/tests/unit/hubspot-connector.test.ts +118 -0
- package/tests/unit/ingest.test.ts +97 -0
- package/tests/unit/learned-blocking.test.ts +134 -0
- package/tests/unit/lineage.test.ts +135 -0
- package/tests/unit/match-one.test.ts +129 -0
- package/tests/unit/matchkey.test.ts +97 -0
- package/tests/unit/mcp-server.test.ts +183 -0
- package/tests/unit/memory.test.ts +119 -0
- package/tests/unit/pipeline.test.ts +118 -0
- package/tests/unit/pprl-protocol.test.ts +381 -0
- package/tests/unit/probabilistic.test.ts +494 -0
- package/tests/unit/profiler.test.ts +68 -0
- package/tests/unit/review-queue.test.ts +68 -0
- package/tests/unit/salesforce-connector.test.ts +148 -0
- package/tests/unit/scorer.test.ts +301 -0
- package/tests/unit/sensitivity.test.ts +154 -0
- package/tests/unit/standardize.test.ts +84 -0
- package/tests/unit/streaming.test.ts +82 -0
- package/tests/unit/transforms.test.ts +208 -0
- package/tests/unit/tui-widgets.test.ts +42 -0
- package/tests/unit/tui.test.ts +24 -0
- package/tests/unit/validate.test.ts +145 -0
- package/tests/unit/workers-parallel.test.ts +99 -0
- package/tests/unit/workers.test.ts +74 -0
- package/tsconfig.json +25 -0
- package/tsup.config.ts +37 -0
- package/vitest.config.ts +11 -0
|
@@ -0,0 +1,655 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* blocker.ts — Groups records into blocks for pairwise comparison.
|
|
3
|
+
*
|
|
4
|
+
* Edge-safe: no Node.js imports. Pure TypeScript only.
|
|
5
|
+
*
|
|
6
|
+
* Ports `goldenmatch/core/blocker.py`.
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
import type {
|
|
10
|
+
BlockingConfig,
|
|
11
|
+
BlockingKeyConfig,
|
|
12
|
+
BlockResult,
|
|
13
|
+
Row,
|
|
14
|
+
SortKeyField,
|
|
15
|
+
} from "./types.js";
|
|
16
|
+
import { applyTransforms } from "./transforms.js";
|
|
17
|
+
import { buildANNBlocks, buildANNPairBlocks } from "./ann-blocker.js";
|
|
18
|
+
|
|
19
|
+
// ---------------------------------------------------------------------------
|
|
20
|
+
// Internal helpers
|
|
21
|
+
// ---------------------------------------------------------------------------
|
|
22
|
+
|
|
23
|
+
/**
 * Compose the block key for one row from the fields named in `keyConfig`.
 *
 * Every field value is stringified and, when `keyConfig.transforms` is
 * non-empty, passed through `applyTransforms`. The resulting segments are
 * joined with "||".
 *
 * @param row - The record to derive a key from.
 * @param keyConfig - Field names and the transforms applied to each of them.
 * @returns The composite key, or `null` as soon as a field value is
 *   null/undefined or a transform yields null/undefined.
 */
function buildBlockKey(
  row: Row,
  keyConfig: BlockingKeyConfig,
): string | null {
  const segments: string[] = [];

  for (const fieldName of keyConfig.fields) {
    const value = row[fieldName];
    if (value === undefined || value === null) return null;

    const text = String(value);
    // Only route through applyTransforms when there is something to apply.
    const segment =
      keyConfig.transforms.length > 0
        ? applyTransforms(text, keyConfig.transforms)
        : text;
    if (segment === undefined || segment === null) return null;

    segments.push(segment);
  }

  return segments.join("||");
}
|
|
49
|
+
|
|
50
|
+
/**
 * Compose the sort key for one row from a list of `SortKeyField` entries.
 *
 * Each entry names a column and carries its own transform list; a column's
 * value is stringified and, when that list is non-empty, passed through
 * `applyTransforms`. Segments are joined with "||".
 *
 * @param row - The record to derive a sort key from.
 * @param sortKeyFields - Columns (with per-column transforms) forming the key.
 * @returns The composite sort key, or `null` as soon as a column value is
 *   null/undefined or a transform yields null/undefined.
 */
function buildSortKey(
  row: Row,
  sortKeyFields: readonly SortKeyField[],
): string | null {
  const segments: string[] = [];

  for (const { column, transforms } of sortKeyFields) {
    const value = row[column];
    if (value === undefined || value === null) return null;

    const text = String(value);
    const segment =
      transforms.length > 0 ? applyTransforms(text, transforms) : text;
    if (segment === undefined || segment === null) return null;

    segments.push(segment);
  }

  return segments.join("||");
}
|
|
73
|
+
|
|
74
|
+
// ---------------------------------------------------------------------------
|
|
75
|
+
// Static blocking
|
|
76
|
+
// ---------------------------------------------------------------------------
|
|
77
|
+
|
|
78
|
+
/**
 * Bucket rows by block key, once per entry in `config.keys`.
 *
 * Rows whose key is `null` are left out. Buckets with fewer than two rows
 * are not emitted. A bucket larger than `config.maxBlockSize` is dropped
 * when `config.skipOversized` is set and emitted unchanged otherwise.
 *
 * @param rows - Records to block.
 * @param config - Blocking configuration; reads `keys`, `maxBlockSize` and
 *   `skipOversized`.
 * @returns One "static" block (depth 0) per surviving bucket, in key-config
 *   order and then first-seen key order.
 */
export function buildStaticBlocks(
  rows: readonly Row[],
  config: BlockingConfig,
): BlockResult[] {
  const blocks: BlockResult[] = [];
  if (rows.length < 2) return blocks;

  for (const keyConfig of config.keys) {
    const buckets = new Map<string, Row[]>();

    for (const row of rows) {
      const blockKey = buildBlockKey(row, keyConfig);
      if (blockKey === null) continue;

      const bucket = buckets.get(blockKey);
      if (bucket) {
        bucket.push(row);
      } else {
        buckets.set(blockKey, [row]);
      }
    }

    buckets.forEach((members, blockKey) => {
      // A lone row has nothing to be compared against.
      if (members.length < 2) return;

      const oversized = members.length > config.maxBlockSize;
      if (oversized && config.skipOversized) return;

      blocks.push({
        blockKey,
        rows: members,
        strategy: "static",
        depth: 0,
      });
    });
  }

  return blocks;
}
|
|
126
|
+
|
|
127
|
+
// ---------------------------------------------------------------------------
|
|
128
|
+
// Multi-pass blocking
|
|
129
|
+
// ---------------------------------------------------------------------------
|
|
130
|
+
|
|
131
|
+
/**
 * Run one static blocking pass per entry in `config.passes` and merge the
 * results.
 *
 * Each pass reuses `config` with `strategy` forced to "static" and `keys`
 * replaced by that pass's single key config. A block is kept only if its
 * `blockKey` string has not been emitted by an earlier pass; kept blocks are
 * relabelled with strategy "multi_pass".
 *
 * Note that de-duplication compares key strings only: a later pass whose
 * block renders the same key string as an earlier pass's block is dropped
 * without comparing the rows.
 *
 * @param rows - Records to block.
 * @param config - Blocking configuration; `passes` supplies the key configs.
 * @returns The merged blocks, or an empty array when there are fewer than
 *   two rows or no passes.
 */
export function buildMultiPassBlocks(
  rows: readonly Row[],
  config: BlockingConfig,
): BlockResult[] {
  if (rows.length < 2) return [];

  const passConfigs = config.passes ?? [];
  if (passConfigs.length === 0) return [];

  const emittedKeys = new Set<string>();
  const merged: BlockResult[] = [];

  for (const passKey of passConfigs) {
    const singlePass: BlockingConfig = {
      ...config,
      strategy: "static",
      keys: [passKey],
    };

    for (const candidate of buildStaticBlocks(rows, singlePass)) {
      if (emittedKeys.has(candidate.blockKey)) continue;

      emittedKeys.add(candidate.blockKey);
      merged.push({ ...candidate, strategy: "multi_pass" });
    }
  }

  return merged;
}
|
|
172
|
+
|
|
173
|
+
// ---------------------------------------------------------------------------
|
|
174
|
+
// Sorted neighborhood blocking
|
|
175
|
+
// ---------------------------------------------------------------------------
|
|
176
|
+
|
|
177
|
+
/**
 * Sorted-neighborhood blocking: order rows by their composite sort key and
 * emit one block per position of a sliding window.
 *
 * Rows whose sort key is `null` are excluded before sorting. Keys are
 * ordered with the `<` / `>` string operators. When the number of keyed rows
 * does not exceed the window, a single block "sorted_window_0" holding all
 * of them is returned; otherwise window `i` covers sorted positions
 * `i .. i + windowSize - 1` and is keyed "sorted_window_<i>".
 *
 * @param rows - Records to block.
 * @param config - Blocking configuration; reads `sortKey` and `windowSize`
 *   (window defaults to 10 when unset).
 * @returns The window blocks, or an empty array when fewer than two rows
 *   are available.
 * @throws Error when `config.sortKey` is missing or empty.
 */
export function buildSortedNeighborhoodBlocks(
  rows: readonly Row[],
  config: BlockingConfig,
): BlockResult[] {
  if (rows.length < 2) return [];

  const sortKeyFields = config.sortKey;
  if (!sortKeyFields || sortKeyFields.length === 0) {
    throw new Error(
      "sorted_neighborhood strategy requires sortKey configuration.",
    );
  }

  const windowSize = config.windowSize ?? 10;

  // Pair each row with its sort key, leaving out rows that have none.
  const sortable: Array<{ key: string; row: Row }> = [];
  for (const row of rows) {
    const key = buildSortKey(row, sortKeyFields);
    if (key === null) continue;
    sortable.push({ key, row });
  }

  sortable.sort((left, right) =>
    left.key < right.key ? -1 : left.key > right.key ? 1 : 0,
  );

  const ordered = sortable.map((entry) => entry.row);
  const total = ordered.length;
  if (total < 2) return [];

  if (total <= windowSize) {
    // Everything fits inside one window.
    return [
      {
        blockKey: "sorted_window_0",
        rows: ordered,
        strategy: "sorted_neighborhood",
        depth: 0,
      },
    ];
  }

  const windows: BlockResult[] = [];
  const lastStart = total - windowSize;
  for (let start = 0; start <= lastStart; start += 1) {
    windows.push({
      blockKey: `sorted_window_${start}`,
      rows: ordered.slice(start, start + windowSize),
      strategy: "sorted_neighborhood",
      depth: 0,
    });
  }

  return windows;
}
|
|
243
|
+
|
|
244
|
+
// ---------------------------------------------------------------------------
|
|
245
|
+
// Auto-split oversized block
|
|
246
|
+
// ---------------------------------------------------------------------------
|
|
247
|
+
|
|
248
|
+
/**
 * Split an oversized block on the column that yields the most useful groups.
 *
 * This is the zero-config fallback used when no `subBlockKeys` are
 * configured for adaptive blocking.
 *
 * Candidate columns are the keys of the first row that do not start with
 * "__". Each candidate is scored by how many of its distinct values occur in
 * at least two rows; on a tie, a column with more distinct values wins
 * provided its average group size is within `maxBlockSize`. Null/undefined
 * counts as one distinct value while scoring.
 *
 * The block is then partitioned by the winning column. Rows whose value in
 * that column is null/undefined, and groups with fewer than two rows, are
 * not emitted. Nulls are tracked separately from real values, so a row whose
 * value is the literal string "__null__" is grouped like any other value.
 *
 * @param blockRows - Rows of the block to split.
 * @param maxBlockSize - Size limit used only in the scoring tie-break;
 *   emitted groups are not re-checked against it.
 * @param parentKey - Key of the block being split; emitted keys have the
 *   form `<parentKey>||<value>`.
 * @returns The sub-blocks (strategy "adaptive", depth 1). When there are no
 *   candidate columns or no group survives, a single block holding all of
 *   `blockRows` under `parentKey`. An empty array for fewer than two rows.
 */
export function autoSplitBlock(
  blockRows: readonly Row[],
  maxBlockSize: number,
  parentKey: string,
): BlockResult[] {
  if (blockRows.length < 2) return [];

  const sampleRow = blockRows[0];
  if (!sampleRow) return [];

  // Shared fallback: hand the block back untouched.
  const unsplit = (): BlockResult[] => [
    {
      blockKey: parentKey,
      rows: blockRows,
      strategy: "adaptive",
      depth: 1,
      parentKey,
    },
  ];

  // Columns prefixed with "__" are treated as internal and never split on.
  const candidates = Object.keys(sampleRow).filter(
    (c) => !c.startsWith("__"),
  );
  const [firstCol] = candidates;
  if (firstCol === undefined) return unsplit();

  let bestCol = firstCol;
  let bestUsefulGroups = 0;
  let bestNunique = 0;

  for (const col of candidates) {
    const counts = new Map<string, number>();
    let nullCount = 0;

    for (const row of blockRows) {
      const val = row[col];
      if (val === null || val === undefined) {
        nullCount++;
        continue;
      }
      const key = String(val);
      counts.set(key, (counts.get(key) ?? 0) + 1);
    }

    // Null still counts as one distinct value (and as a useful group when it
    // covers two or more rows) so column selection matches the prior scoring.
    const nunique = counts.size + (nullCount > 0 ? 1 : 0);
    let usefulGroups = nullCount >= 2 ? 1 : 0;
    for (const count of counts.values()) {
      if (count >= 2) usefulGroups++;
    }

    const avgGroup =
      nunique > 0 ? blockRows.length / nunique : blockRows.length;

    if (
      usefulGroups > bestUsefulGroups ||
      (usefulGroups === bestUsefulGroups &&
        avgGroup <= maxBlockSize &&
        nunique > bestNunique)
    ) {
      bestUsefulGroups = usefulGroups;
      bestNunique = nunique;
      bestCol = col;
    }
  }

  // Partition by the chosen column; rows without a value are left out.
  const splitGroups = new Map<string, Row[]>();
  for (const row of blockRows) {
    const val = row[bestCol];
    if (val === null || val === undefined) continue;

    const key = String(val);
    const group = splitGroups.get(key);
    if (group) {
      group.push(row);
    } else {
      splitGroups.set(key, [row]);
    }
  }

  const results: BlockResult[] = [];
  for (const [key, group] of splitGroups) {
    if (group.length < 2) continue;
    results.push({
      blockKey: `${parentKey}||${key}`,
      rows: group,
      strategy: "adaptive",
      depth: 1,
      parentKey,
    });
  }

  // No useful split: return the block as-is.
  return results.length === 0 ? unsplit() : results;
}
|
|
358
|
+
|
|
359
|
+
// ---------------------------------------------------------------------------
|
|
360
|
+
// Adaptive blocking (static + auto-split for oversized)
|
|
361
|
+
// ---------------------------------------------------------------------------
|
|
362
|
+
|
|
363
|
+
/**
 * Adaptive blocking: build static blocks, then break up the ones that
 * exceed `config.maxBlockSize`.
 *
 * For an oversized primary block:
 * - if `config.subBlockKeys` is non-empty, it is sub-blocked recursively
 *   with those keys;
 * - otherwise, if `config.skipOversized` is not set, it is auto-split on a
 *   column chosen by `autoSplitBlock`;
 * - otherwise it is passed through unchanged.
 *
 * Primary blocks come from `buildStaticBlocks` called with the same config,
 * which already drops oversized blocks when `skipOversized` is set, so in
 * that configuration no oversized block reaches the branches above.
 *
 * @param rows - Records to block.
 * @param config - Blocking configuration; reads `maxBlockSize`,
 *   `subBlockKeys` and `skipOversized` in addition to what static blocking
 *   reads.
 * @returns Primary blocks with oversized ones replaced by their splits.
 */
export function buildAdaptiveBlocks(
  rows: readonly Row[],
  config: BlockingConfig,
): BlockResult[] {
  if (rows.length < 2) return [];

  const primaryBlocks = buildStaticBlocks(rows, config);
  const subBlockKeys = config.subBlockKeys ?? [];
  const output: BlockResult[] = [];

  for (const block of primaryBlocks) {
    const oversized = block.rows.length > config.maxBlockSize;

    if (!oversized) {
      output.push(block);
      continue;
    }

    if (subBlockKeys.length > 0) {
      // Configured keys take precedence over the automatic split.
      output.push(
        ...subBlock(
          block.rows,
          subBlockKeys,
          config.maxBlockSize,
          1,
          block.blockKey,
        ),
      );
      continue;
    }

    if (!config.skipOversized) {
      output.push(
        ...autoSplitBlock(block.rows, config.maxBlockSize, block.blockKey),
      );
      continue;
    }

    output.push(block);
  }

  return output;
}
|
|
409
|
+
|
|
410
|
+
/**
 * Recursively partition an oversized block with the configured sub-block
 * keys, consuming one key per level.
 *
 * When `depth` exceeds 3 or no keys remain, the block is returned as-is
 * under `parentKey`. Otherwise rows are grouped by the first key; rows whose
 * key is `null` and groups with fewer than two rows are left out. A group
 * still larger than `maxBlockSize` is split again with the remaining keys
 * while any remain and `depth` is below 3; every other group is emitted with
 * the sub-key as its `blockKey`.
 *
 * The `parentKey` handed to recursive calls is the one received here, not
 * the intermediate sub-key.
 *
 * @param blockRows - Rows of the block to partition.
 * @param subBlockKeys - Remaining key configs, applied first to last.
 * @param maxBlockSize - Size above which a group is split further.
 * @param depth - Current recursion level, recorded on emitted blocks.
 * @param parentKey - Key recorded as `parentKey` on emitted blocks.
 * @returns The resulting "adaptive" blocks.
 */
function subBlock(
  blockRows: readonly Row[],
  subBlockKeys: readonly BlockingKeyConfig[],
  maxBlockSize: number,
  depth: number,
  parentKey: string,
): BlockResult[] {
  const [currentKey, ...remainingKeys] = subBlockKeys;

  if (depth > 3 || currentKey === undefined) {
    // Depth limit hit or keys exhausted: hand the block back untouched.
    return [
      {
        blockKey: parentKey,
        rows: blockRows,
        strategy: "adaptive",
        depth,
        parentKey,
      },
    ];
  }

  const buckets = new Map<string, Row[]>();
  for (const row of blockRows) {
    const subKey = buildBlockKey(row, currentKey);
    if (subKey === null) continue;

    const bucket = buckets.get(subKey);
    if (bucket) {
      bucket.push(row);
    } else {
      buckets.set(subKey, [row]);
    }
  }

  const canRecurse = remainingKeys.length > 0 && depth < 3;
  const output: BlockResult[] = [];

  for (const [subKey, members] of buckets) {
    if (members.length < 2) continue;

    if (canRecurse && members.length > maxBlockSize) {
      output.push(
        ...subBlock(members, remainingKeys, maxBlockSize, depth + 1, parentKey),
      );
      continue;
    }

    output.push({
      blockKey: subKey,
      rows: members,
      strategy: "adaptive",
      depth,
      parentKey,
    });
  }

  return output;
}
|
|
478
|
+
|
|
479
|
+
// ---------------------------------------------------------------------------
|
|
480
|
+
// Best blocking key selection
|
|
481
|
+
// ---------------------------------------------------------------------------
|
|
482
|
+
|
|
483
|
+
/**
 * Choose, among candidate blocking keys, the one whose largest group is
 * smallest, considering only keys with at least 50% coverage.
 *
 * Coverage is the fraction of rows for which the key is non-null. Keys
 * below 50% coverage are ignored. When two eligible keys have the same
 * largest-group size, the one appearing later in `keys` is chosen. If no key
 * is eligible, the first key is returned. A single key, or an empty `rows`,
 * short-circuits to the first key.
 *
 * @param rows - Records used to evaluate the keys.
 * @param keys - Candidate key configs; must not be empty.
 * @param maxBlockSize - Accepted for callers' convenience; the selection
 *   logic below does not read it.
 * @returns The selected key config.
 * @throws Error when `keys` is empty.
 */
export function selectBestBlockingKey(
  rows: readonly Row[],
  keys: readonly BlockingKeyConfig[],
  maxBlockSize: number = 5000,
): BlockingKeyConfig {
  const [firstKey] = keys;
  if (firstKey === undefined) {
    throw new Error("selectBestBlockingKey requires at least one key.");
  }

  const rowCount = rows.length;
  if (keys.length === 1 || rowCount === 0) return firstKey;

  let winner: BlockingKeyConfig = firstKey;
  let smallestLargest = Infinity;

  for (const candidate of keys) {
    const sizes = new Map<string, number>();
    let covered = 0;

    for (const row of rows) {
      const blockKey = buildBlockKey(row, candidate);
      if (blockKey === null) continue;

      covered += 1;
      sizes.set(blockKey, (sizes.get(blockKey) ?? 0) + 1);
    }

    // Keys that leave most rows unkeyed are not considered.
    if (covered / rowCount < 0.5) continue;

    let largest = 0;
    for (const size of sizes.values()) {
      largest = size > largest ? size : largest;
    }

    const improves = largest < smallestLargest;
    const ties = largest === smallestLargest && sizes.size > 0;
    if (improves || ties) {
      smallestLargest = largest;
      winner = candidate;
    }
  }

  return winner;
}
|
|
538
|
+
|
|
539
|
+
// ---------------------------------------------------------------------------
|
|
540
|
+
// Main entry point
|
|
541
|
+
// ---------------------------------------------------------------------------
|
|
542
|
+
|
|
543
|
+
/**
 * Synchronous entry point: build blocks from rows according to
 * `config.strategy`.
 *
 * - `"static"` -- grouping on the configured blocking keys
 * - `"multi_pass"` -- one static pass per entry in `config.passes`, merged
 * - `"sorted_neighborhood"` -- sliding window over sorted rows
 * - `"adaptive"` -- static blocks with oversized ones split further
 * - `"ann"`, `"ann_pairs"` -- not available on this synchronous path; this
 *   function throws and callers should use `buildBlocksAsync` instead
 * - `"canopy"`, `"learned"` -- not implemented; this function throws
 *
 * When `config.autoSelect` is truthy and more than one key is configured,
 * `selectBestBlockingKey` picks a single key first and blocking proceeds
 * with only that key.
 *
 * @param rows - Records to block.
 * @param config - Blocking configuration.
 * @returns The blocks, or an empty array for fewer than two rows.
 * @throws Error for the `"ann"`, `"ann_pairs"`, `"canopy"` and `"learned"`
 *   strategies, and for any strategy value not covered by the switch.
 */
export function buildBlocks(
  rows: readonly Row[],
  config: BlockingConfig,
): BlockResult[] {
  if (rows.length < 2) return [];

  // Auto-select best key if enabled
  let effectiveConfig = config;
  if (config.autoSelect && config.keys.length > 1) {
    const bestKey = selectBestBlockingKey(
      rows,
      config.keys,
      config.maxBlockSize,
    );
    effectiveConfig = {
      ...config,
      keys: [bestKey],
      autoSelect: false,
    };
  }

  switch (effectiveConfig.strategy) {
    case "static":
      return buildStaticBlocks(rows, effectiveConfig);

    case "multi_pass":
      return buildMultiPassBlocks(rows, effectiveConfig);

    case "sorted_neighborhood":
      return buildSortedNeighborhoodBlocks(rows, effectiveConfig);

    case "adaptive":
      return buildAdaptiveBlocks(rows, effectiveConfig);

    case "ann":
    case "ann_pairs":
      // The ANN builders are asynchronous, so they can only be reached
      // through buildBlocksAsync; say so instead of claiming they are absent.
      throw new Error(
        `ANN blocking strategy "${effectiveConfig.strategy}" is not yet implemented in the synchronous buildBlocks() of the TypeScript port. ` +
          'Use buildBlocksAsync(), which handles "ann" and "ann_pairs".',
      );

    case "canopy":
      throw new Error(
        'Canopy blocking strategy is not yet implemented in the TypeScript port. ' +
          "It requires TF-IDF vectorization.",
      );

    case "learned":
      throw new Error(
        'Learned blocking strategy is not yet implemented in the TypeScript port. ' +
          "It requires predicate learning from training pairs.",
      );

    default: {
      // Exhaustive check -- if a new strategy is added to the union type
      // but not handled here, this will cause a compile-time error.
      const _exhaustive: never = effectiveConfig.strategy;
      throw new Error(`Unknown blocking strategy: ${String(_exhaustive)}`);
    }
  }
}
|
|
617
|
+
|
|
618
|
+
// ---------------------------------------------------------------------------
|
|
619
|
+
// Async entry point — required for ANN strategies that fetch embeddings.
|
|
620
|
+
// ---------------------------------------------------------------------------
|
|
621
|
+
|
|
622
|
+
/**
 * Asynchronous entry point for blocking.
 *
 * The `"ann"` and `"ann_pairs"` strategies are served here by the
 * asynchronous builders `buildANNBlocks` and `buildANNPairBlocks`; both
 * require `config.annColumn`. `annTopK` and `annModel` are forwarded as
 * `topK` / `model` only when set, and `maxBlockSize` is forwarded to
 * `buildANNBlocks` only. Every other strategy is delegated to the
 * synchronous `buildBlocks`.
 *
 * @param rows - Records to block.
 * @param config - Blocking configuration.
 * @returns A promise of the blocks; an empty array for fewer than two rows.
 * @throws Error when an ANN strategy is selected without `annColumn`, plus
 *   anything `buildBlocks` throws for the delegated strategies.
 */
export async function buildBlocksAsync(
  rows: readonly Row[],
  config: BlockingConfig,
): Promise<BlockResult[]> {
  if (rows.length < 2) return [];

  if (config.strategy !== "ann" && config.strategy !== "ann_pairs") {
    return buildBlocks(rows, config);
  }

  const column = config.annColumn;
  if (!column) {
    throw new Error(
      `"${config.strategy}" strategy requires \`annColumn\` in BlockingConfig.`,
    );
  }

  // Optional settings are spread in only when present, so an unset value is
  // omitted from the options object rather than passed as undefined.
  const topK = config.annTopK === undefined ? {} : { topK: config.annTopK };
  const model = config.annModel === undefined ? {} : { model: config.annModel };

  if (config.strategy === "ann_pairs") {
    return await buildANNPairBlocks(rows, column, { ...topK, ...model });
  }

  const sizeCap =
    config.maxBlockSize === undefined
      ? {}
      : { maxBlockSize: config.maxBlockSize };
  return await buildANNBlocks(rows, column, { ...topK, ...model, ...sizeCap });
}
|