goldenmatch 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162) hide show
  1. package/README.md +140 -0
  2. package/dist/cli.cjs +6079 -0
  3. package/dist/cli.cjs.map +1 -0
  4. package/dist/cli.d.cts +1 -0
  5. package/dist/cli.d.ts +1 -0
  6. package/dist/cli.js +6076 -0
  7. package/dist/cli.js.map +1 -0
  8. package/dist/core/index.cjs +8449 -0
  9. package/dist/core/index.cjs.map +1 -0
  10. package/dist/core/index.d.cts +1972 -0
  11. package/dist/core/index.d.ts +1972 -0
  12. package/dist/core/index.js +8318 -0
  13. package/dist/core/index.js.map +1 -0
  14. package/dist/index.cjs +8449 -0
  15. package/dist/index.cjs.map +1 -0
  16. package/dist/index.d.cts +2 -0
  17. package/dist/index.d.ts +2 -0
  18. package/dist/index.js +8318 -0
  19. package/dist/index.js.map +1 -0
  20. package/dist/node/backends/score-worker.cjs +934 -0
  21. package/dist/node/backends/score-worker.cjs.map +1 -0
  22. package/dist/node/backends/score-worker.d.cts +14 -0
  23. package/dist/node/backends/score-worker.d.ts +14 -0
  24. package/dist/node/backends/score-worker.js +932 -0
  25. package/dist/node/backends/score-worker.js.map +1 -0
  26. package/dist/node/index.cjs +11430 -0
  27. package/dist/node/index.cjs.map +1 -0
  28. package/dist/node/index.d.cts +554 -0
  29. package/dist/node/index.d.ts +554 -0
  30. package/dist/node/index.js +11277 -0
  31. package/dist/node/index.js.map +1 -0
  32. package/dist/types-DhUdX5Rc.d.cts +304 -0
  33. package/dist/types-DhUdX5Rc.d.ts +304 -0
  34. package/examples/01-basic-dedupe.ts +60 -0
  35. package/examples/02-match-two-datasets.ts +48 -0
  36. package/examples/03-csv-file-pipeline.ts +62 -0
  37. package/examples/04-string-scoring.ts +63 -0
  38. package/examples/05-custom-config.ts +94 -0
  39. package/examples/06-probabilistic-fs.ts +72 -0
  40. package/examples/07-pprl-privacy.ts +76 -0
  41. package/examples/08-streaming.ts +79 -0
  42. package/examples/09-llm-scorer.ts +79 -0
  43. package/examples/10-explain.ts +60 -0
  44. package/examples/11-evaluate.ts +61 -0
  45. package/examples/README.md +53 -0
  46. package/package.json +66 -0
  47. package/src/cli.ts +372 -0
  48. package/src/core/ann-blocker.ts +593 -0
  49. package/src/core/api.ts +220 -0
  50. package/src/core/autoconfig.ts +363 -0
  51. package/src/core/autofix.ts +102 -0
  52. package/src/core/blocker.ts +655 -0
  53. package/src/core/cluster.ts +699 -0
  54. package/src/core/compare-clusters.ts +176 -0
  55. package/src/core/config/loader.ts +869 -0
  56. package/src/core/cross-encoder.ts +614 -0
  57. package/src/core/data.ts +430 -0
  58. package/src/core/domain.ts +277 -0
  59. package/src/core/embedder.ts +562 -0
  60. package/src/core/evaluate.ts +156 -0
  61. package/src/core/explain.ts +352 -0
  62. package/src/core/golden.ts +524 -0
  63. package/src/core/graph-er.ts +371 -0
  64. package/src/core/index.ts +314 -0
  65. package/src/core/ingest.ts +112 -0
  66. package/src/core/learned-blocking.ts +305 -0
  67. package/src/core/lineage.ts +221 -0
  68. package/src/core/llm/budget.ts +258 -0
  69. package/src/core/llm/cluster.ts +542 -0
  70. package/src/core/llm/scorer.ts +396 -0
  71. package/src/core/match-one.ts +95 -0
  72. package/src/core/matchkey.ts +97 -0
  73. package/src/core/memory/corrections.ts +179 -0
  74. package/src/core/memory/learner.ts +218 -0
  75. package/src/core/memory/store.ts +114 -0
  76. package/src/core/pipeline.ts +366 -0
  77. package/src/core/pprl/protocol.ts +216 -0
  78. package/src/core/probabilistic.ts +511 -0
  79. package/src/core/profiler.ts +212 -0
  80. package/src/core/quality.ts +197 -0
  81. package/src/core/review-queue.ts +177 -0
  82. package/src/core/scorer.ts +855 -0
  83. package/src/core/sensitivity.ts +196 -0
  84. package/src/core/standardize.ts +279 -0
  85. package/src/core/streaming.ts +128 -0
  86. package/src/core/transforms.ts +599 -0
  87. package/src/core/types.ts +570 -0
  88. package/src/core/validate.ts +243 -0
  89. package/src/index.ts +8 -0
  90. package/src/node/a2a/server.ts +470 -0
  91. package/src/node/api/server.ts +412 -0
  92. package/src/node/backends/duckdb.ts +130 -0
  93. package/src/node/backends/score-worker.ts +41 -0
  94. package/src/node/backends/workers.ts +212 -0
  95. package/src/node/config-file.ts +66 -0
  96. package/src/node/connectors/base.ts +57 -0
  97. package/src/node/connectors/bigquery.ts +61 -0
  98. package/src/node/connectors/databricks.ts +69 -0
  99. package/src/node/connectors/file.ts +350 -0
  100. package/src/node/connectors/hubspot.ts +62 -0
  101. package/src/node/connectors/index.ts +43 -0
  102. package/src/node/connectors/salesforce.ts +93 -0
  103. package/src/node/connectors/snowflake.ts +73 -0
  104. package/src/node/db/postgres.ts +173 -0
  105. package/src/node/db/sync.ts +103 -0
  106. package/src/node/dedupe-file.ts +156 -0
  107. package/src/node/index.ts +89 -0
  108. package/src/node/mcp/server.ts +940 -0
  109. package/src/node/tui/app.ts +756 -0
  110. package/src/node/tui/index.ts +6 -0
  111. package/src/node/tui/widgets.ts +128 -0
  112. package/tests/parity/scorer-ground-truth.test.ts +118 -0
  113. package/tests/smoke.test.ts +46 -0
  114. package/tests/unit/a2a-server.test.ts +175 -0
  115. package/tests/unit/ann-blocker.test.ts +117 -0
  116. package/tests/unit/api-server.test.ts +239 -0
  117. package/tests/unit/api.test.ts +77 -0
  118. package/tests/unit/autoconfig.test.ts +103 -0
  119. package/tests/unit/autofix.test.ts +71 -0
  120. package/tests/unit/blocker.test.ts +164 -0
  121. package/tests/unit/buildBlocksAsync.test.ts +63 -0
  122. package/tests/unit/cluster.test.ts +213 -0
  123. package/tests/unit/compare-clusters.test.ts +42 -0
  124. package/tests/unit/config-loader.test.ts +301 -0
  125. package/tests/unit/connectors-base.test.ts +48 -0
  126. package/tests/unit/cross-encoder-model.test.ts +198 -0
  127. package/tests/unit/cross-encoder.test.ts +173 -0
  128. package/tests/unit/db-connectors.test.ts +37 -0
  129. package/tests/unit/domain.test.ts +80 -0
  130. package/tests/unit/embedder.test.ts +151 -0
  131. package/tests/unit/evaluate.test.ts +85 -0
  132. package/tests/unit/explain.test.ts +73 -0
  133. package/tests/unit/golden.test.ts +97 -0
  134. package/tests/unit/graph-er.test.ts +173 -0
  135. package/tests/unit/hnsw-ann.test.ts +283 -0
  136. package/tests/unit/hubspot-connector.test.ts +118 -0
  137. package/tests/unit/ingest.test.ts +97 -0
  138. package/tests/unit/learned-blocking.test.ts +134 -0
  139. package/tests/unit/lineage.test.ts +135 -0
  140. package/tests/unit/match-one.test.ts +129 -0
  141. package/tests/unit/matchkey.test.ts +97 -0
  142. package/tests/unit/mcp-server.test.ts +183 -0
  143. package/tests/unit/memory.test.ts +119 -0
  144. package/tests/unit/pipeline.test.ts +118 -0
  145. package/tests/unit/pprl-protocol.test.ts +381 -0
  146. package/tests/unit/probabilistic.test.ts +494 -0
  147. package/tests/unit/profiler.test.ts +68 -0
  148. package/tests/unit/review-queue.test.ts +68 -0
  149. package/tests/unit/salesforce-connector.test.ts +148 -0
  150. package/tests/unit/scorer.test.ts +301 -0
  151. package/tests/unit/sensitivity.test.ts +154 -0
  152. package/tests/unit/standardize.test.ts +84 -0
  153. package/tests/unit/streaming.test.ts +82 -0
  154. package/tests/unit/transforms.test.ts +208 -0
  155. package/tests/unit/tui-widgets.test.ts +42 -0
  156. package/tests/unit/tui.test.ts +24 -0
  157. package/tests/unit/validate.test.ts +145 -0
  158. package/tests/unit/workers-parallel.test.ts +99 -0
  159. package/tests/unit/workers.test.ts +74 -0
  160. package/tsconfig.json +25 -0
  161. package/tsup.config.ts +37 -0
  162. package/vitest.config.ts +11 -0
@@ -0,0 +1,593 @@
1
+ /**
2
+ * ann-blocker.ts — Approximate nearest neighbour blocking.
3
+ *
4
+ * Edge-safe: no `node:` imports, no FAISS. Implements a brute-force kNN
5
+ * (O(n^2)) which is appropriate for <= ~10K records. Embeddings are
6
+ * fetched via `getEmbedder()` which uses HTTP `fetch()`.
7
+ *
8
+ * Ports `goldenmatch/core/ann_blocker.py`.
9
+ */
10
+
11
+ import type { BlockResult, Row, ScoredPair } from "./types.js";
12
+ import { makeScoredPair } from "./types.js";
13
+ import { getEmbedder, type EmbedderOptions } from "./embedder.js";
14
+
15
+ // ---------------------------------------------------------------------------
16
+ // Public option types
17
+ // ---------------------------------------------------------------------------
18
+
19
/** Tuning knobs accepted by the brute-force `ANNBlocker` constructor. */
export interface ANNBlockerOptions {
  /** How neighbours are ranked; `ANNBlocker` falls back to `"cosine"` when omitted. */
  readonly metric?: "cosine" | "euclidean";
  /** Neighbours kept per query; `ANNBlocker` falls back to 20 when omitted. */
  readonly topK?: number;
}
23
+
24
/**
 * Options shared by `buildANNBlocks` and `buildANNPairBlocks`.
 *
 * `model`, `apiKey` and `provider` are forwarded to `getEmbedder()` only when
 * they are defined; everything else steers neighbour search and grouping.
 */
export interface BuildANNOptions {
  /** Neighbours requested per row (the builders default to 20). */
  readonly topK?: number;
  /** Embedding model name, passed through to the embedder. */
  readonly model?: string;
  /** Credential passed through to the embedder. */
  readonly apiKey?: string;
  /** Embedding provider, passed through to the embedder. */
  readonly provider?: EmbedderOptions["provider"];
  /** Row identifier column (default `__row_id__`). */
  readonly idColumn?: string;
  /** Maximum block size produced by Union-Find grouping (the builder defaults to 1000). */
  readonly maxBlockSize?: number;
  /** Use hnswlib-node fast-path when available (falls back to brute-force). */
  readonly useHNSW?: boolean;
}
36
+
37
/**
 * The only part of the `hnswlib-node` module this file touches: the
 * `HierarchicalNSW` constructor. The loaded module is handed in by the
 * caller, and the surface is kept this small on purpose so the package does
 * not hard-depend on the library's own type declarations.
 */
export interface HNSWModule {
  /** Index constructor, invoked here as `new HierarchicalNSW(metric, dim)`. */
  readonly HierarchicalNSW: new (metric: string, dim: number) => HNSWIndexLike;
}
48
+
49
/**
 * Methods of an HNSW index instance that `HNSWANNBlocker` calls. Anything
 * structurally matching this shape (including a test double) can stand in
 * for the native index.
 */
export interface HNSWIndexLike {
  /** Allocate the index; called once per `buildIndex` before any point is added. */
  initIndex(maxElements: number, M?: number, efConstruction?: number, randomSeed?: number): void;
  /** Set the search-time `ef` parameter. */
  setEf(ef: number): void;
  /** Insert one vector under the given numeric label. */
  addPoint(vector: number[] | Float32Array, labelId: number): void;
  /**
   * Look up the `k` nearest stored vectors. `distances[i]` belongs to
   * `neighbors[i]`; callers in this file read the two arrays in lock-step.
   */
  searchKnn(
    query: number[] | Float32Array,
    k: number,
  ): { distances: number[]; neighbors: number[] };
}
63
+
64
/** Constructor options for `HNSWANNBlocker`. Only `hnswModule` is required. */
export interface HNSWOptions {
  /** Loaded `hnswlib-node` module (or a compatible stand-in). */
  readonly hnswModule: HNSWModule;
  /** Neighbours requested per query (the blocker defaults to 20). */
  readonly topK?: number;
  /** `"euclidean"` selects the `"l2"` space; anything else uses `"cosine"`. */
  readonly metric?: "cosine" | "euclidean";
  /** Index capacity; defaults to `max(2 * embeddings.length, 16)` at build time. */
  readonly maxElements?: number;
  /** HNSW `M` parameter passed to `initIndex` (defaults to 16). */
  readonly M?: number;
  /** HNSW `efConstruction` passed to `initIndex` (defaults to 200). */
  readonly efConstruction?: number;
  /** Value passed to `setEf`; defaults to `max(topK, 50)`. */
  readonly efSearch?: number;
}
73
+
74
/**
 * Contract implemented by both `ANNBlocker` and `HNSWANNBlocker`, so callers
 * can hold either one behind the same type.
 */
export interface ANNBlockerBase {
  /** Number of vectors currently stored in the index. */
  readonly indexSize: number;
  /** Discard any existing index and rebuild it from `embeddings`. */
  buildIndex(embeddings: readonly Float32Array[]): void;
  /** Append one vector and return the position/label it was stored under. */
  addToIndex(embedding: Float32Array): number;
  /** Deduplicated `[lowIdx, highIdx]` neighbour pairs for a batch of queries. */
  query(queryEmbeddings: readonly Float32Array[]): Array<[number, number]>;
  /** Like `query`, with a score appended: `[lowIdx, highIdx, score]`. */
  queryWithScores(
    queryEmbeddings: readonly Float32Array[],
  ): Array<[number, number, number]>;
  /** Neighbours of a single vector as `[neighbourIdx, score]` tuples. */
  queryOne(queryEmbedding: Float32Array): Array<[number, number]>;
}
85
+
86
+ // ---------------------------------------------------------------------------
87
+ // Distance helpers
88
+ // ---------------------------------------------------------------------------
89
+
90
/**
 * Cosine similarity of two vectors.
 *
 * Only the first `min(a.length, b.length)` components take part, so a longer
 * vector is silently truncated to the shorter one. If either compared prefix
 * has zero magnitude the function returns 0 instead of dividing by zero.
 */
export function cosineSim(a: Float32Array, b: Float32Array): number {
  const len = Math.min(a.length, b.length);
  let dotProduct = 0;
  let sumSqA = 0;
  let sumSqB = 0;
  let pos = 0;
  while (pos < len) {
    const x = a[pos]!;
    const y = b[pos]!;
    dotProduct += x * y;
    sumSqA += x * x;
    sumSqB += y * y;
    pos += 1;
  }
  const magnitude = Math.sqrt(sumSqA) * Math.sqrt(sumSqB);
  if (magnitude === 0) return 0;
  return dotProduct / magnitude;
}
105
+
106
/**
 * Euclidean (L2) distance between two vectors.
 *
 * As with `cosineSim`, only the overlapping prefix of the two inputs is
 * compared; extra trailing components of the longer vector are ignored.
 */
export function euclideanDist(a: Float32Array, b: Float32Array): number {
  const len = Math.min(a.length, b.length);
  let sumOfSquares = 0;
  let pos = 0;
  while (pos < len) {
    const delta = a[pos]! - b[pos]!;
    sumOfSquares += delta * delta;
    pos += 1;
  }
  return Math.sqrt(sumOfSquares);
}
115
+
116
+ // ---------------------------------------------------------------------------
117
+ // ANNBlocker
118
+ // ---------------------------------------------------------------------------
119
+
120
/**
 * Brute-force nearest-neighbour blocker.
 *
 * Every query is scored against every stored vector, so the cost grows with
 * `queries x indexSize`. Scores are cosine similarity for the `"cosine"`
 * metric and negated Euclidean distance for `"euclidean"`, so that in both
 * cases a larger score means "closer".
 */
export class ANNBlocker implements ANNBlockerBase {
  private embeddings: Float32Array[] = [];
  private readonly topK: number;
  private readonly metric: "cosine" | "euclidean";

  /**
   * @param options `topK` defaults to 20 and `metric` to `"cosine"`.
   */
  constructor(options: ANNBlockerOptions = {}) {
    this.topK = options.topK ?? 20;
    this.metric = options.metric ?? "cosine";
  }

  /**
   * Replace the index with a fresh set of embeddings. The outer array is
   * copied, but the `Float32Array` elements are kept by reference; self-match
   * detection in `query`/`queryWithScores` relies on that.
   */
  buildIndex(embeddings: readonly Float32Array[]): void {
    this.embeddings = [...embeddings];
  }

  /** Number of vectors currently in the index. */
  get indexSize(): number {
    return this.embeddings.length;
  }

  /** Append a single embedding; returns its position. */
  addToIndex(embedding: Float32Array): number {
    this.embeddings.push(embedding);
    return this.embeddings.length - 1;
  }

  // ──────────────────────────────────────────────────────────
  // Querying
  // ──────────────────────────────────────────────────────────

  /**
   * For each query embedding, return up to topK (queryIdx, indexIdx) pairs,
   * canonicalised so the lower index comes first and deduplicated.
   *
   * A self-match is dropped when query `i` is the very same `Float32Array`
   * object that is stored at index position `i`, which is the case when the
   * array given to `buildIndex` is queried again. For queries that are
   * distinct objects the caller is responsible for filtering self-pairs.
   * The self-match still occupies one of the topK slots before it is dropped.
   */
  query(queryEmbeddings: readonly Float32Array[]): Array<[number, number]> {
    const seen = new Set<number>();
    const out: Array<[number, number]> = [];
    for (let i = 0; i < queryEmbeddings.length; i++) {
      const q = queryEmbeddings[i]!;
      for (const [neighbour] of this.topKFor(q)) {
        if (this.isSelfMatch(q, i, neighbour)) continue;
        const a = Math.min(i, neighbour);
        const b = Math.max(i, neighbour);
        const key = ANNBlocker.pairKey(a, b);
        if (seen.has(key)) continue;
        seen.add(key);
        out.push([a, b]);
      }
    }
    return out;
  }

  /**
   * Same as `query` but also returns the score for each pair. When a pair is
   * reached from both directions the higher score is kept.
   */
  queryWithScores(
    queryEmbeddings: readonly Float32Array[],
  ): Array<[number, number, number]> {
    const best = new Map<number, [number, number, number]>();
    for (let i = 0; i < queryEmbeddings.length; i++) {
      const q = queryEmbeddings[i]!;
      for (const [neighbour, score] of this.topKFor(q)) {
        if (this.isSelfMatch(q, i, neighbour)) continue;
        const a = Math.min(i, neighbour);
        const b = Math.max(i, neighbour);
        const key = ANNBlocker.pairKey(a, b);
        const prev = best.get(key);
        if (!prev || score > prev[2]) {
          best.set(key, [a, b, score]);
        }
      }
    }
    return [...best.values()];
  }

  /**
   * Top-K matches for a single query. Returns (neighborIdx, score), best
   * first. No self-match filtering is applied here.
   */
  queryOne(queryEmbedding: Float32Array): Array<[number, number]> {
    return this.topKFor(queryEmbedding);
  }

  // ──────────────────────────────────────────────────────────
  // Internals
  // ──────────────────────────────────────────────────────────

  /**
   * Numeric dedup key for a canonical `(a, b)` pair. It is collision-free
   * while `b` stays below the multiplier and the product stays within the
   * safe-integer range, which holds for any index this brute-force blocker
   * can realistically serve.
   */
  private static pairKey(a: number, b: number): number {
    return a * 100000003 + b;
  }

  /**
   * True when `neighbour` is query `i` matched against itself. Comparing the
   * element objects (not the outer arrays) is required because `buildIndex`
   * always stores a copy of the caller's array, so an array-identity check
   * could never succeed.
   */
  private isSelfMatch(query: Float32Array, i: number, neighbour: number): boolean {
    return neighbour === i && this.embeddings[neighbour] === query;
  }

  /**
   * Score `query` against the whole index and return the best `topK`
   * `[idx, score]` tuples, highest score first. A `topK` that is zero,
   * negative or not a number yields an empty result; a fractional `topK` is
   * truncated.
   */
  private topKFor(query: Float32Array): Array<[number, number]> {
    const n = this.embeddings.length;
    const k = Math.max(0, Math.min(Math.trunc(this.topK) || 0, n));
    if (k === 0) return [];
    // Score every vector. For 10k×10k that's 100M ops — acceptable for an
    // edge-safe brute-force fallback, but callers should pre-filter for very
    // large datasets.
    const useCosine = this.metric === "cosine";
    const scores = new Array<{ idx: number; score: number }>(n);
    for (let i = 0; i < n; i++) {
      const candidate = this.embeddings[i]!;
      scores[i] = {
        idx: i,
        score: useCosine ? cosineSim(query, candidate) : -euclideanDist(query, candidate),
      };
    }
    // Full descending sort, then keep the head. The sort is stable, so equal
    // scores stay in ascending index order.
    scores.sort((a, b) => b.score - a.score);
    return scores.slice(0, k).map(({ idx, score }): [number, number] => [idx, score]);
  }
}
233
+
234
+ // ---------------------------------------------------------------------------
235
+ // HNSWANNBlocker — optional fast-path backed by `hnswlib-node`.
236
+ //
237
+ // The caller provides the loaded `hnswlib-node` module via `opts.hnswModule`,
238
+ // keeping this file edge-safe (we never import the native module here).
239
+ // ---------------------------------------------------------------------------
240
+
241
/**
 * `ANNBlockerBase` implementation that delegates neighbour search to an
 * HNSW index created from the caller-supplied `hnswModule`.
 *
 * Labels inside the index are the positions of the vectors in the array
 * given to `buildIndex`, continued by `addToIndex`. Batch queries assume the
 * query at position `i` corresponds to label `i` and therefore always drop
 * the neighbour whose label equals the query position.
 */
export class HNSWANNBlocker implements ANNBlockerBase {
  private index: HNSWIndexLike | null = null;
  private count = 0;
  private readonly opts: HNSWOptions;
  private readonly topK: number;
  private readonly metric: "cosine" | "euclidean";

  /** @param opts `topK` defaults to 20 and `metric` to `"cosine"`. */
  constructor(opts: HNSWOptions) {
    this.opts = opts;
    this.topK = opts.topK ?? 20;
    this.metric = opts.metric ?? "cosine";
  }

  /** Number of vectors added so far (0 before `buildIndex`). */
  get indexSize(): number {
    return this.count;
  }

  /**
   * Create a new index and load every embedding into it, labelled by array
   * position. An empty input clears the index instead. The instance state is
   * only swapped once all points were added successfully.
   */
  buildIndex(embeddings: readonly Float32Array[]): void {
    const total = embeddings.length;
    if (total === 0) {
      this.index = null;
      this.count = 0;
      return;
    }

    const space = this.metric === "euclidean" ? "l2" : "cosine";
    const fresh = new this.opts.hnswModule.HierarchicalNSW(space, embeddings[0]!.length);
    fresh.initIndex(
      this.opts.maxElements ?? Math.max(total * 2, 16),
      this.opts.M ?? 16,
      this.opts.efConstruction ?? 200,
      100,
    );
    fresh.setEf(this.opts.efSearch ?? Math.max(this.topK, 50));
    for (const [label, vec] of embeddings.entries()) {
      // The index is fed plain number arrays rather than typed arrays.
      fresh.addPoint(Array.from(vec), label);
    }

    this.index = fresh;
    this.count = total;
  }

  /**
   * Add one vector under the next free label and return that label.
   * @throws Error when no index has been built yet.
   */
  addToIndex(embedding: Float32Array): number {
    const index = this.index;
    if (!index) {
      throw new Error("HNSWANNBlocker.addToIndex called before buildIndex");
    }
    const label = this.count;
    index.addPoint(Array.from(embedding), label);
    this.count = label + 1;
    return label;
  }

  /**
   * Canonical (low, high) neighbour pairs for a batch of queries, each pair
   * reported once. Returns an empty list when nothing has been indexed.
   */
  query(queryEmbeddings: readonly Float32Array[]): Array<[number, number]> {
    const found: Array<[number, number]> = [];
    const index = this.index;
    if (!index || this.count === 0) return found;

    const limit = Math.min(this.topK, this.count);
    const emitted = new Set<number>();
    for (const [qi, vec] of queryEmbeddings.entries()) {
      const { neighbors } = index.searchKnn(Array.from(vec), limit);
      for (const hit of neighbors) {
        if (hit === qi || hit < 0) continue;
        const lo = Math.min(qi, hit);
        const hi = Math.max(qi, hit);
        const key = lo * 100000003 + hi;
        if (emitted.has(key)) continue;
        emitted.add(key);
        found.push([lo, hi]);
      }
    }
    return found;
  }

  /**
   * Like `query`, but each pair carries a similarity score. If a pair is
   * seen from both sides, the larger score wins.
   */
  queryWithScores(
    queryEmbeddings: readonly Float32Array[],
  ): Array<[number, number, number]> {
    const index = this.index;
    if (!index || this.count === 0) return [];

    const limit = Math.min(this.topK, this.count);
    const best = new Map<number, [number, number, number]>();
    for (const [qi, vec] of queryEmbeddings.entries()) {
      const { neighbors, distances } = index.searchKnn(Array.from(vec), limit);
      for (const [pos, hit] of neighbors.entries()) {
        if (hit === qi || hit < 0) continue;
        const score = this.scoreFromDistance(distances[pos]!);
        const lo = Math.min(qi, hit);
        const hi = Math.max(qi, hit);
        const key = lo * 100000003 + hi;
        const current = best.get(key);
        if (!current || score > current[2]) {
          best.set(key, [lo, hi, score]);
        }
      }
    }
    return [...best.values()];
  }

  /**
   * Neighbours of a single vector as `[label, score]` tuples, in the order
   * the index returned them. Nothing is filtered out here.
   */
  queryOne(queryEmbedding: Float32Array): Array<[number, number]> {
    const index = this.index;
    if (!index || this.count === 0) return [];

    const limit = Math.min(this.topK, this.count);
    const { neighbors, distances } = index.searchKnn(Array.from(queryEmbedding), limit);
    return Array.from(neighbors, (hit, pos): [number, number] => [
      hit,
      this.scoreFromDistance(distances[pos]!),
    ]);
  }

  /**
   * Convert an index distance into a similarity. For "cosine" the distance
   * is treated as (1 - cos_sim); for "l2" it is treated as a squared
   * Euclidean distance and mapped through 1 / (1 + d), giving a score
   * bounded in (roughly) [0, 1].
   */
  private scoreFromDistance(d: number): number {
    return this.metric === "euclidean" ? 1 / (1 + d) : 1 - d;
  }
}
357
+
358
+ // ---------------------------------------------------------------------------
359
+ // Factory — auto-loads `hnswlib-node` when requested, falls back to brute-force.
360
+ // ---------------------------------------------------------------------------
361
+
362
/**
 * Options for `createANNBlocker`: the brute-force options plus everything
 * needed to opt into, and tune, the HNSW fast-path.
 */
export interface CreateANNBlockerOptions extends ANNBlockerOptions {
  /** Attempt to use the hnswlib-node fast-path. */
  readonly useHNSW?: boolean;
  /** Pre-loaded hnswlib-node module (skips dynamic import). */
  readonly hnswModule?: HNSWModule;
  /** HNSW index capacity. Ignored when falling back to brute-force. */
  readonly maxElements?: number;
  /** HNSW `M` parameter. Ignored when falling back to brute-force. */
  readonly M?: number;
  /** HNSW `efConstruction` parameter. Ignored when falling back to brute-force. */
  readonly efConstruction?: number;
  /** HNSW search-time `ef`. Ignored when falling back to brute-force. */
  readonly efSearch?: number;
  /**
   * Override the warning sink when the fast-path is unavailable. Defaults to
   * `console.warn`. Tests pass a spy here.
   */
  readonly onFallbackWarning?: (message: string) => void;
}
378
+
379
/**
 * Try to load `hnswlib-node` at runtime. Resolves to `null` when the import
 * fails or the module exposes no `HierarchicalNSW` constructor (looked up on
 * the namespace first, then on its default export).
 */
async function loadHnswModule(): Promise<HNSWModule | null> {
  try {
    // `as string` prevents tsup / bundlers from trying to resolve this
    // optional native module at build time; it stays a runtime dynamic
    // import whose failure is handled by the catch below.
    const loaded = (await import("hnswlib-node" as string)) as unknown as {
      HierarchicalNSW: HNSWModule["HierarchicalNSW"];
      default?: { HierarchicalNSW: HNSWModule["HierarchicalNSW"] };
    };
    const ctor = loaded.HierarchicalNSW ?? loaded.default?.HierarchicalNSW ?? null;
    return ctor ? { HierarchicalNSW: ctor } : null;
  } catch {
    return null;
  }
}

/**
 * Build an ANN blocker.
 *
 * - Without `useHNSW` the brute-force `ANNBlocker` is returned directly.
 * - With `useHNSW`, `options.hnswModule` is used if supplied; otherwise
 *   `hnswlib-node` is imported dynamically.
 * - If no usable module is available (e.g. edge runtime, peer dep not
 *   installed) a single warning is emitted through `onFallbackWarning`
 *   (default `console.warn`) and the brute-force blocker is returned.
 *
 * Only options that are actually defined are forwarded to the constructors.
 */
export async function createANNBlocker(
  options: CreateANNBlockerOptions = {},
): Promise<ANNBlockerBase> {
  const { topK, metric, maxElements, M, efConstruction, efSearch } = options;

  const bruteOptions: ANNBlockerOptions = {
    ...(topK !== undefined ? { topK } : {}),
    ...(metric !== undefined ? { metric } : {}),
  };
  if (!options.useHNSW) return new ANNBlocker(bruteOptions);

  const hnsw = options.hnswModule ?? (await loadHnswModule());
  if (!hnsw) {
    const warn = options.onFallbackWarning ?? ((m: string) => console.warn(m));
    warn("hnswlib-node not installed; falling back to brute-force ANN");
    return new ANNBlocker(bruteOptions);
  }

  return new HNSWANNBlocker({
    hnswModule: hnsw,
    ...(topK !== undefined ? { topK } : {}),
    ...(metric !== undefined ? { metric } : {}),
    ...(maxElements !== undefined ? { maxElements } : {}),
    ...(M !== undefined ? { M } : {}),
    ...(efConstruction !== undefined ? { efConstruction } : {}),
    ...(efSearch !== undefined ? { efSearch } : {}),
  });
}
437
+
438
+ // ---------------------------------------------------------------------------
439
+ // Block builders
440
+ // ---------------------------------------------------------------------------
441
+
442
/**
 * Read the text to embed from `row[col]`. Any non-nullish value is passed
 * through `String()` and trimmed; `null`, `undefined` and values that are
 * blank after trimming all come back as `null`.
 */
function getText(row: Row, col: string): string | null {
  const raw = row[col];
  if (raw == null) return null;
  const trimmed = String(raw).trim();
  return trimmed.length > 0 ? trimmed : null;
}
449
+
450
/**
 * Minimal disjoint-set over the integers `0..n-1`. `find` compresses paths;
 * `union` links the first argument's root under the second's (no ranking).
 */
class UnionFind {
  private parent: number[];

  /** Start with `n` singleton sets, each element being its own parent. */
  constructor(n: number) {
    this.parent = new Array<number>(n);
    for (const slot of this.parent.keys()) this.parent[slot] = slot;
  }

  /** Representative of the set containing `x`. */
  find(x: number): number {
    // Pass 1: climb until an element that is its own parent.
    let root = x;
    for (let up = this.parent[root]!; up !== root; up = this.parent[root]!) {
      root = up;
    }
    // Pass 2: re-point every node on the walked path straight at the root.
    for (let node = x; this.parent[node]! !== root; ) {
      const above = this.parent[node]!;
      this.parent[node] = root;
      node = above;
    }
    return root;
  }

  /** Merge the sets of `a` and `b`; a no-op if they already share a root. */
  union(a: number, b: number): void {
    const rootA = this.find(a);
    const rootB = this.find(b);
    if (rootA === rootB) return;
    this.parent[rootA] = rootB;
  }
}
475
+
476
/**
 * Turn embedding neighbourhoods into candidate blocks.
 *
 * Steps: embed `annColumn` for every row, index all embeddings, query each
 * row for its top-K neighbours, then merge the resulting pairs with
 * Union-Find so that every connected component becomes one micro-block.
 *
 * @param rows       Input rows; fewer than two yields no blocks.
 * @param annColumn  Column whose text is embedded.
 * @param options    `topK` defaults to 20, `maxBlockSize` to 1000; embedder
 *                   settings and `useHNSW` are forwarded only when defined.
 * @returns Blocks keyed `ann_0`, `ann_1`, ... Components with a single row
 *          or with more than `maxBlockSize` rows are left out.
 */
export async function buildANNBlocks(
  rows: readonly Row[],
  annColumn: string,
  options: BuildANNOptions = {},
): Promise<BlockResult[]> {
  if (rows.length < 2) return [];

  const topK = options.topK ?? 20;
  const maxBlockSize = options.maxBlockSize ?? 1000;

  const embedder = getEmbedder({
    ...(options.model !== undefined ? { model: options.model } : {}),
    ...(options.apiKey !== undefined ? { apiKey: options.apiKey } : {}),
    ...(options.provider !== undefined ? { provider: options.provider } : {}),
  });

  // One text (or null) per row, embedded in row order.
  const texts: (string | null)[] = rows.map((row) => getText(row, annColumn));
  const embeddings = await embedder.embedColumn(texts);

  // Index every row, then ask the index for each row's neighbours.
  const blocker = await createANNBlocker({
    topK,
    ...(options.useHNSW !== undefined ? { useHNSW: options.useHNSW } : {}),
  });
  blocker.buildIndex(embeddings);
  const pairs = blocker.query(embeddings);
  if (pairs.length === 0) return [];

  // Connected components over the neighbour pairs.
  const components = new UnionFind(rows.length);
  pairs.forEach(([left, right]) => components.union(left, right));

  // Bucket row positions by component root. Rows that never appeared in a
  // pair end up alone in their bucket and are discarded below.
  const membersByRoot = new Map<number, number[]>();
  rows.forEach((_row, position) => {
    const root = components.find(position);
    const bucket = membersByRoot.get(root);
    if (bucket) {
      bucket.push(position);
    } else {
      membersByRoot.set(root, [position]);
    }
  });

  // Keep multi-row components that are not oversized; number them in order.
  return [...membersByRoot.values()]
    .filter((members) => members.length >= 2 && !(members.length > maxBlockSize))
    .map(
      (members, ordinal): BlockResult => ({
        blockKey: `ann_${ordinal}`,
        rows: members.map((position) => rows[position]!),
        strategy: "ann",
        depth: 0,
      }),
    );
}
541
+
542
/**
 * Variant that returns one BlockResult containing every row plus
 * pre-scored pairs derived from the ANN similarity scores. Useful when the
 * scorer should reuse the embedding-based scores instead of recomputing.
 *
 * Pairs that match a row with itself are removed before they are turned
 * into `ScoredPair`s: a row is always its own nearest neighbour, and a
 * backend that does not filter such hits would otherwise contribute
 * (id, id) pairs to `preScoredPairs`.
 *
 * @param rows       Input rows; fewer than two yields no blocks.
 * @param annColumn  Column whose text is embedded.
 * @param options    `topK` defaults to 20 and `idColumn` to `__row_id__`;
 *                   embedder settings and `useHNSW` are forwarded only when
 *                   defined.
 * @returns A single `ann_pairs_0` block, or an empty array when no pair
 *          between two distinct rows was found.
 */
export async function buildANNPairBlocks(
  rows: readonly Row[],
  annColumn: string,
  options: BuildANNOptions = {},
): Promise<BlockResult[]> {
  if (rows.length < 2) return [];

  const topK = options.topK ?? 20;
  const idColumn = options.idColumn ?? "__row_id__";

  const embedder = getEmbedder({
    ...(options.model !== undefined ? { model: options.model } : {}),
    ...(options.apiKey !== undefined ? { apiKey: options.apiKey } : {}),
    ...(options.provider !== undefined ? { provider: options.provider } : {}),
  });

  const texts: (string | null)[] = rows.map((r) => getText(r, annColumn));
  const embeddings = await embedder.embedColumn(texts);

  const blocker = await createANNBlocker({
    topK,
    ...(options.useHNSW !== undefined ? { useHNSW: options.useHNSW } : {}),
  });
  blocker.buildIndex(embeddings);

  // Drop self-pairs (same position on both sides); only pairs between two
  // distinct rows are meaningful match candidates.
  const scored = blocker
    .queryWithScores(embeddings)
    .filter(([a, b]) => a !== b);
  if (scored.length === 0) return [];

  // Map index positions -> id-column values. A row whose id is missing or not
  // a number falls back to its position in `rows`.
  const rowIdAt = (idx: number): number => {
    const v = rows[idx]?.[idColumn];
    return typeof v === "number" ? v : idx;
  };

  const preScoredPairs: ScoredPair[] = scored.map(([a, b, score]) =>
    makeScoredPair(rowIdAt(a), rowIdAt(b), score),
  );

  return [
    {
      blockKey: "ann_pairs_0",
      rows: rows.slice(),
      strategy: "ann_pairs",
      depth: 0,
      preScoredPairs,
    },
  ];
}