goldenmatch 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162)
  1. package/README.md +140 -0
  2. package/dist/cli.cjs +6079 -0
  3. package/dist/cli.cjs.map +1 -0
  4. package/dist/cli.d.cts +1 -0
  5. package/dist/cli.d.ts +1 -0
  6. package/dist/cli.js +6076 -0
  7. package/dist/cli.js.map +1 -0
  8. package/dist/core/index.cjs +8449 -0
  9. package/dist/core/index.cjs.map +1 -0
  10. package/dist/core/index.d.cts +1972 -0
  11. package/dist/core/index.d.ts +1972 -0
  12. package/dist/core/index.js +8318 -0
  13. package/dist/core/index.js.map +1 -0
  14. package/dist/index.cjs +8449 -0
  15. package/dist/index.cjs.map +1 -0
  16. package/dist/index.d.cts +2 -0
  17. package/dist/index.d.ts +2 -0
  18. package/dist/index.js +8318 -0
  19. package/dist/index.js.map +1 -0
  20. package/dist/node/backends/score-worker.cjs +934 -0
  21. package/dist/node/backends/score-worker.cjs.map +1 -0
  22. package/dist/node/backends/score-worker.d.cts +14 -0
  23. package/dist/node/backends/score-worker.d.ts +14 -0
  24. package/dist/node/backends/score-worker.js +932 -0
  25. package/dist/node/backends/score-worker.js.map +1 -0
  26. package/dist/node/index.cjs +11430 -0
  27. package/dist/node/index.cjs.map +1 -0
  28. package/dist/node/index.d.cts +554 -0
  29. package/dist/node/index.d.ts +554 -0
  30. package/dist/node/index.js +11277 -0
  31. package/dist/node/index.js.map +1 -0
  32. package/dist/types-DhUdX5Rc.d.cts +304 -0
  33. package/dist/types-DhUdX5Rc.d.ts +304 -0
  34. package/examples/01-basic-dedupe.ts +60 -0
  35. package/examples/02-match-two-datasets.ts +48 -0
  36. package/examples/03-csv-file-pipeline.ts +62 -0
  37. package/examples/04-string-scoring.ts +63 -0
  38. package/examples/05-custom-config.ts +94 -0
  39. package/examples/06-probabilistic-fs.ts +72 -0
  40. package/examples/07-pprl-privacy.ts +76 -0
  41. package/examples/08-streaming.ts +79 -0
  42. package/examples/09-llm-scorer.ts +79 -0
  43. package/examples/10-explain.ts +60 -0
  44. package/examples/11-evaluate.ts +61 -0
  45. package/examples/README.md +53 -0
  46. package/package.json +66 -0
  47. package/src/cli.ts +372 -0
  48. package/src/core/ann-blocker.ts +593 -0
  49. package/src/core/api.ts +220 -0
  50. package/src/core/autoconfig.ts +363 -0
  51. package/src/core/autofix.ts +102 -0
  52. package/src/core/blocker.ts +655 -0
  53. package/src/core/cluster.ts +699 -0
  54. package/src/core/compare-clusters.ts +176 -0
  55. package/src/core/config/loader.ts +869 -0
  56. package/src/core/cross-encoder.ts +614 -0
  57. package/src/core/data.ts +430 -0
  58. package/src/core/domain.ts +277 -0
  59. package/src/core/embedder.ts +562 -0
  60. package/src/core/evaluate.ts +156 -0
  61. package/src/core/explain.ts +352 -0
  62. package/src/core/golden.ts +524 -0
  63. package/src/core/graph-er.ts +371 -0
  64. package/src/core/index.ts +314 -0
  65. package/src/core/ingest.ts +112 -0
  66. package/src/core/learned-blocking.ts +305 -0
  67. package/src/core/lineage.ts +221 -0
  68. package/src/core/llm/budget.ts +258 -0
  69. package/src/core/llm/cluster.ts +542 -0
  70. package/src/core/llm/scorer.ts +396 -0
  71. package/src/core/match-one.ts +95 -0
  72. package/src/core/matchkey.ts +97 -0
  73. package/src/core/memory/corrections.ts +179 -0
  74. package/src/core/memory/learner.ts +218 -0
  75. package/src/core/memory/store.ts +114 -0
  76. package/src/core/pipeline.ts +366 -0
  77. package/src/core/pprl/protocol.ts +216 -0
  78. package/src/core/probabilistic.ts +511 -0
  79. package/src/core/profiler.ts +212 -0
  80. package/src/core/quality.ts +197 -0
  81. package/src/core/review-queue.ts +177 -0
  82. package/src/core/scorer.ts +855 -0
  83. package/src/core/sensitivity.ts +196 -0
  84. package/src/core/standardize.ts +279 -0
  85. package/src/core/streaming.ts +128 -0
  86. package/src/core/transforms.ts +599 -0
  87. package/src/core/types.ts +570 -0
  88. package/src/core/validate.ts +243 -0
  89. package/src/index.ts +8 -0
  90. package/src/node/a2a/server.ts +470 -0
  91. package/src/node/api/server.ts +412 -0
  92. package/src/node/backends/duckdb.ts +130 -0
  93. package/src/node/backends/score-worker.ts +41 -0
  94. package/src/node/backends/workers.ts +212 -0
  95. package/src/node/config-file.ts +66 -0
  96. package/src/node/connectors/base.ts +57 -0
  97. package/src/node/connectors/bigquery.ts +61 -0
  98. package/src/node/connectors/databricks.ts +69 -0
  99. package/src/node/connectors/file.ts +350 -0
  100. package/src/node/connectors/hubspot.ts +62 -0
  101. package/src/node/connectors/index.ts +43 -0
  102. package/src/node/connectors/salesforce.ts +93 -0
  103. package/src/node/connectors/snowflake.ts +73 -0
  104. package/src/node/db/postgres.ts +173 -0
  105. package/src/node/db/sync.ts +103 -0
  106. package/src/node/dedupe-file.ts +156 -0
  107. package/src/node/index.ts +89 -0
  108. package/src/node/mcp/server.ts +940 -0
  109. package/src/node/tui/app.ts +756 -0
  110. package/src/node/tui/index.ts +6 -0
  111. package/src/node/tui/widgets.ts +128 -0
  112. package/tests/parity/scorer-ground-truth.test.ts +118 -0
  113. package/tests/smoke.test.ts +46 -0
  114. package/tests/unit/a2a-server.test.ts +175 -0
  115. package/tests/unit/ann-blocker.test.ts +117 -0
  116. package/tests/unit/api-server.test.ts +239 -0
  117. package/tests/unit/api.test.ts +77 -0
  118. package/tests/unit/autoconfig.test.ts +103 -0
  119. package/tests/unit/autofix.test.ts +71 -0
  120. package/tests/unit/blocker.test.ts +164 -0
  121. package/tests/unit/buildBlocksAsync.test.ts +63 -0
  122. package/tests/unit/cluster.test.ts +213 -0
  123. package/tests/unit/compare-clusters.test.ts +42 -0
  124. package/tests/unit/config-loader.test.ts +301 -0
  125. package/tests/unit/connectors-base.test.ts +48 -0
  126. package/tests/unit/cross-encoder-model.test.ts +198 -0
  127. package/tests/unit/cross-encoder.test.ts +173 -0
  128. package/tests/unit/db-connectors.test.ts +37 -0
  129. package/tests/unit/domain.test.ts +80 -0
  130. package/tests/unit/embedder.test.ts +151 -0
  131. package/tests/unit/evaluate.test.ts +85 -0
  132. package/tests/unit/explain.test.ts +73 -0
  133. package/tests/unit/golden.test.ts +97 -0
  134. package/tests/unit/graph-er.test.ts +173 -0
  135. package/tests/unit/hnsw-ann.test.ts +283 -0
  136. package/tests/unit/hubspot-connector.test.ts +118 -0
  137. package/tests/unit/ingest.test.ts +97 -0
  138. package/tests/unit/learned-blocking.test.ts +134 -0
  139. package/tests/unit/lineage.test.ts +135 -0
  140. package/tests/unit/match-one.test.ts +129 -0
  141. package/tests/unit/matchkey.test.ts +97 -0
  142. package/tests/unit/mcp-server.test.ts +183 -0
  143. package/tests/unit/memory.test.ts +119 -0
  144. package/tests/unit/pipeline.test.ts +118 -0
  145. package/tests/unit/pprl-protocol.test.ts +381 -0
  146. package/tests/unit/probabilistic.test.ts +494 -0
  147. package/tests/unit/profiler.test.ts +68 -0
  148. package/tests/unit/review-queue.test.ts +68 -0
  149. package/tests/unit/salesforce-connector.test.ts +148 -0
  150. package/tests/unit/scorer.test.ts +301 -0
  151. package/tests/unit/sensitivity.test.ts +154 -0
  152. package/tests/unit/standardize.test.ts +84 -0
  153. package/tests/unit/streaming.test.ts +82 -0
  154. package/tests/unit/transforms.test.ts +208 -0
  155. package/tests/unit/tui-widgets.test.ts +42 -0
  156. package/tests/unit/tui.test.ts +24 -0
  157. package/tests/unit/validate.test.ts +145 -0
  158. package/tests/unit/workers-parallel.test.ts +99 -0
  159. package/tests/unit/workers.test.ts +74 -0
  160. package/tsconfig.json +25 -0
  161. package/tsup.config.ts +37 -0
  162. package/vitest.config.ts +11 -0
@@ -0,0 +1,699 @@
1
+ /**
2
+ * cluster.ts — Union-Find clustering with MST splitting.
3
+ * Edge-safe: no Node.js imports, pure TypeScript only.
4
+ */
5
+
6
+ import type { ClusterInfo, PairKey } from "./types.js";
7
+
8
+ // ---------------------------------------------------------------------------
9
+ // Helpers
10
+ // ---------------------------------------------------------------------------
11
+
/**
 * Build the canonical key for an unordered pair of record ids.
 *
 * The smaller id is always written first, so `pairKey(a, b)` and
 * `pairKey(b, a)` produce the same `"lo:hi"` string. This function is the
 * sole producer of the branded `PairKey` type.
 */
export function pairKey(a: number, b: number): PairKey {
  if (a < b) {
    return `${a}:${b}` as PairKey;
  }
  return `${b}:${a}` as PairKey;
}
18
+
/**
 * Split a pair key into its two numeric ids.
 *
 * The key is cut at the first `":"`; both halves are converted with
 * `Number()` and returned in the order they appear in the key.
 */
export function parsePairKey(key: PairKey): readonly [number, number] {
  const sep = key.indexOf(":");
  const left = key.slice(0, sep);
  const right = key.slice(sep + 1);
  return [Number(left), Number(right)];
}
24
+
25
+ // ---------------------------------------------------------------------------
26
+ // UnionFind
27
+ // ---------------------------------------------------------------------------
28
+
/**
 * Disjoint-set (union-find) structure over numeric ids.
 *
 * Ids can be registered up front with `add` / `addMany`, or lazily: the first
 * time an unknown id is passed to `find` or `union` it is registered as its
 * own singleton set. Without lazy registration an unknown id would resolve to
 * `undefined` and `union` would store that `undefined` in the parent map,
 * corrupting every later lookup.
 */
export class UnionFind {
  /** Parent pointer per registered id; a root points at itself. */
  private readonly parent = new Map<number, number>();
  /** Rank per id, used by `union` to decide which root absorbs the other. */
  private readonly rank = new Map<number, number>();

  /** Register `x` as its own root. No-op when `x` is already known. */
  add(x: number): void {
    if (!this.parent.has(x)) {
      this.parent.set(x, x);
      this.rank.set(x, 0);
    }
  }

  /** Register every id in `ids`; already-known ids are left untouched. */
  addMany(ids: readonly number[]): void {
    for (const x of ids) {
      this.add(x);
    }
  }

  /**
   * Return the root of the set containing `x`, compressing the path walked.
   * An id that was never registered is added as a singleton and returned.
   */
  find(x: number): number {
    let up = this.parent.get(x);
    if (up === undefined) {
      this.add(x);
      return x;
    }

    // Walk parent pointers until reaching the self-parented root.
    let root = x;
    while (up !== root) {
      root = up;
      up = this.parent.get(root) ?? root;
    }

    // Path compression: point every node on the walked path at the root.
    let current = x;
    while (current !== root) {
      const next = this.parent.get(current) ?? root;
      this.parent.set(current, root);
      current = next;
    }
    return root;
  }

  /** Merge the sets containing `a` and `b` (union by rank). */
  union(a: number, b: number): void {
    let ra = this.find(a);
    let rb = this.find(b);
    if (ra === rb) return;

    const rankA = this.rank.get(ra) ?? 0;
    const rankB = this.rank.get(rb) ?? 0;
    if (rankA < rankB) {
      // Keep `ra` as the root with the higher (or equal) rank.
      [ra, rb] = [rb, ra];
    }
    this.parent.set(rb, ra);
    if (rankA === rankB) {
      // Equal ranks: no swap happened, so `ra` still has rank `rankA`.
      this.rank.set(ra, rankA + 1);
    }
  }

  /** Return every set as a `Set` of its member ids, one entry per root. */
  getClusters(): Set<number>[] {
    const groups = new Map<number, Set<number>>();
    for (const x of this.parent.keys()) {
      const root = this.find(x);
      let group = groups.get(root);
      if (group === undefined) {
        group = new Set<number>();
        groups.set(root, group);
      }
      group.add(x);
    }
    return Array.from(groups.values());
  }
}
98
+
99
+ // ---------------------------------------------------------------------------
100
+ // MST (max-weight spanning tree via Kruskal)
101
+ // ---------------------------------------------------------------------------
102
+
/**
 * Build a max-weight spanning tree (forest, if the graph is disconnected)
 * over `members` using Kruskal's algorithm.
 *
 * Only edges whose two endpoints are both in `members` are considered; pair
 * scores that mention other ids are ignored so they can neither enter the
 * tree nor disturb the union-find bookkeeping.
 *
 * @param members    Ids of the nodes to span.
 * @param pairScores Edge weights keyed by canonical pair key.
 * @returns Accepted edges as `[idA, idB, score]`, in descending score order.
 */
export function buildMst(
  members: readonly number[],
  pairScores: ReadonlyMap<PairKey, number>,
): [number, number, number][] {
  const memberSet = new Set(members);

  // Collect in-cluster edges and sort them by score, highest first.
  const edges: [number, number, number][] = [];
  for (const [key, score] of pairScores) {
    const [a, b] = parsePairKey(key);
    if (memberSet.has(a) && memberSet.has(b)) {
      edges.push([a, b, score]);
    }
  }
  edges.sort((x, y) => y[2] - x[2]);

  const mst: [number, number, number][] = [];
  // A spanning tree over n distinct nodes has n - 1 edges.
  const target = memberSet.size - 1;
  if (target <= 0) return mst;

  const uf = new UnionFind();
  uf.addMany(members);

  for (const [a, b, s] of edges) {
    if (uf.find(a) !== uf.find(b)) {
      uf.union(a, b);
      mst.push([a, b, s]);
      if (mst.length === target) break;
    }
  }
  return mst;
}
133
+
134
+ // ---------------------------------------------------------------------------
135
+ // Cluster confidence
136
+ // ---------------------------------------------------------------------------
137
+
/** Confidence metrics produced by `computeClusterConfidence`. */
export interface ClusterConfidence {
  /** Lowest pair score seen, or `null` when no edge statistics were computed. */
  readonly minEdge: number | null;
  /** Mean of all pair scores, or `null` when no edge statistics were computed. */
  readonly avgEdge: number | null;
  /** Number of scored pairs divided by the number of possible pairs. */
  readonly connectivity: number;
  /** Ids of the pair holding the lowest score, or `null` if there is none. */
  readonly bottleneckPair: readonly [number, number] | null;
  /** Weighted blend of `minEdge`, `avgEdge` and `connectivity`. */
  readonly confidence: number;
}
145
+
/**
 * Derive confidence metrics for one cluster from its pair scores.
 *
 * confidence = 0.4 * minEdge + 0.3 * avgEdge + 0.3 * connectivity
 *
 * A cluster of size <= 1 is reported with connectivity and confidence 1.0;
 * a larger cluster with no pair scores is reported with both at 0.0. In
 * either case the edge statistics and the bottleneck pair are `null`.
 *
 * @param pairScores Scores of the pairs inside the cluster.
 * @param size       Number of members in the cluster.
 */
export function computeClusterConfidence(
  pairScores: ReadonlyMap<PairKey, number>,
  size: number,
): ClusterConfidence {
  const isTrivial = size <= 1;
  if (isTrivial || pairScores.size === 0) {
    const flat = isTrivial ? 1.0 : 0.0;
    return {
      minEdge: null,
      avgEdge: null,
      connectivity: flat,
      bottleneckPair: null,
      confidence: flat,
    };
  }

  // Single pass: running total plus the first strictly-lowest edge.
  let total = 0;
  let weakest = Infinity;
  let weakestKey: PairKey | null = null;
  for (const [key, score] of pairScores) {
    total += score;
    if (score < weakest) {
      weakest = score;
      weakestKey = key;
    }
  }

  const edgeCount = pairScores.size;
  const mean = total / edgeCount;

  // Fraction of the size*(size-1)/2 possible pairs that actually have a score.
  const possiblePairs = (size * (size - 1)) / 2;
  let connectivity = 0.0;
  if (possiblePairs > 0) {
    connectivity = edgeCount / possiblePairs;
  }

  let bottleneckPair: readonly [number, number] | null = null;
  if (weakestKey) {
    bottleneckPair = parsePairKey(weakestKey);
  }

  return {
    minEdge: weakest,
    avgEdge: mean,
    connectivity,
    bottleneckPair,
    confidence: 0.4 * weakest + 0.3 * mean + 0.3 * connectivity,
  };
}
189
+
190
+ // ---------------------------------------------------------------------------
191
+ // Split oversized cluster
192
+ // ---------------------------------------------------------------------------
193
+
/**
 * Working representation of a cluster while it is still being assembled.
 * Same fields as the published cluster info, but writable, plus a temporary
 * split marker.
 */
interface MutableClusterInfo {
  /** Member ids, kept sorted ascending by the code that builds them. */
  members: number[];
  /** Number of members. */
  size: number;
  /** Whether the cluster exceeds the configured maximum size. */
  oversized: boolean;
  /** Scores of the pairs inside this cluster, keyed by canonical pair key. */
  pairScores: Map<PairKey, number>;
  /** Confidence value for the cluster. */
  confidence: number;
  /** Ids of the lowest-scoring pair, or `null` when there is none. */
  bottleneckPair: readonly [number, number] | null;
  /** Quality label assigned to the cluster. */
  clusterQuality: "strong" | "weak" | "split";
  /** Scratch flag: set on sub-clusters produced by a split, removed before publishing. */
  _wasSplit?: boolean;
}
205
+
/**
 * Cut a cluster in two by dropping the weakest edge of its max-weight
 * spanning tree.
 *
 * When there is nothing to cut (at most one member, no pair scores, or an
 * empty spanning tree) the cluster is returned unchanged as a single entry
 * with confidence 1.0.
 *
 * @param members    Ids in the cluster.
 * @param pairScores Scores of the pairs inside the cluster.
 * @returns One info per resulting sub-cluster, each with its own pair scores
 *          and recomputed confidence.
 */
export function splitOversizedCluster(
  members: readonly number[],
  pairScores: ReadonlyMap<PairKey, number>,
): MutableClusterInfo[] {
  // Shared result for every "cannot split" situation.
  const unsplit = (): MutableClusterInfo[] => [
    {
      members: [...members].sort((a, b) => a - b),
      size: members.length,
      oversized: false,
      pairScores: new Map(pairScores),
      confidence: 1.0,
      bottleneckPair: null,
      clusterQuality: "strong",
    },
  ];

  if (members.length <= 1 || pairScores.size === 0) {
    return unsplit();
  }

  const tree = buildMst(members, pairScores);
  if (tree.length === 0) {
    return unsplit();
  }

  // Index of the first edge holding the strictly lowest score.
  let cutAt = 0;
  tree.forEach((edge, i) => {
    if (edge[2] < tree[cutAt]![2]) {
      cutAt = i;
    }
  });

  // Re-link every tree edge except the one being cut.
  const components = new UnionFind();
  components.addMany(members);
  tree.forEach(([a, b], i) => {
    if (i !== cutAt) {
      components.union(a, b);
    }
  });

  return components.getClusters().map((group): MutableClusterInfo => {
    const sorted = Array.from(group).sort((x, y) => x - y);

    // Keep only the scores whose two endpoints both landed in this group.
    const inner = new Map<PairKey, number>();
    for (const [key, score] of pairScores) {
      const [a, b] = parsePairKey(key);
      if (group.has(a) && group.has(b)) {
        inner.set(key, score);
      }
    }

    const { confidence, bottleneckPair } = computeClusterConfidence(
      inner,
      sorted.length,
    );
    return {
      members: sorted,
      size: sorted.length,
      oversized: false,
      pairScores: inner,
      confidence,
      bottleneckPair,
      clusterQuality: "strong",
    };
  });
}
285
+
286
+ // ---------------------------------------------------------------------------
287
+ // buildClusters options
288
+ // ---------------------------------------------------------------------------
289
+
/** Optional tuning knobs accepted by `buildClusters`. */
export interface BuildClustersOptions {
  /** Clusters with more members than this are flagged oversized (default 100). */
  readonly maxClusterSize?: number;
  /** A cluster is "weak" when avg score minus min score exceeds this (default 0.3). */
  readonly weakClusterThreshold?: number;
  /** Whether oversized clusters are split automatically (default true). */
  readonly autoSplit?: boolean;
}
295
+
296
+ // ---------------------------------------------------------------------------
297
+ // buildClusters
298
+ // ---------------------------------------------------------------------------
299
+
/**
 * Build clusters from scored pairs using Union-Find.
 *
 * Steps:
 *  1. Union every pair; ids in `allIds` that appear in no pair stay singletons.
 *     Pair endpoints missing from `allIds` are registered too, so every pair
 *     always has a cluster to attach its score to.
 *  2. Number clusters 1..n by ascending smallest member (deterministic ids).
 *  3. Attach each pair score to its cluster and compute confidence.
 *  4. If `autoSplit` is on, repeatedly split clusters larger than
 *     `maxClusterSize` at their weakest spanning-tree edge (iterative, not
 *     recursive). A cluster that cannot be split any further is left in
 *     place, still flagged `oversized`, instead of being retried forever.
 *  5. Label quality: "split" for split products, "weak" when
 *     avg - min score > `weakClusterThreshold` (confidence is then multiplied
 *     by 0.7), otherwise "strong".
 *
 * @param pairs   Scored pairs as `[idA, idB, score]`.
 * @param allIds  Every record id that must appear in the output.
 * @param options Optional limits; defaults are maxClusterSize 100,
 *                weakClusterThreshold 0.3, autoSplit true.
 * @returns Map from cluster id to cluster info.
 */
export function buildClusters(
  pairs: readonly (readonly [number, number, number])[],
  allIds: readonly number[],
  options?: BuildClustersOptions,
): Map<number, ClusterInfo> {
  const maxClusterSize = options?.maxClusterSize ?? 100;
  const weakClusterThreshold = options?.weakClusterThreshold ?? 0.3;
  const autoSplit = options?.autoSplit ?? true;

  // Build Union-Find from pairs.
  const uf = new UnionFind();
  uf.addMany(allIds);
  for (const [idA, idB] of pairs) {
    // Register endpoints explicitly so an id absent from `allIds` cannot
    // leave a dangling entry in the union-find.
    uf.add(idA);
    uf.add(idB);
    uf.union(idA, idB);
  }

  // Order clusters by their smallest member for deterministic ids. The
  // minimum is computed once per cluster with a plain loop:
  // Math.min(...set) crashes on Sets with >65K elements.
  const minOf = (s: Set<number>): number => {
    let m = Infinity;
    for (const v of s) if (v < m) m = v;
    return m;
  };
  const ordered = uf
    .getClusters()
    .map((set) => ({ set, min: minOf(set) }))
    .sort((a, b) => a.min - b.min);

  // Assign cluster ids 1..n and create the mutable records.
  const memberToCid = new Map<number, number>();
  const result = new Map<number, MutableClusterInfo>();
  ordered.forEach(({ set }, i) => {
    const cid = i + 1;
    const memberArr = [...set].sort((a, b) => a - b);
    for (const m of memberArr) {
      memberToCid.set(m, cid);
    }
    result.set(cid, {
      members: memberArr,
      size: memberArr.length,
      oversized: memberArr.length > maxClusterSize,
      pairScores: new Map(),
      confidence: 0,
      bottleneckPair: null,
      clusterQuality: "strong",
    });
  });

  // Assign pair scores to clusters (canonicalized keys). Both endpoints of a
  // pair share a cluster because they were unioned above.
  for (const [idA, idB, score] of pairs) {
    const cid = memberToCid.get(idA);
    const info = cid === undefined ? undefined : result.get(cid);
    if (info !== undefined) {
      info.pairScores.set(pairKey(idA, idB), score);
    }
  }

  // Compute initial confidence.
  for (const cinfo of result.values()) {
    const conf = computeClusterConfidence(cinfo.pairScores, cinfo.size);
    cinfo.confidence = conf.confidence;
    cinfo.bottleneckPair = conf.bottleneckPair;
  }

  // Auto-split oversized clusters (iterative work list).
  if (autoSplit) {
    const toSplit: number[] = [];
    for (const [cid, c] of result) {
      if (c.oversized) toSplit.push(cid);
    }

    while (toSplit.length > 0) {
      const cid = toSplit.pop()!;
      const cinfo = result.get(cid)!;

      const subClusters = splitOversizedCluster(
        cinfo.members,
        cinfo.pairScores,
      );
      if (subClusters.length <= 1) {
        // No progress (e.g. a singleton with maxClusterSize < 1). Keep the
        // cluster as it is; re-queueing it would loop forever.
        continue;
      }

      result.delete(cid);
      let nextCid = _nextCid(result);
      for (const sc of subClusters) {
        sc.oversized = sc.size > maxClusterSize;
        sc._wasSplit = true;
        result.set(nextCid, sc);
        if (sc.oversized) {
          toSplit.push(nextCid);
        }
        nextCid++;
      }
    }
  }

  // Assign cluster_quality and apply the confidence downgrade.
  for (const cinfo of result.values()) {
    if (cinfo._wasSplit) {
      cinfo.clusterQuality = "split";
    } else if (cinfo.size > 1 && cinfo.pairScores.size > 0) {
      let minE = Infinity;
      let sumE = 0;
      for (const s of cinfo.pairScores.values()) {
        if (s < minE) minE = s;
        sumE += s;
      }
      const avgE = sumE / cinfo.pairScores.size;
      if (avgE - minE > weakClusterThreshold) {
        cinfo.clusterQuality = "weak";
        cinfo.confidence *= 0.7;
      } else {
        cinfo.clusterQuality = "strong";
      }
    } else {
      cinfo.clusterQuality = "strong";
    }
    delete cinfo._wasSplit;
  }

  // Copy into the public ClusterInfo shape (drops the scratch flag).
  const frozen = new Map<number, ClusterInfo>();
  for (const [cid, c] of result) {
    frozen.set(cid, {
      members: c.members,
      size: c.size,
      oversized: c.oversized,
      pairScores: c.pairScores,
      confidence: c.confidence,
      bottleneckPair: c.bottleneckPair,
      clusterQuality: c.clusterQuality,
    });
  }
  return frozen;
}
447
+
448
+ // ---------------------------------------------------------------------------
449
+ // addToCluster
450
+ // ---------------------------------------------------------------------------
451
+
/**
 * Add a new record to existing clusters based on matches.
 *
 * - No matches, or no match points at a clustered record: new singleton cluster
 * - Matches fall in a single cluster: the record joins that cluster
 * - Matches span several clusters: those clusters and the record are merged
 *   into one new cluster
 *
 * Matches whose id belongs to no existing cluster are ignored in every
 * branch, so a cluster's `pairScores` only ever references its own members.
 *
 * Flags oversized but does NOT auto-split. Caller should call
 * splitOversizedCluster() if desired.
 *
 * @param recordId       Id of the record being added.
 * @param matches        `[matchedId, score]` pairs for the new record.
 * @param clusters       Existing clusters; mutated in place.
 * @param maxClusterSize Size above which the resulting cluster is flagged oversized.
 * @returns The same `clusters` map, after mutation.
 */
export function addToCluster(
  recordId: number,
  matches: readonly (readonly [number, number])[],
  clusters: Map<number, ClusterInfo>,
  maxClusterSize = 100,
): Map<number, ClusterInfo> {
  // Register the record as a brand-new one-member cluster.
  const addSingleton = (): Map<number, ClusterInfo> => {
    clusters.set(_nextCid(clusters), {
      members: [recordId],
      size: 1,
      oversized: false,
      pairScores: new Map(),
      confidence: 1.0,
      bottleneckPair: null,
      clusterQuality: "strong",
    });
    return clusters;
  };

  if (matches.length === 0) {
    return addSingleton();
  }

  // Map members to cluster IDs.
  const memberToCid = new Map<number, number>();
  for (const [cid, cinfo] of clusters) {
    for (const m of cinfo.members) {
      memberToCid.set(m, cid);
    }
  }

  // Keep only matches that land in an existing cluster, and note which ones.
  const knownMatches: (readonly [number, number])[] = [];
  const matchedCids = new Set<number>();
  for (const match of matches) {
    const cid = memberToCid.get(match[0]);
    if (cid !== undefined) {
      matchedCids.add(cid);
      knownMatches.push(match);
    }
  }

  if (matchedCids.size === 0) {
    return addSingleton();
  }

  if (matchedCids.size === 1) {
    // Join the one matched cluster, keeping its id and quality label.
    const cid = matchedCids.values().next().value!;
    const old = clusters.get(cid)!;
    const newPairs = new Map(old.pairScores);
    for (const [matchedId, score] of knownMatches) {
      newPairs.set(pairKey(recordId, matchedId), score);
    }

    const newMembers = [...old.members, recordId].sort((a, b) => a - b);
    const newSize = newMembers.length;
    const conf = computeClusterConfidence(newPairs, newSize);

    clusters.set(cid, {
      members: newMembers,
      size: newSize,
      oversized: newSize > maxClusterSize,
      pairScores: newPairs,
      confidence: conf.confidence,
      bottleneckPair: conf.bottleneckPair,
      clusterQuality: old.clusterQuality,
    });
    return clusters;
  }

  // Multiple clusters: merge all of them, plus the new record, into one.
  const mergedMembers: number[] = [recordId];
  const mergedPairs = new Map<PairKey, number>();

  for (const cid of matchedCids) {
    const cinfo = clusters.get(cid)!;
    mergedMembers.push(...cinfo.members);
    for (const [k, v] of cinfo.pairScores) {
      mergedPairs.set(k, v);
    }
    clusters.delete(cid);
  }

  // Only matches to clustered records become edges: an edge to an id that is
  // not a member would skew connectivity and break later re-clustering.
  for (const [matchedId, score] of knownMatches) {
    mergedPairs.set(pairKey(recordId, matchedId), score);
  }

  mergedMembers.sort((a, b) => a - b);
  const size = mergedMembers.length;
  const conf = computeClusterConfidence(mergedPairs, size);

  clusters.set(_nextCid(clusters), {
    members: mergedMembers,
    size,
    oversized: size > maxClusterSize,
    pairScores: mergedPairs,
    confidence: conf.confidence,
    bottleneckPair: conf.bottleneckPair,
    clusterQuality: "strong",
  });

  return clusters;
}
565
+
566
+ // ---------------------------------------------------------------------------
567
+ // unmergeRecord
568
+ // ---------------------------------------------------------------------------
569
+
/**
 * Pull one record out of its cluster and re-cluster what is left.
 *
 * The record becomes a singleton cluster. The remaining members are
 * re-clustered from the original cluster's pair scores, keeping only pairs
 * that do not touch the removed record and whose score is at least
 * `threshold`. Nothing changes if the record is not found or is already
 * alone in its cluster.
 *
 * @param recordId  Id of the record to detach.
 * @param clusters  Existing clusters; mutated in place.
 * @param threshold Minimum score for a remaining pair to be kept.
 * @returns The same `clusters` map, after mutation.
 */
export function unmergeRecord(
  recordId: number,
  clusters: Map<number, ClusterInfo>,
  threshold = 0.0,
): Map<number, ClusterInfo> {
  // Locate the first cluster that lists this record.
  let source: { cid: number; info: ClusterInfo } | undefined;
  for (const [cid, info] of clusters) {
    if (info.members.includes(recordId)) {
      source = { cid, info };
      break;
    }
  }

  // Unknown record, or already a singleton: nothing to do.
  if (source === undefined || source.info.size <= 1) {
    return clusters;
  }
  const { cid: sourceCid, info: sourceInfo } = source;

  // Members and pairs that survive the removal.
  const survivors = sourceInfo.members.filter((m) => m !== recordId);
  const keptPairs: [number, number, number][] = [];
  for (const [key, score] of sourceInfo.pairScores) {
    const [a, b] = parsePairKey(key);
    const touchesRemoved = a === recordId || b === recordId;
    if (!touchesRemoved && score >= threshold) {
      keptPairs.push([a, b, score]);
    }
  }

  // Re-cluster first, then replace the original cluster.
  const regrouped = buildClusters(keptPairs, survivors);
  clusters.delete(sourceCid);

  // Fresh ids: the detached record first, then each regrouped cluster.
  let cid = _nextCid(clusters);
  clusters.set(cid, {
    members: [recordId],
    size: 1,
    oversized: false,
    pairScores: new Map(),
    confidence: 1.0,
    bottleneckPair: null,
    clusterQuality: "strong",
  });
  cid += 1;

  for (const info of regrouped.values()) {
    clusters.set(cid, info);
    cid += 1;
  }

  return clusters;
}
631
+
632
+ // ---------------------------------------------------------------------------
633
+ // unmergeCluster
634
+ // ---------------------------------------------------------------------------
635
+
/**
 * Break one cluster apart so that every member stands alone.
 *
 * The cluster is removed and each former member is re-inserted as a singleton
 * under a fresh id; the cluster's pair scores are dropped. An unknown
 * `clusterId` leaves the map untouched.
 *
 * @param clusterId Id of the cluster to shatter.
 * @param clusters  Existing clusters; mutated in place.
 * @returns The same `clusters` map, after mutation.
 */
export function unmergeCluster(
  clusterId: number,
  clusters: Map<number, ClusterInfo>,
): Map<number, ClusterInfo> {
  const target = clusters.get(clusterId);
  if (!target) {
    return clusters;
  }

  const formerMembers = target.members;
  clusters.delete(clusterId);

  // Singleton ids are consecutive, starting just above the highest id left.
  const firstCid = _nextCid(clusters);
  for (const [offset, memberId] of formerMembers.entries()) {
    clusters.set(firstCid + offset, {
      members: [memberId],
      size: 1,
      oversized: false,
      pairScores: new Map(),
      confidence: 1.0,
      bottleneckPair: null,
      clusterQuality: "strong",
    });
  }

  return clusters;
}
666
+
667
+ // ---------------------------------------------------------------------------
668
+ // getClusterPairScores
669
+ // ---------------------------------------------------------------------------
670
+
/**
 * Collect the scores of all pairs whose two ids both belong to `members`.
 * Scans every entry of `allPairs`, so call it on demand rather than in a hot
 * path.
 *
 * @param members  Ids of the cluster of interest.
 * @param allPairs Every scored pair as `[idA, idB, score]`.
 * @returns Scores keyed by canonical pair key.
 */
export function getClusterPairScores(
  members: readonly number[],
  allPairs: readonly (readonly [number, number, number])[],
): Map<PairKey, number> {
  const wanted = new Set(members);
  const scores = new Map<PairKey, number>();
  for (const [a, b, s] of allPairs) {
    // Skip pairs with an endpoint outside the cluster.
    if (!wanted.has(a) || !wanted.has(b)) {
      continue;
    }
    scores.set(pairKey(a, b), s);
  }
  return scores;
}
688
+
689
+ // ---------------------------------------------------------------------------
690
+ // Internal helpers
691
+ // ---------------------------------------------------------------------------
692
+
/**
 * Next free cluster id: one more than the largest key in `clusters`, or 1
 * when the map is empty or holds no positive key.
 */
function _nextCid(clusters: ReadonlyMap<number, unknown>): number {
  let highest = 0;
  clusters.forEach((_value, cid) => {
    if (cid > highest) {
      highest = cid;
    }
  });
  return highest + 1;
}