goldenmatch 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +140 -0
- package/dist/cli.cjs +6079 -0
- package/dist/cli.cjs.map +1 -0
- package/dist/cli.d.cts +1 -0
- package/dist/cli.d.ts +1 -0
- package/dist/cli.js +6076 -0
- package/dist/cli.js.map +1 -0
- package/dist/core/index.cjs +8449 -0
- package/dist/core/index.cjs.map +1 -0
- package/dist/core/index.d.cts +1972 -0
- package/dist/core/index.d.ts +1972 -0
- package/dist/core/index.js +8318 -0
- package/dist/core/index.js.map +1 -0
- package/dist/index.cjs +8449 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +2 -0
- package/dist/index.d.ts +2 -0
- package/dist/index.js +8318 -0
- package/dist/index.js.map +1 -0
- package/dist/node/backends/score-worker.cjs +934 -0
- package/dist/node/backends/score-worker.cjs.map +1 -0
- package/dist/node/backends/score-worker.d.cts +14 -0
- package/dist/node/backends/score-worker.d.ts +14 -0
- package/dist/node/backends/score-worker.js +932 -0
- package/dist/node/backends/score-worker.js.map +1 -0
- package/dist/node/index.cjs +11430 -0
- package/dist/node/index.cjs.map +1 -0
- package/dist/node/index.d.cts +554 -0
- package/dist/node/index.d.ts +554 -0
- package/dist/node/index.js +11277 -0
- package/dist/node/index.js.map +1 -0
- package/dist/types-DhUdX5Rc.d.cts +304 -0
- package/dist/types-DhUdX5Rc.d.ts +304 -0
- package/examples/01-basic-dedupe.ts +60 -0
- package/examples/02-match-two-datasets.ts +48 -0
- package/examples/03-csv-file-pipeline.ts +62 -0
- package/examples/04-string-scoring.ts +63 -0
- package/examples/05-custom-config.ts +94 -0
- package/examples/06-probabilistic-fs.ts +72 -0
- package/examples/07-pprl-privacy.ts +76 -0
- package/examples/08-streaming.ts +79 -0
- package/examples/09-llm-scorer.ts +79 -0
- package/examples/10-explain.ts +60 -0
- package/examples/11-evaluate.ts +61 -0
- package/examples/README.md +53 -0
- package/package.json +66 -0
- package/src/cli.ts +372 -0
- package/src/core/ann-blocker.ts +593 -0
- package/src/core/api.ts +220 -0
- package/src/core/autoconfig.ts +363 -0
- package/src/core/autofix.ts +102 -0
- package/src/core/blocker.ts +655 -0
- package/src/core/cluster.ts +699 -0
- package/src/core/compare-clusters.ts +176 -0
- package/src/core/config/loader.ts +869 -0
- package/src/core/cross-encoder.ts +614 -0
- package/src/core/data.ts +430 -0
- package/src/core/domain.ts +277 -0
- package/src/core/embedder.ts +562 -0
- package/src/core/evaluate.ts +156 -0
- package/src/core/explain.ts +352 -0
- package/src/core/golden.ts +524 -0
- package/src/core/graph-er.ts +371 -0
- package/src/core/index.ts +314 -0
- package/src/core/ingest.ts +112 -0
- package/src/core/learned-blocking.ts +305 -0
- package/src/core/lineage.ts +221 -0
- package/src/core/llm/budget.ts +258 -0
- package/src/core/llm/cluster.ts +542 -0
- package/src/core/llm/scorer.ts +396 -0
- package/src/core/match-one.ts +95 -0
- package/src/core/matchkey.ts +97 -0
- package/src/core/memory/corrections.ts +179 -0
- package/src/core/memory/learner.ts +218 -0
- package/src/core/memory/store.ts +114 -0
- package/src/core/pipeline.ts +366 -0
- package/src/core/pprl/protocol.ts +216 -0
- package/src/core/probabilistic.ts +511 -0
- package/src/core/profiler.ts +212 -0
- package/src/core/quality.ts +197 -0
- package/src/core/review-queue.ts +177 -0
- package/src/core/scorer.ts +855 -0
- package/src/core/sensitivity.ts +196 -0
- package/src/core/standardize.ts +279 -0
- package/src/core/streaming.ts +128 -0
- package/src/core/transforms.ts +599 -0
- package/src/core/types.ts +570 -0
- package/src/core/validate.ts +243 -0
- package/src/index.ts +8 -0
- package/src/node/a2a/server.ts +470 -0
- package/src/node/api/server.ts +412 -0
- package/src/node/backends/duckdb.ts +130 -0
- package/src/node/backends/score-worker.ts +41 -0
- package/src/node/backends/workers.ts +212 -0
- package/src/node/config-file.ts +66 -0
- package/src/node/connectors/base.ts +57 -0
- package/src/node/connectors/bigquery.ts +61 -0
- package/src/node/connectors/databricks.ts +69 -0
- package/src/node/connectors/file.ts +350 -0
- package/src/node/connectors/hubspot.ts +62 -0
- package/src/node/connectors/index.ts +43 -0
- package/src/node/connectors/salesforce.ts +93 -0
- package/src/node/connectors/snowflake.ts +73 -0
- package/src/node/db/postgres.ts +173 -0
- package/src/node/db/sync.ts +103 -0
- package/src/node/dedupe-file.ts +156 -0
- package/src/node/index.ts +89 -0
- package/src/node/mcp/server.ts +940 -0
- package/src/node/tui/app.ts +756 -0
- package/src/node/tui/index.ts +6 -0
- package/src/node/tui/widgets.ts +128 -0
- package/tests/parity/scorer-ground-truth.test.ts +118 -0
- package/tests/smoke.test.ts +46 -0
- package/tests/unit/a2a-server.test.ts +175 -0
- package/tests/unit/ann-blocker.test.ts +117 -0
- package/tests/unit/api-server.test.ts +239 -0
- package/tests/unit/api.test.ts +77 -0
- package/tests/unit/autoconfig.test.ts +103 -0
- package/tests/unit/autofix.test.ts +71 -0
- package/tests/unit/blocker.test.ts +164 -0
- package/tests/unit/buildBlocksAsync.test.ts +63 -0
- package/tests/unit/cluster.test.ts +213 -0
- package/tests/unit/compare-clusters.test.ts +42 -0
- package/tests/unit/config-loader.test.ts +301 -0
- package/tests/unit/connectors-base.test.ts +48 -0
- package/tests/unit/cross-encoder-model.test.ts +198 -0
- package/tests/unit/cross-encoder.test.ts +173 -0
- package/tests/unit/db-connectors.test.ts +37 -0
- package/tests/unit/domain.test.ts +80 -0
- package/tests/unit/embedder.test.ts +151 -0
- package/tests/unit/evaluate.test.ts +85 -0
- package/tests/unit/explain.test.ts +73 -0
- package/tests/unit/golden.test.ts +97 -0
- package/tests/unit/graph-er.test.ts +173 -0
- package/tests/unit/hnsw-ann.test.ts +283 -0
- package/tests/unit/hubspot-connector.test.ts +118 -0
- package/tests/unit/ingest.test.ts +97 -0
- package/tests/unit/learned-blocking.test.ts +134 -0
- package/tests/unit/lineage.test.ts +135 -0
- package/tests/unit/match-one.test.ts +129 -0
- package/tests/unit/matchkey.test.ts +97 -0
- package/tests/unit/mcp-server.test.ts +183 -0
- package/tests/unit/memory.test.ts +119 -0
- package/tests/unit/pipeline.test.ts +118 -0
- package/tests/unit/pprl-protocol.test.ts +381 -0
- package/tests/unit/probabilistic.test.ts +494 -0
- package/tests/unit/profiler.test.ts +68 -0
- package/tests/unit/review-queue.test.ts +68 -0
- package/tests/unit/salesforce-connector.test.ts +148 -0
- package/tests/unit/scorer.test.ts +301 -0
- package/tests/unit/sensitivity.test.ts +154 -0
- package/tests/unit/standardize.test.ts +84 -0
- package/tests/unit/streaming.test.ts +82 -0
- package/tests/unit/transforms.test.ts +208 -0
- package/tests/unit/tui-widgets.test.ts +42 -0
- package/tests/unit/tui.test.ts +24 -0
- package/tests/unit/validate.test.ts +145 -0
- package/tests/unit/workers-parallel.test.ts +99 -0
- package/tests/unit/workers.test.ts +74 -0
- package/tsconfig.json +25 -0
- package/tsup.config.ts +37 -0
- package/vitest.config.ts +11 -0
|
@@ -0,0 +1,699 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* cluster.ts — Union-Find clustering with MST splitting.
|
|
3
|
+
* Edge-safe: no Node.js imports, pure TypeScript only.
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
import type { ClusterInfo, PairKey } from "./types.js";
|
|
7
|
+
|
|
8
|
+
// ---------------------------------------------------------------------------
|
|
9
|
+
// Helpers
|
|
10
|
+
// ---------------------------------------------------------------------------
|
|
11
|
+
|
|
12
|
+
/**
 * Build the canonical key for an unordered pair of record ids.
 *
 * The smaller id is always written first, so `(a, b)` and `(b, a)` map to the
 * same `"lo:hi"` string. This is the only place a branded PairKey is minted.
 */
export function pairKey(a: number, b: number): PairKey {
  if (a < b) {
    return `${a}:${b}` as PairKey;
  }
  return `${b}:${a}` as PairKey;
}
|
|
18
|
+
|
|
19
|
+
/**
 * Split a pair key back into its two numeric ids.
 *
 * The key is cut at the first ":"; the text on each side is converted with
 * `Number`. Returns the ids in the order they appear in the key.
 */
export function parsePairKey(key: PairKey): readonly [number, number] {
  const sep = key.indexOf(":");
  const left = key.slice(0, sep);
  const right = key.slice(sep + 1);
  return [Number(left), Number(right)];
}
|
|
24
|
+
|
|
25
|
+
// ---------------------------------------------------------------------------
|
|
26
|
+
// UnionFind
|
|
27
|
+
// ---------------------------------------------------------------------------
|
|
28
|
+
|
|
29
|
+
/**
 * Disjoint-set (union-find) over numeric ids, backed by Maps so ids may be
 * sparse. Uses union by rank and iterative path compression.
 */
export class UnionFind {
  /** id -> parent id; a root is its own parent. */
  private parent = new Map<number, number>();
  /** root id -> rank (upper bound on tree height), used to pick the merge direction. */
  private rank = new Map<number, number>();

  /** Register `x` as its own single-element set. No-op if already registered. */
  add(x: number): void {
    if (!this.parent.has(x)) {
      this.parent.set(x, x);
      this.rank.set(x, 0);
    }
  }

  /** Register every id in `ids`; ids already present are left untouched. */
  addMany(ids: readonly number[]): void {
    for (const x of ids) {
      this.add(x);
    }
  }

  /**
   * Return the representative (root) of the set containing `x`, compressing
   * the path from `x` to the root along the way.
   *
   * `x` should have been registered via add/addMany/union; for an id that was
   * never registered there is no parent entry and the result is not a valid root.
   */
  find(x: number): number {
    // First pass: walk up to the root.
    let root = x;
    while (this.parent.get(root) !== root) {
      root = this.parent.get(root)!;
    }
    // Second pass: point every node on the path directly at the root.
    let current = x;
    while (this.parent.get(current) !== root) {
      const next = this.parent.get(current)!;
      this.parent.set(current, root);
      current = next;
    }
    return root;
  }

  /**
   * Merge the sets containing `a` and `b` (union by rank).
   *
   * Ids that were never registered are registered here first. Without this,
   * a lookup miss would write `undefined` into the parent map (as a key or as
   * a root's parent) and silently corrupt later find/getClusters results.
   */
  union(a: number, b: number): void {
    this.add(a);
    this.add(b);

    let ra = this.find(a);
    let rb = this.find(b);
    if (ra === rb) return;

    const rankA = this.rank.get(ra) ?? 0;
    const rankB = this.rank.get(rb) ?? 0;
    if (rankA < rankB) {
      // Keep `ra` as the higher-rank root so the shallower tree hangs below it.
      [ra, rb] = [rb, ra];
    }
    this.parent.set(rb, ra);
    if (rankA === rankB) {
      // Equal ranks: the merged tree is one level taller.
      this.rank.set(ra, rankA + 1);
    }
  }

  /**
   * Return every set as a Set of its member ids. Sets appear in the order in
   * which their first member was registered.
   */
  getClusters(): Set<number>[] {
    const groups = new Map<number, Set<number>>();
    for (const x of this.parent.keys()) {
      const root = this.find(x);
      let group = groups.get(root);
      if (!group) {
        group = new Set<number>();
        groups.set(root, group);
      }
      group.add(x);
    }
    return Array.from(groups.values());
  }
}
|
|
98
|
+
|
|
99
|
+
// ---------------------------------------------------------------------------
|
|
100
|
+
// MST (max-weight spanning tree via Kruskal)
|
|
101
|
+
// ---------------------------------------------------------------------------
|
|
102
|
+
|
|
103
|
+
/**
 * Build a max-weight spanning tree (or forest, if the graph is disconnected)
 * over `members` using Kruskal's algorithm.
 *
 * @param members    ids of the nodes to span
 * @param pairScores edge weights keyed by canonical pair key
 * @returns the chosen edges as [idA, idB, score], in descending score order
 *
 * Edges with an endpoint outside `members` are ignored: they cannot be part
 * of a spanning tree of this node set, and feeding an unregistered id to
 * UnionFind.union would corrupt the forest.
 */
export function buildMst(
  members: readonly number[],
  pairScores: ReadonlyMap<PairKey, number>,
): [number, number, number][] {
  const memberSet = new Set(members);

  // Collect in-cluster edges and sort by score, strongest first.
  const edges: [number, number, number][] = [];
  for (const [key, score] of pairScores) {
    const [a, b] = parsePairKey(key);
    if (memberSet.has(a) && memberSet.has(b)) {
      edges.push([a, b, score]);
    }
  }
  edges.sort((x, y) => y[2] - x[2]);

  const uf = new UnionFind();
  uf.addMany(members);

  const mst: [number, number, number][] = [];
  // A spanning tree over n distinct nodes has exactly n - 1 edges.
  const target = memberSet.size - 1;
  for (const [a, b, s] of edges) {
    if (uf.find(a) !== uf.find(b)) {
      uf.union(a, b);
      mst.push([a, b, s]);
      if (mst.length === target) break;
    }
  }
  return mst;
}
|
|
133
|
+
|
|
134
|
+
// ---------------------------------------------------------------------------
|
|
135
|
+
// Cluster confidence
|
|
136
|
+
// ---------------------------------------------------------------------------
|
|
137
|
+
|
|
138
|
+
/** Confidence metrics for a single cluster, as returned by computeClusterConfidence. */
export interface ClusterConfidence {
  /** Lowest pair score in the cluster; null when size <= 1 or there are no scored pairs. */
  readonly minEdge: number | null;
  /** Mean of all pair scores; null under the same conditions as minEdge. */
  readonly avgEdge: number | null;
  /**
   * Number of scored pairs divided by size * (size - 1) / 2.
   * 1.0 for clusters of size <= 1; 0.0 for larger clusters with no scored pairs.
   */
  readonly connectivity: number;
  /** Ids of the lowest-scoring pair, or null when no such pair was found. */
  readonly bottleneckPair: readonly [number, number] | null;
  /** 0.4 * minEdge + 0.3 * avgEdge + 0.3 * connectivity (1.0 / 0.0 in the trivial cases above). */
  readonly confidence: number;
}
|
|
145
|
+
|
|
146
|
+
/**
 * Derive confidence metrics for one cluster from its pair scores.
 *
 *   confidence = 0.4 * minEdge + 0.3 * avgEdge + 0.3 * connectivity
 *
 * @param pairScores scored pairs belonging to the cluster
 * @param size       number of members in the cluster
 * @returns the metrics; for size <= 1 everything trivially scores 1.0, and
 *          for a larger cluster with no scored pairs everything scores 0.0
 *          (minEdge / avgEdge / bottleneckPair are null in both cases)
 */
export function computeClusterConfidence(
  pairScores: ReadonlyMap<PairKey, number>,
  size: number,
): ClusterConfidence {
  const edgeCount = pairScores.size;
  const isSingleton = size <= 1;

  if (isSingleton || edgeCount === 0) {
    const trivial = isSingleton ? 1.0 : 0.0;
    return {
      minEdge: null,
      avgEdge: null,
      connectivity: trivial,
      bottleneckPair: null,
      confidence: trivial,
    };
  }

  // One pass: running total plus the first strictly-lowest edge.
  let lowest = Infinity;
  let total = 0;
  let weakestKey: PairKey | null = null;
  for (const [key, score] of pairScores.entries()) {
    total += score;
    if (score < lowest) {
      lowest = score;
      weakestKey = key;
    }
  }

  const avgEdge = total / edgeCount;

  // Fraction of all possible member pairs that actually have a score.
  const possibleEdges = (size * (size - 1)) / 2;
  let connectivity = 0.0;
  if (possibleEdges > 0) {
    connectivity = edgeCount / possibleEdges;
  }

  let bottleneckPair: readonly [number, number] | null = null;
  if (weakestKey) {
    bottleneckPair = parsePairKey(weakestKey);
  }

  const confidence = 0.4 * lowest + 0.3 * avgEdge + 0.3 * connectivity;

  return { minEdge: lowest, avgEdge, connectivity, bottleneckPair, confidence };
}
|
|
189
|
+
|
|
190
|
+
// ---------------------------------------------------------------------------
|
|
191
|
+
// Split oversized cluster
|
|
192
|
+
// ---------------------------------------------------------------------------
|
|
193
|
+
|
|
194
|
+
/**
 * Internal mutable cluster info used during building.
 * buildClusters copies these fields into ClusterInfo objects before returning.
 */
interface MutableClusterInfo {
  /** Member record ids, sorted ascending by every producer in this file. */
  members: number[];
  /** Number of members (members.length). */
  size: number;
  /** True when size exceeds the caller's maxClusterSize. */
  oversized: boolean;
  /** Scores of the pairs inside this cluster, keyed by canonical pair key. */
  pairScores: Map<PairKey, number>;
  /** Cluster confidence; see computeClusterConfidence. */
  confidence: number;
  /** Ids of the lowest-scoring pair, or null if there is none. */
  bottleneckPair: readonly [number, number] | null;
  /** Quality label assigned at the end of buildClusters. */
  clusterQuality: "strong" | "weak" | "split";
  /** Transient marker set by buildClusters on sub-clusters produced by auto-splitting. */
  _wasSplit?: boolean;
}
|
|
205
|
+
|
|
206
|
+
/**
 * Split a cluster in two (or more, if its graph is already disconnected) by
 * dropping the weakest edge of its max-weight spanning tree.
 *
 * @param members    ids in the cluster
 * @param pairScores scored pairs of the cluster
 * @returns one info object per resulting sub-cluster. When there is nothing
 *          to cut (at most one member, no pairs, or an empty spanning tree)
 *          the cluster is handed back as a single "strong" entry with
 *          confidence 1.0.
 */
export function splitOversizedCluster(
  members: readonly number[],
  pairScores: ReadonlyMap<PairKey, number>,
): MutableClusterInfo[] {
  // Fallback result: the input returned as one untouched cluster.
  const unsplit = (): MutableClusterInfo[] => [
    {
      members: [...members].sort((a, b) => a - b),
      size: members.length,
      oversized: false,
      pairScores: new Map(pairScores),
      confidence: 1.0,
      bottleneckPair: null,
      clusterQuality: "strong",
    },
  ];

  if (members.length <= 1 || pairScores.size === 0) {
    return unsplit();
  }

  const tree = buildMst(members, pairScores);
  if (tree.length === 0) {
    return unsplit();
  }

  // Index of the first strictly-lowest tree edge: that is the one to cut.
  let cutAt = 0;
  tree.forEach(([, , weight], i) => {
    if (weight < tree[cutAt]![2]) {
      cutAt = i;
    }
  });

  // Reconnect the members using every tree edge except the cut one.
  const uf = new UnionFind();
  uf.addMany(members);
  tree.forEach(([a, b], i) => {
    if (i !== cutAt) {
      uf.union(a, b);
    }
  });

  return uf.getClusters().map((group): MutableClusterInfo => {
    // Keep only the pairs whose endpoints both landed in this sub-cluster.
    const kept = new Map<PairKey, number>();
    for (const [key, score] of pairScores) {
      const [a, b] = parsePairKey(key);
      if (group.has(a) && group.has(b)) {
        kept.set(key, score);
      }
    }

    const ordered = Array.from(group).sort((a, b) => a - b);
    const { confidence, bottleneckPair } = computeClusterConfidence(
      kept,
      ordered.length,
    );

    return {
      members: ordered,
      size: ordered.length,
      oversized: false,
      pairScores: kept,
      confidence,
      bottleneckPair,
      clusterQuality: "strong",
    };
  });
}
|
|
285
|
+
|
|
286
|
+
// ---------------------------------------------------------------------------
|
|
287
|
+
// buildClusters options
|
|
288
|
+
// ---------------------------------------------------------------------------
|
|
289
|
+
|
|
290
|
+
/** Optional tuning knobs for buildClusters. */
export interface BuildClustersOptions {
  /** Clusters with more members than this are flagged oversized. buildClusters defaults to 100. */
  readonly maxClusterSize?: number;
  /**
   * A cluster is labelled "weak" when (average pair score - minimum pair score)
   * exceeds this value. buildClusters defaults to 0.3.
   */
  readonly weakClusterThreshold?: number;
  /** Whether oversized clusters are split automatically. buildClusters defaults to true. */
  readonly autoSplit?: boolean;
}
|
|
295
|
+
|
|
296
|
+
// ---------------------------------------------------------------------------
|
|
297
|
+
// buildClusters
|
|
298
|
+
// ---------------------------------------------------------------------------
|
|
299
|
+
|
|
300
|
+
/**
 * Build clusters from scored pairs using Union-Find.
 *
 * Steps:
 *  1. Union every scored pair; ids in `allIds` without any pair become
 *     singletons. Ids that appear only in `pairs` are registered as well.
 *  2. Number clusters 1..n in order of their smallest member id, so ids are
 *     deterministic.
 *  3. Attach each pair score to its cluster and compute its confidence.
 *  4. If `autoSplit` is on, repeatedly split clusters larger than
 *     `maxClusterSize` by cutting the weakest MST edge (iterative work list,
 *     not recursive). Sub-clusters get ids starting at (largest remaining id
 *     + 1), so the id of the cluster being replaced may be reused.
 *  5. Label quality: "split" for products of auto-splitting; "weak" when
 *     (avg score - min score) > weakClusterThreshold, in which case the
 *     confidence is multiplied by 0.7; otherwise "strong".
 *
 * @param pairs   scored pairs as [idA, idB, score]
 * @param allIds  record ids to cluster
 * @param options see BuildClustersOptions (defaults: 100, 0.3, true)
 * @returns map from cluster id to its ClusterInfo
 */
export function buildClusters(
  pairs: readonly (readonly [number, number, number])[],
  allIds: readonly number[],
  options?: BuildClustersOptions,
): Map<number, ClusterInfo> {
  const maxClusterSize = options?.maxClusterSize ?? 100;
  const weakClusterThreshold = options?.weakClusterThreshold ?? 0.3;
  const autoSplit = options?.autoSplit ?? true;

  // Build Union-Find from pairs.
  const uf = new UnionFind();
  uf.addMany(allIds);
  for (const [idA, idB] of pairs) {
    // A pair may mention an id missing from allIds. Register it so the union
    // cannot corrupt the forest and the score can be attached below.
    uf.add(idA);
    uf.add(idB);
    uf.union(idA, idB);
  }

  // Sort each cluster's members once; element 0 is then the cluster minimum,
  // which orders the clusters without rescanning sets inside the comparator.
  const memberLists = uf
    .getClusters()
    .map((group) => [...group].sort((a, b) => a - b));
  memberLists.sort((a, b) => a[0]! - b[0]!);

  // Assign ids 1..n and index members by cluster id.
  const memberToCid = new Map<number, number>();
  const result = new Map<number, MutableClusterInfo>();
  memberLists.forEach((memberArr, i) => {
    const cid = i + 1;
    for (const m of memberArr) {
      memberToCid.set(m, cid);
    }
    result.set(cid, {
      members: memberArr,
      size: memberArr.length,
      oversized: memberArr.length > maxClusterSize,
      pairScores: new Map(),
      confidence: 0,
      bottleneckPair: null,
      clusterQuality: "strong",
    });
  });

  // Assign pair scores to clusters (canonicalized keys). Both endpoints of a
  // pair were unioned above, so looking up idA is enough.
  for (const [idA, idB, score] of pairs) {
    const cid = memberToCid.get(idA);
    const info = cid === undefined ? undefined : result.get(cid);
    if (info) {
      info.pairScores.set(pairKey(idA, idB), score);
    }
  }

  // Compute initial confidence.
  for (const cinfo of result.values()) {
    const conf = computeClusterConfidence(cinfo.pairScores, cinfo.size);
    cinfo.confidence = conf.confidence;
    cinfo.bottleneckPair = conf.bottleneckPair;
  }

  // Auto-split oversized clusters (iterative).
  if (autoSplit) {
    const toSplit: number[] = [];
    for (const [cid, c] of result) {
      if (c.oversized) toSplit.push(cid);
    }

    while (toSplit.length > 0) {
      const cid = toSplit.pop()!;
      const cinfo = result.get(cid)!;

      const parts =
        cinfo.size > 1
          ? splitOversizedCluster(cinfo.members, cinfo.pairScores)
          : [];
      // Nothing could be cut (e.g. a singleton when maxClusterSize < 1).
      // Keep the cluster as it is, still flagged oversized, and do not
      // re-queue it; re-queueing would loop forever.
      if (parts.length <= 1) continue;

      result.delete(cid);
      let nextCid = _nextCid(result);
      for (const part of parts) {
        part.oversized = part.size > maxClusterSize;
        part._wasSplit = true;
        result.set(nextCid, part);
        if (part.oversized) {
          toSplit.push(nextCid);
        }
        nextCid++;
      }
    }
  }

  // Assign cluster_quality and apply the confidence downgrade.
  for (const cinfo of result.values()) {
    if (cinfo._wasSplit) {
      cinfo.clusterQuality = "split";
      continue;
    }
    cinfo.clusterQuality = "strong";
    if (cinfo.size > 1 && cinfo.pairScores.size > 0) {
      let minE = Infinity;
      let sumE = 0;
      for (const s of cinfo.pairScores.values()) {
        if (s < minE) minE = s;
        sumE += s;
      }
      const avgE = sumE / cinfo.pairScores.size;
      // A large gap between the average and the weakest link means the
      // cluster hangs together through at least one doubtful pair.
      if (avgE - minE > weakClusterThreshold) {
        cinfo.clusterQuality = "weak";
        cinfo.confidence *= 0.7;
      }
    }
  }

  // Freeze into ClusterInfo (drops the internal _wasSplit marker).
  const frozen = new Map<number, ClusterInfo>();
  for (const [cid, c] of result) {
    frozen.set(cid, {
      members: c.members,
      size: c.size,
      oversized: c.oversized,
      pairScores: c.pairScores,
      confidence: c.confidence,
      bottleneckPair: c.bottleneckPair,
      clusterQuality: c.clusterQuality,
    });
  }
  return frozen;
}
|
|
447
|
+
|
|
448
|
+
// ---------------------------------------------------------------------------
|
|
449
|
+
// addToCluster
|
|
450
|
+
// ---------------------------------------------------------------------------
|
|
451
|
+
|
|
452
|
+
/**
 * Add a new record to existing clusters based on its matches.
 *
 * - No matches (or none that belong to a known cluster): new singleton cluster
 * - Matches in a single cluster: the record joins that cluster (id and
 *   clusterQuality of the cluster are kept)
 * - Matches in several clusters: those clusters are removed and replaced by
 *   one merged cluster under a fresh id, labelled "strong"
 *
 * Only matches whose id belongs to an existing cluster contribute a pair
 * score; scores against unknown ids are ignored in every branch.
 *
 * Flags oversized but does NOT auto-split. Caller should call
 * splitOversizedCluster() if desired.
 *
 * @param recordId       id of the record being added
 * @param matches        [matchedId, score] tuples for the new record
 * @param clusters       cluster map; it is modified in place
 * @param maxClusterSize size above which the resulting cluster is flagged oversized
 * @returns the same `clusters` map that was passed in
 */
export function addToCluster(
  recordId: number,
  matches: readonly (readonly [number, number])[],
  clusters: Map<number, ClusterInfo>,
  maxClusterSize = 100,
): Map<number, ClusterInfo> {
  // Store the record as its own cluster under the next free id.
  const addSingleton = (): Map<number, ClusterInfo> => {
    clusters.set(_nextCid(clusters), {
      members: [recordId],
      size: 1,
      oversized: false,
      pairScores: new Map(),
      confidence: 1.0,
      bottleneckPair: null,
      clusterQuality: "strong",
    });
    return clusters;
  };

  if (matches.length === 0) {
    return addSingleton();
  }

  // Map members to cluster IDs.
  const memberToCid = new Map<number, number>();
  for (const [cid, cinfo] of clusters) {
    for (const m of cinfo.members) {
      memberToCid.set(m, cid);
    }
  }

  // Matches that point at a record we actually know, and the clusters they hit.
  const knownMatches = matches.filter(([matchedId]) =>
    memberToCid.has(matchedId),
  );
  const matchedCids = new Set<number>();
  for (const [matchedId] of knownMatches) {
    matchedCids.add(memberToCid.get(matchedId)!);
  }

  if (matchedCids.size === 0) {
    return addSingleton();
  }

  if (matchedCids.size === 1) {
    const cid = matchedCids.values().next().value!;
    const old = clusters.get(cid)!;

    const newPairs = new Map(old.pairScores);
    for (const [matchedId, score] of knownMatches) {
      newPairs.set(pairKey(recordId, matchedId), score);
    }

    const newMembers = [...old.members, recordId].sort((a, b) => a - b);
    const newSize = newMembers.length;
    const conf = computeClusterConfidence(newPairs, newSize);

    clusters.set(cid, {
      members: newMembers,
      size: newSize,
      oversized: newSize > maxClusterSize,
      pairScores: newPairs,
      confidence: conf.confidence,
      bottleneckPair: conf.bottleneckPair,
      clusterQuality: old.clusterQuality,
    });
    return clusters;
  }

  // Multiple clusters: merge all.
  const mergedMembers: number[] = [recordId];
  const mergedPairs = new Map<PairKey, number>();

  for (const cid of matchedCids) {
    const cinfo = clusters.get(cid)!;
    // Push one by one: spreading a very large member list into push() can
    // exceed the engine's argument limit and throw a RangeError.
    for (const m of cinfo.members) {
      mergedMembers.push(m);
    }
    for (const [k, v] of cinfo.pairScores) {
      mergedPairs.set(k, v);
    }
    clusters.delete(cid);
  }

  // Every known match now lies inside the merged cluster.
  for (const [matchedId, score] of knownMatches) {
    mergedPairs.set(pairKey(recordId, matchedId), score);
  }

  mergedMembers.sort((a, b) => a - b);
  const size = mergedMembers.length;
  const conf = computeClusterConfidence(mergedPairs, size);

  clusters.set(_nextCid(clusters), {
    members: mergedMembers,
    size,
    oversized: size > maxClusterSize,
    pairScores: mergedPairs,
    confidence: conf.confidence,
    bottleneckPair: conf.bottleneckPair,
    clusterQuality: "strong",
  });

  return clusters;
}
|
|
565
|
+
|
|
566
|
+
// ---------------------------------------------------------------------------
|
|
567
|
+
// unmergeRecord
|
|
568
|
+
// ---------------------------------------------------------------------------
|
|
569
|
+
|
|
570
|
+
/**
 * Pull one record out of its cluster and re-cluster whatever is left.
 *
 * The removed record is stored as a singleton. The remaining members are
 * re-clustered with buildClusters using only the pairs that do not involve
 * the removed record and whose score is at least `threshold`, so the rest
 * may fall apart into several clusters.
 *
 * @param recordId  id of the record to detach
 * @param clusters  cluster map; it is modified in place
 * @param threshold minimum score for a remaining pair to be kept
 * @returns the same `clusters` map (unchanged if the record is not found or
 *          is already alone in its cluster)
 */
export function unmergeRecord(
  recordId: number,
  clusters: Map<number, ClusterInfo>,
  threshold = 0.0,
): Map<number, ClusterInfo> {
  // Locate the first cluster that lists this record.
  let owner: [number, ClusterInfo] | undefined;
  for (const entry of clusters.entries()) {
    if (entry[1].members.includes(recordId)) {
      owner = entry;
      break;
    }
  }
  if (owner === undefined) {
    return clusters; // Not found
  }

  const [sourceCid, source] = owner;
  if (source.size <= 1) {
    return clusters; // Already singleton
  }

  // Everything that stays behind: the other members and the pairs among them
  // that clear the threshold.
  const survivors = source.members.filter((m) => m !== recordId);
  const keptPairs: [number, number, number][] = [];
  source.pairScores.forEach((score, key) => {
    const [a, b] = parsePairKey(key);
    const touchesRemoved = a === recordId || b === recordId;
    if (!touchesRemoved && score >= threshold) {
      keptPairs.push([a, b, score]);
    }
  });

  const regrouped = buildClusters(keptPairs, survivors);

  // Swap the old cluster for the singleton plus the regrouped remainder,
  // handing out consecutive ids starting at the next free one.
  clusters.delete(sourceCid);
  let cid = _nextCid(clusters);

  clusters.set(cid, {
    members: [recordId],
    size: 1,
    oversized: false,
    pairScores: new Map(),
    confidence: 1.0,
    bottleneckPair: null,
    clusterQuality: "strong",
  });
  cid += 1;

  for (const info of regrouped.values()) {
    clusters.set(cid, info);
    cid += 1;
  }

  return clusters;
}
|
|
631
|
+
|
|
632
|
+
// ---------------------------------------------------------------------------
|
|
633
|
+
// unmergeCluster
|
|
634
|
+
// ---------------------------------------------------------------------------
|
|
635
|
+
|
|
636
|
+
/**
 * Break a cluster apart so that each of its members becomes a singleton.
 *
 * The original entry is removed and one fresh "strong" singleton is added per
 * member, with consecutive ids starting at the next free id. The original
 * pair scores are not carried over.
 *
 * @param clusterId id of the cluster to dissolve
 * @param clusters  cluster map; it is modified in place
 * @returns the same `clusters` map (unchanged if `clusterId` is unknown)
 */
export function unmergeCluster(
  clusterId: number,
  clusters: Map<number, ClusterInfo>,
): Map<number, ClusterInfo> {
  const target = clusters.get(clusterId);
  if (!target) {
    return clusters;
  }

  clusters.delete(clusterId);

  // Ids are handed out as firstCid, firstCid + 1, ... in member order.
  const firstCid = _nextCid(clusters);
  target.members.forEach((memberId, offset) => {
    clusters.set(firstCid + offset, {
      members: [memberId],
      size: 1,
      oversized: false,
      pairScores: new Map(),
      confidence: 1.0,
      bottleneckPair: null,
      clusterQuality: "strong",
    });
  });

  return clusters;
}
|
|
666
|
+
|
|
667
|
+
// ---------------------------------------------------------------------------
|
|
668
|
+
// getClusterPairScores
|
|
669
|
+
// ---------------------------------------------------------------------------
|
|
670
|
+
|
|
671
|
+
/**
 * Collect, from the full pair list, the scores of pairs whose two ids are
 * both in `members`. Intended for on-demand use, not for hot paths.
 *
 * @param members  ids of the cluster of interest
 * @param allPairs every scored pair as [idA, idB, score]
 * @returns scores keyed by canonical pair key; if a pair occurs more than
 *          once, the last occurrence wins
 */
export function getClusterPairScores(
  members: readonly number[],
  allPairs: readonly (readonly [number, number, number])[],
): Map<PairKey, number> {
  const inCluster = new Set<number>(members);
  const scores = new Map<PairKey, number>();

  allPairs.forEach(([idA, idB, score]) => {
    // Skip pairs with at least one endpoint outside the cluster.
    if (!inCluster.has(idA) || !inCluster.has(idB)) {
      return;
    }
    scores.set(pairKey(idA, idB), score);
  });

  return scores;
}
|
|
688
|
+
|
|
689
|
+
// ---------------------------------------------------------------------------
|
|
690
|
+
// Internal helpers
|
|
691
|
+
// ---------------------------------------------------------------------------
|
|
692
|
+
|
|
693
|
+
/**
 * Next free cluster id: one more than the largest key in `clusters`,
 * or 1 when the map is empty (or holds no key above 0).
 */
function _nextCid(clusters: ReadonlyMap<number, unknown>): number {
  let highest = 0;
  clusters.forEach((_value, cid) => {
    if (cid > highest) {
      highest = cid;
    }
  });
  return highest + 1;
}
|