goldenmatch 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +140 -0
- package/dist/cli.cjs +6079 -0
- package/dist/cli.cjs.map +1 -0
- package/dist/cli.d.cts +1 -0
- package/dist/cli.d.ts +1 -0
- package/dist/cli.js +6076 -0
- package/dist/cli.js.map +1 -0
- package/dist/core/index.cjs +8449 -0
- package/dist/core/index.cjs.map +1 -0
- package/dist/core/index.d.cts +1972 -0
- package/dist/core/index.d.ts +1972 -0
- package/dist/core/index.js +8318 -0
- package/dist/core/index.js.map +1 -0
- package/dist/index.cjs +8449 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +2 -0
- package/dist/index.d.ts +2 -0
- package/dist/index.js +8318 -0
- package/dist/index.js.map +1 -0
- package/dist/node/backends/score-worker.cjs +934 -0
- package/dist/node/backends/score-worker.cjs.map +1 -0
- package/dist/node/backends/score-worker.d.cts +14 -0
- package/dist/node/backends/score-worker.d.ts +14 -0
- package/dist/node/backends/score-worker.js +932 -0
- package/dist/node/backends/score-worker.js.map +1 -0
- package/dist/node/index.cjs +11430 -0
- package/dist/node/index.cjs.map +1 -0
- package/dist/node/index.d.cts +554 -0
- package/dist/node/index.d.ts +554 -0
- package/dist/node/index.js +11277 -0
- package/dist/node/index.js.map +1 -0
- package/dist/types-DhUdX5Rc.d.cts +304 -0
- package/dist/types-DhUdX5Rc.d.ts +304 -0
- package/examples/01-basic-dedupe.ts +60 -0
- package/examples/02-match-two-datasets.ts +48 -0
- package/examples/03-csv-file-pipeline.ts +62 -0
- package/examples/04-string-scoring.ts +63 -0
- package/examples/05-custom-config.ts +94 -0
- package/examples/06-probabilistic-fs.ts +72 -0
- package/examples/07-pprl-privacy.ts +76 -0
- package/examples/08-streaming.ts +79 -0
- package/examples/09-llm-scorer.ts +79 -0
- package/examples/10-explain.ts +60 -0
- package/examples/11-evaluate.ts +61 -0
- package/examples/README.md +53 -0
- package/package.json +66 -0
- package/src/cli.ts +372 -0
- package/src/core/ann-blocker.ts +593 -0
- package/src/core/api.ts +220 -0
- package/src/core/autoconfig.ts +363 -0
- package/src/core/autofix.ts +102 -0
- package/src/core/blocker.ts +655 -0
- package/src/core/cluster.ts +699 -0
- package/src/core/compare-clusters.ts +176 -0
- package/src/core/config/loader.ts +869 -0
- package/src/core/cross-encoder.ts +614 -0
- package/src/core/data.ts +430 -0
- package/src/core/domain.ts +277 -0
- package/src/core/embedder.ts +562 -0
- package/src/core/evaluate.ts +156 -0
- package/src/core/explain.ts +352 -0
- package/src/core/golden.ts +524 -0
- package/src/core/graph-er.ts +371 -0
- package/src/core/index.ts +314 -0
- package/src/core/ingest.ts +112 -0
- package/src/core/learned-blocking.ts +305 -0
- package/src/core/lineage.ts +221 -0
- package/src/core/llm/budget.ts +258 -0
- package/src/core/llm/cluster.ts +542 -0
- package/src/core/llm/scorer.ts +396 -0
- package/src/core/match-one.ts +95 -0
- package/src/core/matchkey.ts +97 -0
- package/src/core/memory/corrections.ts +179 -0
- package/src/core/memory/learner.ts +218 -0
- package/src/core/memory/store.ts +114 -0
- package/src/core/pipeline.ts +366 -0
- package/src/core/pprl/protocol.ts +216 -0
- package/src/core/probabilistic.ts +511 -0
- package/src/core/profiler.ts +212 -0
- package/src/core/quality.ts +197 -0
- package/src/core/review-queue.ts +177 -0
- package/src/core/scorer.ts +855 -0
- package/src/core/sensitivity.ts +196 -0
- package/src/core/standardize.ts +279 -0
- package/src/core/streaming.ts +128 -0
- package/src/core/transforms.ts +599 -0
- package/src/core/types.ts +570 -0
- package/src/core/validate.ts +243 -0
- package/src/index.ts +8 -0
- package/src/node/a2a/server.ts +470 -0
- package/src/node/api/server.ts +412 -0
- package/src/node/backends/duckdb.ts +130 -0
- package/src/node/backends/score-worker.ts +41 -0
- package/src/node/backends/workers.ts +212 -0
- package/src/node/config-file.ts +66 -0
- package/src/node/connectors/base.ts +57 -0
- package/src/node/connectors/bigquery.ts +61 -0
- package/src/node/connectors/databricks.ts +69 -0
- package/src/node/connectors/file.ts +350 -0
- package/src/node/connectors/hubspot.ts +62 -0
- package/src/node/connectors/index.ts +43 -0
- package/src/node/connectors/salesforce.ts +93 -0
- package/src/node/connectors/snowflake.ts +73 -0
- package/src/node/db/postgres.ts +173 -0
- package/src/node/db/sync.ts +103 -0
- package/src/node/dedupe-file.ts +156 -0
- package/src/node/index.ts +89 -0
- package/src/node/mcp/server.ts +940 -0
- package/src/node/tui/app.ts +756 -0
- package/src/node/tui/index.ts +6 -0
- package/src/node/tui/widgets.ts +128 -0
- package/tests/parity/scorer-ground-truth.test.ts +118 -0
- package/tests/smoke.test.ts +46 -0
- package/tests/unit/a2a-server.test.ts +175 -0
- package/tests/unit/ann-blocker.test.ts +117 -0
- package/tests/unit/api-server.test.ts +239 -0
- package/tests/unit/api.test.ts +77 -0
- package/tests/unit/autoconfig.test.ts +103 -0
- package/tests/unit/autofix.test.ts +71 -0
- package/tests/unit/blocker.test.ts +164 -0
- package/tests/unit/buildBlocksAsync.test.ts +63 -0
- package/tests/unit/cluster.test.ts +213 -0
- package/tests/unit/compare-clusters.test.ts +42 -0
- package/tests/unit/config-loader.test.ts +301 -0
- package/tests/unit/connectors-base.test.ts +48 -0
- package/tests/unit/cross-encoder-model.test.ts +198 -0
- package/tests/unit/cross-encoder.test.ts +173 -0
- package/tests/unit/db-connectors.test.ts +37 -0
- package/tests/unit/domain.test.ts +80 -0
- package/tests/unit/embedder.test.ts +151 -0
- package/tests/unit/evaluate.test.ts +85 -0
- package/tests/unit/explain.test.ts +73 -0
- package/tests/unit/golden.test.ts +97 -0
- package/tests/unit/graph-er.test.ts +173 -0
- package/tests/unit/hnsw-ann.test.ts +283 -0
- package/tests/unit/hubspot-connector.test.ts +118 -0
- package/tests/unit/ingest.test.ts +97 -0
- package/tests/unit/learned-blocking.test.ts +134 -0
- package/tests/unit/lineage.test.ts +135 -0
- package/tests/unit/match-one.test.ts +129 -0
- package/tests/unit/matchkey.test.ts +97 -0
- package/tests/unit/mcp-server.test.ts +183 -0
- package/tests/unit/memory.test.ts +119 -0
- package/tests/unit/pipeline.test.ts +118 -0
- package/tests/unit/pprl-protocol.test.ts +381 -0
- package/tests/unit/probabilistic.test.ts +494 -0
- package/tests/unit/profiler.test.ts +68 -0
- package/tests/unit/review-queue.test.ts +68 -0
- package/tests/unit/salesforce-connector.test.ts +148 -0
- package/tests/unit/scorer.test.ts +301 -0
- package/tests/unit/sensitivity.test.ts +154 -0
- package/tests/unit/standardize.test.ts +84 -0
- package/tests/unit/streaming.test.ts +82 -0
- package/tests/unit/transforms.test.ts +208 -0
- package/tests/unit/tui-widgets.test.ts +42 -0
- package/tests/unit/tui.test.ts +24 -0
- package/tests/unit/validate.test.ts +145 -0
- package/tests/unit/workers-parallel.test.ts +99 -0
- package/tests/unit/workers.test.ts +74 -0
- package/tsconfig.json +25 -0
- package/tsup.config.ts +37 -0
- package/vitest.config.ts +11 -0
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
import { describe, it, expect } from "vitest";
|
|
2
|
+
import {
|
|
3
|
+
buildBlocks,
|
|
4
|
+
buildBlocksAsync,
|
|
5
|
+
makeBlockingConfig,
|
|
6
|
+
} from "../../src/core/index.js";
|
|
7
|
+
import type { Row, BlockingConfig } from "../../src/core/index.js";
|
|
8
|
+
|
|
9
|
+
/**
 * buildBlocksAsync: the static strategy must agree with the synchronous
 * buildBlocks, and the ANN strategy must reject on missing prerequisites.
 */
describe("buildBlocksAsync", () => {
  // Two-row fixture used by both ANN failure cases.
  const nameRows = (): Row[] => [
    { __row_id__: 0, name: "Alice" },
    { __row_id__: 1, name: "Alyce" },
  ];

  it("strategy='static' delegates to buildBlocks (same result)", async () => {
    const rows: Row[] = [
      { __row_id__: 0, zip: "12345" },
      { __row_id__: 1, zip: "12345" },
      { __row_id__: 2, zip: "67890" },
      { __row_id__: 3, zip: "67890" },
    ];
    const config: BlockingConfig = makeBlockingConfig({
      strategy: "static",
      keys: [{ fields: ["zip"], transforms: [] }],
    });

    const expected = buildBlocks(rows, config);
    const actual = await buildBlocksAsync(rows, config);

    // Same number of blocks, then block-by-block key and row-count agreement.
    expect(actual.length).toBe(expected.length);
    expected.forEach((block, idx) => {
      expect(actual[idx]!.blockKey).toBe(block.blockKey);
      expect(actual[idx]!.rows.length).toBe(block.rows.length);
    });
  });

  it("strategy='ann' without apiKey/env throws clear error", async () => {
    const rows = nameRows();
    const config: BlockingConfig = makeBlockingConfig({
      strategy: "ann",
      keys: [],
      annColumn: "name",
    });

    // Temporarily remove any real OPENAI_API_KEY so the outcome does not
    // depend on the machine running the test; it is restored in `finally`.
    type EnvHolder = { process?: { env?: Record<string, string | undefined> } };
    const env = (globalThis as EnvHolder).process?.env;
    const previousKey = env?.OPENAI_API_KEY;
    if (env) delete env.OPENAI_API_KEY;

    try {
      await expect(buildBlocksAsync(rows, config)).rejects.toThrow(/OpenAI|API key|apiKey/);
    } finally {
      if (env && previousKey !== undefined) env.OPENAI_API_KEY = previousKey;
    }
  });

  it("strategy='ann' missing annColumn throws", async () => {
    const rows = nameRows();
    // annColumn is deliberately left out of the config.
    const config: BlockingConfig = makeBlockingConfig({
      strategy: "ann",
      keys: [],
    });
    await expect(buildBlocksAsync(rows, config)).rejects.toThrow(/annColumn/);
  });
});
|
|
@@ -0,0 +1,213 @@
|
|
|
1
|
+
import { describe, it, expect } from "vitest";
|
|
2
|
+
import {
|
|
3
|
+
UnionFind,
|
|
4
|
+
buildClusters,
|
|
5
|
+
computeClusterConfidence,
|
|
6
|
+
unmergeRecord,
|
|
7
|
+
unmergeCluster,
|
|
8
|
+
addToCluster,
|
|
9
|
+
pairKey,
|
|
10
|
+
} from "../../src/core/index.js";
|
|
11
|
+
import type { ClusterInfo } from "../../src/core/index.js";
|
|
12
|
+
|
|
13
|
+
/**
 * UnionFind basics: self-lookup after add, pairwise union, cluster listing,
 * and transitivity of union.
 */
describe("UnionFind", () => {
  it("add + find returns self", () => {
    const forest = new UnionFind();
    forest.add(1);
    expect(forest.find(1)).toBe(1);
  });

  it("union joins two elements", () => {
    const forest = new UnionFind();
    forest.add(1);
    forest.add(2);
    forest.union(1, 2);
    expect(forest.find(1)).toBe(forest.find(2));
  });

  it("getClusters returns grouping", () => {
    const forest = new UnionFind();
    forest.addMany([1, 2, 3, 4]);
    forest.union(1, 2);
    forest.union(3, 4);

    const groups = forest.getClusters();
    expect(groups.length).toBe(2);

    // Numeric comparator: the default Array.prototype.sort compares elements
    // as strings, which would misorder sizes once any of them reaches 10.
    const sizes = groups.map((g) => g.size).sort((a, b) => a - b);
    expect(sizes).toEqual([2, 2]);
  });

  it("transitive closure", () => {
    const forest = new UnionFind();
    forest.addMany([1, 2, 3]);
    forest.union(1, 2);
    forest.union(2, 3);
    // 1~2 and 2~3 must imply 1~3.
    expect(forest.find(1)).toBe(forest.find(3));
  });
});
|
|
47
|
+
|
|
48
|
+
/**
 * buildClusters: grouping from scored pairs, singleton handling, the
 * "weak" quality flag, and the maxClusterSize / autoSplit options.
 */
describe("buildClusters", () => {
  // A scored pair as passed to buildClusters in these tests: [idA, idB, score].
  type ScoredPair = [number, number, number];
  // Numeric comparator: the default sort compares as strings ("10" < "2").
  const ascending = (a: number, b: number): number => a - b;

  it("simple pairs produce expected clusters", () => {
    const pairs: ScoredPair[] = [
      [1, 2, 0.95],
      [3, 4, 0.9],
    ];
    const clusters = buildClusters(pairs, [1, 2, 3, 4, 5]);

    // Expected grouping: {1,2}, {3,4}, {5}
    expect(clusters.size).toBe(3);
    const sizes = [...clusters.values()].map((c) => c.size).sort(ascending);
    expect(sizes).toEqual([1, 2, 2]);
  });

  it("cluster with only singletons", () => {
    const clusters = buildClusters([], [1, 2, 3]);
    expect(clusters.size).toBe(3);
    for (const info of clusters.values()) {
      expect(info.size).toBe(1);
    }
  });

  it("weak cluster detection downgrades confidence", () => {
    // Two strong edges plus one weak edge: (1,2)=0.95, (2,3)=0.95, (1,3)=0.3.
    // avg - min = (0.95+0.95+0.3)/3 - 0.3 ~= 0.433, which exceeds the 0.3
    // threshold the original test comment refers to.
    const pairs: ScoredPair[] = [
      [1, 2, 0.95],
      [2, 3, 0.95],
      [1, 3, 0.3],
    ];
    const clusters = buildClusters(pairs, [1, 2, 3]);
    const [first] = [...clusters.values()];
    expect(first!.clusterQuality).toBe("weak");
  });

  it("oversized cluster auto-splits", () => {
    // Three connected nodes with maxClusterSize=2; (2,3) is the weakest edge.
    const pairs: ScoredPair[] = [
      [1, 2, 0.9],
      [2, 3, 0.5],
      [1, 3, 0.9],
    ];
    const clusters = buildClusters(pairs, [1, 2, 3], { maxClusterSize: 2 });
    // The result must contain at least two clusters.
    expect(clusters.size).toBeGreaterThan(1);
  });

  it("auto-split disabled leaves oversized", () => {
    const pairs: ScoredPair[] = [
      [1, 2, 0.9],
      [2, 3, 0.9],
      [1, 3, 0.9],
    ];
    const clusters = buildClusters(pairs, [1, 2, 3], {
      maxClusterSize: 2,
      autoSplit: false,
    });
    expect(clusters.size).toBe(1);
    const [only] = [...clusters.values()];
    expect(only!.oversized).toBe(true);
  });
});
|
|
109
|
+
|
|
110
|
+
/**
 * computeClusterConfidence: singleton result, the weighted confidence
 * formula, and identification of the bottleneck (weakest) pair.
 */
describe("computeClusterConfidence", () => {
  it("singleton confidence 1.0", () => {
    const { confidence, minEdge } = computeClusterConfidence(new Map(), 1);
    expect(confidence).toBe(1.0);
    expect(minEdge).toBe(null);
  });

  it("confidence formula: 0.4*min + 0.3*avg + 0.3*connectivity", () => {
    // A single edge between two members; the test expects connectivity 1.0.
    const result = computeClusterConfidence(new Map([[pairKey(1, 2), 0.8]]), 2);

    expect(result.minEdge).toBe(0.8);
    expect(result.avgEdge).toBe(0.8);
    expect(result.connectivity).toBe(1.0);
    // 0.4*0.8 + 0.3*0.8 + 0.3*1.0 = 0.32 + 0.24 + 0.30 = 0.86
    expect(result.confidence).toBeCloseTo(0.86, 5);
  });

  it("bottleneck pair is weakest edge", () => {
    const edges = new Map([
      [pairKey(1, 2), 0.9],
      [pairKey(2, 3), 0.5],
    ]);
    const { bottleneckPair } = computeClusterConfidence(edges, 3);
    // (2,3) carries the lowest score, 0.5.
    expect(bottleneckPair).toEqual([2, 3]);
  });
});
|
|
137
|
+
|
|
138
|
+
/**
 * unmergeRecord: pulling one record out of a cluster must leave it as a
 * singleton while every original member stays present in the result.
 */
describe("unmergeRecord", () => {
  it("removes a record and makes it singleton", () => {
    const pairs: [number, number, number][] = [
      [1, 2, 0.95],
      [2, 3, 0.95],
      [1, 3, 0.95],
    ];
    const clusters = buildClusters(pairs, [1, 2, 3]);
    // All three records start in a single cluster.
    expect(clusters.size).toBe(1);

    const updated = unmergeRecord(1, clusters);
    const resulting = [...updated.values()];

    // No member may be lost; 2 and 3 may or may not still share a cluster.
    const allMembers: number[] = [];
    for (const info of resulting) {
      allMembers.push(...info.members);
    }
    // Numeric comparator: the default sort compares as strings.
    allMembers.sort((a, b) => a - b);
    expect(allMembers).toEqual([1, 2, 3]);

    // Some cluster must consist of record 1 alone.
    const foundSingleton = resulting.some(
      (info) => info.members.length === 1 && info.members[0] === 1,
    );
    expect(foundSingleton).toBe(true);
  });
});
|
|
167
|
+
|
|
168
|
+
/** unmergeCluster: dissolving a cluster yields one singleton per member. */
describe("unmergeCluster", () => {
  it("shatters cluster into singletons", () => {
    const chain: [number, number, number][] = [
      [1, 2, 0.95],
      [2, 3, 0.95],
    ];
    const clusters = buildClusters(chain, [1, 2, 3]);
    const [targetId] = [...clusters.keys()];

    const updated = unmergeCluster(targetId!, clusters);

    expect(updated.size).toBe(3);
    [...updated.values()].forEach((info) => {
      expect(info.size).toBe(1);
    });
  });
});
|
|
180
|
+
|
|
181
|
+
/**
 * addToCluster: a new record with no matches becomes a singleton, with one
 * matching cluster it joins it, and with several it merges them.
 */
describe("addToCluster", () => {
  // Sizes of every cluster currently in the map.
  const clusterSizes = (clusters: Map<number, ClusterInfo>): number[] =>
    [...clusters.values()].map((info) => info.size);

  it("no match -> singleton", () => {
    const clusters = new Map<number, ClusterInfo>();
    addToCluster(5, [], clusters);

    expect(clusters.size).toBe(1);
    const [only] = [...clusters.values()];
    expect(only!.members).toEqual([5]);
    expect(only!.size).toBe(1);
  });

  it("1 cluster match -> joins that cluster", () => {
    // Existing cluster is {1,2}; record 3 matches record 1.
    const seed: [number, number, number][] = [[1, 2, 0.9]];
    const clusters = buildClusters(seed, [1, 2]);

    addToCluster(3, [[1, 0.9]], clusters);

    // Some cluster must now have three members.
    expect(clusterSizes(clusters)).toContain(3);
  });

  it("2+ cluster matches -> merges clusters", () => {
    const seed: [number, number, number][] = [
      [1, 2, 0.9],
      [3, 4, 0.9],
    ];
    const clusters = buildClusters(seed, [1, 2, 3, 4]);
    expect(clusters.size).toBe(2);

    // Record 5 matches a member of each existing cluster.
    addToCluster(5, [[1, 0.9], [3, 0.9]], clusters);

    // Some cluster must now have all five members.
    expect(clusterSizes(clusters)).toContain(5);
  });
});
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
import { describe, it, expect } from "vitest";
|
|
2
|
+
import { compareClusters, buildClusters } from "../../src/core/index.js";
|
|
3
|
+
|
|
4
|
+
/**
 * compareClusters: classification counts for identical and differing
 * clusterings, the coverage check, and the cc1/cc2/rc metadata fields.
 */
describe("compareClusters (CCMS)", () => {
  type ScoredPair = [number, number, number];

  it("identical clustering -> all unchanged, TWI = 1", () => {
    const pairs: ScoredPair[] = [[1, 2, 0.9], [3, 4, 0.9]];
    const ids = [1, 2, 3, 4];
    const left = buildClusters(pairs, ids);
    const right = buildClusters(pairs, ids);

    const outcome = compareClusters(left, right);

    expect(outcome.unchanged).toBe(left.size);
    expect(outcome.merged).toBe(0);
    expect(outcome.partitioned).toBe(0);
    expect(outcome.overlapping).toBe(0);
    expect(outcome.twi).toBeCloseTo(1.0, 5);
  });

  it("different clustering produces non-zero classifications", () => {
    // Left side groups {1,2,3}; right side groups {1,2} and {3}.
    const chained: ScoredPair[] = [[1, 2, 0.9], [2, 3, 0.9]];
    const single: ScoredPair[] = [[1, 2, 0.9]];
    const left = buildClusters(chained, [1, 2, 3]);
    const right = buildClusters(single, [1, 2, 3]);

    const outcome = compareClusters(left, right);

    // The left cluster {1,2,3} is split on the right side.
    expect(outcome.partitioned).toBeGreaterThanOrEqual(1);
  });

  it("throws if row id coverage differs", () => {
    const left = buildClusters([], [1, 2]);
    const right = buildClusters([], [1, 2, 3]);
    const compare = (): unknown => compareClusters(left, right);
    expect(compare).toThrow();
  });

  it("returns cc1, cc2, rc metadata", () => {
    const left = buildClusters([[1, 2, 0.9]], [1, 2, 3]);
    const right = buildClusters([[1, 2, 0.9]], [1, 2, 3]);

    const { cc1, cc2, rc } = compareClusters(left, right);

    expect(cc1).toBe(left.size);
    expect(cc2).toBe(right.size);
    expect(rc).toBe(3);
  });
});
|
|
@@ -0,0 +1,301 @@
|
|
|
1
|
+
import { describe, it, expect } from "vitest";
|
|
2
|
+
import { parseConfig } from "../../src/core/index.js";
|
|
3
|
+
|
|
4
|
+
describe("parseConfig", () => {
|
|
5
|
+
// Input using the snake_case `match_settings` key must surface under
// `matchkeys` on the parsed config, with the top-level threshold preserved.
it("accepts snake_case keys", () => {
  const emailField = { field: "email", transforms: ["lowercase"], scorer: "exact", weight: 1.0 };
  const parsed = parseConfig({
    match_settings: [{ name: "email_mk", type: "exact", fields: [emailField] }],
    threshold: 0.9,
  });

  const keys = parsed.matchkeys;
  expect(keys?.length).toBe(1);
  expect(keys?.[0]?.name).toBe("email_mk");
  expect(parsed.threshold).toBe(0.9);
});
|
|
21
|
+
|
|
22
|
+
// Input already using the camelCase `matchkeys` key is accepted, and a
// weighted matchkey keeps its own threshold.
it("accepts camelCase keys", () => {
  const nameField = { field: "name", transforms: [], scorer: "jaro_winkler", weight: 1.0 };
  const parsed = parseConfig({
    matchkeys: [{ name: "mk1", type: "weighted", fields: [nameField], threshold: 0.85 }],
  });

  const first = parsed.matchkeys?.[0];
  expect(first?.type).toBe("weighted");
  // The type check narrows `first` so that `threshold` can be read.
  if (first?.type === "weighted") {
    expect(first.threshold).toBe(0.85);
  }
});
|
|
40
|
+
|
|
41
|
+
// Every entry of a matchkey's `fields` array is kept, including its weight.
it("parses matchkeys fields array", () => {
  const fields = [
    { field: "first", transforms: ["lowercase"], scorer: "jaro_winkler", weight: 0.5 },
    { field: "last", transforms: [], scorer: "jaro_winkler", weight: 1.0 },
  ];
  const parsed = parseConfig({ matchkeys: [{ name: "m", type: "weighted", fields }] });

  const parsedFields = parsed.matchkeys?.[0]?.fields;
  expect(parsedFields.length).toBe(2);
  expect(parsedFields[0]?.weight).toBe(0.5);
});
|
|
58
|
+
|
|
59
|
+
// snake_case blocking options (`max_block_size`, `skip_oversized`) are
// exposed in camelCase on the parsed blocking config.
it("parses blocking config", () => {
  const parsed = parseConfig({
    blocking: {
      strategy: "static",
      keys: [{ fields: ["zip"], transforms: ["lowercase"] }],
      max_block_size: 1000,
      skip_oversized: true,
    },
  });

  const { blocking } = parsed;
  expect(blocking?.strategy).toBe("static");
  expect(blocking?.maxBlockSize).toBe(1000);
  expect(blocking?.skipOversized).toBe(true);
  expect(blocking?.keys.length).toBe(1);
});
|
|
74
|
+
|
|
75
|
+
// The snake_case `golden_rules.default` entry is exposed as
// `goldenRules.defaultStrategy`.
it("normalizes golden_rules.default -> defaultStrategy", () => {
  const parsed = parseConfig({ golden_rules: { default: "most_complete" } });
  expect(parsed.goldenRules?.defaultStrategy).toBe("most_complete");
});
|
|
84
|
+
|
|
85
|
+
// The camelCase form is accepted as-is, with no normalization needed.
it("accepts goldenRules.defaultStrategy directly", () => {
  const parsed = parseConfig({ goldenRules: { defaultStrategy: "majority_vote" } });
  expect(parsed.goldenRules?.defaultStrategy).toBe("majority_vote");
});
|
|
94
|
+
|
|
95
|
+
// A string or null in place of the config object must be rejected.
it("throws on invalid config (not an object)", () => {
  const parseString = (): unknown => parseConfig("not-an-object");
  const parseNull = (): unknown => parseConfig(null);
  expect(parseString).toThrow();
  expect(parseNull).toThrow();
});
|
|
99
|
+
|
|
100
|
+
// Validation must also reject a nested matchkey that has no `name`.
it("throws on invalid nested config (matchkey without name)", () => {
  const namelessMatchkey = { type: "exact", fields: [] };
  const attempt = (): unknown => parseConfig({ matchkeys: [namelessMatchkey] });
  expect(attempt).toThrow();
});
|
|
106
|
+
|
|
107
|
+
// -------------------------------------------------------------------------
|
|
108
|
+
// String-union validation
|
|
109
|
+
// -------------------------------------------------------------------------
|
|
110
|
+
|
|
111
|
+
describe("string-union validation", () => {
|
|
112
|
+
// An unknown matchkey `type` must be rejected with a message that names the
// offending value and lists the valid options.
it("throws on invalid matchkey type with clear message listing valid options", () => {
  const raw = {
    matchkeys: [
      {
        name: "bad",
        type: "garbage",
        fields: [{ field: "x", transforms: [], scorer: "exact", weight: 1 }],
      },
    ],
  };

  // Record whether parseConfig threw instead of throwing a sentinel inside
  // the `try`: a sentinel would be swallowed by the `catch` below and the
  // test would fail with a misleading substring mismatch.
  let threw = false;
  let msg = "";
  try {
    parseConfig(raw);
  } catch (err) {
    threw = true;
    msg = err instanceof Error ? err.message : String(err);
  }

  expect(threw).toBe(true);
  for (const fragment of ["garbage", "exact", "weighted", "probabilistic"]) {
    expect(msg).toContain(fragment);
  }
});
|
|
133
|
+
|
|
134
|
+
// An unknown transform must be rejected with a message that names the
// offending value and lists valid transforms.
it("throws on invalid transform with valid options listed", () => {
  const raw = {
    matchkeys: [
      {
        name: "mk",
        type: "weighted",
        fields: [
          {
            field: "name",
            transforms: ["not_a_real_transform"],
            scorer: "jaro_winkler",
            weight: 1,
          },
        ],
      },
    ],
  };

  // Record whether parseConfig threw instead of throwing a sentinel inside
  // the `try`: a sentinel would be swallowed by the `catch` below and the
  // test would fail with a misleading substring mismatch.
  let threw = false;
  let msg = "";
  try {
    parseConfig(raw);
  } catch (err) {
    threw = true;
    msg = err instanceof Error ? err.message : String(err);
  }

  expect(threw).toBe(true);
  for (const fragment of ["not_a_real_transform", "lowercase", "substring", "qgram"]) {
    expect(msg).toContain(fragment);
  }
});
|
|
162
|
+
|
|
163
|
+
// The error for an unknown blocking strategy must mention the bad value.
it("throws on invalid blocking strategy", () => {
  const blocking = {
    strategy: "nonsense",
    keys: [{ fields: ["zip"], transforms: [] }],
  };
  const attempt = (): unknown => parseConfig({ blocking });
  expect(attempt).toThrow(/nonsense/);
});
|
|
172
|
+
|
|
173
|
+
// A per-field golden rule with an unknown strategy must be rejected, and the
// error must mention the bad value.
it("throws on invalid golden_rules field strategy", () => {
  const goldenRules = {
    default: "most_complete",
    field_rules: {
      email: { strategy: "pick_worst" },
    },
  };
  const attempt = (): unknown => parseConfig({ golden_rules: goldenRules });
  expect(attempt).toThrow(/pick_worst/);
});
|
|
184
|
+
|
|
185
|
+
// An unknown standardizer name must be rejected, and the error must mention it.
it("throws on invalid standardizer", () => {
  const standardization = { rules: { name: ["scramble"] } };
  const attempt = (): unknown => parseConfig({ standardization });
  expect(attempt).toThrow(/scramble/);
});
|
|
195
|
+
|
|
196
|
+
// An unknown memory backend must be rejected, and the error must mention it.
it("throws on invalid memory backend", () => {
  const memory = { enabled: true, backend: "redis" };
  const attempt = (): unknown => parseConfig({ memory });
  expect(attempt).toThrow(/redis/);
});
|
|
205
|
+
|
|
206
|
+
// Transforms carrying `:`-separated arguments must pass validation and be
// returned unchanged.
it("accepts parametric transforms (substring, qgram, bloom_filter)", () => {
  const fieldA = {
    field: "a",
    transforms: ["substring:0:3", "qgram:3", "bloom_filter"],
    scorer: "jaro_winkler",
    weight: 1,
  };
  const fieldB = {
    field: "b",
    transforms: ["bloom_filter:high"],
    scorer: "dice",
    weight: 1,
  };

  const parsed = parseConfig({
    matchkeys: [{ name: "mk", type: "weighted", fields: [fieldA, fieldB] }],
  });

  const parsedFields = parsed.matchkeys?.[0]?.fields;
  expect(parsedFields[0]?.transforms).toEqual(["substring:0:3", "qgram:3", "bloom_filter"]);
  expect(parsedFields[1]?.transforms).toEqual(["bloom_filter:high"]);
});
|
|
239
|
+
|
|
240
|
+
// An unrecognized scorer name is kept on the parsed config and reported via
// console.warn rather than an exception.
it("unknown scorer only warns (does not throw)", () => {
  const raw = {
    matchkeys: [
      {
        name: "mk",
        type: "weighted",
        fields: [
          { field: "a", transforms: ["lowercase"], scorer: "my_plugin_scorer", weight: 1 },
        ],
      },
    ],
  };

  // Swap console.warn for a recorder of the first argument; the real
  // function is put back in `finally` even if an assertion fails.
  const captured: string[] = [];
  const realWarn = console.warn;
  console.warn = (first: unknown) => {
    captured.push(String(first));
  };
  try {
    const parsed = parseConfig(raw);
    expect(parsed.matchkeys?.[0]?.fields[0]?.scorer).toBe("my_plugin_scorer");
    expect(captured.join(" ")).toContain("my_plugin_scorer");
  } finally {
    console.warn = realWarn;
  }
});
|
|
270
|
+
|
|
271
|
+
// A scorer the loader recognizes must not produce any console.warn output.
it("accepts valid known scorers without warning", () => {
  const raw = {
    matchkeys: [
      {
        name: "mk",
        type: "weighted",
        fields: [
          { field: "a", transforms: ["lowercase"], scorer: "jaro_winkler", weight: 1 },
        ],
      },
    ],
  };

  // Swap console.warn for a recorder of the first argument; the real
  // function is put back in `finally` even if an assertion fails.
  const captured: string[] = [];
  const realWarn = console.warn;
  console.warn = (first: unknown) => {
    captured.push(String(first));
  };
  try {
    parseConfig(raw);
    expect(captured).toHaveLength(0);
  } finally {
    console.warn = realWarn;
  }
});
|
|
300
|
+
});
|
|
301
|
+
});
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
import { describe, it, expect } from "vitest";
|
|
2
|
+
import {
|
|
3
|
+
registerConnector,
|
|
4
|
+
loadConnector,
|
|
5
|
+
listConnectors,
|
|
6
|
+
type BaseConnector,
|
|
7
|
+
} from "../../src/node/connectors/base.js";
|
|
8
|
+
|
|
9
|
+
/**
 * Builds a stub connector for the registry tests.
 *
 * @param name - Value exposed as the connector's `name`.
 * @returns A connector whose `connect` and `close` do nothing and whose
 *   `read` resolves to an empty array.
 */
function makeFakeConnector(name: string): BaseConnector {
  const stub: BaseConnector = {
    name,
    connect: async () => {},
    read: async () => [],
    close: async () => {},
  };
  return stub;
}
|
|
19
|
+
|
|
20
|
+
/**
 * Connector registry: registering a factory, loading by name, listing the
 * registered names, and forwarding config to the factory.
 */
describe("connector registry", () => {
  it("registerConnector + loadConnector round-trips", () => {
    const id = "test_fake_a";
    registerConnector(id, () => makeFakeConnector(id));
    const loaded = loadConnector(id, {});
    expect(loaded.name).toBe("test_fake_a");
  });

  it("loadConnector with unknown name throws including registered list", () => {
    registerConnector("known_one", () => makeFakeConnector("known_one"));
    const loadMissing = (): unknown => loadConnector("does_not_exist", {});
    // The error must say the connector is unknown and name a registered one.
    expect(loadMissing).toThrow(/Unknown connector/);
    expect(loadMissing).toThrow(/known_one/);
  });

  it("listConnectors includes registered connectors", () => {
    registerConnector("test_fake_b", () => makeFakeConnector("test_fake_b"));
    expect(listConnectors()).toContain("test_fake_b");
  });

  it("factory receives passed config", async () => {
    // Record whatever config the registry hands to the factory.
    let received: unknown = null;
    registerConnector("test_fake_c", (cfg) => {
      received = cfg;
      return makeFakeConnector("test_fake_c");
    });

    loadConnector("test_fake_c", { foo: "bar", n: 42 });

    expect(received).toEqual({ foo: "bar", n: 42 });
  });
});
|