goldenmatch 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +140 -0
- package/dist/cli.cjs +6079 -0
- package/dist/cli.cjs.map +1 -0
- package/dist/cli.d.cts +1 -0
- package/dist/cli.d.ts +1 -0
- package/dist/cli.js +6076 -0
- package/dist/cli.js.map +1 -0
- package/dist/core/index.cjs +8449 -0
- package/dist/core/index.cjs.map +1 -0
- package/dist/core/index.d.cts +1972 -0
- package/dist/core/index.d.ts +1972 -0
- package/dist/core/index.js +8318 -0
- package/dist/core/index.js.map +1 -0
- package/dist/index.cjs +8449 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +2 -0
- package/dist/index.d.ts +2 -0
- package/dist/index.js +8318 -0
- package/dist/index.js.map +1 -0
- package/dist/node/backends/score-worker.cjs +934 -0
- package/dist/node/backends/score-worker.cjs.map +1 -0
- package/dist/node/backends/score-worker.d.cts +14 -0
- package/dist/node/backends/score-worker.d.ts +14 -0
- package/dist/node/backends/score-worker.js +932 -0
- package/dist/node/backends/score-worker.js.map +1 -0
- package/dist/node/index.cjs +11430 -0
- package/dist/node/index.cjs.map +1 -0
- package/dist/node/index.d.cts +554 -0
- package/dist/node/index.d.ts +554 -0
- package/dist/node/index.js +11277 -0
- package/dist/node/index.js.map +1 -0
- package/dist/types-DhUdX5Rc.d.cts +304 -0
- package/dist/types-DhUdX5Rc.d.ts +304 -0
- package/examples/01-basic-dedupe.ts +60 -0
- package/examples/02-match-two-datasets.ts +48 -0
- package/examples/03-csv-file-pipeline.ts +62 -0
- package/examples/04-string-scoring.ts +63 -0
- package/examples/05-custom-config.ts +94 -0
- package/examples/06-probabilistic-fs.ts +72 -0
- package/examples/07-pprl-privacy.ts +76 -0
- package/examples/08-streaming.ts +79 -0
- package/examples/09-llm-scorer.ts +79 -0
- package/examples/10-explain.ts +60 -0
- package/examples/11-evaluate.ts +61 -0
- package/examples/README.md +53 -0
- package/package.json +66 -0
- package/src/cli.ts +372 -0
- package/src/core/ann-blocker.ts +593 -0
- package/src/core/api.ts +220 -0
- package/src/core/autoconfig.ts +363 -0
- package/src/core/autofix.ts +102 -0
- package/src/core/blocker.ts +655 -0
- package/src/core/cluster.ts +699 -0
- package/src/core/compare-clusters.ts +176 -0
- package/src/core/config/loader.ts +869 -0
- package/src/core/cross-encoder.ts +614 -0
- package/src/core/data.ts +430 -0
- package/src/core/domain.ts +277 -0
- package/src/core/embedder.ts +562 -0
- package/src/core/evaluate.ts +156 -0
- package/src/core/explain.ts +352 -0
- package/src/core/golden.ts +524 -0
- package/src/core/graph-er.ts +371 -0
- package/src/core/index.ts +314 -0
- package/src/core/ingest.ts +112 -0
- package/src/core/learned-blocking.ts +305 -0
- package/src/core/lineage.ts +221 -0
- package/src/core/llm/budget.ts +258 -0
- package/src/core/llm/cluster.ts +542 -0
- package/src/core/llm/scorer.ts +396 -0
- package/src/core/match-one.ts +95 -0
- package/src/core/matchkey.ts +97 -0
- package/src/core/memory/corrections.ts +179 -0
- package/src/core/memory/learner.ts +218 -0
- package/src/core/memory/store.ts +114 -0
- package/src/core/pipeline.ts +366 -0
- package/src/core/pprl/protocol.ts +216 -0
- package/src/core/probabilistic.ts +511 -0
- package/src/core/profiler.ts +212 -0
- package/src/core/quality.ts +197 -0
- package/src/core/review-queue.ts +177 -0
- package/src/core/scorer.ts +855 -0
- package/src/core/sensitivity.ts +196 -0
- package/src/core/standardize.ts +279 -0
- package/src/core/streaming.ts +128 -0
- package/src/core/transforms.ts +599 -0
- package/src/core/types.ts +570 -0
- package/src/core/validate.ts +243 -0
- package/src/index.ts +8 -0
- package/src/node/a2a/server.ts +470 -0
- package/src/node/api/server.ts +412 -0
- package/src/node/backends/duckdb.ts +130 -0
- package/src/node/backends/score-worker.ts +41 -0
- package/src/node/backends/workers.ts +212 -0
- package/src/node/config-file.ts +66 -0
- package/src/node/connectors/base.ts +57 -0
- package/src/node/connectors/bigquery.ts +61 -0
- package/src/node/connectors/databricks.ts +69 -0
- package/src/node/connectors/file.ts +350 -0
- package/src/node/connectors/hubspot.ts +62 -0
- package/src/node/connectors/index.ts +43 -0
- package/src/node/connectors/salesforce.ts +93 -0
- package/src/node/connectors/snowflake.ts +73 -0
- package/src/node/db/postgres.ts +173 -0
- package/src/node/db/sync.ts +103 -0
- package/src/node/dedupe-file.ts +156 -0
- package/src/node/index.ts +89 -0
- package/src/node/mcp/server.ts +940 -0
- package/src/node/tui/app.ts +756 -0
- package/src/node/tui/index.ts +6 -0
- package/src/node/tui/widgets.ts +128 -0
- package/tests/parity/scorer-ground-truth.test.ts +118 -0
- package/tests/smoke.test.ts +46 -0
- package/tests/unit/a2a-server.test.ts +175 -0
- package/tests/unit/ann-blocker.test.ts +117 -0
- package/tests/unit/api-server.test.ts +239 -0
- package/tests/unit/api.test.ts +77 -0
- package/tests/unit/autoconfig.test.ts +103 -0
- package/tests/unit/autofix.test.ts +71 -0
- package/tests/unit/blocker.test.ts +164 -0
- package/tests/unit/buildBlocksAsync.test.ts +63 -0
- package/tests/unit/cluster.test.ts +213 -0
- package/tests/unit/compare-clusters.test.ts +42 -0
- package/tests/unit/config-loader.test.ts +301 -0
- package/tests/unit/connectors-base.test.ts +48 -0
- package/tests/unit/cross-encoder-model.test.ts +198 -0
- package/tests/unit/cross-encoder.test.ts +173 -0
- package/tests/unit/db-connectors.test.ts +37 -0
- package/tests/unit/domain.test.ts +80 -0
- package/tests/unit/embedder.test.ts +151 -0
- package/tests/unit/evaluate.test.ts +85 -0
- package/tests/unit/explain.test.ts +73 -0
- package/tests/unit/golden.test.ts +97 -0
- package/tests/unit/graph-er.test.ts +173 -0
- package/tests/unit/hnsw-ann.test.ts +283 -0
- package/tests/unit/hubspot-connector.test.ts +118 -0
- package/tests/unit/ingest.test.ts +97 -0
- package/tests/unit/learned-blocking.test.ts +134 -0
- package/tests/unit/lineage.test.ts +135 -0
- package/tests/unit/match-one.test.ts +129 -0
- package/tests/unit/matchkey.test.ts +97 -0
- package/tests/unit/mcp-server.test.ts +183 -0
- package/tests/unit/memory.test.ts +119 -0
- package/tests/unit/pipeline.test.ts +118 -0
- package/tests/unit/pprl-protocol.test.ts +381 -0
- package/tests/unit/probabilistic.test.ts +494 -0
- package/tests/unit/profiler.test.ts +68 -0
- package/tests/unit/review-queue.test.ts +68 -0
- package/tests/unit/salesforce-connector.test.ts +148 -0
- package/tests/unit/scorer.test.ts +301 -0
- package/tests/unit/sensitivity.test.ts +154 -0
- package/tests/unit/standardize.test.ts +84 -0
- package/tests/unit/streaming.test.ts +82 -0
- package/tests/unit/transforms.test.ts +208 -0
- package/tests/unit/tui-widgets.test.ts +42 -0
- package/tests/unit/tui.test.ts +24 -0
- package/tests/unit/validate.test.ts +145 -0
- package/tests/unit/workers-parallel.test.ts +99 -0
- package/tests/unit/workers.test.ts +74 -0
- package/tsconfig.json +25 -0
- package/tsup.config.ts +37 -0
- package/vitest.config.ts +11 -0
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
import { describe, it, expect } from "vitest";
|
|
2
|
+
import { evaluatePairs, evaluateClusters, buildClusters } from "../../src/core/index.js";
|
|
3
|
+
import type { ScoredPair } from "../../src/core/index.js";
|
|
4
|
+
|
|
5
|
+
/**
 * evaluatePairs: compares predicted scored pairs against ground-truth pairs and
 * reports TP/FP/FN counts plus precision, recall and F1.
 */
describe("evaluatePairs", () => {
  it("perfect predictions -> precision=recall=f1=1.0", () => {
    // Predictions and ground truth contain exactly the same two pairs.
    const predictions: ScoredPair[] = [
      { idA: 1, idB: 2, score: 0.9 },
      { idA: 3, idB: 4, score: 0.9 },
    ];
    const groundTruth: [number, number][] = [
      [1, 2],
      [3, 4],
    ];

    const metrics = evaluatePairs(predictions, groundTruth);

    expect(metrics.precision).toBe(1);
    expect(metrics.recall).toBe(1);
    expect(metrics.f1).toBe(1);
    expect(metrics.truePositives).toBe(2);
    expect(metrics.falsePositives).toBe(0);
    expect(metrics.falseNegatives).toBe(0);
  });

  it("mix of TP, FP, FN", () => {
    const predictions: ScoredPair[] = [
      { idA: 1, idB: 2, score: 0.9 }, // present in truth -> true positive
      { idA: 5, idB: 6, score: 0.9 }, // absent from truth -> false positive
    ];
    const groundTruth: [number, number][] = [
      [1, 2], // predicted -> true positive
      [3, 4], // never predicted -> false negative
    ];

    const metrics = evaluatePairs(predictions, groundTruth);

    // One of each outcome, so every ratio is exactly one half.
    expect(metrics.truePositives).toBe(1);
    expect(metrics.falsePositives).toBe(1);
    expect(metrics.falseNegatives).toBe(1);
    expect(metrics.precision).toBe(0.5);
    expect(metrics.recall).toBe(0.5);
    expect(metrics.f1).toBe(0.5);
  });

  it("no predictions and no truth -> all zeros", () => {
    const metrics = evaluatePairs([], []);

    expect(metrics.precision).toBe(0);
    expect(metrics.recall).toBe(0);
    expect(metrics.f1).toBe(0);
  });

  it("canonicalizes pair ordering", () => {
    // The prediction lists the ids as (2,1); it must still count against truth (1,2).
    const reversedPrediction: ScoredPair[] = [{ idA: 2, idB: 1, score: 0.9 }];
    const groundTruth: [number, number][] = [[1, 2]];

    const metrics = evaluatePairs(reversedPrediction, groundTruth);

    expect(metrics.truePositives).toBe(1);
  });
});
|
|
57
|
+
|
|
58
|
+
/**
 * evaluateClusters: scores a clustering against ground-truth pairs by first
 * expanding each cluster into its member pairs.
 */
describe("evaluateClusters", () => {
  it("expands clusters to pairs then evaluates", () => {
    // All three edges link ids 1, 2 and 3, so the expected cluster {1,2,3}
    // expands to the pairs (1,2), (1,3) and (2,3).
    const scoredEdges: [number, number, number][] = [
      [1, 2, 0.9],
      [2, 3, 0.9],
      [1, 3, 0.9],
    ];
    const groundTruth: [number, number][] = [
      [1, 2],
      [1, 3],
      [2, 3],
    ];

    const clustering = buildClusters(scoredEdges, [1, 2, 3]);
    const metrics = evaluateClusters(clustering, groundTruth, [1, 2, 3]);

    expect(metrics.precision).toBe(1);
    expect(metrics.recall).toBe(1);
  });

  it("singleton clusters produce no pairs", () => {
    // With no edges there are no predicted pairs, so the single truth pair
    // (1,2) can only be counted as a false negative.
    const groundTruth: [number, number][] = [[1, 2]];

    const clustering = buildClusters([], [1, 2, 3]);
    const metrics = evaluateClusters(clustering, groundTruth, [1, 2, 3]);

    expect(metrics.truePositives).toBe(0);
    expect(metrics.falseNegatives).toBe(1);
  });
});
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
import { describe, it, expect } from "vitest";
|
|
2
|
+
import { explainPair, explainCluster, buildClusters, pairKey } from "../../src/core/index.js";
|
|
3
|
+
import type { ClusterInfo, MatchkeyConfig, Row } from "../../src/core/index.js";
|
|
4
|
+
|
|
5
|
+
/** Fields compared by the shared matchkey: fuzzy name plus exact city, equally weighted. */
const MK_FIELDS: MatchkeyConfig["fields"] = [
  { field: "name", transforms: [], scorer: "jaro_winkler", weight: 1.0 },
  { field: "city", transforms: [], scorer: "exact", weight: 1.0 },
];

/** Weighted matchkey fixture (threshold 0.8) reused by every test in this file. */
const MK: MatchkeyConfig = {
  name: "name_mk",
  type: "weighted",
  threshold: 0.8,
  fields: MK_FIELDS,
};
|
|
14
|
+
|
|
15
|
+
/**
 * explainPair: builds an explanation (score, per-field scores, confidence label,
 * reasoning and free-text explanation) for a pair of rows under a matchkey.
 */
describe("explainPair", () => {
  it("produces reasoning strings and overall score", () => {
    // Two identical rows: every field agrees.
    const left: Row = { name: "John Smith", city: "NYC" };
    const right: Row = { name: "John Smith", city: "NYC" };

    const explained = explainPair(left, right, MK);

    expect(explained.score).toBe(1.0);
    expect(explained.reasoning.length).toBeGreaterThan(0);
    expect(explained.confidence).toBe("high");
    expect(typeof explained.explanation).toBe("string");
    expect(explained.explanation.length).toBeGreaterThan(0);
  });

  it("includes fieldScores for each field", () => {
    // Names differ slightly, cities are identical.
    const left: Row = { name: "John", city: "NYC" };
    const right: Row = { name: "Jon", city: "NYC" };

    const { fieldScores } = explainPair(left, right, MK);

    expect(fieldScores.name).not.toBe(null);
    expect(fieldScores.city).toBe(1.0);
  });

  it("low-confidence when overall score is low", () => {
    // Neither the name nor the city matches.
    const left: Row = { name: "Alice", city: "NYC" };
    const right: Row = { name: "Zeke", city: "LA" };

    const { confidence } = explainPair(left, right, MK);

    expect(confidence).toBe("low");
  });
});
|
|
42
|
+
|
|
43
|
+
/** explainCluster: produces a size and a human-readable summary for one cluster. */
describe("explainCluster", () => {
  it("summarizes multi-member cluster", () => {
    const memberRows: Row[] = [
      { __row_id__: 1, name: "A", city: "X" },
      { __row_id__: 2, name: "A", city: "X" },
      { __row_id__: 3, name: "A", city: "X" },
    ];
    const scoredEdges: [number, number, number][] = [
      [1, 2, 0.9],
      [2, 3, 0.85],
      [1, 3, 0.88],
    ];

    // Explain whichever cluster buildClusters lists first.
    const clustering = buildClusters(scoredEdges, [1, 2, 3]);
    const clusterId = Array.from(clustering.keys())[0]!;
    const clusterInfo = clustering.get(clusterId)!;

    const explained = explainCluster(clusterId, clusterInfo, memberRows, MK);

    expect(explained.size).toBe(3);
    expect(explained.summary).toContain("Cluster of 3");
  });

  it("singleton cluster has specialized summary", () => {
    // Hand-built single-member cluster, so no pair scores and no bottleneck pair.
    const loneCluster: ClusterInfo = {
      members: [7],
      size: 1,
      oversized: false,
      pairScores: new Map(),
      confidence: 1.0,
      bottleneckPair: null,
      clusterQuality: "strong",
    };

    const explained = explainCluster(7, loneCluster, [{ __row_id__: 7 }], MK);

    expect(explained.summary).toContain("Singleton");
  });
});
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
import { describe, it, expect } from "vitest";
|
|
2
|
+
import { mergeField, buildGoldenRecord, makeGoldenRulesConfig } from "../../src/core/index.js";
|
|
3
|
+
import type { GoldenFieldRule, Row } from "../../src/core/index.js";
|
|
4
|
+
|
|
5
|
+
/**
 * mergeField: resolves a list of candidate values for one field into a single
 * value according to the rule's strategy.
 */
describe("mergeField strategies", () => {
  it("most_complete picks longest string", () => {
    const longestWins: GoldenFieldRule = { strategy: "most_complete" };

    const merged = mergeField(["Jon", "John Smith", "Joh"], longestWins);

    expect(merged.value).toBe("John Smith");
  });

  it("majority_vote picks most frequent", () => {
    const vote: GoldenFieldRule = { strategy: "majority_vote" };

    // "A" appears three times, "B" once.
    const merged = mergeField(["A", "B", "A", "A"], vote);

    expect(merged.value).toBe("A");
  });

  it("source_priority picks according to priority list", () => {
    const bySource: GoldenFieldRule = {
      strategy: "source_priority",
      sourcePriority: ["crm", "erp"],
    };

    // "valA" comes from erp and "valB" from crm; crm ranks first, so "valB" wins.
    const merged = mergeField(["valA", "valB"], bySource, { sources: ["erp", "crm"] });

    expect(merged.value).toBe("valB");
  });

  it("most_recent picks latest by date", () => {
    const newestWins: GoldenFieldRule = { strategy: "most_recent" };
    const context = { dates: ["2020-01-01", "2024-01-01"] };

    const merged = mergeField(["old", "new"], newestWins, context);

    expect(merged.value).toBe("new");
  });

  it("first_non_null picks first non-null", () => {
    const firstPresent: GoldenFieldRule = { strategy: "first_non_null" };

    // The leading null must be skipped.
    const merged = mergeField([null, "first", "second"], firstPresent);

    expect(merged.value).toBe("first");
  });

  it("all-identical values -> confidence 1.0", () => {
    const longestWins: GoldenFieldRule = { strategy: "most_complete" };

    const merged = mergeField(["same", "same", "same"], longestWins);

    expect(merged.confidence).toBe(1.0);
    expect(merged.value).toBe("same");
  });

  it("all-null -> value null, confidence 0, sourceIndex null", () => {
    const longestWins: GoldenFieldRule = { strategy: "most_complete" };

    // Nothing to choose from: no value, no confidence, no winning source.
    const merged = mergeField([null, null, null], longestWins);

    expect(merged.value).toBe(null);
    expect(merged.confidence).toBe(0);
    expect(merged.sourceIndex).toBe(null);
  });
});
|
|
57
|
+
|
|
58
|
+
/** buildGoldenRecord: merges the rows of one cluster into a single golden record. */
describe("buildGoldenRecord", () => {
  it("produces a merged record for the cluster", () => {
    const members: Row[] = [
      { __row_id__: 0, name: "Jon", email: "a@x.com" },
      { __row_id__: 1, name: "John", email: "a@x.com" },
    ];
    const config = makeGoldenRulesConfig({ defaultStrategy: "most_complete" });

    const record = buildGoldenRecord(members, config);

    // Both rows carry the same email, so it is kept with full confidence.
    expect(record.fields.email?.value).toBe("a@x.com");
    expect(record.fields.email?.confidence).toBe(1.0);

    // Names differ; most_complete keeps the longer one.
    expect(record.fields.name?.value).toBe("John");

    // The record-level confidence must be a positive value no greater than 1.
    expect(record.goldenConfidence).toBeGreaterThan(0);
    expect(record.goldenConfidence).toBeLessThanOrEqual(1);
  });

  it("empty cluster returns empty record", () => {
    const config = makeGoldenRulesConfig();

    const record = buildGoldenRecord([], config);

    expect(Object.keys(record.fields).length).toBe(0);
    expect(record.goldenConfidence).toBe(0);
  });

  it("ignores internal columns", () => {
    const members: Row[] = [{ __row_id__: 0, __cluster_id__: 1, name: "A" }];
    const config = makeGoldenRulesConfig();

    const record = buildGoldenRecord(members, config);

    // Bookkeeping columns must not surface as golden fields; real data must.
    expect(record.fields.__row_id__).toBeUndefined();
    expect(record.fields.__cluster_id__).toBeUndefined();
    expect(record.fields.name).toBeDefined();
  });
});
|
|
@@ -0,0 +1,173 @@
|
|
|
1
|
+
import { describe, it, expect } from "vitest";
|
|
2
|
+
import {
|
|
3
|
+
runGraphER,
|
|
4
|
+
type TableSchema,
|
|
5
|
+
type Relationship,
|
|
6
|
+
type GraphERScorer,
|
|
7
|
+
} from "../../src/core/graph-er.js";
|
|
8
|
+
import { scorePair } from "../../src/core/scorer.js";
|
|
9
|
+
import { makeMatchkeyField } from "../../src/core/types.js";
|
|
10
|
+
import type { Row, ScoredPair, MatchkeyField } from "../../src/core/types.js";
|
|
11
|
+
|
|
12
|
+
/**
 * Builds a scorer that compares every unordered pair in a row list using a
 * weighted matchkey on the supplied fields.
 *
 * runGraphER expects pair idA/idB to be 0-based row indices (it's how
 * clustersFromPairs seeds its Union-Find), so each pair is identified by the
 * positions of its rows, always with idA < idB.
 *
 * @param fields Matchkey fields handed to scorePair for every comparison.
 * @returns A scorer producing one ScoredPair per unordered row pair.
 */
function allPairsScorer(fields: readonly MatchkeyField[]): GraphERScorer {
  return (rows: readonly Row[]): readonly ScoredPair[] => {
    const scored: ScoredPair[] = [];
    for (const [left, leftRow] of rows.entries()) {
      // Only look forward so each unordered pair is scored exactly once.
      for (let right = left + 1; right < rows.length; right += 1) {
        scored.push({
          idA: left,
          idB: right,
          score: scorePair(leftRow, rows[right]!, fields),
        });
      }
    }
    return scored;
  };
}
|
|
27
|
+
|
|
28
|
+
/**
 * runGraphER on a small customers/companies schema: per-table clustering,
 * evidence propagation across a foreign key, iteration limits, and the error
 * raised when a table has no scorer.
 */
describe("runGraphER", () => {
  /** Wraps two row lists as the "customers" and "companies" tables, both keyed by "id". */
  const makeTables = (customers: Row[], companies: Row[]): TableSchema[] => [
    { name: "customers", rows: customers, idColumn: "id" },
    { name: "companies", rows: companies, idColumn: "id" },
  ];

  /** The single relationship used here: customers.company_id points at a companies row. */
  const customerToCompany = (): Relationship[] => [
    { tableA: "customers", tableB: "companies", fkColumn: "company_id" },
  ];

  /** One independent all-pairs scorer per table, both comparing the same fields. */
  const makeScorers = (fields: readonly MatchkeyField[]): Map<string, GraphERScorer> =>
    new Map<string, GraphERScorer>([
      ["customers", allPairsScorer(fields)],
      ["companies", allPairsScorer(fields)],
    ]);

  it("produces clusters per table for 2 tables + 1 relationship", () => {
    const customers: Row[] = [
      { id: 1, name: "John Smith", company_id: 100 },
      { id: 2, name: "Jon Smith", company_id: 100 },
      { id: 3, name: "Jane Doe", company_id: 200 },
    ];
    const companies: Row[] = [
      { id: 100, name: "Acme Inc" },
      { id: 200, name: "Widgets LLC" },
    ];
    const nameField = [
      makeMatchkeyField({ field: "name", scorer: "jaro_winkler", transforms: ["lowercase"] }),
    ];

    const result = runGraphER(makeTables(customers, companies), customerToCompany(), {
      scorerByTable: makeScorers(nameField),
      threshold: 0.85,
      maxIterations: 5,
    });

    expect(result.clustersByTable.has("customers")).toBe(true);
    expect(result.clustersByTable.has("companies")).toBe(true);
    // Companies are distinct -> 2 singleton clusters
    expect(result.clustersByTable.get("companies")!.size).toBe(2);
    // John/Jon Smith pair above threshold -> 1 cluster of size 2; Jane is singleton
    const custClusters = result.clustersByTable.get("customers")!;
    // Numeric comparator: the default sort compares as strings and would
    // misorder sizes once any cluster reaches 10 members.
    const sizes = [...custClusters.values()].map((c) => c.size).sort((a, b) => a - b);
    expect(sizes).toEqual([1, 2]);
  });

  it("evidence propagation pulls rows with shared FK cluster toward same cluster", () => {
    // Two customer rows whose names are borderline (below base threshold) but
    // whose company_id points to the same company row. Evidence boost should
    // push them over the threshold once companies are clustered.
    const customers: Row[] = [
      { id: 1, name: "J Smith", company_id: 100 },
      { id: 2, name: "John Smyth", company_id: 100 },
    ];
    const companies: Row[] = [{ id: 100, name: "Acme Inc" }];
    const nameField = [makeMatchkeyField({ field: "name", scorer: "jaro_winkler" })];

    // Choose threshold right above the raw JW score for "J Smith" vs "John Smyth"
    // so that only the boost can merge them.
    const rawScore = scorePair(customers[0]!, customers[1]!, nameField);
    const threshold = Math.min(0.99, rawScore + 0.05);
    // Precondition: if the 0.99 clamp ever left the raw score at or above the
    // threshold, the merge below would prove nothing about the boost.
    expect(rawScore).toBeLessThan(threshold);

    const result = runGraphER(makeTables(customers, companies), customerToCompany(), {
      scorerByTable: makeScorers(nameField),
      threshold,
      similarityBoost: 0.5,
      maxIterations: 5,
    });

    const custClusters = result.clustersByTable.get("customers")!;
    // After at least one propagation iteration, the two customers should merge.
    const merged = [...custClusters.values()].some((c) => c.size === 2);
    expect(merged).toBe(true);
  });

  it("converges within maxIterations for a small tractable dataset", () => {
    const customers: Row[] = [
      { id: 1, name: "Alice", company_id: 100 },
      { id: 2, name: "Bob", company_id: 200 },
    ];
    const companies: Row[] = [
      { id: 100, name: "Acme" },
      { id: 200, name: "Widget" },
    ];
    const nameField = [makeMatchkeyField({ field: "name", scorer: "jaro_winkler" })];

    // No relationships are declared for this run.
    const result = runGraphER(makeTables(customers, companies), [], {
      scorerByTable: makeScorers(nameField),
      maxIterations: 10,
    });

    expect(result.converged).toBe(true);
    expect(result.iterations).toBeLessThanOrEqual(10);
  });

  it("respects the maxIterations cap when propagation never stabilizes", () => {
    // Construct inputs that force at least one iteration — we just need to
    // confirm the loop terminates at the cap even if convergence isn't reached.
    // NOTE(review): no relationships are passed, so it is unclear whether any
    // propagation actually runs in this scenario — confirm against runGraphER.
    const customers: Row[] = [
      { id: 1, name: "A", company_id: 100 },
      { id: 2, name: "B", company_id: 100 },
    ];
    const companies: Row[] = [{ id: 100, name: "Acme" }];
    const nameField = [makeMatchkeyField({ field: "name", scorer: "jaro_winkler" })];

    const cap = 2;
    const result = runGraphER(makeTables(customers, companies), [], {
      scorerByTable: makeScorers(nameField),
      maxIterations: cap,
      convergenceThreshold: 0, // never accept convergence except exact match
    });

    // Iterations (+1 if converged) must not exceed the cap + 1.
    expect(result.iterations).toBeLessThanOrEqual(cap + 1);
  });

  it("throws when a scorer is missing for a table", () => {
    const tables: TableSchema[] = [{ name: "t", rows: [{ id: 1 }], idColumn: "id" }];

    // An empty scorer map leaves table "t" without a scorer.
    expect(() => runGraphER(tables, [], { scorerByTable: new Map() })).toThrow(/Missing scorer/);
  });
});
|