goldenmatch 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +140 -0
- package/dist/cli.cjs +6079 -0
- package/dist/cli.cjs.map +1 -0
- package/dist/cli.d.cts +1 -0
- package/dist/cli.d.ts +1 -0
- package/dist/cli.js +6076 -0
- package/dist/cli.js.map +1 -0
- package/dist/core/index.cjs +8449 -0
- package/dist/core/index.cjs.map +1 -0
- package/dist/core/index.d.cts +1972 -0
- package/dist/core/index.d.ts +1972 -0
- package/dist/core/index.js +8318 -0
- package/dist/core/index.js.map +1 -0
- package/dist/index.cjs +8449 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +2 -0
- package/dist/index.d.ts +2 -0
- package/dist/index.js +8318 -0
- package/dist/index.js.map +1 -0
- package/dist/node/backends/score-worker.cjs +934 -0
- package/dist/node/backends/score-worker.cjs.map +1 -0
- package/dist/node/backends/score-worker.d.cts +14 -0
- package/dist/node/backends/score-worker.d.ts +14 -0
- package/dist/node/backends/score-worker.js +932 -0
- package/dist/node/backends/score-worker.js.map +1 -0
- package/dist/node/index.cjs +11430 -0
- package/dist/node/index.cjs.map +1 -0
- package/dist/node/index.d.cts +554 -0
- package/dist/node/index.d.ts +554 -0
- package/dist/node/index.js +11277 -0
- package/dist/node/index.js.map +1 -0
- package/dist/types-DhUdX5Rc.d.cts +304 -0
- package/dist/types-DhUdX5Rc.d.ts +304 -0
- package/examples/01-basic-dedupe.ts +60 -0
- package/examples/02-match-two-datasets.ts +48 -0
- package/examples/03-csv-file-pipeline.ts +62 -0
- package/examples/04-string-scoring.ts +63 -0
- package/examples/05-custom-config.ts +94 -0
- package/examples/06-probabilistic-fs.ts +72 -0
- package/examples/07-pprl-privacy.ts +76 -0
- package/examples/08-streaming.ts +79 -0
- package/examples/09-llm-scorer.ts +79 -0
- package/examples/10-explain.ts +60 -0
- package/examples/11-evaluate.ts +61 -0
- package/examples/README.md +53 -0
- package/package.json +66 -0
- package/src/cli.ts +372 -0
- package/src/core/ann-blocker.ts +593 -0
- package/src/core/api.ts +220 -0
- package/src/core/autoconfig.ts +363 -0
- package/src/core/autofix.ts +102 -0
- package/src/core/blocker.ts +655 -0
- package/src/core/cluster.ts +699 -0
- package/src/core/compare-clusters.ts +176 -0
- package/src/core/config/loader.ts +869 -0
- package/src/core/cross-encoder.ts +614 -0
- package/src/core/data.ts +430 -0
- package/src/core/domain.ts +277 -0
- package/src/core/embedder.ts +562 -0
- package/src/core/evaluate.ts +156 -0
- package/src/core/explain.ts +352 -0
- package/src/core/golden.ts +524 -0
- package/src/core/graph-er.ts +371 -0
- package/src/core/index.ts +314 -0
- package/src/core/ingest.ts +112 -0
- package/src/core/learned-blocking.ts +305 -0
- package/src/core/lineage.ts +221 -0
- package/src/core/llm/budget.ts +258 -0
- package/src/core/llm/cluster.ts +542 -0
- package/src/core/llm/scorer.ts +396 -0
- package/src/core/match-one.ts +95 -0
- package/src/core/matchkey.ts +97 -0
- package/src/core/memory/corrections.ts +179 -0
- package/src/core/memory/learner.ts +218 -0
- package/src/core/memory/store.ts +114 -0
- package/src/core/pipeline.ts +366 -0
- package/src/core/pprl/protocol.ts +216 -0
- package/src/core/probabilistic.ts +511 -0
- package/src/core/profiler.ts +212 -0
- package/src/core/quality.ts +197 -0
- package/src/core/review-queue.ts +177 -0
- package/src/core/scorer.ts +855 -0
- package/src/core/sensitivity.ts +196 -0
- package/src/core/standardize.ts +279 -0
- package/src/core/streaming.ts +128 -0
- package/src/core/transforms.ts +599 -0
- package/src/core/types.ts +570 -0
- package/src/core/validate.ts +243 -0
- package/src/index.ts +8 -0
- package/src/node/a2a/server.ts +470 -0
- package/src/node/api/server.ts +412 -0
- package/src/node/backends/duckdb.ts +130 -0
- package/src/node/backends/score-worker.ts +41 -0
- package/src/node/backends/workers.ts +212 -0
- package/src/node/config-file.ts +66 -0
- package/src/node/connectors/base.ts +57 -0
- package/src/node/connectors/bigquery.ts +61 -0
- package/src/node/connectors/databricks.ts +69 -0
- package/src/node/connectors/file.ts +350 -0
- package/src/node/connectors/hubspot.ts +62 -0
- package/src/node/connectors/index.ts +43 -0
- package/src/node/connectors/salesforce.ts +93 -0
- package/src/node/connectors/snowflake.ts +73 -0
- package/src/node/db/postgres.ts +173 -0
- package/src/node/db/sync.ts +103 -0
- package/src/node/dedupe-file.ts +156 -0
- package/src/node/index.ts +89 -0
- package/src/node/mcp/server.ts +940 -0
- package/src/node/tui/app.ts +756 -0
- package/src/node/tui/index.ts +6 -0
- package/src/node/tui/widgets.ts +128 -0
- package/tests/parity/scorer-ground-truth.test.ts +118 -0
- package/tests/smoke.test.ts +46 -0
- package/tests/unit/a2a-server.test.ts +175 -0
- package/tests/unit/ann-blocker.test.ts +117 -0
- package/tests/unit/api-server.test.ts +239 -0
- package/tests/unit/api.test.ts +77 -0
- package/tests/unit/autoconfig.test.ts +103 -0
- package/tests/unit/autofix.test.ts +71 -0
- package/tests/unit/blocker.test.ts +164 -0
- package/tests/unit/buildBlocksAsync.test.ts +63 -0
- package/tests/unit/cluster.test.ts +213 -0
- package/tests/unit/compare-clusters.test.ts +42 -0
- package/tests/unit/config-loader.test.ts +301 -0
- package/tests/unit/connectors-base.test.ts +48 -0
- package/tests/unit/cross-encoder-model.test.ts +198 -0
- package/tests/unit/cross-encoder.test.ts +173 -0
- package/tests/unit/db-connectors.test.ts +37 -0
- package/tests/unit/domain.test.ts +80 -0
- package/tests/unit/embedder.test.ts +151 -0
- package/tests/unit/evaluate.test.ts +85 -0
- package/tests/unit/explain.test.ts +73 -0
- package/tests/unit/golden.test.ts +97 -0
- package/tests/unit/graph-er.test.ts +173 -0
- package/tests/unit/hnsw-ann.test.ts +283 -0
- package/tests/unit/hubspot-connector.test.ts +118 -0
- package/tests/unit/ingest.test.ts +97 -0
- package/tests/unit/learned-blocking.test.ts +134 -0
- package/tests/unit/lineage.test.ts +135 -0
- package/tests/unit/match-one.test.ts +129 -0
- package/tests/unit/matchkey.test.ts +97 -0
- package/tests/unit/mcp-server.test.ts +183 -0
- package/tests/unit/memory.test.ts +119 -0
- package/tests/unit/pipeline.test.ts +118 -0
- package/tests/unit/pprl-protocol.test.ts +381 -0
- package/tests/unit/probabilistic.test.ts +494 -0
- package/tests/unit/profiler.test.ts +68 -0
- package/tests/unit/review-queue.test.ts +68 -0
- package/tests/unit/salesforce-connector.test.ts +148 -0
- package/tests/unit/scorer.test.ts +301 -0
- package/tests/unit/sensitivity.test.ts +154 -0
- package/tests/unit/standardize.test.ts +84 -0
- package/tests/unit/streaming.test.ts +82 -0
- package/tests/unit/transforms.test.ts +208 -0
- package/tests/unit/tui-widgets.test.ts +42 -0
- package/tests/unit/tui.test.ts +24 -0
- package/tests/unit/validate.test.ts +145 -0
- package/tests/unit/workers-parallel.test.ts +99 -0
- package/tests/unit/workers.test.ts +74 -0
- package/tsconfig.json +25 -0
- package/tsup.config.ts +37 -0
- package/vitest.config.ts +11 -0
|
@@ -0,0 +1,148 @@
|
|
|
1
|
+
import {
|
|
2
|
+
describe,
|
|
3
|
+
it,
|
|
4
|
+
expect,
|
|
5
|
+
vi,
|
|
6
|
+
beforeEach,
|
|
7
|
+
afterEach,
|
|
8
|
+
} from "vitest";
|
|
9
|
+
import { createSalesforceConnector } from "../../src/node/connectors/salesforce.js";
|
|
10
|
+
|
|
11
|
+
// Stand-in for the global `fetch`. It is re-created before every test so that
// queued responses and recorded calls never carry over from one test to the next.
let fetchMock: ReturnType<typeof vi.fn>;

// Swap the global `fetch` for a fresh mock ahead of each test.
beforeEach(() => {
  const freshMock = vi.fn();
  fetchMock = freshMock;
  vi.stubGlobal("fetch", freshMock);
});

// Remove every global stub once the test has finished.
afterEach(() => {
  vi.unstubAllGlobals();
});
|
|
21
|
+
|
|
22
|
+
/**
 * Builds a minimal successful (HTTP 200) `Response` stand-in for the mocked `fetch`.
 *
 * Only `ok`, `status`, `text` and `json` exist on the returned object; it is cast
 * to `Response` so it can be queued with `mockResolvedValueOnce`.
 *
 * @param body - Payload resolved by `json()`; `text()` resolves to its JSON serialization.
 * @returns An object shaped like a successful `Response`.
 */
function jsonResponse(body: unknown): Response {
  return {
    ok: true,
    status: 200,
    // Keep text() consistent with json(): a caller reading the raw body sees the same
    // payload. JSON.stringify yields undefined for an undefined body, hence the fallback.
    text: async () => JSON.stringify(body) ?? "",
    json: async () => body,
  } as Response;
}
|
|
30
|
+
|
|
31
|
+
describe("createSalesforceConnector", () => {
  // Every connector in this suite points at the same placeholder instance.
  const instanceUrl = "https://example.my.salesforce.com";

  it("returns connector with name 'salesforce'", () => {
    const connector = createSalesforceConnector({ instanceUrl, accessToken: "token" });
    expect(connector.name).toBe("salesforce");
    // The three lifecycle members must all be callable.
    for (const member of [connector.connect, connector.read, connector.close]) {
      expect(typeof member).toBe("function");
    }
  });

  it("read() before connect() throws 'Not connected'", async () => {
    // Deliberately created without an accessToken.
    const connector = createSalesforceConnector({ instanceUrl });
    const pendingRead = connector.read({ table: "Account" });
    await expect(pendingRead).rejects.toThrow(/not connected/i);
  });

  it("connect() does OAuth password grant", async () => {
    const tokenReply = jsonResponse({
      access_token: "TOKEN_123",
      instance_url: "https://x.my.salesforce.com",
    });
    fetchMock.mockResolvedValueOnce(tokenReply);

    const connector = createSalesforceConnector({
      instanceUrl,
      clientId: "cid",
      clientSecret: "secret",
      username: "u",
      password: "p",
      securityToken: "tok",
    });
    await connector.connect();

    expect(fetchMock).toHaveBeenCalledTimes(1);
    const requestUrl = fetchMock.mock.calls[0]![0] as string;
    expect(requestUrl).toContain("/services/oauth2/token");
    const requestInit = fetchMock.mock.calls[0]![1] as RequestInit;
    expect(requestInit.method).toBe("POST");
    // The form body is expected to carry the password immediately followed by the security token.
    expect(String(requestInit.body)).toContain("password=ptok");
  });

  it("connect() with pre-issued accessToken is no-op (no fetch)", async () => {
    const connector = createSalesforceConnector({ instanceUrl, accessToken: "preissued" });
    await connector.connect();
    expect(fetchMock).not.toHaveBeenCalled();
  });

  it("read() with object query builds SOQL and fetches paginated results", async () => {
    // Page one advertises a nextRecordsUrl; page two reports done.
    const firstPage = jsonResponse({
      records: [{ Id: "001A", Name: "Acme" }],
      done: false,
      nextRecordsUrl: "/services/data/v61.0/query/01g0000NEXT",
    });
    const lastPage = jsonResponse({
      records: [{ Id: "001B", Name: "Beta" }],
      done: true,
    });
    fetchMock.mockResolvedValueOnce(firstPage).mockResolvedValueOnce(lastPage);

    const connector = createSalesforceConnector({ instanceUrl, accessToken: "TOKEN" });
    const rows = await connector.read({
      table: "Account",
      columns: ["Id", "Name"],
      limit: 10,
    });

    expect(rows.length).toBe(2);
    expect(rows[0]!.Id).toBe("001A");
    expect(rows[1]!.Id).toBe("001B");
    expect(fetchMock).toHaveBeenCalledTimes(2);

    const initialUrl = fetchMock.mock.calls[0]![0] as string;
    expect(initialUrl).toContain("/services/data/v61.0/query");
    expect(initialUrl).toContain("Id%2CName"); // comma is URL-encoded
    expect(initialUrl).toContain("LIMIT%2010"); // space is encoded as %20

    const followUpUrl = fetchMock.mock.calls[1]![0] as string;
    expect(followUpUrl).toContain("01g0000NEXT");
  });

  it("read() with raw SOQL string passes it through", async () => {
    const singlePage = jsonResponse({ records: [{ Id: "X" }], done: true });
    fetchMock.mockResolvedValueOnce(singlePage);

    const connector = createSalesforceConnector({ instanceUrl, accessToken: "T" });
    const rows = await connector.read("SELECT Id FROM Account WHERE Name = 'Foo'");

    expect(rows.length).toBe(1);
    const requestUrl = fetchMock.mock.calls[0]![0] as string;
    expect(requestUrl).toContain("WHERE%20Name");
  });

  it("read() throws on non-2xx response", async () => {
    // Hand-built 401 reply; jsonResponse() only produces successful responses.
    const unauthorized = {
      ok: false,
      status: 401,
      text: async () => "Unauthorized",
      json: async () => ({}),
    } as Response;
    fetchMock.mockResolvedValueOnce(unauthorized);

    const connector = createSalesforceConnector({ instanceUrl, accessToken: "T" });
    const pendingRead = connector.read({ table: "Account" });
    await expect(pendingRead).rejects.toThrow(/Salesforce query failed/);
  });
});
|
|
@@ -0,0 +1,301 @@
|
|
|
1
|
+
import { describe, it, expect } from "vitest";
|
|
2
|
+
import {
|
|
3
|
+
scoreField,
|
|
4
|
+
scorePair,
|
|
5
|
+
findExactMatches,
|
|
6
|
+
findFuzzyMatches,
|
|
7
|
+
jaro,
|
|
8
|
+
jaroWinkler,
|
|
9
|
+
levenshteinDistance,
|
|
10
|
+
levenshteinSimilarity,
|
|
11
|
+
tokenSortRatio,
|
|
12
|
+
soundexMatch,
|
|
13
|
+
diceCoefficient,
|
|
14
|
+
jaccardSimilarity,
|
|
15
|
+
ensembleScore,
|
|
16
|
+
scoreMatrix,
|
|
17
|
+
applyTransform,
|
|
18
|
+
} from "../../src/core/index.js";
|
|
19
|
+
import type { MatchkeyConfig, MatchkeyField, Row } from "../../src/core/index.js";
|
|
20
|
+
|
|
21
|
+
describe("jaro / jaroWinkler", () => {
  // [test title, computation under test, reference value quoted in the title]
  const referenceCases: Array<[string, () => number, number]> = [
    ["jaro MARTHA ~= MARHTA matches Python (0.9444)", () => jaro("MARTHA", "MARHTA"), 0.9444],
    [
      "jaroWinkler MARTHA ~= MARHTA matches Python (0.9611)",
      () => jaroWinkler("MARTHA", "MARHTA"),
      0.9611,
    ],
    [
      "jaroWinkler DIXON / DICKSONX matches Python (0.8133)",
      () => jaroWinkler("DIXON", "DICKSONX"),
      0.8133,
    ],
    [
      "jaroWinkler JELLYFISH / SMELLYFISH matches Python (0.8963)",
      () => jaroWinkler("JELLYFISH", "SMELLYFISH"),
      0.8963,
    ],
  ];

  // Each reference value is compared to four decimal places.
  for (const [title, compute, expected] of referenceCases) {
    it(title, () => {
      expect(compute()).toBeCloseTo(expected, 4);
    });
  }

  it("jaro identical -> 1.0", () => {
    const score = jaro("hello", "hello");
    expect(score).toBe(1.0);
  });

  it("jaro empty -> 0", () => {
    const score = jaro("", "hello");
    expect(score).toBe(0.0);
  });
});
|
|
46
|
+
|
|
47
|
+
describe("levenshtein", () => {
  it("kitten -> sitting is distance 3", () => {
    const edits = levenshteinDistance("kitten", "sitting");
    expect(edits).toBe(3);
  });

  it("identical distance 0", () => {
    const edits = levenshteinDistance("abc", "abc");
    expect(edits).toBe(0);
  });

  it("empty -> len", () => {
    // Against an empty string the distance equals the other string's length.
    const edits = levenshteinDistance("", "abc");
    expect(edits).toBe(3);
  });

  it("similarity 1.0 for identical", () => {
    const similarity = levenshteinSimilarity("abc", "abc");
    expect(similarity).toBe(1.0);
  });

  it("kitten/sitting similarity matches Python (1 - 3/7 = 0.5714)", () => {
    const similarity = levenshteinSimilarity("kitten", "sitting");
    expect(similarity).toBeCloseTo(0.5714, 4);
  });

  it("saturday/sunday similarity matches Python (1 - 3/8 = 0.6250)", () => {
    const similarity = levenshteinSimilarity("saturday", "sunday");
    expect(similarity).toBeCloseTo(0.625, 4);
  });
});
|
|
72
|
+
|
|
73
|
+
describe("tokenSortRatio (rapidfuzz-compatible)", () => {
  it("John Smith / Smith John -> 1.0 (same token set)", () => {
    const ratio = tokenSortRatio("John Smith", "Smith John");
    expect(ratio).toBe(1.0);
  });

  it("New York Mets / Mets New York -> 1.0", () => {
    const ratio = tokenSortRatio("New York Mets", "Mets New York");
    expect(ratio).toBe(1.0);
  });

  it("John Smith / Smith Johnson matches Indel ratio (0.8696)", () => {
    // After sorting: "john smith" (10 chars) vs "johnson smith" (13 chars).
    // Indel distance is 3 (insert s, o, n), so the ratio is 1 - 3/23 = 20/23 ≈ 0.8696.
    const ratio = tokenSortRatio("John Smith", "Smith Johnson");
    expect(ratio).toBeCloseTo(0.8696, 4);
  });

  it("lowercases before sorting (case-insensitive)", () => {
    const ratio = tokenSortRatio("John SMITH", "smith JOHN");
    expect(ratio).toBe(1.0);
  });

  it("strips punctuation (rapidfuzz preprocessing)", () => {
    const ratio = tokenSortRatio("John, Smith!", "smith john.");
    expect(ratio).toBe(1.0);
  });

  it("different tokens return < 1", () => {
    const ratio = tokenSortRatio("John", "Jane");
    expect(ratio).toBeLessThan(1.0);
  });
});
|
|
100
|
+
|
|
101
|
+
describe("soundexMatch", () => {
  // [test title, left name, right name, expected match score]
  const cases: Array<[string, string, string, number]> = [
    ["Robert/Rupert same code -> 1.0 (both R163)", "Robert", "Rupert", 1.0],
    ["Smith/Smyth same code -> 1.0 (both S530)", "Smith", "Smyth", 1.0],
    ["Smith/Doe -> 0", "Smith", "Doe", 0.0],
  ];

  for (const [title, left, right, expected] of cases) {
    it(title, () => {
      expect(soundexMatch(left, right)).toBe(expected);
    });
  }
});
|
|
114
|
+
|
|
115
|
+
describe("ensembleScore", () => {
  it("identical strings -> 1", () => {
    const combined = ensembleScore("hello", "hello");
    expect(combined).toBe(1.0);
  });

  it("returns at least jaro_winkler", () => {
    // The ensemble score must never fall below the plain Jaro-Winkler score.
    const [left, right] = ["Smith", "Smyth"];
    const lowerBound = jaroWinkler(left, right);
    const combined = ensembleScore(left, right);
    expect(combined).toBeGreaterThanOrEqual(lowerBound);
  });
});
|
|
126
|
+
|
|
127
|
+
describe("dice / jaccard (bloom filter hex)", () => {
  it("dice of identical bloom filters -> 1.0", () => {
    const encoded = applyTransform("hello", "bloom_filter");
    // The transform must produce a value before it can be scored.
    expect(encoded).not.toBe(null);
    const score = diceCoefficient(encoded!, encoded!);
    expect(score).toBe(1.0);
  });

  it("jaccard of identical -> 1.0", () => {
    const encoded = applyTransform("hello", "bloom_filter");
    const score = jaccardSimilarity(encoded!, encoded!);
    expect(score).toBe(1.0);
  });

  it("all-zero filters -> 0", () => {
    // 256 bits = 32 bytes = 64 hex characters, all zero.
    const hexLength = 64;
    const emptyFilter = "0".repeat(hexLength);
    expect(diceCoefficient(emptyFilter, emptyFilter)).toBe(0.0);
    expect(jaccardSimilarity(emptyFilter, emptyFilter)).toBe(0.0);
  });
});
|
|
146
|
+
|
|
147
|
+
describe("scoreField", () => {
  it("exact a==a -> 1.0", () => {
    const score = scoreField("a", "a", "exact");
    expect(score).toBe(1.0);
  });

  it("exact a!=b -> 0.0", () => {
    const score = scoreField("a", "b", "exact");
    expect(score).toBe(0.0);
  });

  it("returns null if either input is null", () => {
    // Left null, right null, and both null.
    const leftMissing = scoreField(null, "a", "exact");
    expect(leftMissing).toBe(null);
    const rightMissing = scoreField("a", null, "jaro_winkler");
    expect(rightMissing).toBe(null);
    const bothMissing = scoreField(null, null, "exact");
    expect(bothMissing).toBe(null);
  });

  it("unknown scorer throws", () => {
    const callWithUnknownScorer = () => scoreField("a", "b", "fake_scorer");
    expect(callWithUnknownScorer).toThrow();
  });

  it("jaro_winkler returns similarity", () => {
    expect(scoreField("abc", "abc", "jaro_winkler")).toBe(1.0);
  });

  it("levenshtein", () => {
    expect(scoreField("abc", "abc", "levenshtein")).toBe(1.0);
  });

  it("token_sort", () => {
    expect(scoreField("a b", "b a", "token_sort")).toBe(1.0);
  });

  it("token_sort strips punctuation and lowercases (rapidfuzz parity)", () => {
    // Both inputs are expected to normalize to "john smith", giving a perfect score.
    const score = scoreField("John, Smith!", "smith john.", "token_sort");
    expect(score).toBe(1.0);
  });
});
|
|
186
|
+
|
|
187
|
+
describe("scorePair - weighted fields", () => {
  // Builds a weight-1 field spec with no transforms for the given column and scorer.
  const unitField = (field: string, scorer: MatchkeyField["scorer"]): MatchkeyField => ({
    field,
    transforms: [],
    scorer,
    weight: 1.0,
  });

  it("weighted aggregation of fields", () => {
    const left: Row = { name: "John", city: "NYC" };
    const right: Row = { name: "John", city: "NYC" };
    const spec: MatchkeyField[] = [unitField("name", "jaro_winkler"), unitField("city", "exact")];
    expect(scorePair(left, right, spec)).toBe(1.0);
  });

  it("returns 0 when weightSum=0 (all fields null)", () => {
    const left: Row = { name: null };
    const right: Row = { name: null };
    const spec: MatchkeyField[] = [unitField("name", "jaro_winkler")];
    expect(scorePair(left, right, spec)).toBe(0);
  });

  it("weighted average of partial matches", () => {
    const left: Row = { name: "John", city: "NYC" };
    const right: Row = { name: "John", city: "LA" };
    const spec: MatchkeyField[] = [unitField("name", "exact"), unitField("city", "exact")];
    // name matches (1.0), city differs (0.0): (1.0 * 1 + 0.0 * 1) / 2 = 0.5
    expect(scorePair(left, right, spec)).toBe(0.5);
  });
});
|
|
218
|
+
|
|
219
|
+
describe("findExactMatches", () => {
  // Returns a fresh exact-match key on the `email` column, so no test shares a config object.
  const emailKey = (): MatchkeyConfig => ({
    name: "email",
    type: "exact",
    fields: [{ field: "email", transforms: [], scorer: "exact", weight: 1.0 }],
  });

  it("groups by matchkey column", () => {
    const records: Row[] = [
      { __row_id__: 0, email: "a@x.com" },
      { __row_id__: 1, email: "a@x.com" },
      { __row_id__: 2, email: "b@x.com" },
    ];
    const matches = findExactMatches(records, emailKey());
    // Only rows 0 and 1 share an email, so exactly one pair is expected.
    expect(matches.length).toBe(1);
    const onlyPair = matches[0]!;
    expect(onlyPair.idA).toBe(0);
    expect(onlyPair.idB).toBe(1);
    expect(onlyPair.score).toBe(1.0);
  });

  it("returns empty for 0 or 1 rows", () => {
    const key = emailKey();
    const noRows: Row[] = [];
    const oneRow: Row[] = [{ __row_id__: 0, email: "a" }];
    expect(findExactMatches(noRows, key)).toEqual([]);
    expect(findExactMatches(oneRow, key)).toEqual([]);
  });

  it("skips rows where matchkey field is null", () => {
    const records: Row[] = [
      { __row_id__: 0, email: null },
      { __row_id__: 1, email: null },
      { __row_id__: 2, email: "x@x.com" },
    ];
    const matches = findExactMatches(records, emailKey());
    expect(matches.length).toBe(0);
  });
});
|
|
263
|
+
|
|
264
|
+
describe("findFuzzyMatches", () => {
  it("NxN scoring within block", () => {
    const rows: Row[] = [
      { __row_id__: 0, name: "Jon Smith" },
      { __row_id__: 1, name: "John Smith" },
      { __row_id__: 2, name: "Zeke Xavier" },
    ];
    const mk: MatchkeyConfig = {
      name: "name_fuzzy",
      type: "weighted",
      threshold: 0.7,
      fields: [{ field: "name", transforms: [], scorer: "jaro_winkler", weight: 1.0 }],
    };
    const pairs = findFuzzyMatches(rows, mk);

    // Jon/John should match.
    const hasPair01 = pairs.some((p) => p.idA === 0 && p.idB === 1);
    expect(hasPair01).toBe(true);

    // Zeke should not match either Smith row. This expectation was previously only
    // stated in a comment; "Zeke Xavier" shares just two characters in range with
    // either name, which puts its Jaro-Winkler score well under the 0.7 threshold.
    const zekeIsPaired = pairs.some((p) => p.idA === 2 || p.idB === 2);
    expect(zekeIsPaired).toBe(false);
  });

  it("empty if fewer than 2 rows", () => {
    const mk: MatchkeyConfig = {
      name: "f",
      type: "weighted",
      threshold: 0.85,
      fields: [{ field: "name", transforms: [], scorer: "jaro_winkler", weight: 1.0 }],
    };
    // Zero rows: nothing to compare.
    expect(findFuzzyMatches([], mk)).toEqual([]);
    // One row: still no second record to pair with (the title covers this case too).
    expect(findFuzzyMatches([{ __row_id__: 0, name: "Jon Smith" }], mk)).toEqual([]);
  });
});
|
|
293
|
+
|
|
294
|
+
describe("scoreMatrix", () => {
  it("symmetric with 0 diagonal", () => {
    const inputs = ["abc", "abd", "xyz"];
    const matrix = scoreMatrix(inputs, "jaro_winkler");
    expect(matrix.length).toBe(3);

    // A value is not scored against itself: the diagonal entry is 0.
    const selfScore = matrix[0]![0];
    expect(selfScore).toBe(0);

    // Entry (0, 1) must be identical to entry (1, 0).
    const upper = matrix[0]![1];
    const lower = matrix[1]![0];
    expect(upper).toBe(lower);
  });
});
|
|
@@ -0,0 +1,154 @@
|
|
|
1
|
+
import { describe, it, expect } from "vitest";
|
|
2
|
+
import { runSensitivity, stabilityReport } from "../../src/core/sensitivity.js";
|
|
3
|
+
import {
|
|
4
|
+
makeConfig,
|
|
5
|
+
makeBlockingConfig,
|
|
6
|
+
makeMatchkeyConfig,
|
|
7
|
+
makeMatchkeyField,
|
|
8
|
+
} from "../../src/core/types.js";
|
|
9
|
+
import type { Row } from "../../src/core/types.js";
|
|
10
|
+
|
|
11
|
+
/**
 * Builds the 20-row fixture used by the sensitivity tests.
 *
 * The first ten rows are five near-duplicate name pairs (so that sweeping the
 * threshold actually changes the clustering); the last ten are distinct
 * distractor names. `__row_id__` is assigned sequentially from 0.
 */
function makeRows(): Row[] {
  const nearDuplicatePairs: Array<[string, string]> = [
    ["John Smith", "Jon Smith"],
    ["Mary Jones", "Marie Jones"],
    ["Alice Brown", "Alicia Brown"],
    ["Bob Miller", "Robert Miller"],
    ["Carol Davis", "Caroline Davis"],
  ];
  const distractors = [
    "Zygmunt Petrov", "Xiong Wei", "Nakamura Taro", "Kowalski Jan",
    "Oduya Esther", "Rasmussen Ole", "Tanaka Yui", "Vasquez Maria",
    "Wojcik Piotr", "Yamamoto Ken",
  ];
  // Pairs are flattened in order (a, b, a, b, ...) and followed by the distractors.
  const names = [...nearDuplicatePairs.flat(), ...distractors];
  return names.map((name, index) => ({ __row_id__: index, name }));
}
|
|
36
|
+
|
|
37
|
+
/**
 * Builds the baseline configuration that the sensitivity sweeps start from:
 * one weighted Jaro-Winkler matchkey on the lowercased `name` column, a static
 * blocking key derived from `name`, and a threshold of 0.85 both on the
 * matchkey and at the top level of the config.
 */
function baselineConfig() {
  const threshold = 0.85;

  const nameField = makeMatchkeyField({
    field: "name",
    scorer: "jaro_winkler",
    transforms: ["lowercase"],
  });
  const matchkey = makeMatchkeyConfig({
    name: "name_fuzzy",
    type: "weighted",
    threshold,
    fields: [nameField],
  });

  // Blocking key: `name`, lowercased, then "substring:0:1"
  // (presumably the first character; confirm against the transform implementation).
  const blocking = makeBlockingConfig({
    strategy: "static",
    keys: [{ fields: ["name"], transforms: ["lowercase", "substring:0:1"] }],
  });

  return makeConfig({ matchkeys: [matchkey], blocking, threshold });
}
|
|
56
|
+
|
|
57
|
+
describe("runSensitivity", () => {
  it("produces one SweepPoint per value with stats and TWI vs baseline", () => {
    const sweep = runSensitivity(makeRows(), baselineConfig(), [
      { path: "threshold", values: [0.75, 0.85, 0.95] },
    ]);
    expect(sweep.points.length).toBe(3);

    for (const point of sweep.points) {
      // Points that carry an error are not inspected; only successful ones must expose stats.
      if (point.error !== undefined) continue;
      expect(typeof point.stats["totalClusters"]).toBe("number");
      expect(typeof point.stats["totalRecords"]).toBe("number");
      // TWI is optional; when present it must lie within [0, 1].
      if (point.twi === undefined) continue;
      expect(point.twi).toBeGreaterThanOrEqual(0);
      expect(point.twi).toBeLessThanOrEqual(1);
    }

    // The baseline is compared against itself, so its TWI is exactly 1.
    expect(sweep.baseline.twi).toBe(1.0);
  });

  it("writes the sweep-param value into each point's params object (dot-path at root)", () => {
    const sweep = runSensitivity(makeRows(), baselineConfig(), [
      { path: "threshold", values: [0.7, 0.9] },
    ]);
    const recorded = sweep.points.map((point) => point.params["threshold"]);
    expect(recorded).toEqual([0.7, 0.9]);
  });

  it("preserves partial results when one point errors", () => {
    // Sweep the whole `matchkeys` array: the first candidate is well-formed,
    // the second is malformed and is expected to make that run fail.
    const wellFormed = [
      makeMatchkeyConfig({
        name: "m",
        type: "weighted",
        fields: [makeMatchkeyField({ field: "name", scorer: "jaro_winkler" })],
        threshold: 0.85,
      }),
    ];
    // The `fields` array holds a non-object (null) instead of a field definition;
    // the cast to unknown lets the bad shape through the type checker.
    const malformed = [
      {
        name: "bad",
        type: "weighted",
        fields: [null],
        threshold: 0.85,
      } as unknown,
    ];

    const sweep = runSensitivity(makeRows(), baselineConfig(), [
      { path: "matchkeys", values: [wellFormed, malformed] },
    ]);

    expect(sweep.points.length).toBe(2);
    // At least one point succeeded and at least one recorded an error.
    const failed = sweep.points.filter((point) => point.error !== undefined);
    const succeeded = sweep.points.filter((point) => point.error === undefined);
    expect(succeeded.length).toBeGreaterThanOrEqual(1);
    expect(failed.length).toBeGreaterThanOrEqual(1);
  });

  it("empty sweep params yields zero points but still a baseline", () => {
    const sweep = runSensitivity(makeRows(), baselineConfig(), []);
    expect(sweep.points.length).toBe(0);
    expect(sweep.baseline.twi).toBe(1.0);
    expect(sweep.stable).toBe(true);
  });
});
|
|
140
|
+
|
|
141
|
+
describe("stabilityReport", () => {
  it("returns a string summarizing the sweep", () => {
    const sweep = runSensitivity(makeRows(), baselineConfig(), [
      { path: "threshold", values: [0.85, 0.95] },
    ]);
    const summary = stabilityReport(sweep);
    expect(typeof summary).toBe("string");
    // The report must mention each of these headings.
    for (const heading of ["Sensitivity sweep", "Points", "Stable"]) {
      expect(summary).toContain(heading);
    }
  });
});
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
import { describe, it, expect } from "vitest";
|
|
2
|
+
import { applyStandardizer, applyStandardization } from "../../src/core/index.js";
|
|
3
|
+
import type { Row } from "../../src/core/index.js";
|
|
4
|
+
|
|
5
|
+
describe("applyStandardizer", () => {
  // [test title, raw input, standardizer name, expected output]
  const cases: Array<[string, string, string, string]> = [
    ["email: lowercase and strip", "  USER@Example.COM  ", "email", "user@example.com"],
    ["email: invalid returns empty string (null->empty)", "not-an-email", "email", ""],
    ["name_proper: JOHN smith -> John Smith", "JOHN smith", "name_proper", "John Smith"],
    ["name_proper: hyphenated mary-jane -> Mary-Jane", "mary-jane", "name_proper", "Mary-Jane"],
    ["phone: digits only, strips US country code", "1-800-555-1234", "phone", "8005551234"],
    ["phone: pure digits retained", "(415) 555-1234", "phone", "4155551234"],
    ["zip5: 12345-6789 -> 12345", "12345-6789", "zip5", "12345"],
    ["zip5: short padded", "123", "zip5", "00123"],
    ["address: MAIN ST -> Main St", "MAIN ST", "address", "Main St"],
    ["address: MAIN STREET -> Main St (abbreviated)", "MAIN STREET", "address", "Main St"],
    ["state uppercases", " ca ", "state", "CA"],
    ["strip removes whitespace", "  hello  ", "strip", "hello"],
  ];

  for (const [title, raw, standardizer, expected] of cases) {
    it(title, () => {
      expect(applyStandardizer(raw, standardizer)).toBe(expected);
    });
  }

  it("unknown standardizer throws", () => {
    const callWithUnknownName = () => applyStandardizer("x", "not-a-thing");
    expect(callWithUnknownName).toThrow();
  });
});
|
|
58
|
+
|
|
59
|
+
describe("applyStandardization", () => {
  it("applies rules dict to rows", () => {
    const input: Row[] = [{ email: "USER@X.COM", first: "JOHN" }];
    // Each column name maps to the list of standardizers applied to it.
    const [standardized] = applyStandardization(input, {
      email: ["email"],
      first: ["name_proper"],
    });
    expect(standardized!.email).toBe("user@x.com");
    expect(standardized!.first).toBe("John");
  });

  it("leaves nulls as-is", () => {
    const input: Row[] = [{ email: null, first: "A" }];
    const [standardized] = applyStandardization(input, {
      email: ["email"],
      first: ["name_proper"],
    });
    // A null cell stays null; the populated cell is still standardized.
    expect(standardized!.email).toBe(null);
    expect(standardized!.first).toBe("A");
  });

  it("chains multiple standardizers", () => {
    const input: Row[] = [{ first: "  JOHN  " }];
    // Standardizers run in the listed order: strip first, then name_proper.
    const [standardized] = applyStandardization(input, { first: ["strip", "name_proper"] });
    expect(standardized!.first).toBe("John");
  });
});
|