goldenmatch 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +140 -0
- package/dist/cli.cjs +6079 -0
- package/dist/cli.cjs.map +1 -0
- package/dist/cli.d.cts +1 -0
- package/dist/cli.d.ts +1 -0
- package/dist/cli.js +6076 -0
- package/dist/cli.js.map +1 -0
- package/dist/core/index.cjs +8449 -0
- package/dist/core/index.cjs.map +1 -0
- package/dist/core/index.d.cts +1972 -0
- package/dist/core/index.d.ts +1972 -0
- package/dist/core/index.js +8318 -0
- package/dist/core/index.js.map +1 -0
- package/dist/index.cjs +8449 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +2 -0
- package/dist/index.d.ts +2 -0
- package/dist/index.js +8318 -0
- package/dist/index.js.map +1 -0
- package/dist/node/backends/score-worker.cjs +934 -0
- package/dist/node/backends/score-worker.cjs.map +1 -0
- package/dist/node/backends/score-worker.d.cts +14 -0
- package/dist/node/backends/score-worker.d.ts +14 -0
- package/dist/node/backends/score-worker.js +932 -0
- package/dist/node/backends/score-worker.js.map +1 -0
- package/dist/node/index.cjs +11430 -0
- package/dist/node/index.cjs.map +1 -0
- package/dist/node/index.d.cts +554 -0
- package/dist/node/index.d.ts +554 -0
- package/dist/node/index.js +11277 -0
- package/dist/node/index.js.map +1 -0
- package/dist/types-DhUdX5Rc.d.cts +304 -0
- package/dist/types-DhUdX5Rc.d.ts +304 -0
- package/examples/01-basic-dedupe.ts +60 -0
- package/examples/02-match-two-datasets.ts +48 -0
- package/examples/03-csv-file-pipeline.ts +62 -0
- package/examples/04-string-scoring.ts +63 -0
- package/examples/05-custom-config.ts +94 -0
- package/examples/06-probabilistic-fs.ts +72 -0
- package/examples/07-pprl-privacy.ts +76 -0
- package/examples/08-streaming.ts +79 -0
- package/examples/09-llm-scorer.ts +79 -0
- package/examples/10-explain.ts +60 -0
- package/examples/11-evaluate.ts +61 -0
- package/examples/README.md +53 -0
- package/package.json +66 -0
- package/src/cli.ts +372 -0
- package/src/core/ann-blocker.ts +593 -0
- package/src/core/api.ts +220 -0
- package/src/core/autoconfig.ts +363 -0
- package/src/core/autofix.ts +102 -0
- package/src/core/blocker.ts +655 -0
- package/src/core/cluster.ts +699 -0
- package/src/core/compare-clusters.ts +176 -0
- package/src/core/config/loader.ts +869 -0
- package/src/core/cross-encoder.ts +614 -0
- package/src/core/data.ts +430 -0
- package/src/core/domain.ts +277 -0
- package/src/core/embedder.ts +562 -0
- package/src/core/evaluate.ts +156 -0
- package/src/core/explain.ts +352 -0
- package/src/core/golden.ts +524 -0
- package/src/core/graph-er.ts +371 -0
- package/src/core/index.ts +314 -0
- package/src/core/ingest.ts +112 -0
- package/src/core/learned-blocking.ts +305 -0
- package/src/core/lineage.ts +221 -0
- package/src/core/llm/budget.ts +258 -0
- package/src/core/llm/cluster.ts +542 -0
- package/src/core/llm/scorer.ts +396 -0
- package/src/core/match-one.ts +95 -0
- package/src/core/matchkey.ts +97 -0
- package/src/core/memory/corrections.ts +179 -0
- package/src/core/memory/learner.ts +218 -0
- package/src/core/memory/store.ts +114 -0
- package/src/core/pipeline.ts +366 -0
- package/src/core/pprl/protocol.ts +216 -0
- package/src/core/probabilistic.ts +511 -0
- package/src/core/profiler.ts +212 -0
- package/src/core/quality.ts +197 -0
- package/src/core/review-queue.ts +177 -0
- package/src/core/scorer.ts +855 -0
- package/src/core/sensitivity.ts +196 -0
- package/src/core/standardize.ts +279 -0
- package/src/core/streaming.ts +128 -0
- package/src/core/transforms.ts +599 -0
- package/src/core/types.ts +570 -0
- package/src/core/validate.ts +243 -0
- package/src/index.ts +8 -0
- package/src/node/a2a/server.ts +470 -0
- package/src/node/api/server.ts +412 -0
- package/src/node/backends/duckdb.ts +130 -0
- package/src/node/backends/score-worker.ts +41 -0
- package/src/node/backends/workers.ts +212 -0
- package/src/node/config-file.ts +66 -0
- package/src/node/connectors/base.ts +57 -0
- package/src/node/connectors/bigquery.ts +61 -0
- package/src/node/connectors/databricks.ts +69 -0
- package/src/node/connectors/file.ts +350 -0
- package/src/node/connectors/hubspot.ts +62 -0
- package/src/node/connectors/index.ts +43 -0
- package/src/node/connectors/salesforce.ts +93 -0
- package/src/node/connectors/snowflake.ts +73 -0
- package/src/node/db/postgres.ts +173 -0
- package/src/node/db/sync.ts +103 -0
- package/src/node/dedupe-file.ts +156 -0
- package/src/node/index.ts +89 -0
- package/src/node/mcp/server.ts +940 -0
- package/src/node/tui/app.ts +756 -0
- package/src/node/tui/index.ts +6 -0
- package/src/node/tui/widgets.ts +128 -0
- package/tests/parity/scorer-ground-truth.test.ts +118 -0
- package/tests/smoke.test.ts +46 -0
- package/tests/unit/a2a-server.test.ts +175 -0
- package/tests/unit/ann-blocker.test.ts +117 -0
- package/tests/unit/api-server.test.ts +239 -0
- package/tests/unit/api.test.ts +77 -0
- package/tests/unit/autoconfig.test.ts +103 -0
- package/tests/unit/autofix.test.ts +71 -0
- package/tests/unit/blocker.test.ts +164 -0
- package/tests/unit/buildBlocksAsync.test.ts +63 -0
- package/tests/unit/cluster.test.ts +213 -0
- package/tests/unit/compare-clusters.test.ts +42 -0
- package/tests/unit/config-loader.test.ts +301 -0
- package/tests/unit/connectors-base.test.ts +48 -0
- package/tests/unit/cross-encoder-model.test.ts +198 -0
- package/tests/unit/cross-encoder.test.ts +173 -0
- package/tests/unit/db-connectors.test.ts +37 -0
- package/tests/unit/domain.test.ts +80 -0
- package/tests/unit/embedder.test.ts +151 -0
- package/tests/unit/evaluate.test.ts +85 -0
- package/tests/unit/explain.test.ts +73 -0
- package/tests/unit/golden.test.ts +97 -0
- package/tests/unit/graph-er.test.ts +173 -0
- package/tests/unit/hnsw-ann.test.ts +283 -0
- package/tests/unit/hubspot-connector.test.ts +118 -0
- package/tests/unit/ingest.test.ts +97 -0
- package/tests/unit/learned-blocking.test.ts +134 -0
- package/tests/unit/lineage.test.ts +135 -0
- package/tests/unit/match-one.test.ts +129 -0
- package/tests/unit/matchkey.test.ts +97 -0
- package/tests/unit/mcp-server.test.ts +183 -0
- package/tests/unit/memory.test.ts +119 -0
- package/tests/unit/pipeline.test.ts +118 -0
- package/tests/unit/pprl-protocol.test.ts +381 -0
- package/tests/unit/probabilistic.test.ts +494 -0
- package/tests/unit/profiler.test.ts +68 -0
- package/tests/unit/review-queue.test.ts +68 -0
- package/tests/unit/salesforce-connector.test.ts +148 -0
- package/tests/unit/scorer.test.ts +301 -0
- package/tests/unit/sensitivity.test.ts +154 -0
- package/tests/unit/standardize.test.ts +84 -0
- package/tests/unit/streaming.test.ts +82 -0
- package/tests/unit/transforms.test.ts +208 -0
- package/tests/unit/tui-widgets.test.ts +42 -0
- package/tests/unit/tui.test.ts +24 -0
- package/tests/unit/validate.test.ts +145 -0
- package/tests/unit/workers-parallel.test.ts +99 -0
- package/tests/unit/workers.test.ts +74 -0
- package/tsconfig.json +25 -0
- package/tsup.config.ts +37 -0
- package/vitest.config.ts +11 -0
|
@@ -0,0 +1,198 @@
|
|
|
1
|
+
import { describe, it, expect, vi, beforeEach, afterEach } from "vitest";
|
|
2
|
+
import {
|
|
3
|
+
CrossEncoderModel,
|
|
4
|
+
rerankPair,
|
|
5
|
+
rerankTopPairs,
|
|
6
|
+
_resetCrossEncoderModelCache,
|
|
7
|
+
} from "../../src/core/index.js";
|
|
8
|
+
import type { MatchkeyConfig, Row, ScoredPair } from "../../src/core/index.js";
|
|
9
|
+
|
|
10
|
+
describe("CrossEncoderModel", () => {
  // Both tests expect the rejection message to name the optional package.
  const missingPackage = /@huggingface\/transformers/;

  beforeEach(() => {
    _resetCrossEncoderModelCache();
    // Silence the expected "cross-encoder failed" console.warn noise.
    vi.spyOn(console, "warn").mockImplementation(() => undefined);
  });

  afterEach(() => {
    vi.restoreAllMocks();
    vi.unstubAllGlobals();
  });

  it("throws a clear error when @huggingface/transformers is not installed", async () => {
    const encoder = new CrossEncoderModel();
    await expect(encoder.score("a", "b")).rejects.toThrow(missingPackage);
  });

  it("the same error is raised on subsequent calls (cache does not lock into a rejected promise)", async () => {
    const encoder = new CrossEncoderModel();
    await expect(encoder.score("a", "b")).rejects.toThrow(missingPackage);
    // Second call should still reject with a fresh load attempt.
    await expect(encoder.score("a", "b")).rejects.toThrow(missingPackage);
  });
});
|
|
39
|
+
|
|
40
|
+
describe("rerankPair with reranker option", () => {
  /** Runs `run` with both provider API keys removed from the environment, then restores them. */
  const withoutProviderKeys = async (run: () => Promise<void>): Promise<void> => {
    const env = (globalThis as { process?: { env?: Record<string, string | undefined> } })
      .process?.env;
    const openaiKey = env?.OPENAI_API_KEY;
    const anthropicKey = env?.ANTHROPIC_API_KEY;
    if (env) {
      delete env.OPENAI_API_KEY;
      delete env.ANTHROPIC_API_KEY;
    }
    try {
      await run();
    } finally {
      if (env) {
        if (openaiKey !== undefined) env.OPENAI_API_KEY = openaiKey;
        if (anthropicKey !== undefined) env.ANTHROPIC_API_KEY = anthropicKey;
      }
    }
  };

  /** Builds a successful chat-completion style fetch response carrying `score`. */
  const chatReply = (score: number) =>
    ({
      ok: true,
      status: 200,
      text: async () => "",
      json: async () => ({
        choices: [{ message: { content: JSON.stringify({ score }) } }],
        usage: { prompt_tokens: 10, completion_tokens: 4 },
      }),
    }) as Response;

  beforeEach(() => {
    _resetCrossEncoderModelCache();
    vi.spyOn(console, "warn").mockImplementation(() => undefined);
  });

  afterEach(() => {
    vi.restoreAllMocks();
    vi.unstubAllGlobals();
  });

  it("reranker='cross-encoder' falls back to LLM path when package missing", async () => {
    // No apiKey, no env var, no transformers package -> fallback returns NaN.
    await withoutProviderKeys(async () => {
      const result = await rerankPair(
        { name: "John", email: "john@x.com" },
        { name: "Jon", email: "jon@x.com" },
        ["name", "email"],
        { reranker: "cross-encoder" },
      );
      // Fallback path returns NaN without apiKey.
      expect(Number.isNaN(result)).toBe(true);
    });
  });

  it("reranker='cross-encoder' with apiKey falls back to LLM and produces a number", async () => {
    const fetchStub = vi.fn();
    vi.stubGlobal("fetch", fetchStub);
    fetchStub.mockResolvedValueOnce(chatReply(0.77));

    const result = await rerankPair(
      { name: "John" },
      { name: "Jon" },
      ["name"],
      { reranker: "cross-encoder", apiKey: "sk-test", maxRetries: 0 },
    );

    expect(typeof result).toBe("number");
    // Fallback exercised the LLM path -> fetch was called.
    expect(fetchStub).toHaveBeenCalledTimes(1);
    expect(result).toBeCloseTo(0.77, 6);
  });

  it("reranker='llm' (default) uses the LLM path", async () => {
    const fetchStub = vi.fn();
    vi.stubGlobal("fetch", fetchStub);
    fetchStub.mockResolvedValueOnce(chatReply(0.91));

    const result = await rerankPair(
      { name: "John" },
      { name: "Jon" },
      ["name"],
      { reranker: "llm", apiKey: "sk-test", maxRetries: 0 },
    );

    expect(typeof result).toBe("number");
    expect(result).toBeCloseTo(0.91, 6);
    expect(fetchStub).toHaveBeenCalledTimes(1);
  });
});
|
|
126
|
+
|
|
127
|
+
describe("rerankTopPairs with reranker='cross-encoder' fallback", () => {
  // Single weighted matchkey on `name`; 0.85 is the keep/drop threshold used below.
  const matchkey: MatchkeyConfig = {
    name: "name_match",
    type: "weighted",
    threshold: 0.85,
    fields: [{ field: "name", transforms: [], scorer: "jaro_winkler", weight: 1 }],
  };

  const records: Row[] = [
    { __row_id__: 0, name: "Alice" },
    { __row_id__: 1, name: "Alyce" },
  ];

  /** One candidate pair (rows 0 and 1) with an initial score of 0.9. */
  const singlePair = (): ScoredPair[] => [{ idA: 0, idB: 1, score: 0.9 }];

  beforeEach(() => {
    _resetCrossEncoderModelCache();
    vi.spyOn(console, "warn").mockImplementation(() => undefined);
  });

  afterEach(() => {
    vi.restoreAllMocks();
    vi.unstubAllGlobals();
  });

  it("falls back to LLM per-pair when cross-encoder load fails", async () => {
    const fetchStub = vi.fn();
    vi.stubGlobal("fetch", fetchStub);
    fetchStub.mockResolvedValueOnce({
      ok: true,
      status: 200,
      text: async () => "",
      json: async () => ({
        choices: [{ message: { content: JSON.stringify({ score: 1.0 }) } }],
        usage: { prompt_tokens: 10, completion_tokens: 4 },
      }),
    } as Response);

    const reranked = await rerankTopPairs(singlePair(), records, matchkey, {
      reranker: "cross-encoder",
      apiKey: "sk-test",
      maxRetries: 0,
    });

    expect(reranked.length).toBe(1);
    // (1 - 0.5) * 0.9 + 0.5 * 1.0 = 0.95
    expect(reranked[0]!.score).toBeCloseTo(0.95, 6);
    expect(fetchStub).toHaveBeenCalledTimes(1);
  });

  it("passes pairs through unchanged when cross-encoder load fails and no apiKey", async () => {
    const env = (globalThis as { process?: { env?: Record<string, string | undefined> } })
      .process?.env;
    // Remember both provider keys so the finally block can put them back.
    const openaiKey = env?.OPENAI_API_KEY;
    const anthropicKey = env?.ANTHROPIC_API_KEY;
    if (env) {
      delete env.OPENAI_API_KEY;
      delete env.ANTHROPIC_API_KEY;
    }
    try {
      const reranked = await rerankTopPairs(singlePair(), records, matchkey, {
        reranker: "cross-encoder",
      });
      // No backend available -> original scores survive (>= threshold).
      expect(reranked.length).toBe(1);
      expect(reranked[0]!.score).toBeCloseTo(0.9, 6);
    } finally {
      if (env) {
        if (openaiKey !== undefined) env.OPENAI_API_KEY = openaiKey;
        if (anthropicKey !== undefined) env.ANTHROPIC_API_KEY = anthropicKey;
      }
    }
  });
});
|
|
@@ -0,0 +1,173 @@
|
|
|
1
|
+
import {
|
|
2
|
+
describe,
|
|
3
|
+
it,
|
|
4
|
+
expect,
|
|
5
|
+
vi,
|
|
6
|
+
beforeEach,
|
|
7
|
+
afterEach,
|
|
8
|
+
} from "vitest";
|
|
9
|
+
import { rerankPair, rerankTopPairs } from "../../src/core/index.js";
|
|
10
|
+
import type {
|
|
11
|
+
MatchkeyConfig,
|
|
12
|
+
Row,
|
|
13
|
+
ScoredPair,
|
|
14
|
+
} from "../../src/core/index.js";
|
|
15
|
+
|
|
16
|
+
let fetchMock: ReturnType<typeof vi.fn>;
|
|
17
|
+
|
|
18
|
+
/**
 * Builds a minimal successful fetch response shaped like an OpenAI chat
 * completion whose message content is the JSON string `{"score": <score>}`.
 *
 * @param score Value embedded in the serialized message content.
 * @returns An object cast to `Response` exposing `ok`, `status`, `text` and `json`.
 */
function mockOpenAIChat(score: number) {
  const content = JSON.stringify({ score });
  const readText = async () => "";
  // A new payload object is produced on every json() call.
  const readJson = async () => ({
    choices: [{ message: { content } }],
    usage: { prompt_tokens: 50, completion_tokens: 8 },
  });
  return { ok: true, status: 200, text: readText, json: readJson } as Response;
}
|
|
29
|
+
|
|
30
|
+
// Install a brand-new fetch stub before every test and publish it through
// the module-level `fetchMock` binding.
beforeEach(() => {
  const stub = vi.fn();
  fetchMock = stub;
  vi.stubGlobal("fetch", stub);
});

// Undo the global stubbing once each test has finished.
afterEach(() => {
  vi.unstubAllGlobals();
});
|
|
38
|
+
|
|
39
|
+
describe("rerankPair", () => {
  it("returns parsed 0..1 score on success", async () => {
    fetchMock.mockResolvedValueOnce(mockOpenAIChat(0.92));

    const result = await rerankPair(
      { name: "Bob Smith" },
      { name: "Robert Smith" },
      ["name"],
      { apiKey: "sk-test", maxRetries: 0 },
    );

    expect(result).toBeCloseTo(0.92, 6);
  });

  it("returns NaN when no apiKey provided", async () => {
    // Ensure env var not set.
    const env = (globalThis as { process?: { env?: Record<string, string | undefined> } }).process?.env;
    const openaiKey = env?.OPENAI_API_KEY;
    const anthropicKey = env?.ANTHROPIC_API_KEY;
    if (env) {
      delete env.OPENAI_API_KEY;
      delete env.ANTHROPIC_API_KEY;
    }
    try {
      const result = await rerankPair({ a: 1 }, { a: 1 }, ["a"], {});
      expect(Number.isNaN(result)).toBe(true);
    } finally {
      // Put back only the keys that were set before the test started.
      if (env) {
        if (openaiKey !== undefined) env.OPENAI_API_KEY = openaiKey;
        if (anthropicKey !== undefined) env.ANTHROPIC_API_KEY = anthropicKey;
      }
    }
  });

  it("returns NaN on HTTP failure", async () => {
    const unauthorized = {
      ok: false,
      status: 401,
      text: async () => "unauthorized",
      json: async () => ({}),
    } as Response;
    fetchMock.mockResolvedValueOnce(unauthorized);

    const result = await rerankPair({}, {}, [], {
      apiKey: "sk-test",
      maxRetries: 0,
    });

    expect(Number.isNaN(result)).toBe(true);
  });
});
|
|
85
|
+
|
|
86
|
+
describe("rerankTopPairs", () => {
  // Single weighted matchkey on `name`; 0.85 is the keep/drop threshold used below.
  const matchkey: MatchkeyConfig = {
    name: "name_match",
    type: "weighted",
    threshold: 0.85,
    fields: [{ field: "name", transforms: [], scorer: "jaro_winkler", weight: 1 }],
  };

  const records: Row[] = [
    { __row_id__: 0, name: "Alice" },
    { __row_id__: 1, name: "Alyce" },
    { __row_id__: 2, name: "Bob" },
    { __row_id__: 3, name: "Robert" },
  ];

  it("filters pairs to within band [threshold-band, 1.0] and caps at topN", async () => {
    // Band default 0.1 → only pairs >= 0.75 are candidates.
    const candidates: ScoredPair[] = [
      { idA: 0, idB: 1, score: 0.95 },
      { idA: 2, idB: 3, score: 0.80 },
      { idA: 0, idB: 2, score: 0.50 }, // below band - not reranked
    ];
    // Mock 1 LLM response (topN=1)
    fetchMock.mockResolvedValueOnce(mockOpenAIChat(1.0));

    const reranked = await rerankTopPairs(candidates, records, matchkey, {
      apiKey: "sk-test",
      maxRetries: 0,
      topN: 1,
      band: 0.1,
    });

    // Only one fetch (capped at topN=1)
    expect(fetchMock).toHaveBeenCalledTimes(1);
    // Output should drop pairs below threshold
    // - (0,1): combined=0.5*0.95+0.5*1.0=0.975 -> kept
    // - (2,3): not reranked, original 0.80 < threshold 0.85 -> dropped
    // - (0,2): not reranked, 0.50 < 0.85 -> dropped
    const keptIds = reranked.map((pair) => `${pair.idA}-${pair.idB}`);
    expect(keptIds).toContain("0-1");
    expect(keptIds).not.toContain("0-2");
    expect(keptIds).not.toContain("2-3");
  });

  it("returns input unchanged when no apiKey", async () => {
    const env = (globalThis as { process?: { env?: Record<string, string | undefined> } }).process?.env;
    const openaiKey = env?.OPENAI_API_KEY;
    const anthropicKey = env?.ANTHROPIC_API_KEY;
    if (env) {
      delete env.OPENAI_API_KEY;
      delete env.ANTHROPIC_API_KEY;
    }
    try {
      const candidates: ScoredPair[] = [{ idA: 0, idB: 1, score: 0.9 }];
      const reranked = await rerankTopPairs(candidates, records, matchkey, {});
      // Identity check: the very same array instance must come back.
      expect(reranked).toBe(candidates);
    } finally {
      if (env) {
        if (openaiKey !== undefined) env.OPENAI_API_KEY = openaiKey;
        if (anthropicKey !== undefined) env.ANTHROPIC_API_KEY = anthropicKey;
      }
    }
  });

  it("on HTTP failure pair keeps original score", async () => {
    // Borderline pair: 0.86 (just above threshold 0.85). LLM call will fail.
    const candidates: ScoredPair[] = [{ idA: 0, idB: 1, score: 0.86 }];
    const serverError = {
      ok: false,
      status: 500,
      text: async () => "boom",
      json: async () => ({}),
    } as Response;
    fetchMock.mockResolvedValueOnce(serverError);

    const reranked = await rerankTopPairs(candidates, records, matchkey, {
      apiKey: "sk-test",
      maxRetries: 0,
    });

    // Pair kept with original score (0.86 >= threshold 0.85)
    expect(reranked.length).toBe(1);
    expect(reranked[0]!.score).toBeCloseTo(0.86, 6);
  });

  it("empty input -> empty output", async () => {
    const reranked = await rerankTopPairs([], records, matchkey, { apiKey: "sk-test" });
    expect(reranked).toEqual([]);
  });
});
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
import { describe, it, expect } from "vitest";
|
|
2
|
+
import { createPostgresConnector } from "../../src/node/db/postgres.js";
|
|
3
|
+
import { createDuckDBConnector } from "../../src/node/backends/duckdb.js";
|
|
4
|
+
|
|
5
|
+
describe("createPostgresConnector", () => {
  // `pg` is an optional peer dependency, so the test accepts either outcome:
  // a connector with the expected methods when it is installed, or an error
  // whose message names the package and tells the user to install it.
  it("throws clear install message when 'pg' is not installed", () => {
    let connector: ReturnType<typeof createPostgresConnector> | undefined;
    let failure: unknown;
    try {
      connector = createPostgresConnector({ host: "localhost" });
    } catch (err) {
      failure = err;
    }

    // Assertions run outside the try block so that a failing expectation is
    // reported as itself rather than being caught and re-checked as if it
    // were the "missing package" error.
    if (connector !== undefined) {
      // If pg happens to be installed, just validate the shape.
      expect(typeof connector.connect).toBe("function");
      expect(typeof connector.query).toBe("function");
      expect(typeof connector.close).toBe("function");
      return;
    }

    const msg = failure instanceof Error ? failure.message : String(failure);
    expect(msg).toMatch(/pg/);
    expect(msg).toMatch(/install/i);
  });
});
|
|
21
|
+
|
|
22
|
+
describe("createDuckDBConnector", () => {
  // The test accepts either outcome: a connector with the expected methods when
  // '@duckdb/node-api' is installed, or an error whose message names the package
  // and tells the user to install it.
  it("throws clear install message when '@duckdb/node-api' is not installed", async () => {
    let connector: Awaited<ReturnType<typeof createDuckDBConnector>> | undefined;
    let failure: unknown;
    try {
      connector = await createDuckDBConnector();
    } catch (err) {
      failure = err;
    }

    // Assertions run outside the creation try block so that a failing
    // expectation is not caught and misreported as an install-message mismatch.
    if (connector !== undefined) {
      try {
        // If installed, validate connector shape.
        expect(typeof connector.readTable).toBe("function");
        expect(typeof connector.writeTable).toBe("function");
        expect(typeof connector.close).toBe("function");
      } finally {
        // Always release the connector, even when a shape assertion failed;
        // the guard keeps a missing close() from masking that assertion.
        if (typeof connector.close === "function") connector.close();
      }
      return;
    }

    const msg = failure instanceof Error ? failure.message : String(failure);
    expect(msg).toMatch(/@duckdb\/node-api/);
    expect(msg).toMatch(/install/i);
  });
});
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
import { describe, it, expect } from "vitest";
|
|
2
|
+
import { detectDomain, extractFeatures } from "../../src/core/domain.js";
|
|
3
|
+
import type { Row } from "../../src/core/types.js";
|
|
4
|
+
|
|
5
|
+
describe("detectDomain", () => {
  it("classifies electronics-like columns as 'product'", () => {
    const detected = detectDomain(["brand", "model", "sku", "price"]);
    expect(detected.name).toBe("product");
    expect(detected.confidence).toBeGreaterThan(0);
    // Both identifying columns must be reported as feature columns.
    expect(detected.featureColumns).toContain("brand");
    expect(detected.featureColumns).toContain("model");
  });

  it("classifies person columns as 'person'", () => {
    const detected = detectDomain(["first_name", "last_name", "email", "phone"]);
    expect(detected.name).toBe("person");
    expect(detected.confidence).toBeGreaterThan(0);
  });

  it("classifies bibliographic columns as 'bibliographic'", () => {
    const detected = detectDomain(["title", "authors", "year", "venue"]);
    expect(detected.name).toBe("bibliographic");
    expect(detected.confidence).toBeGreaterThan(0);
  });

  it("returns 'generic' when no signatures match", () => {
    const detected = detectDomain(["foo", "bar", "baz"]);
    expect(detected.name).toBe("generic");
    expect(detected.confidence).toBe(0);
    expect(detected.featureColumns).toEqual([]);
  });

  it("reports text columns (description, title, notes, body)", () => {
    const detected = detectDomain(["title", "description", "notes", "brand"]);
    // All three TEXT_NAME_RE hits show up in textColumns regardless of winner.
    const expectedTextColumns = ["title", "description", "notes"];
    for (const column of expectedTextColumns) {
      expect(detected.textColumns).toContain(column);
    }
  });
});
|
|
41
|
+
|
|
42
|
+
describe("extractFeatures", () => {
  it("adds __brand__/__model__/__version__ columns for product rows with signal", () => {
    const input: Row[] = [
      { brand: "Apple", model: "iPhone-12", description: "A product" },
      { brand: "Samsung", model: "SGH-M220", description: "Phone" },
    ];
    const detected = detectDomain(Object.keys(input[0]!));

    const result = extractFeatures(input, detected);
    const enriched = result.rows;

    expect(enriched.length).toBe(2);
    expect(enriched[0]!["__brand__"]).toBe("apple");
    expect(enriched[0]!["__model__"]).toBe("IPHONE12");
    expect(enriched[1]!["__brand__"]).toBe("samsung");
    // Both rows have brand + model => not in lowConfidenceIds.
    expect(result.lowConfidenceIds).not.toContain(0);
    expect(result.lowConfidenceIds).not.toContain(1);
  });

  it("reports low-confidence rows when features can't be extracted", () => {
    const input: Row[] = [
      { brand: null, model: null, description: "foo bar" },
      { brand: null, model: null, description: null },
    ];
    const detected = detectDomain(["brand", "model", "description"]);

    const result = extractFeatures(input, detected, 0.5);

    // Both rows have 0/3 features extracted -> confidence 0 < 0.5.
    expect(result.lowConfidenceIds).toContain(0);
    expect(result.lowConfidenceIds).toContain(1);
  });

  it("generic domain: returns rows unchanged and no low-confidence ids", () => {
    const input: Row[] = [{ foo: "a" }, { foo: "b" }];
    const detected = detectDomain(["foo"]);

    const result = extractFeatures(input, detected);

    expect(result.rows.length).toBe(2);
    expect(result.lowConfidenceIds).toEqual([]);
    // No underscore columns injected.
    expect(Object.keys(result.rows[0]!)).toEqual(["foo"]);
  });
});
|
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
import {
|
|
2
|
+
describe,
|
|
3
|
+
it,
|
|
4
|
+
expect,
|
|
5
|
+
vi,
|
|
6
|
+
beforeEach,
|
|
7
|
+
afterEach,
|
|
8
|
+
} from "vitest";
|
|
9
|
+
import { Embedder, getEmbedder, EmbedderError } from "../../src/core/index.js";
|
|
10
|
+
import { _clearEmbedderCache } from "../../src/core/embedder.js";
|
|
11
|
+
|
|
12
|
+
let fetchMock: ReturnType<typeof vi.fn>;
|
|
13
|
+
|
|
14
|
+
/**
 * Packs the given numbers into a `Float32Array`, in argument order.
 *
 * @param nums Components of the vector; may be empty.
 * @returns A new `Float32Array` with one element per argument.
 */
function vec(...nums: number[]): Float32Array {
  const packed = Float32Array.from(nums);
  return packed;
}
|
|
17
|
+
|
|
18
|
+
/**
 * Builds a minimal successful fetch response shaped like an OpenAI embeddings
 * reply: one `{ embedding }` entry per input vector plus a token-usage record.
 *
 * @param embeddings Vectors to return, in order.
 * @param totalTokens Value reported as `usage.total_tokens` (defaults to 10).
 * @returns An object cast to `Response` exposing `ok`, `status`, `text` and `json`.
 */
function mockOpenAIResponse(embeddings: number[][], totalTokens = 10) {
  const readText = async () => "";
  // The payload is assembled when json() is called, not when the mock is built.
  const readJson = async () => {
    const data = embeddings.map((embedding) => ({ embedding }));
    return { data, usage: { total_tokens: totalTokens } };
  };
  return { ok: true, status: 200, text: readText, json: readJson } as Response;
}
|
|
29
|
+
|
|
30
|
+
// OPENAI_API_KEY as it was before the current test, so afterEach can restore it.
let savedOpenAIKey: string | undefined;

// Fresh fetch stub and empty embedder cache for every test; the API key is
// removed from the environment so no test silently picks it up.
beforeEach(() => {
  fetchMock = vi.fn();
  vi.stubGlobal("fetch", fetchMock);
  _clearEmbedderCache();
  // Avoid accidental env apiKey leakage between tests.
  const env = (globalThis as { process?: { env?: Record<string, string | undefined> } }).process?.env;
  savedOpenAIKey = env?.OPENAI_API_KEY;
  if (env) delete env.OPENAI_API_KEY;
});

// Undo the global stubbing and put the API key back if one was set, so the
// deletion above does not outlive the test.
afterEach(() => {
  vi.unstubAllGlobals();
  const env = (globalThis as { process?: { env?: Record<string, string | undefined> } }).process?.env;
  // Only restore a real value: assigning undefined to process.env stores the string "undefined".
  if (env && savedOpenAIKey !== undefined) env.OPENAI_API_KEY = savedOpenAIKey;
  savedOpenAIKey = undefined;
});
|
|
41
|
+
|
|
42
|
+
describe("Embedder.cosineSimilarity", () => {
  // Each test gets its own instance, built with a placeholder key.
  const makeEmbedder = () => new Embedder({ apiKey: "sk-test" });

  it("known orthogonal vectors -> 0", () => {
    const similarity = makeEmbedder().cosineSimilarity(vec(1, 0, 0), vec(0, 1, 0));
    expect(similarity).toBe(0);
  });

  it("identical vectors -> 1", () => {
    const similarity = makeEmbedder().cosineSimilarity(vec(1, 2, 3), vec(1, 2, 3));
    expect(similarity).toBeCloseTo(1, 6);
  });

  it("zero vector -> 0", () => {
    const similarity = makeEmbedder().cosineSimilarity(vec(0, 0), vec(1, 1));
    expect(similarity).toBe(0);
  });
});
|
|
58
|
+
|
|
59
|
+
describe("Embedder.cosineSimilarityMatrix", () => {
  it("returns NxN matrix with diagonal of 1", () => {
    const embedder = new Embedder({ apiKey: "sk-test" });
    const vectors = [vec(1, 0, 0), vec(0, 1, 0), vec(1, 0, 0)];

    const similarities = embedder.cosineSimilarityMatrix(vectors);

    expect(similarities.length).toBe(3);
    expect(similarities[0]!.length).toBe(3);
    // Every vector is perfectly similar to itself.
    for (const index of [0, 1, 2]) {
      expect(similarities[index]![index]).toBe(1);
    }
    // Symmetric off-diagonals
    expect(similarities[0]![1]).toBeCloseTo(similarities[1]![0]!, 9);
    // 0 and 2 are identical
    expect(similarities[0]![2]).toBeCloseTo(1, 6);
  });
});
|
|
75
|
+
|
|
76
|
+
describe("Embedder.embedBatch — unique-text dedup", () => {
  it("dedupes identical texts before calling API", async () => {
    const apiReply = mockOpenAIResponse([
      [1, 0, 0],
      [0, 1, 0],
    ]);
    fetchMock.mockResolvedValueOnce(apiReply);
    const embedder = new Embedder({ apiKey: "sk-test" });

    const outcome = await embedder.embedBatch(["foo", "bar", "foo", "foo"]);

    // Only one API call
    expect(fetchMock).toHaveBeenCalledTimes(1);
    // Body should contain only unique inputs ["foo","bar"]
    const firstCall = fetchMock.mock.calls[0];
    const requestInit = firstCall![1] as RequestInit;
    const sentBody = JSON.parse(requestInit.body as string);
    expect(sentBody.input).toEqual(["foo", "bar"]);

    // 4 outputs in original order
    const vectors = outcome.embeddings;
    expect(vectors.length).toBe(4);
    // foo positions (0, 2, 3) share the same embedding object identity
    expect(vectors[0]).toBe(vectors[2]);
    expect(vectors[0]).toBe(vectors[3]);
  });
});
|
|
101
|
+
|
|
102
|
+
describe("Embedder.embedOne", () => {
  it("returns single Float32Array", async () => {
    // The stubbed API answers with one two-component embedding.
    fetchMock.mockResolvedValueOnce(mockOpenAIResponse([[3, 4]]));
    const embedder = new Embedder({ apiKey: "sk-test" });

    const embedding = await embedder.embedOne("hello");

    expect(embedding).toBeInstanceOf(Float32Array);
    expect(embedding.length).toBe(2);
  });
});
|
|
111
|
+
|
|
112
|
+
describe("Embedder error handling", () => {
  it("throws EmbedderError when no apiKey and no env var", async () => {
    // Make sure there's truly no apiKey.
    const embedder = new Embedder({ provider: "openai" });
    const attempt = embedder.embedBatch(["hello"]);
    await expect(attempt).rejects.toThrow(EmbedderError);
  });

  it("throws EmbedderError when API returns non-2xx", async () => {
    const unauthorized = {
      ok: false,
      status: 401,
      text: async () => "unauthorized",
      json: async () => ({}),
    } as Response;
    fetchMock.mockResolvedValueOnce(unauthorized);

    // Retries are disabled, so the single stubbed failure is the only response.
    const embedder = new Embedder({ apiKey: "sk-test", maxRetries: 0 });
    const attempt = embedder.embedBatch(["hi"]);
    await expect(attempt).rejects.toThrow(EmbedderError);
  });
});
|
|
130
|
+
|
|
131
|
+
describe("getEmbedder caching", () => {
  // Returns a new options object on every call, so the tests never pass the
  // same object reference twice.
  const optionsFor = (model: string) => ({ apiKey: "sk-test", model });

  it("returns same instance for same provider+model", () => {
    _clearEmbedderCache();
    const first = getEmbedder(optionsFor("text-embedding-3-small"));
    const second = getEmbedder(optionsFor("text-embedding-3-small"));
    expect(first).toBe(second);
  });

  it("returns different instance for different model", () => {
    _clearEmbedderCache();
    const small = getEmbedder(optionsFor("text-embedding-3-small"));
    const large = getEmbedder(optionsFor("text-embedding-3-large"));
    expect(small).not.toBe(large);
  });

  it("accepts string shorthand for model", () => {
    _clearEmbedderCache();
    const fromShorthand = getEmbedder("text-embedding-3-small");
    expect(fromShorthand).toBeInstanceOf(Embedder);
  });
});
|