goldenmatch 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +140 -0
- package/dist/cli.cjs +6079 -0
- package/dist/cli.cjs.map +1 -0
- package/dist/cli.d.cts +1 -0
- package/dist/cli.d.ts +1 -0
- package/dist/cli.js +6076 -0
- package/dist/cli.js.map +1 -0
- package/dist/core/index.cjs +8449 -0
- package/dist/core/index.cjs.map +1 -0
- package/dist/core/index.d.cts +1972 -0
- package/dist/core/index.d.ts +1972 -0
- package/dist/core/index.js +8318 -0
- package/dist/core/index.js.map +1 -0
- package/dist/index.cjs +8449 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +2 -0
- package/dist/index.d.ts +2 -0
- package/dist/index.js +8318 -0
- package/dist/index.js.map +1 -0
- package/dist/node/backends/score-worker.cjs +934 -0
- package/dist/node/backends/score-worker.cjs.map +1 -0
- package/dist/node/backends/score-worker.d.cts +14 -0
- package/dist/node/backends/score-worker.d.ts +14 -0
- package/dist/node/backends/score-worker.js +932 -0
- package/dist/node/backends/score-worker.js.map +1 -0
- package/dist/node/index.cjs +11430 -0
- package/dist/node/index.cjs.map +1 -0
- package/dist/node/index.d.cts +554 -0
- package/dist/node/index.d.ts +554 -0
- package/dist/node/index.js +11277 -0
- package/dist/node/index.js.map +1 -0
- package/dist/types-DhUdX5Rc.d.cts +304 -0
- package/dist/types-DhUdX5Rc.d.ts +304 -0
- package/examples/01-basic-dedupe.ts +60 -0
- package/examples/02-match-two-datasets.ts +48 -0
- package/examples/03-csv-file-pipeline.ts +62 -0
- package/examples/04-string-scoring.ts +63 -0
- package/examples/05-custom-config.ts +94 -0
- package/examples/06-probabilistic-fs.ts +72 -0
- package/examples/07-pprl-privacy.ts +76 -0
- package/examples/08-streaming.ts +79 -0
- package/examples/09-llm-scorer.ts +79 -0
- package/examples/10-explain.ts +60 -0
- package/examples/11-evaluate.ts +61 -0
- package/examples/README.md +53 -0
- package/package.json +66 -0
- package/src/cli.ts +372 -0
- package/src/core/ann-blocker.ts +593 -0
- package/src/core/api.ts +220 -0
- package/src/core/autoconfig.ts +363 -0
- package/src/core/autofix.ts +102 -0
- package/src/core/blocker.ts +655 -0
- package/src/core/cluster.ts +699 -0
- package/src/core/compare-clusters.ts +176 -0
- package/src/core/config/loader.ts +869 -0
- package/src/core/cross-encoder.ts +614 -0
- package/src/core/data.ts +430 -0
- package/src/core/domain.ts +277 -0
- package/src/core/embedder.ts +562 -0
- package/src/core/evaluate.ts +156 -0
- package/src/core/explain.ts +352 -0
- package/src/core/golden.ts +524 -0
- package/src/core/graph-er.ts +371 -0
- package/src/core/index.ts +314 -0
- package/src/core/ingest.ts +112 -0
- package/src/core/learned-blocking.ts +305 -0
- package/src/core/lineage.ts +221 -0
- package/src/core/llm/budget.ts +258 -0
- package/src/core/llm/cluster.ts +542 -0
- package/src/core/llm/scorer.ts +396 -0
- package/src/core/match-one.ts +95 -0
- package/src/core/matchkey.ts +97 -0
- package/src/core/memory/corrections.ts +179 -0
- package/src/core/memory/learner.ts +218 -0
- package/src/core/memory/store.ts +114 -0
- package/src/core/pipeline.ts +366 -0
- package/src/core/pprl/protocol.ts +216 -0
- package/src/core/probabilistic.ts +511 -0
- package/src/core/profiler.ts +212 -0
- package/src/core/quality.ts +197 -0
- package/src/core/review-queue.ts +177 -0
- package/src/core/scorer.ts +855 -0
- package/src/core/sensitivity.ts +196 -0
- package/src/core/standardize.ts +279 -0
- package/src/core/streaming.ts +128 -0
- package/src/core/transforms.ts +599 -0
- package/src/core/types.ts +570 -0
- package/src/core/validate.ts +243 -0
- package/src/index.ts +8 -0
- package/src/node/a2a/server.ts +470 -0
- package/src/node/api/server.ts +412 -0
- package/src/node/backends/duckdb.ts +130 -0
- package/src/node/backends/score-worker.ts +41 -0
- package/src/node/backends/workers.ts +212 -0
- package/src/node/config-file.ts +66 -0
- package/src/node/connectors/base.ts +57 -0
- package/src/node/connectors/bigquery.ts +61 -0
- package/src/node/connectors/databricks.ts +69 -0
- package/src/node/connectors/file.ts +350 -0
- package/src/node/connectors/hubspot.ts +62 -0
- package/src/node/connectors/index.ts +43 -0
- package/src/node/connectors/salesforce.ts +93 -0
- package/src/node/connectors/snowflake.ts +73 -0
- package/src/node/db/postgres.ts +173 -0
- package/src/node/db/sync.ts +103 -0
- package/src/node/dedupe-file.ts +156 -0
- package/src/node/index.ts +89 -0
- package/src/node/mcp/server.ts +940 -0
- package/src/node/tui/app.ts +756 -0
- package/src/node/tui/index.ts +6 -0
- package/src/node/tui/widgets.ts +128 -0
- package/tests/parity/scorer-ground-truth.test.ts +118 -0
- package/tests/smoke.test.ts +46 -0
- package/tests/unit/a2a-server.test.ts +175 -0
- package/tests/unit/ann-blocker.test.ts +117 -0
- package/tests/unit/api-server.test.ts +239 -0
- package/tests/unit/api.test.ts +77 -0
- package/tests/unit/autoconfig.test.ts +103 -0
- package/tests/unit/autofix.test.ts +71 -0
- package/tests/unit/blocker.test.ts +164 -0
- package/tests/unit/buildBlocksAsync.test.ts +63 -0
- package/tests/unit/cluster.test.ts +213 -0
- package/tests/unit/compare-clusters.test.ts +42 -0
- package/tests/unit/config-loader.test.ts +301 -0
- package/tests/unit/connectors-base.test.ts +48 -0
- package/tests/unit/cross-encoder-model.test.ts +198 -0
- package/tests/unit/cross-encoder.test.ts +173 -0
- package/tests/unit/db-connectors.test.ts +37 -0
- package/tests/unit/domain.test.ts +80 -0
- package/tests/unit/embedder.test.ts +151 -0
- package/tests/unit/evaluate.test.ts +85 -0
- package/tests/unit/explain.test.ts +73 -0
- package/tests/unit/golden.test.ts +97 -0
- package/tests/unit/graph-er.test.ts +173 -0
- package/tests/unit/hnsw-ann.test.ts +283 -0
- package/tests/unit/hubspot-connector.test.ts +118 -0
- package/tests/unit/ingest.test.ts +97 -0
- package/tests/unit/learned-blocking.test.ts +134 -0
- package/tests/unit/lineage.test.ts +135 -0
- package/tests/unit/match-one.test.ts +129 -0
- package/tests/unit/matchkey.test.ts +97 -0
- package/tests/unit/mcp-server.test.ts +183 -0
- package/tests/unit/memory.test.ts +119 -0
- package/tests/unit/pipeline.test.ts +118 -0
- package/tests/unit/pprl-protocol.test.ts +381 -0
- package/tests/unit/probabilistic.test.ts +494 -0
- package/tests/unit/profiler.test.ts +68 -0
- package/tests/unit/review-queue.test.ts +68 -0
- package/tests/unit/salesforce-connector.test.ts +148 -0
- package/tests/unit/scorer.test.ts +301 -0
- package/tests/unit/sensitivity.test.ts +154 -0
- package/tests/unit/standardize.test.ts +84 -0
- package/tests/unit/streaming.test.ts +82 -0
- package/tests/unit/transforms.test.ts +208 -0
- package/tests/unit/tui-widgets.test.ts +42 -0
- package/tests/unit/tui.test.ts +24 -0
- package/tests/unit/validate.test.ts +145 -0
- package/tests/unit/workers-parallel.test.ts +99 -0
- package/tests/unit/workers.test.ts +74 -0
- package/tsconfig.json +25 -0
- package/tsup.config.ts +37 -0
- package/vitest.config.ts +11 -0
|
@@ -0,0 +1,283 @@
|
|
|
1
|
+
import { describe, it, expect, vi } from "vitest";
|
|
2
|
+
import {
|
|
3
|
+
ANNBlocker,
|
|
4
|
+
HNSWANNBlocker,
|
|
5
|
+
createANNBlocker,
|
|
6
|
+
type HNSWModule,
|
|
7
|
+
type HNSWIndexLike,
|
|
8
|
+
} from "../../src/core/index.js";
|
|
9
|
+
|
|
10
|
+
// ---------------------------------------------------------------------------
|
|
11
|
+
// Mock hnswlib-node
|
|
12
|
+
// ---------------------------------------------------------------------------
|
|
13
|
+
|
|
14
|
+
/**
 * Test double for the native HNSW index. Nothing is actually indexed: points
 * are appended to a plain array, every configuration call is remembered on a
 * public field, and searches hand back the earliest-inserted labels so the
 * resulting pairs are fully predictable.
 */
class MockIndex implements HNSWIndexLike {
  /** Every point passed to `addPoint`, in insertion order. */
  public readonly points: Array<{ label: number; vec: number[] }> = [];
  /** `[maxElements, M, efConstruction, randomSeed]` from the last `initIndex` call. */
  public initArgs: number[] | null = null;
  /** Value from the last `setEf` call, or `null` if never set. */
  public ef: number | null = null;
  /** Number of `searchKnn` invocations so far. */
  public searchCalls = 0;

  constructor(public metric: string, public dim: number) {}

  /** Records the init arguments, substituting 16 / 200 / 100 for omitted knobs. */
  initIndex(
    maxElements: number,
    M?: number,
    efConstruction?: number,
    randomSeed?: number,
  ): void {
    const resolvedM = M ?? 16;
    const resolvedEfConstruction = efConstruction ?? 200;
    const resolvedSeed = randomSeed ?? 100;
    this.initArgs = [maxElements, resolvedM, resolvedEfConstruction, resolvedSeed];
  }

  /** Remembers the search-time `ef` value. */
  setEf(ef: number): void {
    this.ef = ef;
  }

  /** Stores a plain-array copy of the vector together with its label. */
  addPoint(vector: number[] | Float32Array, labelId: number): void {
    const copy = Array.from(vector);
    this.points.push({ label: labelId, vec: copy });
  }

  /**
   * Ignores the query vector and returns the first `k` stored labels (fewer if
   * the index is smaller) with distances 0, 0.1, 0.2, ...
   */
  searchKnn(
    _query: number[] | Float32Array,
    k: number,
  ): { distances: number[]; neighbors: number[] } {
    this.searchCalls += 1;
    // Clamp and round up so the slice yields exactly as many hits as a
    // counting loop `for (i = 0; i < min(k, size); i++)` would.
    const limit = Math.min(k, this.points.length);
    const hits = this.points.slice(0, Math.max(0, Math.ceil(limit)));
    return {
      neighbors: hits.map((point) => point.label),
      distances: hits.map((_point, rank) => rank * 0.1),
    };
  }
}
|
|
61
|
+
|
|
62
|
+
/**
 * Builds a fake `hnswlib-node` module whose `HierarchicalNSW` constructor is a
 * `MockIndex` subclass. Each index constructed through the module is also
 * pushed onto `instances`, letting a test inspect what the blocker created.
 */
function makeMockModule(): {
  module: HNSWModule;
  instances: MockIndex[];
} {
  const created: MockIndex[] = [];

  // Subclass that registers itself on construction.
  class HierarchicalNSW extends MockIndex {
    constructor(metric: string, dim: number) {
      super(metric, dim);
      created.push(this);
    }
  }

  const module: HNSWModule = {
    HierarchicalNSW: HierarchicalNSW as unknown as HNSWModule["HierarchicalNSW"],
  };
  return { module, instances: created };
}
|
|
77
|
+
|
|
78
|
+
/** Shorthand for building a `Float32Array` from literal components. */
function vec(...nums: number[]): Float32Array {
  return Float32Array.from(nums);
}
|
|
81
|
+
|
|
82
|
+
// ---------------------------------------------------------------------------
|
|
83
|
+
// HNSWANNBlocker — direct unit tests
|
|
84
|
+
// ---------------------------------------------------------------------------
|
|
85
|
+
|
|
86
|
+
describe("HNSWANNBlocker", () => {
  // Fresh vector sets per call so no typed array is shared between
  // buildIndex and query.
  const unit3 = (): Float32Array[] => [vec(1, 0, 0), vec(0, 1, 0), vec(0, 0, 1)];
  const mixed2 = (): Float32Array[] => [vec(1, 0), vec(0, 1), vec(1, 1)];

  it("buildIndex initialises the native index and adds every point", () => {
    const mock = makeMockModule();
    const blocker = new HNSWANNBlocker({
      hnswModule: mock.module,
      topK: 3,
      metric: "cosine",
      M: 8,
      efConstruction: 100,
      efSearch: 42,
    });

    blocker.buildIndex(unit3());

    // Exactly one native index, configured from the blocker options.
    expect(mock.instances).toHaveLength(1);
    const native = mock.instances[0]!;
    expect(native.metric).toBe("cosine");
    expect(native.dim).toBe(3);
    expect(native.initArgs?.[1]).toBe(8); // M
    expect(native.initArgs?.[2]).toBe(100); // efConstruction
    expect(native.ef).toBe(42);

    // Labels are the positional indices of the input vectors.
    const labels = native.points.map((point) => point.label);
    expect(labels).toEqual([0, 1, 2]);
    expect(blocker.indexSize).toBe(3);
  });

  it("buildIndex with empty input clears the index", () => {
    const mock = makeMockModule();
    const blocker = new HNSWANNBlocker({ hnswModule: mock.module, topK: 3 });

    blocker.buildIndex([]);

    expect(blocker.indexSize).toBe(0);
    expect(blocker.query([vec(1, 2, 3)])).toEqual([]);
  });

  it("maps 'euclidean' option to the 'l2' metric string", () => {
    const mock = makeMockModule();
    const blocker = new HNSWANNBlocker({
      hnswModule: mock.module,
      metric: "euclidean",
    });

    blocker.buildIndex([vec(1, 0), vec(0, 1)]);

    expect(mock.instances[0]!.metric).toBe("l2");
  });

  it("query canonicalises pairs and drops self-matches", () => {
    const mock = makeMockModule();
    const blocker = new HNSWANNBlocker({
      hnswModule: mock.module,
      topK: 3,
    });
    blocker.buildIndex(unit3());

    const pairs = blocker.query(unit3());

    // The mock answers every query with labels [0, 1, 2]; once self-matches
    // are removed and (a, b) is ordered, only three distinct pairs remain.
    const rendered = pairs.map(([a, b]) => `${a}-${b}`).sort();
    expect(rendered).toEqual(["0-1", "0-2", "1-2"]);
  });

  it("queryWithScores converts cosine distance to similarity = 1 - d", () => {
    const mock = makeMockModule();
    const blocker = new HNSWANNBlocker({
      hnswModule: mock.module,
      topK: 3,
      metric: "cosine",
    });
    blocker.buildIndex(mixed2());

    const scored = blocker.queryWithScores(mixed2());

    // Mock distances are 0 / 0.1 / 0.2, i.e. similarities 1.0 / 0.9 / 0.8.
    // Each unique pair keeps its best score, so three entries remain.
    expect(scored).toHaveLength(3);
    for (const [, , similarity] of scored) {
      expect(similarity).toBeGreaterThanOrEqual(0.8 - 1e-6);
      expect(similarity).toBeLessThanOrEqual(1.0 + 1e-6);
    }
  });

  it("queryWithScores converts l2 distance to 1 / (1 + d)", () => {
    const mock = makeMockModule();
    const blocker = new HNSWANNBlocker({
      hnswModule: mock.module,
      topK: 2,
      metric: "euclidean",
    });
    blocker.buildIndex([vec(0, 0), vec(1, 1)]);

    const scored = blocker.queryWithScores([vec(0, 0), vec(1, 1)]);

    // Pair (0, 1) is seen at distance 0 from one of the two queries, which
    // maps to the maximum score of 1.0.
    expect(scored).toHaveLength(1);
    const [, , similarity] = scored[0]!;
    expect(similarity).toBeCloseTo(1.0, 6);
  });

  it("queryOne returns (neighbour, score) pairs", () => {
    const mock = makeMockModule();
    const blocker = new HNSWANNBlocker({
      hnswModule: mock.module,
      topK: 2,
      metric: "cosine",
    });
    blocker.buildIndex(mixed2());

    const hits = blocker.queryOne(vec(0.5, 0.5));

    expect(hits).toHaveLength(2);
    // The mock's first neighbour sits at distance 0, so its similarity is 1.0.
    expect(hits[0]![1]).toBeCloseTo(1.0, 6);
  });

  it("addToIndex grows the index and returns sequential ids", () => {
    const mock = makeMockModule();
    const blocker = new HNSWANNBlocker({ hnswModule: mock.module });
    blocker.buildIndex([vec(1, 0), vec(0, 1)]);

    const firstId = blocker.addToIndex(vec(1, 1));
    const secondId = blocker.addToIndex(vec(0.5, 0.5));

    expect(firstId).toBe(2);
    expect(secondId).toBe(3);
    expect(blocker.indexSize).toBe(4);
    expect(mock.instances[0]!.points).toHaveLength(4);
  });

  it("addToIndex before buildIndex throws", () => {
    const mock = makeMockModule();
    const blocker = new HNSWANNBlocker({ hnswModule: mock.module });

    const addWithoutIndex = (): unknown => blocker.addToIndex(vec(1, 2));

    expect(addWithoutIndex).toThrow(/buildIndex/);
  });
});
|
|
206
|
+
|
|
207
|
+
// ---------------------------------------------------------------------------
|
|
208
|
+
// createANNBlocker factory
|
|
209
|
+
// ---------------------------------------------------------------------------
|
|
210
|
+
|
|
211
|
+
describe("createANNBlocker", () => {
  it("returns a brute-force ANNBlocker when useHNSW=false", async () => {
    const result = await createANNBlocker({ useHNSW: false, topK: 5 });
    expect(result).toBeInstanceOf(ANNBlocker);
  });

  it("returns a brute-force ANNBlocker with no options", async () => {
    const result = await createANNBlocker();
    expect(result).toBeInstanceOf(ANNBlocker);
  });

  it("uses the provided hnswModule when useHNSW=true", async () => {
    const mock = makeMockModule();

    const result = await createANNBlocker({
      useHNSW: true,
      hnswModule: mock.module,
      topK: 4,
      metric: "cosine",
    });

    expect(result).toBeInstanceOf(HNSWANNBlocker);
    // Building the index must go through the injected module.
    result.buildIndex([vec(1, 0), vec(0, 1)]);
    expect(mock.instances).toHaveLength(1);
  });

  it("falls back to brute-force when hnswlib-node is not installed", async () => {
    const collected: string[] = [];

    const result = await createANNBlocker({
      useHNSW: true,
      onFallbackWarning: (m) => collected.push(m),
    });

    // This test relies on hnswlib-node being absent from the dev
    // dependencies, which makes the factory's dynamic import reject.
    expect(result).toBeInstanceOf(ANNBlocker);
    expect(collected).toHaveLength(1);
    expect(collected[0]).toMatch(/hnswlib-node/);
  });

  it("forwards tuning knobs to HNSWANNBlocker", async () => {
    const mock = makeMockModule();

    const result = await createANNBlocker({
      useHNSW: true,
      hnswModule: mock.module,
      topK: 7,
      metric: "euclidean",
      M: 24,
      efConstruction: 300,
      efSearch: 77,
      maxElements: 500,
    });

    expect(result).toBeInstanceOf(HNSWANNBlocker);
    result.buildIndex([vec(1, 2), vec(3, 4)]);

    const native = mock.instances[0]!;
    expect(native.metric).toBe("l2");
    expect(native.initArgs?.[0]).toBe(500); // maxElements
    expect(native.initArgs?.[1]).toBe(24); // M
    expect(native.initArgs?.[2]).toBe(300); // efConstruction
    expect(native.ef).toBe(77);
  });

  it("silences the warn sink via onFallbackWarning", async () => {
    // Spy on console.warn so we can prove the custom sink replaces it.
    const consoleWarn = vi.spyOn(console, "warn").mockImplementation(() => {});
    const discard = (): void => {
      // swallow
    };
    try {
      await createANNBlocker({
        useHNSW: true,
        onFallbackWarning: discard,
      });
      expect(consoleWarn).not.toHaveBeenCalled();
    } finally {
      // Always restore, even if the assertion above fails.
      consoleWarn.mockRestore();
    }
  });
});
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
import {
|
|
2
|
+
describe,
|
|
3
|
+
it,
|
|
4
|
+
expect,
|
|
5
|
+
vi,
|
|
6
|
+
beforeEach,
|
|
7
|
+
afterEach,
|
|
8
|
+
} from "vitest";
|
|
9
|
+
import { createHubSpotConnector } from "../../src/node/connectors/hubspot.js";
|
|
10
|
+
|
|
11
|
+
// Stand-in for the global `fetch`; replaced with a fresh mock before each test.
let fetchMock: ReturnType<typeof vi.fn>;

/** Installs a new, call-history-free `fetch` stub on the global object. */
function installFetchStub(): void {
  fetchMock = vi.fn();
  vi.stubGlobal("fetch", fetchMock);
}

/** Removes every global stub so the real `fetch` is visible again. */
function removeGlobalStubs(): void {
  vi.unstubAllGlobals();
}

beforeEach(installFetchStub);

afterEach(removeGlobalStubs);
|
|
21
|
+
|
|
22
|
+
/**
 * Builds a minimal successful (HTTP 200) `Response` look-alike whose `json()`
 * resolves to `body` and whose `text()` resolves to an empty string. Only the
 * four members listed here exist on the returned object.
 */
function jsonResponse(body: unknown): Response {
  const stub = {
    ok: true,
    status: 200,
    text: () => Promise.resolve(""),
    json: () => Promise.resolve(body),
  };
  return stub as Response;
}
|
|
30
|
+
|
|
31
|
+
describe("createHubSpotConnector", () => {
  it("returns connector with proper shape", () => {
    const connector = createHubSpotConnector({ apiKey: "key" });

    expect(connector.name).toBe("hubspot");
    expect(typeof connector.connect).toBe("function");
    expect(typeof connector.read).toBe("function");
    expect(typeof connector.close).toBe("function");
  });

  it("connect() is a no-op (no fetch)", async () => {
    const connector = createHubSpotConnector({ apiKey: "key" });

    await connector.connect();

    expect(fetchMock).not.toHaveBeenCalled();
  });

  it("throws when called with raw SQL string", async () => {
    const connector = createHubSpotConnector({ apiKey: "key" });

    const attempt = connector.read("SELECT * FROM contacts");

    await expect(attempt).rejects.toThrow(/object query/i);
  });

  it("read() returns rows merging id + properties", async () => {
    const page = {
      results: [
        { id: "1", properties: { firstname: "Ada", lastname: "Lovelace" } },
        { id: "2", properties: { firstname: "Alan", lastname: "Turing" } },
      ],
    };
    fetchMock.mockResolvedValueOnce(jsonResponse(page));

    const connector = createHubSpotConnector({ apiKey: "key" });
    const rows = await connector.read({
      table: "contacts",
      columns: ["firstname", "lastname"],
      limit: 50,
    });

    // The record id and its properties are flattened into a single row.
    expect(rows.length).toBe(2);
    expect(rows[0]!.id).toBe("1");
    expect(rows[0]!.firstname).toBe("Ada");
    expect(rows[1]!.lastname).toBe("Turing");

    // The request URL carries the object type, limit and property list.
    const firstCall = fetchMock.mock.calls[0]!;
    const url = firstCall[0] as string;
    expect(url).toContain("/crm/v3/objects/contacts");
    expect(url).toContain("limit=50");
    expect(url).toContain("properties=firstname,lastname");

    // The API key is sent as a bearer token.
    const init = firstCall[1] as RequestInit;
    const headers = init.headers as Record<string, string>;
    expect(headers.Authorization).toBe("Bearer key");
  });

  it("paginates via paging.next.link", async () => {
    const firstPage = {
      results: [{ id: "1", properties: { name: "A" } }],
      paging: { next: { link: "https://api.hubapi.com/crm/v3/objects/contacts?after=abc" } },
    };
    const lastPage = {
      results: [{ id: "2", properties: { name: "B" } }],
    };
    fetchMock
      .mockResolvedValueOnce(jsonResponse(firstPage))
      .mockResolvedValueOnce(jsonResponse(lastPage));

    const connector = createHubSpotConnector({ apiKey: "key" });
    const rows = await connector.read({ table: "contacts" });

    expect(rows.length).toBe(2);
    expect(fetchMock).toHaveBeenCalledTimes(2);
    // The follow-up request must target the link returned by the first page.
    const followUpUrl = fetchMock.mock.calls[1]![0] as string;
    expect(followUpUrl).toContain("after=abc");
  });

  it("throws on non-2xx response", async () => {
    const forbidden = {
      ok: false,
      status: 403,
      text: async () => "forbidden",
      json: async () => ({}),
    } as Response;
    fetchMock.mockResolvedValueOnce(forbidden);

    const connector = createHubSpotConnector({ apiKey: "key" });
    const attempt = connector.read({ table: "contacts" });

    await expect(attempt).rejects.toThrow(/HubSpot query failed/);
  });
});
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
import { describe, it, expect } from "vitest";
|
|
2
|
+
import {
|
|
3
|
+
applyColumnMap,
|
|
4
|
+
validateColumns,
|
|
5
|
+
concatRows,
|
|
6
|
+
tagSource,
|
|
7
|
+
assignRowIds,
|
|
8
|
+
} from "../../src/core/ingest.js";
|
|
9
|
+
import type { Row } from "../../src/core/types.js";
|
|
10
|
+
|
|
11
|
+
describe("applyColumnMap", () => {
  it("renames columns per the map", () => {
    const input: Row[] = [
      { first: "Alice", last: "Brown", email: "a@x.com" },
      { first: "Bob", last: "Smith", email: "b@x.com" },
    ];
    const columnMap = { first: "first_name", last: "last_name" };

    const renamed = applyColumnMap(input, columnMap);
    const head = renamed[0]!;

    // Compare sorted key lists so column order does not matter.
    const expectedKeys = ["email", "first_name", "last_name"].sort();
    expect(Object.keys(head).sort()).toEqual(expectedKeys);
    expect(head["first_name"]).toBe("Alice");
    expect(head["last_name"]).toBe("Brown");
  });

  it("passes unmapped keys through untouched", () => {
    const input: Row[] = [{ a: 1, b: 2 }];

    const renamed = applyColumnMap(input, { a: "aa" });
    const head = renamed[0]!;

    expect(head["aa"]).toBe(1);
    expect(head["b"]).toBe(2);
  });

  it("empty map clones rows unchanged", () => {
    const input: Row[] = [{ a: 1 }];

    const copied = applyColumnMap(input, {});

    expect(copied).toEqual(input);
    // Equal content, but each row must be a new object rather than an alias.
    expect(copied[0]).not.toBe(input[0]);
  });
});
|
|
40
|
+
|
|
41
|
+
describe("validateColumns", () => {
  it("passes when all required columns are present", () => {
    const input: Row[] = [{ a: 1, b: 2, c: 3 }];
    const check = (): void => validateColumns(input, ["a", "b"]);

    expect(check).not.toThrow();
  });

  it("throws with the missing columns listed", () => {
    const input: Row[] = [{ a: 1 }];
    const check = (): void => validateColumns(input, ["a", "b", "c"]);

    // Both absent columns must be named in the error message.
    expect(check).toThrow(/Required columns missing:.*b.*c/);
  });

  it("no-ops on empty input", () => {
    const check = (): void => validateColumns([], ["a"]);

    expect(check).not.toThrow();
  });
});
|
|
58
|
+
|
|
59
|
+
describe("concatRows", () => {
  it("unions schemas and fills missing fields with null", () => {
    const left: Row[] = [{ x: 1, y: 2 }];
    const right: Row[] = [{ y: 3, z: 4 }];

    const merged = concatRows([left, right]);

    expect(merged.length).toBe(2);
    const fromLeft = merged[0]!;
    const fromRight = merged[1]!;

    // Every output row carries the full union of columns.
    expect(Object.keys(fromLeft).sort()).toEqual(["x", "y", "z"]);
    expect(Object.keys(fromRight).sort()).toEqual(["x", "y", "z"]);
    // Columns absent from a source are padded with null ...
    expect(fromLeft["z"]).toBeNull();
    expect(fromRight["x"]).toBeNull();
    // ... while columns that were present keep their values.
    expect(fromLeft["x"]).toBe(1);
    expect(fromRight["z"]).toBe(4);
  });

  it("handles empty arrays gracefully", () => {
    const noSources = concatRows([]);
    expect(noSources).toEqual([]);

    const emptySources = concatRows([[], []]);
    expect(emptySources).toEqual([]);
  });
});
|
|
81
|
+
|
|
82
|
+
describe("tagSource / assignRowIds", () => {
  it("tagSource adds __source__ to each row", () => {
    const input: Row[] = [{ a: 1 }, { a: 2 }];

    const tagged = tagSource(input, "csv_a");

    expect(tagged[0]!["__source__"]).toBe("csv_a");
    expect(tagged[1]!["__source__"]).toBe("csv_a");
  });

  it("assignRowIds fills missing __row_id__ with sequential ids", () => {
    const input: Row[] = [{ a: 1 }, { a: 2, __row_id__: 99 }, { a: 3 }];

    const withIds = assignRowIds(input, 10);

    // Ids count up from 10 by position; the row that already had an id keeps
    // it, and the third row still receives 12.
    expect(withIds[0]!["__row_id__"]).toBe(10);
    expect(withIds[1]!["__row_id__"]).toBe(99);
    expect(withIds[2]!["__row_id__"]).toBe(12);
  });
});
|
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
import { describe, it, expect } from "vitest";
|
|
2
|
+
import {
|
|
3
|
+
learnBlockingRules,
|
|
4
|
+
applyLearnedBlocks,
|
|
5
|
+
} from "../../src/core/learned-blocking.js";
|
|
6
|
+
import type { Row, ScoredPair } from "../../src/core/types.js";
|
|
7
|
+
|
|
8
|
+
/**
 * Builds the fixture shared by the learned-blocking tests.
 *
 * The first ten rows are five known duplicate pairs (ids 0-1, 2-3, ...), each
 * sharing a last name and differing only in first-name spelling, so predicates
 * on `last_name` can discriminate. Ten distractor rows with unique last names
 * follow, giving a predicate some reduction work to do.
 *
 * @returns `rows` with sequential `__row_id__` values starting at 0, and
 *   `pairs` holding one entry (score 1.0) per duplicate pair.
 */
function makeDataset(): { rows: Row[]; pairs: ScoredPair[] } {
  type Person = readonly [first: string, last: string];
  const dupeGroups: ReadonlyArray<readonly [Person, Person]> = [
    [["John", "Smith"], ["Jon", "Smith"]],
    [["Mary", "Jones"], ["Marie", "Jones"]],
    [["Alice", "Brown"], ["Alicia", "Brown"]],
    [["Bob", "Miller"], ["Robert", "Miller"]],
    [["Carol", "Davis"], ["Caroline", "Davis"]],
  ];
  const distractorLasts = [
    "Zygmunt", "Xiong", "Petrov", "Kowalski", "Nakamura",
    "Rasmussen", "Tanaka", "Vasquez", "Wojcik", "Yamamoto",
  ];

  const rows: Row[] = [];
  const pairs: ScoredPair[] = [];
  let nextId = 0;

  for (const [[firstA, lastA], [firstB, lastB]] of dupeGroups) {
    const idA = nextId++;
    const idB = nextId++;
    rows.push({ __row_id__: idA, first_name: firstA, last_name: lastA });
    rows.push({ __row_id__: idB, first_name: firstB, last_name: lastB });
    pairs.push({ idA, idB, score: 1.0 });
  }

  for (const last of distractorLasts) {
    // Take the id before building the row so the generated first name carries
    // the row's own id; incrementing inside the object literal would make the
    // label run one ahead of `__row_id__`.
    const rowId = nextId++;
    rows.push({ __row_id__: rowId, first_name: `First${rowId}`, last_name: last });
  }

  return { rows, pairs };
}
|
|
38
|
+
|
|
39
|
+
describe("learnBlockingRules", () => {
  it("produces predicates for a dataset with known matching pairs", () => {
    const dataset = makeDataset();
    const options = { minRecall: 0.8, minReduction: 0.5, predicateDepth: 3 };

    const rules = learnBlockingRules(
      dataset.rows,
      dataset.pairs,
      ["first_name", "last_name"],
      options,
    );

    // Between one and three predicates are expected here.
    expect(rules.predicates.length).toBeGreaterThan(0);
    expect(rules.predicates.length).toBeLessThanOrEqual(3);
    // No selected predicate may have zero recall.
    rules.predicates.forEach((predicate) => {
      expect(predicate.recall).toBeGreaterThan(0);
    });
    // The thresholds are echoed back, alongside a string timestamp.
    expect(rules.minRecall).toBe(0.8);
    expect(rules.minReduction).toBe(0.5);
    expect(typeof rules.learnedAt).toBe("string");
  });

  it("learned predicates cover at least one of the known pairs (useful recall)", () => {
    const dataset = makeDataset();

    const rules = learnBlockingRules(dataset.rows, dataset.pairs, ["last_name"], {
      minRecall: 0.95,
      minReduction: 0.5,
      predicateDepth: 3,
    });

    // The strongest predicate must recover at least some known pairs.
    const bestRecall = rules.predicates.reduce(
      (best, predicate) => Math.max(best, predicate.recall),
      -Infinity,
    );
    expect(bestRecall).toBeGreaterThan(0);
  });

  it("empty known-pair input returns rules with no predicates (graceful fallback)", () => {
    const dataset = makeDataset();

    const rules = learnBlockingRules(dataset.rows, [], ["first_name", "last_name"]);

    expect(rules.predicates).toEqual([]);
  });

  it("is deterministic given the same inputs", () => {
    const dataset = makeDataset();
    const fields = ["first_name", "last_name"];

    const firstRun = learnBlockingRules(dataset.rows, dataset.pairs, fields);
    const secondRun = learnBlockingRules(dataset.rows, dataset.pairs, fields);

    expect(firstRun.predicates.length).toBe(secondRun.predicates.length);
    // Compare the two runs predicate by predicate, position by position.
    firstRun.predicates.forEach((left, position) => {
      const right = secondRun.predicates[position]!;
      expect(left.type).toBe(right.type);
      expect(left.field).toBe(right.field);
      expect(left.recall).toBeCloseTo(right.recall, 10);
      expect(left.reduction).toBeCloseTo(right.reduction, 10);
    });
  });
});
|
|
90
|
+
|
|
91
|
+
describe("applyLearnedBlocks", () => {
  it("produces BlockResult[] from learned rules with rows >= 2", () => {
    const dataset = makeDataset();
    const rules = learnBlockingRules(dataset.rows, dataset.pairs, ["last_name"]);

    const blocks = applyLearnedBlocks(dataset.rows, rules, 100);

    // Every emitted block has at least two rows and is tagged as learned.
    for (const block of blocks) {
      expect(block.rows.length).toBeGreaterThanOrEqual(2);
      expect(block.strategy).toBe("learned");
      expect(block.blockKey.startsWith("learned:")).toBe(true);
    }

    if (blocks.length === 0) return;

    // Index block keys by row id so membership can be compared per row.
    const keysByRowId = new Map<number, Set<string>>();
    for (const block of blocks) {
      for (const row of block.rows) {
        const rowId = row["__row_id__"] as number;
        const keys = keysByRowId.get(rowId) ?? new Set<string>();
        keys.add(block.blockKey);
        keysByRowId.set(rowId, keys);
      }
    }

    // Sanity check on the first duplicate pair (ids 0 and 1) only: when both
    // rows were blocked at all, they must have at least one key in common.
    const keysOfFirst = keysByRowId.get(0);
    const keysOfSecond = keysByRowId.get(1);
    if (keysOfFirst && keysOfSecond) {
      const shared = [...keysOfFirst].some((key) => keysOfSecond.has(key));
      expect(shared).toBe(true);
    }
  });

  it("returns empty array when rules contain no predicates", () => {
    const dataset = makeDataset();
    const emptyRules = { predicates: [], minRecall: 0.9, minReduction: 0.9, learnedAt: "" };

    const blocks = applyLearnedBlocks(dataset.rows, emptyRules, 100);

    expect(blocks).toEqual([]);
  });
});
|