goldenmatch 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162)
  1. package/README.md +140 -0
  2. package/dist/cli.cjs +6079 -0
  3. package/dist/cli.cjs.map +1 -0
  4. package/dist/cli.d.cts +1 -0
  5. package/dist/cli.d.ts +1 -0
  6. package/dist/cli.js +6076 -0
  7. package/dist/cli.js.map +1 -0
  8. package/dist/core/index.cjs +8449 -0
  9. package/dist/core/index.cjs.map +1 -0
  10. package/dist/core/index.d.cts +1972 -0
  11. package/dist/core/index.d.ts +1972 -0
  12. package/dist/core/index.js +8318 -0
  13. package/dist/core/index.js.map +1 -0
  14. package/dist/index.cjs +8449 -0
  15. package/dist/index.cjs.map +1 -0
  16. package/dist/index.d.cts +2 -0
  17. package/dist/index.d.ts +2 -0
  18. package/dist/index.js +8318 -0
  19. package/dist/index.js.map +1 -0
  20. package/dist/node/backends/score-worker.cjs +934 -0
  21. package/dist/node/backends/score-worker.cjs.map +1 -0
  22. package/dist/node/backends/score-worker.d.cts +14 -0
  23. package/dist/node/backends/score-worker.d.ts +14 -0
  24. package/dist/node/backends/score-worker.js +932 -0
  25. package/dist/node/backends/score-worker.js.map +1 -0
  26. package/dist/node/index.cjs +11430 -0
  27. package/dist/node/index.cjs.map +1 -0
  28. package/dist/node/index.d.cts +554 -0
  29. package/dist/node/index.d.ts +554 -0
  30. package/dist/node/index.js +11277 -0
  31. package/dist/node/index.js.map +1 -0
  32. package/dist/types-DhUdX5Rc.d.cts +304 -0
  33. package/dist/types-DhUdX5Rc.d.ts +304 -0
  34. package/examples/01-basic-dedupe.ts +60 -0
  35. package/examples/02-match-two-datasets.ts +48 -0
  36. package/examples/03-csv-file-pipeline.ts +62 -0
  37. package/examples/04-string-scoring.ts +63 -0
  38. package/examples/05-custom-config.ts +94 -0
  39. package/examples/06-probabilistic-fs.ts +72 -0
  40. package/examples/07-pprl-privacy.ts +76 -0
  41. package/examples/08-streaming.ts +79 -0
  42. package/examples/09-llm-scorer.ts +79 -0
  43. package/examples/10-explain.ts +60 -0
  44. package/examples/11-evaluate.ts +61 -0
  45. package/examples/README.md +53 -0
  46. package/package.json +66 -0
  47. package/src/cli.ts +372 -0
  48. package/src/core/ann-blocker.ts +593 -0
  49. package/src/core/api.ts +220 -0
  50. package/src/core/autoconfig.ts +363 -0
  51. package/src/core/autofix.ts +102 -0
  52. package/src/core/blocker.ts +655 -0
  53. package/src/core/cluster.ts +699 -0
  54. package/src/core/compare-clusters.ts +176 -0
  55. package/src/core/config/loader.ts +869 -0
  56. package/src/core/cross-encoder.ts +614 -0
  57. package/src/core/data.ts +430 -0
  58. package/src/core/domain.ts +277 -0
  59. package/src/core/embedder.ts +562 -0
  60. package/src/core/evaluate.ts +156 -0
  61. package/src/core/explain.ts +352 -0
  62. package/src/core/golden.ts +524 -0
  63. package/src/core/graph-er.ts +371 -0
  64. package/src/core/index.ts +314 -0
  65. package/src/core/ingest.ts +112 -0
  66. package/src/core/learned-blocking.ts +305 -0
  67. package/src/core/lineage.ts +221 -0
  68. package/src/core/llm/budget.ts +258 -0
  69. package/src/core/llm/cluster.ts +542 -0
  70. package/src/core/llm/scorer.ts +396 -0
  71. package/src/core/match-one.ts +95 -0
  72. package/src/core/matchkey.ts +97 -0
  73. package/src/core/memory/corrections.ts +179 -0
  74. package/src/core/memory/learner.ts +218 -0
  75. package/src/core/memory/store.ts +114 -0
  76. package/src/core/pipeline.ts +366 -0
  77. package/src/core/pprl/protocol.ts +216 -0
  78. package/src/core/probabilistic.ts +511 -0
  79. package/src/core/profiler.ts +212 -0
  80. package/src/core/quality.ts +197 -0
  81. package/src/core/review-queue.ts +177 -0
  82. package/src/core/scorer.ts +855 -0
  83. package/src/core/sensitivity.ts +196 -0
  84. package/src/core/standardize.ts +279 -0
  85. package/src/core/streaming.ts +128 -0
  86. package/src/core/transforms.ts +599 -0
  87. package/src/core/types.ts +570 -0
  88. package/src/core/validate.ts +243 -0
  89. package/src/index.ts +8 -0
  90. package/src/node/a2a/server.ts +470 -0
  91. package/src/node/api/server.ts +412 -0
  92. package/src/node/backends/duckdb.ts +130 -0
  93. package/src/node/backends/score-worker.ts +41 -0
  94. package/src/node/backends/workers.ts +212 -0
  95. package/src/node/config-file.ts +66 -0
  96. package/src/node/connectors/base.ts +57 -0
  97. package/src/node/connectors/bigquery.ts +61 -0
  98. package/src/node/connectors/databricks.ts +69 -0
  99. package/src/node/connectors/file.ts +350 -0
  100. package/src/node/connectors/hubspot.ts +62 -0
  101. package/src/node/connectors/index.ts +43 -0
  102. package/src/node/connectors/salesforce.ts +93 -0
  103. package/src/node/connectors/snowflake.ts +73 -0
  104. package/src/node/db/postgres.ts +173 -0
  105. package/src/node/db/sync.ts +103 -0
  106. package/src/node/dedupe-file.ts +156 -0
  107. package/src/node/index.ts +89 -0
  108. package/src/node/mcp/server.ts +940 -0
  109. package/src/node/tui/app.ts +756 -0
  110. package/src/node/tui/index.ts +6 -0
  111. package/src/node/tui/widgets.ts +128 -0
  112. package/tests/parity/scorer-ground-truth.test.ts +118 -0
  113. package/tests/smoke.test.ts +46 -0
  114. package/tests/unit/a2a-server.test.ts +175 -0
  115. package/tests/unit/ann-blocker.test.ts +117 -0
  116. package/tests/unit/api-server.test.ts +239 -0
  117. package/tests/unit/api.test.ts +77 -0
  118. package/tests/unit/autoconfig.test.ts +103 -0
  119. package/tests/unit/autofix.test.ts +71 -0
  120. package/tests/unit/blocker.test.ts +164 -0
  121. package/tests/unit/buildBlocksAsync.test.ts +63 -0
  122. package/tests/unit/cluster.test.ts +213 -0
  123. package/tests/unit/compare-clusters.test.ts +42 -0
  124. package/tests/unit/config-loader.test.ts +301 -0
  125. package/tests/unit/connectors-base.test.ts +48 -0
  126. package/tests/unit/cross-encoder-model.test.ts +198 -0
  127. package/tests/unit/cross-encoder.test.ts +173 -0
  128. package/tests/unit/db-connectors.test.ts +37 -0
  129. package/tests/unit/domain.test.ts +80 -0
  130. package/tests/unit/embedder.test.ts +151 -0
  131. package/tests/unit/evaluate.test.ts +85 -0
  132. package/tests/unit/explain.test.ts +73 -0
  133. package/tests/unit/golden.test.ts +97 -0
  134. package/tests/unit/graph-er.test.ts +173 -0
  135. package/tests/unit/hnsw-ann.test.ts +283 -0
  136. package/tests/unit/hubspot-connector.test.ts +118 -0
  137. package/tests/unit/ingest.test.ts +97 -0
  138. package/tests/unit/learned-blocking.test.ts +134 -0
  139. package/tests/unit/lineage.test.ts +135 -0
  140. package/tests/unit/match-one.test.ts +129 -0
  141. package/tests/unit/matchkey.test.ts +97 -0
  142. package/tests/unit/mcp-server.test.ts +183 -0
  143. package/tests/unit/memory.test.ts +119 -0
  144. package/tests/unit/pipeline.test.ts +118 -0
  145. package/tests/unit/pprl-protocol.test.ts +381 -0
  146. package/tests/unit/probabilistic.test.ts +494 -0
  147. package/tests/unit/profiler.test.ts +68 -0
  148. package/tests/unit/review-queue.test.ts +68 -0
  149. package/tests/unit/salesforce-connector.test.ts +148 -0
  150. package/tests/unit/scorer.test.ts +301 -0
  151. package/tests/unit/sensitivity.test.ts +154 -0
  152. package/tests/unit/standardize.test.ts +84 -0
  153. package/tests/unit/streaming.test.ts +82 -0
  154. package/tests/unit/transforms.test.ts +208 -0
  155. package/tests/unit/tui-widgets.test.ts +42 -0
  156. package/tests/unit/tui.test.ts +24 -0
  157. package/tests/unit/validate.test.ts +145 -0
  158. package/tests/unit/workers-parallel.test.ts +99 -0
  159. package/tests/unit/workers.test.ts +74 -0
  160. package/tsconfig.json +25 -0
  161. package/tsup.config.ts +37 -0
  162. package/vitest.config.ts +11 -0
@@ -0,0 +1,494 @@
1
+ import { describe, it, expect } from "vitest";
2
+ import {
3
+ buildComparisonVector,
4
+ trainEM,
5
+ scoreProbabilistic,
6
+ scoreProbabilisticPair,
7
+ } from "../../src/core/probabilistic.js";
8
+ import { makeMatchkeyConfig, makeMatchkeyField } from "../../src/core/index.js";
9
+ import type { Row } from "../../src/core/index.js";
10
+
11
+ // ---------------------------------------------------------------------------
12
+ // Helpers
13
+ // ---------------------------------------------------------------------------
14
+
15
+ let nextId = 0; // Next __row_id__ to hand out; consumed by makePerson and by tests that build rows inline.
16
+ function resetIds() { // Restart the shared id counter so the rows a test builds are numbered from 0.
17
+ nextId = 0;
18
+ }
19
+ function makePerson(first: string, last: string, email: string): Row { // Builds a person Row (first_name, last_name, email) tagged with the next sequential __row_id__.
20
+ return {
21
+ __row_id__: nextId++, // post-increment: this row gets the current counter value, then the counter advances
22
+ first_name: first,
23
+ last_name: last,
24
+ email,
25
+ };
26
+ }
27
+
28
+ // Shared "identity" probabilistic matchkey used across tests: first_name, last_name and email, each lowercased and scored with jaro_winkler.
29
+ function buildMatchkey() {
30
+ return makeMatchkeyConfig({
31
+ name: "identity",
32
+ type: "probabilistic",
33
+ fields: [ // no `levels` is passed for any field, so makeMatchkeyField's default applies (the tests below treat it as 2)
34
+ makeMatchkeyField({
35
+ field: "first_name",
36
+ scorer: "jaro_winkler",
37
+ transforms: ["lowercase"],
38
+ }),
39
+ makeMatchkeyField({
40
+ field: "last_name",
41
+ scorer: "jaro_winkler",
42
+ transforms: ["lowercase"],
43
+ }),
44
+ makeMatchkeyField({
45
+ field: "email",
46
+ scorer: "jaro_winkler",
47
+ transforms: ["lowercase"],
48
+ }),
49
+ ],
50
+ });
51
+ }
52
+
53
+ // ---------------------------------------------------------------------------
54
+ // buildComparisonVector
55
+ // ---------------------------------------------------------------------------
56
+
57
+ describe("buildComparisonVector", () => {
58
+ it("levels=2: agree/disagree based on partial threshold", () => {
59
+ const rowA: Row = { name: "John Smith" };
60
+ const rowB: Row = { name: "John Smith" };
61
+ const rowC: Row = { name: "Zxqwer" };
62
+ const fields = [
63
+ makeMatchkeyField({
64
+ field: "name",
65
+ scorer: "jaro_winkler",
66
+ levels: 2,
67
+ partialThreshold: 0.7,
68
+ }),
69
+ ];
70
+
71
+ expect(buildComparisonVector(rowA, rowB, fields)).toEqual([1]);
72
+ expect(buildComparisonVector(rowA, rowC, fields)).toEqual([0]);
73
+ });
74
+
75
+ it("levels=2: null inputs treated as disagree", () => {
76
+ const rowA: Row = { name: "John Smith" };
77
+ const rowB: Row = { name: null };
78
+ const fields = [
79
+ makeMatchkeyField({ field: "name", scorer: "jaro_winkler", levels: 2 }),
80
+ ];
81
+ const vec = buildComparisonVector(rowA, rowB, fields);
82
+ expect(vec).toEqual([0]);
83
+ });
84
+
85
+ it("levels=2: missing field keys treated as disagree", () => {
86
+ const rowA: Row = {};
87
+ const rowB: Row = {};
88
+ const fields = [
89
+ makeMatchkeyField({ field: "name", scorer: "jaro_winkler", levels: 2 }),
90
+ ];
91
+ const vec = buildComparisonVector(rowA, rowB, fields);
92
+ // Field is missing on both rows -> scoreField presumably returns null; falls through to 0.
93
+ expect(vec[0]).toBe(0);
94
+ });
95
+
96
+ it("levels=3: distinct levels for exact/partial/disagree", () => {
97
+ const equal = buildComparisonVector(
98
+ { email: "alice@example.com" },
99
+ { email: "alice@example.com" },
100
+ [
101
+ makeMatchkeyField({
102
+ field: "email",
103
+ scorer: "jaro_winkler",
104
+ levels: 3,
105
+ partialThreshold: 0.7,
106
+ }),
107
+ ],
108
+ );
109
+ expect(equal).toEqual([2]); // full agreement, s >= 0.95
110
+
111
+ const partial = buildComparisonVector(
112
+ { name: "Jonathan" },
113
+ { name: "Jonas" }, // JW ~0.86: below 0.95, above 0.7
114
+ [
115
+ makeMatchkeyField({
116
+ field: "name",
117
+ scorer: "jaro_winkler",
118
+ levels: 3,
119
+ partialThreshold: 0.7,
120
+ }),
121
+ ],
122
+ );
123
+ expect(partial).toEqual([1]);
124
+
125
+ const disagree = buildComparisonVector(
126
+ { name: "Alice" },
127
+ { name: "Zxqwer" },
128
+ [
129
+ makeMatchkeyField({
130
+ field: "name",
131
+ scorer: "jaro_winkler",
132
+ levels: 3,
133
+ partialThreshold: 0.7,
134
+ }),
135
+ ],
136
+ );
137
+ expect(disagree).toEqual([0]);
138
+ });
139
+
140
+ it("applies field transforms before scoring", () => {
141
+ const rowA: Row = { name: " ALICE " };
142
+ const rowB: Row = { name: "alice" };
143
+ const fields = [
144
+ makeMatchkeyField({
145
+ field: "name",
146
+ scorer: "exact",
147
+ transforms: ["lowercase", "strip"],
148
+ levels: 2,
149
+ partialThreshold: 0.9,
150
+ }),
151
+ ];
152
+ const vec = buildComparisonVector(rowA, rowB, fields);
153
+ expect(vec).toEqual([1]);
154
+ });
155
+ });
156
+
157
+ // ---------------------------------------------------------------------------
158
+ // trainEM
159
+ // ---------------------------------------------------------------------------
160
+
161
+ describe("trainEM", () => {
162
+ it("learns m/u on constructed near-dup dataset and converges", () => {
163
+ resetIds();
164
+ // 10 near-duplicate pairs (small typos) — 20 rows total that cluster.
165
+ const duplicates: Row[] = [];
166
+ const dupeSpecs: Array<[string, string, string]> = [
167
+ ["John", "Smith", "john@x.com"],
168
+ ["Mary", "Jones", "mary@y.com"],
169
+ ["Alice", "Brown", "alice@z.com"],
170
+ ["Bob", "Miller", "bob@a.com"],
171
+ ["Carol", "Davis", "carol@b.com"],
172
+ ["David", "Wilson", "david@c.com"],
173
+ ["Eve", "Moore", "eve@d.com"],
174
+ ["Frank", "Taylor", "frank@e.com"],
175
+ ["Grace", "Anderson", "grace@f.com"],
176
+ ["Hank", "Thomas", "hank@g.com"],
177
+ ];
178
+ for (const [first, last, email] of dupeSpecs) {
179
+ duplicates.push(makePerson(first, last, email));
180
+ // Simulated typo: transpose the last two characters of the first name (names shorter than 3 chars are left unchanged).
181
+ const firstTypo =
182
+ first.length >= 3
183
+ ? first.slice(0, -2) + first.slice(-1) + first.slice(-2, -1)
184
+ : first;
185
+ duplicates.push(makePerson(firstTypo, last, email));
186
+ }
187
+
188
+ // 60 random non-match rows — each row gets a distinct first/last/email
189
+ // that the JW scorer won't accidentally treat as "agree".
190
+ const randomFirsts = [
191
+ "Zoltan", "Xavier", "Yolanda", "Victor", "Wendy", "Nathan", "Oscar",
192
+ "Penny", "Quincy", "Rita", "Sasha", "Trevor", "Ursula", "Vince",
193
+ "Walter", "Yusuf", "Zara", "Ahmed", "Beatrice", "Clive", "Dimitri",
194
+ "Esperanza", "Farid", "Gabriela", "Horacio", "Ingrid", "Jorge",
195
+ "Katarina", "Lorenzo", "Mikhail", "Natalia", "Octavio", "Priya",
196
+ "Qasim", "Rosalind", "Sergio", "Tatiana", "Ulrich", "Valentina",
197
+ "Wilhelm", "Xiomara", "Yaroslav", "Zephyr", "Adelaide", "Bartholomew",
198
+ "Cressida", "Demetrius", "Evangeline", "Fionnuala", "Gregorius",
199
+ "Hephzibah", "Isambard", "Jocasta", "Kenelm", "Ludmilla", "Mordecai",
200
+ "Nicephorus", "Ophelia", "Peregrine", "Quirinus",
201
+ ];
202
+ const randomLasts = [
203
+ "Zygmunt", "Xiong", "Petrov", "Kowalski", "Nakamura", "Oduya",
204
+ "Fernandez", "Kaplan", "Lowenstein", "Meyer", "Papadopoulos",
205
+ "Obradovic", "Rasmussen", "Silva", "Tanaka", "Ueno", "Vasquez",
206
+ "Wojcik", "Yamamoto", "Zalewski", "Ababukh", "Beaumaris",
207
+ "Caravaggio", "Drobny", "Eisenhower", "Filippov", "Gauthier",
208
+ "Hashimoto", "Ignatiev", "Jankovic", "Khatri", "Lindqvist",
209
+ "Magnusson", "Novotny", "Ostrowski", "Pemberton", "Quesnel",
210
+ "Rostropovich", "Schmidt", "Tartaglia", "Ulyanov", "Vermeer",
211
+ "Wroblewski", "Xanthopoulos", "Yankovic", "Zielinski",
212
+ "Abercrombie", "Blumberg", "Carnelian", "Dumitrescu", "Esterhazy",
213
+ "Finkelstein", "Grzybowski", "Hohenzollern", "Iglesias", "Jimenez",
214
+ "Kaczmarek", "Lindstrom", "Mazzanti", "Niedermeyer", "Oppenheimer",
215
+ ];
216
+ const randoms: Row[] = [];
217
+ for (let i = 0; i < 60; i++) {
218
+ const first = randomFirsts[i]!;
219
+ const last = randomLasts[i]!;
220
+ randoms.push(
221
+ makePerson(first, last, `uniq${i}_${(i * 97) % 1000}@zzz${i}.io`),
222
+ );
223
+ }
224
+
225
+ const allRows = [...duplicates, ...randoms];
226
+ const mk = buildMatchkey();
227
+
228
+ const result = trainEM(allRows, mk, { seed: 123, maxIterations: 100 });
229
+
230
+ // Basic shape checks.
231
+ expect(result.m).toHaveProperty("first_name");
232
+ expect(result.m).toHaveProperty("last_name");
233
+ expect(result.m).toHaveProperty("email");
234
+ expect(result.u).toHaveProperty("first_name");
235
+ expect(result.matchWeights).toHaveProperty("email");
236
+
237
+ // proportionMatched should be a sensible probability.
238
+ expect(result.proportionMatched).toBeGreaterThan(0);
239
+ expect(result.proportionMatched).toBeLessThan(1);
240
+
241
+ // With the default levels=2, each field's m/u/matchWeights array is indexed [disagree, agree].
242
+ // For cleanly-differentiated name fields, u[agree] should be small
243
+ // (random pairs rarely agree) and m[agree] should be large
244
+ // (true-match pairs do agree), yielding positive match weight at level=1.
245
+ for (const f of ["first_name", "last_name"]) {
246
+ const uAgree = result.u[f]![1]!;
247
+ const mAgree = result.m[f]![1]!;
248
+ expect(uAgree).toBeLessThan(0.2);
249
+ expect(mAgree).toBeGreaterThan(0.6);
250
+ // match weight at "agree" level must be positive (log2(m/u) > 0).
251
+ expect(result.matchWeights[f]![1]!).toBeGreaterThan(0);
252
+ }
253
+
254
+ // EM should finish within the iteration cap.
255
+ expect(result.iterations).toBeLessThanOrEqual(100);
256
+ expect(result.iterations).toBeGreaterThan(0);
257
+ // With this well-separated dataset EM converges before hitting the cap.
258
+ expect(result.converged).toBe(true);
259
+ });
260
+
261
+ it("blocking-field invariant: near-constant field has m approximately u (zero discriminative weight)", () => {
262
+ resetIds();
263
+ // Dataset where `country` is constant across all rows — a blocking field candidate.
264
+ const rows: Row[] = [];
265
+ for (let i = 0; i < 40; i++) {
266
+ rows.push({
267
+ __row_id__: nextId++,
268
+ first_name: `Name${i % 5}`,
269
+ country: "US",
270
+ });
271
+ }
272
+
273
+ const mk = makeMatchkeyConfig({
274
+ name: "identity",
275
+ type: "probabilistic",
276
+ fields: [
277
+ makeMatchkeyField({ field: "first_name", scorer: "jaro_winkler" }),
278
+ makeMatchkeyField({ field: "country", scorer: "exact" }),
279
+ ],
280
+ });
281
+
282
+ const result = trainEM(rows, mk, {
283
+ seed: 7,
284
+ blockingFields: ["country"],
285
+ maxIterations: 20,
286
+ });
287
+
288
+ // Blocking field gets fixed neutral u (0.5, 0.5 for 2-level).
289
+ expect(result.u.country).toEqual([0.5, 0.5]);
290
+
291
+ // The blocking field is expected to keep its prior m (exponential, normalized): [1/3, 2/3].
292
+ // Regardless of its m and u values, match weights for blocking fields are the fixed linear
293
+ // interpolation from -3 to +3, by construction.
294
+ const cw = result.matchWeights.country!;
295
+ expect(cw.length).toBe(2);
296
+ expect(cw[0]!).toBeCloseTo(-3.0, 5);
297
+ expect(cw[1]!).toBeCloseTo(3.0, 5);
298
+ });
299
+
300
+ it("iterations cap respected on degenerate (all identical rows) data", () => {
301
+ resetIds();
302
+ const rows: Row[] = [];
303
+ for (let i = 0; i < 30; i++) {
304
+ rows.push({ __row_id__: nextId++, first_name: "John", last_name: "Smith" });
305
+ }
306
+ const mk = makeMatchkeyConfig({
307
+ name: "identity",
308
+ type: "probabilistic",
309
+ fields: [
310
+ makeMatchkeyField({ field: "first_name", scorer: "jaro_winkler" }),
311
+ makeMatchkeyField({ field: "last_name", scorer: "jaro_winkler" }),
312
+ ],
313
+ });
314
+ const result = trainEM(rows, mk, { seed: 1, maxIterations: 5 });
315
+ // Must not exceed cap; must terminate cleanly.
316
+ expect(result.iterations).toBeLessThanOrEqual(5);
317
+ expect(Number.isFinite(result.proportionMatched)).toBe(true);
318
+ });
319
+
320
+ it("deterministic: same seed + same input => identical EMResult", () => {
321
+ resetIds();
322
+ const rows: Row[] = [];
323
+ for (let i = 0; i < 25; i++) {
324
+ rows.push(makePerson(`First${i % 7}`, `Last${i % 5}`, `user${i % 9}@x.com`));
325
+ }
326
+ const mk = buildMatchkey();
327
+
328
+ const r1 = trainEM(rows, mk, { seed: 999, maxIterations: 20 });
329
+ const r2 = trainEM(rows, mk, { seed: 999, maxIterations: 20 });
330
+
331
+ expect(r1.iterations).toBe(r2.iterations);
332
+ expect(r1.converged).toBe(r2.converged);
333
+ expect(r1.proportionMatched).toBeCloseTo(r2.proportionMatched, 12);
334
+ for (const f of Object.keys(r1.m)) {
335
+ for (let k = 0; k < r1.m[f]!.length; k++) {
336
+ expect(r1.m[f]![k]!).toBeCloseTo(r2.m[f]![k]!, 12);
337
+ expect(r1.u[f]![k]!).toBeCloseTo(r2.u[f]![k]!, 12);
338
+ expect(r1.matchWeights[f]![k]!).toBeCloseTo(r2.matchWeights[f]![k]!, 12);
339
+ }
340
+ }
341
+ });
342
+
343
+ it("fallback result on tiny dataset (< 10 sampled pairs) still returns a valid EMResult", () => {
344
+ resetIds();
345
+ // Only 3 rows -> 3 possible pairs, below the fallback threshold of 10.
346
+ const rows: Row[] = [
347
+ makePerson("A", "B", "a@b.com"),
348
+ makePerson("C", "D", "c@d.com"),
349
+ makePerson("E", "F", "e@f.com"),
350
+ ];
351
+ const mk = buildMatchkey();
352
+ const result = trainEM(rows, mk, { seed: 1 });
353
+ // Fallback path: iterations=0, converged=false, but shape is valid.
354
+ expect(result.iterations).toBe(0);
355
+ expect(result.converged).toBe(false);
356
+ for (const f of ["first_name", "last_name", "email"]) {
357
+ expect(result.m[f]!.length).toBe(2);
358
+ expect(result.u[f]!.length).toBe(2);
359
+ expect(result.matchWeights[f]!.length).toBe(2);
360
+ }
361
+ });
362
+ });
363
+
364
+ // ---------------------------------------------------------------------------
365
+ // scoreProbabilistic
366
+ // ---------------------------------------------------------------------------
367
+
368
+ describe("scoreProbabilistic", () => {
369
+ it("identical rows score near 1.0; very different rows drop out or score near 0", () => {
370
+ resetIds();
371
+ // Build a reasonable EM model on a small mixed dataset.
372
+ const trainRows: Row[] = [];
373
+ for (let i = 0; i < 10; i++) {
374
+ trainRows.push(makePerson(`First${i}`, `Last${i}`, `user${i}@x.com`));
375
+ trainRows.push(makePerson(`First${i}`, `Last${i}`, `user${i}@x.com`));
376
+ }
377
+ for (let i = 0; i < 10; i++) {
378
+ trainRows.push(
379
+ makePerson(`Zzz${i}`, `Qqq${i}`, `other${i}@y.com`),
380
+ );
381
+ }
382
+ const mk = buildMatchkey();
383
+ const em = trainEM(trainRows, mk, { seed: 5, maxIterations: 20 });
384
+
385
+ // Now score a block containing one clearly-matching pair and one clearly-different pair.
386
+ resetIds();
387
+ const block: Row[] = [
388
+ makePerson("Alice", "Smith", "alice@example.com"),
389
+ makePerson("Alice", "Smith", "alice@example.com"),
390
+ makePerson("Zoltan", "Qwerty", "zzz@nope.com"),
391
+ ];
392
+ const scored = scoreProbabilistic(block, mk, em, { threshold: 0.0 });
393
+
394
+ // All 3 pairs evaluated (threshold=0).
395
+ expect(scored.length).toBe(3);
396
+
397
+ const byKey = new Map<string, number>();
398
+ for (const s of scored) byKey.set(`${s.idA}:${s.idB}`, s.score);
399
+
400
+ // Identical rows (ids 0,1) -> near 1.0
401
+ const identical = byKey.get("0:1")!;
402
+ expect(identical).toBeGreaterThan(0.9);
403
+
404
+ // Completely different pair (0,2) -> near 0.0 (low match weight sum).
405
+ const different = byKey.get("0:2")!;
406
+ expect(different).toBeLessThan(identical);
407
+ expect(different).toBeLessThan(0.3);
408
+ });
409
+
410
+ it("threshold filters out low-scoring pairs", () => {
411
+ resetIds();
412
+ const trainRows: Row[] = [];
413
+ for (let i = 0; i < 10; i++) {
414
+ trainRows.push(makePerson(`First${i}`, `Last${i}`, `user${i}@x.com`));
415
+ trainRows.push(makePerson(`First${i}`, `Last${i}`, `user${i}@x.com`));
416
+ }
417
+ for (let i = 0; i < 10; i++) {
418
+ trainRows.push(makePerson(`Zzz${i}`, `Qqq${i}`, `other${i}@y.com`));
419
+ }
420
+ const mk = buildMatchkey();
421
+ const em = trainEM(trainRows, mk, { seed: 5, maxIterations: 20 });
422
+
423
+ resetIds();
424
+ const block: Row[] = [
425
+ makePerson("Alice", "Smith", "alice@example.com"),
426
+ makePerson("Alice", "Smith", "alice@example.com"),
427
+ makePerson("Zoltan", "Qwerty", "zzz@nope.com"),
428
+ ];
429
+ const highT = scoreProbabilistic(block, mk, em, { threshold: 0.9 });
430
+ // Only the identical pair survives a high threshold.
431
+ expect(highT.length).toBe(1);
432
+ expect(highT[0]!.idA).toBe(0);
433
+ expect(highT[0]!.idB).toBe(1);
434
+ });
435
+
436
+ it("excludePairs skips already-matched pairs", () => {
437
+ resetIds();
438
+ const trainRows: Row[] = [];
439
+ for (let i = 0; i < 10; i++) {
440
+ trainRows.push(makePerson(`First${i}`, `Last${i}`, `user${i}@x.com`));
441
+ trainRows.push(makePerson(`First${i}`, `Last${i}`, `user${i}@x.com`));
442
+ }
443
+ for (let i = 0; i < 10; i++) {
444
+ trainRows.push(makePerson(`Zzz${i}`, `Qqq${i}`, `other${i}@y.com`));
445
+ }
446
+ const mk = buildMatchkey();
447
+ const em = trainEM(trainRows, mk, { seed: 5, maxIterations: 20 });
448
+
449
+ resetIds();
450
+ const block: Row[] = [
451
+ makePerson("Alice", "Smith", "alice@example.com"),
452
+ makePerson("Alice", "Smith", "alice@example.com"),
453
+ ];
454
+ const excluded = new Set<string>(["0:1"]);
455
+ const scored = scoreProbabilistic(block, mk, em, {
456
+ excludePairs: excluded,
457
+ threshold: 0.0,
458
+ });
459
+ expect(scored.length).toBe(0);
460
+ });
461
+ });
462
+
463
+ // ---------------------------------------------------------------------------
464
+ // scoreProbabilisticPair
465
+ // ---------------------------------------------------------------------------
466
+
467
+ describe("scoreProbabilisticPair", () => {
468
+ it("returns a [0,1] score for a trained EMResult", () => {
469
+ resetIds();
470
+ const trainRows: Row[] = [];
471
+ for (let i = 0; i < 10; i++) {
472
+ trainRows.push(makePerson(`First${i}`, `Last${i}`, `u${i}@x.com`));
473
+ trainRows.push(makePerson(`First${i}`, `Last${i}`, `u${i}@x.com`));
474
+ }
475
+ for (let i = 0; i < 10; i++) {
476
+ trainRows.push(makePerson(`Zzz${i}`, `Qqq${i}`, `o${i}@y.com`));
477
+ }
478
+ const mk = buildMatchkey();
479
+ const em = trainEM(trainRows, mk, { seed: 11, maxIterations: 20 });
480
+
481
+ const a: Row = { first_name: "Alice", last_name: "Smith", email: "a@x.com" };
482
+ const b: Row = { first_name: "Alice", last_name: "Smith", email: "a@x.com" };
483
+ const c: Row = { first_name: "Zoltan", last_name: "Qqq", email: "z@z.com" };
484
+
485
+ const hi = scoreProbabilisticPair(a, b, mk, em);
486
+ const lo = scoreProbabilisticPair(a, c, mk, em);
487
+
488
+ expect(hi).toBeGreaterThanOrEqual(0);
489
+ expect(hi).toBeLessThanOrEqual(1);
490
+ expect(lo).toBeGreaterThanOrEqual(0);
491
+ expect(lo).toBeLessThanOrEqual(1);
492
+ expect(hi).toBeGreaterThan(lo);
493
+ });
494
+ });
@@ -0,0 +1,68 @@
1
+ import { describe, it, expect } from "vitest";
2
+ import { profileRows } from "../../src/core/profiler.js";
3
+ import type { Row } from "../../src/core/types.js";
4
+
5
+ describe("profileRows", () => {
6
+ it("empty input returns rowCount=0 and empty columns", () => {
7
+ const p = profileRows([]);
8
+ expect(p.rowCount).toBe(0);
9
+ expect(p.columns).toEqual([]);
10
+ expect(p.byName).toEqual({});
11
+ });
12
+
13
+ it("infers 'email' for a column of email-shaped values", () => {
14
+ const rows: Row[] = [];
15
+ for (let i = 0; i < 10; i++) {
16
+ rows.push({ email: `user${i}@example.com`, name: `Name${i}` });
17
+ }
18
+ const p = profileRows(rows);
19
+ expect(p.byName["email"]!.inferredType).toBe("email");
20
+ });
21
+
22
+ it("computes accurate null counts and rates", () => {
23
+ const rows: Row[] = [
24
+ { a: "x" },
25
+ { a: null },
26
+ { a: "" }, // treated as null
27
+ { a: "y" },
28
+ ];
29
+ const p = profileRows(rows);
30
+ const a = p.byName["a"]!;
31
+ expect(a.totalCount).toBe(4);
32
+ expect(a.nullCount).toBe(2);
33
+ expect(a.nullRate).toBeCloseTo(0.5, 5);
34
+ });
35
+
36
+ it("cardinality ratio reflects distinct-per-total for mostly-unique vs repeating", () => {
37
+ // Unique column
38
+ const unique: Row[] = [];
39
+ for (let i = 0; i < 10; i++) unique.push({ id: `v${i}` });
40
+ const pu = profileRows(unique);
41
+ expect(pu.byName["id"]!.cardinalityRatio).toBeCloseTo(1.0, 5);
42
+
43
+ // Repeating column (2 distinct values)
44
+ const repeating: Row[] = [];
45
+ for (let i = 0; i < 10; i++) {
46
+ repeating.push({ status: i % 2 === 0 ? "active" : "inactive" });
47
+ }
48
+ const pr = profileRows(repeating);
49
+ expect(pr.byName["status"]!.cardinalityRatio).toBeCloseTo(0.2, 5);
50
+ expect(pr.byName["status"]!.distinctCount).toBe(2);
51
+ });
52
+
53
+ it("computes accurate avgLength and maxLength for string columns", () => {
54
+ const rows: Row[] = [{ s: "a" }, { s: "abc" }, { s: "ab" }];
55
+ const p = profileRows(rows);
56
+ const s = p.byName["s"]!;
57
+ // avg = (1 + 3 + 2) / 3 = 2.0, max = 3
58
+ expect(s.avgLength).toBeCloseTo(2.0, 5);
59
+ expect(s.maxLength).toBe(3);
60
+ });
61
+
62
+ it("ignores internal __ columns", () => {
63
+ const rows: Row[] = [{ __row_id__: 0, name: "Alice" }];
64
+ const p = profileRows(rows);
65
+ expect(p.byName["__row_id__"]).toBeUndefined();
66
+ expect(p.byName["name"]).toBeDefined();
67
+ });
68
+ });
@@ -0,0 +1,68 @@
1
+ import { describe, it, expect } from "vitest";
2
+ import { gatePairs, ReviewQueue } from "../../src/core/index.js";
3
+ import type { ScoredPair } from "../../src/core/index.js";
4
+
5
+ describe("gatePairs", () => {
6
+ it("default thresholds split pairs into 3 buckets", () => {
7
+ const pairs: ScoredPair[] = [
8
+ { idA: 1, idB: 2, score: 0.99 }, // approve
9
+ { idA: 3, idB: 4, score: 0.80 }, // review
10
+ { idA: 5, idB: 6, score: 0.50 }, // reject
11
+ ];
12
+ const result = gatePairs(pairs);
13
+ expect(result.autoApproved.length).toBe(1);
14
+ expect(result.needsReview.length).toBe(1);
15
+ expect(result.rejected.length).toBe(1);
16
+ });
17
+
18
+ it("custom thresholds", () => {
19
+ const pairs: ScoredPair[] = [{ idA: 1, idB: 2, score: 0.85 }];
20
+ const result = gatePairs(pairs, { approveAbove: 0.8, rejectBelow: 0.5 });
21
+ expect(result.autoApproved.length).toBe(1);
22
+ });
23
+
24
+ it("review items have canonical pair ids", () => {
25
+ const pairs: ScoredPair[] = [{ idA: 5, idB: 3, score: 0.8 }];
26
+ const result = gatePairs(pairs);
27
+ expect(result.needsReview[0]!.pairId).toBe("3:5");
28
+ expect(result.needsReview[0]!.idA).toBe(3);
29
+ expect(result.needsReview[0]!.idB).toBe(5);
30
+ });
31
+ });
32
+
33
+ describe("ReviewQueue lifecycle", () => {
34
+ it("add -> approve", () => {
35
+ const q = new ReviewQueue();
36
+ q.add({ idA: 1, idB: 2, score: 0.8 });
37
+ expect(q.size()).toBe(1);
38
+ expect(q.pending().length).toBe(1);
39
+ q.approve("1:2");
40
+ expect(q.approved().length).toBe(1);
41
+ expect(q.pending().length).toBe(0);
42
+ });
43
+
44
+ it("add -> reject", () => {
45
+ const q = new ReviewQueue();
46
+ q.add({ idA: 1, idB: 2, score: 0.8 });
47
+ q.reject("1:2");
48
+ expect(q.rejected().length).toBe(1);
49
+ });
50
+
51
+ it("canonicalizes pair id on add (2,1 same as 1,2)", () => {
52
+ const q = new ReviewQueue();
53
+ q.add({ idA: 2, idB: 1, score: 0.8 });
54
+ q.add({ idA: 1, idB: 2, score: 0.9 }); // same pair -> idempotent
55
+ expect(q.size()).toBe(1);
56
+ });
57
+
58
+ it("approve on unknown pair is a no-op", () => {
59
+ const q = new ReviewQueue();
60
+ q.approve("99:100"); // no throw
61
+ expect(q.approved().length).toBe(0);
62
+ });
63
+
64
+ it("static pairIdFor helper", () => {
65
+ expect(ReviewQueue.pairIdFor(5, 3)).toBe("3:5");
66
+ expect(ReviewQueue.pairIdFor(1, 2)).toBe("1:2");
67
+ });
68
+ });