goldenmatch 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162)
  1. package/README.md +140 -0
  2. package/dist/cli.cjs +6079 -0
  3. package/dist/cli.cjs.map +1 -0
  4. package/dist/cli.d.cts +1 -0
  5. package/dist/cli.d.ts +1 -0
  6. package/dist/cli.js +6076 -0
  7. package/dist/cli.js.map +1 -0
  8. package/dist/core/index.cjs +8449 -0
  9. package/dist/core/index.cjs.map +1 -0
  10. package/dist/core/index.d.cts +1972 -0
  11. package/dist/core/index.d.ts +1972 -0
  12. package/dist/core/index.js +8318 -0
  13. package/dist/core/index.js.map +1 -0
  14. package/dist/index.cjs +8449 -0
  15. package/dist/index.cjs.map +1 -0
  16. package/dist/index.d.cts +2 -0
  17. package/dist/index.d.ts +2 -0
  18. package/dist/index.js +8318 -0
  19. package/dist/index.js.map +1 -0
  20. package/dist/node/backends/score-worker.cjs +934 -0
  21. package/dist/node/backends/score-worker.cjs.map +1 -0
  22. package/dist/node/backends/score-worker.d.cts +14 -0
  23. package/dist/node/backends/score-worker.d.ts +14 -0
  24. package/dist/node/backends/score-worker.js +932 -0
  25. package/dist/node/backends/score-worker.js.map +1 -0
  26. package/dist/node/index.cjs +11430 -0
  27. package/dist/node/index.cjs.map +1 -0
  28. package/dist/node/index.d.cts +554 -0
  29. package/dist/node/index.d.ts +554 -0
  30. package/dist/node/index.js +11277 -0
  31. package/dist/node/index.js.map +1 -0
  32. package/dist/types-DhUdX5Rc.d.cts +304 -0
  33. package/dist/types-DhUdX5Rc.d.ts +304 -0
  34. package/examples/01-basic-dedupe.ts +60 -0
  35. package/examples/02-match-two-datasets.ts +48 -0
  36. package/examples/03-csv-file-pipeline.ts +62 -0
  37. package/examples/04-string-scoring.ts +63 -0
  38. package/examples/05-custom-config.ts +94 -0
  39. package/examples/06-probabilistic-fs.ts +72 -0
  40. package/examples/07-pprl-privacy.ts +76 -0
  41. package/examples/08-streaming.ts +79 -0
  42. package/examples/09-llm-scorer.ts +79 -0
  43. package/examples/10-explain.ts +60 -0
  44. package/examples/11-evaluate.ts +61 -0
  45. package/examples/README.md +53 -0
  46. package/package.json +66 -0
  47. package/src/cli.ts +372 -0
  48. package/src/core/ann-blocker.ts +593 -0
  49. package/src/core/api.ts +220 -0
  50. package/src/core/autoconfig.ts +363 -0
  51. package/src/core/autofix.ts +102 -0
  52. package/src/core/blocker.ts +655 -0
  53. package/src/core/cluster.ts +699 -0
  54. package/src/core/compare-clusters.ts +176 -0
  55. package/src/core/config/loader.ts +869 -0
  56. package/src/core/cross-encoder.ts +614 -0
  57. package/src/core/data.ts +430 -0
  58. package/src/core/domain.ts +277 -0
  59. package/src/core/embedder.ts +562 -0
  60. package/src/core/evaluate.ts +156 -0
  61. package/src/core/explain.ts +352 -0
  62. package/src/core/golden.ts +524 -0
  63. package/src/core/graph-er.ts +371 -0
  64. package/src/core/index.ts +314 -0
  65. package/src/core/ingest.ts +112 -0
  66. package/src/core/learned-blocking.ts +305 -0
  67. package/src/core/lineage.ts +221 -0
  68. package/src/core/llm/budget.ts +258 -0
  69. package/src/core/llm/cluster.ts +542 -0
  70. package/src/core/llm/scorer.ts +396 -0
  71. package/src/core/match-one.ts +95 -0
  72. package/src/core/matchkey.ts +97 -0
  73. package/src/core/memory/corrections.ts +179 -0
  74. package/src/core/memory/learner.ts +218 -0
  75. package/src/core/memory/store.ts +114 -0
  76. package/src/core/pipeline.ts +366 -0
  77. package/src/core/pprl/protocol.ts +216 -0
  78. package/src/core/probabilistic.ts +511 -0
  79. package/src/core/profiler.ts +212 -0
  80. package/src/core/quality.ts +197 -0
  81. package/src/core/review-queue.ts +177 -0
  82. package/src/core/scorer.ts +855 -0
  83. package/src/core/sensitivity.ts +196 -0
  84. package/src/core/standardize.ts +279 -0
  85. package/src/core/streaming.ts +128 -0
  86. package/src/core/transforms.ts +599 -0
  87. package/src/core/types.ts +570 -0
  88. package/src/core/validate.ts +243 -0
  89. package/src/index.ts +8 -0
  90. package/src/node/a2a/server.ts +470 -0
  91. package/src/node/api/server.ts +412 -0
  92. package/src/node/backends/duckdb.ts +130 -0
  93. package/src/node/backends/score-worker.ts +41 -0
  94. package/src/node/backends/workers.ts +212 -0
  95. package/src/node/config-file.ts +66 -0
  96. package/src/node/connectors/base.ts +57 -0
  97. package/src/node/connectors/bigquery.ts +61 -0
  98. package/src/node/connectors/databricks.ts +69 -0
  99. package/src/node/connectors/file.ts +350 -0
  100. package/src/node/connectors/hubspot.ts +62 -0
  101. package/src/node/connectors/index.ts +43 -0
  102. package/src/node/connectors/salesforce.ts +93 -0
  103. package/src/node/connectors/snowflake.ts +73 -0
  104. package/src/node/db/postgres.ts +173 -0
  105. package/src/node/db/sync.ts +103 -0
  106. package/src/node/dedupe-file.ts +156 -0
  107. package/src/node/index.ts +89 -0
  108. package/src/node/mcp/server.ts +940 -0
  109. package/src/node/tui/app.ts +756 -0
  110. package/src/node/tui/index.ts +6 -0
  111. package/src/node/tui/widgets.ts +128 -0
  112. package/tests/parity/scorer-ground-truth.test.ts +118 -0
  113. package/tests/smoke.test.ts +46 -0
  114. package/tests/unit/a2a-server.test.ts +175 -0
  115. package/tests/unit/ann-blocker.test.ts +117 -0
  116. package/tests/unit/api-server.test.ts +239 -0
  117. package/tests/unit/api.test.ts +77 -0
  118. package/tests/unit/autoconfig.test.ts +103 -0
  119. package/tests/unit/autofix.test.ts +71 -0
  120. package/tests/unit/blocker.test.ts +164 -0
  121. package/tests/unit/buildBlocksAsync.test.ts +63 -0
  122. package/tests/unit/cluster.test.ts +213 -0
  123. package/tests/unit/compare-clusters.test.ts +42 -0
  124. package/tests/unit/config-loader.test.ts +301 -0
  125. package/tests/unit/connectors-base.test.ts +48 -0
  126. package/tests/unit/cross-encoder-model.test.ts +198 -0
  127. package/tests/unit/cross-encoder.test.ts +173 -0
  128. package/tests/unit/db-connectors.test.ts +37 -0
  129. package/tests/unit/domain.test.ts +80 -0
  130. package/tests/unit/embedder.test.ts +151 -0
  131. package/tests/unit/evaluate.test.ts +85 -0
  132. package/tests/unit/explain.test.ts +73 -0
  133. package/tests/unit/golden.test.ts +97 -0
  134. package/tests/unit/graph-er.test.ts +173 -0
  135. package/tests/unit/hnsw-ann.test.ts +283 -0
  136. package/tests/unit/hubspot-connector.test.ts +118 -0
  137. package/tests/unit/ingest.test.ts +97 -0
  138. package/tests/unit/learned-blocking.test.ts +134 -0
  139. package/tests/unit/lineage.test.ts +135 -0
  140. package/tests/unit/match-one.test.ts +129 -0
  141. package/tests/unit/matchkey.test.ts +97 -0
  142. package/tests/unit/mcp-server.test.ts +183 -0
  143. package/tests/unit/memory.test.ts +119 -0
  144. package/tests/unit/pipeline.test.ts +118 -0
  145. package/tests/unit/pprl-protocol.test.ts +381 -0
  146. package/tests/unit/probabilistic.test.ts +494 -0
  147. package/tests/unit/profiler.test.ts +68 -0
  148. package/tests/unit/review-queue.test.ts +68 -0
  149. package/tests/unit/salesforce-connector.test.ts +148 -0
  150. package/tests/unit/scorer.test.ts +301 -0
  151. package/tests/unit/sensitivity.test.ts +154 -0
  152. package/tests/unit/standardize.test.ts +84 -0
  153. package/tests/unit/streaming.test.ts +82 -0
  154. package/tests/unit/transforms.test.ts +208 -0
  155. package/tests/unit/tui-widgets.test.ts +42 -0
  156. package/tests/unit/tui.test.ts +24 -0
  157. package/tests/unit/validate.test.ts +145 -0
  158. package/tests/unit/workers-parallel.test.ts +99 -0
  159. package/tests/unit/workers.test.ts +74 -0
  160. package/tsconfig.json +25 -0
  161. package/tsup.config.ts +37 -0
  162. package/vitest.config.ts +11 -0
@@ -0,0 +1,239 @@
1
+ import { describe, it, expect, beforeAll, afterAll } from "vitest";
2
+ import type { Server } from "node:http";
3
+ import { startApiServer } from "../../src/node/api/server.js";
4
+
5
+ let server: Server;
6
+ let baseUrl: string;
7
+
8
+ beforeAll(async () => {
9
+ server = startApiServer({ port: 0, host: "127.0.0.1" });
10
+ // Wait for listen to complete so server.address() reports the port chosen for `port: 0`.
11
+ await new Promise<void>((resolveFn) => {
12
+ if (server.listening) {
13
+ resolveFn();
14
+ return;
15
+ }
16
+ server.once("listening", () => resolveFn());
17
+ });
18
+ const addr = server.address();
19
+ // Fail fast: a guessed fallback port would send every request below to the wrong place.
20
+ if (addr === null || typeof addr === "string") throw new Error("API server is not bound to a TCP port");
21
+ baseUrl = `http://127.0.0.1:${addr.port}`;
22
+ });
23
+
24
+ afterAll(async () => {
25
+ if (server) {
26
+ await new Promise<void>((resolveFn, rejectFn) => {
27
+ server.close((err) => (err ? rejectFn(err) : resolveFn()));
28
+ });
29
+ }
30
+ });
31
+
32
+ describe("REST API server", () => {
33
+ it("GET /health returns 200 with status ok", async () => {
34
+ const res = await fetch(baseUrl + "/health");
35
+ expect(res.status).toBe(200);
36
+ const body = (await res.json()) as { status: string };
37
+ expect(body.status).toBe("ok");
38
+ });
39
+
40
+ it("POST /dedupe returns DedupeResult-like shape", async () => {
41
+ const res = await fetch(baseUrl + "/dedupe", {
42
+ method: "POST",
43
+ headers: { "Content-Type": "application/json" },
44
+ body: JSON.stringify({
45
+ rows: [
46
+ { email: "a@x.com", name: "Alice" },
47
+ { email: "a@x.com", name: "A." },
48
+ { email: "b@x.com", name: "Bob" },
49
+ ],
50
+ exact: ["email"],
51
+ }),
52
+ });
53
+ expect(res.status).toBe(200);
54
+ const body = (await res.json()) as {
55
+ stats: { total_records: number; total_clusters: number };
56
+ golden_records: unknown[];
57
+ dupes: unknown[];
58
+ unique: unknown[];
59
+ };
60
+ expect(body.stats.total_records).toBe(3);
61
+ expect(typeof body.stats.total_clusters).toBe("number");
62
+ expect(Array.isArray(body.golden_records)).toBe(true);
63
+ expect(Array.isArray(body.dupes)).toBe(true);
64
+ expect(Array.isArray(body.unique)).toBe(true);
65
+ });
66
+
67
+ it("POST /match returns matched/unmatched", async () => {
68
+ const res = await fetch(baseUrl + "/match", {
69
+ method: "POST",
70
+ headers: { "Content-Type": "application/json" },
71
+ body: JSON.stringify({
72
+ target: [{ email: "a@x.com", name: "Alice" }],
73
+ reference: [
74
+ { email: "a@x.com", name: "A." },
75
+ { email: "z@x.com", name: "Zack" },
76
+ ],
77
+ exact: ["email"],
78
+ }),
79
+ });
80
+ expect(res.status).toBe(200);
81
+ const body = (await res.json()) as { matched: unknown[]; unmatched: unknown[] };
82
+ expect(Array.isArray(body.matched)).toBe(true);
83
+ expect(Array.isArray(body.unmatched)).toBe(true);
84
+ });
85
+
86
+ it("POST /score returns numeric score", async () => {
87
+ const res = await fetch(baseUrl + "/score", {
88
+ method: "POST",
89
+ headers: { "Content-Type": "application/json" },
90
+ body: JSON.stringify({ a: "John", b: "Jon", scorer: "jaro_winkler" }),
91
+ });
92
+ expect(res.status).toBe(200);
93
+ const body = (await res.json()) as { score: number; scorer: string };
94
+ expect(body.scorer).toBe("jaro_winkler");
95
+ expect(typeof body.score).toBe("number");
96
+ expect(body.score).toBeGreaterThan(0.9);
97
+ });
98
+
99
+ it("POST /explain returns PairExplanation shape", async () => {
100
+ const res = await fetch(baseUrl + "/explain", {
101
+ method: "POST",
102
+ headers: { "Content-Type": "application/json" },
103
+ body: JSON.stringify({
104
+ row_a: { name: "John Smith" },
105
+ row_b: { name: "Jon Smith" },
106
+ fields: [{ field: "name", scorer: "jaro_winkler", weight: 1.0 }],
107
+ }),
108
+ });
109
+ expect(res.status).toBe(200);
110
+ const body = (await res.json()) as {
111
+ score: number;
112
+ confidence: number | string;
113
+ explanation: string;
114
+ field_scores: unknown;
115
+ };
116
+ expect(typeof body.score).toBe("number");
117
+ // confidence may be numeric or categorical ("high"/"medium"/"low"); the cast above allows both.
118
+ expect(["number", "string"]).toContain(typeof body.confidence);
119
+ expect(typeof body.explanation).toBe("string");
120
+ });
121
+
122
+ it("POST /profile returns column profiles", async () => {
123
+ const res = await fetch(baseUrl + "/profile", {
124
+ method: "POST",
125
+ headers: { "Content-Type": "application/json" },
126
+ body: JSON.stringify({
127
+ rows: [
128
+ { email: "a@x.com", age: 20 },
129
+ { email: "b@x.com", age: 30 },
130
+ { email: "c@x.com", age: 40 },
131
+ ],
132
+ }),
133
+ });
134
+ expect(res.status).toBe(200);
135
+ const body = (await res.json()) as {
136
+ row_count: number;
137
+ columns: Array<{ name: string; inferred_type: string }>;
138
+ };
139
+ expect(body.row_count).toBe(3);
140
+ expect(Array.isArray(body.columns)).toBe(true);
141
+ expect(body.columns.length).toBeGreaterThan(0);
142
+ });
143
+
144
+ it("POST /clusters returns clusters object", async () => {
145
+ const res = await fetch(baseUrl + "/clusters", {
146
+ method: "POST",
147
+ headers: { "Content-Type": "application/json" },
148
+ body: JSON.stringify({
149
+ rows: [
150
+ { email: "a@x.com", name: "Alice" },
151
+ { email: "a@x.com", name: "A." },
152
+ { email: "b@x.com", name: "Bob" },
153
+ ],
154
+ exact: ["email"],
155
+ }),
156
+ });
157
+ expect(res.status).toBe(200);
158
+ const body = (await res.json()) as {
159
+ cluster_count: number;
160
+ clusters: Array<{ cluster_id: number; members: number[] }>;
161
+ };
162
+ expect(typeof body.cluster_count).toBe("number");
163
+ expect(Array.isArray(body.clusters)).toBe(true);
164
+ });
165
+
166
+ it("GET /reviews returns { pending: [] }", async () => {
167
+ const res = await fetch(baseUrl + "/reviews");
168
+ expect(res.status).toBe(200);
169
+ const body = (await res.json()) as { pending: unknown[] };
170
+ expect(Array.isArray(body.pending)).toBe(true);
171
+ });
172
+
173
+ it("invalid JSON body returns 400 or 500 with error", async () => {
174
+ const res = await fetch(baseUrl + "/dedupe", {
175
+ method: "POST",
176
+ headers: { "Content-Type": "application/json" },
177
+ body: "{not valid json",
178
+ });
179
+ expect([400, 500]).toContain(res.status); // exact status is not pinned; either is accepted
180
+ const body = (await res.json()) as { error: string };
181
+ expect(typeof body.error).toBe("string");
182
+ });
183
+
184
+ it("unknown route returns 404", async () => {
185
+ const res = await fetch(baseUrl + "/no-such-route");
186
+ expect(res.status).toBe(404);
187
+ const body = (await res.json()) as { error: string };
188
+ expect(typeof body.error).toBe("string");
189
+ });
190
+
191
+ it("POST /reviews/decide with missing id returns error", async () => {
192
+ const res = await fetch(baseUrl + "/reviews/decide", {
193
+ method: "POST",
194
+ headers: { "Content-Type": "application/json" },
195
+ body: JSON.stringify({}),
196
+ });
197
+ // Missing id: the exact status is not pinned (400, 404 and 500 are all accepted); only the error payload shape is checked.
198
+ expect([400, 404, 500]).toContain(res.status);
199
+ const body = (await res.json()) as { error: string };
200
+ expect(typeof body.error).toBe("string");
201
+ });
202
+
203
+ it("POST /reviews/decide with unknown id returns 404", async () => {
204
+ const res = await fetch(baseUrl + "/reviews/decide", {
205
+ method: "POST",
206
+ headers: { "Content-Type": "application/json" },
207
+ body: JSON.stringify({ id: "never-enqueued", accept: true }),
208
+ });
209
+ expect(res.status).toBe(404);
210
+ const body = (await res.json()) as { error: string };
211
+ expect(typeof body.error).toBe("string");
212
+ });
213
+
214
+ it("enqueue + decide round-trip works", async () => {
215
+ const enq = await fetch(baseUrl + "/reviews/enqueue", {
216
+ method: "POST",
217
+ headers: { "Content-Type": "application/json" },
218
+ body: JSON.stringify({
219
+ id_a: 1,
220
+ id_b: 2,
221
+ score: 0.8,
222
+ row_a: { name: "A" },
223
+ row_b: { name: "A." },
224
+ }),
225
+ });
226
+ expect(enq.status).toBe(200);
227
+ const enqBody = (await enq.json()) as { item: { id: string } };
228
+ expect(typeof enqBody.item.id).toBe("string");
229
+
230
+ const dec = await fetch(baseUrl + "/reviews/decide", {
231
+ method: "POST",
232
+ headers: { "Content-Type": "application/json" },
233
+ body: JSON.stringify({ id: enqBody.item.id, accept: true }),
234
+ });
235
+ expect(dec.status).toBe(200);
236
+ const decBody = (await dec.json()) as { decided: { status: string } };
237
+ expect(decBody.decided.status).toBe("accepted");
238
+ });
239
+ });
@@ -0,0 +1,77 @@
1
+ import { describe, it, expect } from "vitest";
2
+ import { dedupe, match, scoreStrings, scorePairRecord } from "../../src/core/index.js";
3
+ import type { MatchkeyField, Row } from "../../src/core/index.js";
4
+
5
+ describe("dedupe() — shorthand API", () => {
6
+ it("with exact catches identical emails", () => {
7
+ const rows: Row[] = [
8
+ { email: "a@x.com", name: "Alice" },
9
+ { email: "a@x.com", name: "A." },
10
+ { email: "b@x.com", name: "Bob" },
11
+ ];
12
+ const result = dedupe(rows, { exact: ["email"] });
13
+ expect(result.stats.totalRecords).toBe(3);
14
+ expect(result.dupes.length).toBeGreaterThanOrEqual(2);
15
+ });
16
+
17
+ it("with fuzzy catches similar names", () => {
18
+ const rows: Row[] = [
19
+ { name: "John Smith", zip: "111" },
20
+ { name: "Jon Smith", zip: "111" },
21
+ { name: "Zeke Xavier", zip: "222" },
22
+ ];
23
+ const result = dedupe(rows, {
24
+ fuzzy: { name: 1.0 },
25
+ blocking: ["zip"],
26
+ threshold: 0.7,
27
+ });
28
+ expect(result.stats.totalRecords).toBe(3);
29
+ expect(result.scoredPairs.length).toBeGreaterThanOrEqual(1);
30
+ });
31
+ });
32
+
33
+ describe("match() — cross-dataset", () => {
34
+ it("finds matches across datasets", () => {
35
+ const target: Row[] = [{ email: "a@x.com" }];
36
+ const reference: Row[] = [{ email: "a@x.com" }, { email: "b@x.com" }];
37
+ const result = match(target, reference, { exact: ["email"] });
38
+ expect(result.matched.length).toBe(1);
39
+ });
40
+ });
41
+
42
+ describe("scoreStrings()", () => {
43
+ it("exact identical", () => {
44
+ expect(scoreStrings("hello", "hello", "exact")).toBe(1.0);
45
+ });
46
+
47
+ it("jaro_winkler default", () => {
48
+ const s = scoreStrings("John", "John");
49
+ expect(s).toBe(1.0);
50
+ });
51
+
52
+ it("returns 0-1 range", () => {
53
+ const s = scoreStrings("foo", "bar");
54
+ expect(s).toBeGreaterThanOrEqual(0);
55
+ expect(s).toBeLessThanOrEqual(1);
56
+ });
57
+
58
+ it("levenshtein", () => {
59
+ expect(scoreStrings("abc", "abc", "levenshtein")).toBe(1.0);
60
+ });
61
+
62
+ it("token_sort reorders", () => {
63
+ expect(scoreStrings("a b", "b a", "token_sort")).toBe(1.0);
64
+ });
65
+ });
66
+
67
+ describe("scorePairRecord()", () => {
68
+ it("scores two row objects across fields", () => {
69
+ const rowA: Row = { name: "John", city: "NYC" };
70
+ const rowB: Row = { name: "John", city: "NYC" };
71
+ const fields: MatchkeyField[] = [
72
+ { field: "name", transforms: [], scorer: "jaro_winkler", weight: 1.0 },
73
+ { field: "city", transforms: [], scorer: "exact", weight: 1.0 },
74
+ ];
75
+ expect(scorePairRecord(rowA, rowB, fields)).toBe(1.0);
76
+ });
77
+ });
@@ -0,0 +1,103 @@
1
+ import { describe, it, expect } from "vitest";
2
+ import { autoConfigureRows } from "../../src/core/autoconfig.js";
3
+ import type { Row } from "../../src/core/types.js";
4
+
5
+ // Small synthetic person dataset.
6
+ function makePeople(n: number): Row[] {
7
+ const first = ["John", "Jane", "Bob", "Alice", "Carol", "David", "Eve", "Frank"];
8
+ const last = ["Smith", "Jones", "Brown", "Miller", "Davis", "Wilson", "Moore", "Taylor"];
9
+ const cities = ["Boston", "Seattle", "Austin", "Denver"];
10
+ const rows: Row[] = [];
11
+ for (let i = 0; i < n; i++) {
12
+ rows.push({
13
+ __row_id__: i,
14
+ first_name: first[i % first.length]!,
15
+ last_name: last[i % last.length]!,
16
+ email: `user${i}@example.com`,
17
+ phone: `555-010-${String(1000 + i).padStart(4, "0")}`,
18
+ zip: String(10000 + (i % 50)),
19
+ city: cities[i % cities.length]!,
20
+ });
21
+ }
22
+ return rows;
23
+ }
24
+
25
+ describe("autoConfigureRows", () => {
26
+ it("picks exact matchkeys on email/phone and produces a weighted matchkey", () => {
27
+ const rows = makePeople(40);
28
+ const cfg = autoConfigureRows(rows);
29
+ const names = (cfg.matchkeys ?? []).map((m) => m.name);
30
+ // Exact matchkeys for identifier columns.
31
+ expect(names).toContain("exact_email");
32
+ // Phone column has phone-shaped values and is near-unique -> exact allowed.
33
+ expect(names).toContain("exact_phone");
34
+ // There should be a weighted matchkey for fuzzy fields.
35
+ expect(names).toContain("weighted_identity");
36
+ });
37
+
38
+ it("zip and geo columns do NOT back exact matchkeys (blocking signal only)", () => {
39
+ const rows = makePeople(40);
40
+ const cfg = autoConfigureRows(rows);
41
+ const names = (cfg.matchkeys ?? []).map((m) => m.name);
42
+ expect(names).not.toContain("exact_zip");
43
+ expect(names).not.toContain("exact_city");
44
+ });
45
+
46
+ it("exact matchkey skipped for columns with cardinality_ratio < 0.01", () => {
47
+ // 200 rows, one constant id-like column with only 1 distinct value.
48
+ const rows: Row[] = [];
49
+ for (let i = 0; i < 200; i++) {
50
+ rows.push({
51
+ __row_id__: i,
52
+ email: `user${i}@example.com`,
53
+ account_id: "ACME-123", // constant -> cardinality ratio 1/200 = 0.005
54
+ });
55
+ }
56
+ const cfg = autoConfigureRows(rows);
57
+ const names = (cfg.matchkeys ?? []).map((m) => m.name);
58
+ expect(names).not.toContain("exact_account_id");
59
+ });
60
+
61
+ it("skips blocking on columns with >20% null rate", () => {
62
+ // zip is mostly null, city has values -> blocking should prefer city
63
+ const rows: Row[] = [];
64
+ for (let i = 0; i < 30; i++) {
65
+ rows.push({
66
+ __row_id__: i,
67
+ email: `user${i}@example.com`,
68
+ first_name: `First${i % 5}`,
69
+ zip: i % 5 === 0 ? String(10000 + i) : null, // 80% null
70
+ city: i % 3 === 0 ? "Boston" : i % 3 === 1 ? "Austin" : "Denver",
71
+ });
72
+ }
73
+ const cfg = autoConfigureRows(rows);
74
+ const keyFields = (cfg.blocking?.keys ?? []).map((k) => k.fields[0]);
75
+ // zip must not be chosen as a blocking key.
76
+ expect(keyFields).not.toContain("zip");
77
+ });
78
+
79
+ it("skips blocking on columns with cardinality_ratio >= 0.95 (near-unique)", () => {
80
+ // email is near-unique; don't block on it.
81
+ const rows: Row[] = [];
82
+ for (let i = 0; i < 30; i++) {
83
+ rows.push({
84
+ __row_id__: i,
85
+ email: `user${i}@example.com`, // fully unique
86
+ last_name: `Smith${i % 3}`, // low-cardinality name-ish column
87
+ });
88
+ }
89
+ const cfg = autoConfigureRows(rows);
90
+ const keyFields = (cfg.blocking?.keys ?? []).map((k) => k.fields[0]);
91
+ expect(keyFields).not.toContain("email");
92
+ });
93
+
94
+ it("email column detected as exact identifier candidate when cardinality is high", () => {
95
+ const rows: Row[] = [];
96
+ for (let i = 0; i < 20; i++) {
97
+ rows.push({ __row_id__: i, email: `user${i}@x.com` });
98
+ }
99
+ const cfg = autoConfigureRows(rows);
100
+ const names = (cfg.matchkeys ?? []).map((m) => m.name);
101
+ expect(names).toContain("exact_email");
102
+ });
103
+ });
@@ -0,0 +1,71 @@
1
+ import { describe, it, expect } from "vitest";
2
+ import { autoFixRows } from "../../src/core/autofix.js";
3
+ import type { Row } from "../../src/core/types.js";
4
+
5
+ describe("autoFixRows", () => {
6
+ it("trims whitespace from string values", () => {
7
+ const rows: Row[] = [{ name: " Alice ", email: " a@x.com" }];
8
+ const { rows: out, log } = autoFixRows(rows);
9
+ expect(out[0]!["name"]).toBe("Alice");
10
+ expect(out[0]!["email"]).toBe("a@x.com");
11
+ const trimLog = log.find((l) => l.fixType === "trim_whitespace");
12
+ expect(trimLog).toBeDefined();
13
+ });
14
+
15
+ it("converts empty strings to null", () => {
16
+ const rows: Row[] = [{ a: "", b: " " }];
17
+ const { rows: out } = autoFixRows(rows);
18
+ expect(out[0]!["a"]).toBeNull();
19
+ expect(out[0]!["b"]).toBeNull();
20
+ });
21
+
22
+ it("converts common null tokens to null (case-insensitive)", () => {
23
+ const rows: Row[] = [
24
+ { a: "N/A", b: "NULL", c: "Unknown", d: "-", e: "n/a" },
25
+ ];
26
+ const { rows: out } = autoFixRows(rows);
27
+ expect(out[0]!["a"]).toBeNull();
28
+ expect(out[0]!["b"]).toBeNull();
29
+ expect(out[0]!["c"]).toBeNull();
30
+ expect(out[0]!["d"]).toBeNull();
31
+ expect(out[0]!["e"]).toBeNull();
32
+ });
33
+
34
+ it("passes non-string values through unchanged", () => {
35
+ const rows: Row[] = [{ n: 42, b: true, x: null }];
36
+ const { rows: out } = autoFixRows(rows);
37
+ expect(out[0]!["n"]).toBe(42);
38
+ expect(out[0]!["b"]).toBe(true);
39
+ expect(out[0]!["x"]).toBeNull();
40
+ });
41
+
42
+ it("leaves internal __ columns untouched", () => {
43
+ const rows: Row[] = [
44
+ { __row_id__: 0, __source__: " sensitive ", name: " X " },
45
+ ];
46
+ const { rows: out } = autoFixRows(rows);
47
+ expect(out[0]!["__row_id__"]).toBe(0);
48
+ // Internal columns preserved as-is (not trimmed, not nulled).
49
+ expect(out[0]!["__source__"]).toBe(" sensitive ");
50
+ expect(out[0]!["name"]).toBe("X");
51
+ });
52
+
53
+ it("returns a log that aggregates affected rows per column/fix-type", () => {
54
+ const rows: Row[] = [
55
+ { a: " x ", b: "N/A" },
56
+ { a: " y ", b: "" },
57
+ { a: "ok", b: "hello" },
58
+ ];
59
+ const { log } = autoFixRows(rows);
60
+ const trimA = log.find(
61
+ (l) => l.column === "a" && l.fixType === "trim_whitespace",
62
+ );
63
+ const nullB = log.find(
64
+ (l) => l.column === "b" && l.fixType === "null_empty_or_token",
65
+ );
66
+ expect(trimA).toBeDefined();
67
+ expect(trimA!.affectedRows).toBe(2);
68
+ expect(nullB).toBeDefined();
69
+ expect(nullB!.affectedRows).toBe(2);
70
+ });
71
+ });
@@ -0,0 +1,164 @@
1
+ import { describe, it, expect } from "vitest";
2
+ import {
3
+ buildStaticBlocks,
4
+ buildMultiPassBlocks,
5
+ buildAdaptiveBlocks,
6
+ buildBlocks,
7
+ makeBlockingConfig,
8
+ } from "../../src/core/index.js";
9
+ import type { Row, BlockingConfig } from "../../src/core/index.js";
10
+
11
+ describe("buildStaticBlocks", () => {
12
+ it("3 rows with same zip -> 1 block of 3", () => {
13
+ const rows: Row[] = [
14
+ { __row_id__: 0, zip: "12345" },
15
+ { __row_id__: 1, zip: "12345" },
16
+ { __row_id__: 2, zip: "12345" },
17
+ ];
18
+ const config: BlockingConfig = makeBlockingConfig({
19
+ strategy: "static",
20
+ keys: [{ fields: ["zip"], transforms: [] }],
21
+ });
22
+ const blocks = buildStaticBlocks(rows, config);
23
+ expect(blocks.length).toBe(1);
24
+ expect(blocks[0]!.rows.length).toBe(3);
25
+ });
26
+
27
+ it("different zips -> no blocks (singletons skipped)", () => {
28
+ const rows: Row[] = [
29
+ { __row_id__: 0, zip: "11111" },
30
+ { __row_id__: 1, zip: "22222" },
31
+ ];
32
+ const config: BlockingConfig = makeBlockingConfig({
33
+ strategy: "static",
34
+ keys: [{ fields: ["zip"], transforms: [] }],
35
+ });
36
+ const blocks = buildStaticBlocks(rows, config);
37
+ expect(blocks.length).toBe(0);
38
+ });
39
+
40
+ it("applies transforms to block key (lowercase)", () => {
41
+ const rows: Row[] = [
42
+ { __row_id__: 0, city: "NYC" },
43
+ { __row_id__: 1, city: "nyc" },
44
+ ];
45
+ const config: BlockingConfig = makeBlockingConfig({
46
+ strategy: "static",
47
+ keys: [{ fields: ["city"], transforms: ["lowercase"] }],
48
+ });
49
+ const blocks = buildStaticBlocks(rows, config);
50
+ expect(blocks.length).toBe(1);
51
+ expect(blocks[0]!.rows.length).toBe(2);
52
+ });
53
+
54
+ it("missing field produces null block key and row is skipped", () => {
55
+ const rows: Row[] = [
56
+ { __row_id__: 0, zip: "12345" },
57
+ { __row_id__: 1, zip: null },
58
+ { __row_id__: 2, zip: "12345" },
59
+ ];
60
+ const config: BlockingConfig = makeBlockingConfig({
61
+ strategy: "static",
62
+ keys: [{ fields: ["zip"], transforms: [] }],
63
+ });
64
+ const blocks = buildStaticBlocks(rows, config);
65
+ expect(blocks.length).toBe(1);
66
+ expect(blocks[0]!.rows.length).toBe(2);
67
+ });
68
+
69
+ it("oversized block with skipOversized=true is dropped", () => {
70
+ const rows: Row[] = Array.from({ length: 10 }, (_, i) => ({
71
+ __row_id__: i,
72
+ zip: "12345",
73
+ }));
74
+ const config: BlockingConfig = makeBlockingConfig({
75
+ strategy: "static",
76
+ keys: [{ fields: ["zip"], transforms: [] }],
77
+ maxBlockSize: 5,
78
+ skipOversized: true,
79
+ });
80
+ const blocks = buildStaticBlocks(rows, config);
81
+ expect(blocks.length).toBe(0);
82
+ });
83
+
84
+ it("oversized block with skipOversized=false is kept", () => {
85
+ const rows: Row[] = Array.from({ length: 10 }, (_, i) => ({
86
+ __row_id__: i,
87
+ zip: "12345",
88
+ }));
89
+ const config: BlockingConfig = makeBlockingConfig({
90
+ strategy: "static",
91
+ keys: [{ fields: ["zip"], transforms: [] }],
92
+ maxBlockSize: 5,
93
+ skipOversized: false,
94
+ });
95
+ const blocks = buildStaticBlocks(rows, config);
96
+ expect(blocks.length).toBe(1);
97
+ expect(blocks[0]!.rows.length).toBe(10);
98
+ });
99
+ });
100
+
101
+ describe("buildMultiPassBlocks", () => {
102
+ it("runs multiple passes with different keys", () => {
103
+ const rows: Row[] = [
104
+ { __row_id__: 0, zip: "111", last: "Smith" },
105
+ { __row_id__: 1, zip: "111", last: "Jones" },
106
+ { __row_id__: 2, zip: "222", last: "Smith" },
107
+ ];
108
+ const config: BlockingConfig = makeBlockingConfig({
109
+ strategy: "multi_pass",
110
+ keys: [{ fields: ["zip"], transforms: [] }],
111
+ passes: [
112
+ { fields: ["zip"], transforms: [] },
113
+ { fields: ["last"], transforms: [] },
114
+ ],
115
+ });
116
+ const blocks = buildMultiPassBlocks(rows, config);
117
+ // Pass 1: zip 111 has 2 rows -> 1 block
118
+ // Pass 2: last Smith has 2 rows -> 1 block
119
+ expect(blocks.length).toBe(2);
120
+ });
121
+ });
122
+
123
+ describe("buildAdaptiveBlocks", () => {
124
+ it("auto-split oversized block", () => {
125
+ const rows: Row[] = [
126
+ { __row_id__: 0, zip: "111", city: "A" },
127
+ { __row_id__: 1, zip: "111", city: "A" },
128
+ { __row_id__: 2, zip: "111", city: "B" },
129
+ { __row_id__: 3, zip: "111", city: "B" },
130
+ ];
131
+ const config: BlockingConfig = makeBlockingConfig({
132
+ strategy: "adaptive",
133
+ keys: [{ fields: ["zip"], transforms: [] }],
134
+ maxBlockSize: 3,
135
+ skipOversized: false,
136
+ });
137
+ const blocks = buildAdaptiveBlocks(rows, config);
138
+ // Should split by city
139
+ expect(blocks.length).toBeGreaterThanOrEqual(2);
140
+ });
141
+ });
142
+
143
+ describe("buildBlocks dispatch", () => {
144
+ it("static strategy routes correctly", () => {
145
+ const rows: Row[] = [
146
+ { __row_id__: 0, zip: "111" },
147
+ { __row_id__: 1, zip: "111" },
148
+ ];
149
+ const config: BlockingConfig = makeBlockingConfig({
150
+ strategy: "static",
151
+ keys: [{ fields: ["zip"], transforms: [] }],
152
+ });
153
+ const blocks = buildBlocks(rows, config);
154
+ expect(blocks.length).toBe(1);
155
+ });
156
+
157
+ it("fewer than 2 rows returns empty", () => {
158
+ const config: BlockingConfig = makeBlockingConfig({
159
+ strategy: "static",
160
+ keys: [{ fields: ["zip"], transforms: [] }],
161
+ });
162
+ expect(buildBlocks([], config)).toEqual([]);
163
+ });
164
+ });