goldenmatch 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162):
  1. package/README.md +140 -0
  2. package/dist/cli.cjs +6079 -0
  3. package/dist/cli.cjs.map +1 -0
  4. package/dist/cli.d.cts +1 -0
  5. package/dist/cli.d.ts +1 -0
  6. package/dist/cli.js +6076 -0
  7. package/dist/cli.js.map +1 -0
  8. package/dist/core/index.cjs +8449 -0
  9. package/dist/core/index.cjs.map +1 -0
  10. package/dist/core/index.d.cts +1972 -0
  11. package/dist/core/index.d.ts +1972 -0
  12. package/dist/core/index.js +8318 -0
  13. package/dist/core/index.js.map +1 -0
  14. package/dist/index.cjs +8449 -0
  15. package/dist/index.cjs.map +1 -0
  16. package/dist/index.d.cts +2 -0
  17. package/dist/index.d.ts +2 -0
  18. package/dist/index.js +8318 -0
  19. package/dist/index.js.map +1 -0
  20. package/dist/node/backends/score-worker.cjs +934 -0
  21. package/dist/node/backends/score-worker.cjs.map +1 -0
  22. package/dist/node/backends/score-worker.d.cts +14 -0
  23. package/dist/node/backends/score-worker.d.ts +14 -0
  24. package/dist/node/backends/score-worker.js +932 -0
  25. package/dist/node/backends/score-worker.js.map +1 -0
  26. package/dist/node/index.cjs +11430 -0
  27. package/dist/node/index.cjs.map +1 -0
  28. package/dist/node/index.d.cts +554 -0
  29. package/dist/node/index.d.ts +554 -0
  30. package/dist/node/index.js +11277 -0
  31. package/dist/node/index.js.map +1 -0
  32. package/dist/types-DhUdX5Rc.d.cts +304 -0
  33. package/dist/types-DhUdX5Rc.d.ts +304 -0
  34. package/examples/01-basic-dedupe.ts +60 -0
  35. package/examples/02-match-two-datasets.ts +48 -0
  36. package/examples/03-csv-file-pipeline.ts +62 -0
  37. package/examples/04-string-scoring.ts +63 -0
  38. package/examples/05-custom-config.ts +94 -0
  39. package/examples/06-probabilistic-fs.ts +72 -0
  40. package/examples/07-pprl-privacy.ts +76 -0
  41. package/examples/08-streaming.ts +79 -0
  42. package/examples/09-llm-scorer.ts +79 -0
  43. package/examples/10-explain.ts +60 -0
  44. package/examples/11-evaluate.ts +61 -0
  45. package/examples/README.md +53 -0
  46. package/package.json +66 -0
  47. package/src/cli.ts +372 -0
  48. package/src/core/ann-blocker.ts +593 -0
  49. package/src/core/api.ts +220 -0
  50. package/src/core/autoconfig.ts +363 -0
  51. package/src/core/autofix.ts +102 -0
  52. package/src/core/blocker.ts +655 -0
  53. package/src/core/cluster.ts +699 -0
  54. package/src/core/compare-clusters.ts +176 -0
  55. package/src/core/config/loader.ts +869 -0
  56. package/src/core/cross-encoder.ts +614 -0
  57. package/src/core/data.ts +430 -0
  58. package/src/core/domain.ts +277 -0
  59. package/src/core/embedder.ts +562 -0
  60. package/src/core/evaluate.ts +156 -0
  61. package/src/core/explain.ts +352 -0
  62. package/src/core/golden.ts +524 -0
  63. package/src/core/graph-er.ts +371 -0
  64. package/src/core/index.ts +314 -0
  65. package/src/core/ingest.ts +112 -0
  66. package/src/core/learned-blocking.ts +305 -0
  67. package/src/core/lineage.ts +221 -0
  68. package/src/core/llm/budget.ts +258 -0
  69. package/src/core/llm/cluster.ts +542 -0
  70. package/src/core/llm/scorer.ts +396 -0
  71. package/src/core/match-one.ts +95 -0
  72. package/src/core/matchkey.ts +97 -0
  73. package/src/core/memory/corrections.ts +179 -0
  74. package/src/core/memory/learner.ts +218 -0
  75. package/src/core/memory/store.ts +114 -0
  76. package/src/core/pipeline.ts +366 -0
  77. package/src/core/pprl/protocol.ts +216 -0
  78. package/src/core/probabilistic.ts +511 -0
  79. package/src/core/profiler.ts +212 -0
  80. package/src/core/quality.ts +197 -0
  81. package/src/core/review-queue.ts +177 -0
  82. package/src/core/scorer.ts +855 -0
  83. package/src/core/sensitivity.ts +196 -0
  84. package/src/core/standardize.ts +279 -0
  85. package/src/core/streaming.ts +128 -0
  86. package/src/core/transforms.ts +599 -0
  87. package/src/core/types.ts +570 -0
  88. package/src/core/validate.ts +243 -0
  89. package/src/index.ts +8 -0
  90. package/src/node/a2a/server.ts +470 -0
  91. package/src/node/api/server.ts +412 -0
  92. package/src/node/backends/duckdb.ts +130 -0
  93. package/src/node/backends/score-worker.ts +41 -0
  94. package/src/node/backends/workers.ts +212 -0
  95. package/src/node/config-file.ts +66 -0
  96. package/src/node/connectors/base.ts +57 -0
  97. package/src/node/connectors/bigquery.ts +61 -0
  98. package/src/node/connectors/databricks.ts +69 -0
  99. package/src/node/connectors/file.ts +350 -0
  100. package/src/node/connectors/hubspot.ts +62 -0
  101. package/src/node/connectors/index.ts +43 -0
  102. package/src/node/connectors/salesforce.ts +93 -0
  103. package/src/node/connectors/snowflake.ts +73 -0
  104. package/src/node/db/postgres.ts +173 -0
  105. package/src/node/db/sync.ts +103 -0
  106. package/src/node/dedupe-file.ts +156 -0
  107. package/src/node/index.ts +89 -0
  108. package/src/node/mcp/server.ts +940 -0
  109. package/src/node/tui/app.ts +756 -0
  110. package/src/node/tui/index.ts +6 -0
  111. package/src/node/tui/widgets.ts +128 -0
  112. package/tests/parity/scorer-ground-truth.test.ts +118 -0
  113. package/tests/smoke.test.ts +46 -0
  114. package/tests/unit/a2a-server.test.ts +175 -0
  115. package/tests/unit/ann-blocker.test.ts +117 -0
  116. package/tests/unit/api-server.test.ts +239 -0
  117. package/tests/unit/api.test.ts +77 -0
  118. package/tests/unit/autoconfig.test.ts +103 -0
  119. package/tests/unit/autofix.test.ts +71 -0
  120. package/tests/unit/blocker.test.ts +164 -0
  121. package/tests/unit/buildBlocksAsync.test.ts +63 -0
  122. package/tests/unit/cluster.test.ts +213 -0
  123. package/tests/unit/compare-clusters.test.ts +42 -0
  124. package/tests/unit/config-loader.test.ts +301 -0
  125. package/tests/unit/connectors-base.test.ts +48 -0
  126. package/tests/unit/cross-encoder-model.test.ts +198 -0
  127. package/tests/unit/cross-encoder.test.ts +173 -0
  128. package/tests/unit/db-connectors.test.ts +37 -0
  129. package/tests/unit/domain.test.ts +80 -0
  130. package/tests/unit/embedder.test.ts +151 -0
  131. package/tests/unit/evaluate.test.ts +85 -0
  132. package/tests/unit/explain.test.ts +73 -0
  133. package/tests/unit/golden.test.ts +97 -0
  134. package/tests/unit/graph-er.test.ts +173 -0
  135. package/tests/unit/hnsw-ann.test.ts +283 -0
  136. package/tests/unit/hubspot-connector.test.ts +118 -0
  137. package/tests/unit/ingest.test.ts +97 -0
  138. package/tests/unit/learned-blocking.test.ts +134 -0
  139. package/tests/unit/lineage.test.ts +135 -0
  140. package/tests/unit/match-one.test.ts +129 -0
  141. package/tests/unit/matchkey.test.ts +97 -0
  142. package/tests/unit/mcp-server.test.ts +183 -0
  143. package/tests/unit/memory.test.ts +119 -0
  144. package/tests/unit/pipeline.test.ts +118 -0
  145. package/tests/unit/pprl-protocol.test.ts +381 -0
  146. package/tests/unit/probabilistic.test.ts +494 -0
  147. package/tests/unit/profiler.test.ts +68 -0
  148. package/tests/unit/review-queue.test.ts +68 -0
  149. package/tests/unit/salesforce-connector.test.ts +148 -0
  150. package/tests/unit/scorer.test.ts +301 -0
  151. package/tests/unit/sensitivity.test.ts +154 -0
  152. package/tests/unit/standardize.test.ts +84 -0
  153. package/tests/unit/streaming.test.ts +82 -0
  154. package/tests/unit/transforms.test.ts +208 -0
  155. package/tests/unit/tui-widgets.test.ts +42 -0
  156. package/tests/unit/tui.test.ts +24 -0
  157. package/tests/unit/validate.test.ts +145 -0
  158. package/tests/unit/workers-parallel.test.ts +99 -0
  159. package/tests/unit/workers.test.ts +74 -0
  160. package/tsconfig.json +25 -0
  161. package/tsup.config.ts +37 -0
  162. package/vitest.config.ts +11 -0
@@ -0,0 +1,82 @@
1
+ import { describe, it, expect } from "vitest";
2
+ import { StreamProcessor } from "../../src/core/streaming.js";
3
+ import { makeMatchkeyConfig, makeMatchkeyField } from "../../src/core/types.js";
4
+ import type { Row } from "../../src/core/types.js";
5
+
6
/**
 * Builds the matchkey used by every StreamProcessor test below: a weighted
 * key named "name_fuzzy" with a single `name` field that is lowercased and
 * scored with Jaro-Winkler.
 */
function nameMk() {
  const nameField = makeMatchkeyField({
    field: "name",
    scorer: "jaro_winkler",
    transforms: ["lowercase"],
  });
  const fields = [nameField];
  return makeMatchkeyConfig({ name: "name_fuzzy", type: "weighted", fields });
}
19
+
20
// Behavioural tests for StreamProcessor: incremental adds, cluster
// assignment, size tracking, snapshots and automatic row-id assignment.
describe("StreamProcessor", () => {
  // Fresh processor over the shared name matchkey at the given threshold.
  const processorAt = (threshold: number) =>
    new StreamProcessor({ matchkey: nameMk(), threshold });

  it("first add with no existing cluster state creates a singleton", () => {
    const processor = processorAt(0.85);
    const outcome = processor.add({ __row_id__: 0, name: "Alice" });

    expect(outcome.rowId).toBe(0);
    expect(outcome.matchedIds).toEqual([]);
    expect(outcome.clusterId).toBeGreaterThanOrEqual(0);
    expect(processor.size).toBe(1);
  });

  it("matching record joins the existing cluster", () => {
    const processor = processorAt(0.85);
    const first = processor.add({ __row_id__: 0, name: "John Smith" });
    const second = processor.add({ __row_id__: 1, name: "John Smith" });

    expect(second.matchedIds).toContain(0);
    expect(second.clusterId).toBe(first.clusterId);
    expect(processor.size).toBe(2);
  });

  it("non-matching records get their own singleton clusters", () => {
    const processor = processorAt(0.95);
    const first = processor.add({ __row_id__: 0, name: "Alice Brown" });
    const second = processor.add({ __row_id__: 1, name: "Zoltan Xiong" });

    expect(second.matchedIds).toEqual([]);
    expect(second.clusterId).not.toBe(first.clusterId);
  });

  it("size increments with each add", () => {
    const processor = processorAt(0.85);
    expect(processor.size).toBe(0);

    // Each add must bump the size by exactly one.
    ["A", "B", "C"].forEach((name, index) => {
      processor.add({ __row_id__: index, name });
      expect(processor.size).toBe(index + 1);
    });
  });

  it("snapshot returns current clusters and rows", () => {
    const processor = processorAt(0.85);
    processor.add({ __row_id__: 0, name: "Alice" });
    processor.add({ __row_id__: 1, name: "Bob" });

    const snapshot = processor.snapshot();
    expect(snapshot.rows.length).toBe(2);
    expect(snapshot.clusters.size).toBeGreaterThan(0);

    // Cluster members collectively cover all added row ids.
    const covered = new Set<number>();
    for (const { members } of snapshot.clusters.values()) {
      for (const id of members) covered.add(id);
    }
    expect(covered.has(0)).toBe(true);
    expect(covered.has(1)).toBe(true);
  });

  it("assigns __row_id__ automatically when absent on input row", () => {
    const processor = processorAt(0.85);
    const first = processor.add({ name: "X" });
    const second = processor.add({ name: "Y" });

    expect(first.rowId).toBe(0);
    expect(second.rowId).toBe(1);
    expect(processor.size).toBe(2);
  });
});
@@ -0,0 +1,208 @@
1
+ import { describe, it, expect } from "vitest";
2
+ import { applyTransform, applyTransforms, soundex, metaphone } from "../../src/core/index.js";
3
+ import { sha256Hex, hmacSha256Hex } from "../../src/core/transforms.js";
4
+
5
// Table-driven checks for the simple, parameterless transforms, followed by
// the null and unknown-transform pass-through cases.
describe("applyTransform - basic transforms", () => {
  // [transform name (also the test title), input, expected output]
  const cases: ReadonlyArray<readonly [string, string, string]> = [
    ["lowercase", "HELLO", "hello"],
    ["uppercase", "hello", "HELLO"],
    ["strip", " hello ", "hello"],
    ["strip_all", "a b\tc\nd", "abcd"],
    ["digits_only", "abc123def", "123"],
    ["alpha_only", "abc123def!", "abcdef"],
    ["normalize_whitespace", " a b\tc ", "a b c"],
    ["token_sort", "Smith John", "John Smith"],
    ["first_token", "John Smith Doe", "John"],
    ["last_token", "John Smith Doe", "Doe"],
  ];

  for (const [transform, input, expected] of cases) {
    it(transform, () => {
      expect(applyTransform(input, transform)).toBe(expected);
    });
  }

  it("returns null for null input", () => {
    const output = applyTransform(null, "lowercase");
    expect(output).toBe(null);
  });

  it("unknown transform returns value unchanged", () => {
    const output = applyTransform("hello", "nonexistent");
    expect(output).toBe("hello");
  });
});
54
+
55
// Transforms whose behaviour is configured through `name:arg[:arg]` suffixes.
describe("applyTransform - parameterized", () => {
  const alphabet = "abcdef";

  it("substring:0:3", () => {
    const slice = applyTransform(alphabet, "substring:0:3");
    expect(slice).toBe("abc");
  });

  it("substring:2:5", () => {
    const slice = applyTransform(alphabet, "substring:2:5");
    expect(slice).toBe("cde");
  });

  it("qgram:3 splits to 3-grams", () => {
    // 3-grams of "abcde": abc, bcd, cde (sorted)
    expect(applyTransform("abcde", "qgram:3")).toBe("abc bcd cde");
  });

  it("qgram:2 splits to bigrams", () => {
    // bigrams: ab, bc
    expect(applyTransform("abc", "qgram:2")).toBe("ab bc");
  });
});
76
+
77
// Soundex codes: known reference values, equivalence of similar-sounding
// names, and the fixed four-character output length.
describe("soundex", () => {
  it("Robert -> R163", () => {
    const code = soundex("Robert");
    expect(code).toBe("R163");
  });

  it("Smith and Smyth have same code", () => {
    const smith = soundex("Smith");
    const smyth = soundex("Smyth");
    expect(smith).toBe(smyth);
  });

  it("Rupert -> R163 (same as Robert)", () => {
    const code = soundex("Rupert");
    expect(code).toBe("R163");
  });

  it("empty string -> 0000", () => {
    const code = soundex("");
    expect(code).toBe("0000");
  });

  it("returns 4-character code", () => {
    const code = soundex("Washington");
    expect(code.length).toBe(4);
  });
});
98
+
99
// Metaphone smoke checks: result type, empty input, and the length cap
// asserted here (at most four characters).
describe("metaphone", () => {
  it("returns a string", () => {
    const code = metaphone("Thompson");
    expect(typeof code).toBe("string");
  });

  it("empty string returns empty", () => {
    const code = metaphone("");
    expect(code).toBe("");
  });

  it("code has at most 4 characters", () => {
    const code = metaphone("Washington");
    expect(code.length).toBeLessThanOrEqual(4);
  });
});
112
+
113
// applyTransforms runs a list of transforms left to right over one value.
describe("applyTransforms - chain", () => {
  it("applies multiple in order", () => {
    // lowercase then strip
    const chained = applyTransforms(" HELLO ", ["lowercase", "strip"]);
    expect(chained).toBe("hello");
  });

  it("strip then digits_only", () => {
    const chained = applyTransforms(" abc123 ", ["strip", "digits_only"]);
    expect(chained).toBe("123");
  });

  it("empty chain returns value unchanged", () => {
    const chained = applyTransforms("hello", []);
    expect(chained).toBe("hello");
  });

  it("propagates null through chain", () => {
    const chained = applyTransforms(null, ["lowercase", "strip"]);
    expect(chained).toBe(null);
  });
});
131
+
132
// Pins sha256Hex / hmacSha256Hex to externally known digests so the
// TypeScript port stays byte-compatible with Python's hashlib / hmac.
describe("sha256Hex / hmacSha256Hex - Python parity", () => {
  it("sha256 of empty string", () => {
    expect(sha256Hex("")).toBe(
      "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855",
    );
  });

  it("sha256 of 'abc' (FIPS 180-2 reference vector)", () => {
    expect(sha256Hex("abc")).toBe(
      "ba7816bf8f01cfea414140de5dae2223b00361a396177a9cb410ff61f20015ad",
    );
  });

  it("sha256 handles UTF-8 multibyte input", () => {
    // hashlib.sha256("héllo".encode()).hexdigest()
    expect(sha256Hex("héllo")).toBe(
      "3c48591d8d098a4538f5e013dfcf406e948eac4d3277b10bf614e295d6068179",
    );
  });

  it("hmac-sha256 matches Python reference (empty key, empty msg)", () => {
    // hmac.new(b"", b"", hashlib.sha256).hexdigest()
    expect(hmacSha256Hex("", "")).toBe(
      "b613679a0814d9ec772f95d778c35fc5ff1697c493715653c6c712144292c5ad",
    );
  });

  // This key/message pair is the widely quoted "quick brown fox" example.
  // It is NOT one of the RFC 4231 vectors, so the title no longer claims so.
  it("hmac-sha256 matches Python reference (quick brown fox vector)", () => {
    // hmac.new(b"key", b"The quick brown fox jumps over the lazy dog", hashlib.sha256).hexdigest()
    expect(hmacSha256Hex("key", "The quick brown fox jumps over the lazy dog")).toBe(
      "f7bc83f430538424b13298e6aa6fb143ef4d59a14946175997479dbc2d1a3cd8",
    );
  });

  it("hmac-sha256 matches RFC 4231 test case 2", () => {
    // RFC 4231 section 4.3: key = "Jefe", data = "what do ya want for nothing?"
    expect(hmacSha256Hex("Jefe", "what do ya want for nothing?")).toBe(
      "5bdcc146bf60754e6a042426089575c75a003f089d2739839dec58b964ec3843",
    );
  });
});
166
+
167
// bloom_filter output is a lowercase hex string whose length depends on the
// selected security level; these tests pin the lengths, determinism and the
// exact bytes produced for a reference input.
describe("bloom_filter - default size + hex length", () => {
  it("default bloom_filter produces 256 hex chars (1024 bits)", () => {
    const encoded = applyTransform("hello", "bloom_filter");
    expect(encoded).not.toBe(null);
    expect(encoded!).toMatch(/^[0-9a-f]+$/);
    expect(encoded!.length).toBe(256); // 1024 bits / 4 bits per hex char
  });

  // [level suffix, expected hex length]; each hex char carries 4 bits.
  const levels: ReadonlyArray<readonly [string, number]> = [
    ["standard", 128],
    ["high", 256],
    ["paranoid", 512],
  ];
  for (const [level, hexLength] of levels) {
    const transform = `bloom_filter:${level}`;
    it(`${transform} produces ${hexLength} hex chars (${hexLength * 4} bits)`, () => {
      const encoded = applyTransform("hello", transform);
      expect(encoded!.length).toBe(hexLength);
    });
  }

  it("same input produces same output (deterministic)", () => {
    const first = applyTransform("hello", "bloom_filter");
    const second = applyTransform("hello", "bloom_filter");
    expect(first).toBe(second);
  });

  it("different inputs produce different outputs", () => {
    const hello = applyTransform("hello", "bloom_filter");
    const world = applyTransform("world", "bloom_filter");
    expect(hello).not.toBe(world);
  });

  it("byte-for-byte Python parity for 'hello' default bloom_filter", () => {
    // Reference generated by goldenmatch.utils.transforms.apply_transform("hello", "bloom_filter")
    const pythonReference =
      "a008a1041000204000000400a000140000100048810004000008102010004000008400000000000080010100800000011014000000200000008002101000010002002100010000022000010020800000c00060000040000010010000000002400080000000800004008900200090000080800001000009000001000000100c20";
    const encoded = applyTransform("hello", "bloom_filter");
    expect(encoded).toBe(pythonReference);
  });
});
@@ -0,0 +1,42 @@
1
+ import { describe, it, expect } from "vitest";
2
+
3
// The widgets module lazily loads optional ink addons; these tests only
// verify that the loaders exist and degrade without throwing.
describe("TUI widgets", () => {
  const loadWidgets = () => import("../../src/node/tui/widgets.js");

  it("exports widgets helper", async () => {
    const widgets = await loadWidgets();
    expect(typeof widgets.tryLoad).toBe("function");
    expect(typeof widgets.loadAddons).toBe("function");
    expect(typeof widgets.inkAddons).toBe("object");
  });

  it("tryLoad returns null for missing package", async () => {
    const widgets = await loadWidgets();
    const loaded = widgets.tryLoad("nonexistent-package-xyz");
    expect(loaded).toBeNull();
  });

  it("inkAddons getters return null for missing addons", async () => {
    const { inkAddons } = await loadWidgets();
    // In the test env none of the optional ink addons are installed.
    // Each getter must return null rather than throwing.
    const getters = ["table", "selectInput", "textInput", "spinner", "gradient"] as const;
    for (const getter of getters) {
      expect(() => inkAddons[getter]).not.toThrow();
    }
  });

  it("loadAddons resolves to an object with all keys", async () => {
    const { loadAddons } = await loadWidgets();
    const addons = await loadAddons();

    for (const key of ["Table", "SelectInput", "TextInput", "Spinner", "Gradient"]) {
      expect(addons).toHaveProperty(key);
    }

    // Each field is either null (missing package) or a loaded module/component.
    for (const addon of Object.values(addons)) {
      const kind = typeof addon;
      const acceptable = addon === null || kind === "function" || kind === "object";
      expect(acceptable).toBe(true);
    }
  });
});
@@ -0,0 +1,24 @@
1
+ import { describe, it, expect } from "vitest";
2
+
3
// startTui depends on the optional peer dependencies 'ink' and 'react'.
// These tests cover the export and the error raised when they are missing.
describe("startTui", () => {
  /**
   * Reports whether `specifier` can be imported in this environment.
   * A non-literal specifier is used so type-checking and bundling do not
   * require the optional package to be present.
   */
  async function canImport(specifier: string): Promise<boolean> {
    try {
      await import(/* @vite-ignore */ specifier);
      return true;
    } catch {
      return false;
    }
  }

  it("is exported as a function", async () => {
    const { startTui } = await import("../../src/node/tui/app.js");
    expect(typeof startTui).toBe("function");
  });

  it("throws helpful error when 'ink'/'react' not installed", async () => {
    const { startTui } = await import("../../src/node/tui/app.js");

    // ink/react are optional peer deps. If both are installed, calling
    // startTui would actually launch the TUI (which would block on stdin
    // in tests), so only invoke it when the load is expected to fail.
    const [hasInk, hasReact] = await Promise.all([
      canImport("ink"),
      canImport("react"),
    ]);
    if (hasInk && hasReact) return;

    let caught: unknown;
    try {
      await startTui({});
    } catch (err) {
      caught = err;
    }

    // With a dependency missing, startTui must fail with an install hint
    // that names the missing package.
    expect(caught).toBeDefined();
    expect(String(caught)).toMatch(/ink|react/i);
  });
});
@@ -0,0 +1,145 @@
1
+ import { describe, it, expect, vi } from "vitest";
2
+ import { validateRows, checkRule } from "../../src/core/validate.js";
3
+ import type { ValidationRule } from "../../src/core/validate.js";
4
+ import type { Row } from "../../src/core/types.js";
5
+
6
// not_null rule: exercises the three actions (quarantine, null, flag) and
// the bookkeeping each one leaves in the validation result.
describe("validateRows — not_null rule", () => {
  // A not_null rule on the `email` column with the given action.
  const emailNotNull = (action: ValidationRule["action"]): ValidationRule => ({
    column: "email",
    ruleType: "not_null",
    params: {},
    action,
  });

  it("quarantines rows that fail not_null when action is quarantine", () => {
    const input: Row[] = [{ email: "a@x.com" }, { email: null }, { email: "" }];
    const { valid, quarantine, report } = validateRows(input, [
      emailNotNull("quarantine"),
    ]);

    expect(valid.length).toBe(1);
    expect(quarantine.length).toBe(2);
    expect(report.quarantined).toBe(2);
    expect(report.ruleViolations["email:not_null"]).toBe(2);
  });

  it("action='null' sets the failing cell to null but keeps the row", () => {
    const input: Row[] = [{ email: "" }];
    const outcome = validateRows(input, [emailNotNull("null")]);

    expect(outcome.valid.length).toBe(1);
    expect(outcome.valid[0]!["email"]).toBeNull();
    expect(outcome.quarantine.length).toBe(0);
  });

  it("action='flag' keeps the row and adds to __flags__ without quarantine", () => {
    const input: Row[] = [{ email: null }];
    const outcome = validateRows(input, [emailNotNull("flag")]);

    expect(outcome.valid.length).toBe(1);
    const rowFlags = outcome.valid[0]!["__flags__"] as string[];
    expect(rowFlags).toContain("email:not_null");
    expect(outcome.report.flagged).toBe(1);
  });
});
51
+
52
// regex rule: matching values pass, non-matching values trigger the action,
// and a syntactically invalid pattern is handled without throwing.
describe("validateRows — regex rule", () => {
  it("valid values pass, invalid values trigger the action", () => {
    const rows: Row[] = [
      { email: "alice@example.com" },
      { email: "not-an-email" },
    ];
    const rules: ValidationRule[] = [
      {
        column: "email",
        ruleType: "regex",
        params: { pattern: "^[^@\\s]+@[^@\\s]+\\.[^@\\s]+$" },
        action: "quarantine",
      },
    ];
    const res = validateRows(rows, rules);
    expect(res.valid.length).toBe(1);
    expect(res.quarantine.length).toBe(1);
  });

  it("invalid regex pattern does not crash; value is treated as failing", () => {
    // Silence console.warn while the broken pattern is exercised.
    const warn = vi.spyOn(console, "warn").mockImplementation(() => {});
    try {
      const rows: Row[] = [{ email: "alice@example.com" }];
      const rules: ValidationRule[] = [
        {
          column: "email",
          ruleType: "regex",
          params: { pattern: "([unclosed" },
          action: "flag",
        },
      ];
      // Must not throw.
      expect(() => validateRows(rows, rules)).not.toThrow();
      // Direct checkRule returns false for broken regex (via catch).
      expect(
        checkRule("a", {
          column: "x",
          ruleType: "regex",
          params: { pattern: "([unclosed" },
          action: "flag",
        }),
      ).toBe(false);
    } finally {
      // Restore even when an assertion above fails, so the mocked
      // console.warn cannot leak into later tests.
      warn.mockRestore();
    }
  });
});
96
+
97
// Direct checkRule coverage for the length and set-membership rule types.
describe("checkRule — misc", () => {
  // A flag-action rule on column "x" with the given type and params.
  const ruleOf = (
    ruleType: ValidationRule["ruleType"],
    params: ValidationRule["params"],
  ): ValidationRule => ({ column: "x", ruleType, params, action: "flag" });

  it("min_length / max_length respect the `value` parameter", () => {
    const passesMin = checkRule("hello", ruleOf("min_length", { value: 3 }));
    const failsMin = checkRule("hi", ruleOf("min_length", { value: 3 }));
    const passesMax = checkRule("hi", ruleOf("max_length", { value: 3 }));
    const failsMax = checkRule("toolong", ruleOf("max_length", { value: 3 }));

    expect(passesMin).toBe(true);
    expect(failsMin).toBe(false);
    expect(passesMax).toBe(true);
    expect(failsMax).toBe(false);
  });

  it("in_set accepts allowed and rejects disallowed values", () => {
    const inSet = checkRule("a", ruleOf("in_set", { values: ["a", "b"] }));
    const notInSet = checkRule("z", ruleOf("in_set", { values: ["a", "b"] }));

    expect(inSet).toBe(true);
    expect(notInSet).toBe(false);
  });
});
@@ -0,0 +1,99 @@
1
+ /**
2
+ * workers-parallel.test.ts -- Smoke tests for `scoreBlocksParallel`.
3
+ *
4
+ * piscina is an optional peer dep; these tests assert that the function
5
+ * degrades gracefully (fast-path / fallback) even when piscina isn't
6
+ * installed. When piscina IS installed, the worker path exercises true
7
+ * worker_threads, but we use block shapes that also work in the fallback.
8
+ */
9
+ import { describe, it, expect } from "vitest";
10
+ import { scoreBlocksParallel } from "../../src/node/backends/workers.js";
11
+ import { pairKey } from "../../src/core/cluster.js";
12
+ import type {
13
+ BlockResult,
14
+ MatchkeyConfig,
15
+ PairKey,
16
+ Row,
17
+ } from "../../src/core/types.js";
18
+
19
/**
 * Matchkey shared by the scoreBlocksParallel tests: one untransformed
 * `name` field scored with Jaro-Winkler, weight 1, threshold 0.5.
 */
const mk: MatchkeyConfig = {
  name: "name_match",
  type: "weighted",
  threshold: 0.5,
  fields: [
    {
      field: "name",
      transforms: [],
      scorer: "jaro_winkler",
      weight: 1,
    },
  ],
};
27
+
28
/**
 * Wraps `rows` in a BlockResult with the given key, using the fixed
 * strategy "static" and depth 0.
 */
function makeBlock(blockKey: string, rows: Row[]): BlockResult {
  const block: BlockResult = { blockKey, rows, strategy: "static", depth: 0 };
  return block;
}
31
+
32
// Smoke tests for scoreBlocksParallel. The block shapes are chosen so the
// assertions hold on the sequential fast-path, the concurrent fallback and
// the piscina worker path alike.
describe("scoreBlocksParallel", () => {
  /**
   * Builds `count` two-row blocks named b0..b{count-1}. Block i holds rows
   * with ids i*10+1 and i*10+2 carrying `nameA` and `nameB` respectively.
   */
  function pairedBlocks(count: number, nameA: string, nameB: string): BlockResult[] {
    return Array.from({ length: count }, (_, i) =>
      makeBlock(`b${i}`, [
        { __row_id__: i * 10 + 1, name: nameA },
        { __row_id__: i * 10 + 2, name: nameB },
      ]),
    );
  }

  it("returns empty for 0 blocks", async () => {
    const result = await scoreBlocksParallel([], mk, new Set());
    expect(result).toEqual([]);
  });

  it("uses sequential fast-path for <= 2 blocks", async () => {
    const blocks = [
      makeBlock("b0", [
        { __row_id__: 1, name: "John" },
        { __row_id__: 2, name: "Jon" },
      ]),
    ];
    const result = await scoreBlocksParallel(blocks, mk, new Set());
    expect(Array.isArray(result)).toBe(true);
    // "John" vs "Jon" with jaro_winkler clears 0.5 threshold.
    expect(result.length).toBeGreaterThanOrEqual(1);
  });

  it("falls back to concurrent path when piscina not installed", async () => {
    // piscina is not installed in this dev env -- the dynamic import
    // inside scoreBlocksParallel fails and we should land on the
    // scoreBlocksConcurrent fallback, not throw.
    const blocks = pairedBlocks(5, "John", "Jon");
    const result = await scoreBlocksParallel(blocks, mk, new Set());
    expect(Array.isArray(result)).toBe(true);
    // 5 blocks × at least 1 pair each clearing 0.5 threshold.
    expect(result.length).toBeGreaterThanOrEqual(5);
  });

  it("mutates matchedPairs with newly discovered pairs", async () => {
    const blocks = pairedBlocks(4, "Alice", "Alyce");
    const matched = new Set<PairKey>();
    const result = await scoreBlocksParallel(blocks, mk, matched);
    // Guard against a vacuous pass: "Alice" vs "Alyce" clears the 0.5
    // threshold, so pairs must be returned before the loop means anything.
    expect(result.length).toBeGreaterThan(0);
    // Every newly returned pair must be in matchedPairs.
    for (const p of result) {
      const key = pairKey(p.idA, p.idB);
      expect(matched.has(key)).toBe(true);
    }
  });

  it("respects the exclude set (no duplicate pairs)", async () => {
    const blocks = pairedBlocks(3, "Alice", "Alyce");
    const result = await scoreBlocksParallel(blocks, mk, new Set());
    // Order-independent key per pair; each key must appear only once.
    const keys = new Set<string>();
    for (const p of result) {
      const key =
        p.idA < p.idB ? `${p.idA}:${p.idB}` : `${p.idB}:${p.idA}`;
      expect(keys.has(key)).toBe(false);
      keys.add(key);
    }
  });
});
@@ -0,0 +1,74 @@
1
+ import { describe, it, expect } from "vitest";
2
+ import { scoreBlocksConcurrent } from "../../src/node/backends/workers.js";
3
+ import { scoreBlocksSequential } from "../../src/core/index.js";
4
+ import type {
5
+ BlockResult,
6
+ MatchkeyConfig,
7
+ Row,
8
+ } from "../../src/core/index.js";
9
+
10
/**
 * Matchkey shared by the scoreBlocksConcurrent tests: one untransformed
 * `name` field scored with Jaro-Winkler, weight 1, threshold 0.5.
 */
const mk: MatchkeyConfig = {
  name: "name_match",
  type: "weighted",
  threshold: 0.5,
  fields: [
    {
      field: "name",
      transforms: [],
      scorer: "jaro_winkler",
      weight: 1,
    },
  ],
};
18
+
19
/**
 * Wraps `rows` in a BlockResult with the given key, using the fixed
 * strategy "static" and depth 0.
 */
function makeBlock(blockKey: string, rows: Row[]): BlockResult {
  const block: BlockResult = { blockKey, rows, strategy: "static", depth: 0 };
  return block;
}
22
+
23
// scoreBlocksConcurrent must agree with scoreBlocksSequential on the pairs
// it returns, regardless of how many blocks are submitted.
describe("scoreBlocksConcurrent", () => {
  it("0 blocks -> empty", async () => {
    const pairs = await scoreBlocksConcurrent([], mk, new Set());
    expect(pairs).toEqual([]);
  });

  it("small N (<=2 blocks) returns same pairs as sequential", async () => {
    const rows: Row[] = [
      { __row_id__: 0, name: "Alice" },
      { __row_id__: 1, name: "Alyce" },
    ];
    const blocks = [makeBlock("b0", rows)];

    const fromConcurrent = await scoreBlocksConcurrent(blocks, mk, new Set());
    const fromSequential = scoreBlocksSequential(blocks, mk, new Set());

    expect(fromConcurrent.length).toBe(fromSequential.length);
    // Only compare the leading pair when the sequential run produced one.
    if (fromSequential.length > 0) {
      expect(fromConcurrent[0]!.idA).toBe(fromSequential[0]!.idA);
      expect(fromConcurrent[0]!.idB).toBe(fromSequential[0]!.idB);
    }
  });

  it("many blocks (>2) batched concurrently returns same set as sequential", async () => {
    // Six two-row blocks; block i uses row ids i*10 and i*10+1.
    const blocks: BlockResult[] = Array.from({ length: 6 }, (_, i) =>
      makeBlock(`b${i}`, [
        { __row_id__: i * 10, name: "Alice" },
        { __row_id__: i * 10 + 1, name: "Alyce" },
      ]),
    );

    const fromConcurrent = await scoreBlocksConcurrent(blocks, mk, new Set());
    const fromSequential = scoreBlocksSequential(blocks, mk, new Set());
    expect(fromConcurrent.length).toBe(fromSequential.length);

    // Compare as sets so the order in which batches complete is irrelevant.
    const concurrentKeys = new Set<string>();
    for (const pair of fromConcurrent) concurrentKeys.add(`${pair.idA}:${pair.idB}`);
    const sequentialKeys = new Set<string>();
    for (const pair of fromSequential) sequentialKeys.add(`${pair.idA}:${pair.idB}`);
    expect(concurrentKeys).toEqual(sequentialKeys);
  });

  it("handles empty/singleton blocks gracefully", async () => {
    const blocks: BlockResult[] = [
      makeBlock("empty", []),
      makeBlock("singleton", [{ __row_id__: 0, name: "Alice" }]),
      makeBlock("empty2", []),
      makeBlock("singleton2", [{ __row_id__: 1, name: "Bob" }]),
    ];
    const pairs = await scoreBlocksConcurrent(blocks, mk, new Set());
    expect(pairs).toEqual([]);
  });
});