goldenmatch 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162) hide show
  1. package/README.md +140 -0
  2. package/dist/cli.cjs +6079 -0
  3. package/dist/cli.cjs.map +1 -0
  4. package/dist/cli.d.cts +1 -0
  5. package/dist/cli.d.ts +1 -0
  6. package/dist/cli.js +6076 -0
  7. package/dist/cli.js.map +1 -0
  8. package/dist/core/index.cjs +8449 -0
  9. package/dist/core/index.cjs.map +1 -0
  10. package/dist/core/index.d.cts +1972 -0
  11. package/dist/core/index.d.ts +1972 -0
  12. package/dist/core/index.js +8318 -0
  13. package/dist/core/index.js.map +1 -0
  14. package/dist/index.cjs +8449 -0
  15. package/dist/index.cjs.map +1 -0
  16. package/dist/index.d.cts +2 -0
  17. package/dist/index.d.ts +2 -0
  18. package/dist/index.js +8318 -0
  19. package/dist/index.js.map +1 -0
  20. package/dist/node/backends/score-worker.cjs +934 -0
  21. package/dist/node/backends/score-worker.cjs.map +1 -0
  22. package/dist/node/backends/score-worker.d.cts +14 -0
  23. package/dist/node/backends/score-worker.d.ts +14 -0
  24. package/dist/node/backends/score-worker.js +932 -0
  25. package/dist/node/backends/score-worker.js.map +1 -0
  26. package/dist/node/index.cjs +11430 -0
  27. package/dist/node/index.cjs.map +1 -0
  28. package/dist/node/index.d.cts +554 -0
  29. package/dist/node/index.d.ts +554 -0
  30. package/dist/node/index.js +11277 -0
  31. package/dist/node/index.js.map +1 -0
  32. package/dist/types-DhUdX5Rc.d.cts +304 -0
  33. package/dist/types-DhUdX5Rc.d.ts +304 -0
  34. package/examples/01-basic-dedupe.ts +60 -0
  35. package/examples/02-match-two-datasets.ts +48 -0
  36. package/examples/03-csv-file-pipeline.ts +62 -0
  37. package/examples/04-string-scoring.ts +63 -0
  38. package/examples/05-custom-config.ts +94 -0
  39. package/examples/06-probabilistic-fs.ts +72 -0
  40. package/examples/07-pprl-privacy.ts +76 -0
  41. package/examples/08-streaming.ts +79 -0
  42. package/examples/09-llm-scorer.ts +79 -0
  43. package/examples/10-explain.ts +60 -0
  44. package/examples/11-evaluate.ts +61 -0
  45. package/examples/README.md +53 -0
  46. package/package.json +66 -0
  47. package/src/cli.ts +372 -0
  48. package/src/core/ann-blocker.ts +593 -0
  49. package/src/core/api.ts +220 -0
  50. package/src/core/autoconfig.ts +363 -0
  51. package/src/core/autofix.ts +102 -0
  52. package/src/core/blocker.ts +655 -0
  53. package/src/core/cluster.ts +699 -0
  54. package/src/core/compare-clusters.ts +176 -0
  55. package/src/core/config/loader.ts +869 -0
  56. package/src/core/cross-encoder.ts +614 -0
  57. package/src/core/data.ts +430 -0
  58. package/src/core/domain.ts +277 -0
  59. package/src/core/embedder.ts +562 -0
  60. package/src/core/evaluate.ts +156 -0
  61. package/src/core/explain.ts +352 -0
  62. package/src/core/golden.ts +524 -0
  63. package/src/core/graph-er.ts +371 -0
  64. package/src/core/index.ts +314 -0
  65. package/src/core/ingest.ts +112 -0
  66. package/src/core/learned-blocking.ts +305 -0
  67. package/src/core/lineage.ts +221 -0
  68. package/src/core/llm/budget.ts +258 -0
  69. package/src/core/llm/cluster.ts +542 -0
  70. package/src/core/llm/scorer.ts +396 -0
  71. package/src/core/match-one.ts +95 -0
  72. package/src/core/matchkey.ts +97 -0
  73. package/src/core/memory/corrections.ts +179 -0
  74. package/src/core/memory/learner.ts +218 -0
  75. package/src/core/memory/store.ts +114 -0
  76. package/src/core/pipeline.ts +366 -0
  77. package/src/core/pprl/protocol.ts +216 -0
  78. package/src/core/probabilistic.ts +511 -0
  79. package/src/core/profiler.ts +212 -0
  80. package/src/core/quality.ts +197 -0
  81. package/src/core/review-queue.ts +177 -0
  82. package/src/core/scorer.ts +855 -0
  83. package/src/core/sensitivity.ts +196 -0
  84. package/src/core/standardize.ts +279 -0
  85. package/src/core/streaming.ts +128 -0
  86. package/src/core/transforms.ts +599 -0
  87. package/src/core/types.ts +570 -0
  88. package/src/core/validate.ts +243 -0
  89. package/src/index.ts +8 -0
  90. package/src/node/a2a/server.ts +470 -0
  91. package/src/node/api/server.ts +412 -0
  92. package/src/node/backends/duckdb.ts +130 -0
  93. package/src/node/backends/score-worker.ts +41 -0
  94. package/src/node/backends/workers.ts +212 -0
  95. package/src/node/config-file.ts +66 -0
  96. package/src/node/connectors/base.ts +57 -0
  97. package/src/node/connectors/bigquery.ts +61 -0
  98. package/src/node/connectors/databricks.ts +69 -0
  99. package/src/node/connectors/file.ts +350 -0
  100. package/src/node/connectors/hubspot.ts +62 -0
  101. package/src/node/connectors/index.ts +43 -0
  102. package/src/node/connectors/salesforce.ts +93 -0
  103. package/src/node/connectors/snowflake.ts +73 -0
  104. package/src/node/db/postgres.ts +173 -0
  105. package/src/node/db/sync.ts +103 -0
  106. package/src/node/dedupe-file.ts +156 -0
  107. package/src/node/index.ts +89 -0
  108. package/src/node/mcp/server.ts +940 -0
  109. package/src/node/tui/app.ts +756 -0
  110. package/src/node/tui/index.ts +6 -0
  111. package/src/node/tui/widgets.ts +128 -0
  112. package/tests/parity/scorer-ground-truth.test.ts +118 -0
  113. package/tests/smoke.test.ts +46 -0
  114. package/tests/unit/a2a-server.test.ts +175 -0
  115. package/tests/unit/ann-blocker.test.ts +117 -0
  116. package/tests/unit/api-server.test.ts +239 -0
  117. package/tests/unit/api.test.ts +77 -0
  118. package/tests/unit/autoconfig.test.ts +103 -0
  119. package/tests/unit/autofix.test.ts +71 -0
  120. package/tests/unit/blocker.test.ts +164 -0
  121. package/tests/unit/buildBlocksAsync.test.ts +63 -0
  122. package/tests/unit/cluster.test.ts +213 -0
  123. package/tests/unit/compare-clusters.test.ts +42 -0
  124. package/tests/unit/config-loader.test.ts +301 -0
  125. package/tests/unit/connectors-base.test.ts +48 -0
  126. package/tests/unit/cross-encoder-model.test.ts +198 -0
  127. package/tests/unit/cross-encoder.test.ts +173 -0
  128. package/tests/unit/db-connectors.test.ts +37 -0
  129. package/tests/unit/domain.test.ts +80 -0
  130. package/tests/unit/embedder.test.ts +151 -0
  131. package/tests/unit/evaluate.test.ts +85 -0
  132. package/tests/unit/explain.test.ts +73 -0
  133. package/tests/unit/golden.test.ts +97 -0
  134. package/tests/unit/graph-er.test.ts +173 -0
  135. package/tests/unit/hnsw-ann.test.ts +283 -0
  136. package/tests/unit/hubspot-connector.test.ts +118 -0
  137. package/tests/unit/ingest.test.ts +97 -0
  138. package/tests/unit/learned-blocking.test.ts +134 -0
  139. package/tests/unit/lineage.test.ts +135 -0
  140. package/tests/unit/match-one.test.ts +129 -0
  141. package/tests/unit/matchkey.test.ts +97 -0
  142. package/tests/unit/mcp-server.test.ts +183 -0
  143. package/tests/unit/memory.test.ts +119 -0
  144. package/tests/unit/pipeline.test.ts +118 -0
  145. package/tests/unit/pprl-protocol.test.ts +381 -0
  146. package/tests/unit/probabilistic.test.ts +494 -0
  147. package/tests/unit/profiler.test.ts +68 -0
  148. package/tests/unit/review-queue.test.ts +68 -0
  149. package/tests/unit/salesforce-connector.test.ts +148 -0
  150. package/tests/unit/scorer.test.ts +301 -0
  151. package/tests/unit/sensitivity.test.ts +154 -0
  152. package/tests/unit/standardize.test.ts +84 -0
  153. package/tests/unit/streaming.test.ts +82 -0
  154. package/tests/unit/transforms.test.ts +208 -0
  155. package/tests/unit/tui-widgets.test.ts +42 -0
  156. package/tests/unit/tui.test.ts +24 -0
  157. package/tests/unit/validate.test.ts +145 -0
  158. package/tests/unit/workers-parallel.test.ts +99 -0
  159. package/tests/unit/workers.test.ts +74 -0
  160. package/tsconfig.json +25 -0
  161. package/tsup.config.ts +37 -0
  162. package/vitest.config.ts +11 -0
@@ -0,0 +1,118 @@
1
+ import { describe, it, expect } from "vitest";
2
+ import { runDedupePipeline, runMatchPipeline, makeConfig, makeBlockingConfig } from "../../src/core/index.js";
3
+ import type { MatchkeyConfig, Row } from "../../src/core/index.js";
4
+
5
+ describe("runDedupePipeline", () => {
6
+ it("with exact matchkey catches identical emails", () => {
7
+ const rows: Row[] = [ // rows 1 and 2 share an email; row 3 has a different one
8
+ { id: 1, email: "a@x.com", name: "Alice" },
9
+ { id: 2, email: "a@x.com", name: "A." },
10
+ { id: 3, email: "b@x.com", name: "Bob" },
11
+ ];
12
+ const mk: MatchkeyConfig = { // a single exact-type matchkey on the lowercased email field
13
+ name: "email_exact",
14
+ type: "exact",
15
+ fields: [{ field: "email", transforms: ["lowercase"], scorer: "exact", weight: 1.0 }],
16
+ };
17
+ const config = makeConfig({ matchkeys: [mk] });
18
+ const result = runDedupePipeline(rows, config);
19
+ expect(result.stats.totalRecords).toBe(3);
20
+ expect(result.scoredPairs.length).toBeGreaterThanOrEqual(1); // the two rows sharing a@x.com should yield at least one scored pair
21
+ expect(result.dupes.length).toBeGreaterThanOrEqual(2); // presumably one `dupes` entry per row in a duplicate group — confirm against runDedupePipeline
22
+ });
23
+
24
+ it("with weighted matchkey + blocking", () => {
25
+ const rows: Row[] = [ // rows 1 and 2 share zip "111" and have near-identical names; row 3 is alone in zip "222"
26
+ { id: 1, name: "John Smith", zip: "111" },
27
+ { id: 2, name: "Jon Smith", zip: "111" },
28
+ { id: 3, name: "Zeke Xavier", zip: "222" },
29
+ ];
30
+ const mk: MatchkeyConfig = { // weighted matchkey: jaro_winkler on the lowercased name, threshold 0.7
31
+ name: "name_fuzzy",
32
+ type: "weighted",
33
+ threshold: 0.7,
34
+ fields: [{ field: "name", transforms: ["lowercase"], scorer: "jaro_winkler", weight: 1.0 }],
35
+ };
36
+ const blocking = makeBlockingConfig({ // static blocking keyed on the zip field with no transforms
37
+ strategy: "static",
38
+ keys: [{ fields: ["zip"], transforms: [] }],
39
+ });
40
+ const config = makeConfig({ matchkeys: [mk], blocking });
41
+ const result = runDedupePipeline(rows, config);
42
+ expect(result.stats.totalRecords).toBe(3);
43
+ // John/Jon should match, Zeke should not. Only the John/Jon pair is asserted below; Zeke's absence is not checked.
44
+ const hasMatch = result.scoredPairs.some((p) => // the pair is accepted in either orientation
45
+ (p.idA === 0 && p.idB === 1) || (p.idA === 1 && p.idB === 0), // NOTE(review): 0/1 look like zero-based row positions, not the `id` column values (1/2) — confirm
46
+ );
47
+ expect(hasMatch).toBe(true);
48
+ });
49
+
50
+ it("empty input returns empty result", () => {
51
+ const result = runDedupePipeline([], makeConfig()); // no rows, and makeConfig() called with no overrides
52
+ expect(result.stats.totalRecords).toBe(0);
53
+ expect(result.stats.totalClusters).toBe(0);
54
+ });
55
+
56
+ it("stats are computed correctly", () => {
57
+ const rows: Row[] = [ // rows 1 and 2 share an email; row 3 is distinct
58
+ { id: 1, email: "a@x.com" },
59
+ { id: 2, email: "a@x.com" },
60
+ { id: 3, email: "b@x.com" },
61
+ ];
62
+ const mk: MatchkeyConfig = { // exact matchkey on the raw email (no transforms)
63
+ name: "email",
64
+ type: "exact",
65
+ fields: [{ field: "email", transforms: [], scorer: "exact", weight: 1.0 }],
66
+ };
67
+ const config = makeConfig({ matchkeys: [mk] });
68
+ const result = runDedupePipeline(rows, config);
69
+ // Invariant: matchedRecords + uniqueRecords == totalRecords. This checks internal consistency, not specific counts.
70
+ expect(result.stats.matchedRecords + result.stats.uniqueRecords).toBe(
71
+ result.stats.totalRecords,
72
+ );
73
+ // Invariant: matchRate == matchedRecords / totalRecords.
74
+ expect(result.stats.matchRate).toBeCloseTo(
75
+ result.stats.matchedRecords / result.stats.totalRecords,
76
+ 5, // toBeCloseTo precision: number of decimal digits compared
77
+ );
78
+ });
79
+ });
80
+
81
+ describe("runMatchPipeline", () => {
82
+ it("finds cross-dataset matches", () => {
83
+ const target: Row[] = [{ id: 1, email: "a@x.com" }];
84
+ const reference: Row[] = [ // only row 10 shares the target's email
85
+ { id: 10, email: "a@x.com" },
86
+ { id: 11, email: "b@x.com" },
87
+ ];
88
+ const mk: MatchkeyConfig = { // exact matchkey on the lowercased email field
89
+ name: "email_exact",
90
+ type: "exact",
91
+ fields: [{ field: "email", transforms: ["lowercase"], scorer: "exact", weight: 1.0 }],
92
+ };
93
+ const config = makeConfig({ matchkeys: [mk] });
94
+ const result = runMatchPipeline(target, reference, config);
95
+ expect(result.matched.length).toBe(1);
96
+ expect(result.unmatched.length).toBe(0); // reference row 11 has no partner yet 0 is expected, so `unmatched` is expected to hold target rows only
97
+ });
98
+
99
+ it("empty target yields no matches", () => {
100
+ const result = runMatchPipeline([], [{ id: 1, email: "a@x.com" }], makeConfig()); // empty target, one reference row, makeConfig() with no overrides
101
+ expect(result.matched).toEqual([]);
102
+ expect(result.unmatched).toEqual([]);
103
+ });
104
+
105
+ it("records with no reference match go to unmatched", () => {
106
+ const target: Row[] = [{ id: 1, email: "no-match@x.com" }]; // email deliberately absent from the reference set
107
+ const reference: Row[] = [{ id: 10, email: "a@x.com" }];
108
+ const mk: MatchkeyConfig = { // exact matchkey on the raw email (no transforms)
109
+ name: "email_exact",
110
+ type: "exact",
111
+ fields: [{ field: "email", transforms: [], scorer: "exact", weight: 1.0 }],
112
+ };
113
+ const config = makeConfig({ matchkeys: [mk] });
114
+ const result = runMatchPipeline(target, reference, config);
115
+ expect(result.matched.length).toBe(0);
116
+ expect(result.unmatched.length).toBe(1); // the single target row is expected to land here
117
+ });
118
+ });
@@ -0,0 +1,381 @@
1
+ import { describe, it, expect } from "vitest";
2
+ import {
3
+ runPPRL,
4
+ autoConfigurePPRL,
5
+ linkTrustedThirdParty,
6
+ linkSMC,
7
+ type PPRLConfig,
8
+ } from "../../src/core/pprl/protocol.js";
9
+ import type { Row } from "../../src/core/index.js";
10
+
11
+ // ---------------------------------------------------------------------------
12
+ // Deterministic synthetic person generator
13
+ // ---------------------------------------------------------------------------
14
+ // Name pools (20 first names, 15 last names); the fixture builders in this file index them modulo their length.
15
+ const FIRST_NAMES = [
16
+ "Alice", "Bob", "Carol", "David", "Eve",
17
+ "Frank", "Grace", "Hank", "Ivy", "Jack",
18
+ "Karen", "Leo", "Mary", "Noah", "Olive",
19
+ "Paul", "Quinn", "Ruth", "Steve", "Tina",
20
+ ];
21
+ const LAST_NAMES = [
22
+ "Smith", "Jones", "Brown", "Miller", "Davis",
23
+ "Wilson", "Moore", "Taylor", "Anderson", "Thomas",
24
+ "Jackson", "White", "Harris", "Martin", "Young",
25
+ ];
26
+ // Builds `n` synthetic person rows. Name indices are pure functions of `seed` and the row index, so the output is deterministic.
27
+ function personDataset(n: number, seed: number): Row[] {
28
+ const rows: Row[] = [];
29
+ for (let i = 0; i < n; i++) {
30
+ const fi = (seed * 7 + i * 3) % FIRST_NAMES.length; // seed offsets the starting name; i strides through the pool
31
+ const li = (seed * 11 + i * 5) % LAST_NAMES.length; // different multipliers so first/last indices are not locked together
32
+ const first = FIRST_NAMES[fi]!;
33
+ const last = LAST_NAMES[li]!;
34
+ rows.push({
35
+ __row_id__: i, // zero-based row index
36
+ id: `SEED${seed}-ROW${i}`, // unique per row within a dataset; should be skipped by auto-config
37
+ first_name: first,
38
+ last_name: last,
39
+ email: `${first.toLowerCase()}.${last.toLowerCase()}${i}@example.com`, // embeds i, so it differs for every row
40
+ city: ["NYC", "LA", "CHI", "BOS", "SEA"][i % 5]!, // cycles through five values
41
+ });
42
+ }
43
+ return rows;
44
+ }
45
+
46
+ // Introduce typos to simulate overlap between two parties: returns `s` with the character at floor(length / 2) swapped with the one before it.
47
+ function typo(s: string): string {
48
+ if (s.length < 3) return s; // strings shorter than 3 chars are returned unchanged
49
+ // Swap two adjacent middle chars. NOTE: this is a no-op when both chars are equal (e.g. "Miller", "Moore").
50
+ const i = Math.floor(s.length / 2);
51
+ return s.slice(0, i - 1) + s[i]! + s[i - 1]! + s.slice(i + 1); // prefix + s[i] + s[i-1] + suffix
52
+ }
53
+
54
+ // ---------------------------------------------------------------------------
55
+ // autoConfigurePPRL
56
+ // ---------------------------------------------------------------------------
57
+
58
+ describe("autoConfigurePPRL", () => {
59
+ it("picks sensible defaults, skipping ID-like and out-of-range fields", () => {
60
+ const a = personDataset(30, 1); // two 30-row datasets built from different seeds
61
+ const b = personDataset(30, 2);
62
+
63
+ const cfg = autoConfigurePPRL(a, b);
64
+
65
+ // Basic invariants: at most 4 fields, threshold >= 0.85, trusted-third-party protocol, standard security.
66
+ expect(cfg.fields.length).toBeLessThanOrEqual(4);
67
+ expect(cfg.threshold).toBeGreaterThanOrEqual(0.85);
68
+ expect(cfg.protocol).toBe("trusted_third_party");
69
+ expect(cfg.securityLevel).toBe("standard");
70
+
71
+ // `id` has cardinality_ratio=1.0 (unique per-row) => must be skipped.
72
+ expect(cfg.fields).not.toContain("id");
73
+ // `__row_id__` is also unique => must be skipped. (No "out-of-range" field is asserted explicitly in this test.)
74
+ expect(cfg.fields).not.toContain("__row_id__");
75
+ });
76
+
77
+ it("handles a field with mixed null + real values (still considered if null rate < 30%)", () => {
78
+ const a: Row[] = [];
79
+ const b: Row[] = [];
80
+ // 20 rows each: first_name always present; middle_name is null in 4/20 rows (20%) of `a` and 3/20 rows (15%) of `b`.
81
+ for (let i = 0; i < 20; i++) {
82
+ a.push({
83
+ __row_id__: i,
84
+ first_name: FIRST_NAMES[i % FIRST_NAMES.length]!,
85
+ last_name: LAST_NAMES[i % LAST_NAMES.length]!,
86
+ middle_name: i % 5 === 0 ? null : `Middle${i % 4}`, // null at i = 0, 5, 10, 15
87
+ });
88
+ b.push({
89
+ __row_id__: i,
90
+ first_name: FIRST_NAMES[(i + 3) % FIRST_NAMES.length]!, // offset so b's names are shifted relative to a's
91
+ last_name: LAST_NAMES[(i + 2) % LAST_NAMES.length]!,
92
+ middle_name: i % 7 === 0 ? null : `Middle${i % 4}`, // null at i = 0, 7, 14
93
+ });
94
+ }
95
+ const cfg = autoConfigurePPRL(a, b);
96
+ // Middle name's null rate is below 30% on both sides, so it stays eligible. The assertions below only check that some string fields were selected; they do not pin which ones.
97
+ expect(cfg.fields.length).toBeGreaterThan(0);
98
+ expect(cfg.fields.every((f) => typeof f === "string")).toBe(true);
99
+ });
100
+
101
+ it("drops high-null fields (> 30% null rate)", () => {
102
+ const a: Row[] = [];
103
+ const b: Row[] = [];
104
+ for (let i = 0; i < 30; i++) { // 30 rows per side
105
+ a.push({
106
+ __row_id__: i,
107
+ first_name: FIRST_NAMES[i % FIRST_NAMES.length]!,
108
+ last_name: LAST_NAMES[i % LAST_NAMES.length]!,
109
+ optional: i % 2 === 0 ? null : `val${i}`, // 50% null (every even row)
110
+ });
111
+ b.push({
112
+ __row_id__: i,
113
+ first_name: FIRST_NAMES[(i + 2) % FIRST_NAMES.length]!,
114
+ last_name: LAST_NAMES[(i + 1) % LAST_NAMES.length]!,
115
+ optional: i % 2 === 0 ? null : `val${i}`, // same 50% null pattern as in `a`
116
+ });
117
+ }
118
+ const cfg = autoConfigurePPRL(a, b);
119
+ expect(cfg.fields).not.toContain("optional"); // a 50% null rate is above the 30% cutoff named in the test title
120
+ });
121
+ });
122
+
123
+ // ---------------------------------------------------------------------------
124
+ // runPPRL
125
+ // ---------------------------------------------------------------------------
126
+
127
+ describe("runPPRL", () => {
128
+ function twoPartiesWithOverlap() { // builds two 50-row datasets in which rows 0..9 describe the same people; returns { a, b }
129
+ // Dataset A: 50 people, names taken from the pools by index modulo pool length.
130
+ const a: Row[] = [];
131
+ for (let i = 0; i < 50; i++) {
132
+ const first = FIRST_NAMES[i % FIRST_NAMES.length]!;
133
+ const last = LAST_NAMES[i % LAST_NAMES.length]!;
134
+ a.push({
135
+ __row_id__: i,
136
+ first_name: first,
137
+ last_name: last,
138
+ email: `${first.toLowerCase()}.${last.toLowerCase()}${i}@x.com`,
139
+ });
140
+ }
141
+
142
+ // Dataset B: 50 people, first 10 are "the same people" with typos.
143
+ const b: Row[] = [];
144
+ for (let i = 0; i < 10; i++) {
145
+ const first = FIRST_NAMES[i % FIRST_NAMES.length]!;
146
+ const last = LAST_NAMES[i % LAST_NAMES.length]!;
147
+ b.push({
148
+ __row_id__: i,
149
+ // Use typos on last name to simulate noisy overlap; first_name and email are identical to a[i].
150
+ first_name: first,
151
+ last_name: typo(last),
152
+ email: `${first.toLowerCase()}.${last.toLowerCase()}${i}@x.com`, // built from the un-typo'd last name
153
+ });
154
+ }
155
+ for (let i = 10; i < 50; i++) { // the remaining 40 rows use values that do not occur in A
156
+ b.push({
157
+ __row_id__: i,
158
+ first_name: `NonOverlap${i}`,
159
+ last_name: `Different${i}`,
160
+ email: `novel${i}@other.org`,
161
+ });
162
+ }
163
+ return { a, b };
164
+ }
165
+
166
+ it("finds most shared entities in two datasets with partial overlap", () => {
167
+ const { a, b } = twoPartiesWithOverlap();
168
+ const config: PPRLConfig = {
169
+ fields: ["first_name", "last_name", "email"],
170
+ securityLevel: "standard",
171
+ protocol: "trusted_third_party",
172
+ threshold: 0.5,
173
+ };
174
+ const result = runPPRL(a, b, config);
175
+
176
+ // Every match has correct shape, and its score lies in [threshold, 1].
177
+ for (const m of result.matches) {
178
+ expect(m).toHaveProperty("idA");
179
+ expect(m).toHaveProperty("idB");
180
+ expect(m).toHaveProperty("score");
181
+ expect(m.score).toBeGreaterThanOrEqual(config.threshold);
182
+ expect(m.score).toBeLessThanOrEqual(1);
183
+ }
184
+
185
+ // Stats reflect the pass.
186
+ expect(result.stats["comparedPairs"]).toBe(50 * 50); // the full 50 x 50 cross product of the two datasets
187
+ expect(result.stats["matchCount"]).toBe(result.matches.length);
188
+ expect(result.stats["protocol"]).toBe("trusted_third_party");
189
+
190
+ // True pairs should surface: a[i] ~ b[i] for i in 0..9.
191
+ const truePairs = new Set<string>(); // keys are "idA:idB" strings
192
+ for (let i = 0; i < 10; i++) truePairs.add(`${i}:${i}`);
193
+
194
+ let hits = 0;
195
+ for (const m of result.matches) {
196
+ if (truePairs.has(`${m.idA}:${m.idB}`)) hits++;
197
+ }
198
+ // We expect most of the 10 shared entities to be recovered at threshold 0.5; up to 3 misses are tolerated.
199
+ expect(hits).toBeGreaterThanOrEqual(7);
200
+ });
201
+
202
+ it("runs end-to-end with security_level standard/high/paranoid and finds same true pairs", () => {
203
+ const { a, b } = twoPartiesWithOverlap();
204
+ const baseFields: string[] = ["first_name", "last_name", "email"];
205
+
206
+ const runAt = (level: "standard" | "high" | "paranoid") => { // runs runPPRL on the same data at the given security level
207
+ const cfg: PPRLConfig =
208
+ level === "standard"
209
+ ? { // "standard" is configured without a salt
210
+ fields: baseFields,
211
+ securityLevel: level,
212
+ protocol: "trusted_third_party",
213
+ threshold: 0.4,
214
+ }
215
+ : { // "high" and "paranoid" differ from the standard config only by the added salt
216
+ fields: baseFields,
217
+ securityLevel: level,
218
+ protocol: "trusted_third_party",
219
+ threshold: 0.4,
220
+ salt: "shared-secret",
221
+ };
222
+ return runPPRL(a, b, cfg);
223
+ };
224
+
225
+ const rStandard = runAt("standard");
226
+ const rHigh = runAt("high");
227
+ const rParanoid = runAt("paranoid");
228
+
229
+ // All produce matches without throwing.
230
+ expect(rStandard.matches.length).toBeGreaterThan(0);
231
+ expect(rHigh.matches.length).toBeGreaterThan(0);
232
+ expect(rParanoid.matches.length).toBeGreaterThan(0);
233
+
234
+ // The true pairs (a[i] ~ b[i] for i in 0..9) should be found in all three. Each level is only required to recover at least 5 of the 10; the sets are not compared with each other.
235
+ for (const result of [rStandard, rHigh, rParanoid]) {
236
+ const keys = new Set<string>(); // "idA:idB" keys of every reported match
237
+ for (const m of result.matches) keys.add(`${m.idA}:${m.idB}`);
238
+ let hits = 0;
239
+ for (let i = 0; i < 10; i++) if (keys.has(`${i}:${i}`)) hits++;
240
+ expect(hits).toBeGreaterThanOrEqual(5);
241
+ }
242
+ });
243
+
244
+ it("deterministic CLK: running twice on same data gives identical matches", () => {
245
+ const { a, b } = twoPartiesWithOverlap();
246
+ const config: PPRLConfig = {
247
+ fields: ["first_name", "last_name", "email"],
248
+ securityLevel: "standard",
249
+ protocol: "trusted_third_party",
250
+ threshold: 0.5,
251
+ };
252
+ const r1 = runPPRL(a, b, config); // same inputs and same config for both runs
253
+ const r2 = runPPRL(a, b, config);
254
+ expect(r1.matches.length).toBe(r2.matches.length);
255
+ for (let i = 0; i < r1.matches.length; i++) { // compared position by position, so the match order must be identical too
256
+ expect(r1.matches[i]!.idA).toBe(r2.matches[i]!.idA);
257
+ expect(r1.matches[i]!.idB).toBe(r2.matches[i]!.idB);
258
+ expect(r1.matches[i]!.score).toBeCloseTo(r2.matches[i]!.score, 10); // scores compared to 10 decimal digits
259
+ }
260
+ });
261
+
262
+ it("empty rowsA or rowsB returns empty matches", () => {
263
+ const { a } = twoPartiesWithOverlap(); // only dataset A is needed as the non-empty side
264
+ const cfg: PPRLConfig = {
265
+ fields: ["first_name", "last_name"],
266
+ securityLevel: "standard",
267
+ protocol: "trusted_third_party",
268
+ threshold: 0.5,
269
+ };
270
+ expect(runPPRL([], a, cfg).matches).toEqual([]); // empty first argument
271
+ expect(runPPRL(a, [], cfg).matches).toEqual([]); // empty second argument
272
+ expect(runPPRL([], [], cfg).matches).toEqual([]); // both empty
273
+ });
274
+
275
+ it("skips rows that encode to empty strings (all fields null)", () => {
276
+ const a: Row[] = [
277
+ { __row_id__: 0, first_name: "Alice", last_name: "Smith" },
278
+ { __row_id__: 1, first_name: null, last_name: null }, // every configured field is null on this row
279
+ ];
280
+ const b: Row[] = [
281
+ { __row_id__: 0, first_name: "Alice", last_name: "Smith" }, // identical to a[0]
282
+ ];
283
+ const cfg: PPRLConfig = {
284
+ fields: ["first_name", "last_name"],
285
+ securityLevel: "standard",
286
+ protocol: "trusted_third_party",
287
+ threshold: 0.5,
288
+ };
289
+ const result = runPPRL(a, b, cfg);
290
+ // Only the one non-null row on each side produces a match: exactly one result, pairing id 0 with id 0.
291
+ expect(result.matches.length).toBe(1);
292
+ expect(result.matches[0]!.idA).toBe(0);
293
+ expect(result.matches[0]!.idB).toBe(0);
294
+ });
295
+ });
296
+
297
+ // ---------------------------------------------------------------------------
298
+ // Bloom filter output format (via low-level transform)
299
+ // ---------------------------------------------------------------------------
300
+
301
+ describe("bloom filter hex output", () => {
302
+ it("hex length differs across security levels (512/1024/2048 bits per current presets)", async () => {
303
+ // runPPRL does not expose raw encodings, so we import the transform directly (dynamic import inside the test).
304
+ const { applyTransform } = await import("../../src/core/transforms.js");
305
+ const value = "alice smith";
306
+
307
+ const std = applyTransform(value, "bloom_filter:standard")!; // spec strings appear to be `bloom_filter:<level>[:<salt>]` — confirm against transforms.ts
308
+ const high = applyTransform(value, "bloom_filter:high:secret")!;
309
+ const paranoid = applyTransform(value, "bloom_filter:paranoid:secret")!;
310
+
311
+ // hex => 2 chars per byte, i.e. 4 bits per char: expected length = bits / 4.
312
+ // Active presets: standard=512 bits, high=1024, paranoid=2048.
313
+ expect(std.length).toBe(128); // 512 / 4
314
+ expect(high.length).toBe(256); // 1024 / 4
315
+ expect(paranoid.length).toBe(512); // 2048 / 4
316
+
317
+ // Strictly increasing lengths confirm the "larger filter = higher security" invariant.
318
+ expect(std.length).toBeLessThan(high.length);
319
+ expect(high.length).toBeLessThan(paranoid.length);
320
+
321
+ // All valid hex (lowercase digits only).
322
+ for (const s of [std, high, paranoid]) {
323
+ expect(/^[0-9a-f]+$/.test(s)).toBe(true);
324
+ }
325
+ });
326
+ });
327
+
328
+ // ---------------------------------------------------------------------------
329
+ // linkTrustedThirdParty / linkSMC
330
+ // ---------------------------------------------------------------------------
331
+
332
+ describe("link protocol wrappers", () => {
333
+ const a = personDataset(10, 1);
334
+ const b = personDataset(10, 2);
335
+
336
+ it("linkTrustedThirdParty returns a PPRLResult shape", () => {
337
+ const result = linkTrustedThirdParty(a, b, { // a and b are the 10-row datasets declared at the top of this describe block
338
+ fields: ["first_name", "last_name"],
339
+ securityLevel: "standard",
340
+ protocol: "smc", // intentionally wrong; wrapper must normalize it.
341
+ threshold: 0.5,
342
+ });
343
+ expect(Array.isArray(result.matches)).toBe(true);
344
+ expect(result.stats["protocol"]).toBe("trusted_third_party"); // the wrapper is expected to override the "smc" passed in
345
+ });
346
+
347
+ it("linkSMC requires a salt and non-standard security level", () => {
348
+ // Missing salt => throws with a message mentioning "salt".
349
+ expect(() =>
350
+ linkSMC(a, b, {
351
+ fields: ["first_name", "last_name"],
352
+ securityLevel: "high",
353
+ protocol: "smc",
354
+ threshold: 0.5,
355
+ }),
356
+ ).toThrow(/salt/);
357
+
358
+ // standard security => throws, even though a salt is supplied.
359
+ expect(() =>
360
+ linkSMC(a, b, {
361
+ fields: ["first_name", "last_name"],
362
+ securityLevel: "standard",
363
+ protocol: "smc",
364
+ threshold: 0.5,
365
+ salt: "shhh",
366
+ }),
367
+ ).toThrow(/high.*paranoid|paranoid/i); // NOTE(review): the first alternative is subsumed by the second, so this is equivalent to /paranoid/i
368
+
369
+ // Happy path: "high" security with a salt.
370
+ const result = linkSMC(a, b, {
371
+ fields: ["first_name", "last_name"],
372
+ securityLevel: "high",
373
+ protocol: "smc",
374
+ threshold: 0.5,
375
+ salt: "shhh",
376
+ });
377
+ expect(Array.isArray(result.matches)).toBe(true);
378
+ expect(result.stats["protocol"]).toBe("smc");
379
+ expect(result.stats["securityLevel"]).toBe("high");
380
+ });
381
+ });