goldenmatch 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162)
  1. package/README.md +140 -0
  2. package/dist/cli.cjs +6079 -0
  3. package/dist/cli.cjs.map +1 -0
  4. package/dist/cli.d.cts +1 -0
  5. package/dist/cli.d.ts +1 -0
  6. package/dist/cli.js +6076 -0
  7. package/dist/cli.js.map +1 -0
  8. package/dist/core/index.cjs +8449 -0
  9. package/dist/core/index.cjs.map +1 -0
  10. package/dist/core/index.d.cts +1972 -0
  11. package/dist/core/index.d.ts +1972 -0
  12. package/dist/core/index.js +8318 -0
  13. package/dist/core/index.js.map +1 -0
  14. package/dist/index.cjs +8449 -0
  15. package/dist/index.cjs.map +1 -0
  16. package/dist/index.d.cts +2 -0
  17. package/dist/index.d.ts +2 -0
  18. package/dist/index.js +8318 -0
  19. package/dist/index.js.map +1 -0
  20. package/dist/node/backends/score-worker.cjs +934 -0
  21. package/dist/node/backends/score-worker.cjs.map +1 -0
  22. package/dist/node/backends/score-worker.d.cts +14 -0
  23. package/dist/node/backends/score-worker.d.ts +14 -0
  24. package/dist/node/backends/score-worker.js +932 -0
  25. package/dist/node/backends/score-worker.js.map +1 -0
  26. package/dist/node/index.cjs +11430 -0
  27. package/dist/node/index.cjs.map +1 -0
  28. package/dist/node/index.d.cts +554 -0
  29. package/dist/node/index.d.ts +554 -0
  30. package/dist/node/index.js +11277 -0
  31. package/dist/node/index.js.map +1 -0
  32. package/dist/types-DhUdX5Rc.d.cts +304 -0
  33. package/dist/types-DhUdX5Rc.d.ts +304 -0
  34. package/examples/01-basic-dedupe.ts +60 -0
  35. package/examples/02-match-two-datasets.ts +48 -0
  36. package/examples/03-csv-file-pipeline.ts +62 -0
  37. package/examples/04-string-scoring.ts +63 -0
  38. package/examples/05-custom-config.ts +94 -0
  39. package/examples/06-probabilistic-fs.ts +72 -0
  40. package/examples/07-pprl-privacy.ts +76 -0
  41. package/examples/08-streaming.ts +79 -0
  42. package/examples/09-llm-scorer.ts +79 -0
  43. package/examples/10-explain.ts +60 -0
  44. package/examples/11-evaluate.ts +61 -0
  45. package/examples/README.md +53 -0
  46. package/package.json +66 -0
  47. package/src/cli.ts +372 -0
  48. package/src/core/ann-blocker.ts +593 -0
  49. package/src/core/api.ts +220 -0
  50. package/src/core/autoconfig.ts +363 -0
  51. package/src/core/autofix.ts +102 -0
  52. package/src/core/blocker.ts +655 -0
  53. package/src/core/cluster.ts +699 -0
  54. package/src/core/compare-clusters.ts +176 -0
  55. package/src/core/config/loader.ts +869 -0
  56. package/src/core/cross-encoder.ts +614 -0
  57. package/src/core/data.ts +430 -0
  58. package/src/core/domain.ts +277 -0
  59. package/src/core/embedder.ts +562 -0
  60. package/src/core/evaluate.ts +156 -0
  61. package/src/core/explain.ts +352 -0
  62. package/src/core/golden.ts +524 -0
  63. package/src/core/graph-er.ts +371 -0
  64. package/src/core/index.ts +314 -0
  65. package/src/core/ingest.ts +112 -0
  66. package/src/core/learned-blocking.ts +305 -0
  67. package/src/core/lineage.ts +221 -0
  68. package/src/core/llm/budget.ts +258 -0
  69. package/src/core/llm/cluster.ts +542 -0
  70. package/src/core/llm/scorer.ts +396 -0
  71. package/src/core/match-one.ts +95 -0
  72. package/src/core/matchkey.ts +97 -0
  73. package/src/core/memory/corrections.ts +179 -0
  74. package/src/core/memory/learner.ts +218 -0
  75. package/src/core/memory/store.ts +114 -0
  76. package/src/core/pipeline.ts +366 -0
  77. package/src/core/pprl/protocol.ts +216 -0
  78. package/src/core/probabilistic.ts +511 -0
  79. package/src/core/profiler.ts +212 -0
  80. package/src/core/quality.ts +197 -0
  81. package/src/core/review-queue.ts +177 -0
  82. package/src/core/scorer.ts +855 -0
  83. package/src/core/sensitivity.ts +196 -0
  84. package/src/core/standardize.ts +279 -0
  85. package/src/core/streaming.ts +128 -0
  86. package/src/core/transforms.ts +599 -0
  87. package/src/core/types.ts +570 -0
  88. package/src/core/validate.ts +243 -0
  89. package/src/index.ts +8 -0
  90. package/src/node/a2a/server.ts +470 -0
  91. package/src/node/api/server.ts +412 -0
  92. package/src/node/backends/duckdb.ts +130 -0
  93. package/src/node/backends/score-worker.ts +41 -0
  94. package/src/node/backends/workers.ts +212 -0
  95. package/src/node/config-file.ts +66 -0
  96. package/src/node/connectors/base.ts +57 -0
  97. package/src/node/connectors/bigquery.ts +61 -0
  98. package/src/node/connectors/databricks.ts +69 -0
  99. package/src/node/connectors/file.ts +350 -0
  100. package/src/node/connectors/hubspot.ts +62 -0
  101. package/src/node/connectors/index.ts +43 -0
  102. package/src/node/connectors/salesforce.ts +93 -0
  103. package/src/node/connectors/snowflake.ts +73 -0
  104. package/src/node/db/postgres.ts +173 -0
  105. package/src/node/db/sync.ts +103 -0
  106. package/src/node/dedupe-file.ts +156 -0
  107. package/src/node/index.ts +89 -0
  108. package/src/node/mcp/server.ts +940 -0
  109. package/src/node/tui/app.ts +756 -0
  110. package/src/node/tui/index.ts +6 -0
  111. package/src/node/tui/widgets.ts +128 -0
  112. package/tests/parity/scorer-ground-truth.test.ts +118 -0
  113. package/tests/smoke.test.ts +46 -0
  114. package/tests/unit/a2a-server.test.ts +175 -0
  115. package/tests/unit/ann-blocker.test.ts +117 -0
  116. package/tests/unit/api-server.test.ts +239 -0
  117. package/tests/unit/api.test.ts +77 -0
  118. package/tests/unit/autoconfig.test.ts +103 -0
  119. package/tests/unit/autofix.test.ts +71 -0
  120. package/tests/unit/blocker.test.ts +164 -0
  121. package/tests/unit/buildBlocksAsync.test.ts +63 -0
  122. package/tests/unit/cluster.test.ts +213 -0
  123. package/tests/unit/compare-clusters.test.ts +42 -0
  124. package/tests/unit/config-loader.test.ts +301 -0
  125. package/tests/unit/connectors-base.test.ts +48 -0
  126. package/tests/unit/cross-encoder-model.test.ts +198 -0
  127. package/tests/unit/cross-encoder.test.ts +173 -0
  128. package/tests/unit/db-connectors.test.ts +37 -0
  129. package/tests/unit/domain.test.ts +80 -0
  130. package/tests/unit/embedder.test.ts +151 -0
  131. package/tests/unit/evaluate.test.ts +85 -0
  132. package/tests/unit/explain.test.ts +73 -0
  133. package/tests/unit/golden.test.ts +97 -0
  134. package/tests/unit/graph-er.test.ts +173 -0
  135. package/tests/unit/hnsw-ann.test.ts +283 -0
  136. package/tests/unit/hubspot-connector.test.ts +118 -0
  137. package/tests/unit/ingest.test.ts +97 -0
  138. package/tests/unit/learned-blocking.test.ts +134 -0
  139. package/tests/unit/lineage.test.ts +135 -0
  140. package/tests/unit/match-one.test.ts +129 -0
  141. package/tests/unit/matchkey.test.ts +97 -0
  142. package/tests/unit/mcp-server.test.ts +183 -0
  143. package/tests/unit/memory.test.ts +119 -0
  144. package/tests/unit/pipeline.test.ts +118 -0
  145. package/tests/unit/pprl-protocol.test.ts +381 -0
  146. package/tests/unit/probabilistic.test.ts +494 -0
  147. package/tests/unit/profiler.test.ts +68 -0
  148. package/tests/unit/review-queue.test.ts +68 -0
  149. package/tests/unit/salesforce-connector.test.ts +148 -0
  150. package/tests/unit/scorer.test.ts +301 -0
  151. package/tests/unit/sensitivity.test.ts +154 -0
  152. package/tests/unit/standardize.test.ts +84 -0
  153. package/tests/unit/streaming.test.ts +82 -0
  154. package/tests/unit/transforms.test.ts +208 -0
  155. package/tests/unit/tui-widgets.test.ts +42 -0
  156. package/tests/unit/tui.test.ts +24 -0
  157. package/tests/unit/validate.test.ts +145 -0
  158. package/tests/unit/workers-parallel.test.ts +99 -0
  159. package/tests/unit/workers.test.ts +74 -0
  160. package/tsconfig.json +25 -0
  161. package/tsup.config.ts +37 -0
  162. package/vitest.config.ts +11 -0
@@ -0,0 +1,366 @@
1
+ /**
2
+ * pipeline.ts — Core pipeline orchestrator for GoldenMatch-JS.
3
+ * Edge-safe: no `node:` imports, pure TypeScript only.
4
+ *
5
+ * Ports goldenmatch/core/pipeline.py.
6
+ * Chains: standardize -> matchkeys -> block -> score -> cluster -> golden.
7
+ */
8
+
9
+ import type {
10
+ Row,
11
+ GoldenMatchConfig,
12
+ MatchkeyConfig,
13
+ DedupeResult,
14
+ DedupeStats,
15
+ PairKey,
16
+ ScoredPair,
17
+ MatchResult,
18
+ GoldenRulesConfig,
19
+ ClusterInfo,
20
+ } from "./types.js";
21
+ import { makeGoldenRulesConfig, getMatchkeys, makeBlockingConfig } from "./types.js";
22
+ import { computeMatchkeys, addRowIds, addSourceColumn } from "./matchkey.js";
23
+ import { applyStandardization } from "./standardize.js";
24
+ import { buildBlocks } from "./blocker.js";
25
+ import {
26
+ findExactMatches,
27
+ scoreBlocksSequential,
28
+ } from "./scorer.js";
29
+ import { buildClusters, pairKey } from "./cluster.js";
30
+ import { buildGoldenRecord } from "./golden.js";
31
+
32
+ // ---------------------------------------------------------------------------
33
+ // Options
34
+ // ---------------------------------------------------------------------------
35
+
36
/** Optional behaviour flags for {@link runDedupePipeline}. */
export interface DedupeOptions {
  /** When explicitly false, skip building golden records (default: build them). */
  readonly outputGolden?: boolean;
  /** Reserved report flag — not read by the pipeline functions in this file; presumably consumed by a caller (TODO confirm). */
  readonly outputReport?: boolean;
  /** When true, keep only pairs whose two rows have different `__source__` values. */
  readonly acrossFilesOnly?: boolean;
}
41
+
42
+ // ---------------------------------------------------------------------------
43
+ // Internal helpers
44
+ // ---------------------------------------------------------------------------
45
+
46
+ /** Build a source lookup map from rows (rowId -> source name). */
47
+ function buildSourceLookup(rows: readonly Row[]): Map<number, string> {
48
+ const lookup = new Map<number, string>();
49
+ for (const row of rows) {
50
+ const id = row.__row_id__ as number;
51
+ const src = row.__source__ as string | undefined;
52
+ if (id !== undefined && src !== undefined) {
53
+ lookup.set(id, src);
54
+ }
55
+ }
56
+ return lookup;
57
+ }
58
+
59
+ /** Collect all row IDs from rows. */
60
+ function collectRowIds(rows: readonly Row[]): number[] {
61
+ return rows.map((r) => r.__row_id__ as number);
62
+ }
63
+
64
+ /** Assign __cluster_id__ to rows based on cluster membership. */
65
+ function assignClusterIds(
66
+ rows: readonly Row[],
67
+ clusters: ReadonlyMap<number, ClusterInfo>,
68
+ ): Row[] {
69
+ // Build rowId -> clusterId lookup
70
+ const rowToCluster = new Map<number, number>();
71
+ for (const [cid, cinfo] of clusters) {
72
+ for (const memberId of cinfo.members) {
73
+ rowToCluster.set(memberId, cid);
74
+ }
75
+ }
76
+
77
+ return rows.map((row) => {
78
+ const rowId = row.__row_id__ as number;
79
+ const cid = rowToCluster.get(rowId);
80
+ return cid !== undefined ? { ...row, __cluster_id__: cid } : row;
81
+ });
82
+ }
83
+
84
+ // ---------------------------------------------------------------------------
85
+ // runDedupePipeline
86
+ // ---------------------------------------------------------------------------
87
+
88
+ /**
89
+ * Run the full deduplication pipeline.
90
+ *
91
+ * Steps:
92
+ * 1. Add __row_id__ and __source__ if not present
93
+ * 2. Apply standardization
94
+ * 3. Compute matchkeys
95
+ * 4. Phase 1: Exact matchkeys (hash-based grouping)
96
+ * 5. Phase 2: Fuzzy matchkeys (block + score)
97
+ * 6. Phase 3: Cluster (Union-Find with MST splitting)
98
+ * 7. Phase 4: Build golden records for multi-member clusters
99
+ * 8. Classify dupes vs unique
100
+ * 9. Compute stats
101
+ * 10. Return DedupeResult
102
+ */
103
+ export function runDedupePipeline(
104
+ rows: readonly Row[],
105
+ config: GoldenMatchConfig,
106
+ options?: DedupeOptions,
107
+ ): DedupeResult {
108
+ if (rows.length === 0) {
109
+ return _emptyDedupeResult(config);
110
+ }
111
+
112
+ const matchkeys = getMatchkeys(config);
113
+ const goldenRules = config.goldenRules ?? makeGoldenRulesConfig();
114
+ const blockingConfig = config.blocking ?? makeBlockingConfig();
115
+ const acrossFilesOnly = options?.acrossFilesOnly ?? false;
116
+
117
+ // ---- Step 1: Add __row_id__ and __source__ ----
118
+ let processed: Row[] = rows.map((r, i) => {
119
+ const extra: Record<string, unknown> = {};
120
+ if (r.__row_id__ === undefined) extra.__row_id__ = i;
121
+ if (r.__source__ === undefined) extra.__source__ = "default";
122
+ return Object.keys(extra).length > 0 ? { ...r, ...extra } : (r as Row);
123
+ });
124
+
125
+ // ---- Step 2: Apply standardization ----
126
+ if (config.standardization) {
127
+ processed = applyStandardization(processed, config.standardization.rules);
128
+ }
129
+
130
+ // ---- Step 3: Compute matchkeys ----
131
+ processed = computeMatchkeys(processed, matchkeys);
132
+
133
+ // ---- Step 4 & 5: Score exact + fuzzy matchkeys ----
134
+ const allPairs: ScoredPair[] = [];
135
+ const matchedPairKeys = new Set<PairKey>();
136
+ const sourceLookup = buildSourceLookup(processed);
137
+
138
+ for (const mk of matchkeys) {
139
+ if (mk.type === "exact") {
140
+ // Phase 1: Exact matching via hash grouping
141
+ let pairs = findExactMatches(processed, mk);
142
+
143
+ // Cross-file filter
144
+ if (acrossFilesOnly) {
145
+ pairs = pairs.filter((p) => {
146
+ const srcA = sourceLookup.get(p.idA);
147
+ const srcB = sourceLookup.get(p.idB);
148
+ return srcA !== srcB;
149
+ });
150
+ }
151
+
152
+ for (const p of pairs) {
153
+ const key = pairKey(p.idA, p.idB);
154
+ if (!matchedPairKeys.has(key)) {
155
+ matchedPairKeys.add(key);
156
+ allPairs.push(p);
157
+ }
158
+ }
159
+ } else {
160
+ // Phase 2: Fuzzy (weighted/probabilistic) — block then score
161
+ const blocks = buildBlocks(processed, blockingConfig);
162
+
163
+ const pairs = scoreBlocksSequential(blocks, mk, matchedPairKeys, {
164
+ acrossFilesOnly,
165
+ sourceLookup,
166
+ });
167
+
168
+ for (const p of pairs) {
169
+ allPairs.push(p);
170
+ }
171
+ }
172
+ }
173
+
174
+ // ---- Step 6: Cluster ----
175
+ const allIds = collectRowIds(processed);
176
+ const pairTuples: [number, number, number][] = allPairs.map((p) => [
177
+ p.idA,
178
+ p.idB,
179
+ p.score,
180
+ ]);
181
+
182
+ const clusters = buildClusters(pairTuples, allIds, {
183
+ maxClusterSize: goldenRules.maxClusterSize,
184
+ weakClusterThreshold: goldenRules.weakClusterThreshold,
185
+ autoSplit: goldenRules.autoSplit,
186
+ });
187
+
188
+ // ---- Step 7: Build golden records ----
189
+ const rowsWithClusters = assignClusterIds(processed, clusters);
190
+ const goldenRecords: Row[] = [];
191
+
192
+ if (options?.outputGolden !== false) {
193
+ for (const [cid, cinfo] of clusters) {
194
+ if (cinfo.size < 2) continue; // Only build golden for multi-member clusters
195
+
196
+ const clusterRows = rowsWithClusters.filter(
197
+ (r) => (r.__cluster_id__ as number) === cid,
198
+ );
199
+ const golden = buildGoldenRecord(clusterRows, goldenRules);
200
+
201
+ const goldenRow: Record<string, unknown> = {
202
+ __cluster_id__: cid,
203
+ __golden_confidence__: golden.goldenConfidence,
204
+ };
205
+ for (const [col, info] of Object.entries(golden.fields)) {
206
+ goldenRow[col] = info.value;
207
+ }
208
+ goldenRecords.push(goldenRow as Row);
209
+ }
210
+ }
211
+
212
+ // ---- Step 8: Classify dupes vs unique ----
213
+ const multiMemberClusterIds = new Set<number>();
214
+ for (const [cid, cinfo] of clusters) {
215
+ if (cinfo.size >= 2) multiMemberClusterIds.add(cid);
216
+ }
217
+
218
+ const dupeRowIds = new Set<number>();
219
+ for (const [, cinfo] of clusters) {
220
+ if (cinfo.size >= 2) {
221
+ for (const m of cinfo.members) {
222
+ dupeRowIds.add(m);
223
+ }
224
+ }
225
+ }
226
+
227
+ const dupes: Row[] = [];
228
+ const unique: Row[] = [];
229
+ for (const row of rowsWithClusters) {
230
+ const rowId = row.__row_id__ as number;
231
+ if (dupeRowIds.has(rowId)) {
232
+ dupes.push(row);
233
+ } else {
234
+ unique.push(row);
235
+ }
236
+ }
237
+
238
+ // ---- Step 9: Compute stats ----
239
+ const totalRecords = processed.length;
240
+ const totalClusters = clusters.size;
241
+ const matchedRecords = dupes.length;
242
+ const uniqueRecords = unique.length;
243
+ const matchRate = totalRecords > 0 ? matchedRecords / totalRecords : 0;
244
+
245
+ const stats: DedupeStats = {
246
+ totalRecords,
247
+ totalClusters,
248
+ matchRate,
249
+ matchedRecords,
250
+ uniqueRecords,
251
+ };
252
+
253
+ // ---- Step 10: Return result ----
254
+ return {
255
+ goldenRecords,
256
+ clusters,
257
+ dupes,
258
+ unique,
259
+ stats,
260
+ scoredPairs: allPairs,
261
+ config,
262
+ };
263
+ }
264
+
265
+ // ---------------------------------------------------------------------------
266
+ // runMatchPipeline
267
+ // ---------------------------------------------------------------------------
268
+
269
+ /**
270
+ * Run the match pipeline: match target rows against reference rows.
271
+ *
272
+ * - Assigns __row_id__ with offset for reference rows
273
+ * - Assigns __source__ ("target" / "reference")
274
+ * - Runs same pipeline but filters to cross-source pairs only
275
+ */
276
+ export function runMatchPipeline(
277
+ targetRows: readonly Row[],
278
+ referenceRows: readonly Row[],
279
+ config: GoldenMatchConfig,
280
+ ): MatchResult {
281
+ if (targetRows.length === 0 || referenceRows.length === 0) {
282
+ return {
283
+ matched: [],
284
+ unmatched: [...targetRows],
285
+ stats: {
286
+ totalTarget: targetRows.length,
287
+ totalReference: referenceRows.length,
288
+ matchedCount: 0,
289
+ unmatchedCount: targetRows.length,
290
+ matchRate: 0,
291
+ },
292
+ };
293
+ }
294
+
295
+ // Add row IDs and source labels
296
+ const target = addSourceColumn(addRowIds(targetRows, 0), "target");
297
+ const reference = addSourceColumn(
298
+ addRowIds(referenceRows, targetRows.length),
299
+ "reference",
300
+ );
301
+
302
+ // Combine and run dedupe pipeline with cross-file filter
303
+ const combined = [...target, ...reference];
304
+ const result = runDedupePipeline(combined, config, {
305
+ acrossFilesOnly: true,
306
+ outputGolden: false,
307
+ });
308
+
309
+ // Track which target row IDs got matched
310
+ const targetIds = new Set<number>(
311
+ target.map((r) => r.__row_id__ as number),
312
+ );
313
+ const matchedTargetIds = new Set<number>();
314
+
315
+ for (const pair of result.scoredPairs) {
316
+ if (targetIds.has(pair.idA)) matchedTargetIds.add(pair.idA);
317
+ if (targetIds.has(pair.idB)) matchedTargetIds.add(pair.idB);
318
+ }
319
+
320
+ // Build matched/unmatched from original target rows
321
+ const matched: Row[] = [];
322
+ const unmatched: Row[] = [];
323
+ for (const row of target) {
324
+ const rowId = row.__row_id__ as number;
325
+ if (matchedTargetIds.has(rowId)) {
326
+ matched.push(row);
327
+ } else {
328
+ unmatched.push(row);
329
+ }
330
+ }
331
+
332
+ return {
333
+ matched,
334
+ unmatched,
335
+ stats: {
336
+ totalTarget: targetRows.length,
337
+ totalReference: referenceRows.length,
338
+ matchedCount: matched.length,
339
+ unmatchedCount: unmatched.length,
340
+ matchRate:
341
+ targetRows.length > 0 ? matched.length / targetRows.length : 0,
342
+ },
343
+ };
344
+ }
345
+
346
+ // ---------------------------------------------------------------------------
347
+ // Internal: empty result
348
+ // ---------------------------------------------------------------------------
349
+
350
+ function _emptyDedupeResult(config: GoldenMatchConfig): DedupeResult {
351
+ return {
352
+ goldenRecords: [],
353
+ clusters: new Map(),
354
+ dupes: [],
355
+ unique: [],
356
+ stats: {
357
+ totalRecords: 0,
358
+ totalClusters: 0,
359
+ matchRate: 0,
360
+ matchedRecords: 0,
361
+ uniqueRecords: 0,
362
+ },
363
+ scoredPairs: [],
364
+ config,
365
+ };
366
+ }
@@ -0,0 +1,216 @@
1
+ /**
2
+ * pprl/protocol.ts — Privacy-preserving record linkage.
3
+ * Edge-safe: no `node:` imports.
4
+ *
5
+ * Ports goldenmatch/pprl/protocol.py. Encodes both datasets as bloom
6
+ * filters (CLKs) over the selected fields, then scores pairs via Dice or
7
+ * Jaccard similarity. Two protocol stubs are surfaced: trusted third
8
+ * party (no crypto beyond the bloom filter itself) and a simple SMC
9
+ * sketch that adds a salt per party before encoding.
10
+ */
11
+
12
+ import type { Row } from "../types.js";
13
+ import { applyTransform } from "../transforms.js";
14
+ import { diceCoefficient } from "../scorer.js";
15
+ import { profileRows, type ColumnProfile } from "../profiler.js";
16
+
17
+ // ---------------------------------------------------------------------------
18
+ // Types
19
+ // ---------------------------------------------------------------------------
20
+
21
+ export interface PPRLConfig {
22
+ readonly fields: readonly string[];
23
+ readonly securityLevel: "standard" | "high" | "paranoid";
24
+ readonly protocol: "trusted_third_party" | "smc";
25
+ readonly threshold: number;
26
+ /** Optional salt used with "high"/"paranoid" levels. */
27
+ readonly salt?: string;
28
+ }
29
+
30
+ export interface PPRLMatch {
31
+ readonly idA: number;
32
+ readonly idB: number;
33
+ readonly score: number;
34
+ }
35
+
36
+ export interface PPRLResult {
37
+ readonly matches: readonly PPRLMatch[];
38
+ readonly stats: Readonly<Record<string, unknown>>;
39
+ }
40
+
41
+ // ---------------------------------------------------------------------------
42
+ // Helpers
43
+ // ---------------------------------------------------------------------------
44
+
45
+ function rowString(row: Row, fields: readonly string[]): string {
46
+ const parts: string[] = [];
47
+ for (const f of fields) {
48
+ const v = row[f];
49
+ if (v === null || v === undefined) continue;
50
+ const s = typeof v === "string" ? v : String(v);
51
+ const normalized = applyTransform(s, "lowercase") ?? s;
52
+ const cleaned = applyTransform(normalized, "normalize_whitespace") ?? normalized;
53
+ parts.push(cleaned);
54
+ }
55
+ return parts.join(" ");
56
+ }
57
+
58
+ function encodeRow(row: Row, config: PPRLConfig): string {
59
+ const value = rowString(row, config.fields);
60
+ if (value.length === 0) return "";
61
+ const transformName =
62
+ config.salt && (config.securityLevel === "high" || config.securityLevel === "paranoid")
63
+ ? `bloom_filter:${config.securityLevel}:${config.salt}`
64
+ : `bloom_filter:${config.securityLevel}`;
65
+ return applyTransform(value, transformName) ?? "";
66
+ }
67
+
68
+ // ---------------------------------------------------------------------------
69
+ // Core linkage
70
+ // ---------------------------------------------------------------------------
71
+
72
+ /**
73
+ * Encode both row sets as bloom filters and emit pair matches above the
74
+ * configured threshold.
75
+ */
76
+ export function runPPRL(
77
+ rowsA: readonly Row[],
78
+ rowsB: readonly Row[],
79
+ config: PPRLConfig,
80
+ ): PPRLResult {
81
+ const encodedA: string[] = rowsA.map((r) => encodeRow(r, config));
82
+ const encodedB: string[] = rowsB.map((r) => encodeRow(r, config));
83
+
84
+ const matches: PPRLMatch[] = [];
85
+ let compared = 0;
86
+
87
+ for (let i = 0; i < encodedA.length; i++) {
88
+ const a = encodedA[i]!;
89
+ if (a.length === 0) continue;
90
+ for (let j = 0; j < encodedB.length; j++) {
91
+ const b = encodedB[j]!;
92
+ if (b.length === 0) continue;
93
+ compared++;
94
+ const score = diceCoefficient(a, b);
95
+ if (score >= config.threshold) {
96
+ matches.push({ idA: i, idB: j, score });
97
+ }
98
+ }
99
+ }
100
+
101
+ return {
102
+ matches,
103
+ stats: {
104
+ protocol: config.protocol,
105
+ securityLevel: config.securityLevel,
106
+ comparedPairs: compared,
107
+ matchCount: matches.length,
108
+ threshold: config.threshold,
109
+ fields: config.fields,
110
+ },
111
+ };
112
+ }
113
+
114
+ // ---------------------------------------------------------------------------
115
+ // Auto-config
116
+ // ---------------------------------------------------------------------------
117
+
118
+ const MIN_LENGTH = 3;
119
+ const MAX_LENGTH = 15;
120
+ const MAX_FIELDS = 4;
121
+ const MIN_THRESHOLD = 0.85;
122
+
123
+ /**
124
+ * Auto-pick PPRL parameters for the given dataset pair. Penalizes
125
+ * near-unique fields (IDs), over-long fields, and high-null fields.
126
+ */
127
+ export function autoConfigurePPRL(
128
+ rowsA: readonly Row[],
129
+ rowsB: readonly Row[],
130
+ ): PPRLConfig {
131
+ const profileA = profileRows(rowsA);
132
+ const profileB = profileRows(rowsB);
133
+
134
+ const commonCols = new Set<string>();
135
+ for (const c of profileA.columns) {
136
+ if (profileB.byName[c.name]) commonCols.add(c.name);
137
+ }
138
+
139
+ interface Candidate {
140
+ readonly name: string;
141
+ readonly score: number;
142
+ }
143
+
144
+ const candidates: Candidate[] = [];
145
+ for (const name of commonCols) {
146
+ const pa = profileA.byName[name];
147
+ const pb = profileB.byName[name];
148
+ if (!pa || !pb) continue;
149
+
150
+ const nullRate = Math.max(pa.nullRate, pb.nullRate);
151
+ if (nullRate > 0.3) continue;
152
+
153
+ const avgLen = (pa.avgLength + pb.avgLength) / 2;
154
+ if (avgLen < MIN_LENGTH) continue;
155
+ if (avgLen > MAX_LENGTH) continue;
156
+
157
+ // Penalize near-unique fields (likely IDs)
158
+ const card = Math.max(pa.cardinalityRatio, pb.cardinalityRatio);
159
+ if (card > 0.95) continue;
160
+
161
+ // Score: prefer moderate cardinality, low nulls, moderate length.
162
+ const lenPenalty = Math.abs(avgLen - 8) / 8;
163
+ const score = (1 - nullRate) * (1 - Math.abs(card - 0.5)) * (1 - lenPenalty);
164
+
165
+ candidates.push({ name, score });
166
+ }
167
+
168
+ candidates.sort((a, b) => b.score - a.score);
169
+ const fields = candidates.slice(0, MAX_FIELDS).map((c) => c.name);
170
+
171
+ return {
172
+ fields,
173
+ securityLevel: "standard",
174
+ protocol: "trusted_third_party",
175
+ threshold: MIN_THRESHOLD,
176
+ };
177
+ }
178
+
179
+ // ---------------------------------------------------------------------------
180
+ // Protocol wrappers (API-parity stubs)
181
+ // ---------------------------------------------------------------------------
182
+
183
+ /**
184
+ * Trusted-third-party linkage: both parties ship encoded CLKs to a
185
+ * trusted intermediary that runs the similarity scoring. Same mechanics
186
+ * as `runPPRL`, but callsite is semantically distinct.
187
+ */
188
+ export function linkTrustedThirdParty(
189
+ rowsA: readonly Row[],
190
+ rowsB: readonly Row[],
191
+ config: PPRLConfig,
192
+ ): PPRLResult {
193
+ return runPPRL(rowsA, rowsB, { ...config, protocol: "trusted_third_party" });
194
+ }
195
+
196
+ /**
197
+ * Secure-multiparty-computation linkage (simplified): each party salts
198
+ * its inputs with a shared secret. Requires a non-empty `salt` in config
199
+ * and a "high"/"paranoid" security level.
200
+ */
201
+ export function linkSMC(
202
+ rowsA: readonly Row[],
203
+ rowsB: readonly Row[],
204
+ config: PPRLConfig,
205
+ ): PPRLResult {
206
+ if (!config.salt || config.salt.length === 0) {
207
+ throw new Error("SMC protocol requires a non-empty `salt`");
208
+ }
209
+ if (config.securityLevel === "standard") {
210
+ throw new Error("SMC protocol requires securityLevel of 'high' or 'paranoid'");
211
+ }
212
+ return runPPRL(rowsA, rowsB, { ...config, protocol: "smc" });
213
+ }
214
+
215
+ // Re-export profile type for consumers that want it alongside.
216
+ export type { ColumnProfile };