goldenmatch 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +140 -0
- package/dist/cli.cjs +6079 -0
- package/dist/cli.cjs.map +1 -0
- package/dist/cli.d.cts +1 -0
- package/dist/cli.d.ts +1 -0
- package/dist/cli.js +6076 -0
- package/dist/cli.js.map +1 -0
- package/dist/core/index.cjs +8449 -0
- package/dist/core/index.cjs.map +1 -0
- package/dist/core/index.d.cts +1972 -0
- package/dist/core/index.d.ts +1972 -0
- package/dist/core/index.js +8318 -0
- package/dist/core/index.js.map +1 -0
- package/dist/index.cjs +8449 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +2 -0
- package/dist/index.d.ts +2 -0
- package/dist/index.js +8318 -0
- package/dist/index.js.map +1 -0
- package/dist/node/backends/score-worker.cjs +934 -0
- package/dist/node/backends/score-worker.cjs.map +1 -0
- package/dist/node/backends/score-worker.d.cts +14 -0
- package/dist/node/backends/score-worker.d.ts +14 -0
- package/dist/node/backends/score-worker.js +932 -0
- package/dist/node/backends/score-worker.js.map +1 -0
- package/dist/node/index.cjs +11430 -0
- package/dist/node/index.cjs.map +1 -0
- package/dist/node/index.d.cts +554 -0
- package/dist/node/index.d.ts +554 -0
- package/dist/node/index.js +11277 -0
- package/dist/node/index.js.map +1 -0
- package/dist/types-DhUdX5Rc.d.cts +304 -0
- package/dist/types-DhUdX5Rc.d.ts +304 -0
- package/examples/01-basic-dedupe.ts +60 -0
- package/examples/02-match-two-datasets.ts +48 -0
- package/examples/03-csv-file-pipeline.ts +62 -0
- package/examples/04-string-scoring.ts +63 -0
- package/examples/05-custom-config.ts +94 -0
- package/examples/06-probabilistic-fs.ts +72 -0
- package/examples/07-pprl-privacy.ts +76 -0
- package/examples/08-streaming.ts +79 -0
- package/examples/09-llm-scorer.ts +79 -0
- package/examples/10-explain.ts +60 -0
- package/examples/11-evaluate.ts +61 -0
- package/examples/README.md +53 -0
- package/package.json +66 -0
- package/src/cli.ts +372 -0
- package/src/core/ann-blocker.ts +593 -0
- package/src/core/api.ts +220 -0
- package/src/core/autoconfig.ts +363 -0
- package/src/core/autofix.ts +102 -0
- package/src/core/blocker.ts +655 -0
- package/src/core/cluster.ts +699 -0
- package/src/core/compare-clusters.ts +176 -0
- package/src/core/config/loader.ts +869 -0
- package/src/core/cross-encoder.ts +614 -0
- package/src/core/data.ts +430 -0
- package/src/core/domain.ts +277 -0
- package/src/core/embedder.ts +562 -0
- package/src/core/evaluate.ts +156 -0
- package/src/core/explain.ts +352 -0
- package/src/core/golden.ts +524 -0
- package/src/core/graph-er.ts +371 -0
- package/src/core/index.ts +314 -0
- package/src/core/ingest.ts +112 -0
- package/src/core/learned-blocking.ts +305 -0
- package/src/core/lineage.ts +221 -0
- package/src/core/llm/budget.ts +258 -0
- package/src/core/llm/cluster.ts +542 -0
- package/src/core/llm/scorer.ts +396 -0
- package/src/core/match-one.ts +95 -0
- package/src/core/matchkey.ts +97 -0
- package/src/core/memory/corrections.ts +179 -0
- package/src/core/memory/learner.ts +218 -0
- package/src/core/memory/store.ts +114 -0
- package/src/core/pipeline.ts +366 -0
- package/src/core/pprl/protocol.ts +216 -0
- package/src/core/probabilistic.ts +511 -0
- package/src/core/profiler.ts +212 -0
- package/src/core/quality.ts +197 -0
- package/src/core/review-queue.ts +177 -0
- package/src/core/scorer.ts +855 -0
- package/src/core/sensitivity.ts +196 -0
- package/src/core/standardize.ts +279 -0
- package/src/core/streaming.ts +128 -0
- package/src/core/transforms.ts +599 -0
- package/src/core/types.ts +570 -0
- package/src/core/validate.ts +243 -0
- package/src/index.ts +8 -0
- package/src/node/a2a/server.ts +470 -0
- package/src/node/api/server.ts +412 -0
- package/src/node/backends/duckdb.ts +130 -0
- package/src/node/backends/score-worker.ts +41 -0
- package/src/node/backends/workers.ts +212 -0
- package/src/node/config-file.ts +66 -0
- package/src/node/connectors/base.ts +57 -0
- package/src/node/connectors/bigquery.ts +61 -0
- package/src/node/connectors/databricks.ts +69 -0
- package/src/node/connectors/file.ts +350 -0
- package/src/node/connectors/hubspot.ts +62 -0
- package/src/node/connectors/index.ts +43 -0
- package/src/node/connectors/salesforce.ts +93 -0
- package/src/node/connectors/snowflake.ts +73 -0
- package/src/node/db/postgres.ts +173 -0
- package/src/node/db/sync.ts +103 -0
- package/src/node/dedupe-file.ts +156 -0
- package/src/node/index.ts +89 -0
- package/src/node/mcp/server.ts +940 -0
- package/src/node/tui/app.ts +756 -0
- package/src/node/tui/index.ts +6 -0
- package/src/node/tui/widgets.ts +128 -0
- package/tests/parity/scorer-ground-truth.test.ts +118 -0
- package/tests/smoke.test.ts +46 -0
- package/tests/unit/a2a-server.test.ts +175 -0
- package/tests/unit/ann-blocker.test.ts +117 -0
- package/tests/unit/api-server.test.ts +239 -0
- package/tests/unit/api.test.ts +77 -0
- package/tests/unit/autoconfig.test.ts +103 -0
- package/tests/unit/autofix.test.ts +71 -0
- package/tests/unit/blocker.test.ts +164 -0
- package/tests/unit/buildBlocksAsync.test.ts +63 -0
- package/tests/unit/cluster.test.ts +213 -0
- package/tests/unit/compare-clusters.test.ts +42 -0
- package/tests/unit/config-loader.test.ts +301 -0
- package/tests/unit/connectors-base.test.ts +48 -0
- package/tests/unit/cross-encoder-model.test.ts +198 -0
- package/tests/unit/cross-encoder.test.ts +173 -0
- package/tests/unit/db-connectors.test.ts +37 -0
- package/tests/unit/domain.test.ts +80 -0
- package/tests/unit/embedder.test.ts +151 -0
- package/tests/unit/evaluate.test.ts +85 -0
- package/tests/unit/explain.test.ts +73 -0
- package/tests/unit/golden.test.ts +97 -0
- package/tests/unit/graph-er.test.ts +173 -0
- package/tests/unit/hnsw-ann.test.ts +283 -0
- package/tests/unit/hubspot-connector.test.ts +118 -0
- package/tests/unit/ingest.test.ts +97 -0
- package/tests/unit/learned-blocking.test.ts +134 -0
- package/tests/unit/lineage.test.ts +135 -0
- package/tests/unit/match-one.test.ts +129 -0
- package/tests/unit/matchkey.test.ts +97 -0
- package/tests/unit/mcp-server.test.ts +183 -0
- package/tests/unit/memory.test.ts +119 -0
- package/tests/unit/pipeline.test.ts +118 -0
- package/tests/unit/pprl-protocol.test.ts +381 -0
- package/tests/unit/probabilistic.test.ts +494 -0
- package/tests/unit/profiler.test.ts +68 -0
- package/tests/unit/review-queue.test.ts +68 -0
- package/tests/unit/salesforce-connector.test.ts +148 -0
- package/tests/unit/scorer.test.ts +301 -0
- package/tests/unit/sensitivity.test.ts +154 -0
- package/tests/unit/standardize.test.ts +84 -0
- package/tests/unit/streaming.test.ts +82 -0
- package/tests/unit/transforms.test.ts +208 -0
- package/tests/unit/tui-widgets.test.ts +42 -0
- package/tests/unit/tui.test.ts +24 -0
- package/tests/unit/validate.test.ts +145 -0
- package/tests/unit/workers-parallel.test.ts +99 -0
- package/tests/unit/workers.test.ts +74 -0
- package/tsconfig.json +25 -0
- package/tsup.config.ts +37 -0
- package/vitest.config.ts +11 -0
|
@@ -0,0 +1,366 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* pipeline.ts — Core pipeline orchestrator for GoldenMatch-JS.
|
|
3
|
+
* Edge-safe: no `node:` imports, pure TypeScript only.
|
|
4
|
+
*
|
|
5
|
+
* Ports goldenmatch/core/pipeline.py.
|
|
6
|
+
* Chains: standardize -> matchkeys -> block -> score -> cluster -> golden.
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
import type {
|
|
10
|
+
Row,
|
|
11
|
+
GoldenMatchConfig,
|
|
12
|
+
MatchkeyConfig,
|
|
13
|
+
DedupeResult,
|
|
14
|
+
DedupeStats,
|
|
15
|
+
PairKey,
|
|
16
|
+
ScoredPair,
|
|
17
|
+
MatchResult,
|
|
18
|
+
GoldenRulesConfig,
|
|
19
|
+
ClusterInfo,
|
|
20
|
+
} from "./types.js";
|
|
21
|
+
import { makeGoldenRulesConfig, getMatchkeys, makeBlockingConfig } from "./types.js";
|
|
22
|
+
import { computeMatchkeys, addRowIds, addSourceColumn } from "./matchkey.js";
|
|
23
|
+
import { applyStandardization } from "./standardize.js";
|
|
24
|
+
import { buildBlocks } from "./blocker.js";
|
|
25
|
+
import {
|
|
26
|
+
findExactMatches,
|
|
27
|
+
scoreBlocksSequential,
|
|
28
|
+
} from "./scorer.js";
|
|
29
|
+
import { buildClusters, pairKey } from "./cluster.js";
|
|
30
|
+
import { buildGoldenRecord } from "./golden.js";
|
|
31
|
+
|
|
32
|
+
// ---------------------------------------------------------------------------
|
|
33
|
+
// Options
|
|
34
|
+
// ---------------------------------------------------------------------------
|
|
35
|
+
|
|
36
|
+
export interface DedupeOptions {
|
|
37
|
+
readonly outputGolden?: boolean;
|
|
38
|
+
readonly outputReport?: boolean;
|
|
39
|
+
readonly acrossFilesOnly?: boolean;
|
|
40
|
+
}
|
|
41
|
+
|
|
42
|
+
// ---------------------------------------------------------------------------
|
|
43
|
+
// Internal helpers
|
|
44
|
+
// ---------------------------------------------------------------------------
|
|
45
|
+
|
|
46
|
+
/** Build a source lookup map from rows (rowId -> source name). */
|
|
47
|
+
function buildSourceLookup(rows: readonly Row[]): Map<number, string> {
|
|
48
|
+
const lookup = new Map<number, string>();
|
|
49
|
+
for (const row of rows) {
|
|
50
|
+
const id = row.__row_id__ as number;
|
|
51
|
+
const src = row.__source__ as string | undefined;
|
|
52
|
+
if (id !== undefined && src !== undefined) {
|
|
53
|
+
lookup.set(id, src);
|
|
54
|
+
}
|
|
55
|
+
}
|
|
56
|
+
return lookup;
|
|
57
|
+
}
|
|
58
|
+
|
|
59
|
+
/** Collect all row IDs from rows. */
|
|
60
|
+
function collectRowIds(rows: readonly Row[]): number[] {
|
|
61
|
+
return rows.map((r) => r.__row_id__ as number);
|
|
62
|
+
}
|
|
63
|
+
|
|
64
|
+
/** Assign __cluster_id__ to rows based on cluster membership. */
|
|
65
|
+
function assignClusterIds(
|
|
66
|
+
rows: readonly Row[],
|
|
67
|
+
clusters: ReadonlyMap<number, ClusterInfo>,
|
|
68
|
+
): Row[] {
|
|
69
|
+
// Build rowId -> clusterId lookup
|
|
70
|
+
const rowToCluster = new Map<number, number>();
|
|
71
|
+
for (const [cid, cinfo] of clusters) {
|
|
72
|
+
for (const memberId of cinfo.members) {
|
|
73
|
+
rowToCluster.set(memberId, cid);
|
|
74
|
+
}
|
|
75
|
+
}
|
|
76
|
+
|
|
77
|
+
return rows.map((row) => {
|
|
78
|
+
const rowId = row.__row_id__ as number;
|
|
79
|
+
const cid = rowToCluster.get(rowId);
|
|
80
|
+
return cid !== undefined ? { ...row, __cluster_id__: cid } : row;
|
|
81
|
+
});
|
|
82
|
+
}
|
|
83
|
+
|
|
84
|
+
// ---------------------------------------------------------------------------
|
|
85
|
+
// runDedupePipeline
|
|
86
|
+
// ---------------------------------------------------------------------------
|
|
87
|
+
|
|
88
|
+
/**
|
|
89
|
+
* Run the full deduplication pipeline.
|
|
90
|
+
*
|
|
91
|
+
* Steps:
|
|
92
|
+
* 1. Add __row_id__ and __source__ if not present
|
|
93
|
+
* 2. Apply standardization
|
|
94
|
+
* 3. Compute matchkeys
|
|
95
|
+
* 4. Phase 1: Exact matchkeys (hash-based grouping)
|
|
96
|
+
* 5. Phase 2: Fuzzy matchkeys (block + score)
|
|
97
|
+
* 6. Phase 3: Cluster (Union-Find with MST splitting)
|
|
98
|
+
* 7. Phase 4: Build golden records for multi-member clusters
|
|
99
|
+
* 8. Classify dupes vs unique
|
|
100
|
+
* 9. Compute stats
|
|
101
|
+
* 10. Return DedupeResult
|
|
102
|
+
*/
|
|
103
|
+
export function runDedupePipeline(
|
|
104
|
+
rows: readonly Row[],
|
|
105
|
+
config: GoldenMatchConfig,
|
|
106
|
+
options?: DedupeOptions,
|
|
107
|
+
): DedupeResult {
|
|
108
|
+
if (rows.length === 0) {
|
|
109
|
+
return _emptyDedupeResult(config);
|
|
110
|
+
}
|
|
111
|
+
|
|
112
|
+
const matchkeys = getMatchkeys(config);
|
|
113
|
+
const goldenRules = config.goldenRules ?? makeGoldenRulesConfig();
|
|
114
|
+
const blockingConfig = config.blocking ?? makeBlockingConfig();
|
|
115
|
+
const acrossFilesOnly = options?.acrossFilesOnly ?? false;
|
|
116
|
+
|
|
117
|
+
// ---- Step 1: Add __row_id__ and __source__ ----
|
|
118
|
+
let processed: Row[] = rows.map((r, i) => {
|
|
119
|
+
const extra: Record<string, unknown> = {};
|
|
120
|
+
if (r.__row_id__ === undefined) extra.__row_id__ = i;
|
|
121
|
+
if (r.__source__ === undefined) extra.__source__ = "default";
|
|
122
|
+
return Object.keys(extra).length > 0 ? { ...r, ...extra } : (r as Row);
|
|
123
|
+
});
|
|
124
|
+
|
|
125
|
+
// ---- Step 2: Apply standardization ----
|
|
126
|
+
if (config.standardization) {
|
|
127
|
+
processed = applyStandardization(processed, config.standardization.rules);
|
|
128
|
+
}
|
|
129
|
+
|
|
130
|
+
// ---- Step 3: Compute matchkeys ----
|
|
131
|
+
processed = computeMatchkeys(processed, matchkeys);
|
|
132
|
+
|
|
133
|
+
// ---- Step 4 & 5: Score exact + fuzzy matchkeys ----
|
|
134
|
+
const allPairs: ScoredPair[] = [];
|
|
135
|
+
const matchedPairKeys = new Set<PairKey>();
|
|
136
|
+
const sourceLookup = buildSourceLookup(processed);
|
|
137
|
+
|
|
138
|
+
for (const mk of matchkeys) {
|
|
139
|
+
if (mk.type === "exact") {
|
|
140
|
+
// Phase 1: Exact matching via hash grouping
|
|
141
|
+
let pairs = findExactMatches(processed, mk);
|
|
142
|
+
|
|
143
|
+
// Cross-file filter
|
|
144
|
+
if (acrossFilesOnly) {
|
|
145
|
+
pairs = pairs.filter((p) => {
|
|
146
|
+
const srcA = sourceLookup.get(p.idA);
|
|
147
|
+
const srcB = sourceLookup.get(p.idB);
|
|
148
|
+
return srcA !== srcB;
|
|
149
|
+
});
|
|
150
|
+
}
|
|
151
|
+
|
|
152
|
+
for (const p of pairs) {
|
|
153
|
+
const key = pairKey(p.idA, p.idB);
|
|
154
|
+
if (!matchedPairKeys.has(key)) {
|
|
155
|
+
matchedPairKeys.add(key);
|
|
156
|
+
allPairs.push(p);
|
|
157
|
+
}
|
|
158
|
+
}
|
|
159
|
+
} else {
|
|
160
|
+
// Phase 2: Fuzzy (weighted/probabilistic) — block then score
|
|
161
|
+
const blocks = buildBlocks(processed, blockingConfig);
|
|
162
|
+
|
|
163
|
+
const pairs = scoreBlocksSequential(blocks, mk, matchedPairKeys, {
|
|
164
|
+
acrossFilesOnly,
|
|
165
|
+
sourceLookup,
|
|
166
|
+
});
|
|
167
|
+
|
|
168
|
+
for (const p of pairs) {
|
|
169
|
+
allPairs.push(p);
|
|
170
|
+
}
|
|
171
|
+
}
|
|
172
|
+
}
|
|
173
|
+
|
|
174
|
+
// ---- Step 6: Cluster ----
|
|
175
|
+
const allIds = collectRowIds(processed);
|
|
176
|
+
const pairTuples: [number, number, number][] = allPairs.map((p) => [
|
|
177
|
+
p.idA,
|
|
178
|
+
p.idB,
|
|
179
|
+
p.score,
|
|
180
|
+
]);
|
|
181
|
+
|
|
182
|
+
const clusters = buildClusters(pairTuples, allIds, {
|
|
183
|
+
maxClusterSize: goldenRules.maxClusterSize,
|
|
184
|
+
weakClusterThreshold: goldenRules.weakClusterThreshold,
|
|
185
|
+
autoSplit: goldenRules.autoSplit,
|
|
186
|
+
});
|
|
187
|
+
|
|
188
|
+
// ---- Step 7: Build golden records ----
|
|
189
|
+
const rowsWithClusters = assignClusterIds(processed, clusters);
|
|
190
|
+
const goldenRecords: Row[] = [];
|
|
191
|
+
|
|
192
|
+
if (options?.outputGolden !== false) {
|
|
193
|
+
for (const [cid, cinfo] of clusters) {
|
|
194
|
+
if (cinfo.size < 2) continue; // Only build golden for multi-member clusters
|
|
195
|
+
|
|
196
|
+
const clusterRows = rowsWithClusters.filter(
|
|
197
|
+
(r) => (r.__cluster_id__ as number) === cid,
|
|
198
|
+
);
|
|
199
|
+
const golden = buildGoldenRecord(clusterRows, goldenRules);
|
|
200
|
+
|
|
201
|
+
const goldenRow: Record<string, unknown> = {
|
|
202
|
+
__cluster_id__: cid,
|
|
203
|
+
__golden_confidence__: golden.goldenConfidence,
|
|
204
|
+
};
|
|
205
|
+
for (const [col, info] of Object.entries(golden.fields)) {
|
|
206
|
+
goldenRow[col] = info.value;
|
|
207
|
+
}
|
|
208
|
+
goldenRecords.push(goldenRow as Row);
|
|
209
|
+
}
|
|
210
|
+
}
|
|
211
|
+
|
|
212
|
+
// ---- Step 8: Classify dupes vs unique ----
|
|
213
|
+
const multiMemberClusterIds = new Set<number>();
|
|
214
|
+
for (const [cid, cinfo] of clusters) {
|
|
215
|
+
if (cinfo.size >= 2) multiMemberClusterIds.add(cid);
|
|
216
|
+
}
|
|
217
|
+
|
|
218
|
+
const dupeRowIds = new Set<number>();
|
|
219
|
+
for (const [, cinfo] of clusters) {
|
|
220
|
+
if (cinfo.size >= 2) {
|
|
221
|
+
for (const m of cinfo.members) {
|
|
222
|
+
dupeRowIds.add(m);
|
|
223
|
+
}
|
|
224
|
+
}
|
|
225
|
+
}
|
|
226
|
+
|
|
227
|
+
const dupes: Row[] = [];
|
|
228
|
+
const unique: Row[] = [];
|
|
229
|
+
for (const row of rowsWithClusters) {
|
|
230
|
+
const rowId = row.__row_id__ as number;
|
|
231
|
+
if (dupeRowIds.has(rowId)) {
|
|
232
|
+
dupes.push(row);
|
|
233
|
+
} else {
|
|
234
|
+
unique.push(row);
|
|
235
|
+
}
|
|
236
|
+
}
|
|
237
|
+
|
|
238
|
+
// ---- Step 9: Compute stats ----
|
|
239
|
+
const totalRecords = processed.length;
|
|
240
|
+
const totalClusters = clusters.size;
|
|
241
|
+
const matchedRecords = dupes.length;
|
|
242
|
+
const uniqueRecords = unique.length;
|
|
243
|
+
const matchRate = totalRecords > 0 ? matchedRecords / totalRecords : 0;
|
|
244
|
+
|
|
245
|
+
const stats: DedupeStats = {
|
|
246
|
+
totalRecords,
|
|
247
|
+
totalClusters,
|
|
248
|
+
matchRate,
|
|
249
|
+
matchedRecords,
|
|
250
|
+
uniqueRecords,
|
|
251
|
+
};
|
|
252
|
+
|
|
253
|
+
// ---- Step 10: Return result ----
|
|
254
|
+
return {
|
|
255
|
+
goldenRecords,
|
|
256
|
+
clusters,
|
|
257
|
+
dupes,
|
|
258
|
+
unique,
|
|
259
|
+
stats,
|
|
260
|
+
scoredPairs: allPairs,
|
|
261
|
+
config,
|
|
262
|
+
};
|
|
263
|
+
}
|
|
264
|
+
|
|
265
|
+
// ---------------------------------------------------------------------------
|
|
266
|
+
// runMatchPipeline
|
|
267
|
+
// ---------------------------------------------------------------------------
|
|
268
|
+
|
|
269
|
+
/**
|
|
270
|
+
* Run the match pipeline: match target rows against reference rows.
|
|
271
|
+
*
|
|
272
|
+
* - Assigns __row_id__ with offset for reference rows
|
|
273
|
+
* - Assigns __source__ ("target" / "reference")
|
|
274
|
+
* - Runs same pipeline but filters to cross-source pairs only
|
|
275
|
+
*/
|
|
276
|
+
export function runMatchPipeline(
|
|
277
|
+
targetRows: readonly Row[],
|
|
278
|
+
referenceRows: readonly Row[],
|
|
279
|
+
config: GoldenMatchConfig,
|
|
280
|
+
): MatchResult {
|
|
281
|
+
if (targetRows.length === 0 || referenceRows.length === 0) {
|
|
282
|
+
return {
|
|
283
|
+
matched: [],
|
|
284
|
+
unmatched: [...targetRows],
|
|
285
|
+
stats: {
|
|
286
|
+
totalTarget: targetRows.length,
|
|
287
|
+
totalReference: referenceRows.length,
|
|
288
|
+
matchedCount: 0,
|
|
289
|
+
unmatchedCount: targetRows.length,
|
|
290
|
+
matchRate: 0,
|
|
291
|
+
},
|
|
292
|
+
};
|
|
293
|
+
}
|
|
294
|
+
|
|
295
|
+
// Add row IDs and source labels
|
|
296
|
+
const target = addSourceColumn(addRowIds(targetRows, 0), "target");
|
|
297
|
+
const reference = addSourceColumn(
|
|
298
|
+
addRowIds(referenceRows, targetRows.length),
|
|
299
|
+
"reference",
|
|
300
|
+
);
|
|
301
|
+
|
|
302
|
+
// Combine and run dedupe pipeline with cross-file filter
|
|
303
|
+
const combined = [...target, ...reference];
|
|
304
|
+
const result = runDedupePipeline(combined, config, {
|
|
305
|
+
acrossFilesOnly: true,
|
|
306
|
+
outputGolden: false,
|
|
307
|
+
});
|
|
308
|
+
|
|
309
|
+
// Track which target row IDs got matched
|
|
310
|
+
const targetIds = new Set<number>(
|
|
311
|
+
target.map((r) => r.__row_id__ as number),
|
|
312
|
+
);
|
|
313
|
+
const matchedTargetIds = new Set<number>();
|
|
314
|
+
|
|
315
|
+
for (const pair of result.scoredPairs) {
|
|
316
|
+
if (targetIds.has(pair.idA)) matchedTargetIds.add(pair.idA);
|
|
317
|
+
if (targetIds.has(pair.idB)) matchedTargetIds.add(pair.idB);
|
|
318
|
+
}
|
|
319
|
+
|
|
320
|
+
// Build matched/unmatched from original target rows
|
|
321
|
+
const matched: Row[] = [];
|
|
322
|
+
const unmatched: Row[] = [];
|
|
323
|
+
for (const row of target) {
|
|
324
|
+
const rowId = row.__row_id__ as number;
|
|
325
|
+
if (matchedTargetIds.has(rowId)) {
|
|
326
|
+
matched.push(row);
|
|
327
|
+
} else {
|
|
328
|
+
unmatched.push(row);
|
|
329
|
+
}
|
|
330
|
+
}
|
|
331
|
+
|
|
332
|
+
return {
|
|
333
|
+
matched,
|
|
334
|
+
unmatched,
|
|
335
|
+
stats: {
|
|
336
|
+
totalTarget: targetRows.length,
|
|
337
|
+
totalReference: referenceRows.length,
|
|
338
|
+
matchedCount: matched.length,
|
|
339
|
+
unmatchedCount: unmatched.length,
|
|
340
|
+
matchRate:
|
|
341
|
+
targetRows.length > 0 ? matched.length / targetRows.length : 0,
|
|
342
|
+
},
|
|
343
|
+
};
|
|
344
|
+
}
|
|
345
|
+
|
|
346
|
+
// ---------------------------------------------------------------------------
|
|
347
|
+
// Internal: empty result
|
|
348
|
+
// ---------------------------------------------------------------------------
|
|
349
|
+
|
|
350
|
+
function _emptyDedupeResult(config: GoldenMatchConfig): DedupeResult {
|
|
351
|
+
return {
|
|
352
|
+
goldenRecords: [],
|
|
353
|
+
clusters: new Map(),
|
|
354
|
+
dupes: [],
|
|
355
|
+
unique: [],
|
|
356
|
+
stats: {
|
|
357
|
+
totalRecords: 0,
|
|
358
|
+
totalClusters: 0,
|
|
359
|
+
matchRate: 0,
|
|
360
|
+
matchedRecords: 0,
|
|
361
|
+
uniqueRecords: 0,
|
|
362
|
+
},
|
|
363
|
+
scoredPairs: [],
|
|
364
|
+
config,
|
|
365
|
+
};
|
|
366
|
+
}
|
|
@@ -0,0 +1,216 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* pprl/protocol.ts — Privacy-preserving record linkage.
|
|
3
|
+
* Edge-safe: no `node:` imports.
|
|
4
|
+
*
|
|
5
|
+
* Ports goldenmatch/pprl/protocol.py. Encodes both datasets as bloom
|
|
6
|
+
* filters (CLKs) over the selected fields, then scores pairs via Dice or
|
|
7
|
+
* Jaccard similarity. Two protocol stubs are surfaced: trusted third
|
|
8
|
+
* party (no crypto beyond the bloom filter itself) and a simple SMC
|
|
9
|
+
* sketch that adds a salt per party before encoding.
|
|
10
|
+
*/
|
|
11
|
+
|
|
12
|
+
import type { Row } from "../types.js";
|
|
13
|
+
import { applyTransform } from "../transforms.js";
|
|
14
|
+
import { diceCoefficient } from "../scorer.js";
|
|
15
|
+
import { profileRows, type ColumnProfile } from "../profiler.js";
|
|
16
|
+
|
|
17
|
+
// ---------------------------------------------------------------------------
|
|
18
|
+
// Types
|
|
19
|
+
// ---------------------------------------------------------------------------
|
|
20
|
+
|
|
21
|
+
export interface PPRLConfig {
|
|
22
|
+
readonly fields: readonly string[];
|
|
23
|
+
readonly securityLevel: "standard" | "high" | "paranoid";
|
|
24
|
+
readonly protocol: "trusted_third_party" | "smc";
|
|
25
|
+
readonly threshold: number;
|
|
26
|
+
/** Optional salt used with "high"/"paranoid" levels. */
|
|
27
|
+
readonly salt?: string;
|
|
28
|
+
}
|
|
29
|
+
|
|
30
|
+
export interface PPRLMatch {
|
|
31
|
+
readonly idA: number;
|
|
32
|
+
readonly idB: number;
|
|
33
|
+
readonly score: number;
|
|
34
|
+
}
|
|
35
|
+
|
|
36
|
+
export interface PPRLResult {
|
|
37
|
+
readonly matches: readonly PPRLMatch[];
|
|
38
|
+
readonly stats: Readonly<Record<string, unknown>>;
|
|
39
|
+
}
|
|
40
|
+
|
|
41
|
+
// ---------------------------------------------------------------------------
|
|
42
|
+
// Helpers
|
|
43
|
+
// ---------------------------------------------------------------------------
|
|
44
|
+
|
|
45
|
+
function rowString(row: Row, fields: readonly string[]): string {
|
|
46
|
+
const parts: string[] = [];
|
|
47
|
+
for (const f of fields) {
|
|
48
|
+
const v = row[f];
|
|
49
|
+
if (v === null || v === undefined) continue;
|
|
50
|
+
const s = typeof v === "string" ? v : String(v);
|
|
51
|
+
const normalized = applyTransform(s, "lowercase") ?? s;
|
|
52
|
+
const cleaned = applyTransform(normalized, "normalize_whitespace") ?? normalized;
|
|
53
|
+
parts.push(cleaned);
|
|
54
|
+
}
|
|
55
|
+
return parts.join(" ");
|
|
56
|
+
}
|
|
57
|
+
|
|
58
|
+
function encodeRow(row: Row, config: PPRLConfig): string {
|
|
59
|
+
const value = rowString(row, config.fields);
|
|
60
|
+
if (value.length === 0) return "";
|
|
61
|
+
const transformName =
|
|
62
|
+
config.salt && (config.securityLevel === "high" || config.securityLevel === "paranoid")
|
|
63
|
+
? `bloom_filter:${config.securityLevel}:${config.salt}`
|
|
64
|
+
: `bloom_filter:${config.securityLevel}`;
|
|
65
|
+
return applyTransform(value, transformName) ?? "";
|
|
66
|
+
}
|
|
67
|
+
|
|
68
|
+
// ---------------------------------------------------------------------------
|
|
69
|
+
// Core linkage
|
|
70
|
+
// ---------------------------------------------------------------------------
|
|
71
|
+
|
|
72
|
+
/**
|
|
73
|
+
* Encode both row sets as bloom filters and emit pair matches above the
|
|
74
|
+
* configured threshold.
|
|
75
|
+
*/
|
|
76
|
+
export function runPPRL(
|
|
77
|
+
rowsA: readonly Row[],
|
|
78
|
+
rowsB: readonly Row[],
|
|
79
|
+
config: PPRLConfig,
|
|
80
|
+
): PPRLResult {
|
|
81
|
+
const encodedA: string[] = rowsA.map((r) => encodeRow(r, config));
|
|
82
|
+
const encodedB: string[] = rowsB.map((r) => encodeRow(r, config));
|
|
83
|
+
|
|
84
|
+
const matches: PPRLMatch[] = [];
|
|
85
|
+
let compared = 0;
|
|
86
|
+
|
|
87
|
+
for (let i = 0; i < encodedA.length; i++) {
|
|
88
|
+
const a = encodedA[i]!;
|
|
89
|
+
if (a.length === 0) continue;
|
|
90
|
+
for (let j = 0; j < encodedB.length; j++) {
|
|
91
|
+
const b = encodedB[j]!;
|
|
92
|
+
if (b.length === 0) continue;
|
|
93
|
+
compared++;
|
|
94
|
+
const score = diceCoefficient(a, b);
|
|
95
|
+
if (score >= config.threshold) {
|
|
96
|
+
matches.push({ idA: i, idB: j, score });
|
|
97
|
+
}
|
|
98
|
+
}
|
|
99
|
+
}
|
|
100
|
+
|
|
101
|
+
return {
|
|
102
|
+
matches,
|
|
103
|
+
stats: {
|
|
104
|
+
protocol: config.protocol,
|
|
105
|
+
securityLevel: config.securityLevel,
|
|
106
|
+
comparedPairs: compared,
|
|
107
|
+
matchCount: matches.length,
|
|
108
|
+
threshold: config.threshold,
|
|
109
|
+
fields: config.fields,
|
|
110
|
+
},
|
|
111
|
+
};
|
|
112
|
+
}
|
|
113
|
+
|
|
114
|
+
// ---------------------------------------------------------------------------
|
|
115
|
+
// Auto-config
|
|
116
|
+
// ---------------------------------------------------------------------------
|
|
117
|
+
|
|
118
|
+
// Heuristic bounds used by autoConfigurePPRL when selecting fields.
const MIN_LENGTH = 3;   // reject fields whose average length is shorter
const MAX_LENGTH = 15;  // reject fields whose average length is longer
const MAX_FIELDS = 4;   // at most this many fields feed the encoding
const MIN_THRESHOLD = 0.85; // similarity threshold emitted by auto-config
|
|
122
|
+
|
|
123
|
+
/**
|
|
124
|
+
* Auto-pick PPRL parameters for the given dataset pair. Penalizes
|
|
125
|
+
* near-unique fields (IDs), over-long fields, and high-null fields.
|
|
126
|
+
*/
|
|
127
|
+
export function autoConfigurePPRL(
|
|
128
|
+
rowsA: readonly Row[],
|
|
129
|
+
rowsB: readonly Row[],
|
|
130
|
+
): PPRLConfig {
|
|
131
|
+
const profileA = profileRows(rowsA);
|
|
132
|
+
const profileB = profileRows(rowsB);
|
|
133
|
+
|
|
134
|
+
const commonCols = new Set<string>();
|
|
135
|
+
for (const c of profileA.columns) {
|
|
136
|
+
if (profileB.byName[c.name]) commonCols.add(c.name);
|
|
137
|
+
}
|
|
138
|
+
|
|
139
|
+
interface Candidate {
|
|
140
|
+
readonly name: string;
|
|
141
|
+
readonly score: number;
|
|
142
|
+
}
|
|
143
|
+
|
|
144
|
+
const candidates: Candidate[] = [];
|
|
145
|
+
for (const name of commonCols) {
|
|
146
|
+
const pa = profileA.byName[name];
|
|
147
|
+
const pb = profileB.byName[name];
|
|
148
|
+
if (!pa || !pb) continue;
|
|
149
|
+
|
|
150
|
+
const nullRate = Math.max(pa.nullRate, pb.nullRate);
|
|
151
|
+
if (nullRate > 0.3) continue;
|
|
152
|
+
|
|
153
|
+
const avgLen = (pa.avgLength + pb.avgLength) / 2;
|
|
154
|
+
if (avgLen < MIN_LENGTH) continue;
|
|
155
|
+
if (avgLen > MAX_LENGTH) continue;
|
|
156
|
+
|
|
157
|
+
// Penalize near-unique fields (likely IDs)
|
|
158
|
+
const card = Math.max(pa.cardinalityRatio, pb.cardinalityRatio);
|
|
159
|
+
if (card > 0.95) continue;
|
|
160
|
+
|
|
161
|
+
// Score: prefer moderate cardinality, low nulls, moderate length.
|
|
162
|
+
const lenPenalty = Math.abs(avgLen - 8) / 8;
|
|
163
|
+
const score = (1 - nullRate) * (1 - Math.abs(card - 0.5)) * (1 - lenPenalty);
|
|
164
|
+
|
|
165
|
+
candidates.push({ name, score });
|
|
166
|
+
}
|
|
167
|
+
|
|
168
|
+
candidates.sort((a, b) => b.score - a.score);
|
|
169
|
+
const fields = candidates.slice(0, MAX_FIELDS).map((c) => c.name);
|
|
170
|
+
|
|
171
|
+
return {
|
|
172
|
+
fields,
|
|
173
|
+
securityLevel: "standard",
|
|
174
|
+
protocol: "trusted_third_party",
|
|
175
|
+
threshold: MIN_THRESHOLD,
|
|
176
|
+
};
|
|
177
|
+
}
|
|
178
|
+
|
|
179
|
+
// ---------------------------------------------------------------------------
|
|
180
|
+
// Protocol wrappers (API-parity stubs)
|
|
181
|
+
// ---------------------------------------------------------------------------
|
|
182
|
+
|
|
183
|
+
/**
|
|
184
|
+
* Trusted-third-party linkage: both parties ship encoded CLKs to a
|
|
185
|
+
* trusted intermediary that runs the similarity scoring. Same mechanics
|
|
186
|
+
* as `runPPRL`, but callsite is semantically distinct.
|
|
187
|
+
*/
|
|
188
|
+
export function linkTrustedThirdParty(
|
|
189
|
+
rowsA: readonly Row[],
|
|
190
|
+
rowsB: readonly Row[],
|
|
191
|
+
config: PPRLConfig,
|
|
192
|
+
): PPRLResult {
|
|
193
|
+
return runPPRL(rowsA, rowsB, { ...config, protocol: "trusted_third_party" });
|
|
194
|
+
}
|
|
195
|
+
|
|
196
|
+
/**
|
|
197
|
+
* Secure-multiparty-computation linkage (simplified): each party salts
|
|
198
|
+
* its inputs with a shared secret. Requires a non-empty `salt` in config
|
|
199
|
+
* and a "high"/"paranoid" security level.
|
|
200
|
+
*/
|
|
201
|
+
export function linkSMC(
|
|
202
|
+
rowsA: readonly Row[],
|
|
203
|
+
rowsB: readonly Row[],
|
|
204
|
+
config: PPRLConfig,
|
|
205
|
+
): PPRLResult {
|
|
206
|
+
if (!config.salt || config.salt.length === 0) {
|
|
207
|
+
throw new Error("SMC protocol requires a non-empty `salt`");
|
|
208
|
+
}
|
|
209
|
+
if (config.securityLevel === "standard") {
|
|
210
|
+
throw new Error("SMC protocol requires securityLevel of 'high' or 'paranoid'");
|
|
211
|
+
}
|
|
212
|
+
return runPPRL(rowsA, rowsB, { ...config, protocol: "smc" });
|
|
213
|
+
}
|
|
214
|
+
|
|
215
|
+
// Re-export profile type for consumers that want it alongside.
|
|
216
|
+
export type { ColumnProfile };
|