goldenmatch 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162) hide show
  1. package/README.md +140 -0
  2. package/dist/cli.cjs +6079 -0
  3. package/dist/cli.cjs.map +1 -0
  4. package/dist/cli.d.cts +1 -0
  5. package/dist/cli.d.ts +1 -0
  6. package/dist/cli.js +6076 -0
  7. package/dist/cli.js.map +1 -0
  8. package/dist/core/index.cjs +8449 -0
  9. package/dist/core/index.cjs.map +1 -0
  10. package/dist/core/index.d.cts +1972 -0
  11. package/dist/core/index.d.ts +1972 -0
  12. package/dist/core/index.js +8318 -0
  13. package/dist/core/index.js.map +1 -0
  14. package/dist/index.cjs +8449 -0
  15. package/dist/index.cjs.map +1 -0
  16. package/dist/index.d.cts +2 -0
  17. package/dist/index.d.ts +2 -0
  18. package/dist/index.js +8318 -0
  19. package/dist/index.js.map +1 -0
  20. package/dist/node/backends/score-worker.cjs +934 -0
  21. package/dist/node/backends/score-worker.cjs.map +1 -0
  22. package/dist/node/backends/score-worker.d.cts +14 -0
  23. package/dist/node/backends/score-worker.d.ts +14 -0
  24. package/dist/node/backends/score-worker.js +932 -0
  25. package/dist/node/backends/score-worker.js.map +1 -0
  26. package/dist/node/index.cjs +11430 -0
  27. package/dist/node/index.cjs.map +1 -0
  28. package/dist/node/index.d.cts +554 -0
  29. package/dist/node/index.d.ts +554 -0
  30. package/dist/node/index.js +11277 -0
  31. package/dist/node/index.js.map +1 -0
  32. package/dist/types-DhUdX5Rc.d.cts +304 -0
  33. package/dist/types-DhUdX5Rc.d.ts +304 -0
  34. package/examples/01-basic-dedupe.ts +60 -0
  35. package/examples/02-match-two-datasets.ts +48 -0
  36. package/examples/03-csv-file-pipeline.ts +62 -0
  37. package/examples/04-string-scoring.ts +63 -0
  38. package/examples/05-custom-config.ts +94 -0
  39. package/examples/06-probabilistic-fs.ts +72 -0
  40. package/examples/07-pprl-privacy.ts +76 -0
  41. package/examples/08-streaming.ts +79 -0
  42. package/examples/09-llm-scorer.ts +79 -0
  43. package/examples/10-explain.ts +60 -0
  44. package/examples/11-evaluate.ts +61 -0
  45. package/examples/README.md +53 -0
  46. package/package.json +66 -0
  47. package/src/cli.ts +372 -0
  48. package/src/core/ann-blocker.ts +593 -0
  49. package/src/core/api.ts +220 -0
  50. package/src/core/autoconfig.ts +363 -0
  51. package/src/core/autofix.ts +102 -0
  52. package/src/core/blocker.ts +655 -0
  53. package/src/core/cluster.ts +699 -0
  54. package/src/core/compare-clusters.ts +176 -0
  55. package/src/core/config/loader.ts +869 -0
  56. package/src/core/cross-encoder.ts +614 -0
  57. package/src/core/data.ts +430 -0
  58. package/src/core/domain.ts +277 -0
  59. package/src/core/embedder.ts +562 -0
  60. package/src/core/evaluate.ts +156 -0
  61. package/src/core/explain.ts +352 -0
  62. package/src/core/golden.ts +524 -0
  63. package/src/core/graph-er.ts +371 -0
  64. package/src/core/index.ts +314 -0
  65. package/src/core/ingest.ts +112 -0
  66. package/src/core/learned-blocking.ts +305 -0
  67. package/src/core/lineage.ts +221 -0
  68. package/src/core/llm/budget.ts +258 -0
  69. package/src/core/llm/cluster.ts +542 -0
  70. package/src/core/llm/scorer.ts +396 -0
  71. package/src/core/match-one.ts +95 -0
  72. package/src/core/matchkey.ts +97 -0
  73. package/src/core/memory/corrections.ts +179 -0
  74. package/src/core/memory/learner.ts +218 -0
  75. package/src/core/memory/store.ts +114 -0
  76. package/src/core/pipeline.ts +366 -0
  77. package/src/core/pprl/protocol.ts +216 -0
  78. package/src/core/probabilistic.ts +511 -0
  79. package/src/core/profiler.ts +212 -0
  80. package/src/core/quality.ts +197 -0
  81. package/src/core/review-queue.ts +177 -0
  82. package/src/core/scorer.ts +855 -0
  83. package/src/core/sensitivity.ts +196 -0
  84. package/src/core/standardize.ts +279 -0
  85. package/src/core/streaming.ts +128 -0
  86. package/src/core/transforms.ts +599 -0
  87. package/src/core/types.ts +570 -0
  88. package/src/core/validate.ts +243 -0
  89. package/src/index.ts +8 -0
  90. package/src/node/a2a/server.ts +470 -0
  91. package/src/node/api/server.ts +412 -0
  92. package/src/node/backends/duckdb.ts +130 -0
  93. package/src/node/backends/score-worker.ts +41 -0
  94. package/src/node/backends/workers.ts +212 -0
  95. package/src/node/config-file.ts +66 -0
  96. package/src/node/connectors/base.ts +57 -0
  97. package/src/node/connectors/bigquery.ts +61 -0
  98. package/src/node/connectors/databricks.ts +69 -0
  99. package/src/node/connectors/file.ts +350 -0
  100. package/src/node/connectors/hubspot.ts +62 -0
  101. package/src/node/connectors/index.ts +43 -0
  102. package/src/node/connectors/salesforce.ts +93 -0
  103. package/src/node/connectors/snowflake.ts +73 -0
  104. package/src/node/db/postgres.ts +173 -0
  105. package/src/node/db/sync.ts +103 -0
  106. package/src/node/dedupe-file.ts +156 -0
  107. package/src/node/index.ts +89 -0
  108. package/src/node/mcp/server.ts +940 -0
  109. package/src/node/tui/app.ts +756 -0
  110. package/src/node/tui/index.ts +6 -0
  111. package/src/node/tui/widgets.ts +128 -0
  112. package/tests/parity/scorer-ground-truth.test.ts +118 -0
  113. package/tests/smoke.test.ts +46 -0
  114. package/tests/unit/a2a-server.test.ts +175 -0
  115. package/tests/unit/ann-blocker.test.ts +117 -0
  116. package/tests/unit/api-server.test.ts +239 -0
  117. package/tests/unit/api.test.ts +77 -0
  118. package/tests/unit/autoconfig.test.ts +103 -0
  119. package/tests/unit/autofix.test.ts +71 -0
  120. package/tests/unit/blocker.test.ts +164 -0
  121. package/tests/unit/buildBlocksAsync.test.ts +63 -0
  122. package/tests/unit/cluster.test.ts +213 -0
  123. package/tests/unit/compare-clusters.test.ts +42 -0
  124. package/tests/unit/config-loader.test.ts +301 -0
  125. package/tests/unit/connectors-base.test.ts +48 -0
  126. package/tests/unit/cross-encoder-model.test.ts +198 -0
  127. package/tests/unit/cross-encoder.test.ts +173 -0
  128. package/tests/unit/db-connectors.test.ts +37 -0
  129. package/tests/unit/domain.test.ts +80 -0
  130. package/tests/unit/embedder.test.ts +151 -0
  131. package/tests/unit/evaluate.test.ts +85 -0
  132. package/tests/unit/explain.test.ts +73 -0
  133. package/tests/unit/golden.test.ts +97 -0
  134. package/tests/unit/graph-er.test.ts +173 -0
  135. package/tests/unit/hnsw-ann.test.ts +283 -0
  136. package/tests/unit/hubspot-connector.test.ts +118 -0
  137. package/tests/unit/ingest.test.ts +97 -0
  138. package/tests/unit/learned-blocking.test.ts +134 -0
  139. package/tests/unit/lineage.test.ts +135 -0
  140. package/tests/unit/match-one.test.ts +129 -0
  141. package/tests/unit/matchkey.test.ts +97 -0
  142. package/tests/unit/mcp-server.test.ts +183 -0
  143. package/tests/unit/memory.test.ts +119 -0
  144. package/tests/unit/pipeline.test.ts +118 -0
  145. package/tests/unit/pprl-protocol.test.ts +381 -0
  146. package/tests/unit/probabilistic.test.ts +494 -0
  147. package/tests/unit/profiler.test.ts +68 -0
  148. package/tests/unit/review-queue.test.ts +68 -0
  149. package/tests/unit/salesforce-connector.test.ts +148 -0
  150. package/tests/unit/scorer.test.ts +301 -0
  151. package/tests/unit/sensitivity.test.ts +154 -0
  152. package/tests/unit/standardize.test.ts +84 -0
  153. package/tests/unit/streaming.test.ts +82 -0
  154. package/tests/unit/transforms.test.ts +208 -0
  155. package/tests/unit/tui-widgets.test.ts +42 -0
  156. package/tests/unit/tui.test.ts +24 -0
  157. package/tests/unit/validate.test.ts +145 -0
  158. package/tests/unit/workers-parallel.test.ts +99 -0
  159. package/tests/unit/workers.test.ts +74 -0
  160. package/tsconfig.json +25 -0
  161. package/tsup.config.ts +37 -0
  162. package/vitest.config.ts +11 -0
@@ -0,0 +1,371 @@
1
+ /**
2
+ * graph-er.ts — Multi-table entity resolution with evidence propagation.
3
+ * Edge-safe: no `node:` imports.
4
+ *
5
+ * Ports goldenmatch/core/graph_er.py. Each table is deduped independently
6
+ * first, then cluster assignments propagate across foreign-key edges:
7
+ * if row A.fk points into B's cluster, rows of A whose FK shares a cluster
8
+ * get a similarity boost before re-clustering.
9
+ */
10
+
11
+ import type { ClusterInfo, PairKey, Row, ScoredPair } from "./types.js";
12
+ import { pairKey } from "./cluster.js";
13
+
14
+ // ---------------------------------------------------------------------------
15
+ // Types
16
+ // ---------------------------------------------------------------------------
17
+
18
+ export interface TableSchema { // One input table for runGraphER.
19
+ readonly name: string; // Table name; runGraphER keys its scorer lookup and its per-table cluster maps by this value.
20
+ readonly rows: readonly Row[]; // Table rows; a row's 0-based position in this array is its identity during clustering.
21
+ readonly idColumn: string; // Column whose values foreign keys from other tables are resolved against.
22
+ }
23
+
24
+ export interface Relationship { // Foreign-key edge from tableA to tableB, used to propagate cluster evidence.
25
+ readonly tableA: string; // Name of the source table (the one that holds fkColumn); its pairs receive the boost.
26
+ readonly tableB: string; // Name of the referenced table; FK values are looked up against its idColumn.
27
+ readonly fkColumn: string; // column in tableA referencing tableB
28
+ }
29
+
30
+ export interface GraphERResult { // Return value of runGraphER.
31
+ readonly clustersByTable: ReadonlyMap<string, ReadonlyMap<number, ClusterInfo>>; // table name -> (cluster id -> cluster)
32
+ readonly converged: boolean; // true when the loop stopped because the largest per-table delta fell below convergenceThreshold
33
+ readonly iterations: number; // number of propagation passes executed (never more than maxIterations)
34
+ }
35
+
36
+ /**
37
+ * A scorer for graph ER.
38
+ *
39
+ * **Contract:** Must return `ScoredPair.idA` / `.idB` as **0-based row indices
40
+ * into the input `rows` array**, NOT the `__row_id__` values (or any other
41
+ * external/stable row identifier) those rows may carry.
42
+ *
43
+ * Why: `runGraphER` seeds its Union-Find with `0..rows.length` and indexes
44
+ * foreign-key cluster lookups by row position. Returning external row IDs
45
+ * instead of 0-based indices causes the evidence-propagation boost to never
46
+ * apply (ids won't line up with the UF roots or the fk-index map) and can
47
+ * silently produce wrong clusters.
48
+ *
49
+ * If you have stable external row IDs, re-number your rows to 0-based
50
+ * positional indices before scoring, then map back afterward.
51
+ */
52
+ export interface GraphERScorer {
53
+ (rows: readonly Row[]): readonly ScoredPair[]; // Called once per table by runGraphER with that table's full row array.
54
+ }
55
+
56
+ export interface RunGraphEROptions { // Tuning knobs for runGraphER; defaults below are the ones runGraphER applies.
57
+ readonly maxIterations?: number; // Upper bound on propagation passes. Default 5.
58
+ readonly convergenceThreshold?: number; // Stop once the largest per-table cluster delta is strictly below this. Default 0.01.
59
+ readonly similarityBoost?: number; // Added to a pair's score (capped at 1) when both rows' FKs resolve to the same cluster. Default 0.1.
60
+ /** Per-table scorer: takes rows, returns scored pairs. Required. */
61
+ readonly scorerByTable: ReadonlyMap<string, GraphERScorer>; // Keyed by table name; runGraphER throws if a table has no entry.
62
+ /** Match threshold for building clusters. Default 0.85. */
63
+ readonly threshold?: number;
64
+ }
65
+
66
+ // ---------------------------------------------------------------------------
67
+ // Minimal Union-Find
68
+ // ---------------------------------------------------------------------------
69
+
70
/**
 * Disjoint-set forest over non-negative integer ids.
 *
 * Ids are registered lazily: touching id `k` makes every id in `0..k` a
 * singleton set if it was not known yet. Merges attach the smaller tree
 * under the larger one, and lookups shorten the path they walk.
 */
class UnionFind {
  private parent: number[] = [];
  private size: number[] = [];

  /** Ensure every id in `0..id` exists, each new one as its own root. */
  add(id: number): void {
    for (let next = this.parent.length; next <= id; next++) {
      this.parent.push(next);
      this.size.push(1);
    }
  }

  /** Return the representative (root) of the set containing `id`. */
  find(id: number): number {
    this.add(id);
    let node = id;
    for (;;) {
      const up = this.parent[node]!;
      if (up === node) return node;
      // Path halving: re-point this node at its grandparent, then hop there.
      const grand = this.parent[up]!;
      this.parent[node] = grand;
      node = grand;
    }
  }

  /** Merge the sets containing `a` and `b`; no-op when already joined. */
  union(a: number, b: number): void {
    let keep = this.find(a);
    let absorb = this.find(b);
    if (keep === absorb) return;
    // Union by size: the strictly smaller tree goes under the other one;
    // on a tie, b's root is attached under a's root.
    if (this.size[keep]! < this.size[absorb]!) {
      [keep, absorb] = [absorb, keep];
    }
    this.parent[absorb] = keep;
    this.size[keep] = this.size[keep]! + this.size[absorb]!;
  }
}
105
+
106
+ // ---------------------------------------------------------------------------
107
+ // Helpers
108
+ // ---------------------------------------------------------------------------
109
+
110
/**
 * Index a table by its id column.
 *
 * @param rows     Table rows.
 * @param idColumn Name of the column holding each row's identifier.
 * @returns Map from id value to the 0-based position of the row carrying it.
 *          Rows whose id is `null`/`undefined` are left out; when an id value
 *          repeats, the last row with that value wins.
 */
function toRowIndex(rows: readonly Row[], idColumn: string): Map<unknown, number> {
  const positionById = new Map<unknown, number>();
  for (const [position, row] of rows.entries()) {
    const id = row[idColumn];
    if (id === null || id === undefined) continue;
    positionById.set(id, position);
  }
  return positionById;
}
118
+
119
/**
 * Build clusters for one table from scored pairs (single-linkage).
 *
 * Pairs scoring below `threshold` are ignored; the remaining pairs link their
 * two rows. Every row in `0..rowCount-1` ends up in exactly one cluster
 * (unlinked rows become singletons). Cluster ids are assigned from 0 in order
 * of each cluster's lowest member index.
 *
 * @param rowCount  Number of rows in the table; valid ids are `0..rowCount-1`.
 * @param pairs     Scored pairs whose `idA`/`idB` are 0-based row indices.
 * @param threshold Minimum score for a pair to link two rows.
 * @returns Map of cluster id -> cluster info.
 * @throws RangeError if a pair that would be linked references an id that is
 *         not an integer in `[0, rowCount)`. Such ids (typically external row
 *         ids returned by a scorer that ignores the GraphERScorer contract)
 *         would otherwise corrupt the union-find or bridge unrelated rows
 *         through a phantom node, silently producing wrong clusters.
 */
function clustersFromPairs(
  rowCount: number,
  pairs: readonly ScoredPair[],
  threshold: number,
): Map<number, ClusterInfo> {
  const isRowIndex = (id: number): boolean =>
    Number.isInteger(id) && id >= 0 && id < rowCount;

  const uf = new UnionFind();
  for (let i = 0; i < rowCount; i++) uf.add(i);

  const scoreMap = new Map<PairKey, number>();
  for (const p of pairs) {
    // Written as a negated >= so that a NaN score is skipped instead of
    // slipping past a `<` comparison and linking two rows.
    if (!(p.score >= threshold)) continue;
    if (!isRowIndex(p.idA) || !isRowIndex(p.idB)) {
      throw new RangeError(
        `Scored pair (${p.idA}, ${p.idB}) is not within row indices 0..${rowCount - 1}; ` +
          "scorers must return 0-based row indices",
      );
    }
    uf.union(p.idA, p.idB);
    // If the same pair appears more than once, the last score wins.
    scoreMap.set(pairKey(p.idA, p.idB), p.score);
  }

  // Group row indices by their union-find root. Iterating rows in ascending
  // order makes both the member lists and the cluster numbering deterministic.
  const rootMembers = new Map<number, number[]>();
  for (let i = 0; i < rowCount; i++) {
    const root = uf.find(i);
    const list = rootMembers.get(root);
    if (list) list.push(i);
    else rootMembers.set(root, [i]);
  }

  const clusters = new Map<number, ClusterInfo>();
  let clusterId = 0;
  for (const members of rootMembers.values()) {
    // Collect the scored edges that fall inside this cluster.
    const pairScores = new Map<PairKey, number>();
    let minEdge = 1;
    let edgeSum = 0;
    let edgeCount = 0;
    for (let i = 0; i < members.length; i++) {
      for (let j = i + 1; j < members.length; j++) {
        const k = pairKey(members[i]!, members[j]!);
        const s = scoreMap.get(k);
        if (s === undefined) continue;
        pairScores.set(k, s);
        if (s < minEdge) minEdge = s;
        edgeSum += s;
        edgeCount++;
      }
    }

    const isSingleton = members.length <= 1;
    const avgEdge = edgeCount > 0 ? edgeSum / edgeCount : 0;
    // Edge count relative to the n-1 edges of a spanning tree, capped at 1.
    const connectivity = isSingleton ? 0 : Math.min(1, edgeCount / (members.length - 1));
    // Singletons are trivially certain; otherwise blend the weakest edge,
    // the mean edge and the connectivity.
    const confidence = isSingleton ? 1 : 0.4 * minEdge + 0.3 * avgEdge + 0.3 * connectivity;

    clusters.set(clusterId++, {
      members,
      size: members.length,
      oversized: false,
      pairScores,
      confidence,
      bottleneckPair: null,
      clusterQuality: "strong",
    });
  }

  return clusters;
}
180
+
181
/**
 * Invert a cluster map into a per-row lookup.
 *
 * @param clusters Map of cluster id -> cluster info.
 * @returns Map from each member row index to the id of the cluster listing
 *          it. If a row is listed by several clusters, the one iterated last
 *          wins.
 */
function rowIdToCluster(clusters: ReadonlyMap<number, ClusterInfo>): Map<number, number> {
  const clusterOfRow = new Map<number, number>();
  clusters.forEach((info, clusterId) => {
    for (const rowIdx of info.members) {
      clusterOfRow.set(rowIdx, clusterId);
    }
  });
  return clusterOfRow;
}
188
+
189
+ // ---------------------------------------------------------------------------
190
+ // Core algorithm
191
+ // ---------------------------------------------------------------------------
192
+
193
/**
 * Run multi-table entity resolution with iterative evidence propagation.
 *
 * For each table, the caller provides a scorer that produces pair scores
 * from a row array. The algorithm:
 *   1. Score & cluster each table independently.
 *   2. For every relationship A->B: find pairs in A whose fk resolves to
 *      the same cluster in B. Boost those pair scores by `similarityBoost`
 *      (capped at 1). Boosts are always applied to the original base scores,
 *      using the cluster assignments from the previous pass.
 *   3. Re-cluster every table. Repeat until clusters stabilize or
 *      `maxIterations` is reached.
 *
 * **Scorer contract (important):** scorers in `options.scorerByTable` must
 * return `ScoredPair.idA` / `.idB` as **0-based row indices** into the
 * `rows` array they were handed (NOT the stable `__row_id__` values those
 * rows may carry). The evidence-propagation step keys foreign-key cluster
 * lookups by row position; using external row IDs will silently make the
 * boost no-op and can produce wrong clusters. See {@link GraphERScorer}.
 *
 * @param tables        Tables to resolve; each needs a scorer under its name.
 * @param relationships Foreign-key edges. Edges whose `tableB` is not among
 *                      `tables` are ignored.
 * @param options       Scorers plus optional tuning (defaults: maxIterations 5,
 *                      convergenceThreshold 0.01, similarityBoost 0.1,
 *                      threshold 0.85).
 * @returns Final clusters per table, whether the loop converged, and how many
 *          propagation passes were executed.
 * @throws Error if a table has no entry in `options.scorerByTable`.
 */
export function runGraphER(
  tables: readonly TableSchema[],
  relationships: readonly Relationship[],
  options: RunGraphEROptions,
): GraphERResult {
  const maxIterations = options.maxIterations ?? 5;
  const convergenceThreshold = options.convergenceThreshold ?? 0.01;
  const similarityBoost = options.similarityBoost ?? 0.1;
  const threshold = options.threshold ?? 0.85;

  // Per-table id-value -> row-position index, used to resolve foreign keys.
  const idIndexByTable = new Map<string, Map<unknown, number>>();
  for (const t of tables) {
    idIndexByTable.set(t.name, toRowIndex(t.rows, t.idColumn));
  }

  // Base pair scores per table (never boosted in place; each pass starts
  // again from these).
  const basePairsByTable = new Map<string, ScoredPair[]>();
  for (const t of tables) {
    const scorer = options.scorerByTable.get(t.name);
    if (!scorer) {
      throw new Error(`Missing scorer for table "${t.name}"`);
    }
    basePairsByTable.set(t.name, [...scorer(t.rows)]);
  }

  // Initial clustering without any cross-table evidence.
  let clustersByTable = new Map<string, Map<number, ClusterInfo>>();
  for (const t of tables) {
    clustersByTable.set(
      t.name,
      clustersFromPairs(t.rows.length, basePairsByTable.get(t.name) ?? [], threshold),
    );
  }

  let converged = false;
  let passes = 0;

  while (passes < maxIterations && !converged) {
    passes++;

    // Snapshot of row -> cluster for every table, taken before this pass so
    // that all tables see the same (previous-pass) evidence.
    const rowToCluster = new Map<string, Map<number, number>>();
    for (const [name, clusters] of clustersByTable) {
      rowToCluster.set(name, rowIdToCluster(clusters));
    }

    const nextClusters = new Map<string, Map<number, ClusterInfo>>();
    let maxDelta = 0;

    for (const t of tables) {
      const boosted = boostPairsByForeignKeys(
        t,
        basePairsByTable.get(t.name) ?? [],
        relationships,
        idIndexByTable,
        rowToCluster,
        similarityBoost,
      );

      const newClusters = clustersFromPairs(t.rows.length, boosted, threshold);
      const prevClusters = clustersByTable.get(t.name);
      if (prevClusters) {
        const delta = clusterSetDelta(prevClusters, newClusters);
        if (delta > maxDelta) maxDelta = delta;
      }
      nextClusters.set(t.name, newClusters);
    }

    clustersByTable = nextClusters;
    converged = maxDelta < convergenceThreshold;
  }

  return {
    clustersByTable: new Map<string, ReadonlyMap<number, ClusterInfo>>(clustersByTable),
    converged,
    iterations: passes,
  };
}

/**
 * Copy `basePairs` and raise the score of every pair whose two rows point,
 * through a relationship sourced at `table`, into the same cluster of the
 * referenced table. A pair is boosted once per qualifying relationship.
 */
function boostPairsByForeignKeys(
  table: TableSchema,
  basePairs: readonly ScoredPair[],
  relationships: readonly Relationship[],
  idIndexByTable: ReadonlyMap<string, ReadonlyMap<unknown, number>>,
  rowToCluster: ReadonlyMap<string, ReadonlyMap<number, number>>,
  similarityBoost: number,
): ScoredPair[] {
  let boosted: ScoredPair[] = basePairs.map((p) => ({ ...p }));

  for (const rel of relationships) {
    if (rel.tableA !== table.name) continue;
    const targetClusters = rowToCluster.get(rel.tableB);
    const targetIdIndex = idIndexByTable.get(rel.tableB);
    if (!targetClusters || !targetIdIndex) continue;

    const fkCluster = fkClusterByRowIndex(table.rows, rel.fkColumn, targetIdIndex, targetClusters);

    boosted = boosted.map((pair) => {
      // Keyed by positional index because pair.idA / pair.idB are 0-based
      // row indices per the scorer contract.
      const ca = fkCluster.get(pair.idA);
      const cb = fkCluster.get(pair.idB);
      if (ca === undefined || cb === undefined || ca !== cb) return pair;
      return { ...pair, score: Math.min(1, pair.score + similarityBoost) };
    });
  }

  return boosted;
}

/**
 * For each row position in the source table, resolve its foreign-key value to
 * a row of the referenced table and then to that row's cluster id. Rows with
 * a null/undefined or unresolvable foreign key are omitted.
 */
function fkClusterByRowIndex(
  rows: readonly Row[],
  fkColumn: string,
  targetIdIndex: ReadonlyMap<unknown, number>,
  targetRowToCluster: ReadonlyMap<number, number>,
): Map<number, number> {
  const clusterByRow = new Map<number, number>();
  for (let i = 0; i < rows.length; i++) {
    const fkVal = rows[i]![fkColumn];
    if (fkVal === null || fkVal === undefined) continue;
    const targetRow = targetIdIndex.get(fkVal);
    if (targetRow === undefined) continue;
    const cid = targetRowToCluster.get(targetRow);
    if (cid !== undefined) clusterByRow.set(i, cid);
  }
  return clusterByRow;
}
321
+
322
/**
 * Compare two cluster assignments over the same row set. Returns the fraction
 * of rows whose cluster signature changed — a rough "delta" proxy in [0, 1].
 *
 * Cluster ids are not stable between runs, so clusters are aligned by
 * majority vote: each cluster in `a` is paired with the `b` cluster holding
 * most of its rows, and vice versa. A row counts as unchanged only if it
 * agrees with the majority in BOTH directions. Checking a->b alone detects
 * splits but not merges: when two `a` clusters merge into one `b` cluster,
 * every `a` cluster still maps cleanly onto a single `b` cluster. The b->a
 * check catches that case, which is the only kind of change score boosting
 * can cause.
 *
 * Rows present in only one of the two assignments count as changed.
 *
 * @param a Previous clusters (cluster id -> info).
 * @param b New clusters (cluster id -> info).
 * @returns changed rows / total rows, or 0 when there are no rows.
 */
function clusterSetDelta(
  a: ReadonlyMap<number, ClusterInfo>,
  b: ReadonlyMap<number, ClusterInfo>,
): number {
  const mapA = rowIdToCluster(a);
  const mapB = rowIdToCluster(b);

  const majorityAToB = majorityCounterpart(mapA, mapB);
  const majorityBToA = majorityCounterpart(mapB, mapA);

  let changed = 0;
  let total = 0;
  for (const [rowId, aCid] of mapA) {
    total++;
    const bCid = mapB.get(rowId);
    if (
      bCid === undefined ||
      majorityAToB.get(aCid) !== bCid || // row left its old cluster's majority (split)
      majorityBToA.get(bCid) !== aCid // row joined a cluster dominated by others (merge)
    ) {
      changed++;
    }
  }
  for (const rowId of mapB.keys()) {
    if (!mapA.has(rowId)) {
      changed++;
      total++;
    }
  }

  return total === 0 ? 0 : changed / total;
}

/**
 * For every cluster id in `from`, find the cluster id in `to` that holds the
 * largest number of its rows. Ties go to the counterpart encountered first.
 * Rows missing from `to` are ignored.
 */
function majorityCounterpart(
  from: ReadonlyMap<number, number>,
  to: ReadonlyMap<number, number>,
): Map<number, number> {
  const overlap = new Map<number, Map<number, number>>();
  for (const [rowId, fromCid] of from) {
    const toCid = to.get(rowId);
    if (toCid === undefined) continue;
    let counts = overlap.get(fromCid);
    if (!counts) {
      counts = new Map<number, number>();
      overlap.set(fromCid, counts);
    }
    counts.set(toCid, (counts.get(toCid) ?? 0) + 1);
  }

  const majority = new Map<number, number>();
  for (const [fromCid, counts] of overlap) {
    let bestCid: number | undefined;
    let bestCount = 0;
    for (const [toCid, count] of counts) {
      if (count > bestCount) {
        bestCid = toCid;
        bestCount = count;
      }
    }
    if (bestCid !== undefined) majority.set(fromCid, bestCid);
  }
  return majority;
}
@@ -0,0 +1,314 @@
1
+ /**
2
+ * index.ts — Core public API surface for GoldenMatch-JS.
3
+ * Re-exports everything from core modules.
4
+ *
5
+ * Edge-safe: no `node:` imports.
6
+ */
7
+
8
+ // ---------------------------------------------------------------------------
9
+ // Types
10
+ // ---------------------------------------------------------------------------
11
+
12
+ export type {
13
+ Row,
14
+ ColumnValue,
15
+ PairKey,
16
+ MatchkeyConfig,
17
+ ExactMatchkey,
18
+ WeightedMatchkey,
19
+ ProbabilisticMatchkey,
20
+ MakeMatchkeyConfigInput,
21
+ MatchkeyField,
22
+ BlockingConfig,
23
+ BlockingKeyConfig,
24
+ SortKeyField,
25
+ CanopyConfig,
26
+ GoldenRulesConfig,
27
+ GoldenFieldRule,
28
+ StandardizationConfig,
29
+ ValidationRuleConfig,
30
+ ValidationConfig,
31
+ QualityConfig,
32
+ TransformConfig,
33
+ BudgetConfig,
34
+ LLMScorerConfig,
35
+ DomainConfig,
36
+ LearningConfig,
37
+ MemoryConfig,
38
+ InputFileConfig,
39
+ InputConfig,
40
+ OutputConfig,
41
+ GoldenMatchConfig,
42
+ ScoredPair,
43
+ ClusterInfo,
44
+ DedupeStats,
45
+ DedupeResult,
46
+ MatchResult,
47
+ FieldProvenance,
48
+ ClusterProvenance,
49
+ BlockResult,
50
+ } from "./types.js";
51
+
52
+ export {
53
+ VALID_SCORERS,
54
+ VALID_TRANSFORMS,
55
+ VALID_STRATEGIES,
56
+ VALID_STANDARDIZERS,
57
+ makeMatchkeyField,
58
+ makeMatchkeyConfig,
59
+ makeBlockingConfig,
60
+ makeGoldenRulesConfig,
61
+ makeConfig,
62
+ makeScoredPair,
63
+ getMatchkeys,
64
+ } from "./types.js";
65
+
66
+ // ---------------------------------------------------------------------------
67
+ // Data layer
68
+ // ---------------------------------------------------------------------------
69
+
70
+ export { TabularData, isNullish, toColumnValue } from "./data.js";
71
+
72
+ // ---------------------------------------------------------------------------
73
+ // Transforms
74
+ // ---------------------------------------------------------------------------
75
+
76
+ export { applyTransform, applyTransforms, soundex, metaphone } from "./transforms.js";
77
+
78
+ // ---------------------------------------------------------------------------
79
+ // Scoring
80
+ // ---------------------------------------------------------------------------
81
+
82
+ export {
83
+ scoreField,
84
+ scorePair,
85
+ findExactMatches,
86
+ findFuzzyMatches,
87
+ scoreBlocksSequential,
88
+ jaro,
89
+ jaroWinkler,
90
+ levenshteinDistance,
91
+ levenshteinSimilarity,
92
+ indelDistance,
93
+ indelSimilarity,
94
+ tokenSortRatio,
95
+ soundexMatch,
96
+ diceCoefficient,
97
+ jaccardSimilarity,
98
+ ensembleScore,
99
+ scoreMatrix,
100
+ asString,
101
+ } from "./scorer.js";
102
+
103
+ // ---------------------------------------------------------------------------
104
+ // Matchkey
105
+ // ---------------------------------------------------------------------------
106
+
107
+ export {
108
+ computeMatchkeyValue,
109
+ computeMatchkeys,
110
+ addRowIds,
111
+ addSourceColumn,
112
+ } from "./matchkey.js";
113
+
114
+ // ---------------------------------------------------------------------------
115
+ // Standardization
116
+ // ---------------------------------------------------------------------------
117
+
118
+ export { applyStandardizer, applyStandardization } from "./standardize.js";
119
+
120
+ // ---------------------------------------------------------------------------
121
+ // Blocking
122
+ // ---------------------------------------------------------------------------
123
+
124
+ export {
125
+ buildBlocks,
126
+ buildBlocksAsync,
127
+ buildStaticBlocks,
128
+ buildMultiPassBlocks,
129
+ buildAdaptiveBlocks,
130
+ selectBestBlockingKey,
131
+ } from "./blocker.js";
132
+
133
+ // ---------------------------------------------------------------------------
134
+ // Embedding + ANN + Cross-encoder
135
+ // ---------------------------------------------------------------------------
136
+
137
+ export { Embedder, getEmbedder, EmbedderError } from "./embedder.js";
138
+ export type {
139
+ EmbedderOptions,
140
+ EmbeddingResult,
141
+ EmbedderProvider,
142
+ } from "./embedder.js";
143
+ export {
144
+ ANNBlocker,
145
+ HNSWANNBlocker,
146
+ createANNBlocker,
147
+ buildANNBlocks,
148
+ buildANNPairBlocks,
149
+ cosineSim,
150
+ euclideanDist,
151
+ } from "./ann-blocker.js";
152
+ export type {
153
+ ANNBlockerOptions,
154
+ ANNBlockerBase,
155
+ BuildANNOptions,
156
+ HNSWOptions,
157
+ HNSWModule,
158
+ HNSWIndexLike,
159
+ CreateANNBlockerOptions,
160
+ } from "./ann-blocker.js";
161
+ export {
162
+ rerankTopPairs,
163
+ rerankPair,
164
+ CrossEncoderHttpError,
165
+ CrossEncoderModel,
166
+ _resetCrossEncoderModelCache,
167
+ } from "./cross-encoder.js";
168
+ export type {
169
+ CrossEncoderOptions,
170
+ CrossEncoderProvider,
171
+ CrossEncoderReranker,
172
+ CrossEncoderModelOptions,
173
+ } from "./cross-encoder.js";
174
+
175
+ // ---------------------------------------------------------------------------
176
+ // Clustering
177
+ // ---------------------------------------------------------------------------
178
+
179
+ export {
180
+ UnionFind,
181
+ buildClusters,
182
+ buildMst,
183
+ splitOversizedCluster,
184
+ computeClusterConfidence,
185
+ addToCluster,
186
+ unmergeRecord,
187
+ unmergeCluster,
188
+ pairKey,
189
+ parsePairKey,
190
+ getClusterPairScores,
191
+ } from "./cluster.js";
192
+
193
+ // ---------------------------------------------------------------------------
194
+ // Golden records
195
+ // ---------------------------------------------------------------------------
196
+
197
+ export {
198
+ mergeField,
199
+ buildGoldenRecord,
200
+ buildGoldenRecordWithProvenance,
201
+ } from "./golden.js";
202
+
203
+ // ---------------------------------------------------------------------------
204
+ // Pipeline
205
+ // ---------------------------------------------------------------------------
206
+
207
+ export { runDedupePipeline, runMatchPipeline } from "./pipeline.js";
208
+
209
+ // ---------------------------------------------------------------------------
210
+ // API
211
+ // ---------------------------------------------------------------------------
212
+
213
+ export { dedupe, match, scoreStrings, scorePairRecord } from "./api.js";
214
+
215
+ // ---------------------------------------------------------------------------
216
+ // Config
217
+ // ---------------------------------------------------------------------------
218
+
219
+ export { parseConfig, parseConfigYaml, configToYaml } from "./config/loader.js";
220
+
221
+ // ---------------------------------------------------------------------------
222
+ // LLM
223
+ // ---------------------------------------------------------------------------
224
+
225
+ export { BudgetTracker, countTokensApprox } from "./llm/budget.js";
226
+ export type { BudgetSnapshot } from "./llm/budget.js";
227
+ export { llmScorePairs, scoreStringsWithLlm } from "./llm/scorer.js";
228
+ export type { LLMScoreResult } from "./llm/scorer.js";
229
+ export { llmClusterPairs } from "./llm/cluster.js";
230
+
231
+ // ---------------------------------------------------------------------------
232
+ // Explain
233
+ // ---------------------------------------------------------------------------
234
+
235
+ export { explainPair, explainCluster } from "./explain.js";
236
+ export type { PairExplanation, ClusterExplanation } from "./explain.js";
237
+
238
+ // ---------------------------------------------------------------------------
239
+ // Probabilistic (Fellegi-Sunter)
240
+ // ---------------------------------------------------------------------------
241
+
242
+ export { buildComparisonVector, trainEM, scoreProbabilistic } from "./probabilistic.js";
243
+ export type { EMResult } from "./probabilistic.js";
244
+
245
+ // ---------------------------------------------------------------------------
246
+ // Evaluation
247
+ // ---------------------------------------------------------------------------
248
+
249
+ export { evaluatePairs, evaluateClusters, loadGroundTruthPairs } from "./evaluate.js";
250
+ export type { EvalResult } from "./evaluate.js";
251
+
252
+ // ---------------------------------------------------------------------------
253
+ // Streaming / match-one
254
+ // ---------------------------------------------------------------------------
255
+
256
+ export { StreamProcessor } from "./streaming.js";
257
+ export { matchOne, findExactMatchesOne } from "./match-one.js";
258
+
259
+ // ---------------------------------------------------------------------------
260
+ // Cluster comparison + sensitivity
261
+ // ---------------------------------------------------------------------------
262
+
263
+ export { compareClusters } from "./compare-clusters.js";
264
+ export type { CCMSResult } from "./compare-clusters.js";
265
+ export { runSensitivity, stabilityReport } from "./sensitivity.js";
266
+ export type { SweepParam, SweepPoint, SensitivityResult } from "./sensitivity.js";
267
+
268
+ // ---------------------------------------------------------------------------
269
+ // Quality, autofix, validation, profiling, ingest
270
+ // ---------------------------------------------------------------------------
271
+
272
+ export { scanQuality, runQualityCheck } from "./quality.js";
273
+ export type { QualityFinding } from "./quality.js";
274
+ export { autoFixRows } from "./autofix.js";
275
+ export type { AutoFixLog } from "./autofix.js";
276
+ export { validateRows } from "./validate.js";
277
+ export type { ValidationRule, ValidationReport } from "./validate.js";
278
+ export { profileRows } from "./profiler.js";
279
+ export type { ColumnProfile, DatasetProfile } from "./profiler.js";
280
+ export { applyColumnMap, validateColumns, concatRows } from "./ingest.js";
281
+
282
+ // ---------------------------------------------------------------------------
283
+ // Review queue, autoconfig, domain, lineage, learned blocking, graph ER
284
+ // ---------------------------------------------------------------------------
285
+
286
+ export { ReviewQueue, gatePairs } from "./review-queue.js";
287
+ export type { ReviewItem, GatedResult } from "./review-queue.js";
288
+ export { autoConfigureRows } from "./autoconfig.js";
289
+ export type { AutoconfigOptions } from "./autoconfig.js";
290
+ export { detectDomain, extractFeatures } from "./domain.js";
291
+ export type { DomainProfile } from "./domain.js";
292
+ export { buildLineage, lineageToJson, lineageFromJson } from "./lineage.js";
293
+ export type { LineageEdge, LineageBundle } from "./lineage.js";
294
+ export { learnBlockingRules, applyLearnedBlocks } from "./learned-blocking.js";
295
+ export type { LearnedPredicate, LearnedRules } from "./learned-blocking.js";
296
export { runGraphER } from "./graph-er.js";
// GraphERScorer and RunGraphEROptions are part of runGraphER's signature, so
// they are exported alongside it to let callers type their arguments.
export type {
  TableSchema,
  Relationship,
  GraphERResult,
  GraphERScorer,
  RunGraphEROptions,
} from "./graph-er.js";
298
+
299
+ // ---------------------------------------------------------------------------
300
+ // Memory (learning corrections)
301
+ // ---------------------------------------------------------------------------
302
+
303
+ export { MemoryStore } from "./memory/store.js";
304
+ export type { Correction, MemoryStoreConfig } from "./memory/store.js";
305
+ export { applyCorrections, hashRow } from "./memory/corrections.js";
306
+ export { MemoryLearner } from "./memory/learner.js";
307
+ export type { LearnedParams } from "./memory/learner.js";
308
+
309
+ // ---------------------------------------------------------------------------
310
+ // PPRL (Privacy-Preserving Record Linkage)
311
+ // ---------------------------------------------------------------------------
312
+
313
+ export { runPPRL, autoConfigurePPRL } from "./pprl/protocol.js";
314
+ export type { PPRLConfig, PPRLResult } from "./pprl/protocol.js";