goldenmatch 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +140 -0
- package/dist/cli.cjs +6079 -0
- package/dist/cli.cjs.map +1 -0
- package/dist/cli.d.cts +1 -0
- package/dist/cli.d.ts +1 -0
- package/dist/cli.js +6076 -0
- package/dist/cli.js.map +1 -0
- package/dist/core/index.cjs +8449 -0
- package/dist/core/index.cjs.map +1 -0
- package/dist/core/index.d.cts +1972 -0
- package/dist/core/index.d.ts +1972 -0
- package/dist/core/index.js +8318 -0
- package/dist/core/index.js.map +1 -0
- package/dist/index.cjs +8449 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +2 -0
- package/dist/index.d.ts +2 -0
- package/dist/index.js +8318 -0
- package/dist/index.js.map +1 -0
- package/dist/node/backends/score-worker.cjs +934 -0
- package/dist/node/backends/score-worker.cjs.map +1 -0
- package/dist/node/backends/score-worker.d.cts +14 -0
- package/dist/node/backends/score-worker.d.ts +14 -0
- package/dist/node/backends/score-worker.js +932 -0
- package/dist/node/backends/score-worker.js.map +1 -0
- package/dist/node/index.cjs +11430 -0
- package/dist/node/index.cjs.map +1 -0
- package/dist/node/index.d.cts +554 -0
- package/dist/node/index.d.ts +554 -0
- package/dist/node/index.js +11277 -0
- package/dist/node/index.js.map +1 -0
- package/dist/types-DhUdX5Rc.d.cts +304 -0
- package/dist/types-DhUdX5Rc.d.ts +304 -0
- package/examples/01-basic-dedupe.ts +60 -0
- package/examples/02-match-two-datasets.ts +48 -0
- package/examples/03-csv-file-pipeline.ts +62 -0
- package/examples/04-string-scoring.ts +63 -0
- package/examples/05-custom-config.ts +94 -0
- package/examples/06-probabilistic-fs.ts +72 -0
- package/examples/07-pprl-privacy.ts +76 -0
- package/examples/08-streaming.ts +79 -0
- package/examples/09-llm-scorer.ts +79 -0
- package/examples/10-explain.ts +60 -0
- package/examples/11-evaluate.ts +61 -0
- package/examples/README.md +53 -0
- package/package.json +66 -0
- package/src/cli.ts +372 -0
- package/src/core/ann-blocker.ts +593 -0
- package/src/core/api.ts +220 -0
- package/src/core/autoconfig.ts +363 -0
- package/src/core/autofix.ts +102 -0
- package/src/core/blocker.ts +655 -0
- package/src/core/cluster.ts +699 -0
- package/src/core/compare-clusters.ts +176 -0
- package/src/core/config/loader.ts +869 -0
- package/src/core/cross-encoder.ts +614 -0
- package/src/core/data.ts +430 -0
- package/src/core/domain.ts +277 -0
- package/src/core/embedder.ts +562 -0
- package/src/core/evaluate.ts +156 -0
- package/src/core/explain.ts +352 -0
- package/src/core/golden.ts +524 -0
- package/src/core/graph-er.ts +371 -0
- package/src/core/index.ts +314 -0
- package/src/core/ingest.ts +112 -0
- package/src/core/learned-blocking.ts +305 -0
- package/src/core/lineage.ts +221 -0
- package/src/core/llm/budget.ts +258 -0
- package/src/core/llm/cluster.ts +542 -0
- package/src/core/llm/scorer.ts +396 -0
- package/src/core/match-one.ts +95 -0
- package/src/core/matchkey.ts +97 -0
- package/src/core/memory/corrections.ts +179 -0
- package/src/core/memory/learner.ts +218 -0
- package/src/core/memory/store.ts +114 -0
- package/src/core/pipeline.ts +366 -0
- package/src/core/pprl/protocol.ts +216 -0
- package/src/core/probabilistic.ts +511 -0
- package/src/core/profiler.ts +212 -0
- package/src/core/quality.ts +197 -0
- package/src/core/review-queue.ts +177 -0
- package/src/core/scorer.ts +855 -0
- package/src/core/sensitivity.ts +196 -0
- package/src/core/standardize.ts +279 -0
- package/src/core/streaming.ts +128 -0
- package/src/core/transforms.ts +599 -0
- package/src/core/types.ts +570 -0
- package/src/core/validate.ts +243 -0
- package/src/index.ts +8 -0
- package/src/node/a2a/server.ts +470 -0
- package/src/node/api/server.ts +412 -0
- package/src/node/backends/duckdb.ts +130 -0
- package/src/node/backends/score-worker.ts +41 -0
- package/src/node/backends/workers.ts +212 -0
- package/src/node/config-file.ts +66 -0
- package/src/node/connectors/base.ts +57 -0
- package/src/node/connectors/bigquery.ts +61 -0
- package/src/node/connectors/databricks.ts +69 -0
- package/src/node/connectors/file.ts +350 -0
- package/src/node/connectors/hubspot.ts +62 -0
- package/src/node/connectors/index.ts +43 -0
- package/src/node/connectors/salesforce.ts +93 -0
- package/src/node/connectors/snowflake.ts +73 -0
- package/src/node/db/postgres.ts +173 -0
- package/src/node/db/sync.ts +103 -0
- package/src/node/dedupe-file.ts +156 -0
- package/src/node/index.ts +89 -0
- package/src/node/mcp/server.ts +940 -0
- package/src/node/tui/app.ts +756 -0
- package/src/node/tui/index.ts +6 -0
- package/src/node/tui/widgets.ts +128 -0
- package/tests/parity/scorer-ground-truth.test.ts +118 -0
- package/tests/smoke.test.ts +46 -0
- package/tests/unit/a2a-server.test.ts +175 -0
- package/tests/unit/ann-blocker.test.ts +117 -0
- package/tests/unit/api-server.test.ts +239 -0
- package/tests/unit/api.test.ts +77 -0
- package/tests/unit/autoconfig.test.ts +103 -0
- package/tests/unit/autofix.test.ts +71 -0
- package/tests/unit/blocker.test.ts +164 -0
- package/tests/unit/buildBlocksAsync.test.ts +63 -0
- package/tests/unit/cluster.test.ts +213 -0
- package/tests/unit/compare-clusters.test.ts +42 -0
- package/tests/unit/config-loader.test.ts +301 -0
- package/tests/unit/connectors-base.test.ts +48 -0
- package/tests/unit/cross-encoder-model.test.ts +198 -0
- package/tests/unit/cross-encoder.test.ts +173 -0
- package/tests/unit/db-connectors.test.ts +37 -0
- package/tests/unit/domain.test.ts +80 -0
- package/tests/unit/embedder.test.ts +151 -0
- package/tests/unit/evaluate.test.ts +85 -0
- package/tests/unit/explain.test.ts +73 -0
- package/tests/unit/golden.test.ts +97 -0
- package/tests/unit/graph-er.test.ts +173 -0
- package/tests/unit/hnsw-ann.test.ts +283 -0
- package/tests/unit/hubspot-connector.test.ts +118 -0
- package/tests/unit/ingest.test.ts +97 -0
- package/tests/unit/learned-blocking.test.ts +134 -0
- package/tests/unit/lineage.test.ts +135 -0
- package/tests/unit/match-one.test.ts +129 -0
- package/tests/unit/matchkey.test.ts +97 -0
- package/tests/unit/mcp-server.test.ts +183 -0
- package/tests/unit/memory.test.ts +119 -0
- package/tests/unit/pipeline.test.ts +118 -0
- package/tests/unit/pprl-protocol.test.ts +381 -0
- package/tests/unit/probabilistic.test.ts +494 -0
- package/tests/unit/profiler.test.ts +68 -0
- package/tests/unit/review-queue.test.ts +68 -0
- package/tests/unit/salesforce-connector.test.ts +148 -0
- package/tests/unit/scorer.test.ts +301 -0
- package/tests/unit/sensitivity.test.ts +154 -0
- package/tests/unit/standardize.test.ts +84 -0
- package/tests/unit/streaming.test.ts +82 -0
- package/tests/unit/transforms.test.ts +208 -0
- package/tests/unit/tui-widgets.test.ts +42 -0
- package/tests/unit/tui.test.ts +24 -0
- package/tests/unit/validate.test.ts +145 -0
- package/tests/unit/workers-parallel.test.ts +99 -0
- package/tests/unit/workers.test.ts +74 -0
- package/tsconfig.json +25 -0
- package/tsup.config.ts +37 -0
- package/vitest.config.ts +11 -0
|
@@ -0,0 +1,371 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* graph-er.ts — Multi-table entity resolution with evidence propagation.
|
|
3
|
+
* Edge-safe: no `node:` imports.
|
|
4
|
+
*
|
|
5
|
+
* Ports goldenmatch/core/graph_er.py. Each table is deduped independently
|
|
6
|
+
* first, then cluster assignments propagate across foreign-key edges:
|
|
7
|
+
* pairs of rows in A whose foreign keys resolve to the same cluster in B
|
|
8
|
+
* get a similarity boost before re-clustering.
|
|
9
|
+
*/
|
|
10
|
+
|
|
11
|
+
import type { ClusterInfo, PairKey, Row, ScoredPair } from "./types.js";
|
|
12
|
+
import { pairKey } from "./cluster.js";
|
|
13
|
+
|
|
14
|
+
// ---------------------------------------------------------------------------
|
|
15
|
+
// Types
|
|
16
|
+
// ---------------------------------------------------------------------------
|
|
17
|
+
|
|
18
|
+
/** One input table for graph ER: a named row set plus its identifier column. */
export interface TableSchema {
  /** Table name; relationships and `scorerByTable` refer to the table by it. */
  readonly name: string;
  /** The table's records, in the order scorers will index them. */
  readonly rows: readonly Row[];
  /** Column holding each row's identifier (the value foreign keys are matched against). */
  readonly idColumn: string;
}
|
|
23
|
+
|
|
24
|
+
/** A foreign-key edge: rows of `tableA` reference rows of `tableB` via `fkColumn`. */
export interface Relationship {
  /** Name of the referencing (source) table. */
  readonly tableA: string;
  /** Name of the referenced (target) table. */
  readonly tableB: string;
  /** Column in `tableA` whose values are looked up against `tableB`'s `idColumn`. */
  readonly fkColumn: string;
}
|
|
29
|
+
|
|
30
|
+
/** Outcome of a `runGraphER` call. */
export interface GraphERResult {
  /** Final clusters per table: table name -> (cluster id -> cluster). */
  readonly clustersByTable: ReadonlyMap<string, ReadonlyMap<number, ClusterInfo>>;
  /** True when the loop stopped because the cluster delta fell below the convergence threshold. */
  readonly converged: boolean;
  /** Number of propagation passes that were executed. */
  readonly iterations: number;
}
|
|
35
|
+
|
|
36
|
+
/**
 * Pair scorer plugged into graph ER for a single table.
 *
 * **Contract:** the `idA` / `idB` of every returned `ScoredPair` must be the
 * **0-based position of the row inside the `rows` argument** — NOT a
 * `__row_id__` value or any other external/stable identifier the rows carry.
 *
 * Rationale: `runGraphER` seeds its Union-Find with `0..rows.length` and keys
 * its foreign-key cluster lookups by row position. Ids that are not
 * positional never line up with either structure, so the evidence-propagation
 * boost is silently skipped and the resulting clusters can be wrong.
 *
 * Callers that only have stable external row IDs should renumber their rows
 * to positional indices before scoring and translate back afterwards.
 */
export interface GraphERScorer {
  (rows: readonly Row[]): readonly ScoredPair[];
}
|
|
55
|
+
|
|
56
|
+
/** Tuning knobs and per-table scorers for `runGraphER`. */
export interface RunGraphEROptions {
  /** Upper bound on propagation passes. Default 5. */
  readonly maxIterations?: number;
  /** Stop once the largest per-table cluster delta drops below this value. Default 0.01. */
  readonly convergenceThreshold?: number;
  /**
   * Amount added to a pair's score (capped at 1) when both rows' foreign keys
   * resolve to the same cluster in the referenced table. Default 0.1.
   */
  readonly similarityBoost?: number;
  /** Per-table scorer, keyed by table name: takes rows, returns scored pairs. Required. */
  readonly scorerByTable: ReadonlyMap<string, GraphERScorer>;
  /** Minimum score for a pair to link two rows into one cluster. Default 0.85. */
  readonly threshold?: number;
}
|
|
65
|
+
|
|
66
|
+
// ---------------------------------------------------------------------------
|
|
67
|
+
// Minimal Union-Find
|
|
68
|
+
// ---------------------------------------------------------------------------
|
|
69
|
+
|
|
70
|
+
/**
 * Minimal disjoint-set forest over integer ids.
 *
 * Storage grows on demand: touching id `k` creates singleton sets for every
 * id up to and including `k`. Unions attach the smaller tree under the larger
 * one, and lookups shorten the path they walk.
 */
class UnionFind {
  private parent: number[] = [];
  private size: number[] = [];

  /** Make sure ids `0..id` exist, each new one as its own singleton set. */
  add(id: number): void {
    for (let next = this.parent.length; next <= id; next++) {
      this.parent.push(next);
      this.size.push(1);
    }
  }

  /** Return the representative (root) of the set containing `id`. */
  find(id: number): number {
    this.add(id);
    let node = id;
    for (;;) {
      const up = this.parent[node]!;
      if (up === node) return node;
      // Path halving: re-point this node at its grandparent, then hop there.
      const grand = this.parent[up]!;
      this.parent[node] = grand;
      node = grand;
    }
  }

  /** Merge the sets containing `a` and `b`; on equal sizes `a`'s root survives. */
  union(a: number, b: number): void {
    let keep = this.find(a);
    let absorb = this.find(b);
    if (keep === absorb) return;
    if (this.size[keep]! < this.size[absorb]!) {
      [keep, absorb] = [absorb, keep];
    }
    this.parent[absorb] = keep;
    this.size[keep] = this.size[keep]! + this.size[absorb]!;
  }
}
|
|
105
|
+
|
|
106
|
+
// ---------------------------------------------------------------------------
|
|
107
|
+
// Helpers
|
|
108
|
+
// ---------------------------------------------------------------------------
|
|
109
|
+
|
|
110
|
+
/**
 * Build a lookup from a row's identifier value to its position in `rows`.
 *
 * Rows whose `idColumn` value is null or undefined are left out. When the
 * same identifier occurs more than once, the last occurrence wins.
 *
 * @param rows     Rows to index.
 * @param idColumn Name of the column holding the identifier.
 * @returns Map from identifier value to 0-based row position.
 */
function toRowIndex(rows: readonly Row[], idColumn: string): Map<unknown, number> {
  const positionById = new Map<unknown, number>();
  for (const [position, row] of rows.entries()) {
    const id = row[idColumn];
    if (id === null || id === undefined) continue;
    positionById.set(id, position);
  }
  return positionById;
}
|
|
118
|
+
|
|
119
|
+
/**
 * Group rows `0..rowCount-1` into clusters by linking every pair whose score
 * reaches `threshold`, and attach simple per-cluster statistics.
 *
 * Every row ends up in exactly one cluster (unlinked rows become singletons).
 * Cluster ids are assigned sequentially from 0 in order of first appearance.
 * `oversized`, `bottleneckPair` and `clusterQuality` are filled with fixed
 * placeholder values here.
 *
 * @param rowCount  Number of rows in the table.
 * @param pairs     Scored pairs whose `idA` / `idB` are 0-based row positions.
 * @param threshold Minimum score for a pair to link its two rows.
 * @returns Map from cluster id to cluster info.
 */
function clustersFromPairs(
  rowCount: number,
  pairs: readonly ScoredPair[],
  threshold: number,
): Map<number, ClusterInfo> {
  const sets = new UnionFind();
  for (let row = 0; row < rowCount; row++) sets.add(row);

  // Link accepted pairs and remember their scores for the statistics below.
  const acceptedScores = new Map<PairKey, number>();
  for (const { idA, idB, score } of pairs) {
    if (score < threshold) continue;
    sets.union(idA, idB);
    acceptedScores.set(pairKey(idA, idB), score);
  }

  // Bucket rows by their set representative.
  const membersByRoot = new Map<number, number[]>();
  for (let row = 0; row < rowCount; row++) {
    const root = sets.find(row);
    const bucket = membersByRoot.get(root);
    if (bucket === undefined) membersByRoot.set(root, [row]);
    else bucket.push(row);
  }

  const clusters = new Map<number, ClusterInfo>();
  for (const members of membersByRoot.values()) {
    const memberCount = members.length;
    const pairScores = new Map<PairKey, number>();
    let weakestEdge = 1;
    let scoreTotal = 0;
    let edges = 0;

    // Collect the accepted edges that lie inside this cluster.
    for (const [i, a] of members.entries()) {
      for (let j = i + 1; j < memberCount; j++) {
        const key = pairKey(a, members[j]!);
        const score = acceptedScores.get(key);
        if (score === undefined) continue;
        pairScores.set(key, score);
        if (score < weakestEdge) weakestEdge = score;
        scoreTotal += score;
        edges++;
      }
    }

    // Singletons are fully trusted; otherwise blend the weakest edge, the
    // mean edge and how connected the cluster is relative to a spanning tree.
    let confidence = 1;
    if (memberCount > 1) {
      const meanEdge = edges > 0 ? scoreTotal / edges : 0;
      const connectivity = Math.min(1, edges / (memberCount - 1));
      confidence = 0.4 * weakestEdge + 0.3 * meanEdge + 0.3 * connectivity;
    }

    clusters.set(clusters.size, {
      members,
      size: memberCount,
      oversized: false,
      pairScores,
      confidence,
      bottleneckPair: null,
      clusterQuality: "strong",
    });
  }

  return clusters;
}
|
|
180
|
+
|
|
181
|
+
/**
 * Invert a cluster map into a row -> cluster-id lookup.
 *
 * If a row is listed in more than one cluster, the cluster visited last wins.
 *
 * @param clusters Clusters keyed by cluster id.
 * @returns Map from each member row to the id of its cluster.
 */
function rowIdToCluster(clusters: ReadonlyMap<number, ClusterInfo>): Map<number, number> {
  const clusterOfRow = new Map<number, number>();
  clusters.forEach((cluster, clusterId) => {
    cluster.members.forEach((rowId) => {
      clusterOfRow.set(rowId, clusterId);
    });
  });
  return clusterOfRow;
}
|
|
188
|
+
|
|
189
|
+
// ---------------------------------------------------------------------------
|
|
190
|
+
// Core algorithm
|
|
191
|
+
// ---------------------------------------------------------------------------
|
|
192
|
+
|
|
193
|
+
/**
 * Run multi-table entity resolution with iterative evidence propagation.
 *
 * For each table, the caller provides a scorer that produces pair scores
 * from a row array. The algorithm:
 *   1. Score & cluster each table independently.
 *   2. For every relationship A->B: find pairs in A whose fk resolves to
 *      the same cluster in B. Boost those pair scores by `similarityBoost`
 *      (capped at 1). Boosts are always applied on top of the original
 *      scores, never on top of a previous pass's boosted scores.
 *   3. Re-cluster every table. Repeat until the largest per-table cluster
 *      delta is below `convergenceThreshold` or `maxIterations` is reached.
 *
 * **Scorer contract (important):** scorers in `options.scorerByTable` must
 * return `ScoredPair.idA` / `.idB` as **0-based row indices** into the
 * `rows` array they were handed (NOT the stable `__row_id__` values those
 * rows may carry). The evidence-propagation step keys foreign-key cluster
 * lookups by row position; using external row IDs will silently make the
 * boost no-op and can produce wrong clusters. See {@link GraphERScorer}.
 *
 * @param tables        Tables to resolve; each needs an entry in `options.scorerByTable`.
 * @param relationships Foreign-key edges between tables. Edges whose `tableB`
 *                      is not among `tables` are ignored.
 * @param options       Scorers plus optional tuning (defaults: maxIterations 5,
 *                      convergenceThreshold 0.01, similarityBoost 0.1, threshold 0.85).
 * @returns Final clusters per table, whether the loop converged, and how many
 *          passes ran.
 * @throws Error if a table has no scorer in `options.scorerByTable`.
 */
export function runGraphER(
  tables: readonly TableSchema[],
  relationships: readonly Relationship[],
  options: RunGraphEROptions,
): GraphERResult {
  const maxIterations = options.maxIterations ?? 5;
  const convergenceThreshold = options.convergenceThreshold ?? 0.01;
  const similarityBoost = options.similarityBoost ?? 0.1;
  const threshold = options.threshold ?? 0.85;

  // Identifier value -> row position, per table; used to resolve foreign keys.
  const idIndexByTable = new Map<string, Map<unknown, number>>();
  for (const t of tables) {
    idIndexByTable.set(t.name, toRowIndex(t.rows, t.idColumn));
  }

  // Initial pair scores per table (without boost). Scorers run exactly once.
  const basePairsByTable = new Map<string, ScoredPair[]>();
  for (const t of tables) {
    const scorer = options.scorerByTable.get(t.name);
    if (!scorer) {
      throw new Error(`Missing scorer for table "${t.name}"`);
    }
    basePairsByTable.set(t.name, [...scorer(t.rows)]);
  }

  let clustersByTable = new Map<string, Map<number, ClusterInfo>>();
  for (const t of tables) {
    clustersByTable.set(
      t.name,
      clustersFromPairs(t.rows.length, basePairsByTable.get(t.name) ?? [], threshold),
    );
  }

  let converged = false;
  let iter = 0;

  for (; iter < maxIterations; iter++) {
    // Snapshot of the previous pass: row position -> cluster id, per table.
    const rowToCluster = new Map<string, Map<number, number>>();
    for (const [name, clusters] of clustersByTable) {
      rowToCluster.set(name, rowIdToCluster(clusters));
    }

    const nextClusters = new Map<string, Map<number, ClusterInfo>>();
    let maxDelta = 0;

    for (const t of tables) {
      const basePairs = basePairsByTable.get(t.name) ?? [];
      // Working scores, parallel to `basePairs`. Kept separate so the
      // readonly base pairs are never mutated.
      const scores = basePairs.map((p) => p.score);

      // For each relationship where this table is the source, boost pairs
      // whose FK targets land in the same cluster in the referenced table.
      for (const rel of relationships) {
        if (rel.tableA !== t.name) continue;
        const bClusters = rowToCluster.get(rel.tableB);
        const bIndex = idIndexByTable.get(rel.tableB);
        if (!bClusters || !bIndex) continue;

        // Build: row index in A (0-based) -> cluster id in B.
        // Keyed by positional index because `pair.idA` / `pair.idB` are
        // 0-based indices per the GraphERScorer contract.
        const fkClusterByIndex = new Map<number, number>();
        for (let i = 0; i < t.rows.length; i++) {
          const fkVal = t.rows[i]![rel.fkColumn];
          if (fkVal === null || fkVal === undefined) continue;
          const bRowIdx = bIndex.get(fkVal);
          if (bRowIdx === undefined) continue;
          const bCid = bClusters.get(bRowIdx);
          if (bCid !== undefined) fkClusterByIndex.set(i, bCid);
        }

        basePairs.forEach((pair, i) => {
          const ca = fkClusterByIndex.get(pair.idA);
          const cb = fkClusterByIndex.get(pair.idB);
          if (ca !== undefined && cb !== undefined && ca === cb) {
            scores[i] = Math.min(1, scores[i]! + similarityBoost);
          }
        });
      }

      const boosted = basePairs.map((p, i) => ({ ...p, score: scores[i]! }));
      const newClusters = clustersFromPairs(t.rows.length, boosted, threshold);
      const prevClusters = clustersByTable.get(t.name);
      if (prevClusters) {
        const delta = clusterSetDelta(prevClusters, newClusters);
        if (delta > maxDelta) maxDelta = delta;
      }
      nextClusters.set(t.name, newClusters);
    }

    clustersByTable = nextClusters;
    if (maxDelta < convergenceThreshold) {
      converged = true;
      break;
    }
  }

  const finalMap = new Map<string, ReadonlyMap<number, ClusterInfo>>();
  for (const [k, v] of clustersByTable) finalMap.set(k, v);

  return {
    clustersByTable: finalMap,
    converged,
    // `break` skips the loop's increment, so count the converging pass too.
    iterations: iter + (converged ? 1 : 0),
  };
}
|
|
321
|
+
|
|
322
|
+
/**
 * Compare two cluster assignments over the same row set and return the
 * fraction of rows whose cluster changed — a rough "delta" proxy.
 *
 * Cluster ids are not stable between the two assignments, so clusters are
 * aligned by majority overlap in BOTH directions: a row counts as unchanged
 * only when its cluster in `b` is the dominant successor of its cluster in
 * `a` AND its cluster in `a` is the dominant predecessor of its cluster in
 * `b`. The reverse check is what makes merges visible: when two clusters of
 * `a` collapse into one cluster of `b`, each of them still maps forward
 * cleanly, but only one of them can be the dominant predecessor, so the rows
 * of the other are counted as changed. Splits are scored as before (the rows
 * that left the dominant part count as changed).
 *
 * @param a Earlier cluster assignment.
 * @param b Later cluster assignment.
 * @returns Changed rows divided by the rows present in `a` (0 when `a` has
 *          no rows). Rows of `a` that are missing from `b` count as changed.
 */
function clusterSetDelta(
  a: ReadonlyMap<number, ClusterInfo>,
  b: ReadonlyMap<number, ClusterInfo>,
): number {
  const mapA = rowIdToCluster(a);
  const mapB = rowIdToCluster(b);

  type Overlap = Map<number, Map<number, number>>;

  // Increment the co-occurrence count for (from, to).
  const bump = (table: Overlap, from: number, to: number): void => {
    let counts = table.get(from);
    if (!counts) {
      counts = new Map();
      table.set(from, counts);
    }
    counts.set(to, (counts.get(to) ?? 0) + 1);
  };

  // For every source cluster pick the partner it shares the most rows with
  // (first one seen wins ties).
  const dominant = (table: Overlap): Map<number, number> => {
    const result = new Map<number, number>();
    for (const [from, counts] of table) {
      let best: [number, number] | null = null;
      for (const [to, count] of counts) {
        if (best === null || count > best[1]) best = [to, count];
      }
      if (best) result.set(from, best[0]);
    }
    return result;
  };

  const aToB: Overlap = new Map();
  const bToA: Overlap = new Map();
  for (const [rowId, aCid] of mapA) {
    const bCid = mapB.get(rowId);
    if (bCid === undefined) continue;
    bump(aToB, aCid, bCid);
    bump(bToA, bCid, aCid);
  }

  const successor = dominant(aToB);
  const predecessor = dominant(bToA);

  let changed = 0;
  for (const [rowId, aCid] of mapA) {
    const bCid = mapB.get(rowId);
    const stable =
      bCid !== undefined &&
      successor.get(aCid) === bCid &&
      predecessor.get(bCid) === aCid;
    if (!stable) changed++;
  }

  const total = mapA.size;
  return total === 0 ? 0 : changed / total;
}
|
|
@@ -0,0 +1,314 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* index.ts — Core public API surface for GoldenMatch-JS.
|
|
3
|
+
* Re-exports everything from core modules.
|
|
4
|
+
*
|
|
5
|
+
* Edge-safe: no `node:` imports.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
// ---------------------------------------------------------------------------
|
|
9
|
+
// Types
|
|
10
|
+
// ---------------------------------------------------------------------------
|
|
11
|
+
|
|
12
|
+
export type {
|
|
13
|
+
Row,
|
|
14
|
+
ColumnValue,
|
|
15
|
+
PairKey,
|
|
16
|
+
MatchkeyConfig,
|
|
17
|
+
ExactMatchkey,
|
|
18
|
+
WeightedMatchkey,
|
|
19
|
+
ProbabilisticMatchkey,
|
|
20
|
+
MakeMatchkeyConfigInput,
|
|
21
|
+
MatchkeyField,
|
|
22
|
+
BlockingConfig,
|
|
23
|
+
BlockingKeyConfig,
|
|
24
|
+
SortKeyField,
|
|
25
|
+
CanopyConfig,
|
|
26
|
+
GoldenRulesConfig,
|
|
27
|
+
GoldenFieldRule,
|
|
28
|
+
StandardizationConfig,
|
|
29
|
+
ValidationRuleConfig,
|
|
30
|
+
ValidationConfig,
|
|
31
|
+
QualityConfig,
|
|
32
|
+
TransformConfig,
|
|
33
|
+
BudgetConfig,
|
|
34
|
+
LLMScorerConfig,
|
|
35
|
+
DomainConfig,
|
|
36
|
+
LearningConfig,
|
|
37
|
+
MemoryConfig,
|
|
38
|
+
InputFileConfig,
|
|
39
|
+
InputConfig,
|
|
40
|
+
OutputConfig,
|
|
41
|
+
GoldenMatchConfig,
|
|
42
|
+
ScoredPair,
|
|
43
|
+
ClusterInfo,
|
|
44
|
+
DedupeStats,
|
|
45
|
+
DedupeResult,
|
|
46
|
+
MatchResult,
|
|
47
|
+
FieldProvenance,
|
|
48
|
+
ClusterProvenance,
|
|
49
|
+
BlockResult,
|
|
50
|
+
} from "./types.js";
|
|
51
|
+
|
|
52
|
+
export {
|
|
53
|
+
VALID_SCORERS,
|
|
54
|
+
VALID_TRANSFORMS,
|
|
55
|
+
VALID_STRATEGIES,
|
|
56
|
+
VALID_STANDARDIZERS,
|
|
57
|
+
makeMatchkeyField,
|
|
58
|
+
makeMatchkeyConfig,
|
|
59
|
+
makeBlockingConfig,
|
|
60
|
+
makeGoldenRulesConfig,
|
|
61
|
+
makeConfig,
|
|
62
|
+
makeScoredPair,
|
|
63
|
+
getMatchkeys,
|
|
64
|
+
} from "./types.js";
|
|
65
|
+
|
|
66
|
+
// ---------------------------------------------------------------------------
|
|
67
|
+
// Data layer
|
|
68
|
+
// ---------------------------------------------------------------------------
|
|
69
|
+
|
|
70
|
+
export { TabularData, isNullish, toColumnValue } from "./data.js";
|
|
71
|
+
|
|
72
|
+
// ---------------------------------------------------------------------------
|
|
73
|
+
// Transforms
|
|
74
|
+
// ---------------------------------------------------------------------------
|
|
75
|
+
|
|
76
|
+
export { applyTransform, applyTransforms, soundex, metaphone } from "./transforms.js";
|
|
77
|
+
|
|
78
|
+
// ---------------------------------------------------------------------------
|
|
79
|
+
// Scoring
|
|
80
|
+
// ---------------------------------------------------------------------------
|
|
81
|
+
|
|
82
|
+
export {
|
|
83
|
+
scoreField,
|
|
84
|
+
scorePair,
|
|
85
|
+
findExactMatches,
|
|
86
|
+
findFuzzyMatches,
|
|
87
|
+
scoreBlocksSequential,
|
|
88
|
+
jaro,
|
|
89
|
+
jaroWinkler,
|
|
90
|
+
levenshteinDistance,
|
|
91
|
+
levenshteinSimilarity,
|
|
92
|
+
indelDistance,
|
|
93
|
+
indelSimilarity,
|
|
94
|
+
tokenSortRatio,
|
|
95
|
+
soundexMatch,
|
|
96
|
+
diceCoefficient,
|
|
97
|
+
jaccardSimilarity,
|
|
98
|
+
ensembleScore,
|
|
99
|
+
scoreMatrix,
|
|
100
|
+
asString,
|
|
101
|
+
} from "./scorer.js";
|
|
102
|
+
|
|
103
|
+
// ---------------------------------------------------------------------------
|
|
104
|
+
// Matchkey
|
|
105
|
+
// ---------------------------------------------------------------------------
|
|
106
|
+
|
|
107
|
+
export {
|
|
108
|
+
computeMatchkeyValue,
|
|
109
|
+
computeMatchkeys,
|
|
110
|
+
addRowIds,
|
|
111
|
+
addSourceColumn,
|
|
112
|
+
} from "./matchkey.js";
|
|
113
|
+
|
|
114
|
+
// ---------------------------------------------------------------------------
|
|
115
|
+
// Standardization
|
|
116
|
+
// ---------------------------------------------------------------------------
|
|
117
|
+
|
|
118
|
+
export { applyStandardizer, applyStandardization } from "./standardize.js";
|
|
119
|
+
|
|
120
|
+
// ---------------------------------------------------------------------------
|
|
121
|
+
// Blocking
|
|
122
|
+
// ---------------------------------------------------------------------------
|
|
123
|
+
|
|
124
|
+
export {
|
|
125
|
+
buildBlocks,
|
|
126
|
+
buildBlocksAsync,
|
|
127
|
+
buildStaticBlocks,
|
|
128
|
+
buildMultiPassBlocks,
|
|
129
|
+
buildAdaptiveBlocks,
|
|
130
|
+
selectBestBlockingKey,
|
|
131
|
+
} from "./blocker.js";
|
|
132
|
+
|
|
133
|
+
// ---------------------------------------------------------------------------
|
|
134
|
+
// Embedding + ANN + Cross-encoder
|
|
135
|
+
// ---------------------------------------------------------------------------
|
|
136
|
+
|
|
137
|
+
export { Embedder, getEmbedder, EmbedderError } from "./embedder.js";
|
|
138
|
+
export type {
|
|
139
|
+
EmbedderOptions,
|
|
140
|
+
EmbeddingResult,
|
|
141
|
+
EmbedderProvider,
|
|
142
|
+
} from "./embedder.js";
|
|
143
|
+
export {
|
|
144
|
+
ANNBlocker,
|
|
145
|
+
HNSWANNBlocker,
|
|
146
|
+
createANNBlocker,
|
|
147
|
+
buildANNBlocks,
|
|
148
|
+
buildANNPairBlocks,
|
|
149
|
+
cosineSim,
|
|
150
|
+
euclideanDist,
|
|
151
|
+
} from "./ann-blocker.js";
|
|
152
|
+
export type {
|
|
153
|
+
ANNBlockerOptions,
|
|
154
|
+
ANNBlockerBase,
|
|
155
|
+
BuildANNOptions,
|
|
156
|
+
HNSWOptions,
|
|
157
|
+
HNSWModule,
|
|
158
|
+
HNSWIndexLike,
|
|
159
|
+
CreateANNBlockerOptions,
|
|
160
|
+
} from "./ann-blocker.js";
|
|
161
|
+
export {
|
|
162
|
+
rerankTopPairs,
|
|
163
|
+
rerankPair,
|
|
164
|
+
CrossEncoderHttpError,
|
|
165
|
+
CrossEncoderModel,
|
|
166
|
+
_resetCrossEncoderModelCache,
|
|
167
|
+
} from "./cross-encoder.js";
|
|
168
|
+
export type {
|
|
169
|
+
CrossEncoderOptions,
|
|
170
|
+
CrossEncoderProvider,
|
|
171
|
+
CrossEncoderReranker,
|
|
172
|
+
CrossEncoderModelOptions,
|
|
173
|
+
} from "./cross-encoder.js";
|
|
174
|
+
|
|
175
|
+
// ---------------------------------------------------------------------------
|
|
176
|
+
// Clustering
|
|
177
|
+
// ---------------------------------------------------------------------------
|
|
178
|
+
|
|
179
|
+
export {
|
|
180
|
+
UnionFind,
|
|
181
|
+
buildClusters,
|
|
182
|
+
buildMst,
|
|
183
|
+
splitOversizedCluster,
|
|
184
|
+
computeClusterConfidence,
|
|
185
|
+
addToCluster,
|
|
186
|
+
unmergeRecord,
|
|
187
|
+
unmergeCluster,
|
|
188
|
+
pairKey,
|
|
189
|
+
parsePairKey,
|
|
190
|
+
getClusterPairScores,
|
|
191
|
+
} from "./cluster.js";
|
|
192
|
+
|
|
193
|
+
// ---------------------------------------------------------------------------
|
|
194
|
+
// Golden records
|
|
195
|
+
// ---------------------------------------------------------------------------
|
|
196
|
+
|
|
197
|
+
export {
|
|
198
|
+
mergeField,
|
|
199
|
+
buildGoldenRecord,
|
|
200
|
+
buildGoldenRecordWithProvenance,
|
|
201
|
+
} from "./golden.js";
|
|
202
|
+
|
|
203
|
+
// ---------------------------------------------------------------------------
|
|
204
|
+
// Pipeline
|
|
205
|
+
// ---------------------------------------------------------------------------
|
|
206
|
+
|
|
207
|
+
export { runDedupePipeline, runMatchPipeline } from "./pipeline.js";
|
|
208
|
+
|
|
209
|
+
// ---------------------------------------------------------------------------
|
|
210
|
+
// API
|
|
211
|
+
// ---------------------------------------------------------------------------
|
|
212
|
+
|
|
213
|
+
export { dedupe, match, scoreStrings, scorePairRecord } from "./api.js";
|
|
214
|
+
|
|
215
|
+
// ---------------------------------------------------------------------------
|
|
216
|
+
// Config
|
|
217
|
+
// ---------------------------------------------------------------------------
|
|
218
|
+
|
|
219
|
+
export { parseConfig, parseConfigYaml, configToYaml } from "./config/loader.js";
|
|
220
|
+
|
|
221
|
+
// ---------------------------------------------------------------------------
|
|
222
|
+
// LLM
|
|
223
|
+
// ---------------------------------------------------------------------------
|
|
224
|
+
|
|
225
|
+
export { BudgetTracker, countTokensApprox } from "./llm/budget.js";
|
|
226
|
+
export type { BudgetSnapshot } from "./llm/budget.js";
|
|
227
|
+
export { llmScorePairs, scoreStringsWithLlm } from "./llm/scorer.js";
|
|
228
|
+
export type { LLMScoreResult } from "./llm/scorer.js";
|
|
229
|
+
export { llmClusterPairs } from "./llm/cluster.js";
|
|
230
|
+
|
|
231
|
+
// ---------------------------------------------------------------------------
|
|
232
|
+
// Explain
|
|
233
|
+
// ---------------------------------------------------------------------------
|
|
234
|
+
|
|
235
|
+
export { explainPair, explainCluster } from "./explain.js";
|
|
236
|
+
export type { PairExplanation, ClusterExplanation } from "./explain.js";
|
|
237
|
+
|
|
238
|
+
// ---------------------------------------------------------------------------
|
|
239
|
+
// Probabilistic (Fellegi-Sunter)
|
|
240
|
+
// ---------------------------------------------------------------------------
|
|
241
|
+
|
|
242
|
+
export { buildComparisonVector, trainEM, scoreProbabilistic } from "./probabilistic.js";
|
|
243
|
+
export type { EMResult } from "./probabilistic.js";
|
|
244
|
+
|
|
245
|
+
// ---------------------------------------------------------------------------
|
|
246
|
+
// Evaluation
|
|
247
|
+
// ---------------------------------------------------------------------------
|
|
248
|
+
|
|
249
|
+
export { evaluatePairs, evaluateClusters, loadGroundTruthPairs } from "./evaluate.js";
|
|
250
|
+
export type { EvalResult } from "./evaluate.js";
|
|
251
|
+
|
|
252
|
+
// ---------------------------------------------------------------------------
|
|
253
|
+
// Streaming / match-one
|
|
254
|
+
// ---------------------------------------------------------------------------
|
|
255
|
+
|
|
256
|
+
export { StreamProcessor } from "./streaming.js";
|
|
257
|
+
export { matchOne, findExactMatchesOne } from "./match-one.js";
|
|
258
|
+
|
|
259
|
+
// ---------------------------------------------------------------------------
|
|
260
|
+
// Cluster comparison + sensitivity
|
|
261
|
+
// ---------------------------------------------------------------------------
|
|
262
|
+
|
|
263
|
+
export { compareClusters } from "./compare-clusters.js";
|
|
264
|
+
export type { CCMSResult } from "./compare-clusters.js";
|
|
265
|
+
export { runSensitivity, stabilityReport } from "./sensitivity.js";
|
|
266
|
+
export type { SweepParam, SweepPoint, SensitivityResult } from "./sensitivity.js";
|
|
267
|
+
|
|
268
|
+
// ---------------------------------------------------------------------------
|
|
269
|
+
// Quality, autofix, validation, profiling, ingest
|
|
270
|
+
// ---------------------------------------------------------------------------
|
|
271
|
+
|
|
272
|
+
export { scanQuality, runQualityCheck } from "./quality.js";
|
|
273
|
+
export type { QualityFinding } from "./quality.js";
|
|
274
|
+
export { autoFixRows } from "./autofix.js";
|
|
275
|
+
export type { AutoFixLog } from "./autofix.js";
|
|
276
|
+
export { validateRows } from "./validate.js";
|
|
277
|
+
export type { ValidationRule, ValidationReport } from "./validate.js";
|
|
278
|
+
export { profileRows } from "./profiler.js";
|
|
279
|
+
export type { ColumnProfile, DatasetProfile } from "./profiler.js";
|
|
280
|
+
export { applyColumnMap, validateColumns, concatRows } from "./ingest.js";
|
|
281
|
+
|
|
282
|
+
// ---------------------------------------------------------------------------
|
|
283
|
+
// Review queue, autoconfig, domain, lineage, learned blocking, graph ER
|
|
284
|
+
// ---------------------------------------------------------------------------
|
|
285
|
+
|
|
286
|
+
export { ReviewQueue, gatePairs } from "./review-queue.js";
|
|
287
|
+
export type { ReviewItem, GatedResult } from "./review-queue.js";
|
|
288
|
+
export { autoConfigureRows } from "./autoconfig.js";
|
|
289
|
+
export type { AutoconfigOptions } from "./autoconfig.js";
|
|
290
|
+
export { detectDomain, extractFeatures } from "./domain.js";
|
|
291
|
+
export type { DomainProfile } from "./domain.js";
|
|
292
|
+
export { buildLineage, lineageToJson, lineageFromJson } from "./lineage.js";
|
|
293
|
+
export type { LineageEdge, LineageBundle } from "./lineage.js";
|
|
294
|
+
export { learnBlockingRules, applyLearnedBlocks } from "./learned-blocking.js";
|
|
295
|
+
export type { LearnedPredicate, LearnedRules } from "./learned-blocking.js";
|
|
296
|
+
export { runGraphER } from "./graph-er.js";
|
|
297
|
+
export type { TableSchema, Relationship, GraphERResult } from "./graph-er.js";
|
|
298
|
+
|
|
299
|
+
// ---------------------------------------------------------------------------
|
|
300
|
+
// Memory (corrections and learning)
|
|
301
|
+
// ---------------------------------------------------------------------------
|
|
302
|
+
|
|
303
|
+
export { MemoryStore } from "./memory/store.js";
|
|
304
|
+
export type { Correction, MemoryStoreConfig } from "./memory/store.js";
|
|
305
|
+
export { applyCorrections, hashRow } from "./memory/corrections.js";
|
|
306
|
+
export { MemoryLearner } from "./memory/learner.js";
|
|
307
|
+
export type { LearnedParams } from "./memory/learner.js";
|
|
308
|
+
|
|
309
|
+
// ---------------------------------------------------------------------------
|
|
310
|
+
// PPRL (Privacy-Preserving Record Linkage)
|
|
311
|
+
// ---------------------------------------------------------------------------
|
|
312
|
+
|
|
313
|
+
export { runPPRL, autoConfigurePPRL } from "./pprl/protocol.js";
|
|
314
|
+
export type { PPRLConfig, PPRLResult } from "./pprl/protocol.js";
|