goldenmatch 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +140 -0
- package/dist/cli.cjs +6079 -0
- package/dist/cli.cjs.map +1 -0
- package/dist/cli.d.cts +1 -0
- package/dist/cli.d.ts +1 -0
- package/dist/cli.js +6076 -0
- package/dist/cli.js.map +1 -0
- package/dist/core/index.cjs +8449 -0
- package/dist/core/index.cjs.map +1 -0
- package/dist/core/index.d.cts +1972 -0
- package/dist/core/index.d.ts +1972 -0
- package/dist/core/index.js +8318 -0
- package/dist/core/index.js.map +1 -0
- package/dist/index.cjs +8449 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +2 -0
- package/dist/index.d.ts +2 -0
- package/dist/index.js +8318 -0
- package/dist/index.js.map +1 -0
- package/dist/node/backends/score-worker.cjs +934 -0
- package/dist/node/backends/score-worker.cjs.map +1 -0
- package/dist/node/backends/score-worker.d.cts +14 -0
- package/dist/node/backends/score-worker.d.ts +14 -0
- package/dist/node/backends/score-worker.js +932 -0
- package/dist/node/backends/score-worker.js.map +1 -0
- package/dist/node/index.cjs +11430 -0
- package/dist/node/index.cjs.map +1 -0
- package/dist/node/index.d.cts +554 -0
- package/dist/node/index.d.ts +554 -0
- package/dist/node/index.js +11277 -0
- package/dist/node/index.js.map +1 -0
- package/dist/types-DhUdX5Rc.d.cts +304 -0
- package/dist/types-DhUdX5Rc.d.ts +304 -0
- package/examples/01-basic-dedupe.ts +60 -0
- package/examples/02-match-two-datasets.ts +48 -0
- package/examples/03-csv-file-pipeline.ts +62 -0
- package/examples/04-string-scoring.ts +63 -0
- package/examples/05-custom-config.ts +94 -0
- package/examples/06-probabilistic-fs.ts +72 -0
- package/examples/07-pprl-privacy.ts +76 -0
- package/examples/08-streaming.ts +79 -0
- package/examples/09-llm-scorer.ts +79 -0
- package/examples/10-explain.ts +60 -0
- package/examples/11-evaluate.ts +61 -0
- package/examples/README.md +53 -0
- package/package.json +66 -0
- package/src/cli.ts +372 -0
- package/src/core/ann-blocker.ts +593 -0
- package/src/core/api.ts +220 -0
- package/src/core/autoconfig.ts +363 -0
- package/src/core/autofix.ts +102 -0
- package/src/core/blocker.ts +655 -0
- package/src/core/cluster.ts +699 -0
- package/src/core/compare-clusters.ts +176 -0
- package/src/core/config/loader.ts +869 -0
- package/src/core/cross-encoder.ts +614 -0
- package/src/core/data.ts +430 -0
- package/src/core/domain.ts +277 -0
- package/src/core/embedder.ts +562 -0
- package/src/core/evaluate.ts +156 -0
- package/src/core/explain.ts +352 -0
- package/src/core/golden.ts +524 -0
- package/src/core/graph-er.ts +371 -0
- package/src/core/index.ts +314 -0
- package/src/core/ingest.ts +112 -0
- package/src/core/learned-blocking.ts +305 -0
- package/src/core/lineage.ts +221 -0
- package/src/core/llm/budget.ts +258 -0
- package/src/core/llm/cluster.ts +542 -0
- package/src/core/llm/scorer.ts +396 -0
- package/src/core/match-one.ts +95 -0
- package/src/core/matchkey.ts +97 -0
- package/src/core/memory/corrections.ts +179 -0
- package/src/core/memory/learner.ts +218 -0
- package/src/core/memory/store.ts +114 -0
- package/src/core/pipeline.ts +366 -0
- package/src/core/pprl/protocol.ts +216 -0
- package/src/core/probabilistic.ts +511 -0
- package/src/core/profiler.ts +212 -0
- package/src/core/quality.ts +197 -0
- package/src/core/review-queue.ts +177 -0
- package/src/core/scorer.ts +855 -0
- package/src/core/sensitivity.ts +196 -0
- package/src/core/standardize.ts +279 -0
- package/src/core/streaming.ts +128 -0
- package/src/core/transforms.ts +599 -0
- package/src/core/types.ts +570 -0
- package/src/core/validate.ts +243 -0
- package/src/index.ts +8 -0
- package/src/node/a2a/server.ts +470 -0
- package/src/node/api/server.ts +412 -0
- package/src/node/backends/duckdb.ts +130 -0
- package/src/node/backends/score-worker.ts +41 -0
- package/src/node/backends/workers.ts +212 -0
- package/src/node/config-file.ts +66 -0
- package/src/node/connectors/base.ts +57 -0
- package/src/node/connectors/bigquery.ts +61 -0
- package/src/node/connectors/databricks.ts +69 -0
- package/src/node/connectors/file.ts +350 -0
- package/src/node/connectors/hubspot.ts +62 -0
- package/src/node/connectors/index.ts +43 -0
- package/src/node/connectors/salesforce.ts +93 -0
- package/src/node/connectors/snowflake.ts +73 -0
- package/src/node/db/postgres.ts +173 -0
- package/src/node/db/sync.ts +103 -0
- package/src/node/dedupe-file.ts +156 -0
- package/src/node/index.ts +89 -0
- package/src/node/mcp/server.ts +940 -0
- package/src/node/tui/app.ts +756 -0
- package/src/node/tui/index.ts +6 -0
- package/src/node/tui/widgets.ts +128 -0
- package/tests/parity/scorer-ground-truth.test.ts +118 -0
- package/tests/smoke.test.ts +46 -0
- package/tests/unit/a2a-server.test.ts +175 -0
- package/tests/unit/ann-blocker.test.ts +117 -0
- package/tests/unit/api-server.test.ts +239 -0
- package/tests/unit/api.test.ts +77 -0
- package/tests/unit/autoconfig.test.ts +103 -0
- package/tests/unit/autofix.test.ts +71 -0
- package/tests/unit/blocker.test.ts +164 -0
- package/tests/unit/buildBlocksAsync.test.ts +63 -0
- package/tests/unit/cluster.test.ts +213 -0
- package/tests/unit/compare-clusters.test.ts +42 -0
- package/tests/unit/config-loader.test.ts +301 -0
- package/tests/unit/connectors-base.test.ts +48 -0
- package/tests/unit/cross-encoder-model.test.ts +198 -0
- package/tests/unit/cross-encoder.test.ts +173 -0
- package/tests/unit/db-connectors.test.ts +37 -0
- package/tests/unit/domain.test.ts +80 -0
- package/tests/unit/embedder.test.ts +151 -0
- package/tests/unit/evaluate.test.ts +85 -0
- package/tests/unit/explain.test.ts +73 -0
- package/tests/unit/golden.test.ts +97 -0
- package/tests/unit/graph-er.test.ts +173 -0
- package/tests/unit/hnsw-ann.test.ts +283 -0
- package/tests/unit/hubspot-connector.test.ts +118 -0
- package/tests/unit/ingest.test.ts +97 -0
- package/tests/unit/learned-blocking.test.ts +134 -0
- package/tests/unit/lineage.test.ts +135 -0
- package/tests/unit/match-one.test.ts +129 -0
- package/tests/unit/matchkey.test.ts +97 -0
- package/tests/unit/mcp-server.test.ts +183 -0
- package/tests/unit/memory.test.ts +119 -0
- package/tests/unit/pipeline.test.ts +118 -0
- package/tests/unit/pprl-protocol.test.ts +381 -0
- package/tests/unit/probabilistic.test.ts +494 -0
- package/tests/unit/profiler.test.ts +68 -0
- package/tests/unit/review-queue.test.ts +68 -0
- package/tests/unit/salesforce-connector.test.ts +148 -0
- package/tests/unit/scorer.test.ts +301 -0
- package/tests/unit/sensitivity.test.ts +154 -0
- package/tests/unit/standardize.test.ts +84 -0
- package/tests/unit/streaming.test.ts +82 -0
- package/tests/unit/transforms.test.ts +208 -0
- package/tests/unit/tui-widgets.test.ts +42 -0
- package/tests/unit/tui.test.ts +24 -0
- package/tests/unit/validate.test.ts +145 -0
- package/tests/unit/workers-parallel.test.ts +99 -0
- package/tests/unit/workers.test.ts +74 -0
- package/tsconfig.json +25 -0
- package/tsup.config.ts +37 -0
- package/vitest.config.ts +11 -0
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* ingest.ts — Edge-safe data-shape transforms for row arrays.
|
|
3
|
+
* Edge-safe: no `node:` imports, no file I/O.
|
|
4
|
+
*
|
|
5
|
+
* Ports goldenmatch/core/ingest.py minus file loading — callers are
|
|
6
|
+
* expected to bring already-parsed rows (JSON, fetched CSV, etc.).
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
import type { Row } from "./types.js";
|
|
10
|
+
|
|
11
|
+
// ---------------------------------------------------------------------------
|
|
12
|
+
// Column renaming
|
|
13
|
+
// ---------------------------------------------------------------------------
|
|
14
|
+
|
|
15
|
+
/**
|
|
16
|
+
* Rename columns according to a {oldName: newName} map.
|
|
17
|
+
*
|
|
18
|
+
* Keys missing from the map are passed through untouched. If a rename
|
|
19
|
+
* would collide with an existing column, the mapped column wins
|
|
20
|
+
* (mirroring Polars behavior).
|
|
21
|
+
*/
|
|
22
|
+
export function applyColumnMap(
|
|
23
|
+
rows: readonly Row[],
|
|
24
|
+
columnMap: Readonly<Record<string, string>>,
|
|
25
|
+
): Row[] {
|
|
26
|
+
if (Object.keys(columnMap).length === 0) {
|
|
27
|
+
return rows.map((r) => ({ ...r }));
|
|
28
|
+
}
|
|
29
|
+
return rows.map((row) => {
|
|
30
|
+
const newRow: Record<string, unknown> = {};
|
|
31
|
+
for (const [key, val] of Object.entries(row)) {
|
|
32
|
+
const newKey = columnMap[key] ?? key;
|
|
33
|
+
newRow[newKey] = val;
|
|
34
|
+
}
|
|
35
|
+
return newRow as Row;
|
|
36
|
+
});
|
|
37
|
+
}
|
|
38
|
+
|
|
39
|
+
// ---------------------------------------------------------------------------
|
|
40
|
+
// Column validation
|
|
41
|
+
// ---------------------------------------------------------------------------
|
|
42
|
+
|
|
43
|
+
/**
|
|
44
|
+
* Ensure every required column exists on the first row of `rows`.
|
|
45
|
+
* No-ops on empty input.
|
|
46
|
+
*/
|
|
47
|
+
export function validateColumns(
|
|
48
|
+
rows: readonly Row[],
|
|
49
|
+
required: readonly string[],
|
|
50
|
+
): void {
|
|
51
|
+
if (rows.length === 0) return;
|
|
52
|
+
const first = rows[0]!;
|
|
53
|
+
const missing = required.filter((c) => !(c in first));
|
|
54
|
+
if (missing.length > 0) {
|
|
55
|
+
throw new Error(`Required columns missing: ${missing.join(", ")}`);
|
|
56
|
+
}
|
|
57
|
+
}
|
|
58
|
+
|
|
59
|
+
// ---------------------------------------------------------------------------
|
|
60
|
+
// Row concat (union of schemas)
|
|
61
|
+
// ---------------------------------------------------------------------------
|
|
62
|
+
|
|
63
|
+
/**
|
|
64
|
+
* Concatenate several row arrays. Unioned schema: any column present in
|
|
65
|
+
* any input appears in the output; missing values become null.
|
|
66
|
+
*/
|
|
67
|
+
export function concatRows(rowsArrays: readonly (readonly Row[])[]): Row[] {
|
|
68
|
+
const allKeys = new Set<string>();
|
|
69
|
+
let totalLen = 0;
|
|
70
|
+
for (const arr of rowsArrays) {
|
|
71
|
+
totalLen += arr.length;
|
|
72
|
+
for (const row of arr) {
|
|
73
|
+
for (const k of Object.keys(row)) allKeys.add(k);
|
|
74
|
+
}
|
|
75
|
+
}
|
|
76
|
+
|
|
77
|
+
const out: Row[] = new Array(totalLen);
|
|
78
|
+
let idx = 0;
|
|
79
|
+
for (const arr of rowsArrays) {
|
|
80
|
+
for (const row of arr) {
|
|
81
|
+
const merged: Record<string, unknown> = {};
|
|
82
|
+
for (const k of allKeys) {
|
|
83
|
+
merged[k] = k in row ? row[k] : null;
|
|
84
|
+
}
|
|
85
|
+
out[idx++] = merged as Row;
|
|
86
|
+
}
|
|
87
|
+
}
|
|
88
|
+
return out;
|
|
89
|
+
}
|
|
90
|
+
|
|
91
|
+
// ---------------------------------------------------------------------------
|
|
92
|
+
// Source tagging
|
|
93
|
+
// ---------------------------------------------------------------------------
|
|
94
|
+
|
|
95
|
+
/**
|
|
96
|
+
* Add a `__source__` column to each row. Useful when concatenating rows
|
|
97
|
+
* from multiple datasets and downstream logic needs to know the origin.
|
|
98
|
+
*/
|
|
99
|
+
export function tagSource(rows: readonly Row[], source: string): Row[] {
|
|
100
|
+
return rows.map((row) => ({ ...row, __source__: source }));
|
|
101
|
+
}
|
|
102
|
+
|
|
103
|
+
/**
|
|
104
|
+
* Add a `__row_id__` column if missing. IDs are assigned sequentially
|
|
105
|
+
* starting at `startAt` (default 0).
|
|
106
|
+
*/
|
|
107
|
+
export function assignRowIds(rows: readonly Row[], startAt: number = 0): Row[] {
|
|
108
|
+
return rows.map((row, i) => ({
|
|
109
|
+
...row,
|
|
110
|
+
__row_id__: row["__row_id__"] ?? startAt + i,
|
|
111
|
+
}));
|
|
112
|
+
}
|
|
@@ -0,0 +1,305 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* learned-blocking.ts — Data-driven predicate selection for blocking.
|
|
3
|
+
* Edge-safe: no `node:` imports.
|
|
4
|
+
*
|
|
5
|
+
* Ports goldenmatch/core/learned_blocking.py. Given a labelled set of
|
|
6
|
+
* matching pairs, greedy-select predicates (equal, soundex, prefix,
|
|
7
|
+
* qgram) that maximize recall while minimizing candidate pair count.
|
|
8
|
+
*/
|
|
9
|
+
|
|
10
|
+
import type { Row, BlockResult, ScoredPair } from "./types.js";
|
|
11
|
+
import { applyTransform, soundex } from "./transforms.js";
|
|
12
|
+
|
|
13
|
+
// ---------------------------------------------------------------------------
|
|
14
|
+
// Types
|
|
15
|
+
// ---------------------------------------------------------------------------
|
|
16
|
+
|
|
17
|
+
/** One blocking predicate chosen by the learner, with its measured metrics. */
export interface LearnedPredicate {
  // Predicate family; "first_n_chars" and "qgram" use `n` for the length.
  readonly type: "equal" | "soundex" | "first_n_chars" | "qgram";
  // Column the predicate derives its block key from.
  readonly field: string;
  // Prefix / q-gram length; only meaningful for the length-based types.
  readonly n?: number;
  // Fraction of the labelled pairs this predicate captured on its own.
  readonly recall: number;
  // Candidate-pair reduction vs. comparing all row pairs (1 = best).
  readonly reduction: number;
}

/** Output of learnBlockingRules: the selected predicates plus the settings used. */
export interface LearnedRules {
  readonly predicates: readonly LearnedPredicate[];
  // Thresholds the rules were learned with (echoed for reproducibility).
  readonly minRecall: number;
  readonly minReduction: number;
  // ISO-8601 timestamp of when learning ran.
  readonly learnedAt: string;
}

/** Tuning knobs for learnBlockingRules; all optional with defaults. */
export interface LearnBlockingOptions {
  // Stop selecting once this fraction of known pairs is covered (default 0.95).
  readonly minRecall?: number;
  // Target pair-count reduction (default 0.9). NOTE(review): currently only
  // recorded in the returned rules — the selection loop does not enforce it;
  // confirm intended semantics.
  readonly minReduction?: number;
  // Maximum number of predicates to select (default 3).
  readonly predicateDepth?: number;
}
|
|
37
|
+
|
|
38
|
+
// ---------------------------------------------------------------------------
|
|
39
|
+
// Predicate evaluation
|
|
40
|
+
// ---------------------------------------------------------------------------
|
|
41
|
+
|
|
42
|
+
function fieldValue(row: Row, field: string): string | null {
|
|
43
|
+
const v = row[field];
|
|
44
|
+
if (v === null || v === undefined) return null;
|
|
45
|
+
const s = typeof v === "string" ? v : String(v);
|
|
46
|
+
const trimmed = s.trim();
|
|
47
|
+
return trimmed.length === 0 ? null : trimmed;
|
|
48
|
+
}
|
|
49
|
+
|
|
50
|
+
type PredicateFn = (row: Row) => string | null;
|
|
51
|
+
|
|
52
|
+
function buildPredicateFn(
|
|
53
|
+
pred: Omit<LearnedPredicate, "recall" | "reduction">,
|
|
54
|
+
): PredicateFn {
|
|
55
|
+
const { type, field, n } = pred;
|
|
56
|
+
switch (type) {
|
|
57
|
+
case "equal":
|
|
58
|
+
return (row) => {
|
|
59
|
+
const v = fieldValue(row, field);
|
|
60
|
+
return v === null ? null : applyTransform(v, "lowercase");
|
|
61
|
+
};
|
|
62
|
+
case "soundex":
|
|
63
|
+
return (row) => {
|
|
64
|
+
const v = fieldValue(row, field);
|
|
65
|
+
if (v === null) return null;
|
|
66
|
+
const code = soundex(v);
|
|
67
|
+
return code === "0000" ? null : code;
|
|
68
|
+
};
|
|
69
|
+
case "first_n_chars": {
|
|
70
|
+
const len = n ?? 3;
|
|
71
|
+
return (row) => {
|
|
72
|
+
const v = fieldValue(row, field);
|
|
73
|
+
if (v === null) return null;
|
|
74
|
+
const lc = v.toLowerCase();
|
|
75
|
+
return lc.length >= len ? lc.slice(0, len) : null;
|
|
76
|
+
};
|
|
77
|
+
}
|
|
78
|
+
case "qgram": {
|
|
79
|
+
const size = n ?? 3;
|
|
80
|
+
return (row) => {
|
|
81
|
+
const v = fieldValue(row, field);
|
|
82
|
+
if (v === null) return null;
|
|
83
|
+
const lc = v.toLowerCase();
|
|
84
|
+
if (lc.length < size) return null;
|
|
85
|
+
// Use the first q-gram as the block key (simple, deterministic).
|
|
86
|
+
return lc.slice(0, size);
|
|
87
|
+
};
|
|
88
|
+
}
|
|
89
|
+
}
|
|
90
|
+
}
|
|
91
|
+
|
|
92
|
+
/** Metrics gathered for one candidate predicate over the training data. */
interface PredicateStats {
  // Canonical "a|b" keys of the known pairs this predicate co-blocks.
  readonly capturedPairKeys: Set<string>;
  // Total within-block comparisons this predicate would generate.
  readonly candidatePairs: number;
  // Fraction of knownPairs captured (0 when knownPairs is empty).
  readonly recall: number;
  // 1 - candidatePairs / totalPossiblePairs, clamped to >= 0.
  readonly reduction: number;
  // Block key -> indices (into the evaluated rows array) sharing that key.
  readonly blocks: Map<string, number[]>;
}
|
|
99
|
+
|
|
100
|
+
function pairKey(a: number, b: number): string {
|
|
101
|
+
return a < b ? `${a}|${b}` : `${b}|${a}`;
|
|
102
|
+
}
|
|
103
|
+
|
|
104
|
+
/**
 * Evaluate one predicate against the training data.
 *
 * Buckets `rows` into blocks by the predicate's key, then measures:
 * - candidatePairs: how many within-block comparisons it would cost,
 * - recall: fraction of `knownPairs` whose two rows share at least one block,
 * - reduction: savings vs. comparing every possible row pair.
 *
 * `knownPairs` entries are matched by their `idA`/`idB` values — assumed
 * here to be indices into `rows`; TODO confirm against callers.
 */
function evaluatePredicate(
  rows: readonly Row[],
  knownPairs: readonly ScoredPair[],
  fn: PredicateFn,
  totalPossiblePairs: number,
): PredicateStats {
  // Group rows by their block key. Rows with a null key join no block.
  const blocks = new Map<string, number[]>();
  for (let i = 0; i < rows.length; i++) {
    const row = rows[i]!;
    const key = fn(row);
    if (key === null) continue;
    const list = blocks.get(key);
    if (list) list.push(i);
    else blocks.set(key, [i]);
  }

  // Count candidate pairs generated by this predicate: n-choose-2 per block.
  let candidatePairs = 0;
  for (const ids of blocks.values()) {
    const n = ids.length;
    candidatePairs += (n * (n - 1)) / 2;
  }

  // Compute which known pairs are captured: both row indices share a block.
  // Build row->block-key-set index.
  const idToKeys = new Map<number, Set<string>>();
  for (const [key, ids] of blocks) {
    for (const id of ids) {
      let set = idToKeys.get(id);
      if (!set) {
        set = new Set<string>();
        idToKeys.set(id, set);
      }
      set.add(key);
    }
  }

  // A pair counts as captured as soon as any one block key is shared.
  const capturedPairKeys = new Set<string>();
  for (const pair of knownPairs) {
    const keysA = idToKeys.get(pair.idA);
    const keysB = idToKeys.get(pair.idB);
    if (!keysA || !keysB) continue;
    for (const k of keysA) {
      if (keysB.has(k)) {
        capturedPairKeys.add(pairKey(pair.idA, pair.idB));
        break;
      }
    }
  }

  // Recall is 0 (not NaN) on empty ground truth; reduction is clamped to
  // [0, 1] and defined as 1 when there are no possible pairs at all.
  const recall =
    knownPairs.length === 0 ? 0 : capturedPairKeys.size / knownPairs.length;
  const reduction =
    totalPossiblePairs === 0
      ? 1
      : Math.max(0, 1 - candidatePairs / totalPossiblePairs);

  return { capturedPairKeys, candidatePairs, recall, reduction, blocks };
}
|
|
164
|
+
|
|
165
|
+
// ---------------------------------------------------------------------------
|
|
166
|
+
// Greedy predicate selection
|
|
167
|
+
// ---------------------------------------------------------------------------
|
|
168
|
+
|
|
169
|
+
/**
|
|
170
|
+
* Learn blocking rules from known-matching pairs.
|
|
171
|
+
*
|
|
172
|
+
* Evaluates candidate predicates for each column, then greedily selects
|
|
173
|
+
* predicates that add the most new recall per candidate pair introduced,
|
|
174
|
+
* stopping when minRecall is reached (or when options.predicateDepth is
|
|
175
|
+
* exceeded).
|
|
176
|
+
*/
|
|
177
|
+
export function learnBlockingRules(
|
|
178
|
+
rows: readonly Row[],
|
|
179
|
+
knownPairs: readonly ScoredPair[],
|
|
180
|
+
columns: readonly string[],
|
|
181
|
+
options?: LearnBlockingOptions,
|
|
182
|
+
): LearnedRules {
|
|
183
|
+
const minRecall = options?.minRecall ?? 0.95;
|
|
184
|
+
const minReduction = options?.minReduction ?? 0.9;
|
|
185
|
+
const predicateDepth = options?.predicateDepth ?? 3;
|
|
186
|
+
|
|
187
|
+
const totalPossible = (rows.length * (rows.length - 1)) / 2;
|
|
188
|
+
|
|
189
|
+
const candidates: { pred: LearnedPredicate; stats: PredicateStats }[] = [];
|
|
190
|
+
for (const field of columns) {
|
|
191
|
+
const predVariants: {
|
|
192
|
+
type: LearnedPredicate["type"];
|
|
193
|
+
n?: number;
|
|
194
|
+
}[] = [
|
|
195
|
+
{ type: "equal" },
|
|
196
|
+
{ type: "soundex" },
|
|
197
|
+
{ type: "first_n_chars", n: 3 },
|
|
198
|
+
{ type: "first_n_chars", n: 4 },
|
|
199
|
+
{ type: "qgram", n: 3 },
|
|
200
|
+
];
|
|
201
|
+
for (const variant of predVariants) {
|
|
202
|
+
const predPartial: Omit<LearnedPredicate, "recall" | "reduction"> =
|
|
203
|
+
variant.n !== undefined
|
|
204
|
+
? { type: variant.type, field, n: variant.n }
|
|
205
|
+
: { type: variant.type, field };
|
|
206
|
+
const fn = buildPredicateFn(predPartial);
|
|
207
|
+
const stats = evaluatePredicate(rows, knownPairs, fn, totalPossible);
|
|
208
|
+
if (stats.recall === 0) continue;
|
|
209
|
+
// Reject predicates whose reduction falls below threshold unless
|
|
210
|
+
// they contribute unique recall (caught in the greedy loop below).
|
|
211
|
+
const pred: LearnedPredicate = {
|
|
212
|
+
...predPartial,
|
|
213
|
+
recall: stats.recall,
|
|
214
|
+
reduction: stats.reduction,
|
|
215
|
+
};
|
|
216
|
+
candidates.push({ pred, stats });
|
|
217
|
+
}
|
|
218
|
+
}
|
|
219
|
+
|
|
220
|
+
// Greedy selection: at each step pick the predicate that contributes the
|
|
221
|
+
// most new captured pairs per candidate pair introduced.
|
|
222
|
+
const selected: LearnedPredicate[] = [];
|
|
223
|
+
const covered = new Set<string>();
|
|
224
|
+
|
|
225
|
+
while (selected.length < predicateDepth) {
|
|
226
|
+
let best: { pred: LearnedPredicate; gain: number; stats: PredicateStats } | null = null;
|
|
227
|
+
|
|
228
|
+
for (const cand of candidates) {
|
|
229
|
+
if (selected.includes(cand.pred)) continue;
|
|
230
|
+
|
|
231
|
+
let newCaptures = 0;
|
|
232
|
+
for (const pk of cand.stats.capturedPairKeys) {
|
|
233
|
+
if (!covered.has(pk)) newCaptures++;
|
|
234
|
+
}
|
|
235
|
+
if (newCaptures === 0) continue;
|
|
236
|
+
if (cand.stats.reduction < minReduction && newCaptures === 0) continue;
|
|
237
|
+
|
|
238
|
+
// Marginal efficiency: new captures per candidate pair (lower cost
|
|
239
|
+
// predicates favored). Avoid divide-by-zero.
|
|
240
|
+
const cost = Math.max(1, cand.stats.candidatePairs);
|
|
241
|
+
const gain = newCaptures / cost;
|
|
242
|
+
|
|
243
|
+
if (best === null || gain > best.gain) {
|
|
244
|
+
best = { pred: cand.pred, gain, stats: cand.stats };
|
|
245
|
+
}
|
|
246
|
+
}
|
|
247
|
+
|
|
248
|
+
if (best === null) break;
|
|
249
|
+
selected.push(best.pred);
|
|
250
|
+
for (const pk of best.stats.capturedPairKeys) covered.add(pk);
|
|
251
|
+
|
|
252
|
+
const achievedRecall =
|
|
253
|
+
knownPairs.length === 0 ? 0 : covered.size / knownPairs.length;
|
|
254
|
+
if (achievedRecall >= minRecall) break;
|
|
255
|
+
}
|
|
256
|
+
|
|
257
|
+
return {
|
|
258
|
+
predicates: selected,
|
|
259
|
+
minRecall,
|
|
260
|
+
minReduction,
|
|
261
|
+
learnedAt: new Date().toISOString(),
|
|
262
|
+
};
|
|
263
|
+
}
|
|
264
|
+
|
|
265
|
+
// ---------------------------------------------------------------------------
|
|
266
|
+
// Apply learned rules to produce blocks
|
|
267
|
+
// ---------------------------------------------------------------------------
|
|
268
|
+
|
|
269
|
+
/**
|
|
270
|
+
* Apply learned rules to rows, returning one BlockResult per non-empty,
|
|
271
|
+
* non-oversized block. Each predicate is its own pass (OR'd together).
|
|
272
|
+
*/
|
|
273
|
+
export function applyLearnedBlocks(
|
|
274
|
+
rows: readonly Row[],
|
|
275
|
+
rules: LearnedRules,
|
|
276
|
+
maxBlockSize: number,
|
|
277
|
+
): BlockResult[] {
|
|
278
|
+
const out: BlockResult[] = [];
|
|
279
|
+
|
|
280
|
+
for (let p = 0; p < rules.predicates.length; p++) {
|
|
281
|
+
const pred = rules.predicates[p]!;
|
|
282
|
+
const fn = buildPredicateFn(pred);
|
|
283
|
+
const blocks = new Map<string, Row[]>();
|
|
284
|
+
for (const row of rows) {
|
|
285
|
+
const key = fn(row);
|
|
286
|
+
if (key === null) continue;
|
|
287
|
+
const list = blocks.get(key);
|
|
288
|
+
if (list) list.push(row);
|
|
289
|
+
else blocks.set(key, [row]);
|
|
290
|
+
}
|
|
291
|
+
|
|
292
|
+
for (const [key, blockRows] of blocks) {
|
|
293
|
+
if (blockRows.length < 2) continue;
|
|
294
|
+
if (blockRows.length > maxBlockSize) continue;
|
|
295
|
+
out.push({
|
|
296
|
+
blockKey: `learned:${pred.type}:${pred.field}=${key}`,
|
|
297
|
+
rows: blockRows,
|
|
298
|
+
strategy: "learned",
|
|
299
|
+
depth: p,
|
|
300
|
+
});
|
|
301
|
+
}
|
|
302
|
+
}
|
|
303
|
+
|
|
304
|
+
return out;
|
|
305
|
+
}
|
|
@@ -0,0 +1,221 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* lineage.ts — Provenance tracking for golden records.
|
|
3
|
+
* Edge-safe: no `node:` imports.
|
|
4
|
+
*
|
|
5
|
+
* Ports goldenmatch/core/lineage.py. Records which source rows contributed
|
|
6
|
+
* each golden-record field, with the survivorship strategy and confidence.
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
import type { ClusterInfo, DedupeResult, Row } from "./types.js";
|
|
10
|
+
|
|
11
|
+
// ---------------------------------------------------------------------------
|
|
12
|
+
// Types
|
|
13
|
+
// ---------------------------------------------------------------------------
|
|
14
|
+
|
|
15
|
+
/** Where one golden-record field value came from, and how it was chosen. */
export interface FieldProvenanceEntry {
  // __row_id__ of the source row that contributed the value.
  readonly sourceRowId: number;
  // Survivorship strategy used to pick the value (e.g. "most_complete").
  readonly strategy: string;
  // Confidence attached to the pick; derived from the cluster confidence.
  readonly confidence: number;
}

/** Provenance for one golden record: which rows merged into it and why. */
export interface LineageEdge {
  // Cluster the golden record was built from.
  readonly clusterId: number;
  // Every member row of that cluster.
  readonly sourceRowIds: readonly number[];
  // __row_id__ of the golden record (-1 when it carries none).
  readonly goldenRowId: number;
  // Per-column provenance, keyed by column name.
  readonly fieldProvenance: Readonly<Record<string, FieldProvenanceEntry>>;
  // When the lineage bundle was built (ISO-8601).
  readonly timestamp: string;
  /**
   * Human-readable summary of this lineage edge. Only populated when
   * `buildLineage` is called with `{ naturalLanguage: true }`.
   */
  readonly naturalLanguage?: string;
}

/** A full lineage snapshot: one edge per golden record. */
export interface LineageBundle {
  readonly edges: readonly LineageEdge[];
  // When the bundle was built (ISO-8601); shared by all edges.
  readonly timestamp: string;
  // Number of edges (golden records with resolvable clusters).
  readonly recordCount: number;
}
|
|
39
|
+
|
|
40
|
+
// ---------------------------------------------------------------------------
|
|
41
|
+
// Helpers
|
|
42
|
+
// ---------------------------------------------------------------------------
|
|
43
|
+
|
|
44
|
+
function isoTimestamp(): string {
|
|
45
|
+
// Edge-safe: Date works in browser/workers/edge runtimes.
|
|
46
|
+
return new Date().toISOString();
|
|
47
|
+
}
|
|
48
|
+
|
|
49
|
+
function getRowId(row: Row): number | null {
|
|
50
|
+
const raw = row["__row_id__"];
|
|
51
|
+
if (typeof raw === "number") return raw;
|
|
52
|
+
if (typeof raw === "string") {
|
|
53
|
+
const n = Number(raw);
|
|
54
|
+
return Number.isFinite(n) ? n : null;
|
|
55
|
+
}
|
|
56
|
+
return null;
|
|
57
|
+
}
|
|
58
|
+
|
|
59
|
+
function getClusterId(row: Row): number | null {
|
|
60
|
+
const raw = row["__cluster_id__"];
|
|
61
|
+
if (typeof raw === "number") return raw;
|
|
62
|
+
if (typeof raw === "string") {
|
|
63
|
+
const n = Number(raw);
|
|
64
|
+
return Number.isFinite(n) ? n : null;
|
|
65
|
+
}
|
|
66
|
+
return null;
|
|
67
|
+
}
|
|
68
|
+
|
|
69
|
+
function renderNaturalLanguage(edge: Omit<LineageEdge, "naturalLanguage">): string {
|
|
70
|
+
const numSources = edge.sourceRowIds.length;
|
|
71
|
+
const entries = Object.entries(edge.fieldProvenance);
|
|
72
|
+
const numFields = entries.length;
|
|
73
|
+
let strongest: [string, FieldProvenanceEntry] | null = null;
|
|
74
|
+
for (const entry of entries) {
|
|
75
|
+
if (strongest === null || entry[1].confidence > strongest[1].confidence) {
|
|
76
|
+
strongest = entry;
|
|
77
|
+
}
|
|
78
|
+
}
|
|
79
|
+
const strongestDesc = strongest
|
|
80
|
+
? `${strongest[0]} from row ${strongest[1].sourceRowId} via ${strongest[1].strategy} (conf ${strongest[1].confidence.toFixed(2)})`
|
|
81
|
+
: "no field provenance";
|
|
82
|
+
return `Cluster ${edge.clusterId}: merged ${numSources} source records into golden row ${edge.goldenRowId} across ${numFields} fields. Strongest contribution: ${strongestDesc}.`;
|
|
83
|
+
}
|
|
84
|
+
|
|
85
|
+
/**
 * Confidence attached to each field-provenance entry.
 *
 * Base confidence is the cluster's own confidence; a small bonus
 * (+0.05, capped at 1) is added for the "source_priority" and
 * "most_complete" strategies.
 *
 * NOTE(review): a prior comment named first_non_null as a bonus strategy,
 * but the code checks source_priority — confirm which set should earn the
 * bonus.
 */
function computeConfidence(cluster: ClusterInfo, strategy: string): number {
  const base = cluster.confidence;
  if (strategy === "source_priority" || strategy === "most_complete") {
    // Clamp so the bonus never pushes confidence above 1.
    return Math.min(1, base + 0.05);
  }
  return base;
}
|
|
94
|
+
|
|
95
|
+
// ---------------------------------------------------------------------------
|
|
96
|
+
// Build lineage
|
|
97
|
+
// ---------------------------------------------------------------------------
|
|
98
|
+
|
|
99
|
+
/** Options for {@link buildLineage}. */
export interface BuildLineageOptions {
  /**
   * When `true`, every emitted `LineageEdge` gets a human-readable
   * `naturalLanguage` summary describing which rows were merged, how many
   * fields carry provenance, and the strongest contributing field. Zero LLM
   * cost — purely template-based.
   */
  readonly naturalLanguage?: boolean;
  /**
   * Survivorship strategy recorded for fields with no per-field rule.
   * Falls back to the result config's goldenRules.defaultStrategy, then
   * "most_complete".
   */
  readonly defaultStrategy?: string;
}
|
|
109
|
+
|
|
110
|
+
/**
 * Build a lineage bundle from a DedupeResult.
 *
 * The resulting bundle has one edge per golden record, with field-level
 * provenance keyed by column name. Source row IDs include every member of
 * the cluster the golden record came from.
 *
 * Pass `{ naturalLanguage: true }` to populate a human-readable summary on
 * each edge (see {@link LineageEdge.naturalLanguage}).
 *
 * Golden records without a resolvable `__cluster_id__`, or whose cluster
 * is absent from `result.clusters`, are silently skipped.
 *
 * @param result - A completed dedupe run (dupes, unique, goldenRecords, clusters).
 * @param options - See {@link BuildLineageOptions}.
 */
export function buildLineage(
  result: DedupeResult,
  options?: BuildLineageOptions,
): LineageBundle {
  // Strategy precedence: explicit option > config default > "most_complete".
  const defaultStrategy =
    options?.defaultStrategy ??
    result.config.goldenRules?.defaultStrategy ??
    "most_complete";

  // One shared timestamp for the bundle and every edge in it.
  const timestamp = isoTimestamp();
  const edges: LineageEdge[] = [];

  // Pre-index dupes by row_id → row for quick field lookup.
  // Rows lacking a numeric __row_id__ cannot be indexed and are skipped.
  const rowById = new Map<number, Row>();
  for (const r of result.dupes) {
    const id = getRowId(r);
    if (id !== null) rowById.set(id, r);
  }
  for (const r of result.unique) {
    const id = getRowId(r);
    if (id !== null) rowById.set(id, r);
  }

  for (const golden of result.goldenRecords) {
    const clusterId = getClusterId(golden);
    // -1 is a sentinel for a golden record without its own __row_id__.
    const goldenRowId = getRowId(golden) ?? -1;
    if (clusterId === null) continue;

    const cluster = result.clusters.get(clusterId);
    if (!cluster) continue;

    const fieldProvenance: Record<string, FieldProvenanceEntry> = {};
    // Same confidence is applied to every field of this golden record.
    const confidence = computeConfidence(cluster, defaultStrategy);

    for (const [key, value] of Object.entries(golden)) {
      // Skip bookkeeping columns (__row_id__, __cluster_id__, ...) and
      // fields with no surviving value.
      if (key.startsWith("__")) continue;
      if (value === null || value === undefined) continue;

      // Locate which member row contributed this value.
      // Strict === comparison: object-valued fields match only by reference;
      // when no member matches, the golden row itself is credited.
      let sourceRowId = goldenRowId;
      for (const memberId of cluster.members) {
        const memberRow = rowById.get(memberId);
        if (memberRow && memberRow[key] === value) {
          sourceRowId = memberId;
          break;
        }
      }

      // Per-field strategy override from config, else the run default.
      const fieldStrategy =
        result.config.goldenRules?.fieldRules?.[key]?.strategy ??
        defaultStrategy;

      fieldProvenance[key] = {
        sourceRowId,
        strategy: fieldStrategy,
        confidence,
      };
    }

    const base: Omit<LineageEdge, "naturalLanguage"> = {
      clusterId,
      sourceRowIds: [...cluster.members],
      goldenRowId,
      fieldProvenance,
      timestamp,
    };
    // naturalLanguage is only attached when explicitly requested.
    const edge: LineageEdge = options?.naturalLanguage
      ? { ...base, naturalLanguage: renderNaturalLanguage(base) }
      : base;
    edges.push(edge);
  }

  return {
    edges,
    timestamp,
    recordCount: edges.length,
  };
}
|
|
198
|
+
|
|
199
|
+
// ---------------------------------------------------------------------------
|
|
200
|
+
// (De)serialization
|
|
201
|
+
// ---------------------------------------------------------------------------
|
|
202
|
+
|
|
203
|
+
/** Serialize a lineage bundle to stable, human-readable JSON. */
|
|
204
|
+
export function lineageToJson(bundle: LineageBundle): string {
|
|
205
|
+
return JSON.stringify(bundle, null, 2);
|
|
206
|
+
}
|
|
207
|
+
|
|
208
|
+
/** Parse a lineage bundle from JSON. Does not validate schema. */
|
|
209
|
+
export function lineageFromJson(json: string): LineageBundle {
|
|
210
|
+
const parsed = JSON.parse(json) as unknown;
|
|
211
|
+
if (
|
|
212
|
+
typeof parsed !== "object" ||
|
|
213
|
+
parsed === null ||
|
|
214
|
+
!("edges" in parsed) ||
|
|
215
|
+
!("timestamp" in parsed) ||
|
|
216
|
+
!("recordCount" in parsed)
|
|
217
|
+
) {
|
|
218
|
+
throw new Error("Invalid lineage bundle: missing required fields");
|
|
219
|
+
}
|
|
220
|
+
return parsed as LineageBundle;
|
|
221
|
+
}
|