goldenmatch 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +140 -0
- package/dist/cli.cjs +6079 -0
- package/dist/cli.cjs.map +1 -0
- package/dist/cli.d.cts +1 -0
- package/dist/cli.d.ts +1 -0
- package/dist/cli.js +6076 -0
- package/dist/cli.js.map +1 -0
- package/dist/core/index.cjs +8449 -0
- package/dist/core/index.cjs.map +1 -0
- package/dist/core/index.d.cts +1972 -0
- package/dist/core/index.d.ts +1972 -0
- package/dist/core/index.js +8318 -0
- package/dist/core/index.js.map +1 -0
- package/dist/index.cjs +8449 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +2 -0
- package/dist/index.d.ts +2 -0
- package/dist/index.js +8318 -0
- package/dist/index.js.map +1 -0
- package/dist/node/backends/score-worker.cjs +934 -0
- package/dist/node/backends/score-worker.cjs.map +1 -0
- package/dist/node/backends/score-worker.d.cts +14 -0
- package/dist/node/backends/score-worker.d.ts +14 -0
- package/dist/node/backends/score-worker.js +932 -0
- package/dist/node/backends/score-worker.js.map +1 -0
- package/dist/node/index.cjs +11430 -0
- package/dist/node/index.cjs.map +1 -0
- package/dist/node/index.d.cts +554 -0
- package/dist/node/index.d.ts +554 -0
- package/dist/node/index.js +11277 -0
- package/dist/node/index.js.map +1 -0
- package/dist/types-DhUdX5Rc.d.cts +304 -0
- package/dist/types-DhUdX5Rc.d.ts +304 -0
- package/examples/01-basic-dedupe.ts +60 -0
- package/examples/02-match-two-datasets.ts +48 -0
- package/examples/03-csv-file-pipeline.ts +62 -0
- package/examples/04-string-scoring.ts +63 -0
- package/examples/05-custom-config.ts +94 -0
- package/examples/06-probabilistic-fs.ts +72 -0
- package/examples/07-pprl-privacy.ts +76 -0
- package/examples/08-streaming.ts +79 -0
- package/examples/09-llm-scorer.ts +79 -0
- package/examples/10-explain.ts +60 -0
- package/examples/11-evaluate.ts +61 -0
- package/examples/README.md +53 -0
- package/package.json +66 -0
- package/src/cli.ts +372 -0
- package/src/core/ann-blocker.ts +593 -0
- package/src/core/api.ts +220 -0
- package/src/core/autoconfig.ts +363 -0
- package/src/core/autofix.ts +102 -0
- package/src/core/blocker.ts +655 -0
- package/src/core/cluster.ts +699 -0
- package/src/core/compare-clusters.ts +176 -0
- package/src/core/config/loader.ts +869 -0
- package/src/core/cross-encoder.ts +614 -0
- package/src/core/data.ts +430 -0
- package/src/core/domain.ts +277 -0
- package/src/core/embedder.ts +562 -0
- package/src/core/evaluate.ts +156 -0
- package/src/core/explain.ts +352 -0
- package/src/core/golden.ts +524 -0
- package/src/core/graph-er.ts +371 -0
- package/src/core/index.ts +314 -0
- package/src/core/ingest.ts +112 -0
- package/src/core/learned-blocking.ts +305 -0
- package/src/core/lineage.ts +221 -0
- package/src/core/llm/budget.ts +258 -0
- package/src/core/llm/cluster.ts +542 -0
- package/src/core/llm/scorer.ts +396 -0
- package/src/core/match-one.ts +95 -0
- package/src/core/matchkey.ts +97 -0
- package/src/core/memory/corrections.ts +179 -0
- package/src/core/memory/learner.ts +218 -0
- package/src/core/memory/store.ts +114 -0
- package/src/core/pipeline.ts +366 -0
- package/src/core/pprl/protocol.ts +216 -0
- package/src/core/probabilistic.ts +511 -0
- package/src/core/profiler.ts +212 -0
- package/src/core/quality.ts +197 -0
- package/src/core/review-queue.ts +177 -0
- package/src/core/scorer.ts +855 -0
- package/src/core/sensitivity.ts +196 -0
- package/src/core/standardize.ts +279 -0
- package/src/core/streaming.ts +128 -0
- package/src/core/transforms.ts +599 -0
- package/src/core/types.ts +570 -0
- package/src/core/validate.ts +243 -0
- package/src/index.ts +8 -0
- package/src/node/a2a/server.ts +470 -0
- package/src/node/api/server.ts +412 -0
- package/src/node/backends/duckdb.ts +130 -0
- package/src/node/backends/score-worker.ts +41 -0
- package/src/node/backends/workers.ts +212 -0
- package/src/node/config-file.ts +66 -0
- package/src/node/connectors/base.ts +57 -0
- package/src/node/connectors/bigquery.ts +61 -0
- package/src/node/connectors/databricks.ts +69 -0
- package/src/node/connectors/file.ts +350 -0
- package/src/node/connectors/hubspot.ts +62 -0
- package/src/node/connectors/index.ts +43 -0
- package/src/node/connectors/salesforce.ts +93 -0
- package/src/node/connectors/snowflake.ts +73 -0
- package/src/node/db/postgres.ts +173 -0
- package/src/node/db/sync.ts +103 -0
- package/src/node/dedupe-file.ts +156 -0
- package/src/node/index.ts +89 -0
- package/src/node/mcp/server.ts +940 -0
- package/src/node/tui/app.ts +756 -0
- package/src/node/tui/index.ts +6 -0
- package/src/node/tui/widgets.ts +128 -0
- package/tests/parity/scorer-ground-truth.test.ts +118 -0
- package/tests/smoke.test.ts +46 -0
- package/tests/unit/a2a-server.test.ts +175 -0
- package/tests/unit/ann-blocker.test.ts +117 -0
- package/tests/unit/api-server.test.ts +239 -0
- package/tests/unit/api.test.ts +77 -0
- package/tests/unit/autoconfig.test.ts +103 -0
- package/tests/unit/autofix.test.ts +71 -0
- package/tests/unit/blocker.test.ts +164 -0
- package/tests/unit/buildBlocksAsync.test.ts +63 -0
- package/tests/unit/cluster.test.ts +213 -0
- package/tests/unit/compare-clusters.test.ts +42 -0
- package/tests/unit/config-loader.test.ts +301 -0
- package/tests/unit/connectors-base.test.ts +48 -0
- package/tests/unit/cross-encoder-model.test.ts +198 -0
- package/tests/unit/cross-encoder.test.ts +173 -0
- package/tests/unit/db-connectors.test.ts +37 -0
- package/tests/unit/domain.test.ts +80 -0
- package/tests/unit/embedder.test.ts +151 -0
- package/tests/unit/evaluate.test.ts +85 -0
- package/tests/unit/explain.test.ts +73 -0
- package/tests/unit/golden.test.ts +97 -0
- package/tests/unit/graph-er.test.ts +173 -0
- package/tests/unit/hnsw-ann.test.ts +283 -0
- package/tests/unit/hubspot-connector.test.ts +118 -0
- package/tests/unit/ingest.test.ts +97 -0
- package/tests/unit/learned-blocking.test.ts +134 -0
- package/tests/unit/lineage.test.ts +135 -0
- package/tests/unit/match-one.test.ts +129 -0
- package/tests/unit/matchkey.test.ts +97 -0
- package/tests/unit/mcp-server.test.ts +183 -0
- package/tests/unit/memory.test.ts +119 -0
- package/tests/unit/pipeline.test.ts +118 -0
- package/tests/unit/pprl-protocol.test.ts +381 -0
- package/tests/unit/probabilistic.test.ts +494 -0
- package/tests/unit/profiler.test.ts +68 -0
- package/tests/unit/review-queue.test.ts +68 -0
- package/tests/unit/salesforce-connector.test.ts +148 -0
- package/tests/unit/scorer.test.ts +301 -0
- package/tests/unit/sensitivity.test.ts +154 -0
- package/tests/unit/standardize.test.ts +84 -0
- package/tests/unit/streaming.test.ts +82 -0
- package/tests/unit/transforms.test.ts +208 -0
- package/tests/unit/tui-widgets.test.ts +42 -0
- package/tests/unit/tui.test.ts +24 -0
- package/tests/unit/validate.test.ts +145 -0
- package/tests/unit/workers-parallel.test.ts +99 -0
- package/tests/unit/workers.test.ts +74 -0
- package/tsconfig.json +25 -0
- package/tsup.config.ts +37 -0
- package/vitest.config.ts +11 -0
|
@@ -0,0 +1,524 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* golden.ts — Golden record builder with per-field merge strategies.
|
|
3
|
+
* Edge-safe: no Node.js imports, pure TypeScript only.
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
import type {
|
|
7
|
+
ClusterInfo,
|
|
8
|
+
ClusterProvenance,
|
|
9
|
+
FieldProvenance,
|
|
10
|
+
GoldenFieldRule,
|
|
11
|
+
GoldenRulesConfig,
|
|
12
|
+
Row,
|
|
13
|
+
} from "./types.js";
|
|
14
|
+
|
|
15
|
+
// ---------------------------------------------------------------------------
|
|
16
|
+
// Constants
|
|
17
|
+
// ---------------------------------------------------------------------------
|
|
18
|
+
|
|
19
|
+
/**
 * Column-name prefixes reserved for pipeline bookkeeping. Any column whose
 * name begins with one of these is excluded from golden-record merging.
 */
const INTERNAL_PREFIXES: readonly string[] = [
  "__row_id__",
  "__source__",
  "__block_key__",
  "__mk_",
  "__cluster_id__",
  "__golden_confidence__",
];

/**
 * Report whether a column is an internal bookkeeping column.
 *
 * @param col - Column name to classify.
 * @returns `true` when `col` starts with any entry of `INTERNAL_PREFIXES`
 *   (an exact match counts, since every string starts with itself).
 */
function isInternal(col: string): boolean {
  for (const reserved of INTERNAL_PREFIXES) {
    if (col.startsWith(reserved)) {
      return true;
    }
  }
  return false;
}
|
|
34
|
+
|
|
35
|
+
// ---------------------------------------------------------------------------
|
|
36
|
+
// MergeField result
|
|
37
|
+
// ---------------------------------------------------------------------------
|
|
38
|
+
|
|
39
|
+
/**
 * Outcome of merging one field across a set of candidate values.
 */
export interface MergeFieldResult {
  /** The chosen value; `null` when no candidate could be selected. */
  readonly value: unknown;
  /** Score attached to the choice by the strategy; 0.0 when nothing was selected. */
  readonly confidence: number;
  /** Position of the chosen value in the input `values` array, or `null` when nothing was selected. */
  readonly sourceIndex: number | null;
}
|
|
44
|
+
|
|
45
|
+
// ---------------------------------------------------------------------------
|
|
46
|
+
// MergeField options
|
|
47
|
+
// ---------------------------------------------------------------------------
|
|
48
|
+
|
|
49
|
+
/**
 * Optional per-row side information for `mergeField`. Every array is read
 * by the same index as the `values` array passed alongside it.
 */
export interface MergeFieldOptions {
  /** Source label of each row; the `source_priority` strategy throws without it. */
  readonly sources?: readonly string[];
  /** Date of each row; the `most_recent` strategy throws without it. */
  readonly dates?: readonly unknown[];
  /** Quality weight of each row; rows past the end of the array are weighted 1.0. */
  readonly qualityWeights?: readonly number[];
}
|
|
54
|
+
|
|
55
|
+
// ---------------------------------------------------------------------------
|
|
56
|
+
// mergeField
|
|
57
|
+
// ---------------------------------------------------------------------------
|
|
58
|
+
|
|
59
|
+
/**
 * Collapse the candidate values of one field into a single value.
 *
 * Two shortcuts run before the strategy is consulted:
 * - no non-null candidate -> `{ value: null, confidence: 0.0, sourceIndex: null }`
 * - all non-null candidates equal -> that value with confidence 1.0 and the
 *   index of its first occurrence
 *
 * Otherwise the work is delegated according to `rule.strategy`:
 * - most_complete: longest string form wins; ties broken by quality weight
 * - majority_vote: most frequent value wins; weighted by quality if available
 * - source_priority: first non-null following the rule's source priority list
 * - most_recent: value whose date is the most recent
 * - first_non_null: first non-null; highest quality weight when weights are given
 *
 * @param values - Candidate values, one per row (null/undefined entries are ignored).
 * @param rule - Field rule naming the strategy (and its strategy-specific settings).
 * @param options - Per-row sources, dates and quality weights, aligned with `values`.
 * @returns The chosen value, its confidence and its index in `values`.
 * @throws Error when the strategy is not recognised, or when the selected
 *   strategy needs `sources` / `dates` and they were not supplied.
 */
export function mergeField(
  values: readonly unknown[],
  rule: GoldenFieldRule,
  options?: MergeFieldOptions,
): MergeFieldResult {
  // Pair every usable candidate with its position in the input.
  const nonNull: [number, unknown][] = [];
  values.forEach((candidate, position) => {
    if (candidate != null) {
      nonNull.push([position, candidate]);
    }
  });

  const first = nonNull[0];
  if (first === undefined) {
    return { value: null, confidence: 0.0, sourceIndex: null };
  }

  // Unanimous candidates need no strategy at all.
  const distinct = new Set<unknown>();
  for (const [, candidate] of nonNull) {
    distinct.add(candidate);
  }
  if (distinct.size === 1) {
    return { value: first[1], confidence: 1.0, sourceIndex: first[0] };
  }

  const strategy = rule.strategy;
  const weights = options?.qualityWeights;

  if (strategy === "most_complete") {
    return _mostComplete(nonNull, weights);
  }
  if (strategy === "majority_vote") {
    return _majorityVote(nonNull, weights);
  }
  if (strategy === "source_priority") {
    return _sourcePriority(values, rule, options?.sources);
  }
  if (strategy === "most_recent") {
    return _mostRecent(values, options?.dates);
  }
  if (strategy === "first_non_null") {
    return _firstNonNull(nonNull, weights);
  }
  throw new Error(`Unknown strategy: ${strategy}`);
}
|
|
108
|
+
|
|
109
|
+
// ---------------------------------------------------------------------------
|
|
110
|
+
// Strategy implementations
|
|
111
|
+
// ---------------------------------------------------------------------------
|
|
112
|
+
|
|
113
|
+
/**
 * `most_complete` strategy: prefer the candidate whose string form is longest.
 *
 * @param nonNull - `[index, value]` pairs of the non-null candidates.
 * @param qualityWeights - Optional per-row weights used only to break length ties.
 * @returns A unique longest candidate with confidence 1.0. On a tie, the
 *   highest-weighted finalist (earliest wins equal weights) with confidence
 *   `min(1, 0.7 * weight)`, or the earliest finalist with confidence 0.7 when
 *   no weights are given.
 */
function _mostComplete(
  nonNull: [number, unknown][],
  qualityWeights?: readonly number[],
): MergeFieldResult {
  // One pass: keep only the candidates sharing the greatest string length.
  // (A running maximum also avoids spreading a huge array into Math.max.)
  let longestLen = 0;
  let finalists: [number, unknown][] = [];
  for (const [rowIdx, raw] of nonNull) {
    const len = String(raw).length;
    if (len > longestLen) {
      longestLen = len;
      finalists = [[rowIdx, raw]];
    } else if (len === longestLen) {
      finalists.push([rowIdx, raw]);
    }
  }

  const leader = finalists[0]!;
  if (finalists.length === 1) {
    return { value: leader[1], confidence: 1.0, sourceIndex: leader[0] };
  }
  if (!qualityWeights) {
    return { value: leader[1], confidence: 0.7, sourceIndex: leader[0] };
  }

  // Rows beyond the end of the weights array count as weight 1.0.
  const weightAt = (rowIdx: number): number =>
    rowIdx < qualityWeights.length ? qualityWeights[rowIdx]! : 1.0;

  let champion = leader;
  let championWeight = weightAt(leader[0]);
  for (const contender of finalists.slice(1)) {
    const contenderWeight = weightAt(contender[0]);
    // The current champion survives whenever its weight is >= the contender's.
    if (championWeight >= contenderWeight) continue;
    champion = contender;
    championWeight = contenderWeight;
  }

  return {
    value: champion[1],
    confidence: Math.min(1.0, 0.7 * championWeight),
    sourceIndex: champion[0],
  };
}
|
|
146
|
+
|
|
147
|
+
/**
 * `majority_vote` strategy: the value with the greatest total vote wins.
 *
 * Each candidate casts a vote equal to its quality weight, or 1.0 when no
 * weights are supplied (or the row lies past the end of the weights array).
 * Values are grouped with `Map` key equality, so `NaN` candidates group
 * together. Ties go to the value that appeared first.
 *
 * @param nonNull - `[index, value]` pairs of the non-null candidates.
 * @param qualityWeights - Optional per-row vote weights.
 * @returns The winning value, its share of the total vote as confidence, and
 *   the index of its first occurrence. When no value can win (no candidates,
 *   or no comparable weights) the null result is returned.
 */
function _majorityVote(
  nonNull: [number, unknown][],
  qualityWeights?: readonly number[],
): MergeFieldResult {
  // Per distinct value: accumulated vote and the row where it first appeared.
  // Recording the first index here (rather than searching for the winner with
  // `===` afterwards) keeps NaN winners from failing the lookup.
  const tally = new Map<unknown, { weight: number; firstIndex: number }>();
  for (const [rowIdx, candidate] of nonNull) {
    const vote = qualityWeights
      ? rowIdx < qualityWeights.length
        ? qualityWeights[rowIdx]!
        : 1.0
      : 1.0;
    const seen = tally.get(candidate);
    if (seen === undefined) {
      tally.set(candidate, { weight: vote, firstIndex: rowIdx });
    } else {
      seen.weight += vote;
    }
  }

  // Map iteration follows insertion order, so a strict `>` keeps the
  // earliest value on ties.
  let best: { value: unknown; firstIndex: number } | null = null;
  let bestWeight = -Infinity;
  let totalWeight = 0;
  for (const [candidate, entry] of tally) {
    totalWeight += entry.weight;
    if (entry.weight > bestWeight) {
      bestWeight = entry.weight;
      best = { value: candidate, firstIndex: entry.firstIndex };
    }
  }

  if (best === null) {
    return { value: null, confidence: 0.0, sourceIndex: null };
  }

  return {
    value: best.value,
    confidence: totalWeight > 0 ? bestWeight / totalWeight : 0.0,
    sourceIndex: best.firstIndex,
  };
}
|
|
199
|
+
|
|
200
|
+
/**
 * `source_priority` strategy: walk the rule's priority list and take the
 * first source that contributed a non-null value.
 *
 * A cluster may hold several rows from the same source. For each source the
 * first non-null value is used, so an empty row does not hide a populated row
 * from that same source.
 *
 * @param values - Candidate values, one per row.
 * @param rule - Field rule; `rule.sourcePriority` lists sources from most to
 *   least preferred (treated as empty when absent).
 * @param sources - Source label of each row, aligned with `values`.
 * @returns The chosen value with confidence `max(0.1, 1 - 0.1 * rank)`, where
 *   `rank` is the source's position in the priority list. The null result is
 *   returned when no listed source has a value.
 * @throws Error when `sources` is not provided.
 */
function _sourcePriority(
  values: readonly unknown[],
  rule: GoldenFieldRule,
  sources?: readonly string[],
): MergeFieldResult {
  if (!sources) {
    throw new Error("source_priority strategy requires sources list");
  }

  // source label -> [row index, value] of its first non-null contribution
  const firstBySource = new Map<string, [number, unknown]>();
  for (let i = 0; i < sources.length; i++) {
    const candidate = values[i];
    const label = sources[i]!;
    if (candidate != null && !firstBySource.has(label)) {
      firstBySource.set(label, [i, candidate]);
    }
  }

  const priority = rule.sourcePriority ?? [];
  for (let rank = 0; rank < priority.length; rank++) {
    const hit = firstBySource.get(priority[rank]!);
    if (hit !== undefined) {
      // Confidence decays by 0.1 per priority rank, floored at 0.1.
      const conf = Math.max(0.1, 1.0 - rank * 0.1);
      return { value: hit[1], confidence: conf, sourceIndex: hit[0] };
    }
  }

  // No source on the priority list supplied a value.
  return { value: null, confidence: 0.0, sourceIndex: null };
}
|
|
233
|
+
|
|
234
|
+
/**
 * `most_recent` strategy: take the value whose accompanying date is greatest.
 *
 * Only rows where both the value and the date are non-null take part. Dates
 * are ordered with the `<` / `>` operators, which works for ISO strings,
 * numbers and `Date` objects alike.
 *
 * @param values - Candidate values, one per row.
 * @param dates - Date of each row, aligned with `values`.
 * @returns The value carrying the latest date (earliest row on ties), with
 *   confidence 1.0 when that date is unique among participants and 0.5
 *   otherwise. The null result is returned when no row has both a value and
 *   a date.
 * @throws Error when `dates` is not provided.
 */
function _mostRecent(
  values: readonly unknown[],
  dates?: readonly unknown[],
): MergeFieldResult {
  if (!dates) {
    throw new Error("most_recent strategy requires dates list");
  }

  // [row index, date, value] for every row that has both a value and a date
  const indexed: [number, unknown, unknown][] = [];
  for (let i = 0; i < values.length; i++) {
    if (values[i] != null && dates[i] != null) {
      indexed.push([i, dates[i], values[i]]);
    }
  }

  if (indexed.length === 0) {
    return { value: null, confidence: 0.0, sourceIndex: null };
  }

  // Newest first; the sort is stable, so equal dates keep their row order.
  indexed.sort((a, b) => {
    if (a[1]! > b[1]!) return -1;
    if (a[1]! < b[1]!) return 1;
    return 0;
  });

  const [topIndex, topDate, topValue] = indexed[0]!;

  // Two Date objects for the same instant are different references, so `===`
  // alone would miss the tie; compare their timestamps instead.
  const sameDate = (candidate: unknown): boolean =>
    candidate instanceof Date && topDate instanceof Date
      ? candidate.getTime() === topDate.getTime()
      : candidate === topDate;

  const tiedCount = indexed.filter(([, d]) => sameDate(d)).length;
  const conf = tiedCount === 1 ? 1.0 : 0.5;

  return { value: topValue, confidence: conf, sourceIndex: topIndex };
}
|
|
266
|
+
|
|
267
|
+
/**
 * `first_non_null` strategy: take the earliest non-null candidate, unless
 * quality weights are given, in which case the highest-weighted candidate is
 * taken (earliest wins on equal weights).
 *
 * @param nonNull - `[index, value]` pairs of the non-null candidates, in row order.
 * @param qualityWeights - Optional per-row weights; rows past the end of the
 *   array are weighted 1.0.
 * @returns The chosen candidate, always with confidence 0.6.
 */
function _firstNonNull(
  nonNull: [number, unknown][],
  qualityWeights?: readonly number[],
): MergeFieldResult {
  let chosen = nonNull[0]!;

  if (qualityWeights) {
    const weightOf = (rowIdx: number): number =>
      rowIdx < qualityWeights.length ? qualityWeights[rowIdx]! : 1.0;

    let chosenWeight = weightOf(chosen[0]);
    for (const candidate of nonNull.slice(1)) {
      const candidateWeight = weightOf(candidate[0]);
      // Strictly greater only, so the earliest candidate wins ties.
      if (candidateWeight > chosenWeight) {
        chosenWeight = candidateWeight;
        chosen = candidate;
      }
    }
  }

  return { value: chosen[1], confidence: 0.6, sourceIndex: chosen[0] };
}
|
|
295
|
+
|
|
296
|
+
// ---------------------------------------------------------------------------
|
|
297
|
+
// GoldenRecord
|
|
298
|
+
// ---------------------------------------------------------------------------
|
|
299
|
+
|
|
300
|
+
/**
 * A merged ("golden") record for a single cluster.
 */
export interface GoldenRecord {
  /** Merged value and merge confidence for each non-internal column, keyed by column name. */
  readonly fields: Readonly<Record<string, { value: unknown; confidence: number }>>;
  /** Mean of the per-field confidences; 0.0 when the record has no fields. */
  readonly goldenConfidence: number;
}
|
|
304
|
+
|
|
305
|
+
// ---------------------------------------------------------------------------
|
|
306
|
+
// buildGoldenRecord
|
|
307
|
+
// ---------------------------------------------------------------------------
|
|
308
|
+
|
|
309
|
+
/**
 * Merge the rows of one cluster into a single golden record.
 *
 * Every column seen on any row is merged with `mergeField`, except internal
 * bookkeeping columns. The strategy comes from `rules.fieldRules[column]`,
 * falling back to `rules.defaultStrategy`.
 *
 * @param clusterRows - Rows belonging to one cluster.
 * @param rules - Golden rules config with the default strategy and per-field rules.
 * @param qualityScores - Optional map of `"rowId:column"` -> quality score;
 *   missing entries count as 1.0. Rows without `__row_id__` are looked up as row 0.
 * @returns The merged fields and the mean of their confidences. An empty
 *   cluster yields no fields and confidence 0.0.
 */
export function buildGoldenRecord(
  clusterRows: readonly Row[],
  rules: GoldenRulesConfig,
  qualityScores?: ReadonlyMap<string, number>,
): GoldenRecord {
  if (clusterRows.length === 0) {
    return { fields: {}, goldenConfidence: 0.0 };
  }

  // Union of column names across the cluster, in first-seen order.
  const columns = new Set<string>();
  clusterRows.forEach((row) => {
    Object.keys(row).forEach((name) => {
      columns.add(name);
    });
  });

  const rowIds: number[] = clusterRows.map(
    (row) => (row.__row_id__ as number) ?? 0,
  );

  const fields: Record<string, { value: unknown; confidence: number }> = {};
  let confidenceSum = 0;
  let mergedCount = 0;

  for (const col of columns) {
    if (isInternal(col)) continue;

    const values = clusterRows.map((row) => row[col] ?? null);

    // Explicit field rule wins; otherwise apply the configured default.
    const fieldRule: GoldenFieldRule =
      rules.fieldRules[col] ?? { strategy: rules.defaultStrategy as GoldenFieldRule["strategy"] };

    // Attach only the side inputs this strategy can use; absent keys stay absent.
    const mergeOpts: {
      sources?: string[];
      dates?: unknown[];
      qualityWeights?: number[];
    } = {};
    if (fieldRule.strategy === "source_priority") {
      mergeOpts.sources = clusterRows.map((row) => String(row.__source__ ?? ""));
    }
    if (fieldRule.strategy === "most_recent" && fieldRule.dateColumn) {
      const dateColumn = fieldRule.dateColumn;
      mergeOpts.dates = clusterRows.map((row) => row[dateColumn] ?? null);
    }
    if (qualityScores) {
      mergeOpts.qualityWeights = rowIds.map(
        (rid) => qualityScores.get(`${rid}:${col}`) ?? 1.0,
      );
    }

    const { value, confidence } = mergeField(values, fieldRule, mergeOpts);
    fields[col] = { value, confidence };
    confidenceSum += confidence;
    mergedCount += 1;
  }

  const goldenConfidence = mergedCount > 0 ? confidenceSum / mergedCount : 0.0;

  return { fields, goldenConfidence };
}
|
|
381
|
+
|
|
382
|
+
// ---------------------------------------------------------------------------
|
|
383
|
+
// buildGoldenRecordWithProvenance
|
|
384
|
+
// ---------------------------------------------------------------------------
|
|
385
|
+
|
|
386
|
+
/**
 * Result of `buildGoldenRecordWithProvenance`.
 */
export interface GoldenRecordWithProvenanceResult {
  /**
   * One merged row per cluster, tagged with the cluster ID and the mean
   * confidence of its merged fields.
   */
  readonly goldenRecords: readonly (Row & {
    __cluster_id__: number;
    __golden_confidence__: number;
  })[];
  /** Field-level provenance for each cluster, in the same order as `goldenRecords`. */
  readonly provenance: readonly ClusterProvenance[];
}
|
|
393
|
+
|
|
394
|
+
/**
 * Build one golden record per cluster and record, for every merged field,
 * which row supplied the value and what the alternatives were.
 *
 * Rows are grouped by `__cluster_id__` (rows without one fall into cluster 1)
 * and clusters are processed in ascending ID order. Internal bookkeeping
 * columns are not merged.
 *
 * @param allRows - All rows, carrying `__cluster_id__` and `__row_id__` columns.
 * @param rules - Golden rules config with the default strategy and per-field rules.
 * @param clusters - Cluster metadata keyed by cluster ID; supplies the quality
 *   label and confidence stored in provenance (defaults: `"strong"` and 0.0).
 * @param qualityScores - Optional `"rowId:column"` -> quality score map;
 *   missing entries count as 1.0.
 * @returns The golden rows and the matching per-cluster provenance.
 */
export function buildGoldenRecordWithProvenance(
  allRows: readonly Row[],
  rules: GoldenRulesConfig,
  clusters: ReadonlyMap<number, ClusterInfo>,
  qualityScores?: ReadonlyMap<string, number>,
): GoldenRecordWithProvenanceResult {
  type GoldenRow = Row & {
    __cluster_id__: number;
    __golden_confidence__: number;
  };

  // Bucket the rows by cluster.
  const rowsByCluster = new Map<number, Row[]>();
  for (const row of allRows) {
    const cid = (row.__cluster_id__ as number) ?? 1;
    const bucket = rowsByCluster.get(cid);
    if (bucket) {
      bucket.push(row);
    } else {
      rowsByCluster.set(cid, [row]);
    }
  }

  const goldenRecords: GoldenRow[] = [];
  const provenanceList: ClusterProvenance[] = [];

  const orderedIds = Array.from(rowsByCluster.keys()).sort((a, b) => a - b);
  for (const cid of orderedIds) {
    const clusterRows = rowsByCluster.get(cid)!;
    const cinfo = clusters.get(cid);
    const rowIds = clusterRows.map((row) => (row.__row_id__ as number) ?? 0);

    // Union of column names across the cluster, in first-seen order.
    const columns = new Set<string>();
    clusterRows.forEach((row) => {
      Object.keys(row).forEach((name) => {
        columns.add(name);
      });
    });

    const fieldProvenance: Record<string, FieldProvenance> = {};
    const goldenRow: Record<string, unknown> = { __cluster_id__: cid };
    let confidenceSum = 0;
    let mergedCount = 0;

    for (const col of columns) {
      if (isInternal(col)) continue;

      const values = clusterRows.map((row) => row[col] ?? null);

      // Explicit field rule wins; otherwise apply the configured default.
      const fieldRule: GoldenFieldRule =
        rules.fieldRules[col] ?? { strategy: rules.defaultStrategy as GoldenFieldRule["strategy"] };

      // Per-row quality for this column (1.0 wherever no score is known).
      const qualityOf = (rid: number): number =>
        qualityScores ? (qualityScores.get(`${rid}:${col}`) ?? 1.0) : 1.0;

      // Attach only the side inputs this strategy can use; absent keys stay absent.
      const mergeOpts: {
        sources?: string[];
        dates?: unknown[];
        qualityWeights?: number[];
      } = {};
      if (fieldRule.strategy === "source_priority") {
        mergeOpts.sources = clusterRows.map((row) => String(row.__source__ ?? ""));
      }
      if (fieldRule.strategy === "most_recent" && fieldRule.dateColumn) {
        const dateColumn = fieldRule.dateColumn;
        mergeOpts.dates = clusterRows.map((row) => row[dateColumn] ?? null);
      }
      if (qualityScores) {
        mergeOpts.qualityWeights = rowIds.map(qualityOf);
      }

      const result = mergeField(values, fieldRule, mergeOpts);
      confidenceSum += result.confidence;
      mergedCount += 1;

      // Attribute the value to the winning row; with no usable winner index
      // the first row of the cluster is recorded instead.
      const winner = result.sourceIndex;
      const sourceRowId =
        winner != null && winner < rowIds.length ? rowIds[winner]! : rowIds[0]!;

      fieldProvenance[col] = {
        value: result.value,
        sourceRowId,
        strategy: fieldRule.strategy,
        confidence: result.confidence,
        candidates: rowIds.map((rid, idx) => ({
          row_id: rid,
          value: values[idx],
          quality: qualityOf(rid),
        })),
      };

      goldenRow[col] = result.value;
    }

    goldenRow.__golden_confidence__ =
      mergedCount > 0 ? confidenceSum / mergedCount : 0.0;

    goldenRecords.push(goldenRow as GoldenRow);

    provenanceList.push({
      clusterId: cid,
      clusterQuality: cinfo?.clusterQuality ?? "strong",
      clusterConfidence: cinfo?.confidence ?? 0.0,
      fields: fieldProvenance,
    });
  }

  return { goldenRecords, provenance: provenanceList };
}
|