goldenmatch 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162) hide show
  1. package/README.md +140 -0
  2. package/dist/cli.cjs +6079 -0
  3. package/dist/cli.cjs.map +1 -0
  4. package/dist/cli.d.cts +1 -0
  5. package/dist/cli.d.ts +1 -0
  6. package/dist/cli.js +6076 -0
  7. package/dist/cli.js.map +1 -0
  8. package/dist/core/index.cjs +8449 -0
  9. package/dist/core/index.cjs.map +1 -0
  10. package/dist/core/index.d.cts +1972 -0
  11. package/dist/core/index.d.ts +1972 -0
  12. package/dist/core/index.js +8318 -0
  13. package/dist/core/index.js.map +1 -0
  14. package/dist/index.cjs +8449 -0
  15. package/dist/index.cjs.map +1 -0
  16. package/dist/index.d.cts +2 -0
  17. package/dist/index.d.ts +2 -0
  18. package/dist/index.js +8318 -0
  19. package/dist/index.js.map +1 -0
  20. package/dist/node/backends/score-worker.cjs +934 -0
  21. package/dist/node/backends/score-worker.cjs.map +1 -0
  22. package/dist/node/backends/score-worker.d.cts +14 -0
  23. package/dist/node/backends/score-worker.d.ts +14 -0
  24. package/dist/node/backends/score-worker.js +932 -0
  25. package/dist/node/backends/score-worker.js.map +1 -0
  26. package/dist/node/index.cjs +11430 -0
  27. package/dist/node/index.cjs.map +1 -0
  28. package/dist/node/index.d.cts +554 -0
  29. package/dist/node/index.d.ts +554 -0
  30. package/dist/node/index.js +11277 -0
  31. package/dist/node/index.js.map +1 -0
  32. package/dist/types-DhUdX5Rc.d.cts +304 -0
  33. package/dist/types-DhUdX5Rc.d.ts +304 -0
  34. package/examples/01-basic-dedupe.ts +60 -0
  35. package/examples/02-match-two-datasets.ts +48 -0
  36. package/examples/03-csv-file-pipeline.ts +62 -0
  37. package/examples/04-string-scoring.ts +63 -0
  38. package/examples/05-custom-config.ts +94 -0
  39. package/examples/06-probabilistic-fs.ts +72 -0
  40. package/examples/07-pprl-privacy.ts +76 -0
  41. package/examples/08-streaming.ts +79 -0
  42. package/examples/09-llm-scorer.ts +79 -0
  43. package/examples/10-explain.ts +60 -0
  44. package/examples/11-evaluate.ts +61 -0
  45. package/examples/README.md +53 -0
  46. package/package.json +66 -0
  47. package/src/cli.ts +372 -0
  48. package/src/core/ann-blocker.ts +593 -0
  49. package/src/core/api.ts +220 -0
  50. package/src/core/autoconfig.ts +363 -0
  51. package/src/core/autofix.ts +102 -0
  52. package/src/core/blocker.ts +655 -0
  53. package/src/core/cluster.ts +699 -0
  54. package/src/core/compare-clusters.ts +176 -0
  55. package/src/core/config/loader.ts +869 -0
  56. package/src/core/cross-encoder.ts +614 -0
  57. package/src/core/data.ts +430 -0
  58. package/src/core/domain.ts +277 -0
  59. package/src/core/embedder.ts +562 -0
  60. package/src/core/evaluate.ts +156 -0
  61. package/src/core/explain.ts +352 -0
  62. package/src/core/golden.ts +524 -0
  63. package/src/core/graph-er.ts +371 -0
  64. package/src/core/index.ts +314 -0
  65. package/src/core/ingest.ts +112 -0
  66. package/src/core/learned-blocking.ts +305 -0
  67. package/src/core/lineage.ts +221 -0
  68. package/src/core/llm/budget.ts +258 -0
  69. package/src/core/llm/cluster.ts +542 -0
  70. package/src/core/llm/scorer.ts +396 -0
  71. package/src/core/match-one.ts +95 -0
  72. package/src/core/matchkey.ts +97 -0
  73. package/src/core/memory/corrections.ts +179 -0
  74. package/src/core/memory/learner.ts +218 -0
  75. package/src/core/memory/store.ts +114 -0
  76. package/src/core/pipeline.ts +366 -0
  77. package/src/core/pprl/protocol.ts +216 -0
  78. package/src/core/probabilistic.ts +511 -0
  79. package/src/core/profiler.ts +212 -0
  80. package/src/core/quality.ts +197 -0
  81. package/src/core/review-queue.ts +177 -0
  82. package/src/core/scorer.ts +855 -0
  83. package/src/core/sensitivity.ts +196 -0
  84. package/src/core/standardize.ts +279 -0
  85. package/src/core/streaming.ts +128 -0
  86. package/src/core/transforms.ts +599 -0
  87. package/src/core/types.ts +570 -0
  88. package/src/core/validate.ts +243 -0
  89. package/src/index.ts +8 -0
  90. package/src/node/a2a/server.ts +470 -0
  91. package/src/node/api/server.ts +412 -0
  92. package/src/node/backends/duckdb.ts +130 -0
  93. package/src/node/backends/score-worker.ts +41 -0
  94. package/src/node/backends/workers.ts +212 -0
  95. package/src/node/config-file.ts +66 -0
  96. package/src/node/connectors/base.ts +57 -0
  97. package/src/node/connectors/bigquery.ts +61 -0
  98. package/src/node/connectors/databricks.ts +69 -0
  99. package/src/node/connectors/file.ts +350 -0
  100. package/src/node/connectors/hubspot.ts +62 -0
  101. package/src/node/connectors/index.ts +43 -0
  102. package/src/node/connectors/salesforce.ts +93 -0
  103. package/src/node/connectors/snowflake.ts +73 -0
  104. package/src/node/db/postgres.ts +173 -0
  105. package/src/node/db/sync.ts +103 -0
  106. package/src/node/dedupe-file.ts +156 -0
  107. package/src/node/index.ts +89 -0
  108. package/src/node/mcp/server.ts +940 -0
  109. package/src/node/tui/app.ts +756 -0
  110. package/src/node/tui/index.ts +6 -0
  111. package/src/node/tui/widgets.ts +128 -0
  112. package/tests/parity/scorer-ground-truth.test.ts +118 -0
  113. package/tests/smoke.test.ts +46 -0
  114. package/tests/unit/a2a-server.test.ts +175 -0
  115. package/tests/unit/ann-blocker.test.ts +117 -0
  116. package/tests/unit/api-server.test.ts +239 -0
  117. package/tests/unit/api.test.ts +77 -0
  118. package/tests/unit/autoconfig.test.ts +103 -0
  119. package/tests/unit/autofix.test.ts +71 -0
  120. package/tests/unit/blocker.test.ts +164 -0
  121. package/tests/unit/buildBlocksAsync.test.ts +63 -0
  122. package/tests/unit/cluster.test.ts +213 -0
  123. package/tests/unit/compare-clusters.test.ts +42 -0
  124. package/tests/unit/config-loader.test.ts +301 -0
  125. package/tests/unit/connectors-base.test.ts +48 -0
  126. package/tests/unit/cross-encoder-model.test.ts +198 -0
  127. package/tests/unit/cross-encoder.test.ts +173 -0
  128. package/tests/unit/db-connectors.test.ts +37 -0
  129. package/tests/unit/domain.test.ts +80 -0
  130. package/tests/unit/embedder.test.ts +151 -0
  131. package/tests/unit/evaluate.test.ts +85 -0
  132. package/tests/unit/explain.test.ts +73 -0
  133. package/tests/unit/golden.test.ts +97 -0
  134. package/tests/unit/graph-er.test.ts +173 -0
  135. package/tests/unit/hnsw-ann.test.ts +283 -0
  136. package/tests/unit/hubspot-connector.test.ts +118 -0
  137. package/tests/unit/ingest.test.ts +97 -0
  138. package/tests/unit/learned-blocking.test.ts +134 -0
  139. package/tests/unit/lineage.test.ts +135 -0
  140. package/tests/unit/match-one.test.ts +129 -0
  141. package/tests/unit/matchkey.test.ts +97 -0
  142. package/tests/unit/mcp-server.test.ts +183 -0
  143. package/tests/unit/memory.test.ts +119 -0
  144. package/tests/unit/pipeline.test.ts +118 -0
  145. package/tests/unit/pprl-protocol.test.ts +381 -0
  146. package/tests/unit/probabilistic.test.ts +494 -0
  147. package/tests/unit/profiler.test.ts +68 -0
  148. package/tests/unit/review-queue.test.ts +68 -0
  149. package/tests/unit/salesforce-connector.test.ts +148 -0
  150. package/tests/unit/scorer.test.ts +301 -0
  151. package/tests/unit/sensitivity.test.ts +154 -0
  152. package/tests/unit/standardize.test.ts +84 -0
  153. package/tests/unit/streaming.test.ts +82 -0
  154. package/tests/unit/transforms.test.ts +208 -0
  155. package/tests/unit/tui-widgets.test.ts +42 -0
  156. package/tests/unit/tui.test.ts +24 -0
  157. package/tests/unit/validate.test.ts +145 -0
  158. package/tests/unit/workers-parallel.test.ts +99 -0
  159. package/tests/unit/workers.test.ts +74 -0
  160. package/tsconfig.json +25 -0
  161. package/tsup.config.ts +37 -0
  162. package/vitest.config.ts +11 -0
@@ -0,0 +1,524 @@
1
+ /**
2
+ * golden.ts — Golden record builder with per-field merge strategies.
3
+ * Edge-safe: no Node.js imports, pure TypeScript only.
4
+ */
5
+
6
+ import type {
7
+ ClusterInfo,
8
+ ClusterProvenance,
9
+ FieldProvenance,
10
+ GoldenFieldRule,
11
+ GoldenRulesConfig,
12
+ Row,
13
+ } from "./types.js";
14
+
15
+ // ---------------------------------------------------------------------------
16
+ // Constants
17
+ // ---------------------------------------------------------------------------
18
+
19
/**
 * Column-name prefixes that mark a column as internal. Columns matching one
 * of these are skipped when golden records are assembled.
 */
const INTERNAL_PREFIXES = [
  "__row_id__",
  "__source__",
  "__block_key__",
  "__mk_",
  "__cluster_id__",
  "__golden_confidence__",
];

/**
 * Tell whether a column name is internal.
 *
 * @param col - Column name to classify.
 * @returns `true` when `col` is exactly `"__mk_"` or begins with any entry of
 *   `INTERNAL_PREFIXES`; `false` otherwise.
 */
function isInternal(col: string): boolean {
  if (col === "__mk_") {
    return true;
  }
  for (const reserved of INTERNAL_PREFIXES) {
    if (col.startsWith(reserved)) {
      return true;
    }
  }
  return false;
}
34
+
35
+ // ---------------------------------------------------------------------------
36
+ // MergeField result
37
+ // ---------------------------------------------------------------------------
38
+
39
/** Outcome of merging the candidate values of a single field. */
export interface MergeFieldResult {
  /** The chosen value; `null` when no candidate could be selected. */
  readonly value: unknown;
  /** Confidence assigned by the merge strategy; `0` when nothing was selected. */
  readonly confidence: number;
  /**
   * Position of the chosen value within the input `values` array, or `null`
   * when no candidate was selected.
   */
  readonly sourceIndex: number | null;
}
44
+
45
+ // ---------------------------------------------------------------------------
46
+ // MergeField options
47
+ // ---------------------------------------------------------------------------
48
+
49
/**
 * Optional per-value side information for `mergeField`. Every list is read
 * positionally, i.e. entry `i` describes `values[i]`.
 */
export interface MergeFieldOptions {
  /** Source label of each value; required by the `source_priority` strategy. */
  readonly sources?: readonly string[];
  /** Date of each value; required by the `most_recent` strategy. */
  readonly dates?: readonly unknown[];
  /**
   * Quality weight of each value. Positions beyond the end of this list are
   * treated as weight `1.0` by the strategies that consult it.
   */
  readonly qualityWeights?: readonly number[];
}
54
+
55
+ // ---------------------------------------------------------------------------
56
+ // mergeField
57
+ // ---------------------------------------------------------------------------
58
+
59
/**
 * Collapse the candidate values of one field into a single value.
 *
 * `null` and `undefined` candidates are ignored. Two outcomes are decided
 * before any strategy runs:
 * - no usable candidate: `{ value: null, confidence: 0, sourceIndex: null }`;
 * - every usable candidate is the same value: that value with confidence 1.0
 *   and the index of its first occurrence.
 *
 * Otherwise `rule.strategy` selects the merge:
 * - `most_complete`: longest string form wins; ties broken by quality weight
 * - `majority_vote`: most frequent value wins; weighted by quality if given
 * - `source_priority`: value from the highest-ranked source in the rule
 * - `most_recent`: value carrying the latest date
 * - `first_non_null`: first usable value, or the best-weighted one if weights
 *   are given
 *
 * @param values - Candidate values, one per contributing row.
 * @param rule - Field rule naming the strategy (and its settings).
 * @param options - Per-value sources, dates and quality weights.
 * @returns The chosen value, its confidence and its index in `values`.
 * @throws Error if the strategy is not recognised, or if `source_priority` /
 *   `most_recent` is requested without its `sources` / `dates` list.
 */
export function mergeField(
  values: readonly unknown[],
  rule: GoldenFieldRule,
  options?: MergeFieldOptions,
): MergeFieldResult {
  // Keep each usable candidate together with its original position.
  const present: [number, unknown][] = [];
  values.forEach((candidate, position) => {
    if (candidate !== null && candidate !== undefined) {
      present.push([position, candidate]);
    }
  });

  const first = present[0];
  if (first === undefined) {
    return { value: null, confidence: 0.0, sourceIndex: null };
  }

  // Unanimous candidates need no strategy at all.
  const distinct = new Set<unknown>();
  for (const [, candidate] of present) {
    distinct.add(candidate);
  }
  if (distinct.size === 1) {
    return { value: first[1], confidence: 1.0, sourceIndex: first[0] };
  }

  const { strategy } = rule;
  if (strategy === "most_complete") {
    return _mostComplete(present, options?.qualityWeights);
  }
  if (strategy === "majority_vote") {
    return _majorityVote(present, options?.qualityWeights);
  }
  if (strategy === "source_priority") {
    return _sourcePriority(values, rule, options?.sources);
  }
  if (strategy === "most_recent") {
    return _mostRecent(values, options?.dates);
  }
  if (strategy === "first_non_null") {
    return _firstNonNull(present, options?.qualityWeights);
  }
  throw new Error(`Unknown strategy: ${strategy}`);
}
108
+
109
+ // ---------------------------------------------------------------------------
110
+ // Strategy implementations
111
+ // ---------------------------------------------------------------------------
112
+
113
/**
 * `most_complete` strategy: choose the candidate whose string form is longest.
 *
 * @param nonNull - `[index, value]` pairs of the usable candidates.
 * @param qualityWeights - Optional per-index weights used to break length ties;
 *   an index beyond the list counts as weight 1.0.
 * @returns A unique longest candidate with confidence 1.0. On a tie: the
 *   best-weighted candidate with confidence `min(1, 0.7 * weight)` when
 *   weights are given (earlier candidate wins equal weights), otherwise the
 *   first tied candidate with confidence 0.7.
 */
function _mostComplete(
  nonNull: [number, unknown][],
  qualityWeights?: readonly number[],
): MergeFieldResult {
  const rendered = nonNull.map(([index, raw]) => ({
    index,
    raw,
    text: String(raw),
  }));

  // Plain loop rather than Math.max(...lengths): spreading a very large array
  // into a call can overflow the argument limit.
  let longestLength = 0;
  for (const entry of rendered) {
    if (entry.text.length > longestLength) {
      longestLength = entry.text.length;
    }
  }
  const finalists = rendered.filter(
    (entry) => entry.text.length === longestLength,
  );

  if (finalists.length === 1) {
    const only = finalists[0]!;
    return { value: only.raw, confidence: 1.0, sourceIndex: only.index };
  }

  if (!qualityWeights) {
    const head = finalists[0]!;
    return { value: head.raw, confidence: 0.7, sourceIndex: head.index };
  }

  const weightAt = (index: number): number =>
    index < qualityWeights.length ? qualityWeights[index]! : 1.0;

  // The incumbent keeps its place unless the challenger is strictly better.
  let best = finalists[0]!;
  for (let k = 1; k < finalists.length; k++) {
    const challenger = finalists[k]!;
    if (!(weightAt(best.index) >= weightAt(challenger.index))) {
      best = challenger;
    }
  }

  const chosenWeight = weightAt(best.index);
  return {
    value: best.raw,
    confidence: Math.min(1.0, 0.7 * chosenWeight),
    sourceIndex: best.index,
  };
}
146
+
147
/**
 * `majority_vote` strategy: choose the value with the largest total vote.
 *
 * Each candidate contributes its quality weight when `qualityWeights` is
 * given (an index beyond the list counts as 1.0) and exactly 1 otherwise, so
 * the unweighted case is a plain occurrence count. When several values share
 * the top total, the one encountered first wins.
 *
 * The winner's position is taken from a `Map` rather than searched for with
 * `===`: `Map` keys use SameValueZero, so a winning `NaN` is located
 * correctly instead of crashing the lookup.
 *
 * @param nonNull - `[index, value]` pairs of the usable candidates.
 * @param qualityWeights - Optional per-index vote weights.
 * @returns The winning value, its share of the total vote as confidence, and
 *   the index of its first occurrence. If no value can win (no candidates, or
 *   no comparable totals) the result is
 *   `{ value: null, confidence: 0, sourceIndex: null }`.
 */
function _majorityVote(
  nonNull: [number, unknown][],
  qualityWeights?: readonly number[],
): MergeFieldResult {
  const votes = new Map<unknown, number>();
  const firstSeenAt = new Map<unknown, number>();

  for (const [i, v] of nonNull) {
    let w = 1;
    if (qualityWeights) {
      w = i < qualityWeights.length ? qualityWeights[i]! : 1.0;
    }
    votes.set(v, (votes.get(v) ?? 0) + w);
    if (!firstSeenAt.has(v)) firstSeenAt.set(v, i);
  }

  let winner: unknown = null;
  let bestVotes = -Infinity;
  let totalVotes = 0;
  for (const [v, tally] of votes) {
    totalVotes += tally;
    // Strict comparison keeps the earliest value on ties.
    if (tally > bestVotes) {
      bestVotes = tally;
      winner = v;
    }
  }

  const winnerIdx = firstSeenAt.get(winner);
  if (winnerIdx === undefined) {
    // Nothing was ever selected; report "no value" instead of a bogus index.
    return { value: null, confidence: 0.0, sourceIndex: null };
  }

  return {
    value: winner,
    confidence: totalVotes > 0 ? bestVotes / totalVotes : 0.0,
    sourceIndex: winnerIdx,
  };
}
199
+
200
/**
 * `source_priority` strategy: walk `rule.sourcePriority` in order and return
 * the first non-null value contributed by a listed source.
 *
 * For each source the first row holding a non-null value is used, so a
 * source is not passed over merely because an earlier row of that same
 * source happened to be empty.
 *
 * @param values - Candidate values, one per contributing row.
 * @param rule - Field rule; `sourcePriority` lists sources from most to least
 *   trusted (treated as empty when absent).
 * @param sources - Source label of each row, positionally aligned with
 *   `values`.
 * @returns The chosen value with confidence `max(0.1, 1 - 0.1 * rank)` where
 *   `rank` is the source's position in the priority list, plus the row index
 *   it came from. `{ value: null, confidence: 0, sourceIndex: null }` when no
 *   listed source has a non-null value.
 * @throws Error when `sources` is not provided.
 */
function _sourcePriority(
  values: readonly unknown[],
  rule: GoldenFieldRule,
  sources?: readonly string[],
): MergeFieldResult {
  if (!sources) {
    throw new Error("source_priority strategy requires sources list");
  }

  // Source label -> index of its first row with a usable value.
  const firstUsableRow = new Map<string, number>();
  for (let i = 0; i < sources.length; i++) {
    const src = sources[i]!;
    if (values[i] != null && !firstUsableRow.has(src)) {
      firstUsableRow.set(src, i);
    }
  }

  const priority = rule.sourcePriority ?? [];
  for (let rank = 0; rank < priority.length; rank++) {
    const rowIndex = firstUsableRow.get(priority[rank]!);
    if (rowIndex !== undefined) {
      // Confidence decays by 0.1 per priority step, floored at 0.1.
      const conf = Math.max(0.1, 1.0 - rank * 0.1);
      return { value: values[rowIndex], confidence: conf, sourceIndex: rowIndex };
    }
  }

  // Fallback: no listed source supplied a value.
  return { value: null, confidence: 0.0, sourceIndex: null };
}
233
+
234
/**
 * `most_recent` strategy: choose the value whose date is the latest.
 *
 * Only rows where both the value and its date are non-null take part. Dates
 * are ordered with the `>` / `<` operators, which is intended for ISO date
 * strings and numeric timestamps.
 *
 * @param values - Candidate values, one per contributing row.
 * @param dates - Date of each row, positionally aligned with `values`.
 * @returns The value with the latest date and its row index; confidence is
 *   1.0 when that date is unique among the participants and 0.5 otherwise.
 *   `{ value: null, confidence: 0, sourceIndex: null }` when no row
 *   qualifies.
 * @throws Error when `dates` is not provided.
 */
function _mostRecent(
  values: readonly unknown[],
  dates?: readonly unknown[],
): MergeFieldResult {
  if (!dates) {
    throw new Error("most_recent strategy requires dates list");
  }

  const dated: { position: number; stamp: unknown; payload: unknown }[] = [];
  values.forEach((payload, position) => {
    const stamp = dates[position];
    if (payload != null && stamp != null) {
      dated.push({ position, stamp, payload });
    }
  });

  if (dated.length === 0) {
    return { value: null, confidence: 0.0, sourceIndex: null };
  }

  // Newest first; the sort is stable, so equal dates keep their row order.
  dated.sort((left, right) =>
    left.stamp! > right.stamp! ? -1 : left.stamp! < right.stamp! ? 1 : 0,
  );

  const newest = dated[0]!;
  let sharingNewestDate = 0;
  for (const entry of dated) {
    if (entry.stamp === newest.stamp) {
      sharingNewestDate += 1;
    }
  }

  return {
    value: newest.payload,
    confidence: sharingNewestDate === 1 ? 1.0 : 0.5,
    sourceIndex: newest.position,
  };
}
266
+
267
/**
 * `first_non_null` strategy.
 *
 * Without weights the first usable candidate is returned. With weights the
 * candidate with the highest quality weight is returned instead; an index
 * beyond the weight list counts as 1.0 and the earliest candidate wins ties.
 * The confidence is 0.6 in every case.
 *
 * @param nonNull - `[index, value]` pairs of the usable candidates; expected
 *   to contain at least one entry.
 * @param qualityWeights - Optional per-index quality weights.
 * @returns The chosen value, confidence 0.6, and the value's original index.
 */
function _firstNonNull(
  nonNull: [number, unknown][],
  qualityWeights?: readonly number[],
): MergeFieldResult {
  if (!qualityWeights) {
    return { value: nonNull[0]![1], confidence: 0.6, sourceIndex: nonNull[0]![0] };
  }

  const weightAt = (position: number): number =>
    position < qualityWeights.length ? qualityWeights[position]! : 1.0;

  let leader = nonNull[0]!;
  let leaderWeight = weightAt(leader[0]);

  for (const challenger of nonNull.slice(1)) {
    const challengerWeight = weightAt(challenger[0]);
    // Strictly greater: an equally weighted later candidate never displaces
    // the current leader.
    if (challengerWeight > leaderWeight) {
      leader = challenger;
      leaderWeight = challengerWeight;
    }
  }

  return { value: leader[1], confidence: 0.6, sourceIndex: leader[0] };
}
295
+
296
+ // ---------------------------------------------------------------------------
297
+ // GoldenRecord
298
+ // ---------------------------------------------------------------------------
299
+
300
/** A merged ("golden") record for one cluster of rows. */
export interface GoldenRecord {
  /** Merged value and merge confidence, keyed by column name. */
  readonly fields: Readonly<
    Record<string, { value: unknown; confidence: number }>
  >;
  /** Record-level confidence summarising the per-field confidences. */
  readonly goldenConfidence: number;
}
304
+
305
+ // ---------------------------------------------------------------------------
306
+ // buildGoldenRecord
307
+ // ---------------------------------------------------------------------------
308
+
309
/**
 * Merge the rows of one cluster into a single golden record.
 *
 * Every non-internal column found on any row is merged with `mergeField`,
 * using the column's entry in `rules.fieldRules` or, failing that, a rule
 * built from `rules.defaultStrategy`. The record-level confidence is the
 * arithmetic mean of the per-field confidences.
 *
 * @param clusterRows - Rows belonging to one cluster.
 * @param rules - Golden rules config (default strategy plus field rules).
 * @param qualityScores - Optional map of `"rowId:column"` -> quality score;
 *   missing entries count as 1.0.
 * @returns The merged fields and overall confidence; an empty record with
 *   confidence 0 when `clusterRows` is empty.
 */
export function buildGoldenRecord(
  clusterRows: readonly Row[],
  rules: GoldenRulesConfig,
  qualityScores?: ReadonlyMap<string, number>,
): GoldenRecord {
  if (clusterRows.length === 0) {
    return { fields: {}, goldenConfidence: 0.0 };
  }

  // Union of column names across the cluster, in first-seen order.
  const columnNames = new Set<string>();
  clusterRows.forEach((row) => {
    Object.keys(row).forEach((name) => columnNames.add(name));
  });

  // Rows lacking a row id are addressed as id 0 in the quality-score lookup.
  const rowIds: number[] = clusterRows.map(
    (row) => (row.__row_id__ as number) ?? 0,
  );

  const merged: Record<string, { value: unknown; confidence: number }> = {};
  let confidenceTotal = 0;
  let mergedCount = 0;

  for (const col of columnNames) {
    if (isInternal(col)) continue;

    const candidates = clusterRows.map((row) => row[col] ?? null);

    const fieldRule: GoldenFieldRule =
      rules.fieldRules[col] ??
      { strategy: rules.defaultStrategy as GoldenFieldRule["strategy"] };
    const { strategy, dateColumn } = fieldRule;

    // Supply only the side lists the chosen strategy (or weighting) needs.
    const mergeOpts: MergeFieldOptions = {
      ...(strategy === "source_priority"
        ? { sources: clusterRows.map((row) => String(row.__source__ ?? "")) }
        : {}),
      ...(strategy === "most_recent" && dateColumn
        ? { dates: clusterRows.map((row) => row[dateColumn] ?? null) }
        : {}),
      ...(qualityScores
        ? {
            qualityWeights: rowIds.map(
              (id) => qualityScores.get(`${id}:${col}`) ?? 1.0,
            ),
          }
        : {}),
    };

    const outcome = mergeField(candidates, fieldRule, mergeOpts);
    merged[col] = { value: outcome.value, confidence: outcome.confidence };
    confidenceTotal += outcome.confidence;
    mergedCount += 1;
  }

  return {
    fields: merged,
    goldenConfidence: mergedCount > 0 ? confidenceTotal / mergedCount : 0.0,
  };
}
381
+
382
+ // ---------------------------------------------------------------------------
383
+ // buildGoldenRecordWithProvenance
384
+ // ---------------------------------------------------------------------------
385
+
386
/** Result of `buildGoldenRecordWithProvenance`. */
export interface GoldenRecordWithProvenanceResult {
  /**
   * One merged row per cluster, carrying the cluster id and the record-level
   * confidence alongside the merged columns.
   */
  readonly goldenRecords: ReadonlyArray<
    Row & {
      __cluster_id__: number;
      __golden_confidence__: number;
    }
  >;
  /** Field-level provenance, one entry per cluster. */
  readonly provenance: ReadonlyArray<ClusterProvenance>;
}
393
+
394
/**
 * Build golden records with full field-level provenance tracking.
 *
 * Rows are grouped by `__cluster_id__` and clusters are processed in
 * ascending id order. For each cluster every non-internal column is merged
 * with `mergeField`; alongside the merged row, a provenance entry records for
 * each field the chosen value, the row it came from, the strategy, the
 * confidence, and every candidate value with its quality score.
 *
 * @param allRows - All rows with `__cluster_id__` and `__row_id__` columns.
 * @param rules - Golden rules config.
 * @param clusters - Cluster map from buildClusters; a cluster missing from it
 *   is reported with quality `"strong"` and confidence 0.
 * @param qualityScores - Optional `"rowId:column"` -> quality score map;
 *   missing entries count as 1.0.
 * @returns Golden rows and their provenance, index-aligned.
 */
export function buildGoldenRecordWithProvenance(
  allRows: readonly Row[],
  rules: GoldenRulesConfig,
  clusters: ReadonlyMap<number, ClusterInfo>,
  qualityScores?: ReadonlyMap<string, number>,
): GoldenRecordWithProvenanceResult {
  // Group rows by cluster ID.
  const clusterDfs = new Map<number, Row[]>();
  for (const row of allRows) {
    // Rows without a cluster id are grouped under 0, the same neutral
    // fallback used for a missing `__row_id__`. Falling back to 1 would
    // silently fold them into a genuine cluster.
    // NOTE(review): assumes real cluster ids start at 1 — confirm against
    // buildClusters.
    const cid = (row.__cluster_id__ as number) ?? 0;
    let arr = clusterDfs.get(cid);
    if (!arr) {
      arr = [];
      clusterDfs.set(cid, arr);
    }
    arr.push(row);
  }

  const clusterIds = [...clusterDfs.keys()].sort((a, b) => a - b);
  const goldenRecords: (Row & {
    __cluster_id__: number;
    __golden_confidence__: number;
  })[] = [];
  const provenanceList: ClusterProvenance[] = [];

  for (const cid of clusterIds) {
    const clusterRows = clusterDfs.get(cid)!;
    const cinfo = clusters.get(cid);
    const rowIds = clusterRows.map(
      (r) => (r.__row_id__ as number) ?? 0,
    );

    // Union of column names across the cluster, in first-seen order.
    const columns = new Set<string>();
    for (const row of clusterRows) {
      for (const col of Object.keys(row)) {
        columns.add(col);
      }
    }

    const fieldProvenance: Record<string, FieldProvenance> = {};
    const goldenRow: Record<string, unknown> = { __cluster_id__: cid };
    const confidences: number[] = [];

    for (const col of columns) {
      if (isInternal(col)) continue;

      const values = clusterRows.map((r) => r[col] ?? null);

      // Explicit field rule, else one built from the default strategy.
      const fieldRule: GoldenFieldRule =
        rules.fieldRules[col] ??
        { strategy: rules.defaultStrategy as GoldenFieldRule["strategy"] };

      // Side lists are built only when the strategy (or weighting) needs them.
      let sources: string[] | undefined;
      let dates: unknown[] | undefined;
      let weights: number[] | undefined;

      if (fieldRule.strategy === "source_priority") {
        sources = clusterRows.map((r) => String(r.__source__ ?? ""));
      }
      if (fieldRule.strategy === "most_recent" && fieldRule.dateColumn) {
        dates = clusterRows.map((r) => r[fieldRule.dateColumn!] ?? null);
      }
      if (qualityScores) {
        weights = rowIds.map(
          (rid) => qualityScores.get(`${rid}:${col}`) ?? 1.0,
        );
      }

      const mergeOpts: MergeFieldOptions = {
        ...(sources !== undefined && { sources }),
        ...(dates !== undefined && { dates }),
        ...(weights !== undefined && { qualityWeights: weights }),
      };
      const result = mergeField(values, fieldRule, mergeOpts);
      confidences.push(result.confidence);

      // When no value was selected the first row is recorded as the source.
      const sourceRowId =
        result.sourceIndex != null && result.sourceIndex < rowIds.length
          ? rowIds[result.sourceIndex]!
          : rowIds[0]!;

      const candidates = rowIds.map((rid, idx) => {
        const q = qualityScores
          ? (qualityScores.get(`${rid}:${col}`) ?? 1.0)
          : 1.0;
        return { row_id: rid, value: values[idx], quality: q };
      });

      fieldProvenance[col] = {
        value: result.value,
        sourceRowId,
        strategy: fieldRule.strategy,
        confidence: result.confidence,
        candidates,
      };

      goldenRow[col] = result.value;
    }

    // Record-level confidence is the mean of the field confidences.
    const goldenConfidence =
      confidences.length > 0
        ? confidences.reduce((a, b) => a + b, 0) / confidences.length
        : 0.0;

    goldenRow.__golden_confidence__ = goldenConfidence;

    goldenRecords.push(
      goldenRow as Row & {
        __cluster_id__: number;
        __golden_confidence__: number;
      },
    );

    provenanceList.push({
      clusterId: cid,
      clusterQuality: cinfo?.clusterQuality ?? "strong",
      clusterConfidence: cinfo?.confidence ?? 0.0,
      fields: fieldProvenance,
    });
  }

  return { goldenRecords, provenance: provenanceList };
}