goldenmatch 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162) hide show
  1. package/README.md +140 -0
  2. package/dist/cli.cjs +6079 -0
  3. package/dist/cli.cjs.map +1 -0
  4. package/dist/cli.d.cts +1 -0
  5. package/dist/cli.d.ts +1 -0
  6. package/dist/cli.js +6076 -0
  7. package/dist/cli.js.map +1 -0
  8. package/dist/core/index.cjs +8449 -0
  9. package/dist/core/index.cjs.map +1 -0
  10. package/dist/core/index.d.cts +1972 -0
  11. package/dist/core/index.d.ts +1972 -0
  12. package/dist/core/index.js +8318 -0
  13. package/dist/core/index.js.map +1 -0
  14. package/dist/index.cjs +8449 -0
  15. package/dist/index.cjs.map +1 -0
  16. package/dist/index.d.cts +2 -0
  17. package/dist/index.d.ts +2 -0
  18. package/dist/index.js +8318 -0
  19. package/dist/index.js.map +1 -0
  20. package/dist/node/backends/score-worker.cjs +934 -0
  21. package/dist/node/backends/score-worker.cjs.map +1 -0
  22. package/dist/node/backends/score-worker.d.cts +14 -0
  23. package/dist/node/backends/score-worker.d.ts +14 -0
  24. package/dist/node/backends/score-worker.js +932 -0
  25. package/dist/node/backends/score-worker.js.map +1 -0
  26. package/dist/node/index.cjs +11430 -0
  27. package/dist/node/index.cjs.map +1 -0
  28. package/dist/node/index.d.cts +554 -0
  29. package/dist/node/index.d.ts +554 -0
  30. package/dist/node/index.js +11277 -0
  31. package/dist/node/index.js.map +1 -0
  32. package/dist/types-DhUdX5Rc.d.cts +304 -0
  33. package/dist/types-DhUdX5Rc.d.ts +304 -0
  34. package/examples/01-basic-dedupe.ts +60 -0
  35. package/examples/02-match-two-datasets.ts +48 -0
  36. package/examples/03-csv-file-pipeline.ts +62 -0
  37. package/examples/04-string-scoring.ts +63 -0
  38. package/examples/05-custom-config.ts +94 -0
  39. package/examples/06-probabilistic-fs.ts +72 -0
  40. package/examples/07-pprl-privacy.ts +76 -0
  41. package/examples/08-streaming.ts +79 -0
  42. package/examples/09-llm-scorer.ts +79 -0
  43. package/examples/10-explain.ts +60 -0
  44. package/examples/11-evaluate.ts +61 -0
  45. package/examples/README.md +53 -0
  46. package/package.json +66 -0
  47. package/src/cli.ts +372 -0
  48. package/src/core/ann-blocker.ts +593 -0
  49. package/src/core/api.ts +220 -0
  50. package/src/core/autoconfig.ts +363 -0
  51. package/src/core/autofix.ts +102 -0
  52. package/src/core/blocker.ts +655 -0
  53. package/src/core/cluster.ts +699 -0
  54. package/src/core/compare-clusters.ts +176 -0
  55. package/src/core/config/loader.ts +869 -0
  56. package/src/core/cross-encoder.ts +614 -0
  57. package/src/core/data.ts +430 -0
  58. package/src/core/domain.ts +277 -0
  59. package/src/core/embedder.ts +562 -0
  60. package/src/core/evaluate.ts +156 -0
  61. package/src/core/explain.ts +352 -0
  62. package/src/core/golden.ts +524 -0
  63. package/src/core/graph-er.ts +371 -0
  64. package/src/core/index.ts +314 -0
  65. package/src/core/ingest.ts +112 -0
  66. package/src/core/learned-blocking.ts +305 -0
  67. package/src/core/lineage.ts +221 -0
  68. package/src/core/llm/budget.ts +258 -0
  69. package/src/core/llm/cluster.ts +542 -0
  70. package/src/core/llm/scorer.ts +396 -0
  71. package/src/core/match-one.ts +95 -0
  72. package/src/core/matchkey.ts +97 -0
  73. package/src/core/memory/corrections.ts +179 -0
  74. package/src/core/memory/learner.ts +218 -0
  75. package/src/core/memory/store.ts +114 -0
  76. package/src/core/pipeline.ts +366 -0
  77. package/src/core/pprl/protocol.ts +216 -0
  78. package/src/core/probabilistic.ts +511 -0
  79. package/src/core/profiler.ts +212 -0
  80. package/src/core/quality.ts +197 -0
  81. package/src/core/review-queue.ts +177 -0
  82. package/src/core/scorer.ts +855 -0
  83. package/src/core/sensitivity.ts +196 -0
  84. package/src/core/standardize.ts +279 -0
  85. package/src/core/streaming.ts +128 -0
  86. package/src/core/transforms.ts +599 -0
  87. package/src/core/types.ts +570 -0
  88. package/src/core/validate.ts +243 -0
  89. package/src/index.ts +8 -0
  90. package/src/node/a2a/server.ts +470 -0
  91. package/src/node/api/server.ts +412 -0
  92. package/src/node/backends/duckdb.ts +130 -0
  93. package/src/node/backends/score-worker.ts +41 -0
  94. package/src/node/backends/workers.ts +212 -0
  95. package/src/node/config-file.ts +66 -0
  96. package/src/node/connectors/base.ts +57 -0
  97. package/src/node/connectors/bigquery.ts +61 -0
  98. package/src/node/connectors/databricks.ts +69 -0
  99. package/src/node/connectors/file.ts +350 -0
  100. package/src/node/connectors/hubspot.ts +62 -0
  101. package/src/node/connectors/index.ts +43 -0
  102. package/src/node/connectors/salesforce.ts +93 -0
  103. package/src/node/connectors/snowflake.ts +73 -0
  104. package/src/node/db/postgres.ts +173 -0
  105. package/src/node/db/sync.ts +103 -0
  106. package/src/node/dedupe-file.ts +156 -0
  107. package/src/node/index.ts +89 -0
  108. package/src/node/mcp/server.ts +940 -0
  109. package/src/node/tui/app.ts +756 -0
  110. package/src/node/tui/index.ts +6 -0
  111. package/src/node/tui/widgets.ts +128 -0
  112. package/tests/parity/scorer-ground-truth.test.ts +118 -0
  113. package/tests/smoke.test.ts +46 -0
  114. package/tests/unit/a2a-server.test.ts +175 -0
  115. package/tests/unit/ann-blocker.test.ts +117 -0
  116. package/tests/unit/api-server.test.ts +239 -0
  117. package/tests/unit/api.test.ts +77 -0
  118. package/tests/unit/autoconfig.test.ts +103 -0
  119. package/tests/unit/autofix.test.ts +71 -0
  120. package/tests/unit/blocker.test.ts +164 -0
  121. package/tests/unit/buildBlocksAsync.test.ts +63 -0
  122. package/tests/unit/cluster.test.ts +213 -0
  123. package/tests/unit/compare-clusters.test.ts +42 -0
  124. package/tests/unit/config-loader.test.ts +301 -0
  125. package/tests/unit/connectors-base.test.ts +48 -0
  126. package/tests/unit/cross-encoder-model.test.ts +198 -0
  127. package/tests/unit/cross-encoder.test.ts +173 -0
  128. package/tests/unit/db-connectors.test.ts +37 -0
  129. package/tests/unit/domain.test.ts +80 -0
  130. package/tests/unit/embedder.test.ts +151 -0
  131. package/tests/unit/evaluate.test.ts +85 -0
  132. package/tests/unit/explain.test.ts +73 -0
  133. package/tests/unit/golden.test.ts +97 -0
  134. package/tests/unit/graph-er.test.ts +173 -0
  135. package/tests/unit/hnsw-ann.test.ts +283 -0
  136. package/tests/unit/hubspot-connector.test.ts +118 -0
  137. package/tests/unit/ingest.test.ts +97 -0
  138. package/tests/unit/learned-blocking.test.ts +134 -0
  139. package/tests/unit/lineage.test.ts +135 -0
  140. package/tests/unit/match-one.test.ts +129 -0
  141. package/tests/unit/matchkey.test.ts +97 -0
  142. package/tests/unit/mcp-server.test.ts +183 -0
  143. package/tests/unit/memory.test.ts +119 -0
  144. package/tests/unit/pipeline.test.ts +118 -0
  145. package/tests/unit/pprl-protocol.test.ts +381 -0
  146. package/tests/unit/probabilistic.test.ts +494 -0
  147. package/tests/unit/profiler.test.ts +68 -0
  148. package/tests/unit/review-queue.test.ts +68 -0
  149. package/tests/unit/salesforce-connector.test.ts +148 -0
  150. package/tests/unit/scorer.test.ts +301 -0
  151. package/tests/unit/sensitivity.test.ts +154 -0
  152. package/tests/unit/standardize.test.ts +84 -0
  153. package/tests/unit/streaming.test.ts +82 -0
  154. package/tests/unit/transforms.test.ts +208 -0
  155. package/tests/unit/tui-widgets.test.ts +42 -0
  156. package/tests/unit/tui.test.ts +24 -0
  157. package/tests/unit/validate.test.ts +145 -0
  158. package/tests/unit/workers-parallel.test.ts +99 -0
  159. package/tests/unit/workers.test.ts +74 -0
  160. package/tsconfig.json +25 -0
  161. package/tsup.config.ts +37 -0
  162. package/vitest.config.ts +11 -0
@@ -0,0 +1,570 @@
1
+ /**
2
+ * types.ts — GoldenMatch config interfaces and result types.
3
+ * Edge-safe: no Node.js imports, no `process`.
4
+ */
5
+
6
+ // ---------------------------------------------------------------------------
7
+ // Primitive types
8
+ // ---------------------------------------------------------------------------
9
+
/** Scalar value shapes: a string, a number, a boolean, or null. */
export type ColumnValue = null | boolean | number | string;

/** A read-only record keyed by string; values are deliberately left as `unknown`. */
export type Row = Readonly<{ [key: string]: unknown }>;
12
+
/**
 * Branded string holding a canonical pair key of the form "minId:maxId".
 * The brand keeps plain strings from being passed where a PairKey is
 * expected; values are meant to come only from pairKey().
 */
export type PairKey = { readonly __brand: "PairKey" } & string;
15
+
16
+ // ---------------------------------------------------------------------------
17
+ // Matchkey field config
18
+ // ---------------------------------------------------------------------------
19
+
/**
 * Configuration for a single field inside a matchkey.
 *
 * `makeMatchkeyField` supplies `transforms`, `scorer` and `weight`
 * (`[]`, `"jaro_winkler"`, `1.0`) when the caller omits them.
 */
export interface MatchkeyField {
  /** Name of the field this entry applies to. */
  readonly field: string;
  /** Transform names; presumably drawn from VALID_TRANSFORMS — confirm against the validator. */
  readonly transforms: ReadonlyArray<string>;
  /** Scorer name; presumably one of VALID_SCORERS — confirm against the validator. */
  readonly scorer: string;
  /** Numeric weight attached to this field. */
  readonly weight: number;
  readonly model?: string;
  readonly columns?: ReadonlyArray<string>;
  readonly columnWeights?: Readonly<Record<string, number>>;
  readonly levels?: number;
  readonly partialThreshold?: number;
}
31
+
/** Matchkey variant tagged `type: "exact"`; carries only a name and its fields. */
export interface ExactMatchkey {
  readonly name: string;
  /** Discriminant of the MatchkeyConfig union. */
  readonly type: "exact";
  readonly fields: ReadonlyArray<MatchkeyField>;
}
37
+
/**
 * Matchkey variant tagged `type: "weighted"`.
 * `threshold` is required here; `makeMatchkeyConfig` fills in 0.85 when the
 * caller does not provide one.
 */
export interface WeightedMatchkey {
  readonly name: string;
  /** Discriminant of the MatchkeyConfig union. */
  readonly type: "weighted";
  readonly fields: ReadonlyArray<MatchkeyField>;
  readonly threshold: number;
  readonly autoThreshold?: boolean;
  readonly rerank?: boolean;
  readonly rerankModel?: string;
  readonly rerankBand?: number;
}
48
+
/**
 * Matchkey variant tagged `type: "probabilistic"`.
 * Every tuning knob is optional; `makeMatchkeyConfig` copies only the ones
 * the caller actually set.
 */
export interface ProbabilisticMatchkey {
  readonly name: string;
  /** Discriminant of the MatchkeyConfig union. */
  readonly type: "probabilistic";
  readonly fields: ReadonlyArray<MatchkeyField>;
  readonly threshold?: number;
  readonly emIterations?: number;
  readonly convergenceThreshold?: number;
  readonly linkThreshold?: number;
  readonly reviewThreshold?: number;
}
59
+
/** Any matchkey variant; narrow on the `type` tag ("exact" | "weighted" | "probabilistic"). */
export type MatchkeyConfig = ExactMatchkey | WeightedMatchkey | ProbabilisticMatchkey;
64
+
65
+ // ---------------------------------------------------------------------------
66
+ // Blocking config
67
+ // ---------------------------------------------------------------------------
68
+
/** One blocking key: a list of field names plus a list of transform names. */
export interface BlockingKeyConfig {
  readonly fields: ReadonlyArray<string>;
  readonly transforms: ReadonlyArray<string>;
}
73
+
/** One entry of `BlockingConfig.sortKey`: a column name plus a list of transform names. */
export interface SortKeyField {
  readonly column: string;
  readonly transforms: ReadonlyArray<string>;
}
78
+
/** Settings carried by `BlockingConfig.canopy`: fields, a loose and a tight threshold, and a size cap. */
export interface CanopyConfig {
  readonly fields: ReadonlyArray<string>;
  readonly looseThreshold: number;
  readonly tightThreshold: number;
  readonly maxCanopySize: number;
}
85
+
/**
 * Blocking configuration.
 *
 * `strategy`, `keys`, `maxBlockSize` and `skipOversized` are required;
 * `makeBlockingConfig` defaults them to `"static"`, `[]`, `5000` and `false`.
 * The remaining options are optional. Judging by their name prefixes
 * (`ann*`, `learned*`, `canopy`, ...) they belong to individual strategies —
 * confirm in the blocker implementation.
 */
export interface BlockingConfig {
  readonly strategy:
    | "static" | "adaptive" | "sorted_neighborhood" | "multi_pass"
    | "ann" | "canopy" | "ann_pairs" | "learned";
  readonly keys: ReadonlyArray<BlockingKeyConfig>;
  readonly maxBlockSize: number;
  readonly skipOversized: boolean;
  readonly autoSuggest?: boolean;
  readonly autoSelect?: boolean;
  readonly subBlockKeys?: ReadonlyArray<BlockingKeyConfig>;
  readonly windowSize?: number;
  readonly sortKey?: ReadonlyArray<SortKeyField>;
  readonly passes?: ReadonlyArray<BlockingKeyConfig>;
  readonly unionMode?: boolean;
  readonly maxTotalComparisons?: number;
  readonly annColumn?: string;
  readonly annModel?: string;
  readonly annTopK?: number;
  readonly canopy?: CanopyConfig;
  readonly learnedSampleSize?: number;
  readonly learnedMinRecall?: number;
  readonly learnedMinReduction?: number;
  readonly learnedPredicateDepth?: number;
  readonly learnedCachePath?: string;
}
117
+
118
+ // ---------------------------------------------------------------------------
119
+ // Golden rules config
120
+ // ---------------------------------------------------------------------------
121
+
/**
 * Rule attached to one field in `GoldenRulesConfig.fieldRules`.
 * `strategy` is restricted to the five names that also appear in
 * VALID_STRATEGIES.
 */
export interface GoldenFieldRule {
  readonly strategy:
    | "most_complete" | "majority_vote" | "source_priority" | "most_recent" | "first_non_null";
  /** Presumably consulted by the "most_recent" strategy — confirm in the golden-record builder. */
  readonly dateColumn?: string;
  /** Presumably consulted by the "source_priority" strategy — confirm in the golden-record builder. */
  readonly sourcePriority?: ReadonlyArray<string>;
}
132
+
/**
 * Golden-rules configuration.
 * `makeGoldenRulesConfig` supplies the defaults `"most_complete"`, `{}`,
 * `10`, `true`, `true` and `0.3` for the fields below, in declaration order.
 */
export interface GoldenRulesConfig {
  /** Strategy name typed as a plain string (not the GoldenFieldRule union). */
  readonly defaultStrategy: string;
  /** Per-key rules; keys are strings. */
  readonly fieldRules: Readonly<Record<string, GoldenFieldRule>>;
  readonly maxClusterSize: number;
  readonly autoSplit: boolean;
  readonly qualityWeighting: boolean;
  readonly weakClusterThreshold: number;
}
141
+
142
+ // ---------------------------------------------------------------------------
143
+ // Standardization, validation, quality, transform
144
+ // ---------------------------------------------------------------------------
145
+
/** Standardization rules: a string-keyed map to lists of rule names (see VALID_STANDARDIZERS for the listed names). */
export interface StandardizationConfig {
  readonly rules: Readonly<Record<string, ReadonlyArray<string>>>;
}
149
+
/** A single validation rule: the column it targets, the rule type, its parameters, and the action tag. */
export interface ValidationRuleConfig {
  readonly column: string;
  readonly ruleType: "regex" | "min_length" | "max_length" | "not_null" | "in_set" | "format";
  /** Free-form parameters; their shape per rule type is not defined in this file. */
  readonly params: Readonly<Record<string, unknown>>;
  readonly action: "null" | "quarantine" | "flag";
}
162
+
/** Validation section: the list of rules plus an `autoFix` switch. */
export interface ValidationConfig {
  readonly rules: ReadonlyArray<ValidationRuleConfig>;
  readonly autoFix: boolean;
}
167
+
/** Quality section: an on/off switch, a reporting mode, a fix mode, and an optional domain name. */
export interface QualityConfig {
  readonly enabled: boolean;
  readonly mode: "silent" | "announced" | "disabled";
  readonly fixMode: "safe" | "moderate" | "none";
  readonly domain?: string;
}
174
+
/** Transform section: an on/off switch and a reporting mode. */
export interface TransformConfig {
  readonly enabled: boolean;
  readonly mode: "silent" | "announced" | "disabled";
}
179
+
180
+ // ---------------------------------------------------------------------------
181
+ // LLM scorer & budget
182
+ // ---------------------------------------------------------------------------
183
+
/** Budget limits shared by `LLMScorerConfig` and `DomainConfig`. Every field is optional. */
export interface BudgetConfig {
  readonly maxCostUsd?: number;
  readonly maxCalls?: number;
  readonly escalationModel?: string;
  /** Typed as a number list; presumably a [low, high] pair — confirm with the budget code. */
  readonly escalationBand?: ReadonlyArray<number>;
  readonly escalationBudgetPct?: number;
  readonly warnAtPct?: number;
}
192
+
/** LLM scorer section. `mode` selects between "pairwise" and "cluster". */
export interface LLMScorerConfig {
  readonly enabled: boolean;
  readonly provider?: string;
  readonly model?: string;
  readonly autoThreshold: number;
  readonly candidateLo: number;
  readonly candidateHi: number;
  readonly batchSize: number;
  readonly maxWorkers: number;
  readonly budget?: BudgetConfig;
  readonly mode: "pairwise" | "cluster";
  readonly clusterMaxSize?: number;
  readonly clusterMinSize?: number;
}
207
+
208
+ // ---------------------------------------------------------------------------
209
+ // Domain config
210
+ // ---------------------------------------------------------------------------
211
+
/** Domain section: an on/off switch, optional mode, a confidence threshold, an LLM-validation flag, and an optional budget. */
export interface DomainConfig {
  readonly enabled: boolean;
  readonly mode?: string;
  readonly confidenceThreshold: number;
  readonly llmValidation: boolean;
  readonly budget?: BudgetConfig;
}
219
+
220
+ // ---------------------------------------------------------------------------
221
+ // Memory & learning
222
+ // ---------------------------------------------------------------------------
223
+
/** Learning settings nested under `MemoryConfig.learning`: two minimum-correction counts. */
export interface LearningConfig {
  readonly thresholdMinCorrections: number;
  readonly weightsMinCorrections: number;
}
228
+
/** Memory section: on/off switch, storage backend ("sqlite" or "postgres"), optional path, trust value, and learning settings. */
export interface MemoryConfig {
  readonly enabled: boolean;
  readonly backend: "sqlite" | "postgres";
  readonly path?: string;
  readonly trust: number;
  readonly learning: LearningConfig;
}
236
+
237
+ // ---------------------------------------------------------------------------
238
+ // Input & output config
239
+ // ---------------------------------------------------------------------------
240
+
/** Description of one input file. Only `path` is required; the rest are optional options. */
export interface InputFileConfig {
  readonly path: string;
  readonly idColumn?: string;
  readonly sourceLabel?: string;
  readonly sourceName?: string;
  /** String-to-string column mapping; direction (from/to) is not stated here — confirm with the ingest code. */
  readonly columnMap?: Readonly<Record<string, string>>;
  readonly delimiter?: string;
  readonly encoding?: string;
  readonly sheet?: string;
  readonly parseMode?: string;
  readonly headerRow?: number;
  readonly hasHeader?: boolean;
  readonly skipRows?: ReadonlyArray<number>;
}
255
+
/** Input section: a required list of files plus optional `fileA` / `fileB` entries. */
export interface InputConfig {
  readonly files: ReadonlyArray<InputFileConfig>;
  readonly fileA?: InputFileConfig;
  readonly fileB?: InputFileConfig;
}
261
+
/** Output section; every field is optional. */
export interface OutputConfig {
  readonly path?: string;
  readonly format?: string;
  readonly directory?: string;
  readonly runName?: string;
}
268
+
269
+ // ---------------------------------------------------------------------------
270
+ // Top-level config
271
+ // ---------------------------------------------------------------------------
272
+
/**
 * Top-level configuration object. Every section is optional.
 *
 * Matchkeys may be supplied under either `matchkeys` or `matchSettings`;
 * `getMatchkeys` reads `matchkeys` first and falls back to `matchSettings`.
 * `makeConfig` fills in `threshold`, `blocking` and `goldenRules`.
 */
export interface GoldenMatchConfig {
  readonly matchkeys?: ReadonlyArray<MatchkeyConfig>;
  /** Alternative location for matchkeys; used only when `matchkeys` is absent. */
  readonly matchSettings?: ReadonlyArray<MatchkeyConfig>;
  readonly blocking?: BlockingConfig;
  readonly threshold?: number;
  readonly goldenRules?: GoldenRulesConfig;
  readonly standardization?: StandardizationConfig;
  readonly validation?: ValidationConfig;
  readonly quality?: QualityConfig;
  readonly transform?: TransformConfig;
  readonly llmScorer?: LLMScorerConfig;
  readonly domain?: DomainConfig;
  readonly memory?: MemoryConfig;
  readonly input?: InputConfig;
  readonly output?: OutputConfig;
  readonly backend?: string;
  readonly llmAuto?: boolean;
  readonly llmBoost?: boolean;
}
292
+
293
+ // ---------------------------------------------------------------------------
294
+ // Result types
295
+ // ---------------------------------------------------------------------------
296
+
/**
 * Two numeric ids together with their score.
 * Build instances through `makeScoredPair`, which orders the ids so that
 * `idA <= idB`.
 */
export interface ScoredPair {
  readonly idA: number;
  readonly idB: number;
  readonly score: number;
}
302
+
/** Information recorded for one cluster. */
export interface ClusterInfo {
  /** Numeric ids of the cluster members. */
  readonly members: ReadonlyArray<number>;
  readonly size: number;
  readonly oversized: boolean;
  /** Scores keyed by canonical PairKey. */
  readonly pairScores: ReadonlyMap<PairKey, number>;
  readonly confidence: number;
  /** A pair of numeric ids, or null when there is none. */
  readonly bottleneckPair: readonly [number, number] | null;
  readonly clusterQuality: "strong" | "weak" | "split";
}
312
+
/** Numeric summary attached to a DedupeResult. */
export interface DedupeStats {
  readonly totalRecords: number;
  readonly totalClusters: number;
  /** Scale (fraction vs. percent) is not stated here — confirm where it is computed. */
  readonly matchRate: number;
  readonly matchedRecords: number;
  readonly uniqueRecords: number;
}
320
+
/** Everything returned from a dedupe run: golden records, clusters, row partitions, stats, scored pairs, and the config. */
export interface DedupeResult {
  readonly goldenRecords: ReadonlyArray<Row>;
  /** Clusters keyed by a numeric id. */
  readonly clusters: ReadonlyMap<number, ClusterInfo>;
  readonly dupes: ReadonlyArray<Row>;
  readonly unique: ReadonlyArray<Row>;
  readonly stats: DedupeStats;
  readonly scoredPairs: ReadonlyArray<ScoredPair>;
  readonly config: GoldenMatchConfig;
}
330
+
/** Result of a match run: matched rows, unmatched rows, and a free-form stats record. */
export interface MatchResult {
  readonly matched: ReadonlyArray<Row>;
  readonly unmatched: ReadonlyArray<Row>;
  /** Untyped key/value stats; the key set is not defined in this file. */
  readonly stats: Readonly<Record<string, unknown>>;
}
336
+
/** Provenance recorded for one field: the value, the source row id, the strategy name, a confidence, and the candidates. */
export interface FieldProvenance {
  readonly value: unknown;
  readonly sourceRowId: number;
  readonly strategy: string;
  readonly confidence: number;
  /** Candidate entries as untyped records; their keys are not defined in this file. */
  readonly candidates: ReadonlyArray<Readonly<Record<string, unknown>>>;
}
344
+
/** Provenance for a whole cluster: its id, quality label, confidence, and per-field provenance keyed by string. */
export interface ClusterProvenance {
  readonly clusterId: number;
  readonly clusterQuality: string;
  readonly clusterConfidence: number;
  readonly fields: Readonly<Record<string, FieldProvenance>>;
}
351
+
/** One block: its key, the rows in it, the strategy name, a depth, and optional parent key / pre-scored pairs. */
export interface BlockResult {
  readonly blockKey: string;
  readonly rows: ReadonlyArray<Row>;
  readonly strategy: string;
  readonly depth: number;
  readonly parentKey?: string;
  readonly preScoredPairs?: ReadonlyArray<ScoredPair>;
}
360
+
361
+ // ---------------------------------------------------------------------------
362
+ // Valid enum sets
363
+ // ---------------------------------------------------------------------------
364
+
/** Scorer names listed as valid. The set's element type is the union of these string literals. */
export const VALID_SCORERS = new Set([
  "exact", "jaro_winkler", "levenshtein", "token_sort", "soundex_match",
  "embedding", "record_embedding", "ensemble", "dice", "jaccard",
] as const);
377
+
/** Transform names listed as valid. The set's element type is the union of these string literals. */
export const VALID_TRANSFORMS = new Set([
  "lowercase", "uppercase", "strip", "strip_all",
  "soundex", "metaphone", "digits_only", "alpha_only",
  "normalize_whitespace", "token_sort", "first_token", "last_token",
] as const);
392
+
/** Strategy names listed as valid; the same five literals accepted by `GoldenFieldRule.strategy`. */
export const VALID_STRATEGIES = new Set([
  "most_recent", "source_priority", "most_complete", "majority_vote", "first_non_null",
] as const);
400
+
/** Standardizer names listed as valid. The set's element type is the union of these string literals. */
export const VALID_STANDARDIZERS = new Set([
  "email", "name_proper", "name_upper", "name_lower", "phone",
  "zip5", "address", "state", "strip", "trim_whitespace",
] as const);
413
+
414
+ // ---------------------------------------------------------------------------
415
+ // Factory functions
416
+ // ---------------------------------------------------------------------------
417
+
/**
 * Build a ScoredPair in canonical order, i.e. with idA <= idB.
 * Prefer this factory to writing `{ idA, idB, score }` by hand.
 *
 * @param a - One id of the pair.
 * @param b - The other id of the pair.
 * @param score - Score to attach to the pair.
 * @returns The pair with the smaller id in `idA` and the larger in `idB`.
 */
export function makeScoredPair(
  a: number,
  b: number,
  score: number,
): ScoredPair {
  if (a < b) {
    // Already in canonical order.
    return { idA: a, idB: b, score };
  }
  // Not strictly ascending: swap so `b` lands in idA.
  return { idA: b, idB: a, score };
}
431
+
/**
 * Create a MatchkeyField with sensible defaults.
 *
 * Defaults: `transforms = []`, `scorer = "jaro_winkler"`, `weight = 1.0`.
 * A default applies when the property is missing *or* explicitly
 * `undefined`/`null`, so a sparse input such as `{ field, weight: undefined }`
 * can no longer overwrite a default and yield an object that violates
 * the MatchkeyField type.
 *
 * @param partial - Field settings; only `field` is required.
 * @returns A complete MatchkeyField. All other properties of `partial`
 *          are copied through unchanged.
 */
export function makeMatchkeyField(
  partial: Partial<MatchkeyField> & Pick<MatchkeyField, "field">,
): MatchkeyField {
  // Pull out the defaulted properties so the trailing spread cannot clobber them.
  const { transforms, scorer, weight, ...rest } = partial;
  return {
    transforms: transforms ?? [],
    scorer: scorer ?? "jaro_winkler",
    weight: weight ?? 1.0,
    ...rest,
  };
}
443
+
/**
 * Input accepted by `makeMatchkeyConfig`.
 *
 * Only `name` is required. The options of every matchkey variant are all
 * present here as optional properties; the factory decides from `type`
 * which of them end up in the result.
 */
export interface MakeMatchkeyConfigInput {
  readonly name: string;
  /** Variant to build; the factory treats a missing value as "weighted". */
  readonly type?: "exact" | "weighted" | "probabilistic";
  readonly fields?: ReadonlyArray<MatchkeyField>;
  readonly threshold?: number;
  readonly autoThreshold?: boolean;
  readonly rerank?: boolean;
  readonly rerankModel?: string;
  readonly rerankBand?: number;
  readonly emIterations?: number;
  readonly convergenceThreshold?: number;
  readonly linkThreshold?: number;
  readonly reviewThreshold?: number;
}
462
+
/**
 * Build a MatchkeyConfig of the requested variant, applying defaults.
 *
 * - `type` defaults to "weighted"; `fields` defaults to an empty list.
 * - "exact" keeps only `name`, `type` and `fields`.
 * - "probabilistic" additionally copies `threshold`, `emIterations`,
 *   `convergenceThreshold`, `linkThreshold` and `reviewThreshold` when set.
 * - "weighted" always carries a `threshold` (0.85 unless given) and copies
 *   `autoThreshold`, `rerank`, `rerankModel` and `rerankBand` when set.
 *
 * Options that are `undefined` are left out of the result entirely.
 */
export function makeMatchkeyConfig(
  partial: MakeMatchkeyConfigInput,
): MatchkeyConfig {
  const { name } = partial;
  const fields = partial.fields ?? [];

  switch (partial.type ?? "weighted") {
    case "exact":
      return { name, type: "exact", fields };

    case "probabilistic": {
      // Writable view of the variant so optional keys can be added one by one.
      const probabilistic: {
        -readonly [K in keyof ProbabilisticMatchkey]: ProbabilisticMatchkey[K];
      } = { name, type: "probabilistic", fields };
      if (partial.threshold !== undefined) {
        probabilistic.threshold = partial.threshold;
      }
      if (partial.emIterations !== undefined) {
        probabilistic.emIterations = partial.emIterations;
      }
      if (partial.convergenceThreshold !== undefined) {
        probabilistic.convergenceThreshold = partial.convergenceThreshold;
      }
      if (partial.linkThreshold !== undefined) {
        probabilistic.linkThreshold = partial.linkThreshold;
      }
      if (partial.reviewThreshold !== undefined) {
        probabilistic.reviewThreshold = partial.reviewThreshold;
      }
      return probabilistic;
    }

    default: {
      // "weighted", which is also the fallback variant.
      const weighted: {
        -readonly [K in keyof WeightedMatchkey]: WeightedMatchkey[K];
      } = {
        name,
        type: "weighted",
        fields,
        threshold: partial.threshold ?? 0.85,
      };
      if (partial.autoThreshold !== undefined) {
        weighted.autoThreshold = partial.autoThreshold;
      }
      if (partial.rerank !== undefined) {
        weighted.rerank = partial.rerank;
      }
      if (partial.rerankModel !== undefined) {
        weighted.rerankModel = partial.rerankModel;
      }
      if (partial.rerankBand !== undefined) {
        weighted.rerankBand = partial.rerankBand;
      }
      return weighted;
    }
  }
}
514
+
/**
 * Create a BlockingConfig with sensible defaults.
 *
 * Defaults: `strategy = "static"`, `keys = []`, `maxBlockSize = 5000`,
 * `skipOversized = false`. A default applies when the property is missing
 * *or* explicitly `undefined`/`null`, so a sparse override object cannot
 * erase a required setting.
 *
 * @param partial - Optional overrides; may be omitted entirely.
 * @returns A complete BlockingConfig. Properties of `partial` other than
 *          the four defaulted ones are copied through unchanged.
 */
export function makeBlockingConfig(
  partial?: Partial<BlockingConfig>,
): BlockingConfig {
  const source: Partial<BlockingConfig> = partial ?? {};
  // Pull out the defaulted properties so the trailing spread cannot clobber them.
  const { strategy, keys, maxBlockSize, skipOversized, ...rest } = source;
  return {
    strategy: strategy ?? "static",
    keys: keys ?? [],
    maxBlockSize: maxBlockSize ?? 5000,
    skipOversized: skipOversized ?? false,
    ...rest,
  };
}
527
+
/**
 * Create a GoldenRulesConfig with sensible defaults.
 *
 * Defaults: `defaultStrategy = "most_complete"`, `fieldRules = {}`,
 * `maxClusterSize = 10`, `autoSplit = true`, `qualityWeighting = true`,
 * `weakClusterThreshold = 0.3`. A default applies when the property is
 * missing *or* explicitly `undefined`/`null`; explicit falsy values such as
 * `false` or `0` are kept.
 *
 * @param partial - Optional overrides; may be omitted entirely.
 * @returns A complete GoldenRulesConfig.
 */
export function makeGoldenRulesConfig(
  partial?: Partial<GoldenRulesConfig>,
): GoldenRulesConfig {
  const source: Partial<GoldenRulesConfig> = partial ?? {};
  // Pull out the defaulted properties so the trailing spread cannot clobber them.
  const {
    defaultStrategy,
    fieldRules,
    maxClusterSize,
    autoSplit,
    qualityWeighting,
    weakClusterThreshold,
    ...rest
  } = source;
  return {
    defaultStrategy: defaultStrategy ?? "most_complete",
    fieldRules: fieldRules ?? {},
    maxClusterSize: maxClusterSize ?? 10,
    autoSplit: autoSplit ?? true,
    qualityWeighting: qualityWeighting ?? true,
    weakClusterThreshold: weakClusterThreshold ?? 0.3,
    ...rest,
  };
}
542
+
/**
 * Create a full GoldenMatchConfig with sensible defaults.
 *
 * - `threshold` defaults to 0.85.
 * - `blocking` is always passed through `makeBlockingConfig`, and
 *   `goldenRules` through `makeGoldenRulesConfig`, so both sections are
 *   complete in the result whether or not the caller supplied them.
 * - Every other property of `partial` is copied through unchanged.
 *
 * A property that is present but `undefined` (e.g. `{ blocking: undefined }`)
 * is treated like a missing one, so it can no longer wipe out the default
 * section.
 *
 * @param partial - Optional overrides; may be omitted entirely.
 * @returns A GoldenMatchConfig with `threshold`, `blocking` and
 *          `goldenRules` populated.
 */
export function makeConfig(
  partial?: Partial<GoldenMatchConfig>,
): GoldenMatchConfig {
  const source: Partial<GoldenMatchConfig> = partial ?? {};
  // Separate the defaulted sections from the pass-through properties so the
  // trailing spread cannot overwrite them.
  const { threshold, blocking, goldenRules, ...rest } = source;
  return {
    threshold: threshold ?? 0.85,
    blocking: makeBlockingConfig(blocking),
    goldenRules: makeGoldenRulesConfig(goldenRules),
    ...rest,
  };
}
561
+
562
+ /**
563
+ * Return matchkeys from config, checking both `matchkeys` and `matchSettings`.
564
+ * Mirrors Python's `GoldenMatchConfig.get_matchkeys()`.
565
+ */
566
+ export function getMatchkeys(
567
+ config: GoldenMatchConfig,
568
+ ): readonly MatchkeyConfig[] {
569
+ return config.matchkeys ?? config.matchSettings ?? [];
570
+ }