goldenmatch 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162) hide show
  1. package/README.md +140 -0
  2. package/dist/cli.cjs +6079 -0
  3. package/dist/cli.cjs.map +1 -0
  4. package/dist/cli.d.cts +1 -0
  5. package/dist/cli.d.ts +1 -0
  6. package/dist/cli.js +6076 -0
  7. package/dist/cli.js.map +1 -0
  8. package/dist/core/index.cjs +8449 -0
  9. package/dist/core/index.cjs.map +1 -0
  10. package/dist/core/index.d.cts +1972 -0
  11. package/dist/core/index.d.ts +1972 -0
  12. package/dist/core/index.js +8318 -0
  13. package/dist/core/index.js.map +1 -0
  14. package/dist/index.cjs +8449 -0
  15. package/dist/index.cjs.map +1 -0
  16. package/dist/index.d.cts +2 -0
  17. package/dist/index.d.ts +2 -0
  18. package/dist/index.js +8318 -0
  19. package/dist/index.js.map +1 -0
  20. package/dist/node/backends/score-worker.cjs +934 -0
  21. package/dist/node/backends/score-worker.cjs.map +1 -0
  22. package/dist/node/backends/score-worker.d.cts +14 -0
  23. package/dist/node/backends/score-worker.d.ts +14 -0
  24. package/dist/node/backends/score-worker.js +932 -0
  25. package/dist/node/backends/score-worker.js.map +1 -0
  26. package/dist/node/index.cjs +11430 -0
  27. package/dist/node/index.cjs.map +1 -0
  28. package/dist/node/index.d.cts +554 -0
  29. package/dist/node/index.d.ts +554 -0
  30. package/dist/node/index.js +11277 -0
  31. package/dist/node/index.js.map +1 -0
  32. package/dist/types-DhUdX5Rc.d.cts +304 -0
  33. package/dist/types-DhUdX5Rc.d.ts +304 -0
  34. package/examples/01-basic-dedupe.ts +60 -0
  35. package/examples/02-match-two-datasets.ts +48 -0
  36. package/examples/03-csv-file-pipeline.ts +62 -0
  37. package/examples/04-string-scoring.ts +63 -0
  38. package/examples/05-custom-config.ts +94 -0
  39. package/examples/06-probabilistic-fs.ts +72 -0
  40. package/examples/07-pprl-privacy.ts +76 -0
  41. package/examples/08-streaming.ts +79 -0
  42. package/examples/09-llm-scorer.ts +79 -0
  43. package/examples/10-explain.ts +60 -0
  44. package/examples/11-evaluate.ts +61 -0
  45. package/examples/README.md +53 -0
  46. package/package.json +66 -0
  47. package/src/cli.ts +372 -0
  48. package/src/core/ann-blocker.ts +593 -0
  49. package/src/core/api.ts +220 -0
  50. package/src/core/autoconfig.ts +363 -0
  51. package/src/core/autofix.ts +102 -0
  52. package/src/core/blocker.ts +655 -0
  53. package/src/core/cluster.ts +699 -0
  54. package/src/core/compare-clusters.ts +176 -0
  55. package/src/core/config/loader.ts +869 -0
  56. package/src/core/cross-encoder.ts +614 -0
  57. package/src/core/data.ts +430 -0
  58. package/src/core/domain.ts +277 -0
  59. package/src/core/embedder.ts +562 -0
  60. package/src/core/evaluate.ts +156 -0
  61. package/src/core/explain.ts +352 -0
  62. package/src/core/golden.ts +524 -0
  63. package/src/core/graph-er.ts +371 -0
  64. package/src/core/index.ts +314 -0
  65. package/src/core/ingest.ts +112 -0
  66. package/src/core/learned-blocking.ts +305 -0
  67. package/src/core/lineage.ts +221 -0
  68. package/src/core/llm/budget.ts +258 -0
  69. package/src/core/llm/cluster.ts +542 -0
  70. package/src/core/llm/scorer.ts +396 -0
  71. package/src/core/match-one.ts +95 -0
  72. package/src/core/matchkey.ts +97 -0
  73. package/src/core/memory/corrections.ts +179 -0
  74. package/src/core/memory/learner.ts +218 -0
  75. package/src/core/memory/store.ts +114 -0
  76. package/src/core/pipeline.ts +366 -0
  77. package/src/core/pprl/protocol.ts +216 -0
  78. package/src/core/probabilistic.ts +511 -0
  79. package/src/core/profiler.ts +212 -0
  80. package/src/core/quality.ts +197 -0
  81. package/src/core/review-queue.ts +177 -0
  82. package/src/core/scorer.ts +855 -0
  83. package/src/core/sensitivity.ts +196 -0
  84. package/src/core/standardize.ts +279 -0
  85. package/src/core/streaming.ts +128 -0
  86. package/src/core/transforms.ts +599 -0
  87. package/src/core/types.ts +570 -0
  88. package/src/core/validate.ts +243 -0
  89. package/src/index.ts +8 -0
  90. package/src/node/a2a/server.ts +470 -0
  91. package/src/node/api/server.ts +412 -0
  92. package/src/node/backends/duckdb.ts +130 -0
  93. package/src/node/backends/score-worker.ts +41 -0
  94. package/src/node/backends/workers.ts +212 -0
  95. package/src/node/config-file.ts +66 -0
  96. package/src/node/connectors/base.ts +57 -0
  97. package/src/node/connectors/bigquery.ts +61 -0
  98. package/src/node/connectors/databricks.ts +69 -0
  99. package/src/node/connectors/file.ts +350 -0
  100. package/src/node/connectors/hubspot.ts +62 -0
  101. package/src/node/connectors/index.ts +43 -0
  102. package/src/node/connectors/salesforce.ts +93 -0
  103. package/src/node/connectors/snowflake.ts +73 -0
  104. package/src/node/db/postgres.ts +173 -0
  105. package/src/node/db/sync.ts +103 -0
  106. package/src/node/dedupe-file.ts +156 -0
  107. package/src/node/index.ts +89 -0
  108. package/src/node/mcp/server.ts +940 -0
  109. package/src/node/tui/app.ts +756 -0
  110. package/src/node/tui/index.ts +6 -0
  111. package/src/node/tui/widgets.ts +128 -0
  112. package/tests/parity/scorer-ground-truth.test.ts +118 -0
  113. package/tests/smoke.test.ts +46 -0
  114. package/tests/unit/a2a-server.test.ts +175 -0
  115. package/tests/unit/ann-blocker.test.ts +117 -0
  116. package/tests/unit/api-server.test.ts +239 -0
  117. package/tests/unit/api.test.ts +77 -0
  118. package/tests/unit/autoconfig.test.ts +103 -0
  119. package/tests/unit/autofix.test.ts +71 -0
  120. package/tests/unit/blocker.test.ts +164 -0
  121. package/tests/unit/buildBlocksAsync.test.ts +63 -0
  122. package/tests/unit/cluster.test.ts +213 -0
  123. package/tests/unit/compare-clusters.test.ts +42 -0
  124. package/tests/unit/config-loader.test.ts +301 -0
  125. package/tests/unit/connectors-base.test.ts +48 -0
  126. package/tests/unit/cross-encoder-model.test.ts +198 -0
  127. package/tests/unit/cross-encoder.test.ts +173 -0
  128. package/tests/unit/db-connectors.test.ts +37 -0
  129. package/tests/unit/domain.test.ts +80 -0
  130. package/tests/unit/embedder.test.ts +151 -0
  131. package/tests/unit/evaluate.test.ts +85 -0
  132. package/tests/unit/explain.test.ts +73 -0
  133. package/tests/unit/golden.test.ts +97 -0
  134. package/tests/unit/graph-er.test.ts +173 -0
  135. package/tests/unit/hnsw-ann.test.ts +283 -0
  136. package/tests/unit/hubspot-connector.test.ts +118 -0
  137. package/tests/unit/ingest.test.ts +97 -0
  138. package/tests/unit/learned-blocking.test.ts +134 -0
  139. package/tests/unit/lineage.test.ts +135 -0
  140. package/tests/unit/match-one.test.ts +129 -0
  141. package/tests/unit/matchkey.test.ts +97 -0
  142. package/tests/unit/mcp-server.test.ts +183 -0
  143. package/tests/unit/memory.test.ts +119 -0
  144. package/tests/unit/pipeline.test.ts +118 -0
  145. package/tests/unit/pprl-protocol.test.ts +381 -0
  146. package/tests/unit/probabilistic.test.ts +494 -0
  147. package/tests/unit/profiler.test.ts +68 -0
  148. package/tests/unit/review-queue.test.ts +68 -0
  149. package/tests/unit/salesforce-connector.test.ts +148 -0
  150. package/tests/unit/scorer.test.ts +301 -0
  151. package/tests/unit/sensitivity.test.ts +154 -0
  152. package/tests/unit/standardize.test.ts +84 -0
  153. package/tests/unit/streaming.test.ts +82 -0
  154. package/tests/unit/transforms.test.ts +208 -0
  155. package/tests/unit/tui-widgets.test.ts +42 -0
  156. package/tests/unit/tui.test.ts +24 -0
  157. package/tests/unit/validate.test.ts +145 -0
  158. package/tests/unit/workers-parallel.test.ts +99 -0
  159. package/tests/unit/workers.test.ts +74 -0
  160. package/tsconfig.json +25 -0
  161. package/tsup.config.ts +37 -0
  162. package/vitest.config.ts +11 -0
@@ -0,0 +1,869 @@
1
+ /**
2
+ * config/loader.ts — Config loader that parses raw objects (from YAML/JSON)
3
+ * into typed GoldenMatchConfig.
4
+ *
5
+ * Edge-safe: no `node:` imports, no `require()`.
6
+ */
7
+
8
+ import type {
9
+ GoldenMatchConfig,
10
+ MatchkeyConfig,
11
+ MatchkeyField,
12
+ BlockingConfig,
13
+ BlockingKeyConfig,
14
+ GoldenRulesConfig,
15
+ GoldenFieldRule,
16
+ StandardizationConfig,
17
+ LLMScorerConfig,
18
+ BudgetConfig,
19
+ ValidationConfig,
20
+ ValidationRuleConfig,
21
+ DomainConfig,
22
+ QualityConfig,
23
+ TransformConfig,
24
+ MemoryConfig,
25
+ LearningConfig,
26
+ InputConfig,
27
+ InputFileConfig,
28
+ OutputConfig,
29
+ SortKeyField,
30
+ CanopyConfig,
31
+ } from "../types.js";
32
+ import {
33
+ VALID_SCORERS,
34
+ VALID_TRANSFORMS,
35
+ VALID_STRATEGIES,
36
+ VALID_STANDARDIZERS,
37
+ } from "../types.js";
38
+
39
+ // ---------------------------------------------------------------------------
40
+ // String-union validation
41
+ // ---------------------------------------------------------------------------
42
+
43
/** Values accepted for a matchkey's `type`. */
const VALID_MATCHKEY_TYPES = new Set(["exact", "weighted", "probabilistic"] as const);

/** Values accepted for `blocking.strategy`. */
const VALID_BLOCKING_STRATEGIES = new Set([
  "static", "adaptive", "sorted_neighborhood", "multi_pass",
  "ann", "canopy", "ann_pairs", "learned",
] as const);

/** Values accepted for the memory `backend`. */
const VALID_MEMORY_BACKENDS = new Set(["sqlite", "postgres"] as const);

/** Values accepted for the `mode` of the quality and transform sections. */
const VALID_QUALITY_MODES = new Set(["silent", "announced", "disabled"] as const);

/** Values accepted for the quality section's `fixMode`. */
const VALID_QUALITY_FIX_MODES = new Set(["safe", "moderate", "none"] as const);

/** Values accepted for the LLM scorer `mode`. */
const VALID_LLM_MODES = new Set(["pairwise", "cluster"] as const);

/** Values accepted for a validation rule's `ruleType`. */
const VALID_VALIDATION_RULE_TYPES = new Set([
  "regex", "min_length", "max_length", "not_null", "in_set", "format",
] as const);

/** Values accepted for a validation rule's `action`. */
const VALID_VALIDATION_ACTIONS = new Set(["null", "quarantine", "flag"] as const);
86
+
87
/**
 * Check that `value` is a string contained in `allowed` and return it.
 *
 * @param value - Raw value taken from the config object.
 * @param allowed - Set of permitted string values.
 * @param fieldName - Name used in error messages.
 * @param defaultValue - Returned when `value` is `null` or `undefined`.
 * @returns The validated value, or `defaultValue` when `value` is absent.
 * @throws Error when `value` is absent and no default was given, or when
 *   `value` is not a member of `allowed`.
 */
function requireIn<T extends string>(
  value: unknown,
  allowed: ReadonlySet<T>,
  fieldName: string,
  defaultValue?: T,
): T {
  const isMissing = value === undefined || value === null;
  if (isMissing && defaultValue !== undefined) return defaultValue;
  if (isMissing) {
    throw new Error(`Required field '${fieldName}' is missing`);
  }

  if (typeof value === "string" && (allowed as ReadonlySet<string>).has(value)) {
    return value as T;
  }

  // List the options alphabetically so the message is stable.
  const options = Array.from(allowed).sort().join(", ");
  throw new Error(
    `Invalid value '${String(value)}' for '${fieldName}'. Valid options: ${options}`,
  );
}
109
+
110
/**
 * Report whether `t` names an acceptable transform.
 *
 * Accepted: any member of `VALID_TRANSFORMS`, plus the parametric forms
 *  - `substring:<n>:<n>`
 *  - `qgram:<n>`
 *  - `bloom_filter` or anything starting with `bloom_filter:`
 */
function isValidTransform(t: string): boolean {
  const known = VALID_TRANSFORMS as ReadonlySet<string>;
  return (
    known.has(t) ||
    /^substring:\d+:\d+$/.test(t) ||
    /^qgram:\d+$/.test(t) ||
    t === "bloom_filter" ||
    /^bloom_filter:/.test(t)
  );
}
123
+
124
+ // ---------------------------------------------------------------------------
125
+ // Snake_case to camelCase conversion
126
+ // ---------------------------------------------------------------------------
127
+
128
/**
 * Turn a snake_case identifier into camelCase: each underscore followed by a
 * lowercase ASCII letter is replaced by that letter in upper case.
 */
function snakeToCamel(s: string): string {
  const upperAfterUnderscore = (_match: string, letter: string): string =>
    letter.toUpperCase();
  return s.replace(/_([a-z])/g, upperAfterUnderscore);
}
132
+
133
/**
 * Return a copy of `obj` in which every object key, at every depth, has been
 * passed through `snakeToCamel`. Arrays are mapped element by element;
 * `null`, `undefined` and primitives are returned unchanged.
 *
 * NOTE(review): the rewrite applies to all keys, so maps keyed by
 * user-supplied names (for example column names) are renamed too — confirm
 * that callers expect this.
 */
function camelizeKeys(obj: unknown): unknown {
  if (obj === null || obj === undefined) return obj;
  if (Array.isArray(obj)) return obj.map((item) => camelizeKeys(item));
  if (typeof obj !== "object") return obj;

  const source = obj as Record<string, unknown>;
  const converted: Record<string, unknown> = {};
  for (const key of Object.keys(source)) {
    converted[snakeToCamel(key)] = camelizeKeys(source[key]);
  }
  return converted;
}
146
+
147
/**
 * Turn a camelCase identifier into snake_case: every uppercase ASCII letter
 * is replaced by an underscore followed by its lowercase form.
 */
function camelToSnake(s: string): string {
  return s.replace(/[A-Z]/g, (upper) => "_" + upper.toLowerCase());
}
151
+
152
/**
 * Return a copy of `obj` in which every object key, at every depth, has been
 * passed through `camelToSnake`. Arrays are mapped element by element;
 * `null`, `undefined` and primitives are returned unchanged.
 */
function snakeifyKeys(obj: unknown): unknown {
  if (obj === null || obj === undefined) return obj;
  if (Array.isArray(obj)) return obj.map((item) => snakeifyKeys(item));
  if (typeof obj !== "object") return obj;

  const source = obj as Record<string, unknown>;
  const converted: Record<string, unknown> = {};
  for (const key of Object.keys(source)) {
    converted[camelToSnake(key)] = snakeifyKeys(source[key]);
  }
  return converted;
}
164
+
165
+ // ---------------------------------------------------------------------------
166
+ // Helpers: strip undefined values for exactOptionalPropertyTypes
167
+ // ---------------------------------------------------------------------------
168
+
169
/**
 * Build a shallow copy of `obj` that omits every own enumerable key whose
 * value is exactly `undefined` (`null` and other falsy values are kept).
 *
 * Needed because `exactOptionalPropertyTypes` forbids assigning `undefined`
 * to an optional property.
 */
function stripUndefined<T extends Record<string, unknown>>(obj: T): T {
  const kept: Record<string, unknown> = {};
  for (const key of Object.keys(obj)) {
    const value = obj[key];
    if (value !== undefined) kept[key] = value;
  }
  return kept as T;
}
181
+
182
+ // ---------------------------------------------------------------------------
183
+ // Helpers: safe getters
184
+ // ---------------------------------------------------------------------------
185
+
186
/** A plain object with string keys and values that have not been validated yet. */
type RawObj = Record<string, unknown>;

/**
 * Assert that `v` is a non-null, non-array object and return it as `RawObj`.
 *
 * @param v - Value to check.
 * @param ctx - Config path used as the error-message prefix.
 * @throws Error naming the actual kind of value received. `null` and arrays
 *   are reported as "null" / "array" because `typeof` would call both
 *   "object", which made the message read "expected object, got object".
 */
function asObj(v: unknown, ctx: string): RawObj {
  if (typeof v !== "object" || v === null || Array.isArray(v)) {
    const actual = v === null ? "null" : Array.isArray(v) ? "array" : typeof v;
    throw new Error(`${ctx}: expected object, got ${actual}`);
  }
  return v as RawObj;
}
194
+
195
/**
 * Assert that `v` is an array and return it.
 *
 * @param v - Value to check.
 * @param ctx - Config path used as the error-message prefix.
 * @throws Error naming the actual kind of value received; `null` is reported
 *   as "null" rather than the "object" that `typeof` yields.
 */
function asArr(v: unknown, ctx: string): unknown[] {
  if (!Array.isArray(v)) {
    const actual = v === null ? "null" : typeof v;
    throw new Error(`${ctx}: expected array, got ${actual}`);
  }
  return v;
}
201
+
202
/**
 * Assert that `v` is a string and return it.
 *
 * @param v - Value to check.
 * @param ctx - Config path used as the error-message prefix.
 * @throws Error naming the actual kind of value received; `null` and arrays
 *   are reported as "null" / "array" rather than the "object" that `typeof`
 *   yields for both.
 */
function asStr(v: unknown, ctx: string): string {
  if (typeof v !== "string") {
    const actual = v === null ? "null" : Array.isArray(v) ? "array" : typeof v;
    throw new Error(`${ctx}: expected string, got ${actual}`);
  }
  return v;
}
208
+
209
/**
 * Assert that `v` has type `number` and return it.
 *
 * @param v - Value to check.
 * @param ctx - Config path used as the error-message prefix.
 * @throws Error naming the actual kind of value received; `null` and arrays
 *   are reported as "null" / "array" rather than the "object" that `typeof`
 *   yields for both.
 */
function asNum(v: unknown, ctx: string): number {
  if (typeof v !== "number") {
    const actual = v === null ? "null" : Array.isArray(v) ? "array" : typeof v;
    throw new Error(`${ctx}: expected number, got ${actual}`);
  }
  return v;
}
215
+
216
/**
 * Assert that `v` is a boolean and return it.
 *
 * @param v - Value to check.
 * @param ctx - Config path used as the error-message prefix.
 * @throws Error naming the actual kind of value received; `null` and arrays
 *   are reported as "null" / "array" rather than the "object" that `typeof`
 *   yields for both.
 */
function asBool(v: unknown, ctx: string): boolean {
  if (typeof v !== "boolean") {
    const actual = v === null ? "null" : Array.isArray(v) ? "array" : typeof v;
    throw new Error(`${ctx}: expected boolean, got ${actual}`);
  }
  return v;
}
222
+
223
/** Return `v` when it is a string; otherwise `undefined`. */
function optStr(v: unknown): string | undefined {
  if (typeof v === "string") return v;
  return undefined;
}
226
+
227
/** Return `v` when its type is `number`; otherwise `undefined`. */
function optNum(v: unknown): number | undefined {
  if (typeof v === "number") return v;
  return undefined;
}
230
+
231
/** Return `v` when it is a boolean; otherwise `undefined`. */
function optBool(v: unknown): boolean | undefined {
  if (typeof v === "boolean") return v;
  return undefined;
}
234
+
235
+ // ---------------------------------------------------------------------------
236
+ // Parsers for nested config objects
237
+ // ---------------------------------------------------------------------------
238
+
239
/**
 * Parse one entry of a matchkey's `fields` list.
 *
 * Defaults: `transforms` = `[]`, `scorer` = `"jaro_winkler"`, `weight` = `1.0`.
 * Optional properties that are absent or of the wrong type are omitted.
 *
 * @param raw - Raw (already camelized) field object.
 * @param ctx - Config path used as the error-message prefix.
 * @throws Error when `raw` is not an object, a transform is not a string or
 *   not a valid transform, `scorer` is present but not a string, or `field`
 *   is not a string.
 */
function parseMatchkeyField(raw: unknown, ctx: string): MatchkeyField {
  const obj = asObj(raw, ctx);
  const fieldName = typeof obj.field === "string" ? obj.field : "<unknown>";

  // Pass 1: every transform must be a string.
  const transforms: string[] = Array.isArray(obj.transforms)
    ? (obj.transforms as unknown[]).map((t, i) => {
        if (typeof t !== "string") {
          throw new Error(
            `${ctx}.transforms[${i}]: expected string, got ${typeof t}`,
          );
        }
        return t;
      })
    : [];

  // Pass 2: every transform must be known or match a parametric form such as
  // "substring:0:3", "qgram:3" or "bloom_filter:high".
  for (const t of transforms) {
    if (!isValidTransform(t)) {
      const valid = [...VALID_TRANSFORMS].sort().join(", ");
      throw new Error(
        `Invalid transform '${t}' on field '${fieldName}'. ` +
          `Valid: ${valid}, or 'substring:<n>:<n>', 'qgram:<n>', 'bloom_filter[:...]'.`,
      );
    }
  }

  let scorer = "jaro_winkler";
  if (obj.scorer !== undefined && obj.scorer !== null) {
    // A non-string scorer can never name a plugin. Previously it only produced
    // a warning and was then silently replaced by the default scorer.
    if (typeof obj.scorer !== "string") {
      throw new Error(
        `${ctx}.scorer: expected string, got ${typeof obj.scorer}`,
      );
    }
    // Unknown names are only warned about: a plugin may register them later.
    if (!(VALID_SCORERS as ReadonlySet<string>).has(obj.scorer)) {
      // eslint-disable-next-line no-console
      console.warn(
        `Unknown scorer '${obj.scorer}' on field '${fieldName}' ` +
          `(will be rejected at score-time if no plugin is registered).`,
      );
    }
    scorer = obj.scorer;
  }

  return stripUndefined({
    field: asStr(obj.field, `${ctx}.field`),
    transforms,
    scorer,
    weight: typeof obj.weight === "number" ? obj.weight : 1.0,
    model: optStr(obj.model),
    // NOTE(review): element types of `columns` / `columnWeights` are not checked.
    columns: Array.isArray(obj.columns)
      ? (obj.columns as string[])
      : undefined,
    columnWeights:
      typeof obj.columnWeights === "object" && obj.columnWeights !== null
        ? (obj.columnWeights as Record<string, number>)
        : undefined,
    levels: optNum(obj.levels),
    partialThreshold: optNum(obj.partialThreshold),
  }) as MatchkeyField;
}
297
+
298
/**
 * Parse one matchkey definition.
 *
 * `type` defaults to `"weighted"`. The returned shape depends on the type:
 * exact keys carry only name/type/fields; probabilistic and weighted keys add
 * their own optional tuning properties (a weighted `threshold` defaults to 0.85).
 *
 * @param raw - Raw (already camelized) matchkey object.
 * @param ctx - Config path used as the error-message prefix.
 * @throws Error when `raw` is not an object, a field fails to parse, `name`
 *   is not a string, or `type` is not a valid matchkey type.
 */
function parseMatchkeyConfig(raw: unknown, ctx: string): MatchkeyConfig {
  const obj = asObj(raw, ctx);

  const rawFields = Array.isArray(obj.fields) ? (obj.fields as unknown[]) : [];
  const fields = rawFields.map((entry, idx) =>
    parseMatchkeyField(entry, `${ctx}.fields[${idx}]`),
  );

  const name = asStr(obj.name, `${ctx}.name`);
  const type = requireIn(
    obj.type,
    VALID_MATCHKEY_TYPES,
    `${ctx}.type`,
    "weighted",
  ) as "exact" | "weighted" | "probabilistic";

  switch (type) {
    case "exact":
      return { name, type: "exact", fields };

    case "probabilistic":
      return stripUndefined({
        name,
        type: "probabilistic" as const,
        fields,
        threshold: optNum(obj.threshold),
        emIterations: optNum(obj.emIterations),
        convergenceThreshold: optNum(obj.convergenceThreshold),
        linkThreshold: optNum(obj.linkThreshold),
        reviewThreshold: optNum(obj.reviewThreshold),
      }) as MatchkeyConfig;

    default:
      // "weighted"
      return stripUndefined({
        name,
        type: "weighted" as const,
        fields,
        threshold: optNum(obj.threshold) ?? 0.85,
        autoThreshold: optBool(obj.autoThreshold),
        rerank: optBool(obj.rerank),
        rerankModel: optStr(obj.rerankModel),
        rerankBand: optNum(obj.rerankBand),
      }) as MatchkeyConfig;
  }
}
341
+
342
/**
 * Parse one blocking key: its `fields` and `transforms` lists, each
 * defaulting to `[]` when absent or not an array.
 *
 * NOTE(review): array elements are cast to `string` without being checked.
 *
 * @throws Error when `raw` is not an object.
 */
function parseBlockingKeyConfig(
  raw: unknown,
  ctx: string,
): BlockingKeyConfig {
  const obj = asObj(raw, ctx);
  const listOrEmpty = (value: unknown): string[] =>
    Array.isArray(value) ? (value as string[]) : [];

  return {
    fields: listOrEmpty(obj.fields),
    transforms: listOrEmpty(obj.transforms),
  };
}
354
+
355
/**
 * Parse one sort-key entry: a required string `column` and a `transforms`
 * list that defaults to `[]`.
 *
 * NOTE(review): transform elements are cast to `string` without being checked.
 *
 * @throws Error when `raw` is not an object or `column` is not a string.
 */
function parseSortKeyField(raw: unknown, ctx: string): SortKeyField {
  const obj = asObj(raw, ctx);
  const column = asStr(obj.column, `${ctx}.column`);
  const transforms = Array.isArray(obj.transforms)
    ? (obj.transforms as string[])
    : [];
  return { column, transforms };
}
364
+
365
/**
 * Parse the canopy blocking section.
 *
 * Defaults: `fields` = `[]`, `looseThreshold` = 0.7, `tightThreshold` = 0.9,
 * `maxCanopySize` = 1000.
 *
 * @throws Error when `raw` is not an object.
 */
function parseCanopyConfig(raw: unknown, ctx: string): CanopyConfig {
  const obj = asObj(raw, ctx);
  const numberOr = (value: unknown, fallback: number): number =>
    typeof value === "number" ? value : fallback;

  return {
    fields: Array.isArray(obj.fields) ? (obj.fields as string[]) : [],
    looseThreshold: numberOr(obj.looseThreshold, 0.7),
    tightThreshold: numberOr(obj.tightThreshold, 0.9),
    maxCanopySize: numberOr(obj.maxCanopySize, 1000),
  };
}
374
+
375
/**
 * Parse the `blocking` section.
 *
 * Defaults: `strategy` = `"static"`, `keys` = `[]`, `maxBlockSize` = 5000,
 * `skipOversized` = `false`. All other properties are optional and are
 * omitted when absent or of the wrong type.
 *
 * @param raw - Raw (already camelized) blocking object.
 * @param ctx - Config path used as the error-message prefix.
 * @throws Error when `raw` or a nested entry is malformed, or `strategy` is
 *   not a valid blocking strategy.
 */
function parseBlockingConfig(raw: unknown, ctx: string): BlockingConfig {
  const obj = asObj(raw, ctx);

  // Parses a list of blocking keys; yields undefined when it is not an array.
  const keyList = (value: unknown, label: string) =>
    Array.isArray(value)
      ? value.map((entry: unknown, idx: number) =>
          parseBlockingKeyConfig(entry, `${ctx}.${label}[${idx}]`),
        )
      : undefined;

  // Nested sections are parsed in this order so the first error reported
  // stays the same: keys, passes, subBlockKeys, sortKey, canopy, strategy.
  const keys = keyList(obj.keys, "keys") ?? [];
  const passes = keyList(obj.passes, "passes");
  const subBlockKeys = keyList(obj.subBlockKeys, "subBlockKeys");

  let sortKey;
  if (Array.isArray(obj.sortKey)) {
    sortKey = obj.sortKey.map((entry: unknown, idx: number) =>
      parseSortKeyField(entry, `${ctx}.sortKey[${idx}]`),
    );
  }

  let canopy;
  if (typeof obj.canopy === "object" && obj.canopy !== null) {
    canopy = parseCanopyConfig(obj.canopy, `${ctx}.canopy`);
  }

  const strategy = requireIn(
    obj.strategy,
    VALID_BLOCKING_STRATEGIES,
    `${ctx}.strategy`,
    "static",
  );

  return stripUndefined({
    strategy,
    keys,
    maxBlockSize: optNum(obj.maxBlockSize) ?? 5000,
    skipOversized: optBool(obj.skipOversized) ?? false,
    autoSuggest: optBool(obj.autoSuggest),
    autoSelect: optBool(obj.autoSelect),
    subBlockKeys,
    windowSize: optNum(obj.windowSize),
    sortKey,
    passes,
    unionMode: optBool(obj.unionMode),
    maxTotalComparisons: optNum(obj.maxTotalComparisons),
    annColumn: optStr(obj.annColumn),
    annModel: optStr(obj.annModel),
    annTopK: optNum(obj.annTopK),
    canopy,
    learnedSampleSize: optNum(obj.learnedSampleSize),
    learnedMinRecall: optNum(obj.learnedMinRecall),
    learnedMinReduction: optNum(obj.learnedMinReduction),
    learnedPredicateDepth: optNum(obj.learnedPredicateDepth),
    learnedCachePath: optStr(obj.learnedCachePath),
  }) as BlockingConfig;
}
433
+
434
/**
 * Parse the golden-record rule for a single field.
 *
 * `strategy` is required and must be a member of `VALID_STRATEGIES`;
 * `dateColumn` and `sourcePriority` are optional.
 *
 * @throws Error when `raw` is not an object or `strategy` is missing/invalid.
 */
function parseGoldenFieldRule(raw: unknown, ctx: string): GoldenFieldRule {
  const obj = asObj(raw, ctx);
  const strategy = requireIn(
    obj.strategy,
    VALID_STRATEGIES,
    `${ctx}.strategy`,
  ) as GoldenFieldRule["strategy"];
  const priority = obj.sourcePriority;

  return stripUndefined({
    strategy,
    dateColumn: optStr(obj.dateColumn),
    sourcePriority: Array.isArray(priority) ? (priority as string[]) : undefined,
  }) as GoldenFieldRule;
}
448
+
449
/**
 * Parse the golden-rules section.
 *
 * The default strategy is read from `defaultStrategy`, then from `default`
 * (the YAML spelling), and falls back to `"most_complete"`.
 * Other defaults: `maxClusterSize` = 10, `autoSplit` = `true`,
 * `qualityWeighting` = `true`, `weakClusterThreshold` = 0.3.
 *
 * NOTE(review): unlike per-field strategies, the default strategy string is
 * not checked against the list of valid strategies.
 *
 * @throws Error when `raw` is not an object or a field rule fails to parse.
 */
function parseGoldenRulesConfig(
  raw: unknown,
  ctx: string,
): GoldenRulesConfig {
  const obj = asObj(raw, ctx);

  let defaultStrategy = "most_complete";
  if (typeof obj.defaultStrategy === "string") {
    defaultStrategy = obj.defaultStrategy;
  } else if (typeof obj.default === "string") {
    defaultStrategy = obj.default;
  }

  const fieldRules: Record<string, GoldenFieldRule> = {};
  const rawRules = obj.fieldRules;
  const hasRuleMap =
    typeof rawRules === "object" && rawRules !== null && !Array.isArray(rawRules);
  if (hasRuleMap) {
    const ruleMap = rawRules as Record<string, unknown>;
    for (const key of Object.keys(ruleMap)) {
      fieldRules[key] = parseGoldenFieldRule(ruleMap[key], `${ctx}.fieldRules.${key}`);
    }
  }

  const numberOr = (value: unknown, fallback: number): number =>
    typeof value === "number" ? value : fallback;

  return {
    defaultStrategy,
    fieldRules,
    maxClusterSize: numberOr(obj.maxClusterSize, 10),
    // Both flags default to true: only an explicit `false` turns them off.
    autoSplit: obj.autoSplit !== false,
    qualityWeighting: obj.qualityWeighting !== false,
    weakClusterThreshold: numberOr(obj.weakClusterThreshold, 0.3),
  };
}
493
+
494
/**
 * Parse the standardization section into a column -> standardizers map.
 *
 * Two layouts are accepted: rules nested under a `rules` object, or the flat
 * form where each top-level key is a column name. Entries whose value is not
 * an array are ignored.
 *
 * @throws Error when `raw` is not an object, a rule is not a string, or a
 *   rule is not a member of `VALID_STANDARDIZERS`.
 */
function parseStandardizationConfig(
  raw: unknown,
  ctx: string,
): StandardizationConfig {
  const obj = asObj(raw, ctx);

  const nested = obj.rules;
  const isNestedForm =
    typeof nested === "object" && nested !== null && !Array.isArray(nested);
  // Flat form: each key of the section itself is a column name.
  const rulesObj: Record<string, unknown> = isNestedForm
    ? (nested as Record<string, unknown>)
    : obj;

  const rules: Record<string, readonly string[]> = {};
  for (const [column, value] of Object.entries(rulesObj)) {
    if (!Array.isArray(value)) continue;

    const names = value as unknown[];
    for (const name of names) {
      if (typeof name !== "string") {
        throw new Error(
          `${ctx}.${column}: expected array of strings, got ${typeof name}`,
        );
      }
      const isKnown = (VALID_STANDARDIZERS as ReadonlySet<string>).has(name);
      if (!isKnown) {
        const valid = [...VALID_STANDARDIZERS].sort().join(", ");
        throw new Error(
          `Invalid standardizer '${name}' on column '${column}'. Valid: ${valid}`,
        );
      }
    }
    rules[column] = names as string[];
  }

  return { rules };
}
536
+
537
/**
 * Parse a budget section. Every property is optional and is omitted when
 * absent or of the wrong type.
 *
 * NOTE(review): `escalationBand` elements are cast to `number` unchecked.
 *
 * @throws Error when `raw` is not an object.
 */
function parseBudgetConfig(raw: unknown, ctx: string): BudgetConfig {
  const obj = asObj(raw, ctx);
  const band = obj.escalationBand;

  return stripUndefined({
    maxCostUsd: optNum(obj.maxCostUsd),
    maxCalls: optNum(obj.maxCalls),
    escalationModel: optStr(obj.escalationModel),
    escalationBand: Array.isArray(band) ? (band as number[]) : undefined,
    escalationBudgetPct: optNum(obj.escalationBudgetPct),
    warnAtPct: optNum(obj.warnAtPct),
  }) as BudgetConfig;
}
550
+
551
/**
 * Parse the LLM scorer section.
 *
 * Defaults: `enabled` = `false`, `autoThreshold` = 0.9, `candidateLo` = 0.6,
 * `candidateHi` = 0.9, `batchSize` = 10, `maxWorkers` = 4,
 * `mode` = `"pairwise"`. Remaining properties are optional.
 *
 * @throws Error when `raw` or `budget` is malformed, or `mode` is invalid.
 */
function parseLLMScorerConfig(
  raw: unknown,
  ctx: string,
): LLMScorerConfig {
  const obj = asObj(raw, ctx);

  // Budget is parsed before mode is validated, so a config with both wrong
  // reports the budget problem first.
  let budget;
  if (typeof obj.budget === "object" && obj.budget !== null) {
    budget = parseBudgetConfig(obj.budget, `${ctx}.budget`);
  }
  const mode = requireIn(obj.mode, VALID_LLM_MODES, `${ctx}.mode`, "pairwise");

  return stripUndefined({
    // Defaults to false: only an explicit `true` enables the scorer.
    enabled: obj.enabled === true,
    provider: optStr(obj.provider),
    model: optStr(obj.model),
    autoThreshold: optNum(obj.autoThreshold) ?? 0.9,
    candidateLo: optNum(obj.candidateLo) ?? 0.6,
    candidateHi: optNum(obj.candidateHi) ?? 0.9,
    batchSize: optNum(obj.batchSize) ?? 10,
    maxWorkers: optNum(obj.maxWorkers) ?? 4,
    budget,
    mode,
    clusterMaxSize: optNum(obj.clusterMaxSize),
    clusterMinSize: optNum(obj.clusterMinSize),
  }) as LLMScorerConfig;
}
579
+
580
/**
 * Parse one validation rule.
 *
 * `column` and `ruleType` are required; `params` defaults to `{}` and
 * `action` defaults to `"flag"`.
 *
 * @throws Error when `raw` is not an object, `column` is not a string, or
 *   `ruleType` / `action` is not an allowed value.
 */
function parseValidationRuleConfig(
  raw: unknown,
  ctx: string,
): ValidationRuleConfig {
  const obj = asObj(raw, ctx);

  const column = asStr(obj.column, `${ctx}.column`);
  const ruleType = requireIn(
    obj.ruleType,
    VALID_VALIDATION_RULE_TYPES,
    `${ctx}.ruleType`,
  );
  const rawParams = obj.params;
  const params =
    typeof rawParams === "object" && rawParams !== null
      ? (rawParams as Record<string, unknown>)
      : {};
  const action = requireIn(
    obj.action,
    VALID_VALIDATION_ACTIONS,
    `${ctx}.action`,
    "flag",
  );

  return { column, ruleType, params, action };
}
604
+
605
/**
 * Parse the validation section: a `rules` list (default `[]`) and an
 * `autoFix` flag (default `false`).
 *
 * @throws Error when `raw` is not an object or a rule fails to parse.
 */
function parseValidationConfig(
  raw: unknown,
  ctx: string,
): ValidationConfig {
  const obj = asObj(raw, ctx);
  const rawRules = Array.isArray(obj.rules) ? (obj.rules as unknown[]) : [];

  return {
    rules: rawRules.map((rule, idx) =>
      parseValidationRuleConfig(rule, `${ctx}.rules[${idx}]`),
    ),
    // Defaults to false: only an explicit `true` enables auto-fix.
    autoFix: obj.autoFix === true,
  };
}
619
+
620
/**
 * Parse the domain section.
 *
 * Defaults: `enabled` = `false`, `confidenceThreshold` = 0.8,
 * `llmValidation` = `false`. `mode` and `budget` are optional.
 *
 * @throws Error when `raw` or `budget` is malformed.
 */
function parseDomainConfig(raw: unknown, ctx: string): DomainConfig {
  const obj = asObj(raw, ctx);
  const threshold = obj.confidenceThreshold;
  const rawBudget = obj.budget;

  return stripUndefined({
    // Both flags default to false: only an explicit `true` turns them on.
    enabled: obj.enabled === true,
    mode: optStr(obj.mode),
    confidenceThreshold: typeof threshold === "number" ? threshold : 0.8,
    llmValidation: obj.llmValidation === true,
    budget:
      typeof rawBudget === "object" && rawBudget !== null
        ? parseBudgetConfig(rawBudget, `${ctx}.budget`)
        : undefined,
  }) as DomainConfig;
}
637
+
638
/**
 * Parse the quality section.
 *
 * Defaults: `enabled` = `true`, `mode` = `"silent"`, `fixMode` = `"safe"`.
 * `domain` is optional.
 *
 * @throws Error when `raw` is not an object or `mode` / `fixMode` is invalid.
 */
function parseQualityConfig(raw: unknown, ctx: string): QualityConfig {
  const obj = asObj(raw, ctx);
  const mode = requireIn(obj.mode, VALID_QUALITY_MODES, `${ctx}.mode`, "silent");
  const fixMode = requireIn(
    obj.fixMode,
    VALID_QUALITY_FIX_MODES,
    `${ctx}.fixMode`,
    "safe",
  );

  return stripUndefined({
    // Defaults to true: only an explicit `false` disables the section.
    enabled: obj.enabled !== false,
    mode,
    fixMode,
    domain: optStr(obj.domain),
  }) as QualityConfig;
}
652
+
653
/**
 * Parse the transform section: `enabled` (default `true`) and `mode`
 * (default `"silent"`, validated against `VALID_QUALITY_MODES`).
 *
 * @throws Error when `raw` is not an object or `mode` is invalid.
 */
function parseTransformConfig(raw: unknown, ctx: string): TransformConfig {
  const obj = asObj(raw, ctx);
  // Defaults to true: only an explicit `false` disables the section.
  const enabled = obj.enabled !== false;
  const mode = requireIn(obj.mode, VALID_QUALITY_MODES, `${ctx}.mode`, "silent");
  return { enabled, mode };
}
660
+
661
/**
 * Parse the memory `learning` section.
 *
 * Defaults: `thresholdMinCorrections` = 10, `weightsMinCorrections` = 50.
 *
 * @throws Error when `raw` is not an object.
 */
function parseLearningConfig(raw: unknown, ctx: string): LearningConfig {
  const obj = asObj(raw, ctx);
  const numberOr = (value: unknown, fallback: number): number =>
    typeof value === "number" ? value : fallback;

  return {
    thresholdMinCorrections: numberOr(obj.thresholdMinCorrections, 10),
    weightsMinCorrections: numberOr(obj.weightsMinCorrections, 50),
  };
}
674
+
675
/**
 * Parse the memory section.
 *
 * Defaults: `enabled` = `false`, `backend` = `"sqlite"`, `trust` = 0.9,
 * `learning` = `{ thresholdMinCorrections: 10, weightsMinCorrections: 50 }`.
 * `path` is optional.
 *
 * @throws Error when `raw` or `learning` is malformed, or `backend` is invalid.
 */
function parseMemoryConfig(raw: unknown, ctx: string): MemoryConfig {
  const obj = asObj(raw, ctx);

  const backend = requireIn(
    obj.backend,
    VALID_MEMORY_BACKENDS,
    `${ctx}.backend`,
    "sqlite",
  );
  const rawLearning = obj.learning;
  const learning =
    typeof rawLearning === "object" && rawLearning !== null
      ? parseLearningConfig(rawLearning, `${ctx}.learning`)
      : { thresholdMinCorrections: 10, weightsMinCorrections: 50 };

  return stripUndefined({
    // Defaults to false: only an explicit `true` enables memory.
    enabled: obj.enabled === true,
    backend,
    path: optStr(obj.path),
    trust: typeof obj.trust === "number" ? obj.trust : 0.9,
    learning,
  }) as MemoryConfig;
}
693
+
694
/**
 * Parse one input-file entry. `path` is required; every other property is
 * optional and omitted when absent or of the wrong type.
 *
 * NOTE(review): `columnMap` values and `skipRows` elements are cast without
 * being checked.
 *
 * @throws Error when `raw` is not an object or `path` is not a string.
 */
function parseInputFileConfig(
  raw: unknown,
  ctx: string,
): InputFileConfig {
  const obj = asObj(raw, ctx);
  const path = asStr(obj.path, `${ctx}.path`);
  const rawMap = obj.columnMap;
  const rawSkip = obj.skipRows;

  return stripUndefined({
    path,
    idColumn: optStr(obj.idColumn),
    sourceLabel: optStr(obj.sourceLabel),
    sourceName: optStr(obj.sourceName),
    columnMap:
      typeof rawMap === "object" && rawMap !== null
        ? (rawMap as Record<string, string>)
        : undefined,
    delimiter: optStr(obj.delimiter),
    encoding: optStr(obj.encoding),
    sheet: optStr(obj.sheet),
    parseMode: optStr(obj.parseMode),
    headerRow: optNum(obj.headerRow),
    hasHeader: optBool(obj.hasHeader),
    skipRows: Array.isArray(rawSkip) ? (rawSkip as number[]) : undefined,
  }) as InputFileConfig;
}
719
+
720
/**
 * Parse the input section: a `files` list (default `[]`) plus optional
 * `fileA` / `fileB` entries.
 *
 * @throws Error when `raw` is not an object or any file entry fails to parse.
 */
function parseInputConfig(raw: unknown, ctx: string): InputConfig {
  const obj = asObj(raw, ctx);

  // Parses a single optional file entry; yields undefined when it is not an object.
  const optionalFile = (value: unknown, label: string) =>
    typeof value === "object" && value !== null
      ? parseInputFileConfig(value, `${ctx}.${label}`)
      : undefined;

  const rawFiles = Array.isArray(obj.files) ? (obj.files as unknown[]) : [];
  const files = rawFiles.map((entry, idx) =>
    parseInputFileConfig(entry, `${ctx}.files[${idx}]`),
  );
  const fileA = optionalFile(obj.fileA, "fileA");
  const fileB = optionalFile(obj.fileB, "fileB");

  return stripUndefined({ files, fileA, fileB }) as InputConfig;
}
738
+
739
/**
 * Build an OutputConfig from the raw `output` section.
 *
 * All four fields (`path`, `format`, `directory`, `runName`) are passed
 * through `optStr`, and the result goes through `stripUndefined`.
 *
 * @param raw - The unvalidated output section.
 * @param ctx - Context path handed to `asObj`.
 * @returns The assembled config.
 */
function parseOutputConfig(raw: unknown, ctx: string): OutputConfig {
  const { path, format, directory, runName } = asObj(raw, ctx);

  const parsed = {
    path: optStr(path),
    format: optStr(format),
    directory: optStr(directory),
    runName: optStr(runName),
  };

  return stripUndefined(parsed) as OutputConfig;
}
748
+
749
+ // ---------------------------------------------------------------------------
750
+ // Public API
751
+ // ---------------------------------------------------------------------------
752
+
753
/**
 * Run `parse` on `value` only when it is a non-null object; otherwise return
 * `undefined` so the section is treated as absent.
 */
function parseSectionIfObject<T>(
  value: unknown,
  ctx: string,
  parse: (raw: object, ctx: string) => T,
): T | undefined {
  return typeof value === "object" && value !== null
    ? parse(value, ctx)
    : undefined;
}

/**
 * Parse a raw JS object (already deserialized from YAML or JSON) into a
 * GoldenMatchConfig.
 *
 * Steps performed here:
 * - All keys are passed through `camelizeKeys` (snake_case -> camelCase).
 * - Match keys are read from `matchkeys`, falling back to `matchSettings`
 *   when `matchkeys` is null or undefined. A non-array value is ignored.
 * - Each nested section that is a non-null object is delegated to its
 *   `parse*Config` helper; sections of any other type are omitted.
 * - `threshold`, `backend`, `llmAuto`, and `llmBoost` go through
 *   `optNum` / `optStr` / `optBool`.
 *
 * The `default` -> `defaultStrategy` normalization for golden rules is not
 * done in this function; it is presumably handled inside
 * `parseGoldenRulesConfig` (confirm there).
 *
 * @param raw - The deserialized config root. Must be a plain (non-array)
 *   object.
 * @returns The config with undefined-valued keys removed by `stripUndefined`.
 * @throws Error if `raw` is not an object, is null, or is an array. Errors
 *   raised by the nested parse helpers propagate unchanged.
 */
export function parseConfig(raw: unknown): GoldenMatchConfig {
  // Arrays are `typeof "object"` too; without the explicit check an array root
  // would be accepted and silently produce an empty config.
  if (typeof raw !== "object" || raw === null || Array.isArray(raw)) {
    throw new Error("Invalid config: expected a non-null object");
  }

  // Camelize all keys recursively.
  // NOTE(review): if camelizeKeys also rewrites keys inside user-data maps
  // (e.g. input.files[].columnMap, whose keys look like source column names),
  // those names would be altered before parsing. Confirm against camelizeKeys.
  const obj = camelizeKeys(raw) as RawObj;

  // Normalize matchkeys: accept either `matchkeys` or `matchSettings`.
  const rawMatchkeys = obj.matchkeys ?? obj.matchSettings;
  const matchkeys = Array.isArray(rawMatchkeys)
    ? rawMatchkeys.map((mk: unknown, i: number) =>
        parseMatchkeyConfig(mk, `matchkeys[${i}]`),
      )
    : undefined;

  const config = stripUndefined({
    matchkeys,
    blocking: parseSectionIfObject(obj.blocking, "blocking", parseBlockingConfig),
    threshold: optNum(obj.threshold),
    goldenRules: parseSectionIfObject(
      obj.goldenRules,
      "goldenRules",
      parseGoldenRulesConfig,
    ),
    standardization: parseSectionIfObject(
      obj.standardization,
      "standardization",
      parseStandardizationConfig,
    ),
    validation: parseSectionIfObject(
      obj.validation,
      "validation",
      parseValidationConfig,
    ),
    quality: parseSectionIfObject(obj.quality, "quality", parseQualityConfig),
    transform: parseSectionIfObject(
      obj.transform,
      "transform",
      parseTransformConfig,
    ),
    llmScorer: parseSectionIfObject(
      obj.llmScorer,
      "llmScorer",
      parseLLMScorerConfig,
    ),
    domain: parseSectionIfObject(obj.domain, "domain", parseDomainConfig),
    memory: parseSectionIfObject(obj.memory, "memory", parseMemoryConfig),
    input: parseSectionIfObject(obj.input, "input", parseInputConfig),
    output: parseSectionIfObject(obj.output, "output", parseOutputConfig),
    backend: optStr(obj.backend),
    llmAuto: optBool(obj.llmAuto),
    llmBoost: optBool(obj.llmBoost),
  }) as GoldenMatchConfig;

  return config;
}
833
+
834
/**
 * Parse a YAML string into a GoldenMatchConfig.
 *
 * The caller supplies the YAML parse function (e.g. from the `yaml` npm
 * package) so this module stays edge-safe with no dynamic imports.
 *
 * @param yamlStr - The YAML configuration string.
 * @param yamlParseFn - A function that parses a YAML string into a JS value.
 * @returns The result of `parseConfig` applied to the parsed document root.
 * @throws Error if the parsed root is not an object, is null, or is an array
 *   (a YAML sequence). Errors from `yamlParseFn` and `parseConfig` propagate
 *   unchanged.
 */
export function parseConfigYaml(
  yamlStr: string,
  yamlParseFn: (s: string) => unknown,
): GoldenMatchConfig {
  const raw = yamlParseFn(yamlStr);
  // A top-level YAML sequence deserializes to an array, which is
  // `typeof "object"`; reject it here so the error names the YAML root.
  if (typeof raw !== "object" || raw === null || Array.isArray(raw)) {
    throw new Error("Invalid YAML config: expected a non-null object at root");
  }
  return parseConfig(raw);
}
853
+
854
/**
 * Serialize a GoldenMatchConfig to a string using the supplied stringifier.
 *
 * The config is first round-tripped through JSON, which drops
 * undefined-valued properties and leaves plain data. Its keys are then
 * converted with `snakeifyKeys`, and the result is handed to
 * `yamlStringifyFn`.
 *
 * @param config - The typed config object.
 * @param yamlStringifyFn - A function that serializes a JS value to YAML.
 * @returns Whatever string `yamlStringifyFn` produces for the converted data.
 */
export function configToYaml(
  config: GoldenMatchConfig,
  yamlStringifyFn: (obj: unknown) => string,
): string {
  // JSON round trip first (strips undefined), key conversion second.
  const jsonText = JSON.stringify(config);
  const plainData = JSON.parse(jsonText);
  return yamlStringifyFn(snakeifyKeys(plainData));
}