goldenmatch 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +140 -0
- package/dist/cli.cjs +6079 -0
- package/dist/cli.cjs.map +1 -0
- package/dist/cli.d.cts +1 -0
- package/dist/cli.d.ts +1 -0
- package/dist/cli.js +6076 -0
- package/dist/cli.js.map +1 -0
- package/dist/core/index.cjs +8449 -0
- package/dist/core/index.cjs.map +1 -0
- package/dist/core/index.d.cts +1972 -0
- package/dist/core/index.d.ts +1972 -0
- package/dist/core/index.js +8318 -0
- package/dist/core/index.js.map +1 -0
- package/dist/index.cjs +8449 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +2 -0
- package/dist/index.d.ts +2 -0
- package/dist/index.js +8318 -0
- package/dist/index.js.map +1 -0
- package/dist/node/backends/score-worker.cjs +934 -0
- package/dist/node/backends/score-worker.cjs.map +1 -0
- package/dist/node/backends/score-worker.d.cts +14 -0
- package/dist/node/backends/score-worker.d.ts +14 -0
- package/dist/node/backends/score-worker.js +932 -0
- package/dist/node/backends/score-worker.js.map +1 -0
- package/dist/node/index.cjs +11430 -0
- package/dist/node/index.cjs.map +1 -0
- package/dist/node/index.d.cts +554 -0
- package/dist/node/index.d.ts +554 -0
- package/dist/node/index.js +11277 -0
- package/dist/node/index.js.map +1 -0
- package/dist/types-DhUdX5Rc.d.cts +304 -0
- package/dist/types-DhUdX5Rc.d.ts +304 -0
- package/examples/01-basic-dedupe.ts +60 -0
- package/examples/02-match-two-datasets.ts +48 -0
- package/examples/03-csv-file-pipeline.ts +62 -0
- package/examples/04-string-scoring.ts +63 -0
- package/examples/05-custom-config.ts +94 -0
- package/examples/06-probabilistic-fs.ts +72 -0
- package/examples/07-pprl-privacy.ts +76 -0
- package/examples/08-streaming.ts +79 -0
- package/examples/09-llm-scorer.ts +79 -0
- package/examples/10-explain.ts +60 -0
- package/examples/11-evaluate.ts +61 -0
- package/examples/README.md +53 -0
- package/package.json +66 -0
- package/src/cli.ts +372 -0
- package/src/core/ann-blocker.ts +593 -0
- package/src/core/api.ts +220 -0
- package/src/core/autoconfig.ts +363 -0
- package/src/core/autofix.ts +102 -0
- package/src/core/blocker.ts +655 -0
- package/src/core/cluster.ts +699 -0
- package/src/core/compare-clusters.ts +176 -0
- package/src/core/config/loader.ts +869 -0
- package/src/core/cross-encoder.ts +614 -0
- package/src/core/data.ts +430 -0
- package/src/core/domain.ts +277 -0
- package/src/core/embedder.ts +562 -0
- package/src/core/evaluate.ts +156 -0
- package/src/core/explain.ts +352 -0
- package/src/core/golden.ts +524 -0
- package/src/core/graph-er.ts +371 -0
- package/src/core/index.ts +314 -0
- package/src/core/ingest.ts +112 -0
- package/src/core/learned-blocking.ts +305 -0
- package/src/core/lineage.ts +221 -0
- package/src/core/llm/budget.ts +258 -0
- package/src/core/llm/cluster.ts +542 -0
- package/src/core/llm/scorer.ts +396 -0
- package/src/core/match-one.ts +95 -0
- package/src/core/matchkey.ts +97 -0
- package/src/core/memory/corrections.ts +179 -0
- package/src/core/memory/learner.ts +218 -0
- package/src/core/memory/store.ts +114 -0
- package/src/core/pipeline.ts +366 -0
- package/src/core/pprl/protocol.ts +216 -0
- package/src/core/probabilistic.ts +511 -0
- package/src/core/profiler.ts +212 -0
- package/src/core/quality.ts +197 -0
- package/src/core/review-queue.ts +177 -0
- package/src/core/scorer.ts +855 -0
- package/src/core/sensitivity.ts +196 -0
- package/src/core/standardize.ts +279 -0
- package/src/core/streaming.ts +128 -0
- package/src/core/transforms.ts +599 -0
- package/src/core/types.ts +570 -0
- package/src/core/validate.ts +243 -0
- package/src/index.ts +8 -0
- package/src/node/a2a/server.ts +470 -0
- package/src/node/api/server.ts +412 -0
- package/src/node/backends/duckdb.ts +130 -0
- package/src/node/backends/score-worker.ts +41 -0
- package/src/node/backends/workers.ts +212 -0
- package/src/node/config-file.ts +66 -0
- package/src/node/connectors/base.ts +57 -0
- package/src/node/connectors/bigquery.ts +61 -0
- package/src/node/connectors/databricks.ts +69 -0
- package/src/node/connectors/file.ts +350 -0
- package/src/node/connectors/hubspot.ts +62 -0
- package/src/node/connectors/index.ts +43 -0
- package/src/node/connectors/salesforce.ts +93 -0
- package/src/node/connectors/snowflake.ts +73 -0
- package/src/node/db/postgres.ts +173 -0
- package/src/node/db/sync.ts +103 -0
- package/src/node/dedupe-file.ts +156 -0
- package/src/node/index.ts +89 -0
- package/src/node/mcp/server.ts +940 -0
- package/src/node/tui/app.ts +756 -0
- package/src/node/tui/index.ts +6 -0
- package/src/node/tui/widgets.ts +128 -0
- package/tests/parity/scorer-ground-truth.test.ts +118 -0
- package/tests/smoke.test.ts +46 -0
- package/tests/unit/a2a-server.test.ts +175 -0
- package/tests/unit/ann-blocker.test.ts +117 -0
- package/tests/unit/api-server.test.ts +239 -0
- package/tests/unit/api.test.ts +77 -0
- package/tests/unit/autoconfig.test.ts +103 -0
- package/tests/unit/autofix.test.ts +71 -0
- package/tests/unit/blocker.test.ts +164 -0
- package/tests/unit/buildBlocksAsync.test.ts +63 -0
- package/tests/unit/cluster.test.ts +213 -0
- package/tests/unit/compare-clusters.test.ts +42 -0
- package/tests/unit/config-loader.test.ts +301 -0
- package/tests/unit/connectors-base.test.ts +48 -0
- package/tests/unit/cross-encoder-model.test.ts +198 -0
- package/tests/unit/cross-encoder.test.ts +173 -0
- package/tests/unit/db-connectors.test.ts +37 -0
- package/tests/unit/domain.test.ts +80 -0
- package/tests/unit/embedder.test.ts +151 -0
- package/tests/unit/evaluate.test.ts +85 -0
- package/tests/unit/explain.test.ts +73 -0
- package/tests/unit/golden.test.ts +97 -0
- package/tests/unit/graph-er.test.ts +173 -0
- package/tests/unit/hnsw-ann.test.ts +283 -0
- package/tests/unit/hubspot-connector.test.ts +118 -0
- package/tests/unit/ingest.test.ts +97 -0
- package/tests/unit/learned-blocking.test.ts +134 -0
- package/tests/unit/lineage.test.ts +135 -0
- package/tests/unit/match-one.test.ts +129 -0
- package/tests/unit/matchkey.test.ts +97 -0
- package/tests/unit/mcp-server.test.ts +183 -0
- package/tests/unit/memory.test.ts +119 -0
- package/tests/unit/pipeline.test.ts +118 -0
- package/tests/unit/pprl-protocol.test.ts +381 -0
- package/tests/unit/probabilistic.test.ts +494 -0
- package/tests/unit/profiler.test.ts +68 -0
- package/tests/unit/review-queue.test.ts +68 -0
- package/tests/unit/salesforce-connector.test.ts +148 -0
- package/tests/unit/scorer.test.ts +301 -0
- package/tests/unit/sensitivity.test.ts +154 -0
- package/tests/unit/standardize.test.ts +84 -0
- package/tests/unit/streaming.test.ts +82 -0
- package/tests/unit/transforms.test.ts +208 -0
- package/tests/unit/tui-widgets.test.ts +42 -0
- package/tests/unit/tui.test.ts +24 -0
- package/tests/unit/validate.test.ts +145 -0
- package/tests/unit/workers-parallel.test.ts +99 -0
- package/tests/unit/workers.test.ts +74 -0
- package/tsconfig.json +25 -0
- package/tsup.config.ts +37 -0
- package/vitest.config.ts +11 -0
|
@@ -0,0 +1,869 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* config/loader.ts — Config loader that parses raw objects (from YAML/JSON)
|
|
3
|
+
* into typed GoldenMatchConfig.
|
|
4
|
+
*
|
|
5
|
+
* Edge-safe: no `node:` imports, no `require()`.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
import type {
|
|
9
|
+
GoldenMatchConfig,
|
|
10
|
+
MatchkeyConfig,
|
|
11
|
+
MatchkeyField,
|
|
12
|
+
BlockingConfig,
|
|
13
|
+
BlockingKeyConfig,
|
|
14
|
+
GoldenRulesConfig,
|
|
15
|
+
GoldenFieldRule,
|
|
16
|
+
StandardizationConfig,
|
|
17
|
+
LLMScorerConfig,
|
|
18
|
+
BudgetConfig,
|
|
19
|
+
ValidationConfig,
|
|
20
|
+
ValidationRuleConfig,
|
|
21
|
+
DomainConfig,
|
|
22
|
+
QualityConfig,
|
|
23
|
+
TransformConfig,
|
|
24
|
+
MemoryConfig,
|
|
25
|
+
LearningConfig,
|
|
26
|
+
InputConfig,
|
|
27
|
+
InputFileConfig,
|
|
28
|
+
OutputConfig,
|
|
29
|
+
SortKeyField,
|
|
30
|
+
CanopyConfig,
|
|
31
|
+
} from "../types.js";
|
|
32
|
+
import {
|
|
33
|
+
VALID_SCORERS,
|
|
34
|
+
VALID_TRANSFORMS,
|
|
35
|
+
VALID_STRATEGIES,
|
|
36
|
+
VALID_STANDARDIZERS,
|
|
37
|
+
} from "../types.js";
|
|
38
|
+
|
|
39
|
+
// ---------------------------------------------------------------------------
|
|
40
|
+
// String-union validation
|
|
41
|
+
// ---------------------------------------------------------------------------
|
|
42
|
+
|
|
43
|
+
/** Allowed values for a matchkey's `type`. */
const VALID_MATCHKEY_TYPES = new Set(["exact", "weighted", "probabilistic"] as const);

/** Allowed values for the blocking `strategy`. */
const VALID_BLOCKING_STRATEGIES = new Set([
  "static",
  "adaptive",
  "sorted_neighborhood",
  "multi_pass",
  "ann",
  "canopy",
  "ann_pairs",
  "learned",
] as const);

/** Allowed values for the memory `backend`. */
const VALID_MEMORY_BACKENDS = new Set(["sqlite", "postgres"] as const);

/** Allowed values for `mode` in the quality and transform sections. */
const VALID_QUALITY_MODES = new Set(["silent", "announced", "disabled"] as const);

/** Allowed values for the quality `fixMode`. */
const VALID_QUALITY_FIX_MODES = new Set(["safe", "moderate", "none"] as const);

/** Allowed values for the LLM scorer `mode`. */
const VALID_LLM_MODES = new Set(["pairwise", "cluster"] as const);

/** Allowed values for a validation rule's `ruleType`. */
const VALID_VALIDATION_RULE_TYPES = new Set([
  "regex",
  "min_length",
  "max_length",
  "not_null",
  "in_set",
  "format",
] as const);

/** Allowed values for a validation rule's `action`. */
const VALID_VALIDATION_ACTIONS = new Set(["null", "quarantine", "flag"] as const);
|
|
86
|
+
|
|
87
|
+
/**
 * Check `value` against a closed set of string options.
 *
 * @param value - Raw value taken from the config object.
 * @param allowed - The set of accepted strings.
 * @param fieldName - Name used in error messages.
 * @param defaultValue - Returned when `value` is null/undefined.
 * @returns `value` (narrowed to `T`) or `defaultValue`.
 * @throws Error when `value` is missing without a default, or when it is
 *   not a string contained in `allowed`.
 */
function requireIn<T extends string>(
  value: unknown,
  allowed: ReadonlySet<T>,
  fieldName: string,
  defaultValue?: T,
): T {
  const isMissing = value === undefined || value === null;
  if (isMissing && defaultValue !== undefined) {
    return defaultValue;
  }
  if (isMissing) {
    throw new Error(`Required field '${fieldName}' is missing`);
  }

  if (typeof value === "string" && (allowed as ReadonlySet<string>).has(value)) {
    return value as T;
  }

  // List the options alphabetically so the message is stable.
  const listing = Array.from(allowed).sort().join(", ");
  throw new Error(
    `Invalid value '${String(value)}' for '${fieldName}'. Valid options: ${listing}`,
  );
}
|
|
109
|
+
|
|
110
|
+
/**
 * Decide whether a transform name is acceptable.
 *
 * A name passes when it is listed in `VALID_TRANSFORMS` or matches one of
 * the parametric forms:
 *   - `substring:<n>:<n>`
 *   - `qgram:<n>`
 *   - `bloom_filter` or `bloom_filter:<...>`
 */
function isValidTransform(t: string): boolean {
  const known = VALID_TRANSFORMS as ReadonlySet<string>;
  return (
    known.has(t) ||
    /^substring:\d+:\d+$/.test(t) ||
    /^qgram:\d+$/.test(t) ||
    t === "bloom_filter" ||
    t.startsWith("bloom_filter:")
  );
}
|
|
123
|
+
|
|
124
|
+
// ---------------------------------------------------------------------------
|
|
125
|
+
// Snake_case to camelCase conversion
|
|
126
|
+
// ---------------------------------------------------------------------------
|
|
127
|
+
|
|
128
|
+
/** Convert a snake_case key to camelCase (only `_` followed by a-z is rewritten). */
function snakeToCamel(s: string): string {
  return s.replace(/_([a-z])/g, (_, c: string) => c.toUpperCase());
}

/**
 * Recursively convert all keys of plain objects from snake_case to camelCase.
 *
 * Arrays are mapped element by element. Primitives, `null` and `undefined`
 * are returned as-is. Objects that are not plain (their prototype is neither
 * `Object.prototype` nor `null`, e.g. `Date`) are returned untouched:
 * rebuilding them from `Object.entries` would silently turn them into `{}`.
 *
 * @param obj - Any parsed YAML/JSON value.
 * @returns A new structure with camelCase keys; the input is not mutated.
 */
function camelizeKeys(obj: unknown): unknown {
  if (obj === null || typeof obj !== "object") return obj;
  if (Array.isArray(obj)) return obj.map((item) => camelizeKeys(item));

  // Only recurse into plain objects; class instances pass through unchanged.
  const proto: unknown = Object.getPrototypeOf(obj);
  if (proto !== Object.prototype && proto !== null) return obj;

  const result: Record<string, unknown> = {};
  for (const [key, val] of Object.entries(obj as Record<string, unknown>)) {
    result[snakeToCamel(key)] = camelizeKeys(val);
  }
  return result;
}
|
|
146
|
+
|
|
147
|
+
/** Convert a camelCase key to snake_case (each A-Z becomes `_` + lowercase). */
function camelToSnake(s: string): string {
  return s.replace(/[A-Z]/g, (c) => `_${c.toLowerCase()}`);
}

/**
 * Recursively convert all keys of plain objects from camelCase to snake_case.
 *
 * Arrays are mapped element by element. Primitives, `null` and `undefined`
 * are returned as-is. Objects that are not plain (their prototype is neither
 * `Object.prototype` nor `null`, e.g. `Date`) are returned untouched:
 * rebuilding them from `Object.entries` would silently turn them into `{}`.
 *
 * @param obj - Any config value.
 * @returns A new structure with snake_case keys; the input is not mutated.
 */
function snakeifyKeys(obj: unknown): unknown {
  if (obj === null || typeof obj !== "object") return obj;
  if (Array.isArray(obj)) return obj.map((item) => snakeifyKeys(item));

  // Only recurse into plain objects; class instances pass through unchanged.
  const proto: unknown = Object.getPrototypeOf(obj);
  if (proto !== Object.prototype && proto !== null) return obj;

  const result: Record<string, unknown> = {};
  for (const [key, val] of Object.entries(obj as Record<string, unknown>)) {
    result[camelToSnake(key)] = snakeifyKeys(val);
  }
  return result;
}
|
|
164
|
+
|
|
165
|
+
// ---------------------------------------------------------------------------
|
|
166
|
+
// Helpers: strip undefined values for exactOptionalPropertyTypes
|
|
167
|
+
// ---------------------------------------------------------------------------
|
|
168
|
+
|
|
169
|
+
/**
 * Build a shallow copy of `obj` that omits every key whose value is
 * `undefined`.
 *
 * Needed because TypeScript's `exactOptionalPropertyTypes` rejects
 * `undefined` assigned to optional properties. `null` and other falsy
 * values are kept.
 */
function stripUndefined<T extends Record<string, unknown>>(obj: T): T {
  const defined: Record<string, unknown> = {};
  Object.keys(obj).forEach((key) => {
    const value = obj[key];
    if (value !== undefined) {
      defined[key] = value;
    }
  });
  return defined as T;
}
|
|
181
|
+
|
|
182
|
+
// ---------------------------------------------------------------------------
|
|
183
|
+
// Helpers: safe getters
|
|
184
|
+
// ---------------------------------------------------------------------------
|
|
185
|
+
|
|
186
|
+
/** A raw, untyped object as produced by a YAML/JSON parser. */
type RawObj = Record<string, unknown>;

/**
 * Assert that `v` is a non-null, non-array object.
 *
 * @param v - Value to check.
 * @param ctx - Config path used as the error-message prefix.
 * @returns `v` typed as `RawObj`.
 * @throws Error naming the actual kind of value received. `null` and arrays
 *   are reported as such, because `typeof` yields "object" for both and
 *   would produce the unhelpful "expected object, got object".
 */
function asObj(v: unknown, ctx: string): RawObj {
  if (typeof v !== "object" || v === null || Array.isArray(v)) {
    const actual = v === null ? "null" : Array.isArray(v) ? "array" : typeof v;
    throw new Error(`${ctx}: expected object, got ${actual}`);
  }
  return v as RawObj;
}
|
|
194
|
+
|
|
195
|
+
/**
 * Return `v` when it is an array.
 *
 * @param ctx - Config path used as the error-message prefix.
 * @throws Error when `v` is not an array.
 */
function asArr(v: unknown, ctx: string): unknown[] {
  if (Array.isArray(v)) {
    return v;
  }
  throw new Error(`${ctx}: expected array, got ${typeof v}`);
}
|
|
201
|
+
|
|
202
|
+
/**
 * Return `v` when it is a string.
 *
 * @param ctx - Config path used as the error-message prefix.
 * @throws Error when `v` is not a string.
 */
function asStr(v: unknown, ctx: string): string {
  if (typeof v === "string") {
    return v;
  }
  throw new Error(`${ctx}: expected string, got ${typeof v}`);
}
|
|
208
|
+
|
|
209
|
+
/**
 * Return `v` when `typeof v` is "number" (this includes NaN and ±Infinity).
 *
 * @param ctx - Config path used as the error-message prefix.
 * @throws Error when `v` is not a number.
 */
function asNum(v: unknown, ctx: string): number {
  if (typeof v === "number") {
    return v;
  }
  throw new Error(`${ctx}: expected number, got ${typeof v}`);
}
|
|
215
|
+
|
|
216
|
+
/**
 * Return `v` when it is a boolean.
 *
 * @param ctx - Config path used as the error-message prefix.
 * @throws Error when `v` is not a boolean.
 */
function asBool(v: unknown, ctx: string): boolean {
  if (typeof v === "boolean") {
    return v;
  }
  throw new Error(`${ctx}: expected boolean, got ${typeof v}`);
}
|
|
222
|
+
|
|
223
|
+
/** Return `v` if it is a string; any other value yields `undefined`. */
function optStr(v: unknown): string | undefined {
  if (typeof v === "string") {
    return v;
  }
  return undefined;
}
|
|
226
|
+
|
|
227
|
+
/** Return `v` if `typeof v` is "number"; any other value yields `undefined`. */
function optNum(v: unknown): number | undefined {
  if (typeof v === "number") {
    return v;
  }
  return undefined;
}
|
|
230
|
+
|
|
231
|
+
/** Return `v` if it is a boolean; any other value yields `undefined`. */
function optBool(v: unknown): boolean | undefined {
  if (typeof v === "boolean") {
    return v;
  }
  return undefined;
}
|
|
234
|
+
|
|
235
|
+
// ---------------------------------------------------------------------------
|
|
236
|
+
// Parsers for nested config objects
|
|
237
|
+
// ---------------------------------------------------------------------------
|
|
238
|
+
|
|
239
|
+
/**
 * Parse one matchkey field entry.
 *
 * - `transforms` must be strings and each must satisfy `isValidTransform`;
 *   a missing or non-array `transforms` becomes `[]`.
 * - An unrecognised `scorer` only triggers a `console.warn`; a non-string
 *   scorer falls back to "jaro_winkler".
 * - `weight` defaults to 1.0.
 *
 * @param raw - Raw field object.
 * @param ctx - Config path used in error messages.
 * @throws Error when `raw` is not an object, a transform is invalid, or
 *   `field` is not a string.
 */
function parseMatchkeyField(raw: unknown, ctx: string): MatchkeyField {
  const src = asObj(raw, ctx);
  // Label for diagnostics only; `field` itself is validated when the result is built.
  const label = typeof src.field === "string" ? src.field : "<unknown>";

  const ensureString = (entry: unknown, idx: number): string => {
    if (typeof entry === "string") return entry;
    throw new Error(
      `${ctx}.transforms[${idx}]: expected string, got ${typeof entry}`,
    );
  };
  const transforms: string[] = Array.isArray(src.transforms)
    ? (src.transforms as unknown[]).map(ensureString)
    : [];

  // Parametric forms such as "substring:0:3", "qgram:3" and
  // "bloom_filter:high" are accepted by isValidTransform.
  for (const name of transforms) {
    if (isValidTransform(name)) continue;
    const known = Array.from(VALID_TRANSFORMS).sort().join(", ");
    throw new Error(
      `Invalid transform '${name}' on field '${label}'. ` +
        `Valid: ${known}, or 'substring:<n>:<n>', 'qgram:<n>', 'bloom_filter[:...]'.`,
    );
  }

  // Unknown scorers are tolerated here (warning only) so that plugin
  // scorers registered later can still be referenced from config.
  const scorer = src.scorer;
  if (scorer !== undefined && scorer !== null) {
    const recognised =
      typeof scorer === "string" &&
      (VALID_SCORERS as ReadonlySet<string>).has(scorer);
    if (!recognised) {
      // eslint-disable-next-line no-console
      console.warn(
        `Unknown scorer '${String(scorer)}' on field '${label}' ` +
          `(will be rejected at score-time if no plugin is registered).`,
      );
    }
  }

  const field = asStr(src.field, `${ctx}.field`);
  const columns = Array.isArray(src.columns)
    ? (src.columns as string[])
    : undefined;
  const columnWeights =
    typeof src.columnWeights === "object" && src.columnWeights !== null
      ? (src.columnWeights as Record<string, number>)
      : undefined;

  return stripUndefined({
    field,
    transforms,
    scorer: typeof scorer === "string" ? scorer : "jaro_winkler",
    weight: typeof src.weight === "number" ? src.weight : 1.0,
    model: optStr(src.model),
    columns,
    columnWeights,
    levels: optNum(src.levels),
    partialThreshold: optNum(src.partialThreshold),
  }) as MatchkeyField;
}
|
|
297
|
+
|
|
298
|
+
/**
 * Parse one matchkey definition.
 *
 * `type` defaults to "weighted". The keys carried over depend on the type:
 * exact keeps only name/type/fields, probabilistic adds the EM and threshold
 * options, weighted adds `threshold` (default 0.85) and the rerank options.
 *
 * @param raw - Raw matchkey object.
 * @param ctx - Config path used in error messages.
 * @throws Error when `raw` is not an object, a field is invalid, `name` is
 *   not a string, or `type` is not a known matchkey type.
 */
function parseMatchkeyConfig(raw: unknown, ctx: string): MatchkeyConfig {
  const src = asObj(raw, ctx);

  const fields = Array.isArray(src.fields)
    ? src.fields.map((entry: unknown, idx: number) =>
        parseMatchkeyField(entry, `${ctx}.fields[${idx}]`),
      )
    : [];
  const name = asStr(src.name, `${ctx}.name`);
  const type = requireIn(
    src.type,
    VALID_MATCHKEY_TYPES,
    `${ctx}.type`,
    "weighted",
  ) as "exact" | "weighted" | "probabilistic";

  switch (type) {
    case "exact":
      return { name, type: "exact", fields };

    case "probabilistic":
      return stripUndefined({
        name,
        type: "probabilistic" as const,
        fields,
        threshold: optNum(src.threshold),
        emIterations: optNum(src.emIterations),
        convergenceThreshold: optNum(src.convergenceThreshold),
        linkThreshold: optNum(src.linkThreshold),
        reviewThreshold: optNum(src.reviewThreshold),
      }) as MatchkeyConfig;

    default:
      // weighted
      return stripUndefined({
        name,
        type: "weighted" as const,
        fields,
        threshold: optNum(src.threshold) ?? 0.85,
        autoThreshold: optBool(src.autoThreshold),
        rerank: optBool(src.rerank),
        rerankModel: optStr(src.rerankModel),
        rerankBand: optNum(src.rerankBand),
      }) as MatchkeyConfig;
  }
}
|
|
341
|
+
|
|
342
|
+
/**
 * Parse one blocking key. `fields` and `transforms` are taken as-is when
 * they are arrays (their elements are not checked) and default to `[]`.
 *
 * @param ctx - Config path used in error messages.
 * @throws Error when `raw` is not an object.
 */
function parseBlockingKeyConfig(
  raw: unknown,
  ctx: string,
): BlockingKeyConfig {
  const { fields, transforms } = asObj(raw, ctx);
  return {
    fields: Array.isArray(fields) ? (fields as string[]) : [],
    transforms: Array.isArray(transforms) ? (transforms as string[]) : [],
  };
}
|
|
354
|
+
|
|
355
|
+
/**
 * Parse one sort-key entry. `column` is required; `transforms` is taken
 * as-is when it is an array and defaults to `[]`.
 *
 * @param ctx - Config path used in error messages.
 * @throws Error when `raw` is not an object or `column` is not a string.
 */
function parseSortKeyField(raw: unknown, ctx: string): SortKeyField {
  const src = asObj(raw, ctx);
  const column = asStr(src.column, `${ctx}.column`);
  const transforms = Array.isArray(src.transforms)
    ? (src.transforms as string[])
    : [];
  return { column, transforms };
}
|
|
364
|
+
|
|
365
|
+
/**
 * Parse the canopy section. Defaults: `fields` [], `looseThreshold` 0.7,
 * `tightThreshold` 0.9, `maxCanopySize` 1000.
 *
 * @param ctx - Config path used in error messages.
 * @throws Error when `raw` is not an object.
 */
function parseCanopyConfig(raw: unknown, ctx: string): CanopyConfig {
  const src = asObj(raw, ctx);
  const numberOr = (value: unknown, fallback: number): number =>
    typeof value === "number" ? value : fallback;

  return {
    fields: Array.isArray(src.fields) ? (src.fields as string[]) : [],
    looseThreshold: numberOr(src.looseThreshold, 0.7),
    tightThreshold: numberOr(src.tightThreshold, 0.9),
    maxCanopySize: numberOr(src.maxCanopySize, 1000),
  };
}
|
|
374
|
+
|
|
375
|
+
/**
 * Parse the blocking section.
 *
 * Defaults: `strategy` "static", `keys` [], `maxBlockSize` 5000,
 * `skipOversized` false. All remaining options are copied only when they
 * have the expected primitive type; otherwise the key is omitted.
 *
 * @param ctx - Config path used in error messages.
 * @throws Error when `raw` (or a nested key / sort key / canopy entry) is
 *   not an object, or `strategy` is not a known blocking strategy.
 */
function parseBlockingConfig(raw: unknown, ctx: string): BlockingConfig {
  const src = asObj(raw, ctx);

  // Shared parser for the three lists of blocking keys.
  const parseKeyList = (
    value: unknown,
    label: string,
  ): BlockingKeyConfig[] | undefined =>
    Array.isArray(value)
      ? value.map((entry: unknown, idx: number) =>
          parseBlockingKeyConfig(entry, `${ctx}.${label}[${idx}]`),
        )
      : undefined;

  const keys = parseKeyList(src.keys, "keys") ?? [];
  const passes = parseKeyList(src.passes, "passes");
  const subBlockKeys = parseKeyList(src.subBlockKeys, "subBlockKeys");

  const sortKey = Array.isArray(src.sortKey)
    ? src.sortKey.map((entry: unknown, idx: number) =>
        parseSortKeyField(entry, `${ctx}.sortKey[${idx}]`),
      )
    : undefined;

  const hasCanopy = typeof src.canopy === "object" && src.canopy !== null;
  const canopy = hasCanopy
    ? parseCanopyConfig(src.canopy, `${ctx}.canopy`)
    : undefined;

  const strategy = requireIn(
    src.strategy,
    VALID_BLOCKING_STRATEGIES,
    `${ctx}.strategy`,
    "static",
  );

  return stripUndefined({
    strategy,
    keys,
    maxBlockSize:
      typeof src.maxBlockSize === "number" ? src.maxBlockSize : 5000,
    skipOversized: src.skipOversized === true,
    autoSuggest: optBool(src.autoSuggest),
    autoSelect: optBool(src.autoSelect),
    subBlockKeys,
    windowSize: optNum(src.windowSize),
    sortKey,
    passes,
    unionMode: optBool(src.unionMode),
    maxTotalComparisons: optNum(src.maxTotalComparisons),
    annColumn: optStr(src.annColumn),
    annModel: optStr(src.annModel),
    annTopK: optNum(src.annTopK),
    canopy,
    learnedSampleSize: optNum(src.learnedSampleSize),
    learnedMinRecall: optNum(src.learnedMinRecall),
    learnedMinReduction: optNum(src.learnedMinReduction),
    learnedPredicateDepth: optNum(src.learnedPredicateDepth),
    learnedCachePath: optStr(src.learnedCachePath),
  }) as BlockingConfig;
}
|
|
433
|
+
|
|
434
|
+
/**
 * Parse one per-field golden rule. `strategy` is required and must be in
 * `VALID_STRATEGIES`; `dateColumn` and `sourcePriority` are optional.
 *
 * @param ctx - Config path used in error messages.
 * @throws Error when `raw` is not an object or `strategy` is missing/invalid.
 */
function parseGoldenFieldRule(raw: unknown, ctx: string): GoldenFieldRule {
  const src = asObj(raw, ctx);
  const strategy = requireIn(
    src.strategy,
    VALID_STRATEGIES,
    `${ctx}.strategy`,
  ) as GoldenFieldRule["strategy"];
  const sourcePriority = Array.isArray(src.sourcePriority)
    ? (src.sourcePriority as string[])
    : undefined;

  return stripUndefined({
    strategy,
    dateColumn: optStr(src.dateColumn),
    sourcePriority,
  }) as GoldenFieldRule;
}
|
|
448
|
+
|
|
449
|
+
/**
 * Parse the golden-rules section.
 *
 * The default strategy is read from `defaultStrategy`, then from `default`
 * (the YAML spelling), and finally falls back to "most_complete".
 * Other defaults: `maxClusterSize` 10, `autoSplit` true,
 * `qualityWeighting` true, `weakClusterThreshold` 0.3.
 *
 * NOTE(review): unlike per-field strategies, the default strategy is not
 * checked against `VALID_STRATEGIES` here — confirm whether that is intended.
 *
 * @param ctx - Config path used in error messages.
 * @throws Error when `raw` is not an object or a field rule is invalid.
 */
function parseGoldenRulesConfig(
  raw: unknown,
  ctx: string,
): GoldenRulesConfig {
  const src = asObj(raw, ctx);

  let defaultStrategy = "most_complete";
  if (typeof src.defaultStrategy === "string") {
    defaultStrategy = src.defaultStrategy;
  } else if (typeof src.default === "string") {
    defaultStrategy = src.default;
  }

  const fieldRules: Record<string, GoldenFieldRule> = {};
  const rawRules = src.fieldRules;
  const isRuleMap =
    typeof rawRules === "object" &&
    rawRules !== null &&
    !Array.isArray(rawRules);
  if (isRuleMap) {
    for (const [column, rule] of Object.entries(
      rawRules as Record<string, unknown>,
    )) {
      fieldRules[column] = parseGoldenFieldRule(
        rule,
        `${ctx}.fieldRules.${column}`,
      );
    }
  }

  return {
    defaultStrategy,
    fieldRules,
    maxClusterSize:
      typeof src.maxClusterSize === "number" ? src.maxClusterSize : 10,
    // Booleans default to true: anything other than an explicit `false` is true.
    autoSplit: src.autoSplit !== false,
    qualityWeighting: src.qualityWeighting !== false,
    weakClusterThreshold:
      typeof src.weakClusterThreshold === "number"
        ? src.weakClusterThreshold
        : 0.3,
  };
}
|
|
493
|
+
|
|
494
|
+
/**
 * Parse the standardization section.
 *
 * Two layouts are accepted: rules nested under a `rules` object, or a flat
 * object whose keys are column names. Only array-valued entries are kept;
 * every element must be a string listed in `VALID_STANDARDIZERS`.
 *
 * @param ctx - Config path used in error messages.
 * @throws Error when `raw` is not an object, a rule is not a string, or a
 *   standardizer name is unknown.
 */
function parseStandardizationConfig(
  raw: unknown,
  ctx: string,
): StandardizationConfig {
  const src = asObj(raw, ctx);

  // Nested form wins when `rules` is a plain (non-array) object;
  // otherwise the section itself is treated as the column map.
  const nested = src.rules;
  const rulesObj: Record<string, unknown> =
    typeof nested === "object" && nested !== null && !Array.isArray(nested)
      ? (nested as Record<string, unknown>)
      : src;

  const rules: Record<string, readonly string[]> = {};
  for (const [column, value] of Object.entries(rulesObj)) {
    if (!Array.isArray(value)) continue;

    const names = value as unknown[];
    for (const name of names) {
      if (typeof name !== "string") {
        throw new Error(
          `${ctx}.${column}: expected array of strings, got ${typeof name}`,
        );
      }
      if ((VALID_STANDARDIZERS as ReadonlySet<string>).has(name)) continue;

      const known = Array.from(VALID_STANDARDIZERS).sort().join(", ");
      throw new Error(
        `Invalid standardizer '${name}' on column '${column}'. Valid: ${known}`,
      );
    }
    rules[column] = names as string[];
  }

  return { rules };
}
|
|
536
|
+
|
|
537
|
+
/**
 * Parse a budget section. Every key is optional and is omitted from the
 * result unless it has the expected type (`escalationBand` is kept as-is
 * when it is an array).
 *
 * @param ctx - Config path used in error messages.
 * @throws Error when `raw` is not an object.
 */
function parseBudgetConfig(raw: unknown, ctx: string): BudgetConfig {
  const src = asObj(raw, ctx);
  const escalationBand = Array.isArray(src.escalationBand)
    ? (src.escalationBand as number[])
    : undefined;

  return stripUndefined({
    maxCostUsd: optNum(src.maxCostUsd),
    maxCalls: optNum(src.maxCalls),
    escalationModel: optStr(src.escalationModel),
    escalationBand,
    escalationBudgetPct: optNum(src.escalationBudgetPct),
    warnAtPct: optNum(src.warnAtPct),
  }) as BudgetConfig;
}
|
|
550
|
+
|
|
551
|
+
/**
 * Parse the LLM scorer section.
 *
 * Defaults: `enabled` false, `autoThreshold` 0.9, `candidateLo` 0.6,
 * `candidateHi` 0.9, `batchSize` 10, `maxWorkers` 4, `mode` "pairwise".
 *
 * @param ctx - Config path used in error messages.
 * @throws Error when `raw` or `budget` is not an object, or `mode` is not a
 *   known LLM mode.
 */
function parseLLMScorerConfig(
  raw: unknown,
  ctx: string,
): LLMScorerConfig {
  const src = asObj(raw, ctx);
  const numberOr = (value: unknown, fallback: number): number =>
    typeof value === "number" ? value : fallback;

  const hasBudget = typeof src.budget === "object" && src.budget !== null;
  const budget = hasBudget
    ? parseBudgetConfig(src.budget, `${ctx}.budget`)
    : undefined;
  const mode = requireIn(src.mode, VALID_LLM_MODES, `${ctx}.mode`, "pairwise");

  return stripUndefined({
    enabled: src.enabled === true,
    provider: optStr(src.provider),
    model: optStr(src.model),
    autoThreshold: numberOr(src.autoThreshold, 0.9),
    candidateLo: numberOr(src.candidateLo, 0.6),
    candidateHi: numberOr(src.candidateHi, 0.9),
    batchSize: numberOr(src.batchSize, 10),
    maxWorkers: numberOr(src.maxWorkers, 4),
    budget,
    mode,
    clusterMaxSize: optNum(src.clusterMaxSize),
    clusterMinSize: optNum(src.clusterMinSize),
  }) as LLMScorerConfig;
}
|
|
579
|
+
|
|
580
|
+
/**
 * Parse one validation rule. `column` and `ruleType` are required;
 * `params` defaults to `{}` and `action` defaults to "flag".
 *
 * @param ctx - Config path used in error messages.
 * @throws Error when `raw` is not an object, `column` is not a string, or
 *   `ruleType` / `action` is not an allowed value.
 */
function parseValidationRuleConfig(
  raw: unknown,
  ctx: string,
): ValidationRuleConfig {
  const src = asObj(raw, ctx);

  const column = asStr(src.column, `${ctx}.column`);
  const ruleType = requireIn(
    src.ruleType,
    VALID_VALIDATION_RULE_TYPES,
    `${ctx}.ruleType`,
  );
  const params =
    typeof src.params === "object" && src.params !== null
      ? (src.params as Record<string, unknown>)
      : {};
  const action = requireIn(
    src.action,
    VALID_VALIDATION_ACTIONS,
    `${ctx}.action`,
    "flag",
  );

  return { column, ruleType, params, action };
}
|
|
604
|
+
|
|
605
|
+
/**
 * Parse the validation section. `rules` defaults to `[]` and `autoFix`
 * defaults to false.
 *
 * @param ctx - Config path used in error messages.
 * @throws Error when `raw` is not an object or a rule is invalid.
 */
function parseValidationConfig(
  raw: unknown,
  ctx: string,
): ValidationConfig {
  const src = asObj(raw, ctx);
  const rules = Array.isArray(src.rules)
    ? src.rules.map((entry: unknown, idx: number) =>
        parseValidationRuleConfig(entry, `${ctx}.rules[${idx}]`),
      )
    : [];

  return {
    rules,
    autoFix: src.autoFix === true,
  };
}
|
|
619
|
+
|
|
620
|
+
/**
 * Parse the domain section. Defaults: `enabled` false,
 * `confidenceThreshold` 0.8, `llmValidation` false. `mode` and `budget`
 * are optional.
 *
 * @param ctx - Config path used in error messages.
 * @throws Error when `raw` or `budget` is not an object.
 */
function parseDomainConfig(raw: unknown, ctx: string): DomainConfig {
  const src = asObj(raw, ctx);

  const hasBudget = typeof src.budget === "object" && src.budget !== null;
  const budget = hasBudget
    ? parseBudgetConfig(src.budget, `${ctx}.budget`)
    : undefined;

  return stripUndefined({
    enabled: src.enabled === true,
    mode: optStr(src.mode),
    confidenceThreshold:
      typeof src.confidenceThreshold === "number"
        ? src.confidenceThreshold
        : 0.8,
    llmValidation: src.llmValidation === true,
    budget,
  }) as DomainConfig;
}
|
|
637
|
+
|
|
638
|
+
/**
 * Parse the quality section. Defaults: `enabled` true, `mode` "silent",
 * `fixMode` "safe". `domain` is optional.
 *
 * @param ctx - Config path used in error messages.
 * @throws Error when `raw` is not an object, or `mode` / `fixMode` is not an
 *   allowed value.
 */
function parseQualityConfig(raw: unknown, ctx: string): QualityConfig {
  const src = asObj(raw, ctx);

  const mode = requireIn(src.mode, VALID_QUALITY_MODES, `${ctx}.mode`, "silent");
  const fixMode = requireIn(
    src.fixMode,
    VALID_QUALITY_FIX_MODES,
    `${ctx}.fixMode`,
    "safe",
  );

  return stripUndefined({
    // Defaults to true: only an explicit `false` disables it.
    enabled: src.enabled !== false,
    mode,
    fixMode,
    domain: optStr(src.domain),
  }) as QualityConfig;
}
|
|
652
|
+
|
|
653
|
+
/**
 * Build a TransformConfig from an untyped `transform` section.
 *
 * `enabled` defaults to true unless a boolean is supplied. `mode` is checked
 * by `requireIn` against VALID_QUALITY_MODES (the same list the quality
 * section uses), with "silent" passed as the fallback argument.
 *
 * @param raw - The untyped section to read.
 * @param ctx - Path label forwarded to the helpers.
 */
function parseTransformConfig(raw: unknown, ctx: string): TransformConfig {
  const section = asObj(raw, ctx);

  let enabled = true;
  if (typeof section.enabled === "boolean") {
    enabled = section.enabled;
  }

  const mode = requireIn(
    section.mode,
    VALID_QUALITY_MODES,
    `${ctx}.mode`,
    "silent",
  );

  return { enabled, mode };
}
|
|
660
|
+
|
|
661
|
+
/**
 * Build a LearningConfig from an untyped `learning` section.
 *
 * Each field keeps the supplied value when it is a number and otherwise
 * falls back to its default: 10 for `thresholdMinCorrections`, 50 for
 * `weightsMinCorrections`.
 *
 * @param raw - The untyped section to read.
 * @param ctx - Path label forwarded to `asObj`.
 */
function parseLearningConfig(raw: unknown, ctx: string): LearningConfig {
  const section = asObj(raw, ctx);

  // Keep `value` when it is a number, otherwise use `fallback`.
  const numberOr = (value: unknown, fallback: number): number =>
    typeof value === "number" ? value : fallback;

  return {
    thresholdMinCorrections: numberOr(section.thresholdMinCorrections, 10),
    weightsMinCorrections: numberOr(section.weightsMinCorrections, 50),
  };
}
|
|
674
|
+
|
|
675
|
+
/**
 * Build a MemoryConfig from an untyped `memory` section.
 *
 * Defaults applied when a field is missing or has the wrong primitive type:
 * `enabled` -> false, `trust` -> 0.9. `backend` is checked by `requireIn`
 * against VALID_MEMORY_BACKENDS with "sqlite" passed as the fallback
 * argument, and `path` goes through `optStr`. `learning` is parsed when it
 * is a non-null object; otherwise the literal defaults
 * `{ thresholdMinCorrections: 10, weightsMinCorrections: 50 }` are used.
 * The assembled object is passed through `stripUndefined` before being
 * returned.
 *
 * @param raw - The untyped section to read.
 * @param ctx - Path label forwarded to the helpers.
 */
function parseMemoryConfig(raw: unknown, ctx: string): MemoryConfig {
  const section = asObj(raw, ctx);

  const enabled =
    typeof section.enabled === "boolean" ? section.enabled : false;
  const backend = requireIn(
    section.backend,
    VALID_MEMORY_BACKENDS,
    `${ctx}.backend`,
    "sqlite",
  );
  const path = optStr(section.path);
  const trust = typeof section.trust === "number" ? section.trust : 0.9;

  const rawLearning = section.learning;
  const learning =
    typeof rawLearning === "object" && rawLearning !== null
      ? parseLearningConfig(rawLearning, `${ctx}.learning`)
      : { thresholdMinCorrections: 10, weightsMinCorrections: 50 };

  return stripUndefined({
    enabled,
    backend,
    path,
    trust,
    learning,
  }) as MemoryConfig;
}
|
|
693
|
+
|
|
694
|
+
/**
 * Build an InputFileConfig from one untyped input-file entry.
 *
 * `path` is read through `asStr`; the remaining scalar fields go through
 * `optStr` / `optNum` / `optBool`. `columnMap` and `skipRows` are checked at
 * runtime before being typed, so the returned object matches its declared
 * shape instead of relying on an unchecked cast. The assembled object is
 * passed through `stripUndefined` before being returned.
 *
 * NOTE(review): `parseConfig` runs `camelizeKeys` on the whole raw config
 * before this function is reached. If that helper also rewrites the keys of
 * `columnMap`, source column names containing underscores would be altered.
 * Confirm against `camelizeKeys`.
 *
 * @param raw - The untyped entry to read.
 * @param ctx - Path label forwarded to the helpers and used in error messages.
 * @returns The parsed input-file configuration.
 * @throws Error when `columnMap` is an array or has a non-string value, or
 *   when `skipRows` is an array containing a non-number.
 */
function parseInputFileConfig(
  raw: unknown,
  ctx: string,
): InputFileConfig {
  const obj = asObj(raw, ctx);

  // Accept only a plain string-to-string mapping. Non-object values are
  // ignored, exactly as before; malformed objects are rejected.
  const readColumnMap = (
    value: unknown,
  ): Record<string, string> | undefined => {
    if (typeof value !== "object" || value === null) {
      return undefined;
    }
    if (Array.isArray(value)) {
      throw new Error(
        `Invalid config: ${ctx}.columnMap must be an object, got an array`,
      );
    }
    for (const [key, mapped] of Object.entries(value)) {
      if (typeof mapped !== "string") {
        throw new Error(
          `Invalid config: ${ctx}.columnMap[${JSON.stringify(key)}] must be a string`,
        );
      }
    }
    // Every value was verified above, so this assertion is sound.
    return value as Record<string, string>;
  };

  // Accept only an array whose elements are all numbers. Non-array values
  // are ignored, exactly as before.
  const readSkipRows = (value: unknown): number[] | undefined => {
    if (!Array.isArray(value)) {
      return undefined;
    }
    value.forEach((row: unknown, index: number) => {
      if (typeof row !== "number") {
        throw new Error(
          `Invalid config: ${ctx}.skipRows[${index}] must be a number`,
        );
      }
    });
    return value as number[];
  };

  return stripUndefined({
    path: asStr(obj.path, `${ctx}.path`),
    idColumn: optStr(obj.idColumn),
    sourceLabel: optStr(obj.sourceLabel),
    sourceName: optStr(obj.sourceName),
    columnMap: readColumnMap(obj.columnMap),
    delimiter: optStr(obj.delimiter),
    encoding: optStr(obj.encoding),
    sheet: optStr(obj.sheet),
    parseMode: optStr(obj.parseMode),
    headerRow: optNum(obj.headerRow),
    hasHeader: optBool(obj.hasHeader),
    skipRows: readSkipRows(obj.skipRows),
  }) as InputFileConfig;
}
|
|
719
|
+
|
|
720
|
+
/**
 * Build an InputConfig from an untyped `input` section.
 *
 * `files` is parsed entry by entry when it is an array and is otherwise an
 * empty list. `fileA` and `fileB` are parsed only when they are non-null
 * objects. The assembled object is passed through `stripUndefined` before
 * being returned.
 *
 * @param raw - The untyped section to read.
 * @param ctx - Path label forwarded to the helpers (`<ctx>.files[<index>]`,
 *   `<ctx>.fileA`, `<ctx>.fileB`).
 */
function parseInputConfig(raw: unknown, ctx: string): InputConfig {
  const section = asObj(raw, ctx);

  // Parse a single named file slot, or yield undefined when it is absent.
  const parseSlot = (value: unknown, slot: string) =>
    typeof value === "object" && value !== null
      ? parseInputFileConfig(value, `${ctx}.${slot}`)
      : undefined;

  const rawFiles = section.files;
  const files = Array.isArray(rawFiles)
    ? rawFiles.map((entry: unknown, index: number) =>
        parseInputFileConfig(entry, `${ctx}.files[${index}]`),
      )
    : [];
  const fileA = parseSlot(section.fileA, "fileA");
  const fileB = parseSlot(section.fileB, "fileB");

  return stripUndefined({ files, fileA, fileB }) as InputConfig;
}
|
|
738
|
+
|
|
739
|
+
/**
 * Build an OutputConfig from an untyped `output` section.
 *
 * All four fields (`path`, `format`, `directory`, `runName`) go through
 * `optStr`, and the assembled object is passed through `stripUndefined`
 * before being returned.
 *
 * @param raw - The untyped section to read.
 * @param ctx - Path label forwarded to `asObj`.
 */
function parseOutputConfig(raw: unknown, ctx: string): OutputConfig {
  const { path, format, directory, runName } = asObj(raw, ctx);

  const fields = {
    path: optStr(path),
    format: optStr(format),
    directory: optStr(directory),
    runName: optStr(runName),
  };

  return stripUndefined(fields) as OutputConfig;
}
|
|
748
|
+
|
|
749
|
+
// ---------------------------------------------------------------------------
|
|
750
|
+
// Public API
|
|
751
|
+
// ---------------------------------------------------------------------------
|
|
752
|
+
|
|
753
|
+
/**
 * Parse a raw JS object (already deserialized from YAML or JSON) into a
 * GoldenMatchConfig.
 *
 * Handles:
 * - Snake_case to camelCase key conversion (via `camelizeKeys`)
 * - Normalization of `matchkeys` / `match_settings`
 * - Parsing of all nested config objects
 *
 * Each nested section is parsed only when it is present as a non-null
 * object; a section of any other type is left out of the result. Keys whose
 * value ends up `undefined` are removed by `stripUndefined`.
 *
 * @param raw - The deserialized configuration root.
 * @returns The parsed configuration.
 * @throws Error when `raw` is not an object, is `null`, or is an array.
 */
export function parseConfig(raw: unknown): GoldenMatchConfig {
  if (typeof raw !== "object" || raw === null) {
    throw new Error("Invalid config: expected a non-null object");
  }
  // `typeof [] === "object"`, so a list at the root would otherwise slip
  // through and silently produce an empty config.
  if (Array.isArray(raw)) {
    throw new Error("Invalid config: expected an object at root, got an array");
  }

  // Camelize all keys recursively
  const obj = camelizeKeys(raw) as RawObj;

  // Normalize matchkeys: accept either `matchkeys` or `matchSettings`
  const rawMatchkeys = obj.matchkeys ?? obj.matchSettings;
  const matchkeys = Array.isArray(rawMatchkeys)
    ? rawMatchkeys.map((mk: unknown, i: number) =>
        parseMatchkeyConfig(mk, `matchkeys[${i}]`),
      )
    : undefined;

  const config = stripUndefined({
    matchkeys,
    blocking:
      typeof obj.blocking === "object" && obj.blocking !== null
        ? parseBlockingConfig(obj.blocking, "blocking")
        : undefined,
    threshold: optNum(obj.threshold),
    goldenRules:
      typeof obj.goldenRules === "object" && obj.goldenRules !== null
        ? parseGoldenRulesConfig(obj.goldenRules, "goldenRules")
        : undefined,
    standardization:
      typeof obj.standardization === "object" && obj.standardization !== null
        ? parseStandardizationConfig(obj.standardization, "standardization")
        : undefined,
    validation:
      typeof obj.validation === "object" && obj.validation !== null
        ? parseValidationConfig(obj.validation, "validation")
        : undefined,
    quality:
      typeof obj.quality === "object" && obj.quality !== null
        ? parseQualityConfig(obj.quality, "quality")
        : undefined,
    transform:
      typeof obj.transform === "object" && obj.transform !== null
        ? parseTransformConfig(obj.transform, "transform")
        : undefined,
    llmScorer:
      typeof obj.llmScorer === "object" && obj.llmScorer !== null
        ? parseLLMScorerConfig(obj.llmScorer, "llmScorer")
        : undefined,
    domain:
      typeof obj.domain === "object" && obj.domain !== null
        ? parseDomainConfig(obj.domain, "domain")
        : undefined,
    memory:
      typeof obj.memory === "object" && obj.memory !== null
        ? parseMemoryConfig(obj.memory, "memory")
        : undefined,
    input:
      typeof obj.input === "object" && obj.input !== null
        ? parseInputConfig(obj.input, "input")
        : undefined,
    output:
      typeof obj.output === "object" && obj.output !== null
        ? parseOutputConfig(obj.output, "output")
        : undefined,
    backend: optStr(obj.backend),
    llmAuto: optBool(obj.llmAuto),
    llmBoost: optBool(obj.llmBoost),
  }) as GoldenMatchConfig;

  return config;
}
|
|
833
|
+
|
|
834
|
+
/**
 * Parse a YAML string into a GoldenMatchConfig.
 *
 * Requires the caller to provide a YAML parse function (e.g. from the `yaml`
 * npm package) to keep this module edge-safe with no dynamic imports.
 *
 * The parsed document root must be a mapping: scalars, `null` (for example
 * an empty document) and sequences are rejected before the result is handed
 * to `parseConfig`.
 *
 * @param yamlStr - The YAML configuration string.
 * @param yamlParseFn - A function that parses a YAML string into a JS object.
 * @returns The parsed configuration.
 * @throws Error when the parsed root is not an object, is `null`, or is an
 *   array.
 */
export function parseConfigYaml(
  yamlStr: string,
  yamlParseFn: (s: string) => unknown,
): GoldenMatchConfig {
  const raw = yamlParseFn(yamlStr);
  if (typeof raw !== "object" || raw === null) {
    throw new Error("Invalid YAML config: expected a non-null object at root");
  }
  // A YAML sequence deserializes to an array, which passes the `typeof`
  // check above but is not a valid configuration root.
  if (Array.isArray(raw)) {
    throw new Error(
      "Invalid YAML config: expected a mapping at root, got a sequence",
    );
  }
  return parseConfig(raw);
}
|
|
853
|
+
|
|
854
|
+
/**
 * Serialize a GoldenMatchConfig to a YAML string.
 *
 * The config is first round-tripped through JSON, which drops keys whose
 * value is `undefined`. The result is passed through `snakeifyKeys` and then
 * handed to the caller-supplied serializer.
 *
 * @param config - The typed config object.
 * @param yamlStringifyFn - A function that serializes a JS object to YAML.
 * @returns Whatever string `yamlStringifyFn` produces for the converted object.
 */
export function configToYaml(
  config: GoldenMatchConfig,
  yamlStringifyFn: (obj: unknown) => string,
): string {
  // JSON round trip: yields a plain object tree without undefined values.
  const withoutUndefined = JSON.parse(JSON.stringify(config));
  return yamlStringifyFn(snakeifyKeys(withoutUndefined));
}
|