goldenmatch 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +140 -0
- package/dist/cli.cjs +6079 -0
- package/dist/cli.cjs.map +1 -0
- package/dist/cli.d.cts +1 -0
- package/dist/cli.d.ts +1 -0
- package/dist/cli.js +6076 -0
- package/dist/cli.js.map +1 -0
- package/dist/core/index.cjs +8449 -0
- package/dist/core/index.cjs.map +1 -0
- package/dist/core/index.d.cts +1972 -0
- package/dist/core/index.d.ts +1972 -0
- package/dist/core/index.js +8318 -0
- package/dist/core/index.js.map +1 -0
- package/dist/index.cjs +8449 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +2 -0
- package/dist/index.d.ts +2 -0
- package/dist/index.js +8318 -0
- package/dist/index.js.map +1 -0
- package/dist/node/backends/score-worker.cjs +934 -0
- package/dist/node/backends/score-worker.cjs.map +1 -0
- package/dist/node/backends/score-worker.d.cts +14 -0
- package/dist/node/backends/score-worker.d.ts +14 -0
- package/dist/node/backends/score-worker.js +932 -0
- package/dist/node/backends/score-worker.js.map +1 -0
- package/dist/node/index.cjs +11430 -0
- package/dist/node/index.cjs.map +1 -0
- package/dist/node/index.d.cts +554 -0
- package/dist/node/index.d.ts +554 -0
- package/dist/node/index.js +11277 -0
- package/dist/node/index.js.map +1 -0
- package/dist/types-DhUdX5Rc.d.cts +304 -0
- package/dist/types-DhUdX5Rc.d.ts +304 -0
- package/examples/01-basic-dedupe.ts +60 -0
- package/examples/02-match-two-datasets.ts +48 -0
- package/examples/03-csv-file-pipeline.ts +62 -0
- package/examples/04-string-scoring.ts +63 -0
- package/examples/05-custom-config.ts +94 -0
- package/examples/06-probabilistic-fs.ts +72 -0
- package/examples/07-pprl-privacy.ts +76 -0
- package/examples/08-streaming.ts +79 -0
- package/examples/09-llm-scorer.ts +79 -0
- package/examples/10-explain.ts +60 -0
- package/examples/11-evaluate.ts +61 -0
- package/examples/README.md +53 -0
- package/package.json +66 -0
- package/src/cli.ts +372 -0
- package/src/core/ann-blocker.ts +593 -0
- package/src/core/api.ts +220 -0
- package/src/core/autoconfig.ts +363 -0
- package/src/core/autofix.ts +102 -0
- package/src/core/blocker.ts +655 -0
- package/src/core/cluster.ts +699 -0
- package/src/core/compare-clusters.ts +176 -0
- package/src/core/config/loader.ts +869 -0
- package/src/core/cross-encoder.ts +614 -0
- package/src/core/data.ts +430 -0
- package/src/core/domain.ts +277 -0
- package/src/core/embedder.ts +562 -0
- package/src/core/evaluate.ts +156 -0
- package/src/core/explain.ts +352 -0
- package/src/core/golden.ts +524 -0
- package/src/core/graph-er.ts +371 -0
- package/src/core/index.ts +314 -0
- package/src/core/ingest.ts +112 -0
- package/src/core/learned-blocking.ts +305 -0
- package/src/core/lineage.ts +221 -0
- package/src/core/llm/budget.ts +258 -0
- package/src/core/llm/cluster.ts +542 -0
- package/src/core/llm/scorer.ts +396 -0
- package/src/core/match-one.ts +95 -0
- package/src/core/matchkey.ts +97 -0
- package/src/core/memory/corrections.ts +179 -0
- package/src/core/memory/learner.ts +218 -0
- package/src/core/memory/store.ts +114 -0
- package/src/core/pipeline.ts +366 -0
- package/src/core/pprl/protocol.ts +216 -0
- package/src/core/probabilistic.ts +511 -0
- package/src/core/profiler.ts +212 -0
- package/src/core/quality.ts +197 -0
- package/src/core/review-queue.ts +177 -0
- package/src/core/scorer.ts +855 -0
- package/src/core/sensitivity.ts +196 -0
- package/src/core/standardize.ts +279 -0
- package/src/core/streaming.ts +128 -0
- package/src/core/transforms.ts +599 -0
- package/src/core/types.ts +570 -0
- package/src/core/validate.ts +243 -0
- package/src/index.ts +8 -0
- package/src/node/a2a/server.ts +470 -0
- package/src/node/api/server.ts +412 -0
- package/src/node/backends/duckdb.ts +130 -0
- package/src/node/backends/score-worker.ts +41 -0
- package/src/node/backends/workers.ts +212 -0
- package/src/node/config-file.ts +66 -0
- package/src/node/connectors/base.ts +57 -0
- package/src/node/connectors/bigquery.ts +61 -0
- package/src/node/connectors/databricks.ts +69 -0
- package/src/node/connectors/file.ts +350 -0
- package/src/node/connectors/hubspot.ts +62 -0
- package/src/node/connectors/index.ts +43 -0
- package/src/node/connectors/salesforce.ts +93 -0
- package/src/node/connectors/snowflake.ts +73 -0
- package/src/node/db/postgres.ts +173 -0
- package/src/node/db/sync.ts +103 -0
- package/src/node/dedupe-file.ts +156 -0
- package/src/node/index.ts +89 -0
- package/src/node/mcp/server.ts +940 -0
- package/src/node/tui/app.ts +756 -0
- package/src/node/tui/index.ts +6 -0
- package/src/node/tui/widgets.ts +128 -0
- package/tests/parity/scorer-ground-truth.test.ts +118 -0
- package/tests/smoke.test.ts +46 -0
- package/tests/unit/a2a-server.test.ts +175 -0
- package/tests/unit/ann-blocker.test.ts +117 -0
- package/tests/unit/api-server.test.ts +239 -0
- package/tests/unit/api.test.ts +77 -0
- package/tests/unit/autoconfig.test.ts +103 -0
- package/tests/unit/autofix.test.ts +71 -0
- package/tests/unit/blocker.test.ts +164 -0
- package/tests/unit/buildBlocksAsync.test.ts +63 -0
- package/tests/unit/cluster.test.ts +213 -0
- package/tests/unit/compare-clusters.test.ts +42 -0
- package/tests/unit/config-loader.test.ts +301 -0
- package/tests/unit/connectors-base.test.ts +48 -0
- package/tests/unit/cross-encoder-model.test.ts +198 -0
- package/tests/unit/cross-encoder.test.ts +173 -0
- package/tests/unit/db-connectors.test.ts +37 -0
- package/tests/unit/domain.test.ts +80 -0
- package/tests/unit/embedder.test.ts +151 -0
- package/tests/unit/evaluate.test.ts +85 -0
- package/tests/unit/explain.test.ts +73 -0
- package/tests/unit/golden.test.ts +97 -0
- package/tests/unit/graph-er.test.ts +173 -0
- package/tests/unit/hnsw-ann.test.ts +283 -0
- package/tests/unit/hubspot-connector.test.ts +118 -0
- package/tests/unit/ingest.test.ts +97 -0
- package/tests/unit/learned-blocking.test.ts +134 -0
- package/tests/unit/lineage.test.ts +135 -0
- package/tests/unit/match-one.test.ts +129 -0
- package/tests/unit/matchkey.test.ts +97 -0
- package/tests/unit/mcp-server.test.ts +183 -0
- package/tests/unit/memory.test.ts +119 -0
- package/tests/unit/pipeline.test.ts +118 -0
- package/tests/unit/pprl-protocol.test.ts +381 -0
- package/tests/unit/probabilistic.test.ts +494 -0
- package/tests/unit/profiler.test.ts +68 -0
- package/tests/unit/review-queue.test.ts +68 -0
- package/tests/unit/salesforce-connector.test.ts +148 -0
- package/tests/unit/scorer.test.ts +301 -0
- package/tests/unit/sensitivity.test.ts +154 -0
- package/tests/unit/standardize.test.ts +84 -0
- package/tests/unit/streaming.test.ts +82 -0
- package/tests/unit/transforms.test.ts +208 -0
- package/tests/unit/tui-widgets.test.ts +42 -0
- package/tests/unit/tui.test.ts +24 -0
- package/tests/unit/validate.test.ts +145 -0
- package/tests/unit/workers-parallel.test.ts +99 -0
- package/tests/unit/workers.test.ts +74 -0
- package/tsconfig.json +25 -0
- package/tsup.config.ts +37 -0
- package/vitest.config.ts +11 -0
|
@@ -0,0 +1,243 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* validate.ts — Column validation rules with quarantine/flag actions.
|
|
3
|
+
* Edge-safe: no Node.js imports, pure TypeScript only.
|
|
4
|
+
*
|
|
5
|
+
* Ports goldenmatch/core/validate.py.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
import type { Row } from "./types.js";
|
|
9
|
+
|
|
10
|
+
// ---------------------------------------------------------------------------
|
|
11
|
+
// Types
|
|
12
|
+
// ---------------------------------------------------------------------------
|
|
13
|
+
|
|
14
|
+
/**
 * What happens to a row when one of its values violates a rule:
 * - "null": the failing value is replaced with null and the row stays valid
 * - "quarantine": the row is moved to the quarantine bucket
 * - "flag": an entry is added to the row's __flags__ and the row stays valid
 */
export type ValidationAction =
  | "null"
  | "quarantine"
  | "flag";
|
|
15
|
+
|
|
16
|
+
/**
 * The kinds of check a rule can perform. Each kind reads its own entry from
 * the rule's `params`:
 * - "regex": `pattern` (string source of the regular expression)
 * - "min_length" / "max_length": `value` (number)
 * - "not_null": no params
 * - "in_set": `values` (array of allowed values)
 * - "format": `name` (key of a built-in format matcher)
 */
export type ValidationRuleType = "regex" | "min_length" | "max_length" | "not_null" | "in_set" | "format";
|
|
23
|
+
|
|
24
|
+
/** A single validation rule bound to one column. */
export interface ValidationRule {
  /** Name of the row field the rule inspects. */
  readonly column: string;

  /** Which kind of check to run against the field's value. */
  readonly ruleType: ValidationRuleType;

  /** Rule-specific settings; which keys are read depends on `ruleType`. */
  readonly params: { readonly [param: string]: unknown };

  /** What to do with the row when the check fails. */
  readonly action: ValidationAction;
}
|
|
30
|
+
|
|
31
|
+
/** Summary counters produced by a validation run. */
export interface ValidationReport {
  /** Number of input rows that were examined. */
  readonly totalRows: number;

  /** Number of rows that ended up in the quarantine bucket. */
  readonly quarantined: number;

  /** Number of valid rows that received at least one new flag. */
  readonly flagged: number;

  /** Violation count per rule, keyed by `column:ruleType`. */
  readonly ruleViolations: { readonly [ruleKey: string]: number };
}
|
|
37
|
+
|
|
38
|
+
/** Outcome of validating a batch of rows. */
export interface ValidationResult {
  /** Rows that were not quarantined (possibly with nulled values or flags). */
  readonly valid: Row[];

  /** Rows that failed a rule whose action is "quarantine". */
  readonly quarantine: Row[];

  /** Aggregate counters for the run. */
  readonly report: ValidationReport;
}
|
|
43
|
+
|
|
44
|
+
// ---------------------------------------------------------------------------
|
|
45
|
+
// Built-in format matchers
|
|
46
|
+
// ---------------------------------------------------------------------------
|
|
47
|
+
|
|
48
|
+
/**
 * Built-in matchers for the "format" rule, keyed by format name.
 *
 * The table lives on a null-prototype object so that indexing it with an
 * arbitrary name can only yield one of the entries below or undefined. On a
 * plain object literal, names such as "constructor" or "toString" resolve to
 * members inherited from Object.prototype; those are not RegExp instances and
 * calling `.test()` on them throws.
 */
const FORMAT_MATCHERS: Readonly<Record<string, RegExp>> = Object.assign(
  Object.create(null) as Record<string, RegExp>,
  {
    // Exactly one "@", at least one "." after it, no whitespace anywhere.
    email: /^[^@\s]+@[^@\s]+\.[^@\s]+$/,
    // Optional leading "+", then 7 or more digits, spaces, "(", ")", "." or "-".
    phone: /^\+?[\d\s().-]{7,}$/,
    // Five digits with an optional "-NNNN" suffix.
    zip: /^\d{5}(-\d{4})?$/,
    // YYYY-MM-DD shape only; whether the date exists on the calendar is not checked.
    date: /^\d{4}-\d{2}-\d{2}$/,
  },
);
|
|
54
|
+
|
|
55
|
+
// ---------------------------------------------------------------------------
|
|
56
|
+
// Rule checker
|
|
57
|
+
// ---------------------------------------------------------------------------
|
|
58
|
+
|
|
59
|
+
/**
 * Convert a scalar cell value to the string form used by the text-based rules.
 *
 * Strings are returned unchanged; numbers, booleans and bigints are converted
 * with `String()`. bigint is included so that integer values delivered as
 * BigInt are validated by their digits instead of being treated as
 * unconvertible.
 *
 * @returns the string form, or null for null/undefined and for any value that
 *          has no scalar text form here (objects, arrays, functions, symbols).
 */
function valueToStr(v: unknown): string | null {
  switch (typeof v) {
    case "string":
      return v;
    case "number":
    case "boolean":
    case "bigint":
      return String(v);
    default:
      // Covers undefined, null (typeof "object"), objects, functions, symbols.
      return null;
  }
}
|
|
65
|
+
|
|
66
|
+
/**
 * Compile a rule into a checker function. Expensive work (regex compilation,
 * building the allowed-value set) happens here, not on every row.
 *
 * Contract of the returned checker: true means the value passes the rule.
 * For every rule type except "not_null", null and undefined always pass.
 *
 * Misconfigured rules degrade as follows:
 * - "regex" with a missing/non-string or invalid pattern: every present value
 *   fails (an invalid pattern is also logged once, at compile time).
 * - "min_length" / "max_length" without a numeric `value`: no bound (0 / Infinity).
 * - "in_set" without an array `values`, "format" without a known `name`, and
 *   unrecognised rule types: every value passes.
 *
 * @param rule the rule to compile
 * @returns a predicate over a single cell value
 */
function compileRule(rule: ValidationRule): (value: unknown) => boolean {
  const isMissing = (v: unknown): boolean => v === null || v === undefined;
  const alwaysPass = (): boolean => true;

  switch (rule.ruleType) {
    case "not_null":
      // The empty string counts as missing for this rule.
      return (v) => !isMissing(v) && v !== "";

    case "regex": {
      const pat = rule.params["pattern"];
      // Without a usable pattern, only missing values pass.
      if (typeof pat !== "string") return isMissing;

      let re: RegExp;
      try {
        re = new RegExp(pat);
      } catch (err) {
        // eslint-disable-next-line no-console
        console.warn(
          `Invalid regex pattern for rule on column '${rule.column}': ${pat}. ` +
            `Error: ${err instanceof Error ? err.message : String(err)}. ` +
            `Rule will match no rows.`,
        );
        return isMissing;
      }

      return (v) => {
        if (isMissing(v)) return true;
        const str = valueToStr(v);
        // Values with no string form cannot match a pattern.
        return str !== null && re.test(str);
      };
    }

    case "min_length": {
      const raw = rule.params["value"];
      const min = typeof raw === "number" ? raw : 0;
      return (v) => {
        if (isMissing(v)) return true;
        // Values with no string form are measured as the empty string.
        return (valueToStr(v) ?? "").length >= min;
      };
    }

    case "max_length": {
      const raw = rule.params["value"];
      const max = typeof raw === "number" ? raw : Infinity;
      return (v) => {
        if (isMissing(v)) return true;
        return (valueToStr(v) ?? "").length <= max;
      };
    }

    case "in_set": {
      const allowed = rule.params["values"];
      if (!Array.isArray(allowed)) return alwaysPass;
      // Set.has uses the same SameValueZero comparison as Array.includes, but
      // without a linear scan per row. The set is a snapshot taken at compile time.
      const allowedSet = new Set<unknown>(allowed);
      return (v) => isMissing(v) || allowedSet.has(v);
    }

    case "format": {
      const name = rule.params["name"];
      if (typeof name !== "string") return alwaysPass;
      // Own-property check: names like "constructor" or "toString" must not
      // resolve to members inherited from Object.prototype, which are not
      // RegExp instances and would make `.test()` throw.
      if (!Object.prototype.hasOwnProperty.call(FORMAT_MATCHERS, name)) {
        return alwaysPass;
      }
      const matcher = FORMAT_MATCHERS[name];
      if (matcher === undefined) return alwaysPass;
      return (v) => {
        if (isMissing(v)) return true;
        const str = valueToStr(v);
        return str !== null && matcher.test(str);
      };
    }

    default:
      // Unrecognised rule types never reject anything.
      return alwaysPass;
  }
}
|
|
150
|
+
|
|
151
|
+
/**
 * Evaluate one value against one rule.
 *
 * The rule is compiled on every call, so for many rows prefer validateRows,
 * which compiles each rule once.
 *
 * @param value the cell value to check
 * @param rule the rule to apply
 * @returns true when the value satisfies the rule, false when it violates it
 */
export function checkRule(value: unknown, rule: ValidationRule): boolean {
  const passes = compileRule(rule);
  return passes(value);
}
|
|
155
|
+
|
|
156
|
+
/** Build the `column:ruleType` identifier under which a rule's violations are tallied. */
function ruleKey(rule: ValidationRule): string {
  const { column, ruleType } = rule;
  return `${column}:${ruleType}`;
}
|
|
159
|
+
|
|
160
|
+
// ---------------------------------------------------------------------------
|
|
161
|
+
// validateRows
|
|
162
|
+
// ---------------------------------------------------------------------------
|
|
163
|
+
|
|
164
|
+
/**
 * Run every rule against every row and sort the rows into two buckets.
 *
 * Rules are applied in the order given. A failing rule is counted under its
 * `column:ruleType` key and then handled according to its action:
 * - "null": the failing value is replaced with null, the row stays valid
 * - "quarantine": the row goes to the quarantine bucket and no further rules
 *   are evaluated for it
 * - "flag": the rule key is appended to the row's __flags__, the row stays valid
 *
 * Input rows are never modified; each output row is a shallow copy. A
 * quarantined row keeps any values nulled by earlier rules, but flags
 * collected before the quarantining rule are not written to it.
 *
 * @param rows the rows to validate
 * @param rules the rules to apply, in evaluation order
 * @returns the valid rows, the quarantined rows and a summary report
 */
export function validateRows(
  rows: readonly Row[],
  rules: readonly ValidationRule[],
): ValidationResult {
  // Compile each rule exactly once so regex errors are logged a single time.
  const checkers = rules.map((rule) => ({ rule, passes: compileRule(rule) }));

  const accepted: Row[] = [];
  const rejected: Row[] = [];
  const tally = new Map<string, number>();
  let flaggedRows = 0;

  for (const row of rows) {
    const working: Record<string, unknown> = { ...row };

    // Start from the string entries of any __flags__ array already on the row.
    const rowFlags: string[] = [];
    const existing = working["__flags__"];
    if (Array.isArray(existing)) {
      for (const entry of existing) {
        if (typeof entry === "string") rowFlags.push(entry);
      }
    }

    let quarantined = false;
    let newlyFlagged = false;

    for (const { rule, passes } of checkers) {
      if (passes(working[rule.column])) continue;

      const key = ruleKey(rule);
      tally.set(key, (tally.get(key) ?? 0) + 1);

      if (rule.action === "quarantine") {
        // First quarantining failure ends evaluation for this row.
        quarantined = true;
        break;
      }
      if (rule.action === "null") {
        working[rule.column] = null;
      } else if (rule.action === "flag") {
        rowFlags.push(key);
        newlyFlagged = true;
      }
    }

    if (quarantined) {
      rejected.push(working as Row);
      continue;
    }

    if (newlyFlagged) {
      working["__flags__"] = rowFlags;
      flaggedRows += 1;
    }
    accepted.push(working as Row);
  }

  const ruleViolations: Record<string, number> = {};
  tally.forEach((count, key) => {
    ruleViolations[key] = count;
  });

  return {
    valid: accepted,
    quarantine: rejected,
    report: {
      totalRows: rows.length,
      quarantined: rejected.length,
      flagged: flaggedRows,
      ruleViolations,
    },
  };
}
|