goldenmatch 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +140 -0
- package/dist/cli.cjs +6079 -0
- package/dist/cli.cjs.map +1 -0
- package/dist/cli.d.cts +1 -0
- package/dist/cli.d.ts +1 -0
- package/dist/cli.js +6076 -0
- package/dist/cli.js.map +1 -0
- package/dist/core/index.cjs +8449 -0
- package/dist/core/index.cjs.map +1 -0
- package/dist/core/index.d.cts +1972 -0
- package/dist/core/index.d.ts +1972 -0
- package/dist/core/index.js +8318 -0
- package/dist/core/index.js.map +1 -0
- package/dist/index.cjs +8449 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +2 -0
- package/dist/index.d.ts +2 -0
- package/dist/index.js +8318 -0
- package/dist/index.js.map +1 -0
- package/dist/node/backends/score-worker.cjs +934 -0
- package/dist/node/backends/score-worker.cjs.map +1 -0
- package/dist/node/backends/score-worker.d.cts +14 -0
- package/dist/node/backends/score-worker.d.ts +14 -0
- package/dist/node/backends/score-worker.js +932 -0
- package/dist/node/backends/score-worker.js.map +1 -0
- package/dist/node/index.cjs +11430 -0
- package/dist/node/index.cjs.map +1 -0
- package/dist/node/index.d.cts +554 -0
- package/dist/node/index.d.ts +554 -0
- package/dist/node/index.js +11277 -0
- package/dist/node/index.js.map +1 -0
- package/dist/types-DhUdX5Rc.d.cts +304 -0
- package/dist/types-DhUdX5Rc.d.ts +304 -0
- package/examples/01-basic-dedupe.ts +60 -0
- package/examples/02-match-two-datasets.ts +48 -0
- package/examples/03-csv-file-pipeline.ts +62 -0
- package/examples/04-string-scoring.ts +63 -0
- package/examples/05-custom-config.ts +94 -0
- package/examples/06-probabilistic-fs.ts +72 -0
- package/examples/07-pprl-privacy.ts +76 -0
- package/examples/08-streaming.ts +79 -0
- package/examples/09-llm-scorer.ts +79 -0
- package/examples/10-explain.ts +60 -0
- package/examples/11-evaluate.ts +61 -0
- package/examples/README.md +53 -0
- package/package.json +66 -0
- package/src/cli.ts +372 -0
- package/src/core/ann-blocker.ts +593 -0
- package/src/core/api.ts +220 -0
- package/src/core/autoconfig.ts +363 -0
- package/src/core/autofix.ts +102 -0
- package/src/core/blocker.ts +655 -0
- package/src/core/cluster.ts +699 -0
- package/src/core/compare-clusters.ts +176 -0
- package/src/core/config/loader.ts +869 -0
- package/src/core/cross-encoder.ts +614 -0
- package/src/core/data.ts +430 -0
- package/src/core/domain.ts +277 -0
- package/src/core/embedder.ts +562 -0
- package/src/core/evaluate.ts +156 -0
- package/src/core/explain.ts +352 -0
- package/src/core/golden.ts +524 -0
- package/src/core/graph-er.ts +371 -0
- package/src/core/index.ts +314 -0
- package/src/core/ingest.ts +112 -0
- package/src/core/learned-blocking.ts +305 -0
- package/src/core/lineage.ts +221 -0
- package/src/core/llm/budget.ts +258 -0
- package/src/core/llm/cluster.ts +542 -0
- package/src/core/llm/scorer.ts +396 -0
- package/src/core/match-one.ts +95 -0
- package/src/core/matchkey.ts +97 -0
- package/src/core/memory/corrections.ts +179 -0
- package/src/core/memory/learner.ts +218 -0
- package/src/core/memory/store.ts +114 -0
- package/src/core/pipeline.ts +366 -0
- package/src/core/pprl/protocol.ts +216 -0
- package/src/core/probabilistic.ts +511 -0
- package/src/core/profiler.ts +212 -0
- package/src/core/quality.ts +197 -0
- package/src/core/review-queue.ts +177 -0
- package/src/core/scorer.ts +855 -0
- package/src/core/sensitivity.ts +196 -0
- package/src/core/standardize.ts +279 -0
- package/src/core/streaming.ts +128 -0
- package/src/core/transforms.ts +599 -0
- package/src/core/types.ts +570 -0
- package/src/core/validate.ts +243 -0
- package/src/index.ts +8 -0
- package/src/node/a2a/server.ts +470 -0
- package/src/node/api/server.ts +412 -0
- package/src/node/backends/duckdb.ts +130 -0
- package/src/node/backends/score-worker.ts +41 -0
- package/src/node/backends/workers.ts +212 -0
- package/src/node/config-file.ts +66 -0
- package/src/node/connectors/base.ts +57 -0
- package/src/node/connectors/bigquery.ts +61 -0
- package/src/node/connectors/databricks.ts +69 -0
- package/src/node/connectors/file.ts +350 -0
- package/src/node/connectors/hubspot.ts +62 -0
- package/src/node/connectors/index.ts +43 -0
- package/src/node/connectors/salesforce.ts +93 -0
- package/src/node/connectors/snowflake.ts +73 -0
- package/src/node/db/postgres.ts +173 -0
- package/src/node/db/sync.ts +103 -0
- package/src/node/dedupe-file.ts +156 -0
- package/src/node/index.ts +89 -0
- package/src/node/mcp/server.ts +940 -0
- package/src/node/tui/app.ts +756 -0
- package/src/node/tui/index.ts +6 -0
- package/src/node/tui/widgets.ts +128 -0
- package/tests/parity/scorer-ground-truth.test.ts +118 -0
- package/tests/smoke.test.ts +46 -0
- package/tests/unit/a2a-server.test.ts +175 -0
- package/tests/unit/ann-blocker.test.ts +117 -0
- package/tests/unit/api-server.test.ts +239 -0
- package/tests/unit/api.test.ts +77 -0
- package/tests/unit/autoconfig.test.ts +103 -0
- package/tests/unit/autofix.test.ts +71 -0
- package/tests/unit/blocker.test.ts +164 -0
- package/tests/unit/buildBlocksAsync.test.ts +63 -0
- package/tests/unit/cluster.test.ts +213 -0
- package/tests/unit/compare-clusters.test.ts +42 -0
- package/tests/unit/config-loader.test.ts +301 -0
- package/tests/unit/connectors-base.test.ts +48 -0
- package/tests/unit/cross-encoder-model.test.ts +198 -0
- package/tests/unit/cross-encoder.test.ts +173 -0
- package/tests/unit/db-connectors.test.ts +37 -0
- package/tests/unit/domain.test.ts +80 -0
- package/tests/unit/embedder.test.ts +151 -0
- package/tests/unit/evaluate.test.ts +85 -0
- package/tests/unit/explain.test.ts +73 -0
- package/tests/unit/golden.test.ts +97 -0
- package/tests/unit/graph-er.test.ts +173 -0
- package/tests/unit/hnsw-ann.test.ts +283 -0
- package/tests/unit/hubspot-connector.test.ts +118 -0
- package/tests/unit/ingest.test.ts +97 -0
- package/tests/unit/learned-blocking.test.ts +134 -0
- package/tests/unit/lineage.test.ts +135 -0
- package/tests/unit/match-one.test.ts +129 -0
- package/tests/unit/matchkey.test.ts +97 -0
- package/tests/unit/mcp-server.test.ts +183 -0
- package/tests/unit/memory.test.ts +119 -0
- package/tests/unit/pipeline.test.ts +118 -0
- package/tests/unit/pprl-protocol.test.ts +381 -0
- package/tests/unit/probabilistic.test.ts +494 -0
- package/tests/unit/profiler.test.ts +68 -0
- package/tests/unit/review-queue.test.ts +68 -0
- package/tests/unit/salesforce-connector.test.ts +148 -0
- package/tests/unit/scorer.test.ts +301 -0
- package/tests/unit/sensitivity.test.ts +154 -0
- package/tests/unit/standardize.test.ts +84 -0
- package/tests/unit/streaming.test.ts +82 -0
- package/tests/unit/transforms.test.ts +208 -0
- package/tests/unit/tui-widgets.test.ts +42 -0
- package/tests/unit/tui.test.ts +24 -0
- package/tests/unit/validate.test.ts +145 -0
- package/tests/unit/workers-parallel.test.ts +99 -0
- package/tests/unit/workers.test.ts +74 -0
- package/tsconfig.json +25 -0
- package/tsup.config.ts +37 -0
- package/vitest.config.ts +11 -0
|
@@ -0,0 +1,212 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* profiler.ts — Lightweight per-column data profiler.
|
|
3
|
+
* Edge-safe: no `node:` imports.
|
|
4
|
+
*
|
|
5
|
+
* Ports parts of goldenmatch/core/profiler.py that autoconfig relies on.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
import type { Row } from "./types.js";
|
|
9
|
+
|
|
10
|
+
// ---------------------------------------------------------------------------
|
|
11
|
+
// Types
|
|
12
|
+
// ---------------------------------------------------------------------------
|
|
13
|
+
|
|
14
|
+
/**
 * Semantic category assigned to a column by the profiler's type heuristics.
 * "text" is the fallback when no other heuristic matches.
 */
export type ColumnType =
  | "email"
  | "phone"
  | "zip"
  | "date"
  | "name"
  | "geo"
  | "id"
  | "numeric"
  | "text";

/** Summary statistics for a single column. */
export interface ColumnProfile {
  /** Column key as it appears on the rows. */
  readonly name: string;
  /** nullCount / totalCount, or 0 when the column has no values. */
  readonly nullRate: number;
  /** Cells that are null, undefined, or a whitespace-only string. */
  readonly nullCount: number;
  /** Number of raw values examined. */
  readonly totalCount: number;
  /** Distinct non-null values, compared after string conversion and trimming. */
  readonly distinctCount: number;
  /** distinctCount / totalCount (nulls count toward the denominator); 0 when empty. */
  readonly cardinalityRatio: number;
  /** Heuristic type, computed from at most the first 500 non-null values. */
  readonly inferredType: ColumnType;
  /** Mean string length of the non-null values; 0 when there are none. */
  readonly avgLength: number;
  /** Length of the longest non-null value. */
  readonly maxLength: number;
  /** Up to five distinct values, in first-seen order. */
  readonly sampleValues: readonly string[];
}

/** Profile of a whole row set. */
export interface DatasetProfile {
  /** Number of rows profiled. */
  readonly rowCount: number;
  /** One profile per column, in order of first appearance across the rows. */
  readonly columns: readonly ColumnProfile[];
  /** The same profiles, keyed by column name. */
  readonly byName: Readonly<Record<string, ColumnProfile>>;
}
|
|
43
|
+
|
|
44
|
+
// ---------------------------------------------------------------------------
|
|
45
|
+
// Regex heuristics
|
|
46
|
+
// ---------------------------------------------------------------------------
|
|
47
|
+
|
|
48
|
+
/** Loose e-mail shape: local@domain.tld with no whitespace and a single "@". */
const EMAIL_VALUE_RE = /^[^\s@]+@[^\s@]+\.[^\s@]+$/;
/** Separators removed from a value before it is tested as a phone number. */
const PHONE_STRIP_RE = /[()\-+.\s]/g;
/** What must remain once separators are stripped for a value to look like a phone number. */
const PHONE_DIGITS_RE = /^\d+$/;
/** Recognised date layouts: D-M-Y / M-D-Y, Y-M-D (slash or dash separated), and "D Month Y". */
const DATE_VALUE_RES: readonly RegExp[] = [
  /^\d{1,2}[/\-]\d{1,2}[/\-]\d{2,4}$/,
  /^\d{4}[/\-]\d{1,2}[/\-]\d{1,2}$/,
  /^\d{1,2}\s[A-Za-z]+\s\d{2,4}$/,
];
/** Zip: 5 digits, optionally followed by 4 more (with or without a dash). */
const ZIP_VALUE_RE = /^\d{5}(-?\d{4})?$/;
/** Alphabetic value of 2-30 characters; spaces, hyphens and apostrophes allowed inside. */
const NAME_VALUE_RE = /^[A-Za-z][A-Za-z \-']{0,28}[A-Za-z]$|^[A-Za-z]{2,3}$/;
/** Plain integer or decimal, optionally negative. */
const NUMERIC_VALUE_RE = /^-?\d+(\.\d+)?$/;

// ---------------------------------------------------------------------------
// Per-column profiling
// ---------------------------------------------------------------------------

/**
 * Normalise a raw cell to a trimmed string, or `null` when it is missing.
 *
 * @param value - Raw cell value of any type.
 * @returns `null` for null, undefined and whitespace-only strings; the trimmed
 *   text for other strings; `String(value)` for everything else.
 */
function toStringOrNull(value: unknown): string | null {
  if (value === null || value === undefined) return null;
  if (typeof value === "string") {
    const t = value.trim();
    return t.length === 0 ? null : t;
  }
  // Non-strings go through String(); plain objects therefore become "[object Object]".
  return String(value);
}

/**
 * Infer a column's semantic type from its non-null values and its name.
 *
 * Heuristics are tried in this order and the first hit wins:
 * email, phone, zip, date (each needs more than 60% of values to match), then
 * geo and id (decided by column name alone), then name (>60%), numeric (>80%),
 * and finally "text".
 *
 * @param values - Non-null, already-trimmed string values of the column.
 * @param columnName - Column name; matched case-insensitively for geo/id.
 * @returns The inferred type; "text" for an empty `values` array.
 */
function guessType(values: readonly string[], columnName: string): ColumnType {
  const n = values.length;
  if (n === 0) return "text";
  const lname = columnName.toLowerCase();

  // Fraction of values accepted by a predicate.
  const share = (accepts: (v: string) => boolean): number => {
    let hits = 0;
    for (const v of values) {
      if (accepts(v)) hits++;
    }
    return hits / n;
  };

  const looksLikeDate = (v: string): boolean =>
    DATE_VALUE_RES.some((re) => re.test(v));

  const looksLikePhone = (v: string): boolean => {
    // PHONE_STRIP_RE removes "-", so a dashed date such as "2020-01-15"
    // collapses to eight digits and would otherwise pass as a phone number.
    // Because phone is tested before date, that made dash-delimited date
    // columns come out as "phone"; date-shaped values are excluded here.
    if (looksLikeDate(v)) return false;
    const digits = v.replace(PHONE_STRIP_RE, "");
    return (
      digits.length >= 7 && digits.length <= 15 && PHONE_DIGITS_RE.test(digits)
    );
  };
  // NOTE(review): a 9-digit ZIP+4 ("12345-6789" or "123456789") still passes
  // the phone test and is reported as "phone". Left unchanged because such
  // values cannot be told apart from 9-digit phone numbers by shape alone.

  // Value-shape heuristics: more than 60% of values must match.
  if (share((v) => EMAIL_VALUE_RE.test(v)) > 0.6) return "email";
  if (share(looksLikePhone) > 0.6) return "phone";
  if (share((v) => ZIP_VALUE_RE.test(v)) > 0.6) return "zip";
  if (share(looksLikeDate) > 0.6) return "date";

  // Geographic columns by name
  if (/^(city|state|county|country|region|province)/i.test(lname)) return "geo";
  if (/city_desc|state_cd|country_code|state_code/i.test(lname)) return "geo";

  // Identifier columns by name
  if (/^id$|_id$|uuid|guid/i.test(lname)) return "id";

  // Name: >60% match the alphabetic-name pattern
  if (share((v) => NAME_VALUE_RE.test(v)) > 0.6) return "name";

  // Numeric: stricter threshold, >80%
  if (share((v) => NUMERIC_VALUE_RE.test(v)) > 0.8) return "numeric";

  return "text";
}
|
|
130
|
+
|
|
131
|
+
/**
 * Build the profile of one column from its raw cell values.
 *
 * Cells are normalised with `toStringOrNull`; anything it maps to `null`
 * counts as a null. Ratios use the total cell count as denominator, the
 * average length uses the non-null count, and both are 0 when their
 * denominator is 0.
 *
 * @param name - Column name; also passed to the type heuristics.
 * @param rawValues - One raw value per row for this column.
 * @returns The assembled column profile.
 */
function profileColumn(name: string, rawValues: readonly unknown[]): ColumnProfile {
  // Keep only the cells that carry a value, in row order.
  const present: string[] = [];
  for (const cell of rawValues) {
    const text = toStringOrNull(cell);
    if (text !== null) present.push(text);
  }

  const totalCount = rawValues.length;
  const nullCount = totalCount - present.length;
  const uniques = new Set(present);

  let lengthSum = 0;
  let longest = 0;
  for (const text of present) {
    lengthSum += text.length;
    longest = Math.max(longest, text.length);
  }

  // First five distinct values, in first-seen order.
  const sampleValues: string[] = [];
  for (const candidate of uniques) {
    if (sampleValues.length === 5) break;
    sampleValues.push(candidate);
  }

  return {
    name,
    nullRate: totalCount > 0 ? nullCount / totalCount : 0,
    nullCount,
    totalCount,
    distinctCount: uniques.size,
    cardinalityRatio: totalCount > 0 ? uniques.size / totalCount : 0,
    // Type guessing only looks at the first 500 non-null values, for speed.
    inferredType: guessType(present.slice(0, 500), name),
    avgLength: present.length > 0 ? lengthSum / present.length : 0,
    maxLength: longest,
    sampleValues,
  };
}
|
|
178
|
+
|
|
179
|
+
// ---------------------------------------------------------------------------
|
|
180
|
+
// Public API
|
|
181
|
+
// ---------------------------------------------------------------------------
|
|
182
|
+
|
|
183
|
+
/**
 * Profile every column found in `rows`.
 *
 * Column names are gathered from all rows (not only the first) in order of
 * first appearance; keys starting with "__" are skipped.
 *
 * @param rows - Rows to profile.
 * @returns Row count plus one profile per column, as a list and keyed by name.
 *   An empty input yields an empty profile.
 */
export function profileRows(rows: readonly Row[]): DatasetProfile {
  const rowCount = rows.length;
  if (rowCount === 0) return { rowCount: 0, columns: [], byName: {} };

  const columnNames = new Set<string>();
  rows.forEach((row) => {
    Object.keys(row).forEach((key) => {
      if (!key.startsWith("__")) columnNames.add(key);
    });
  });

  const columns: ColumnProfile[] = [];
  const byName: Record<string, ColumnProfile> = {};
  columnNames.forEach((columnName) => {
    const cells = rows.map((row) => row[columnName]);
    const profile = profileColumn(columnName, cells);
    columns.push(profile);
    byName[columnName] = profile;
  });

  return { rowCount, columns, byName };
}
|
|
@@ -0,0 +1,197 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* quality.ts — Lightweight quality scan stub.
|
|
3
|
+
* Edge-safe: no Node.js imports, pure TypeScript only.
|
|
4
|
+
*
|
|
5
|
+
* Ports a subset of goldenmatch/core/quality.py. The Python version
|
|
6
|
+
* integrates with GoldenCheck; this port only provides the interface and a
|
|
7
|
+
* handful of basic heuristics that are safe to run client-side.
|
|
8
|
+
*/
|
|
9
|
+
|
|
10
|
+
import type { Row, QualityConfig } from "./types.js";
|
|
11
|
+
|
|
12
|
+
// ---------------------------------------------------------------------------
|
|
13
|
+
// Types
|
|
14
|
+
// ---------------------------------------------------------------------------
|
|
15
|
+
|
|
16
|
+
/**
 * Severity attached to a finding. `scanQuality` uses "error" only for a null
 * rate above 90%; its other findings are "warn" or "info".
 */
export type QualitySeverity = "info" | "warn" | "error";

/** One issue detected in a single column. */
export interface QualityFinding {
  /** Name of the column the finding refers to. */
  readonly column: string;
  /** Human-readable description of the issue. */
  readonly issue: string;
  /** How serious the issue is. */
  readonly severity: QualitySeverity;
  /** Number of rows the finding applies to. */
  readonly affectedRows: number;
  /** A few raw example values from the column; may be empty. */
  readonly sampleValues: readonly unknown[];
}

/** Result of `runQualityCheck`. */
export interface QualityRunResult {
  /** The input rows, passed through unchanged. */
  readonly rows: readonly Row[];
  /** Findings produced by the scan. */
  readonly findings: readonly QualityFinding[];
}
|
|
30
|
+
|
|
31
|
+
// ---------------------------------------------------------------------------
|
|
32
|
+
// Pattern detectors
|
|
33
|
+
// ---------------------------------------------------------------------------
|
|
34
|
+
|
|
35
|
+
/** Loose e-mail shape: local@domain.tld with no whitespace and a single "@". */
const EMAIL_RE = /^[^@\s]+@[^@\s]+\.[^@\s]+$/;
/** Non-empty run of ASCII digits. */
const DIGITS_RE = /^\d+$/;
/** Date layouts tracked per column; the index identifies the layout. */
const DATE_PATTERNS: readonly RegExp[] = [
  /^\d{4}-\d{2}-\d{2}$/, // ISO
  /^\d{1,2}\/\d{1,2}\/\d{2,4}$/, // US
  /^\d{1,2}-\d{1,2}-\d{2,4}$/,
  /^\d{8}$/, // yyyymmdd
];
/** Maximum number of example values attached to a finding. */
const SAMPLE_LIMIT = 5;

/**
 * Collect the column names used by any row, in order of first appearance.
 * Keys starting with "__" are skipped.
 */
function collectColumns(rows: readonly Row[]): string[] {
  const cols = new Set<string>();
  for (const row of rows) {
    for (const key of Object.keys(row)) {
      if (!key.startsWith("__")) cols.add(key);
    }
  }
  return [...cols];
}

/**
 * Render a primitive cell as a string.
 *
 * @returns The string itself, `String(v)` for numbers and booleans, and
 *   `null` for null/undefined and every other type (objects, arrays, ...).
 */
function asStr(v: unknown): string | null {
  if (v === null || v === undefined) return null;
  if (typeof v === "string") return v;
  if (typeof v === "number" || typeof v === "boolean") return String(v);
  return null;
}

// ---------------------------------------------------------------------------
// scanQuality
// ---------------------------------------------------------------------------

/**
 * Run a few cheap heuristics across the dataset and report, per column:
 *
 * - high null rate (more than 50% null/undefined/empty-string; "error" above 90%),
 * - constant column (exactly one distinct value covering fewer than 0.1% of
 *   the non-null rows, i.e. more than 1000 non-null rows),
 * - malformed emails (values containing "@" that fail the e-mail pattern),
 * - inconsistent date formats (more than one date layout seen),
 * - string columns whose values are all digits.
 *
 * @param rows - Rows to scan.
 * @param _config - Accepted for interface compatibility; currently unused.
 * @returns Findings in column order; empty when `rows` is empty.
 */
export function scanQuality(
  rows: readonly Row[],
  _config?: QualityConfig,
): QualityFinding[] {
  const findings: QualityFinding[] = [];
  if (rows.length === 0) return findings;

  const total = rows.length;
  const columns = collectColumns(rows);

  for (const col of columns) {
    let nullCount = 0;
    let emailLike = 0;
    let malformedEmail = 0;
    let dateLike = 0;
    // Non-null cells asStr cannot render (objects, arrays, ...). They never
    // enter `distinct`, so distinct-based findings are unsound when present.
    let opaqueCount = 0;
    // Non-null cells whose runtime type is not string.
    let nonStringCount = 0;
    const dateFormatsSeen = new Set<number>();
    const nonNullSamples: unknown[] = [];
    const malformedEmailSamples: unknown[] = [];
    const distinct = new Set<string>();

    for (const row of rows) {
      const raw = row[col];
      if (raw === null || raw === undefined || raw === "") {
        nullCount++;
        continue;
      }
      if (nonNullSamples.length < SAMPLE_LIMIT) nonNullSamples.push(raw);
      if (typeof raw !== "string") nonStringCount++;

      const s = asStr(raw);
      if (s === null) {
        opaqueCount++;
        continue;
      }
      distinct.add(s);

      // Email heuristics: anything containing "@" is treated as an attempt.
      if (s.includes("@")) {
        emailLike++;
        if (!EMAIL_RE.test(s)) {
          malformedEmail++;
          if (malformedEmailSamples.length < SAMPLE_LIMIT) {
            malformedEmailSamples.push(raw);
          }
        }
      }

      // Date format tracking: the first matching layout wins.
      const formatIndex = DATE_PATTERNS.findIndex((re) => re.test(s));
      if (formatIndex !== -1) {
        dateFormatsSeen.add(formatIndex);
        dateLike++;
      }
    }

    const nullRate = nullCount / total;
    if (nullRate > 0.5) {
      findings.push({
        column: col,
        issue: `High null rate: ${(nullRate * 100).toFixed(1)}%`,
        severity: nullRate > 0.9 ? "error" : "warn",
        affectedRows: nullCount,
        sampleValues: [],
      });
    }

    const nonNull = total - nullCount;
    if (nonNull > 0) {
      const cardinalityRatio = distinct.size / nonNull;
      // Require exactly one tracked value and no untracked ones: a column of
      // objects has distinct.size === 0 and used to be reported as constant.
      if (cardinalityRatio < 0.001 && distinct.size === 1 && opaqueCount === 0) {
        findings.push({
          column: col,
          issue: "Constant column (single distinct non-null value)",
          severity: "info",
          affectedRows: nonNull,
          sampleValues: nonNullSamples,
        });
      }
    }

    if (malformedEmail > 0) {
      findings.push({
        column: col,
        issue: `Malformed email values (${malformedEmail} of ${emailLike})`,
        severity: "warn",
        affectedRows: malformedEmail,
        // Show the offending values rather than arbitrary column values.
        sampleValues: malformedEmailSamples,
      });
    }

    if (dateFormatsSeen.size > 1) {
      findings.push({
        column: col,
        issue: `Inconsistent date formats (${dateFormatsSeen.size} distinct patterns)`,
        severity: "warn",
        affectedRows: dateLike,
        sampleValues: nonNullSamples,
      });
    }

    // Numeric-looking string column. Only meaningful when every non-null cell
    // really is a string; values that are already numbers need no typing advice.
    if (distinct.size > 1 && nonStringCount === 0) {
      let digitCount = 0;
      for (const v of distinct) {
        if (DIGITS_RE.test(v)) digitCount++;
      }
      if (digitCount === distinct.size) {
        // Entire column is numeric strings — informational.
        findings.push({
          column: col,
          issue: "Column contains only numeric strings (consider typing)",
          severity: "info",
          affectedRows: nonNull,
          sampleValues: nonNullSamples,
        });
      }
    }
  }

  return findings;
}
|
|
180
|
+
|
|
181
|
+
// ---------------------------------------------------------------------------
|
|
182
|
+
// runQualityCheck
|
|
183
|
+
// ---------------------------------------------------------------------------
|
|
184
|
+
|
|
185
|
+
/**
 * Scan `rows` for quality findings and hand the rows back untouched.
 *
 * Mirrors `_scan_only` / `run_quality_check` from the Python module: no
 * GoldenCheck and no row rewrites, only reportable findings.
 *
 * @param rows - Rows to scan.
 * @param config - Forwarded to `scanQuality`.
 * @returns The same `rows` reference together with the findings.
 */
export function runQualityCheck(
  rows: readonly Row[],
  config?: QualityConfig,
): QualityRunResult {
  return { rows, findings: scanQuality(rows, config) };
}
|
|
@@ -0,0 +1,177 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* review-queue.ts — Human-in-the-loop pair gating.
|
|
3
|
+
* Edge-safe: no Node.js imports, pure TypeScript only.
|
|
4
|
+
*
|
|
5
|
+
* Ports goldenmatch/core/review_queue.py. Default gates: >=0.95 auto-approve,
|
|
6
|
+
* <0.75 auto-reject, everything in between needs review.
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
import type { ScoredPair } from "./types.js";
|
|
10
|
+
|
|
11
|
+
// ---------------------------------------------------------------------------
|
|
12
|
+
// Types
|
|
13
|
+
// ---------------------------------------------------------------------------
|
|
14
|
+
|
|
15
|
+
/** Lifecycle state of a review item; new items start as "pending". */
export type ReviewStatus = "pending" | "approved" | "rejected";

/** A candidate pair awaiting, or having received, a human decision. */
export interface ReviewItem {
  /** Canonical pair id in the form "minId:maxId". */
  readonly pairId: string;
  /** The smaller of the two record ids. */
  readonly idA: number;
  /** The larger of the two record ids. */
  readonly idB: number;
  /** Match score carried over from the scored pair. */
  readonly score: number;
  /** Current review state. */
  readonly status: ReviewStatus;
  /** `Date.now()` timestamp (epoch milliseconds) taken when the item was created. */
  readonly createdAt: number;
}

/** The three buckets produced by `gatePairs`. */
export interface GatedResult {
  /** Pairs with score >= approveAbove. */
  readonly autoApproved: readonly ScoredPair[];
  /** Pairs falling between the two thresholds, wrapped as pending items. */
  readonly needsReview: readonly ReviewItem[];
  /** Pairs with score < rejectBelow. */
  readonly rejected: readonly ScoredPair[];
}

/** Threshold overrides for `gatePairs`. */
export interface GateOptions {
  /** Scores at or above this value are auto-approved. Default 0.95. */
  readonly approveAbove?: number;
  /** Scores below this value are auto-rejected. Default 0.75. */
  readonly rejectBelow?: number;
}
|
|
36
|
+
|
|
37
|
+
// ---------------------------------------------------------------------------
|
|
38
|
+
// Helpers
|
|
39
|
+
// ---------------------------------------------------------------------------
|
|
40
|
+
|
|
41
|
+
/**
 * Order two record ids so the smaller comes first.
 *
 * @returns `[a, b]` when `a < b`, otherwise `[b, a]`.
 */
function canonicalIds(a: number, b: number): [number, number] {
  if (a < b) {
    return [a, b];
  }
  return [b, a];
}
|
|
44
|
+
|
|
45
|
+
/**
 * Canonical string id for an unordered pair of record ids: "minId:maxId".
 */
function pairIdFor(a: number, b: number): string {
  const ordered = canonicalIds(a, b);
  return ordered.join(":");
}
|
|
49
|
+
|
|
50
|
+
/**
 * Current time in epoch milliseconds.
 * Uses Date.now, which is edge-safe (no node imports).
 */
function now(): number {
  const epochMs = Date.now();
  return epochMs;
}
|
|
54
|
+
|
|
55
|
+
// ---------------------------------------------------------------------------
|
|
56
|
+
// gatePairs
|
|
57
|
+
// ---------------------------------------------------------------------------
|
|
58
|
+
|
|
59
|
+
/**
 * Sort scored pairs into three buckets by score.
 *
 * A pair is auto-approved when its score is at or above `approveAbove`,
 * rejected when it is below `rejectBelow`, and otherwise wrapped as a pending
 * review item with canonically ordered ids. The approve test runs first.
 * Every review item created by one call shares the same `createdAt`.
 *
 * Defaults: approveAbove=0.95, rejectBelow=0.75.
 *
 * @param pairs - Scored pairs to classify.
 * @param options - Optional threshold overrides.
 * @returns The three buckets, each in input order.
 */
export function gatePairs(
  pairs: readonly ScoredPair[],
  options?: GateOptions,
): GatedResult {
  const approveAbove = options?.approveAbove ?? 0.95;
  const rejectBelow = options?.rejectBelow ?? 0.75;
  const createdAt = now();

  const autoApproved: ScoredPair[] = [];
  const needsReview: ReviewItem[] = [];
  const rejected: ScoredPair[] = [];

  pairs.forEach((pair) => {
    const score = pair.score;
    if (score >= approveAbove) {
      autoApproved.push(pair);
      return;
    }
    if (score < rejectBelow) {
      rejected.push(pair);
      return;
    }
    const [idA, idB] = canonicalIds(pair.idA, pair.idB);
    needsReview.push({
      pairId: `${idA}:${idB}`,
      idA,
      idB,
      score,
      status: "pending",
      createdAt,
    });
  });

  return { autoApproved, needsReview, rejected };
}
|
|
96
|
+
|
|
97
|
+
// ---------------------------------------------------------------------------
|
|
98
|
+
// ReviewQueue
|
|
99
|
+
// ---------------------------------------------------------------------------
|
|
100
|
+
|
|
101
|
+
/**
 * In-memory queue of borderline pairs awaiting a human decision.
 * Items are keyed by canonical pair id ("minId:maxId").
 */
export class ReviewQueue {
  private readonly items = new Map<string, ReviewItem>();

  /**
   * Queue a pair as pending. Does nothing when the canonical pair id is
   * already present, whatever that item's status.
   */
  add(pair: ScoredPair): void {
    const [idA, idB] = canonicalIds(pair.idA, pair.idB);
    const pairId = `${idA}:${idB}`;
    if (!this.items.has(pairId)) {
      this.items.set(pairId, {
        pairId,
        idA,
        idB,
        score: pair.score,
        status: "pending",
        createdAt: now(),
      });
    }
  }

  /** Look up an item by canonical pair id ("minId:maxId"). */
  get(pairId: string): ReviewItem | undefined {
    return this.items.get(pairId);
  }

  /** Mark a pair approved. No-op if unknown. */
  approve(pairId: string): void {
    this.transition(pairId, "approved");
  }

  /** Mark a pair rejected. No-op if unknown. */
  reject(pairId: string): void {
    this.transition(pairId, "rejected");
  }

  /** All pending items. */
  pending(): ReviewItem[] {
    return this.withStatus("pending");
  }

  /** All approved items. */
  approved(): ReviewItem[] {
    return this.withStatus("approved");
  }

  /** All rejected items. */
  rejected(): ReviewItem[] {
    return this.withStatus("rejected");
  }

  /** Number of items in the queue, regardless of status. */
  size(): number {
    return this.items.size;
  }

  /** Canonical pair id helper ("minId:maxId"). */
  static pairIdFor(a: number, b: number): string {
    return pairIdFor(a, b);
  }

  /** Replace a known item with a copy carrying the new status. */
  private transition(pairId: string, status: ReviewStatus): void {
    const current = this.items.get(pairId);
    if (current !== undefined) {
      this.items.set(pairId, { ...current, status });
    }
  }

  /** Items currently in the given status, in insertion order. */
  private withStatus(status: ReviewStatus): ReviewItem[] {
    return Array.from(this.items.values()).filter(
      (item) => item.status === status,
    );
  }
}
|