goldenmatch 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +140 -0
- package/dist/cli.cjs +6079 -0
- package/dist/cli.cjs.map +1 -0
- package/dist/cli.d.cts +1 -0
- package/dist/cli.d.ts +1 -0
- package/dist/cli.js +6076 -0
- package/dist/cli.js.map +1 -0
- package/dist/core/index.cjs +8449 -0
- package/dist/core/index.cjs.map +1 -0
- package/dist/core/index.d.cts +1972 -0
- package/dist/core/index.d.ts +1972 -0
- package/dist/core/index.js +8318 -0
- package/dist/core/index.js.map +1 -0
- package/dist/index.cjs +8449 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +2 -0
- package/dist/index.d.ts +2 -0
- package/dist/index.js +8318 -0
- package/dist/index.js.map +1 -0
- package/dist/node/backends/score-worker.cjs +934 -0
- package/dist/node/backends/score-worker.cjs.map +1 -0
- package/dist/node/backends/score-worker.d.cts +14 -0
- package/dist/node/backends/score-worker.d.ts +14 -0
- package/dist/node/backends/score-worker.js +932 -0
- package/dist/node/backends/score-worker.js.map +1 -0
- package/dist/node/index.cjs +11430 -0
- package/dist/node/index.cjs.map +1 -0
- package/dist/node/index.d.cts +554 -0
- package/dist/node/index.d.ts +554 -0
- package/dist/node/index.js +11277 -0
- package/dist/node/index.js.map +1 -0
- package/dist/types-DhUdX5Rc.d.cts +304 -0
- package/dist/types-DhUdX5Rc.d.ts +304 -0
- package/examples/01-basic-dedupe.ts +60 -0
- package/examples/02-match-two-datasets.ts +48 -0
- package/examples/03-csv-file-pipeline.ts +62 -0
- package/examples/04-string-scoring.ts +63 -0
- package/examples/05-custom-config.ts +94 -0
- package/examples/06-probabilistic-fs.ts +72 -0
- package/examples/07-pprl-privacy.ts +76 -0
- package/examples/08-streaming.ts +79 -0
- package/examples/09-llm-scorer.ts +79 -0
- package/examples/10-explain.ts +60 -0
- package/examples/11-evaluate.ts +61 -0
- package/examples/README.md +53 -0
- package/package.json +66 -0
- package/src/cli.ts +372 -0
- package/src/core/ann-blocker.ts +593 -0
- package/src/core/api.ts +220 -0
- package/src/core/autoconfig.ts +363 -0
- package/src/core/autofix.ts +102 -0
- package/src/core/blocker.ts +655 -0
- package/src/core/cluster.ts +699 -0
- package/src/core/compare-clusters.ts +176 -0
- package/src/core/config/loader.ts +869 -0
- package/src/core/cross-encoder.ts +614 -0
- package/src/core/data.ts +430 -0
- package/src/core/domain.ts +277 -0
- package/src/core/embedder.ts +562 -0
- package/src/core/evaluate.ts +156 -0
- package/src/core/explain.ts +352 -0
- package/src/core/golden.ts +524 -0
- package/src/core/graph-er.ts +371 -0
- package/src/core/index.ts +314 -0
- package/src/core/ingest.ts +112 -0
- package/src/core/learned-blocking.ts +305 -0
- package/src/core/lineage.ts +221 -0
- package/src/core/llm/budget.ts +258 -0
- package/src/core/llm/cluster.ts +542 -0
- package/src/core/llm/scorer.ts +396 -0
- package/src/core/match-one.ts +95 -0
- package/src/core/matchkey.ts +97 -0
- package/src/core/memory/corrections.ts +179 -0
- package/src/core/memory/learner.ts +218 -0
- package/src/core/memory/store.ts +114 -0
- package/src/core/pipeline.ts +366 -0
- package/src/core/pprl/protocol.ts +216 -0
- package/src/core/probabilistic.ts +511 -0
- package/src/core/profiler.ts +212 -0
- package/src/core/quality.ts +197 -0
- package/src/core/review-queue.ts +177 -0
- package/src/core/scorer.ts +855 -0
- package/src/core/sensitivity.ts +196 -0
- package/src/core/standardize.ts +279 -0
- package/src/core/streaming.ts +128 -0
- package/src/core/transforms.ts +599 -0
- package/src/core/types.ts +570 -0
- package/src/core/validate.ts +243 -0
- package/src/index.ts +8 -0
- package/src/node/a2a/server.ts +470 -0
- package/src/node/api/server.ts +412 -0
- package/src/node/backends/duckdb.ts +130 -0
- package/src/node/backends/score-worker.ts +41 -0
- package/src/node/backends/workers.ts +212 -0
- package/src/node/config-file.ts +66 -0
- package/src/node/connectors/base.ts +57 -0
- package/src/node/connectors/bigquery.ts +61 -0
- package/src/node/connectors/databricks.ts +69 -0
- package/src/node/connectors/file.ts +350 -0
- package/src/node/connectors/hubspot.ts +62 -0
- package/src/node/connectors/index.ts +43 -0
- package/src/node/connectors/salesforce.ts +93 -0
- package/src/node/connectors/snowflake.ts +73 -0
- package/src/node/db/postgres.ts +173 -0
- package/src/node/db/sync.ts +103 -0
- package/src/node/dedupe-file.ts +156 -0
- package/src/node/index.ts +89 -0
- package/src/node/mcp/server.ts +940 -0
- package/src/node/tui/app.ts +756 -0
- package/src/node/tui/index.ts +6 -0
- package/src/node/tui/widgets.ts +128 -0
- package/tests/parity/scorer-ground-truth.test.ts +118 -0
- package/tests/smoke.test.ts +46 -0
- package/tests/unit/a2a-server.test.ts +175 -0
- package/tests/unit/ann-blocker.test.ts +117 -0
- package/tests/unit/api-server.test.ts +239 -0
- package/tests/unit/api.test.ts +77 -0
- package/tests/unit/autoconfig.test.ts +103 -0
- package/tests/unit/autofix.test.ts +71 -0
- package/tests/unit/blocker.test.ts +164 -0
- package/tests/unit/buildBlocksAsync.test.ts +63 -0
- package/tests/unit/cluster.test.ts +213 -0
- package/tests/unit/compare-clusters.test.ts +42 -0
- package/tests/unit/config-loader.test.ts +301 -0
- package/tests/unit/connectors-base.test.ts +48 -0
- package/tests/unit/cross-encoder-model.test.ts +198 -0
- package/tests/unit/cross-encoder.test.ts +173 -0
- package/tests/unit/db-connectors.test.ts +37 -0
- package/tests/unit/domain.test.ts +80 -0
- package/tests/unit/embedder.test.ts +151 -0
- package/tests/unit/evaluate.test.ts +85 -0
- package/tests/unit/explain.test.ts +73 -0
- package/tests/unit/golden.test.ts +97 -0
- package/tests/unit/graph-er.test.ts +173 -0
- package/tests/unit/hnsw-ann.test.ts +283 -0
- package/tests/unit/hubspot-connector.test.ts +118 -0
- package/tests/unit/ingest.test.ts +97 -0
- package/tests/unit/learned-blocking.test.ts +134 -0
- package/tests/unit/lineage.test.ts +135 -0
- package/tests/unit/match-one.test.ts +129 -0
- package/tests/unit/matchkey.test.ts +97 -0
- package/tests/unit/mcp-server.test.ts +183 -0
- package/tests/unit/memory.test.ts +119 -0
- package/tests/unit/pipeline.test.ts +118 -0
- package/tests/unit/pprl-protocol.test.ts +381 -0
- package/tests/unit/probabilistic.test.ts +494 -0
- package/tests/unit/profiler.test.ts +68 -0
- package/tests/unit/review-queue.test.ts +68 -0
- package/tests/unit/salesforce-connector.test.ts +148 -0
- package/tests/unit/scorer.test.ts +301 -0
- package/tests/unit/sensitivity.test.ts +154 -0
- package/tests/unit/standardize.test.ts +84 -0
- package/tests/unit/streaming.test.ts +82 -0
- package/tests/unit/transforms.test.ts +208 -0
- package/tests/unit/tui-widgets.test.ts +42 -0
- package/tests/unit/tui.test.ts +24 -0
- package/tests/unit/validate.test.ts +145 -0
- package/tests/unit/workers-parallel.test.ts +99 -0
- package/tests/unit/workers.test.ts +74 -0
- package/tsconfig.json +25 -0
- package/tsup.config.ts +37 -0
- package/vitest.config.ts +11 -0
package/src/core/api.ts
ADDED
|
@@ -0,0 +1,220 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* api.ts — High-level API functions wrapping the pipeline.
|
|
3
|
+
* Edge-safe: no `node:` imports, pure TypeScript only.
|
|
4
|
+
*
|
|
5
|
+
* Ports goldenmatch/_api.py convenience functions.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
import type {
|
|
9
|
+
Row,
|
|
10
|
+
GoldenMatchConfig,
|
|
11
|
+
DedupeResult,
|
|
12
|
+
MatchResult,
|
|
13
|
+
MatchkeyConfig,
|
|
14
|
+
MatchkeyField,
|
|
15
|
+
BlockingKeyConfig,
|
|
16
|
+
} from "./types.js";
|
|
17
|
+
import {
|
|
18
|
+
makeConfig,
|
|
19
|
+
makeMatchkeyConfig,
|
|
20
|
+
makeMatchkeyField,
|
|
21
|
+
makeBlockingConfig,
|
|
22
|
+
} from "./types.js";
|
|
23
|
+
import { runDedupePipeline, runMatchPipeline } from "./pipeline.js";
|
|
24
|
+
import { scoreField, scorePair, asString } from "./scorer.js";
|
|
25
|
+
import { applyTransforms } from "./transforms.js";
|
|
26
|
+
|
|
27
|
+
// ---------------------------------------------------------------------------
|
|
28
|
+
// Options
|
|
29
|
+
// ---------------------------------------------------------------------------
|
|
30
|
+
|
|
31
|
+
/**
 * Shorthand options accepted by `dedupe()` and `match()`.
 *
 * When `config` is supplied the config builder in this file returns it as-is
 * and every other field on this object is ignored.
 */
export interface DedupeOptions {
  /** Full config object -- takes precedence over shorthand options. */
  readonly config?: GoldenMatchConfig;
  /** Columns for exact matching (creates one exact matchkey per column). */
  readonly exact?: readonly string[];
  /**
   * Columns for fuzzy matching, keyed by column name.
   *
   * NOTE: the number attached to each column is forwarded as that field's
   * `weight` inside the single combined weighted matchkey; it is not applied
   * as a per-field threshold. The cut-off for the combined matchkey comes
   * from `threshold`.
   */
  readonly fuzzy?: Readonly<Record<string, number>>;
  /** Blocking key columns (one key per column; "lowercase" and "strip" transforms applied). */
  readonly blocking?: readonly string[];
  /** Overall fuzzy threshold (default 0.85). */
  readonly threshold?: number;
  /** Enable LLM scorer for borderline pairs. Requires OPENAI_API_KEY or ANTHROPIC_API_KEY in env. */
  readonly llmScorer?: boolean;
}
|
|
45
|
+
|
|
46
|
+
// ---------------------------------------------------------------------------
|
|
47
|
+
// Build config from shorthand options
|
|
48
|
+
// ---------------------------------------------------------------------------
|
|
49
|
+
|
|
50
|
+
/**
 * Translate the shorthand `DedupeOptions` into a full `GoldenMatchConfig`.
 *
 * - A caller-supplied `options.config` is returned untouched.
 * - Every `exact` column becomes its own "exact" matchkey named `exact_<col>`.
 * - All `fuzzy` columns are folded into one "weighted" matchkey named
 *   `fuzzy_combined`; the number attached to each column is used as that
 *   field's weight and `threshold` (default 0.85) is the matchkey threshold.
 * - Every `blocking` column becomes one blocking key.
 * - `llmScorer: true` attaches a fixed pairwise LLM scorer section.
 */
function buildConfigFromOptions(options?: DedupeOptions): GoldenMatchConfig {
  if (options?.config) {
    return options.config;
  }

  const cutoff = options?.threshold ?? 0.85;
  const exactColumns = options?.exact;
  const fuzzyColumns = options?.fuzzy;
  const blockingColumns = options?.blocking;
  const collected: MatchkeyConfig[] = [];

  // One exact matchkey per requested column.
  if (exactColumns) {
    for (const column of exactColumns) {
      const exactField = makeMatchkeyField({
        field: column,
        transforms: ["lowercase", "strip"],
        scorer: "exact",
      });
      collected.push(
        makeMatchkeyConfig({
          name: `exact_${column}`,
          type: "exact",
          fields: [exactField],
        }),
      );
    }
  }

  // A single weighted matchkey covering every fuzzy column.
  if (fuzzyColumns) {
    const weightedFields: MatchkeyField[] = [];
    for (const [column, weight] of Object.entries(fuzzyColumns)) {
      weightedFields.push(
        makeMatchkeyField({
          field: column,
          transforms: ["lowercase", "strip"],
          scorer: "jaro_winkler",
          weight,
        }),
      );
    }
    if (weightedFields.length > 0) {
      collected.push(
        makeMatchkeyConfig({
          name: "fuzzy_combined",
          type: "weighted",
          fields: weightedFields,
          threshold: cutoff,
        }),
      );
    }
  }

  // Start from the default blocking config; override when columns were given.
  let blockingConfig = makeBlockingConfig();
  if (blockingColumns && blockingColumns.length > 0) {
    const keys: BlockingKeyConfig[] = [];
    for (const column of blockingColumns) {
      keys.push({ fields: [column], transforms: ["lowercase", "strip"] });
    }
    blockingConfig = makeBlockingConfig({ keys });
  }

  const overrides: Partial<GoldenMatchConfig> = {
    blocking: blockingConfig,
    threshold: cutoff,
  };
  // Optional sections are attached through a loosely-typed view of the same object.
  const loose = overrides as Record<string, unknown>;
  if (collected.length > 0) {
    loose.matchkeys = collected;
  }
  if (options?.llmScorer) {
    loose.llmScorer = {
      enabled: true,
      autoThreshold: 0.9,
      candidateLo: 0.6,
      candidateHi: 0.9,
      batchSize: 10,
      maxWorkers: 4,
      mode: "pairwise",
    };
  }

  return makeConfig(overrides);
}
|
|
128
|
+
|
|
129
|
+
// ---------------------------------------------------------------------------
|
|
130
|
+
// Public API: dedupe
|
|
131
|
+
// ---------------------------------------------------------------------------
|
|
132
|
+
|
|
133
|
+
/**
 * Deduplicate an array of row objects.
 *
 * The options are converted to a config by `buildConfigFromOptions` and the
 * rows are handed to `runDedupePipeline`.
 *
 * Shorthand usage:
 * ```ts
 * const result = dedupe(rows, {
 *   exact: ["email"],
 *   fuzzy: { name: 0.85, address: 0.7 },
 *   blocking: ["zip"],
 *   threshold: 0.85,
 * });
 * ```
 *
 * Or provide a full config:
 * ```ts
 * const result = dedupe(rows, { config: myConfig });
 * ```
 *
 * @param rows - Rows to deduplicate.
 * @param options - Shorthand options or a full config.
 * @returns The result produced by the dedupe pipeline.
 */
export function dedupe(rows: readonly Row[], options?: DedupeOptions): DedupeResult {
  return runDedupePipeline(rows, buildConfigFromOptions(options));
}
|
|
158
|
+
|
|
159
|
+
// ---------------------------------------------------------------------------
|
|
160
|
+
// Public API: match
|
|
161
|
+
// ---------------------------------------------------------------------------
|
|
162
|
+
|
|
163
|
+
/**
 * Match target rows against reference rows.
 *
 * Accepts the same options as `dedupe()`; they are converted to a config by
 * `buildConfigFromOptions` and passed to `runMatchPipeline`.
 *
 * @param target - Rows to be matched.
 * @param reference - Rows to match against.
 * @param options - Shorthand options or a full config.
 * @returns Matched/unmatched target rows as produced by the match pipeline.
 */
export function match(
  target: readonly Row[],
  reference: readonly Row[],
  options?: DedupeOptions,
): MatchResult {
  return runMatchPipeline(target, reference, buildConfigFromOptions(options));
}
|
|
176
|
+
|
|
177
|
+
// ---------------------------------------------------------------------------
|
|
178
|
+
// Public API: scoreStrings
|
|
179
|
+
// ---------------------------------------------------------------------------
|
|
180
|
+
|
|
181
|
+
/**
 * Score two strings using the specified scorer algorithm.
 *
 * Delegates to `scoreField`; when that call yields `null` or `undefined`
 * the score is reported as 0.0.
 *
 * @param a - First string.
 * @param b - Second string.
 * @param scorer - Scorer name (default: "jaro_winkler").
 *   Valid scorers: exact, jaro_winkler, levenshtein, token_sort,
 *   soundex_match, dice, jaccard, ensemble.
 * @returns Similarity score between 0.0 and 1.0.
 */
export function scoreStrings(
  a: string,
  b: string,
  scorer: string = "jaro_winkler",
): number {
  const similarity = scoreField(a, b, scorer);
  if (similarity === null || similarity === undefined) {
    return 0.0;
  }
  return similarity;
}
|
|
199
|
+
|
|
200
|
+
// ---------------------------------------------------------------------------
|
|
201
|
+
// Public API: scorePairRecord
|
|
202
|
+
// ---------------------------------------------------------------------------
|
|
203
|
+
|
|
204
|
+
/**
 * Score a pair of row objects across specified fields using weighted
 * aggregation. Thin wrapper over `scorePair`.
 *
 * @param rowA - First row.
 * @param rowB - Second row.
 * @param fields - Field configs specifying which fields to compare,
 *   transforms to apply, scorer to use, and weight.
 * @returns Weighted similarity score between 0.0 and 1.0.
 */
export function scorePairRecord(
  rowA: Row,
  rowB: Row,
  fields: readonly MatchkeyField[],
): number {
  const weightedScore = scorePair(rowA, rowB, fields);
  return weightedScore;
}
|
|
@@ -0,0 +1,363 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* autoconfig.ts — Auto-generate a GoldenMatch config from sample data.
|
|
3
|
+
* Edge-safe: no `node:` imports.
|
|
4
|
+
*
|
|
5
|
+
* Ports goldenmatch/core/autoconfig.py. Profiles the rows, classifies
|
|
6
|
+
* columns, and builds exact/weighted matchkeys + blocking config.
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
import type {
|
|
10
|
+
Row,
|
|
11
|
+
GoldenMatchConfig,
|
|
12
|
+
MatchkeyConfig,
|
|
13
|
+
MatchkeyField,
|
|
14
|
+
BlockingKeyConfig,
|
|
15
|
+
BlockingConfig,
|
|
16
|
+
} from "./types.js";
|
|
17
|
+
import {
|
|
18
|
+
makeConfig,
|
|
19
|
+
makeMatchkeyConfig,
|
|
20
|
+
makeMatchkeyField,
|
|
21
|
+
makeBlockingConfig,
|
|
22
|
+
makeGoldenRulesConfig,
|
|
23
|
+
} from "./types.js";
|
|
24
|
+
import { profileRows, type ColumnProfile, type DatasetProfile } from "./profiler.js";
|
|
25
|
+
|
|
26
|
+
// ---------------------------------------------------------------------------
|
|
27
|
+
// Options
|
|
28
|
+
// ---------------------------------------------------------------------------
|
|
29
|
+
|
|
30
|
+
/** Optional knobs for the auto-configuration entry points. */
export interface AutoconfigOptions {
  /** When defined, forwarded as `llmAuto` on the generated config. */
  readonly llmAuto?: boolean;
  /** LLM provider name. NOTE(review): not read by the builders visible in this file. */
  readonly llmProvider?: string;
}
|
|
34
|
+
|
|
35
|
+
// ---------------------------------------------------------------------------
|
|
36
|
+
// Name-based classification patterns (authoritative over data profiling for
|
|
37
|
+
// some signals — matches Python's _DATE_PATTERNS / _GEO_PATTERNS behavior).
|
|
38
|
+
// ---------------------------------------------------------------------------
|
|
39
|
+
|
|
40
|
+
// Column-name patterns, one list per semantic kind. All are case-insensitive.

const EMAIL_NAME_PATTERNS = [
  /email/i,
  /e_mail/i,
  /e-mail/i,
];

// `tel` is only accepted when it is not immediately followed by "e".
const PHONE_NAME_PATTERNS = [
  /phone/i,
  /tel(?!e)/i,
  /mobile/i,
  /cell/i,
];

const NAME_NAME_PATTERNS = [
  /name/i,
  /first/i,
  /last/i,
  /full_name/i,
  /surname/i,
];

const ZIP_NAME_PATTERNS = [
  /zip/i,
  /postal/i,
  /postcode/i,
];

// Some geo patterns are anchored to the start of the column name.
const GEO_NAME_PATTERNS = [/^city/i, /city_desc/i, /^state/i, /state_cd/i, /county/i, /country/i, /^region/i, /province/i];

const DATE_NAME_PATTERNS = [/date/i, /created/i, /modified/i, /updated/i, /_at$/i, /birth/i, /dob/i];

// Either exactly "id", or an "_id" suffix, or containing uuid/guid.
const ID_NAME_PATTERNS = [
  /^id$/i,
  /_id$/i,
  /uuid/i,
  /guid/i,
];

// Re-exported for consumers that wanted the spec-level constants.
// Each export is the same array instance as its *_NAME_PATTERNS counterpart.
export const EMAIL_PATTERNS = EMAIL_NAME_PATTERNS;
export const PHONE_PATTERNS = PHONE_NAME_PATTERNS;
export const NAME_PATTERNS = NAME_NAME_PATTERNS;
export const ZIP_PATTERNS = ZIP_NAME_PATTERNS;
export const GEO_PATTERNS = GEO_NAME_PATTERNS;
export const DATE_PATTERNS = DATE_NAME_PATTERNS;
export const ID_PATTERNS = ID_NAME_PATTERNS;
|
|
73
|
+
|
|
74
|
+
/**
 * Report whether `name` is matched by at least one of `patterns`.
 * Patterns are tried in order and the scan stops at the first hit.
 */
function nameMatches(name: string, patterns: readonly RegExp[]): boolean {
  for (const pattern of patterns) {
    if (pattern.test(name)) {
      return true;
    }
  }
  return false;
}
|
|
77
|
+
|
|
78
|
+
// ---------------------------------------------------------------------------
|
|
79
|
+
// Column classification (authoritative: date > geo > name heuristics)
|
|
80
|
+
// ---------------------------------------------------------------------------
|
|
81
|
+
|
|
82
|
+
/** Semantic kind assigned to a column by `classifyColumn`. */
type ClassifiedKind =
  // name- or profile-driven kinds
  | "date"
  | "geo"
  | "email"
  | "phone"
  | "zip"
  | "name"
  | "id"
  // profile-only kind and the catch-all
  | "numeric"
  | "text";
|
|
92
|
+
|
|
93
|
+
/**
 * Assign a semantic kind to a profiled column.
 *
 * Kinds are tried in a fixed precedence order; a kind wins when either the
 * column name matches that kind's name patterns or the profiler inferred that
 * same kind. Date is tried first so that date-like columns never get
 * classified as phones via the profiler's inferred type, followed by geo.
 * A column that matches nothing is "numeric" when the profiler says so,
 * otherwise "text".
 */
function classifyColumn(profile: ColumnProfile): ClassifiedKind {
  const columnName = profile.name;
  const inferred = profile.inferredType;

  // Precedence table: earlier rows win.
  const rules: ReadonlyArray<readonly [ClassifiedKind, readonly RegExp[]]> = [
    ["date", DATE_NAME_PATTERNS],
    ["geo", GEO_NAME_PATTERNS],
    ["email", EMAIL_NAME_PATTERNS],
    ["phone", PHONE_NAME_PATTERNS],
    ["zip", ZIP_NAME_PATTERNS],
    ["name", NAME_NAME_PATTERNS],
    ["id", ID_NAME_PATTERNS],
  ];

  for (const [kind, patterns] of rules) {
    if (nameMatches(columnName, patterns) || inferred === kind) {
      return kind;
    }
  }

  return inferred === "numeric" ? "numeric" : "text";
}
|
|
122
|
+
|
|
123
|
+
// ---------------------------------------------------------------------------
|
|
124
|
+
// Heuristic builders
|
|
125
|
+
// ---------------------------------------------------------------------------
|
|
126
|
+
|
|
127
|
+
/**
 * Build one "exact" matchkey (threshold 1.0) per identifier-like column.
 *
 * A column qualifies when it is classified as email, phone or id, its null
 * rate is not above 0.4 and its cardinality ratio is not below 0.5.
 * zip/geo are blocking signals, NOT identity claims, and are never used here;
 * date, text, name and numeric columns are skipped as well.
 */
function buildExactMatchkeys(
  profiles: readonly ColumnProfile[],
): MatchkeyConfig[] {
  const result: MatchkeyConfig[] = [];

  for (const column of profiles) {
    // Pick the normalisation for the kind, or skip non-identifier kinds.
    let transforms: string[];
    switch (classifyColumn(column)) {
      case "email":
        transforms = ["lowercase", "strip"];
        break;
      case "phone":
        transforms = ["digits_only"];
        break;
      case "id":
        transforms = ["strip"];
        break;
      default:
        continue;
    }

    // Sparse columns and columns with too few distinct values are skipped.
    if (column.nullRate > 0.4 || column.cardinalityRatio < 0.5) {
      continue;
    }

    const exactField = makeMatchkeyField({
      field: column.name,
      transforms,
      scorer: "exact",
      weight: 1.0,
    });
    result.push(
      makeMatchkeyConfig({
        name: `exact_${column.name}`,
        type: "exact",
        fields: [exactField],
        threshold: 1.0,
      }),
    );
  }

  return result;
}
|
|
173
|
+
|
|
174
|
+
/**
 * Build the single "weighted_identity" matchkey (threshold 0.85, no rerank)
 * from every column whose null rate is not above 0.5.
 *
 * Per-kind field recipe:
 * - name:  jaro_winkler, weight 0.6
 * - email: jaro_winkler, weight 0.3
 * - phone: exact on digits, weight 0.25
 * - zip:   exact on digits, weight 0.15
 * - geo:   exact, weight 0.1
 * - text with average length >= 10: token_sort, weight 0.2
 *
 * Other kinds contribute nothing.
 *
 * @returns The matchkey, or `null` when no column contributed a field.
 */
function buildWeightedMatchkey(
  profiles: readonly ColumnProfile[],
): MatchkeyConfig | null {
  const collected: MatchkeyField[] = [];

  for (const column of profiles) {
    const kind = classifyColumn(column);
    if (column.nullRate > 0.5) {
      continue;
    }
    const field = column.name;

    switch (kind) {
      case "name":
        collected.push(
          makeMatchkeyField({
            field,
            transforms: ["lowercase", "strip", "normalize_whitespace"],
            scorer: "jaro_winkler",
            weight: 0.6,
          }),
        );
        break;
      case "email":
        collected.push(
          makeMatchkeyField({
            field,
            transforms: ["lowercase", "strip"],
            scorer: "jaro_winkler",
            weight: 0.3,
          }),
        );
        break;
      case "phone":
        collected.push(
          makeMatchkeyField({
            field,
            transforms: ["digits_only"],
            scorer: "exact",
            weight: 0.25,
          }),
        );
        break;
      case "zip":
        collected.push(
          makeMatchkeyField({
            field,
            transforms: ["digits_only"],
            scorer: "exact",
            weight: 0.15,
          }),
        );
        break;
      case "geo":
        collected.push(
          makeMatchkeyField({
            field,
            transforms: ["lowercase", "strip"],
            scorer: "exact",
            weight: 0.1,
          }),
        );
        break;
      case "text":
        // Long free-text columns: token_sort to catch reordering
        if (column.avgLength >= 10) {
          collected.push(
            makeMatchkeyField({
              field,
              transforms: ["lowercase", "strip", "token_sort"],
              scorer: "token_sort",
              weight: 0.2,
            }),
          );
        }
        break;
      default:
        break;
    }
  }

  if (collected.length === 0) {
    return null;
  }

  return makeMatchkeyConfig({
    name: "weighted_identity",
    type: "weighted",
    fields: collected,
    threshold: 0.85,
    rerank: false,
  });
}
|
|
251
|
+
|
|
252
|
+
/**
 * Choose at most one blocking key and wrap it in a static blocking config
 * (max block size 1000, oversized blocks skipped).
 *
 * Preference order: zip (first five digits) > geo > first letter of a name
 * column > the first column of any kind that is neither sparse, near-unique
 * nor near-constant. A candidate is rejected when its null rate is above 0.2
 * or its cardinality ratio is 0.95 or higher. When nothing qualifies the
 * config is built with an empty key list.
 */
function buildBlocking(profiles: readonly ColumnProfile[]): BlockingConfig {
  const withinLimits = (p: ColumnProfile): boolean =>
    !(p.nullRate > 0.2) && !(p.cardinalityRatio >= 0.95);

  const firstUsable = (kind: string): ColumnProfile | undefined =>
    profiles.find((p) => classifyColumn(p) === kind && withinLimits(p));

  const chooseKey = (): BlockingKeyConfig | undefined => {
    const zipColumn = firstUsable("zip");
    if (zipColumn) {
      return {
        fields: [zipColumn.name],
        transforms: ["digits_only", "substring:0:5"],
      };
    }

    const geoColumn = firstUsable("geo");
    if (geoColumn) {
      return {
        fields: [geoColumn.name],
        transforms: ["lowercase", "strip"],
      };
    }

    const nameColumn = firstUsable("name");
    if (nameColumn) {
      return {
        fields: [nameColumn.name],
        transforms: ["lowercase", "strip", "substring:0:1"],
      };
    }

    // Last resort: first non-null column that isn't near-unique or sparse
    const fallback = profiles.find(
      (p) => withinLimits(p) && !(p.cardinalityRatio < 0.01),
    );
    if (fallback) {
      return {
        fields: [fallback.name],
        transforms: ["lowercase", "strip"],
      };
    }
    return undefined;
  };

  const chosen = chooseKey();
  const keys: BlockingKeyConfig[] = chosen ? [chosen] : [];

  return makeBlockingConfig({
    strategy: "static",
    keys,
    maxBlockSize: 1000,
    skipOversized: true,
  });
}
|
|
317
|
+
|
|
318
|
+
// ---------------------------------------------------------------------------
|
|
319
|
+
// Public entry points
|
|
320
|
+
// ---------------------------------------------------------------------------
|
|
321
|
+
|
|
322
|
+
/**
 * Build a GoldenMatchConfig by profiling the provided rows.
 *
 * The rows are profiled with `profileRows`; the column profiles drive the
 * exact matchkeys, the optional weighted matchkey and the blocking config.
 * Golden rules default to the "most_complete" strategy and the overall
 * threshold is 0.85. `options.llmAuto` is forwarded only when it is defined.
 *
 * Mirrors goldenmatch.core.autoconfig.auto_configure_df. Does not apply
 * standardization rules directly — callers can merge them onto the result.
 *
 * @param rows - Sample rows to profile.
 * @param options - Optional auto-config settings.
 * @returns The generated config.
 */
export function autoConfigureRows(
  rows: readonly Row[],
  options?: AutoconfigOptions,
): GoldenMatchConfig {
  const dataset: DatasetProfile = profileRows(rows);
  const columns = dataset.columns;

  const exactKeys = buildExactMatchkeys(columns);
  const weightedKey = buildWeightedMatchkey(columns);
  const matchkeys: MatchkeyConfig[] = weightedKey
    ? [...exactKeys, weightedKey]
    : [...exactKeys];

  const blocking = buildBlocking(columns);
  const goldenRules = makeGoldenRulesConfig({ defaultStrategy: "most_complete" });

  // Only carry llmAuto over when the caller actually set it.
  const llmOverride =
    options?.llmAuto !== undefined ? { llmAuto: options.llmAuto } : {};

  return makeConfig({
    matchkeys,
    blocking,
    goldenRules,
    threshold: 0.85,
    ...llmOverride,
  });
}
|
|
353
|
+
|
|
354
|
+
/**
 * Alias of `autoConfigureRows`, kept for API parity with the Python function
 * that starts from "files" (which, in edge-safe land, means pre-loaded row
 * arrays).
 *
 * @param rows - Sample rows to profile.
 * @param options - Optional auto-config settings.
 * @returns The generated config.
 */
export function autoConfigure(
  rows: readonly Row[],
  options?: AutoconfigOptions,
): GoldenMatchConfig {
  const generated = autoConfigureRows(rows, options);
  return generated;
}
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* autofix.ts — Lightweight row auto-fix utilities.
|
|
3
|
+
* Edge-safe: no Node.js imports, pure TypeScript only.
|
|
4
|
+
*
|
|
5
|
+
* Ports goldenmatch/core/autofix.py. Trims whitespace, nulls empty strings,
|
|
6
|
+
* and converts common "no value" tokens to null.
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
import type { Row } from "./types.js";
|
|
10
|
+
|
|
11
|
+
// ---------------------------------------------------------------------------
|
|
12
|
+
// Types
|
|
13
|
+
// ---------------------------------------------------------------------------
|
|
14
|
+
|
|
15
|
+
/** One summary line describing a kind of fix applied to one column. */
export interface AutoFixLog {
  /** Identifier of the fix that was applied. */
  readonly fixType: string;
  /** Column the fix was applied to. */
  readonly column: string;
  /** Number of rows in which this column was changed by the fix. */
  readonly affectedRows: number;
}
|
|
20
|
+
|
|
21
|
+
/** Outcome of an auto-fix pass. */
export interface AutoFixResult {
  /** Output rows, in input order. */
  readonly rows: Row[];
  /** Per-column summary of the fixes that were applied. */
  readonly log: AutoFixLog[];
}
|
|
25
|
+
|
|
26
|
+
// ---------------------------------------------------------------------------
|
|
27
|
+
// Tokens treated as null
|
|
28
|
+
// ---------------------------------------------------------------------------
|
|
29
|
+
|
|
30
|
+
/** Lower-cased placeholder strings that are treated as "no value". */
const NULL_TOKENS: ReadonlySet<string> = new Set([
  "n/a",
  "na",
  "none",
  "null",
  "nil",
  "unknown",
  "unk",
  "-",
  "--",
  "?",
]);

/**
 * Report whether `s` carries no value: after trimming and lower-casing it is
 * either empty or one of `NULL_TOKENS`.
 */
function isNullToken(s: string): boolean {
  const lower = s.trim().toLowerCase();
  if (lower.length === 0) return true;
  return NULL_TOKENS.has(lower);
}

// ---------------------------------------------------------------------------
// autoFixRows
// ---------------------------------------------------------------------------

/**
 * Apply conservative fixes row-by-row:
 *   - trim string values
 *   - convert empty strings and common "no value" tokens to null
 *
 * Internal columns (prefix `__`) and non-string values are preserved
 * unchanged. A row that needed no fix is returned as the same object; a row
 * that was fixed is returned as a new object with the same keys in the same
 * order.
 *
 * @param rows - Input rows; never mutated.
 * @returns The output rows plus one log entry per (fix type, column) pair,
 *   trim entries first, then null-conversion entries.
 */
export function autoFixRows(rows: readonly Row[]): AutoFixResult {
  const out: Row[] = [];
  const trimCounts = new Map<string, number>();
  const nullCounts = new Map<string, number>();

  for (const row of rows) {
    // Work on a private copy of the entries so the input row is never touched.
    const entries = Object.entries(row);
    let changed = false;

    for (const entry of entries) {
      const [key, value] = entry;
      if (key.startsWith("__") || typeof value !== "string") {
        continue;
      }

      const trimmed = value.trim();
      if (trimmed !== value) {
        trimCounts.set(key, (trimCounts.get(key) ?? 0) + 1);
        changed = true;
      }

      if (isNullToken(trimmed)) {
        entry[1] = null;
        nullCounts.set(key, (nullCounts.get(key) ?? 0) + 1);
        changed = true;
      } else {
        entry[1] = trimmed;
      }
    }

    // Object.fromEntries defines every key as an own data property. Plain
    // `obj[key] = value` would instead run the prototype setter for an own
    // key named "__proto__" (e.g. from JSON.parse), dropping that column and
    // replacing the new row's prototype.
    out.push(changed ? (Object.fromEntries(entries) as Row) : row);
  }

  const log: AutoFixLog[] = [];
  for (const [col, n] of trimCounts) {
    log.push({ fixType: "trim_whitespace", column: col, affectedRows: n });
  }
  for (const [col, n] of nullCounts) {
    log.push({ fixType: "null_empty_or_token", column: col, affectedRows: n });
  }

  return { rows: out, log };
}
|