goldenmatch 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +140 -0
- package/dist/cli.cjs +6079 -0
- package/dist/cli.cjs.map +1 -0
- package/dist/cli.d.cts +1 -0
- package/dist/cli.d.ts +1 -0
- package/dist/cli.js +6076 -0
- package/dist/cli.js.map +1 -0
- package/dist/core/index.cjs +8449 -0
- package/dist/core/index.cjs.map +1 -0
- package/dist/core/index.d.cts +1972 -0
- package/dist/core/index.d.ts +1972 -0
- package/dist/core/index.js +8318 -0
- package/dist/core/index.js.map +1 -0
- package/dist/index.cjs +8449 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +2 -0
- package/dist/index.d.ts +2 -0
- package/dist/index.js +8318 -0
- package/dist/index.js.map +1 -0
- package/dist/node/backends/score-worker.cjs +934 -0
- package/dist/node/backends/score-worker.cjs.map +1 -0
- package/dist/node/backends/score-worker.d.cts +14 -0
- package/dist/node/backends/score-worker.d.ts +14 -0
- package/dist/node/backends/score-worker.js +932 -0
- package/dist/node/backends/score-worker.js.map +1 -0
- package/dist/node/index.cjs +11430 -0
- package/dist/node/index.cjs.map +1 -0
- package/dist/node/index.d.cts +554 -0
- package/dist/node/index.d.ts +554 -0
- package/dist/node/index.js +11277 -0
- package/dist/node/index.js.map +1 -0
- package/dist/types-DhUdX5Rc.d.cts +304 -0
- package/dist/types-DhUdX5Rc.d.ts +304 -0
- package/examples/01-basic-dedupe.ts +60 -0
- package/examples/02-match-two-datasets.ts +48 -0
- package/examples/03-csv-file-pipeline.ts +62 -0
- package/examples/04-string-scoring.ts +63 -0
- package/examples/05-custom-config.ts +94 -0
- package/examples/06-probabilistic-fs.ts +72 -0
- package/examples/07-pprl-privacy.ts +76 -0
- package/examples/08-streaming.ts +79 -0
- package/examples/09-llm-scorer.ts +79 -0
- package/examples/10-explain.ts +60 -0
- package/examples/11-evaluate.ts +61 -0
- package/examples/README.md +53 -0
- package/package.json +66 -0
- package/src/cli.ts +372 -0
- package/src/core/ann-blocker.ts +593 -0
- package/src/core/api.ts +220 -0
- package/src/core/autoconfig.ts +363 -0
- package/src/core/autofix.ts +102 -0
- package/src/core/blocker.ts +655 -0
- package/src/core/cluster.ts +699 -0
- package/src/core/compare-clusters.ts +176 -0
- package/src/core/config/loader.ts +869 -0
- package/src/core/cross-encoder.ts +614 -0
- package/src/core/data.ts +430 -0
- package/src/core/domain.ts +277 -0
- package/src/core/embedder.ts +562 -0
- package/src/core/evaluate.ts +156 -0
- package/src/core/explain.ts +352 -0
- package/src/core/golden.ts +524 -0
- package/src/core/graph-er.ts +371 -0
- package/src/core/index.ts +314 -0
- package/src/core/ingest.ts +112 -0
- package/src/core/learned-blocking.ts +305 -0
- package/src/core/lineage.ts +221 -0
- package/src/core/llm/budget.ts +258 -0
- package/src/core/llm/cluster.ts +542 -0
- package/src/core/llm/scorer.ts +396 -0
- package/src/core/match-one.ts +95 -0
- package/src/core/matchkey.ts +97 -0
- package/src/core/memory/corrections.ts +179 -0
- package/src/core/memory/learner.ts +218 -0
- package/src/core/memory/store.ts +114 -0
- package/src/core/pipeline.ts +366 -0
- package/src/core/pprl/protocol.ts +216 -0
- package/src/core/probabilistic.ts +511 -0
- package/src/core/profiler.ts +212 -0
- package/src/core/quality.ts +197 -0
- package/src/core/review-queue.ts +177 -0
- package/src/core/scorer.ts +855 -0
- package/src/core/sensitivity.ts +196 -0
- package/src/core/standardize.ts +279 -0
- package/src/core/streaming.ts +128 -0
- package/src/core/transforms.ts +599 -0
- package/src/core/types.ts +570 -0
- package/src/core/validate.ts +243 -0
- package/src/index.ts +8 -0
- package/src/node/a2a/server.ts +470 -0
- package/src/node/api/server.ts +412 -0
- package/src/node/backends/duckdb.ts +130 -0
- package/src/node/backends/score-worker.ts +41 -0
- package/src/node/backends/workers.ts +212 -0
- package/src/node/config-file.ts +66 -0
- package/src/node/connectors/base.ts +57 -0
- package/src/node/connectors/bigquery.ts +61 -0
- package/src/node/connectors/databricks.ts +69 -0
- package/src/node/connectors/file.ts +350 -0
- package/src/node/connectors/hubspot.ts +62 -0
- package/src/node/connectors/index.ts +43 -0
- package/src/node/connectors/salesforce.ts +93 -0
- package/src/node/connectors/snowflake.ts +73 -0
- package/src/node/db/postgres.ts +173 -0
- package/src/node/db/sync.ts +103 -0
- package/src/node/dedupe-file.ts +156 -0
- package/src/node/index.ts +89 -0
- package/src/node/mcp/server.ts +940 -0
- package/src/node/tui/app.ts +756 -0
- package/src/node/tui/index.ts +6 -0
- package/src/node/tui/widgets.ts +128 -0
- package/tests/parity/scorer-ground-truth.test.ts +118 -0
- package/tests/smoke.test.ts +46 -0
- package/tests/unit/a2a-server.test.ts +175 -0
- package/tests/unit/ann-blocker.test.ts +117 -0
- package/tests/unit/api-server.test.ts +239 -0
- package/tests/unit/api.test.ts +77 -0
- package/tests/unit/autoconfig.test.ts +103 -0
- package/tests/unit/autofix.test.ts +71 -0
- package/tests/unit/blocker.test.ts +164 -0
- package/tests/unit/buildBlocksAsync.test.ts +63 -0
- package/tests/unit/cluster.test.ts +213 -0
- package/tests/unit/compare-clusters.test.ts +42 -0
- package/tests/unit/config-loader.test.ts +301 -0
- package/tests/unit/connectors-base.test.ts +48 -0
- package/tests/unit/cross-encoder-model.test.ts +198 -0
- package/tests/unit/cross-encoder.test.ts +173 -0
- package/tests/unit/db-connectors.test.ts +37 -0
- package/tests/unit/domain.test.ts +80 -0
- package/tests/unit/embedder.test.ts +151 -0
- package/tests/unit/evaluate.test.ts +85 -0
- package/tests/unit/explain.test.ts +73 -0
- package/tests/unit/golden.test.ts +97 -0
- package/tests/unit/graph-er.test.ts +173 -0
- package/tests/unit/hnsw-ann.test.ts +283 -0
- package/tests/unit/hubspot-connector.test.ts +118 -0
- package/tests/unit/ingest.test.ts +97 -0
- package/tests/unit/learned-blocking.test.ts +134 -0
- package/tests/unit/lineage.test.ts +135 -0
- package/tests/unit/match-one.test.ts +129 -0
- package/tests/unit/matchkey.test.ts +97 -0
- package/tests/unit/mcp-server.test.ts +183 -0
- package/tests/unit/memory.test.ts +119 -0
- package/tests/unit/pipeline.test.ts +118 -0
- package/tests/unit/pprl-protocol.test.ts +381 -0
- package/tests/unit/probabilistic.test.ts +494 -0
- package/tests/unit/profiler.test.ts +68 -0
- package/tests/unit/review-queue.test.ts +68 -0
- package/tests/unit/salesforce-connector.test.ts +148 -0
- package/tests/unit/scorer.test.ts +301 -0
- package/tests/unit/sensitivity.test.ts +154 -0
- package/tests/unit/standardize.test.ts +84 -0
- package/tests/unit/streaming.test.ts +82 -0
- package/tests/unit/transforms.test.ts +208 -0
- package/tests/unit/tui-widgets.test.ts +42 -0
- package/tests/unit/tui.test.ts +24 -0
- package/tests/unit/validate.test.ts +145 -0
- package/tests/unit/workers-parallel.test.ts +99 -0
- package/tests/unit/workers.test.ts +74 -0
- package/tsconfig.json +25 -0
- package/tsup.config.ts +37 -0
- package/vitest.config.ts +11 -0
|
@@ -0,0 +1,196 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* sensitivity.ts — Parameter sweep engine for GoldenMatch.
|
|
3
|
+
* Edge-safe: no Node.js imports, pure TypeScript only.
|
|
4
|
+
*
|
|
5
|
+
* Ports goldenmatch/core/sensitivity.py.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
import type { Row, GoldenMatchConfig } from "./types.js";
|
|
9
|
+
import { runDedupePipeline } from "./pipeline.js";
|
|
10
|
+
import { compareClusters } from "./compare-clusters.js";
|
|
11
|
+
|
|
12
|
+
// ---------------------------------------------------------------------------
|
|
13
|
+
// Types
|
|
14
|
+
// ---------------------------------------------------------------------------
|
|
15
|
+
|
|
16
|
+
/** One swept parameter: where in the config to write, and which values to try there. */
export interface SweepParam {
  /** Dot-path into the config, e.g. "threshold", "blocking.maxBlockSize". */
  readonly path: string;
  /** Candidate values substituted at `path`; combined with other params as a Cartesian product. */
  readonly values: readonly unknown[];
}
|
|
21
|
+
|
|
22
|
+
/** Outcome of a single pipeline run within a sweep. */
export interface SweepPoint {
  /** Overrides applied for this run, keyed by dot-path (empty for the baseline). */
  readonly params: Readonly<Record<string, unknown>>;
  /** Numeric summary of the run; empty when the run failed. */
  readonly stats: Readonly<Record<string, number>>;
  /** TWI versus the baseline clustering; absent when it could not be computed. */
  readonly twi?: number;
  /** Error message when the run threw. */
  readonly error?: string;
}
|
|
28
|
+
|
|
29
|
+
/** Aggregate result of a parameter sweep. */
export interface SensitivityResult {
  /** The run performed with the unmodified baseline config. */
  readonly baseline: SweepPoint;
  /** One entry per swept parameter combination, in sweep order. */
  readonly points: readonly SweepPoint[];
  /** True when no point failed or drifted from the baseline beyond the tolerance. */
  readonly stable: boolean;
}
|
|
34
|
+
|
|
35
|
+
// ---------------------------------------------------------------------------
|
|
36
|
+
// Dot-path config override
|
|
37
|
+
// ---------------------------------------------------------------------------
|
|
38
|
+
|
|
39
|
+
/**
 * Set a nested property by dot-path, returning a new object.
 *
 * Only the objects along the path are shallow-cloned; `root` and everything
 * off the path are left untouched and shared with the result. Intermediate
 * values that are missing, null, arrays, or primitives are replaced by a
 * fresh object. Array indices via `[n]` are not supported.
 *
 * @param root  Object to derive the result from (never mutated).
 * @param path  Dot-separated keys; empty segments are ignored.
 * @param value Value stored at the final key.
 * @returns A modified copy, or `root` itself when `path` has no segments.
 * @throws TypeError if a segment is `__proto__`: assigning through it would
 *         rewire the clone's prototype instead of setting an own property.
 */
function setPath(
  root: Record<string, unknown>,
  path: string,
  value: unknown,
): Record<string, unknown> {
  const parts = path.split(".").filter((p) => p.length > 0);
  if (parts.length === 0) return root;

  if (parts.includes("__proto__")) {
    throw new TypeError(`Unsafe config path segment "__proto__" in "${path}"`);
  }

  const clone: Record<string, unknown> = { ...root };
  let cursor: Record<string, unknown> = clone;
  const lastIndex = parts.length - 1;

  for (let i = 0; i < lastIndex; i++) {
    const key = parts[i]!;
    const child = cursor[key];
    // Clone plain-object children so the caller's config is never mutated.
    const childObj: Record<string, unknown> =
      child !== null && typeof child === "object" && !Array.isArray(child)
        ? { ...(child as Record<string, unknown>) }
        : {};
    cursor[key] = childObj;
    cursor = childObj;
  }

  cursor[parts[lastIndex]!] = value;
  return clone;
}
|
|
63
|
+
|
|
64
|
+
// ---------------------------------------------------------------------------
|
|
65
|
+
// Stats extraction
|
|
66
|
+
// ---------------------------------------------------------------------------
|
|
67
|
+
|
|
68
|
+
/**
 * Flatten a pipeline result into the numeric summary stored on a sweep point:
 * the five counters read from `result.stats` plus the number of scored pairs.
 */
function statsFrom(result: ReturnType<typeof runDedupePipeline>): Record<string, number> {
  const { totalRecords, totalClusters, matchedRecords, uniqueRecords, matchRate } =
    result.stats;
  const scoredPairs = result.scoredPairs.length;
  return {
    totalRecords,
    totalClusters,
    matchedRecords,
    uniqueRecords,
    matchRate,
    scoredPairs,
  };
}
|
|
78
|
+
|
|
79
|
+
// ---------------------------------------------------------------------------
|
|
80
|
+
// Cartesian product of sweep values
|
|
81
|
+
// ---------------------------------------------------------------------------
|
|
82
|
+
|
|
83
|
+
/**
 * Expand sweep parameters into every combination of their values.
 *
 * Each combination is an object keyed by the parameter's dot-path. The first
 * parameter varies slowest. No parameters yields no combinations, and so does
 * any parameter with an empty value list.
 */
function cartesianPoints(
  params: readonly SweepParam[],
): Readonly<Record<string, unknown>>[] {
  if (params.length === 0) return [];

  const seed: Record<string, unknown>[] = [{}];
  return params.reduce(
    (partials, { path, values }) =>
      partials.flatMap((partial) =>
        values.map((candidate): Record<string, unknown> => ({ ...partial, [path]: candidate })),
      ),
    seed,
  );
}
|
|
99
|
+
|
|
100
|
+
// ---------------------------------------------------------------------------
|
|
101
|
+
// runSensitivity
|
|
102
|
+
// ---------------------------------------------------------------------------
|
|
103
|
+
|
|
104
|
+
/**
 * Run a parameter sweep.
 *
 * Each point in the Cartesian product of `params` is applied to
 * `baselineConfig`, the dedupe pipeline runs, and the resulting clusters are
 * compared against the baseline via `compareClusters`. The `stable` flag is
 * true only when every point ran, produced a finite TWI, and that TWI is
 * within 0.05 of 1.0.
 *
 * Per-point errors (including failures while building the point's config) are
 * caught and stored on the point so that partial results are preserved. A
 * failure of the baseline run itself is not caught and propagates.
 *
 * @param rows           Records to deduplicate on every run.
 * @param baselineConfig Config used for the reference run and as the base for overrides.
 * @param params         Parameters to sweep; an empty list yields no points.
 * @returns Baseline point, one point per combination, and the stability flag.
 */
export function runSensitivity(
  rows: readonly Row[],
  baselineConfig: GoldenMatchConfig,
  params: readonly SweepParam[],
): SensitivityResult {
  // Largest |1 - TWI| that still counts as "same clustering as the baseline".
  const twiTolerance = 0.05;

  // Baseline run
  const baselineRun = runDedupePipeline(rows, baselineConfig);
  const baseline: SweepPoint = {
    params: {},
    stats: statsFrom(baselineRun),
    twi: 1.0,
  };

  const points: SweepPoint[] = [];
  let stable = true;

  for (const combo of cartesianPoints(params)) {
    try {
      // Built inside the try so a bad path is recorded on the point instead
      // of aborting the whole sweep.
      let cfg: GoldenMatchConfig = baselineConfig;
      for (const [path, value] of Object.entries(combo)) {
        cfg = setPath(
          cfg as Record<string, unknown>,
          path,
          value,
        ) as GoldenMatchConfig;
      }

      const runResult = runDedupePipeline(rows, cfg);

      let twi: number | undefined;
      try {
        twi = compareClusters(baselineRun.clusters, runResult.clusters).twi;
      } catch (err) {
        // The run itself succeeded; keep its stats and only drop the TWI.
        // eslint-disable-next-line no-console
        console.warn(
          `TWI comparison failed for sweep point ${JSON.stringify(combo)}: ${
            err instanceof Error ? err.message : String(err)
          }`,
        );
        twi = undefined;
      }

      // NaN compares false against any threshold, so it must be rejected
      // explicitly or an undefined comparison would read as "stable".
      if (
        twi === undefined ||
        !Number.isFinite(twi) ||
        Math.abs(1 - twi) > twiTolerance
      ) {
        stable = false;
      }

      points.push({
        params: combo,
        stats: statsFrom(runResult),
        ...(twi !== undefined ? { twi } : {}),
      });
    } catch (err) {
      stable = false;
      points.push({
        params: combo,
        stats: {},
        error: err instanceof Error ? err.message : String(err),
      });
    }
  }

  return { baseline, points, stable };
}
|
|
174
|
+
|
|
175
|
+
// ---------------------------------------------------------------------------
|
|
176
|
+
// stabilityReport
|
|
177
|
+
// ---------------------------------------------------------------------------
|
|
178
|
+
|
|
179
|
+
/**
 * Render a sensitivity result as plain text: a four-line header (title,
 * baseline stats, point count, stability verdict) followed by one line per
 * sweep point with its params, TWI, cluster count and any error.
 */
export function stabilityReport(result: SensitivityResult): string {
  const header: string[] = [
    "Sensitivity sweep:",
    ` Baseline: ${JSON.stringify(result.baseline.stats)}`,
    ` Points: ${result.points.length}`,
    ` Stable: ${result.stable ? "yes" : "no"}`,
  ];

  const pointLines = result.points.map((point) => {
    const twiText = point.twi === undefined ? "n/a" : point.twi.toFixed(4);
    const clusterText = point.stats["totalClusters"] ?? "?";
    const errorSuffix = point.error === undefined ? "" : ` error=${point.error}`;
    return ` - params=${JSON.stringify(point.params)} twi=${twiText} clusters=${clusterText}${errorSuffix}`;
  });

  return [...header, ...pointLines].join("\n");
}
|
|
@@ -0,0 +1,279 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* standardize.ts — Data standardization for GoldenMatch-JS.
|
|
3
|
+
* Edge-safe: no `node:` imports, pure TypeScript only.
|
|
4
|
+
*
|
|
5
|
+
* Ports standardization from goldenmatch/core/standardize.py.
|
|
6
|
+
* These are data cleaning transforms applied to columns before matching.
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
import type { Row } from "./types.js";
|
|
10
|
+
|
|
11
|
+
// ---------------------------------------------------------------------------
|
|
12
|
+
// Address abbreviations (USPS standard)
|
|
13
|
+
// ---------------------------------------------------------------------------
|
|
14
|
+
|
|
15
|
+
/**
 * Lowercase word or phrase -> abbreviation used when standardizing addresses.
 * Keys must stay lowercase: lookups lowercase the input before matching.
 */
const ADDRESS_ABBREVIATIONS: Readonly<Record<string, string>> = {
  // Street suffixes
  street: "St",
  avenue: "Ave",
  boulevard: "Blvd",
  drive: "Dr",
  lane: "Ln",
  road: "Rd",
  court: "Ct",
  place: "Pl",
  circle: "Cir",
  terrace: "Ter",
  highway: "Hwy",
  parkway: "Pkwy",
  expressway: "Expy",
  freeway: "Fwy",
  trail: "Trl",
  way: "Way",

  // Directionals
  north: "N",
  south: "S",
  east: "E",
  west: "W",
  northeast: "NE",
  northwest: "NW",
  southeast: "SE",
  southwest: "SW",

  // Secondary unit designators
  apartment: "Apt",
  suite: "Ste",
  building: "Bldg",
  floor: "Fl",
  room: "Rm",
  unit: "Unit",
  department: "Dept",

  // Multi-word phrases
  "post office box": "PO Box",
  "p.o. box": "PO Box",
  "po box": "PO Box",
};
|
|
52
|
+
|
|
53
|
+
// ---------------------------------------------------------------------------
|
|
54
|
+
// Individual standardizer functions
|
|
55
|
+
// ---------------------------------------------------------------------------
|
|
56
|
+
|
|
57
|
+
/**
 * Normalize an email address: trim, lowercase, and apply a minimal shape check.
 *
 * The value must contain an "@", and the text after the last "@" must contain
 * a ".". Anything else is treated as invalid.
 *
 * @returns The normalized address, or null when the check fails.
 */
function stdEmail(value: string): string | null {
  const normalized = value.trim().toLowerCase();
  const atIndex = normalized.lastIndexOf("@");
  // Also covers the empty string, which has no "@".
  if (atIndex === -1) return null;
  const domain = normalized.slice(atIndex + 1);
  return domain.includes(".") ? normalized : null;
}
|
|
68
|
+
|
|
69
|
+
/**
 * Convert a name to proper case: first letter of each word upper, rest lower.
 *
 * Runs of whitespace collapse to one space, and each hyphen-separated part of
 * a word is capitalized on its own (mary-jane -> Mary-Jane).
 *
 * @returns The re-cased name, or null when the input is blank.
 */
function stdNameProper(value: string): string | null {
  const trimmed = value.trim();
  if (trimmed.length === 0) return null;

  const capitalize = (segment: string): string =>
    segment.charAt(0).toUpperCase() + segment.slice(1).toLowerCase();

  const words = trimmed.split(/\s+/);
  return words
    .map((word) => word.split("-").map(capitalize).join("-"))
    .join(" ");
}
|
|
91
|
+
|
|
92
|
+
/**
 * Upper-case a name after trimming it and collapsing internal whitespace runs
 * to single spaces.
 *
 * @returns The upper-cased name, or null when the input is blank.
 */
function stdNameUpper(value: string): string | null {
  const collapsed = value.trim().split(/\s+/).join(" ");
  if (collapsed === "") return null;
  return collapsed.toUpperCase();
}
|
|
99
|
+
|
|
100
|
+
/**
 * Lower-case a name after trimming it and collapsing internal whitespace runs
 * to single spaces.
 *
 * @returns The lower-cased name, or null when the input is blank.
 */
function stdNameLower(value: string): string | null {
  const words = value.trim().split(/\s+/);
  const collapsed = words.join(" ");
  if (collapsed.length === 0) return null;
  return collapsed.toLowerCase();
}
|
|
107
|
+
|
|
108
|
+
/**
 * Reduce a phone number to its digits.
 *
 * An 11-digit number starting with "1" has that leading US country code
 * removed. Results shorter than 7 digits are rejected.
 *
 * @returns The digit string, or null when fewer than 7 digits remain.
 */
function stdPhone(value: string): string | null {
  const allDigits = (value.match(/\d/g) ?? []).join("");
  const hasUsPrefix = allDigits.length === 11 && allDigits.charAt(0) === "1";
  const national = hasUsPrefix ? allDigits.substring(1) : allDigits;
  return national.length >= 7 ? national : null;
}
|
|
123
|
+
|
|
124
|
+
/**
 * Standardize a ZIP code to its 5-digit form.
 *
 * Surrounding whitespace is ignored, then everything from the first hyphen or
 * whitespace character onward (e.g. a ZIP+4 suffix) is dropped. The remaining
 * digits are cut to five and left-padded with zeros.
 *
 * @returns The 5-character ZIP, or null when the leading part has no digits.
 */
function stdZip5(value: string): string | null {
  // Trim first: otherwise a leading space makes the first token empty and a
  // perfectly good ZIP is rejected.
  const leading = value.trim().split(/[-\s]/)[0] ?? "";
  const digits = leading.replace(/\D/g, "");
  if (!digits) return null;
  return digits.slice(0, 5).padStart(5, "0");
}
|
|
134
|
+
|
|
135
|
+
/**
 * Capitalize the first character of a word and lower-case the remainder.
 * The empty string is returned unchanged.
 */
function titleCase(word: string): string {
  const head = word.substring(0, 1);
  const tail = word.substring(1);
  return head.toUpperCase() + tail.toLowerCase();
}
|
|
142
|
+
|
|
143
|
+
/**
 * Standardize an address: normalize whitespace, replace known words and
 * phrases with their abbreviation, and title-case everything else.
 *
 * Matching is greedy: at each position a three-word phrase is tried first
 * (e.g. "post office box"), then a two-word phrase (e.g. "po box"), then the
 * single word with trailing "." / "," removed for the lookup. A word replaced
 * by an abbreviation loses that trailing punctuation.
 *
 * @returns The standardized address, or null when the input is blank.
 */
function stdAddress(value: string): string | null {
  const trimmed = value.trim();
  if (!trimmed) return null;

  // Own-property lookup only: the `in` operator would also match inherited
  // names such as "constructor" and emit a function instead of a string.
  const lookup = (key: string): string | undefined =>
    Object.prototype.hasOwnProperty.call(ADDRESS_ABBREVIATIONS, key)
      ? ADDRESS_ABBREVIATIONS[key]
      : undefined;

  const words = trimmed.split(/\s+/);
  // Longest phrase first so "post office box" wins over shorter matches.
  const phraseLengths = [3, 2];
  const result: string[] = [];

  let i = 0;
  while (i < words.length) {
    let consumed = 0;
    for (const span of phraseLengths) {
      if (i + span > words.length) continue;
      const phrase = words.slice(i, i + span).join(" ").toLowerCase();
      const phraseAbbr = lookup(phrase);
      if (phraseAbbr !== undefined) {
        result.push(phraseAbbr);
        consumed = span;
        break;
      }
    }
    if (consumed > 0) {
      i += consumed;
      continue;
    }

    const word = words[i]!;
    // Strip trailing punctuation for lookup
    const wordAbbr = lookup(word.toLowerCase().replace(/[.,]+$/, ""));
    result.push(wordAbbr ?? titleCase(word));
    i += 1;
  }

  return result.join(" ");
}
|
|
175
|
+
|
|
176
|
+
/**
 * Trim a state value and upper-case it.
 *
 * @returns The upper-cased value, or null when the input is blank.
 */
function stdState(value: string): string | null {
  const trimmed = value.trim();
  return trimmed === "" ? null : trimmed.toUpperCase();
}
|
|
183
|
+
|
|
184
|
+
/**
 * Remove leading and trailing whitespace.
 *
 * @returns The trimmed value, or null when nothing is left.
 */
function stdStrip(value: string): string | null {
  const trimmed = value.trim();
  return trimmed.length > 0 ? trimmed : null;
}
|
|
191
|
+
|
|
192
|
+
/**
 * Collapse every run of whitespace to a single space and drop leading and
 * trailing whitespace.
 *
 * @returns The cleaned value, or null when nothing is left.
 */
function stdTrimWhitespace(value: string): string | null {
  const cleaned = value.trim().split(/\s+/).join(" ");
  if (cleaned.length === 0) return null;
  return cleaned;
}
|
|
199
|
+
|
|
200
|
+
// ---------------------------------------------------------------------------
|
|
201
|
+
// Standardizer registry
|
|
202
|
+
// ---------------------------------------------------------------------------
|
|
203
|
+
|
|
204
|
+
/**
 * Registry of standardizers, keyed by the name used in standardization rules.
 * Each entry takes the raw string and returns the cleaned value, or null when
 * the value is blank or invalid for that standardizer.
 */
const STANDARDIZERS: Readonly<Record<string, (value: string) => string | null>> = {
  // Contact fields
  email: stdEmail,
  phone: stdPhone,

  // Name casing
  name_proper: stdNameProper,
  name_upper: stdNameUpper,
  name_lower: stdNameLower,

  // Postal fields
  address: stdAddress,
  state: stdState,
  zip5: stdZip5,

  // Generic whitespace cleanup
  strip: stdStrip,
  trim_whitespace: stdTrimWhitespace,
};
|
|
217
|
+
|
|
218
|
+
// ---------------------------------------------------------------------------
|
|
219
|
+
// applyStandardizer — dispatch to the correct standardizer
|
|
220
|
+
// ---------------------------------------------------------------------------
|
|
221
|
+
|
|
222
|
+
/**
 * Apply a named standardizer to a string value.
 *
 * @param value Raw string to clean.
 * @param name  Key of the standardizer in the registry.
 * @returns The standardized value; an empty string when the standardizer
 *          rejects the value (returns null).
 * @throws Error if the standardizer name is not recognized.
 */
export function applyStandardizer(value: string, name: string): string {
  // Own-property check: a plain index lookup would also resolve inherited
  // names such as "constructor" or "toString" and call them as standardizers.
  const fn = Object.prototype.hasOwnProperty.call(STANDARDIZERS, name)
    ? STANDARDIZERS[name]
    : undefined;
  if (!fn) {
    const available = Object.keys(STANDARDIZERS).sort().join(", ");
    throw new Error(
      `Unknown standardizer: "${name}". Available: ${available}`,
    );
  }
  // Standardizers may return null for invalid data; treat as empty string
  // so downstream pipeline can decide how to handle it.
  return fn(value) ?? "";
}
|
|
240
|
+
|
|
241
|
+
// ---------------------------------------------------------------------------
|
|
242
|
+
// applyStandardization — apply rules to all rows
|
|
243
|
+
// ---------------------------------------------------------------------------
|
|
244
|
+
|
|
245
|
+
/**
 * Apply standardization rules to rows.
 *
 * `rules` maps a column name to the standardizer names to run on it, in
 * order, each one receiving the previous one's output:
 *
 * ```ts
 * applyStandardization(rows, {
 *   email: ["email"],
 *   first_name: ["strip", "name_proper"],
 *   phone: ["phone"],
 * });
 * ```
 *
 * Every input row yields a new object; the originals are not mutated.
 * Columns whose value is null or undefined are left as-is. Any other value is
 * converted with `String()` before the first standardizer runs, so a listed
 * column always ends up holding a string.
 *
 * @throws Error if a rule names an unknown standardizer (raised by `applyStandardizer`).
 */
export function applyStandardization(
  rows: readonly Row[],
  rules: Readonly<Record<string, readonly string[]>>,
): Row[] {
  return rows.map((row) => {
    const cleaned: Record<string, unknown> = { ...row };
    for (const [column, pipeline] of Object.entries(rules)) {
      const raw = row[column];
      if (raw == null) continue;
      cleaned[column] = pipeline.reduce(
        (text, stdName) => applyStandardizer(text, stdName),
        String(raw),
      );
    }
    return cleaned as Row;
  });
}
|
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* streaming.ts — Incremental single-record match + cluster updates.
|
|
3
|
+
* Edge-safe: no Node.js imports, pure TypeScript only.
|
|
4
|
+
*
|
|
5
|
+
* Ports goldenmatch/core/streaming.py.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
import type { Row, MatchkeyConfig, ClusterInfo } from "./types.js";
|
|
9
|
+
import { addToCluster } from "./cluster.js";
|
|
10
|
+
import { matchOne } from "./match-one.js";
|
|
11
|
+
|
|
12
|
+
// ---------------------------------------------------------------------------
|
|
13
|
+
// Types
|
|
14
|
+
// ---------------------------------------------------------------------------
|
|
15
|
+
|
|
16
|
+
/** Outcome of adding one record to a `StreamProcessor`. */
export interface StreamAddResult {
  /** Id under which the record was stored. */
  readonly rowId: number;
  /** Ids of previously seen records the new record matched. */
  readonly matchedIds: readonly number[];
  /** Id of the cluster now containing the record, or -1 if none was found. */
  readonly clusterId: number;
}
|
|
21
|
+
|
|
22
|
+
/** Settings for a `StreamProcessor`. */
export interface StreamProcessorConfig {
  /** Matchkey used to compare each new record against stored ones. */
  readonly matchkey: MatchkeyConfig;
  /** Threshold applied to the matchkey, unless the matchkey is of type "exact". */
  readonly threshold: number;
  /** Size limit passed to `addToCluster`; defaults to 100 when omitted. */
  readonly maxClusterSize?: number;
}
|
|
27
|
+
|
|
28
|
+
/** Point-in-time view of a `StreamProcessor`. */
export interface StreamSnapshot {
  /** Clusters keyed by cluster id. */
  readonly clusters: ReadonlyMap<number, ClusterInfo>;
  /** Stored records in the order their ids were first seen. */
  readonly rows: readonly Row[];
}
|
|
32
|
+
|
|
33
|
+
// ---------------------------------------------------------------------------
|
|
34
|
+
// StreamProcessor
|
|
35
|
+
// ---------------------------------------------------------------------------
|
|
36
|
+
|
|
37
|
+
/**
 * Incremental record processor.
 *
 * On each `add()` the new row is matched against all previously seen rows
 * using `matchOne`, then folded into the cluster map via `addToCluster`.
 */
export class StreamProcessor {
  /** Live cluster state, updated in place by `addToCluster`. */
  private readonly clusters = new Map<number, ClusterInfo>();
  /** Latest stored version of each record, keyed by row id. */
  private readonly rowsById = new Map<number, Row>();
  /** Row ids in first-seen order. */
  private readonly order: number[] = [];
  /** Next id handed to a record that arrives without a usable one. */
  private nextId = 0;

  constructor(private readonly config: StreamProcessorConfig) {}

  /**
   * Add a new record and return match + cluster info.
   *
   * A finite numeric `__row_id__` on the row is used as its id. Any other
   * value (missing, string, NaN, ...) is replaced by the next auto-assigned
   * id on a copy of the row; the caller's object is never modified. Adding a
   * row whose id is already stored replaces the stored row and excludes the
   * old version from matching.
   *
   * @returns The row id used, the ids it matched, and the id of the cluster
   *          containing it (-1 if no cluster lists the row).
   */
  add(row: Row): StreamAddResult {
    const supplied = row["__row_id__"];
    // Validate rather than cast: a string id would turn `rowId + 1` into
    // string concatenation, and NaN never equals itself so it would defeat
    // the self-exclusion below.
    const rowId =
      typeof supplied === "number" && Number.isFinite(supplied)
        ? supplied
        : this.nextId;
    const record: Row =
      supplied === rowId ? row : { ...row, __row_id__: rowId };

    // Keep auto-assigned ids ahead of every id seen so far.
    if (rowId >= this.nextId) {
      this.nextId = rowId + 1;
    }

    // Matchkey with threshold override (exact variant has no threshold).
    const base = this.config.matchkey;
    const mk: MatchkeyConfig =
      base.type === "exact"
        ? base
        : { ...base, threshold: this.config.threshold };

    // Build snapshot of existing rows (exclude self if duplicate id)
    const existing: Row[] = [];
    for (const id of this.order) {
      if (id === rowId) continue;
      const stored = this.rowsById.get(id);
      if (stored !== undefined) existing.push(stored);
    }

    const hits = matchOne(record, existing, mk);
    const matchPairs: [number, number][] = hits.map((h) => [h.rowId, h.score]);

    addToCluster(
      rowId,
      matchPairs,
      this.clusters,
      this.config.maxClusterSize ?? 100,
    );

    // Register the row; `order` only records an id the first time it is seen.
    if (!this.rowsById.has(rowId)) {
      this.order.push(rowId);
    }
    this.rowsById.set(rowId, record);

    // Find the cluster id the record landed in
    let landedCid = -1;
    for (const [cid, info] of this.clusters) {
      if (info.members.includes(rowId)) {
        landedCid = cid;
        break;
      }
    }

    return {
      rowId,
      matchedIds: hits.map((h) => h.rowId),
      clusterId: landedCid,
    };
  }

  /** Number of records ingested. */
  get size(): number {
    return this.rowsById.size;
  }

  /**
   * Snapshot of current cluster state + rows.
   *
   * The returned map and array are fresh containers, so later `add()` calls
   * do not add or remove entries in them. The `ClusterInfo` and row objects
   * inside are the same instances the processor holds, not copies.
   */
  snapshot(): StreamSnapshot {
    const rows: Row[] = [];
    for (const id of this.order) {
      const stored = this.rowsById.get(id);
      if (stored !== undefined) rows.push(stored);
    }
    return { clusters: new Map(this.clusters), rows };
  }
}
|