goldenmatch 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162) hide show
  1. package/README.md +140 -0
  2. package/dist/cli.cjs +6079 -0
  3. package/dist/cli.cjs.map +1 -0
  4. package/dist/cli.d.cts +1 -0
  5. package/dist/cli.d.ts +1 -0
  6. package/dist/cli.js +6076 -0
  7. package/dist/cli.js.map +1 -0
  8. package/dist/core/index.cjs +8449 -0
  9. package/dist/core/index.cjs.map +1 -0
  10. package/dist/core/index.d.cts +1972 -0
  11. package/dist/core/index.d.ts +1972 -0
  12. package/dist/core/index.js +8318 -0
  13. package/dist/core/index.js.map +1 -0
  14. package/dist/index.cjs +8449 -0
  15. package/dist/index.cjs.map +1 -0
  16. package/dist/index.d.cts +2 -0
  17. package/dist/index.d.ts +2 -0
  18. package/dist/index.js +8318 -0
  19. package/dist/index.js.map +1 -0
  20. package/dist/node/backends/score-worker.cjs +934 -0
  21. package/dist/node/backends/score-worker.cjs.map +1 -0
  22. package/dist/node/backends/score-worker.d.cts +14 -0
  23. package/dist/node/backends/score-worker.d.ts +14 -0
  24. package/dist/node/backends/score-worker.js +932 -0
  25. package/dist/node/backends/score-worker.js.map +1 -0
  26. package/dist/node/index.cjs +11430 -0
  27. package/dist/node/index.cjs.map +1 -0
  28. package/dist/node/index.d.cts +554 -0
  29. package/dist/node/index.d.ts +554 -0
  30. package/dist/node/index.js +11277 -0
  31. package/dist/node/index.js.map +1 -0
  32. package/dist/types-DhUdX5Rc.d.cts +304 -0
  33. package/dist/types-DhUdX5Rc.d.ts +304 -0
  34. package/examples/01-basic-dedupe.ts +60 -0
  35. package/examples/02-match-two-datasets.ts +48 -0
  36. package/examples/03-csv-file-pipeline.ts +62 -0
  37. package/examples/04-string-scoring.ts +63 -0
  38. package/examples/05-custom-config.ts +94 -0
  39. package/examples/06-probabilistic-fs.ts +72 -0
  40. package/examples/07-pprl-privacy.ts +76 -0
  41. package/examples/08-streaming.ts +79 -0
  42. package/examples/09-llm-scorer.ts +79 -0
  43. package/examples/10-explain.ts +60 -0
  44. package/examples/11-evaluate.ts +61 -0
  45. package/examples/README.md +53 -0
  46. package/package.json +66 -0
  47. package/src/cli.ts +372 -0
  48. package/src/core/ann-blocker.ts +593 -0
  49. package/src/core/api.ts +220 -0
  50. package/src/core/autoconfig.ts +363 -0
  51. package/src/core/autofix.ts +102 -0
  52. package/src/core/blocker.ts +655 -0
  53. package/src/core/cluster.ts +699 -0
  54. package/src/core/compare-clusters.ts +176 -0
  55. package/src/core/config/loader.ts +869 -0
  56. package/src/core/cross-encoder.ts +614 -0
  57. package/src/core/data.ts +430 -0
  58. package/src/core/domain.ts +277 -0
  59. package/src/core/embedder.ts +562 -0
  60. package/src/core/evaluate.ts +156 -0
  61. package/src/core/explain.ts +352 -0
  62. package/src/core/golden.ts +524 -0
  63. package/src/core/graph-er.ts +371 -0
  64. package/src/core/index.ts +314 -0
  65. package/src/core/ingest.ts +112 -0
  66. package/src/core/learned-blocking.ts +305 -0
  67. package/src/core/lineage.ts +221 -0
  68. package/src/core/llm/budget.ts +258 -0
  69. package/src/core/llm/cluster.ts +542 -0
  70. package/src/core/llm/scorer.ts +396 -0
  71. package/src/core/match-one.ts +95 -0
  72. package/src/core/matchkey.ts +97 -0
  73. package/src/core/memory/corrections.ts +179 -0
  74. package/src/core/memory/learner.ts +218 -0
  75. package/src/core/memory/store.ts +114 -0
  76. package/src/core/pipeline.ts +366 -0
  77. package/src/core/pprl/protocol.ts +216 -0
  78. package/src/core/probabilistic.ts +511 -0
  79. package/src/core/profiler.ts +212 -0
  80. package/src/core/quality.ts +197 -0
  81. package/src/core/review-queue.ts +177 -0
  82. package/src/core/scorer.ts +855 -0
  83. package/src/core/sensitivity.ts +196 -0
  84. package/src/core/standardize.ts +279 -0
  85. package/src/core/streaming.ts +128 -0
  86. package/src/core/transforms.ts +599 -0
  87. package/src/core/types.ts +570 -0
  88. package/src/core/validate.ts +243 -0
  89. package/src/index.ts +8 -0
  90. package/src/node/a2a/server.ts +470 -0
  91. package/src/node/api/server.ts +412 -0
  92. package/src/node/backends/duckdb.ts +130 -0
  93. package/src/node/backends/score-worker.ts +41 -0
  94. package/src/node/backends/workers.ts +212 -0
  95. package/src/node/config-file.ts +66 -0
  96. package/src/node/connectors/base.ts +57 -0
  97. package/src/node/connectors/bigquery.ts +61 -0
  98. package/src/node/connectors/databricks.ts +69 -0
  99. package/src/node/connectors/file.ts +350 -0
  100. package/src/node/connectors/hubspot.ts +62 -0
  101. package/src/node/connectors/index.ts +43 -0
  102. package/src/node/connectors/salesforce.ts +93 -0
  103. package/src/node/connectors/snowflake.ts +73 -0
  104. package/src/node/db/postgres.ts +173 -0
  105. package/src/node/db/sync.ts +103 -0
  106. package/src/node/dedupe-file.ts +156 -0
  107. package/src/node/index.ts +89 -0
  108. package/src/node/mcp/server.ts +940 -0
  109. package/src/node/tui/app.ts +756 -0
  110. package/src/node/tui/index.ts +6 -0
  111. package/src/node/tui/widgets.ts +128 -0
  112. package/tests/parity/scorer-ground-truth.test.ts +118 -0
  113. package/tests/smoke.test.ts +46 -0
  114. package/tests/unit/a2a-server.test.ts +175 -0
  115. package/tests/unit/ann-blocker.test.ts +117 -0
  116. package/tests/unit/api-server.test.ts +239 -0
  117. package/tests/unit/api.test.ts +77 -0
  118. package/tests/unit/autoconfig.test.ts +103 -0
  119. package/tests/unit/autofix.test.ts +71 -0
  120. package/tests/unit/blocker.test.ts +164 -0
  121. package/tests/unit/buildBlocksAsync.test.ts +63 -0
  122. package/tests/unit/cluster.test.ts +213 -0
  123. package/tests/unit/compare-clusters.test.ts +42 -0
  124. package/tests/unit/config-loader.test.ts +301 -0
  125. package/tests/unit/connectors-base.test.ts +48 -0
  126. package/tests/unit/cross-encoder-model.test.ts +198 -0
  127. package/tests/unit/cross-encoder.test.ts +173 -0
  128. package/tests/unit/db-connectors.test.ts +37 -0
  129. package/tests/unit/domain.test.ts +80 -0
  130. package/tests/unit/embedder.test.ts +151 -0
  131. package/tests/unit/evaluate.test.ts +85 -0
  132. package/tests/unit/explain.test.ts +73 -0
  133. package/tests/unit/golden.test.ts +97 -0
  134. package/tests/unit/graph-er.test.ts +173 -0
  135. package/tests/unit/hnsw-ann.test.ts +283 -0
  136. package/tests/unit/hubspot-connector.test.ts +118 -0
  137. package/tests/unit/ingest.test.ts +97 -0
  138. package/tests/unit/learned-blocking.test.ts +134 -0
  139. package/tests/unit/lineage.test.ts +135 -0
  140. package/tests/unit/match-one.test.ts +129 -0
  141. package/tests/unit/matchkey.test.ts +97 -0
  142. package/tests/unit/mcp-server.test.ts +183 -0
  143. package/tests/unit/memory.test.ts +119 -0
  144. package/tests/unit/pipeline.test.ts +118 -0
  145. package/tests/unit/pprl-protocol.test.ts +381 -0
  146. package/tests/unit/probabilistic.test.ts +494 -0
  147. package/tests/unit/profiler.test.ts +68 -0
  148. package/tests/unit/review-queue.test.ts +68 -0
  149. package/tests/unit/salesforce-connector.test.ts +148 -0
  150. package/tests/unit/scorer.test.ts +301 -0
  151. package/tests/unit/sensitivity.test.ts +154 -0
  152. package/tests/unit/standardize.test.ts +84 -0
  153. package/tests/unit/streaming.test.ts +82 -0
  154. package/tests/unit/transforms.test.ts +208 -0
  155. package/tests/unit/tui-widgets.test.ts +42 -0
  156. package/tests/unit/tui.test.ts +24 -0
  157. package/tests/unit/validate.test.ts +145 -0
  158. package/tests/unit/workers-parallel.test.ts +99 -0
  159. package/tests/unit/workers.test.ts +74 -0
  160. package/tsconfig.json +25 -0
  161. package/tsup.config.ts +37 -0
  162. package/vitest.config.ts +11 -0
@@ -0,0 +1,196 @@
1
+ /**
2
+ * sensitivity.ts — Parameter sweep engine for GoldenMatch.
3
+ * Edge-safe: no Node.js imports, pure TypeScript only.
4
+ *
5
+ * Ports goldenmatch/core/sensitivity.py.
6
+ */
7
+
8
+ import type { Row, GoldenMatchConfig } from "./types.js";
9
+ import { runDedupePipeline } from "./pipeline.js";
10
+ import { compareClusters } from "./compare-clusters.js";
11
+
12
+ // ---------------------------------------------------------------------------
13
+ // Types
14
+ // ---------------------------------------------------------------------------
15
+
16
/** One swept parameter: where in the config to write, and which values to try there. */
export interface SweepParam {
  /** Dot-separated location in the config, e.g. "threshold" or "blocking.maxBlockSize". */
  readonly path: string;
  /** Candidate values written at `path`; combined with other params as a Cartesian product. */
  readonly values: ReadonlyArray<unknown>;
}
21
+
22
/** Outcome of one pipeline run within a sweep (or of the baseline run). */
export interface SweepPoint {
  /** Config overrides applied for this run, keyed by dot-path; empty for the baseline. */
  readonly params: Readonly<Record<string, unknown>>;
  /** Numeric run statistics; empty when the run itself threw. */
  readonly stats: Readonly<Record<string, number>>;
  /** TWI of this run's clusters versus the baseline; absent when it could not be computed. */
  readonly twi?: number;
  /** Error message captured when the pipeline run failed for this point. */
  readonly error?: string;
}
28
+
29
/** Full result of a parameter sweep. */
export interface SensitivityResult {
  /** The run made with the unmodified baseline config. */
  readonly baseline: SweepPoint;
  /** One entry per parameter combination, in sweep order. */
  readonly points: ReadonlyArray<SweepPoint>;
  /** True only when every point ran and stayed close to the baseline clustering. */
  readonly stable: boolean;
}
34
+
35
+ // ---------------------------------------------------------------------------
36
+ // Dot-path config override
37
+ // ---------------------------------------------------------------------------
38
+
39
/**
 * Write `value` at a dot-separated `path` without mutating `root`.
 *
 * Every object on the way to the target is shallow-copied; a missing,
 * non-object, or array intermediate is replaced by a fresh empty object.
 * Empty path segments are ignored, and a path with no segments returns
 * `root` itself. Array indices (`[n]`) are not supported.
 */
function setPath(
  root: Record<string, unknown>,
  path: string,
  value: unknown,
): Record<string, unknown> {
  const segments = path.split(".").filter((segment) => segment !== "");
  const leafKey = segments.pop();
  if (leafKey === undefined) return root;

  const copy: Record<string, unknown> = { ...root };
  let node = copy;
  // `segments` now holds only the parent keys leading to the leaf.
  for (const segment of segments) {
    const existing = node[segment];
    const isPlainContainer =
      typeof existing === "object" && existing !== null && !Array.isArray(existing);
    const nextNode: Record<string, unknown> = isPlainContainer
      ? { ...(existing as Record<string, unknown>) }
      : {};
    node[segment] = nextNode;
    node = nextNode;
  }
  node[leafKey] = value;
  return copy;
}
63
+
64
+ // ---------------------------------------------------------------------------
65
+ // Stats extraction
66
+ // ---------------------------------------------------------------------------
67
+
68
/**
 * Flatten a pipeline result into the numeric stats recorded on a sweep point:
 * the five counters/rates from `result.stats` plus the number of scored pairs.
 */
function statsFrom(result: ReturnType<typeof runDedupePipeline>): Record<string, number> {
  const { stats, scoredPairs } = result;
  const { totalRecords, totalClusters, matchedRecords, uniqueRecords, matchRate } = stats;
  return {
    totalRecords,
    totalClusters,
    matchedRecords,
    uniqueRecords,
    matchRate,
    scoredPairs: scoredPairs.length,
  };
}
78
+
79
+ // ---------------------------------------------------------------------------
80
+ // Cartesian product of sweep values
81
+ // ---------------------------------------------------------------------------
82
+
83
/**
 * Expand sweep params into every combination of their values.
 *
 * Each combination is an object keyed by the param's `path`. Earlier params
 * vary slowest. No params yields no combinations, and so does any param with
 * an empty `values` list.
 */
function cartesianPoints(
  params: readonly SweepParam[],
): Readonly<Record<string, unknown>>[] {
  if (params.length === 0) return [];
  return params.reduce<Record<string, unknown>[]>(
    (partials, { path, values }) =>
      partials.flatMap((partial) =>
        values.map((candidate) => ({ ...partial, [path]: candidate })),
      ),
    [{}],
  );
}
99
+
100
+ // ---------------------------------------------------------------------------
101
+ // runSensitivity
102
+ // ---------------------------------------------------------------------------
103
+
104
/**
 * Run a parameter sweep.
 *
 * The pipeline is first run once with `baselineConfig`. Then, for each point
 * in the Cartesian product of `params`, the overrides are applied to a copy
 * of the baseline config, the dedupe pipeline is re-run, and the resulting
 * clusters are compared against the baseline clusters via `compareClusters`.
 *
 * `stable` is true only when every point ran successfully and produced a
 * finite TWI within 0.05 of 1.0. A missing, NaN, or infinite TWI counts as
 * unstable.
 *
 * Per-point errors are caught and stored on the point so that partial
 * results are preserved. A failure of the baseline run itself is not caught.
 *
 * @param rows Input records, passed unchanged to every pipeline run.
 * @param baselineConfig Config used for the baseline run and as the base for overrides.
 * @param params Parameters to sweep; an empty list yields no points.
 * @returns The baseline point, one point per combination, and the stability flag.
 */
export function runSensitivity(
  rows: readonly Row[],
  baselineConfig: GoldenMatchConfig,
  params: readonly SweepParam[],
): SensitivityResult {
  // Baseline run: by definition it agrees perfectly with itself (twi = 1).
  const baselineRun = runDedupePipeline(rows, baselineConfig);
  const baseline: SweepPoint = {
    params: {},
    stats: statsFrom(baselineRun),
    twi: 1.0,
  };

  const points: SweepPoint[] = [];
  let stable = true;

  for (const combo of cartesianPoints(params)) {
    // Apply every override of this combination on top of the baseline config.
    let cfg: GoldenMatchConfig = baselineConfig;
    for (const [path, value] of Object.entries(combo)) {
      cfg = setPath(
        cfg as Record<string, unknown>,
        path,
        value,
      ) as GoldenMatchConfig;
    }

    try {
      const runResult = runDedupePipeline(rows, cfg);
      let twi: number | undefined;
      try {
        twi = compareClusters(baselineRun.clusters, runResult.clusters).twi;
      } catch (err) {
        // A failed comparison keeps the run's stats but leaves the point without a TWI.
        // eslint-disable-next-line no-console
        console.warn(
          `TWI comparison failed for sweep point ${JSON.stringify(combo)}: ${
            err instanceof Error ? err.message : String(err)
          }`,
        );
        twi = undefined;
      }
      // NaN would make the distance comparison false, so non-finite values
      // are rejected explicitly instead of being counted as stable.
      if (twi === undefined || !Number.isFinite(twi) || Math.abs(1 - twi) > 0.05) {
        stable = false;
      }
      points.push({
        params: combo,
        stats: statsFrom(runResult),
        ...(twi !== undefined ? { twi } : {}),
      });
    } catch (err) {
      // The pipeline itself failed for this combination: record the error and move on.
      stable = false;
      points.push({
        params: combo,
        stats: {},
        error: err instanceof Error ? err.message : String(err),
      });
    }
  }

  return { baseline, points, stable };
}
174
+
175
+ // ---------------------------------------------------------------------------
176
+ // stabilityReport
177
+ // ---------------------------------------------------------------------------
178
+
179
/**
 * Format a sensitivity result as plain text: a four-line header (title,
 * baseline stats as JSON, point count, stable yes/no) followed by one line
 * per sweep point showing its params, TWI, cluster count and any error.
 */
export function stabilityReport(result: SensitivityResult): string {
  const header = [
    "Sensitivity sweep:",
    ` Baseline: ${JSON.stringify(result.baseline.stats)}`,
    ` Points: ${result.points.length}`,
    ` Stable: ${result.stable ? "yes" : "no"}`,
  ];
  const pointLines = result.points.map((point) => {
    const twiText = point.twi === undefined ? "n/a" : point.twi.toFixed(4);
    const errorSuffix = point.error === undefined ? "" : ` error=${point.error}`;
    const clusterCount = point.stats["totalClusters"] ?? "?";
    return ` - params=${JSON.stringify(point.params)} twi=${twiText} clusters=${clusterCount}${errorSuffix}`;
  });
  return [...header, ...pointLines].join("\n");
}
@@ -0,0 +1,279 @@
1
+ /**
2
+ * standardize.ts — Data standardization for GoldenMatch-JS.
3
+ * Edge-safe: no `node:` imports, pure TypeScript only.
4
+ *
5
+ * Ports standardization from goldenmatch/core/standardize.py.
6
+ * These are data cleaning transforms applied to columns before matching.
7
+ */
8
+
9
+ import type { Row } from "./types.js";
10
+
11
+ // ---------------------------------------------------------------------------
12
+ // Address abbreviations (USPS standard)
13
+ // ---------------------------------------------------------------------------
14
+
15
/**
 * Lookup table from a lowercase word or phrase to its abbreviated form.
 * Keys must stay lowercase: callers lowercase the input before looking it up.
 */
const ADDRESS_ABBREVIATIONS: Readonly<Record<string, string>> = {
  // Street suffixes
  street: "St",
  avenue: "Ave",
  boulevard: "Blvd",
  drive: "Dr",
  lane: "Ln",
  road: "Rd",
  court: "Ct",
  place: "Pl",
  circle: "Cir",
  terrace: "Ter",
  highway: "Hwy",
  parkway: "Pkwy",
  expressway: "Expy",
  freeway: "Fwy",
  trail: "Trl",
  way: "Way",

  // Directionals
  north: "N",
  south: "S",
  east: "E",
  west: "W",
  northeast: "NE",
  northwest: "NW",
  southeast: "SE",
  southwest: "SW",

  // Secondary unit designators
  apartment: "Apt",
  suite: "Ste",
  building: "Bldg",
  floor: "Fl",
  room: "Rm",
  unit: "Unit",
  department: "Dept",

  // Post office box spellings (multi-word keys)
  "post office box": "PO Box",
  "p.o. box": "PO Box",
  "po box": "PO Box",
};
52
+
53
+ // ---------------------------------------------------------------------------
54
+ // Individual standardizer functions
55
+ // ---------------------------------------------------------------------------
56
+
57
/**
 * Normalize an email address by trimming and lowercasing it.
 *
 * Only a minimal shape check is applied: the text must contain "@" and the
 * part after the last "@" must contain a ".".
 *
 * @returns The normalized address, or null when the shape check fails.
 */
function stdEmail(value: string): string | null {
  const normalized = value.trim().toLowerCase();
  const lastAt = normalized.lastIndexOf("@");
  if (lastAt === -1) return null;
  const domainPart = normalized.substring(lastAt + 1);
  return domainPart.includes(".") ? normalized : null;
}
68
+
69
/**
 * Convert a name to Title Case.
 *
 * Whitespace runs collapse to a single space, and each hyphen-separated piece
 * of a word is capitalized on its own: "mary-jane" becomes "Mary-Jane".
 *
 * @returns The title-cased name, or null when the input is blank.
 */
function stdNameProper(value: string): string | null {
  const trimmed = value.trim();
  if (trimmed.length === 0) return null;

  // Upper-case the first character of a piece and lower-case the remainder.
  const capitalize = (piece: string): string =>
    piece === "" ? "" : piece.charAt(0).toUpperCase() + piece.substring(1).toLowerCase();

  return trimmed
    .split(/\s+/)
    .map((word) => word.split("-").map(capitalize).join("-"))
    .join(" ");
}
91
+
92
/**
 * Convert a name to UPPER CASE, trimming it and collapsing whitespace runs.
 *
 * @returns The upper-cased name, or null when the input is blank.
 */
function stdNameUpper(value: string): string | null {
  const collapsed = value.trim().replace(/\s+/g, " ");
  return collapsed === "" ? null : collapsed.toUpperCase();
}
99
+
100
/**
 * Convert a name to lower case, trimming it and collapsing whitespace runs.
 *
 * @returns The lower-cased name, or null when the input is blank.
 */
function stdNameLower(value: string): string | null {
  const collapsed = value.trim().replace(/\s+/g, " ");
  if (collapsed === "") return null;
  return collapsed.toLowerCase();
}
107
+
108
/**
 * Reduce a phone number to its digits.
 *
 * An 11-digit number starting with "1" has that leading country code removed.
 *
 * @returns The digit string, or null when fewer than 7 digits remain.
 */
function stdPhone(value: string): string | null {
  const allDigits = value.replace(/[^0-9]/g, "");
  const hasUsCountryCode = allDigits.length === 11 && allDigits.charAt(0) === "1";
  const national = hasUsCountryCode ? allDigits.substring(1) : allDigits;
  // An empty digit string also falls below the minimum length.
  return national.length >= 7 ? national : null;
}
123
+
124
/**
 * Standardize a ZIP code to its first five digits, left-padded with zeros.
 *
 * Surrounding whitespace is ignored. Only the part before the first hyphen
 * or space is considered, so "12345-6789" and "12345 6789" both give "12345".
 *
 * @param value Raw ZIP / ZIP+4 text.
 * @returns A five-character digit string, or null when that leading part
 *   contains no digits.
 */
function stdZip5(value: string): string | null {
  // Trim first: a leading space would otherwise make the space-split below
  // yield an empty first piece and reject an otherwise valid ZIP.
  const [beforeHyphen = ""] = value.trim().split("-");
  const [head = ""] = beforeHyphen.split(" ");
  const digits = head.replace(/\D/g, "");
  if (!digits) return null;
  return digits.slice(0, 5).padStart(5, "0");
}
134
+
135
/**
 * Capitalize the first character of a word and lower-case the rest.
 * An empty string is returned unchanged.
 */
function titleCase(word: string): string {
  if (word.length === 0) return word;
  const head = word.charAt(0);
  const tail = word.substring(1);
  return head.toUpperCase() + tail.toLowerCase();
}
142
+
143
/**
 * Standardize a street address: collapse whitespace, replace known words and
 * phrases with their abbreviation, and title-case everything else.
 *
 * Multi-word phrases are tried longest first (three words, then two), so
 * "post office box" is abbreviated as a unit before its words are considered
 * individually. Single words are looked up with trailing "." / "," removed;
 * when that lookup hits, the punctuation is not carried into the output.
 *
 * Lookups only consider the table's own keys. Words that happen to name
 * inherited object members (e.g. "constructor", "toString") are therefore
 * title-cased like any other word.
 *
 * @param value Raw address text.
 * @returns The standardized address, or null when `value` is blank.
 */
function stdAddress(value: string): string | null {
  const collapsed = value.trim().replace(/\s+/g, " ");
  if (!collapsed) return null;

  // Own-property lookup: `key in table` would also match inherited names.
  const abbreviationFor = (key: string): string | undefined =>
    Object.prototype.hasOwnProperty.call(ADDRESS_ABBREVIATIONS, key)
      ? ADDRESS_ABBREVIATIONS[key]
      : undefined;

  // The longest phrase key in the table has three words ("post office box").
  const maxPhraseWords = 3;
  const words = collapsed.split(" ");
  const result: string[] = [];
  let i = 0;
  while (i < words.length) {
    // Try multi-word phrases starting at position i, longest first.
    let consumed = 0;
    for (let span = Math.min(maxPhraseWords, words.length - i); span >= 2; span--) {
      const phrase = words.slice(i, i + span).join(" ").toLowerCase();
      const phraseAbbr = abbreviationFor(phrase);
      if (phraseAbbr !== undefined) {
        result.push(phraseAbbr);
        consumed = span;
        break;
      }
    }
    if (consumed > 0) {
      i += consumed;
      continue;
    }

    // Single word: strip trailing punctuation for the lookup only.
    const word = words[i]!;
    const wordAbbr = abbreviationFor(word.toLowerCase().replace(/[.,]+$/, ""));
    result.push(wordAbbr ?? titleCase(word));
    i += 1;
  }
  return result.join(" ");
}
175
+
176
/**
 * Trim a state value and convert it to upper case.
 *
 * @returns The upper-cased value, or null when the input is blank.
 */
function stdState(value: string): string | null {
  const trimmed = value.trim();
  return trimmed.length > 0 ? trimmed.toUpperCase() : null;
}
183
+
184
/**
 * Remove leading and trailing whitespace.
 *
 * @returns The trimmed text, or null when nothing remains.
 */
function stdStrip(value: string): string | null {
  const trimmed = value.trim();
  return trimmed === "" ? null : trimmed;
}
191
+
192
/**
 * Collapse every run of whitespace to a single space and trim the ends.
 *
 * @returns The cleaned text, or null when nothing remains.
 */
function stdTrimWhitespace(value: string): string | null {
  const cleaned = value.trim().replace(/\s+/g, " ");
  return cleaned === "" ? null : cleaned;
}
199
+
200
+ // ---------------------------------------------------------------------------
201
+ // Standardizer registry
202
+ // ---------------------------------------------------------------------------
203
+
204
/**
 * Registry of standardizers, keyed by the name used in standardization rules.
 * Each entry takes the raw string and returns the cleaned value, or null when
 * the input is rejected.
 */
const STANDARDIZERS: Readonly<Record<string, (value: string) => string | null>> = {
  // Contact fields
  email: stdEmail,

  // Name casing variants
  name_proper: stdNameProper,
  name_upper: stdNameUpper,
  name_lower: stdNameLower,

  // Contact / location fields
  phone: stdPhone,
  zip5: stdZip5,
  address: stdAddress,
  state: stdState,

  // Generic whitespace cleanup
  strip: stdStrip,
  trim_whitespace: stdTrimWhitespace,
};
217
+
218
+ // ---------------------------------------------------------------------------
219
+ // applyStandardizer — dispatch to the correct standardizer
220
+ // ---------------------------------------------------------------------------
221
+
222
/**
 * Apply a named standardizer to a string value.
 *
 * The name is resolved against the registry's own keys only. Names that
 * merely exist on every object (e.g. "constructor", "toString") are treated
 * as unknown instead of invoking an inherited function.
 *
 * @param value Raw string to standardize.
 * @param name Registry key of the standardizer to run.
 * @returns The standardized string; the empty string when the standardizer
 *   rejected the value (returned null), so callers decide how to handle it.
 * @throws Error if the standardizer name is not recognized.
 */
export function applyStandardizer(value: string, name: string): string {
  const fn = Object.prototype.hasOwnProperty.call(STANDARDIZERS, name)
    ? STANDARDIZERS[name]
    : undefined;
  if (fn === undefined) {
    const available = Object.keys(STANDARDIZERS).sort().join(", ");
    throw new Error(
      `Unknown standardizer: "${name}". Available: ${available}`,
    );
  }
  // Standardizers may return null for invalid data; treat as empty string
  // so the downstream pipeline can decide how to handle it.
  return fn(value) ?? "";
}
240
+
241
+ // ---------------------------------------------------------------------------
242
+ // applyStandardization — apply rules to all rows
243
+ // ---------------------------------------------------------------------------
244
+
245
/**
 * Run standardization rules over a set of rows.
 *
 * `rules` maps a column name to the standardizer names to apply to it, in
 * order, each one receiving the previous one's output:
 *
 * ```ts
 * applyStandardization(rows, {
 *   email: ["email"],
 *   first_name: ["strip", "name_proper"],
 *   phone: ["phone"],
 * });
 * ```
 *
 * The input rows are not modified; a shallow copy of each row is returned.
 * A column whose value is null or undefined is left untouched. Any other
 * value is converted with `String(...)` before the first standardizer runs,
 * and the column ends up holding a string.
 */
export function applyStandardization(
  rows: readonly Row[],
  rules: Readonly<Record<string, readonly string[]>>,
): Row[] {
  return rows.map((row) => {
    const updated: Record<string, unknown> = { ...row };
    for (const [column, standardizers] of Object.entries(rules)) {
      const current = row[column];
      if (current === null || current === undefined) continue;
      updated[column] = standardizers.reduce(
        (text, stdName) => applyStandardizer(text, stdName),
        String(current),
      );
    }
    return updated as Row;
  });
}
@@ -0,0 +1,128 @@
1
+ /**
2
+ * streaming.ts — Incremental single-record match + cluster updates.
3
+ * Edge-safe: no Node.js imports, pure TypeScript only.
4
+ *
5
+ * Ports goldenmatch/core/streaming.py.
6
+ */
7
+
8
+ import type { Row, MatchkeyConfig, ClusterInfo } from "./types.js";
9
+ import { addToCluster } from "./cluster.js";
10
+ import { matchOne } from "./match-one.js";
11
+
12
+ // ---------------------------------------------------------------------------
13
+ // Types
14
+ // ---------------------------------------------------------------------------
15
+
16
/** What `StreamProcessor.add()` reports for a single ingested record. */
export interface StreamAddResult {
  /** Id under which the record was stored. */
  readonly rowId: number;
  /** Ids of previously seen records that the new record matched. */
  readonly matchedIds: ReadonlyArray<number>;
  /** Id of the cluster containing the record, or -1 when none was found. */
  readonly clusterId: number;
}
21
+
22
/** Settings for a `StreamProcessor`. */
export interface StreamProcessorConfig {
  /** Matchkey used to compare each new record with the records already seen. */
  readonly matchkey: MatchkeyConfig;
  /** Threshold written onto the matchkey for every match; not applied to "exact" matchkeys. */
  readonly threshold: number;
  /** Cluster size limit handed to the cluster update; 100 when omitted. */
  readonly maxClusterSize?: number;
}
27
+
28
/** Point-in-time view of a `StreamProcessor`'s state. */
export interface StreamSnapshot {
  /** Current clusters, keyed by cluster id. */
  readonly clusters: ReadonlyMap<number, ClusterInfo>;
  /** Ingested records in first-insertion order. */
  readonly rows: ReadonlyArray<Row>;
}
32
+
33
+ // ---------------------------------------------------------------------------
34
+ // StreamProcessor
35
+ // ---------------------------------------------------------------------------
36
+
37
/**
 * Incremental record processor.
 *
 * On each `add()` the new row is matched against all previously seen rows
 * using `matchOne`, then folded into the cluster map via `addToCluster`.
 */
export class StreamProcessor {
  /** Cluster state, updated in place by `addToCluster`. */
  private readonly clusters = new Map<number, ClusterInfo>();
  /** Stored records keyed by row id. */
  private readonly rowsById = new Map<number, Row>();
  /** Row ids in first-insertion order. */
  private readonly order: number[] = [];
  /** Next id to assign to a record that arrives without a numeric `__row_id__`. */
  private nextId = 0;

  constructor(private readonly config: StreamProcessorConfig) {}

  /**
   * Add a new record and return match + cluster info.
   *
   * A numeric `__row_id__` on the row is used as its id. Any other value
   * (missing, null, or a non-number such as a string) is replaced by the next
   * auto-assigned id on a copy of the row, so ids are always numbers and the
   * id counter can never be turned into a string by `rowId + 1`.
   * Re-adding an existing id replaces the stored row and keeps its position.
   *
   * @param row Record to ingest; it is not mutated.
   * @returns The id used, the ids of matched earlier records, and the id of
   *   the cluster containing the record (-1 if no cluster lists it).
   */
  add(row: Row): StreamAddResult {
    const providedId = row["__row_id__"];
    const rowId = typeof providedId === "number" ? providedId : this.nextId;
    const record: Row =
      typeof providedId === "number" ? row : { ...row, __row_id__: rowId };
    // Keep auto-assigned ids ahead of every id seen so far.
    if (rowId >= this.nextId) {
      this.nextId = rowId + 1;
    }

    // Matchkey with threshold override (exact variant has no threshold).
    const base = this.config.matchkey;
    const mk: MatchkeyConfig =
      base.type === "exact"
        ? base
        : { ...base, threshold: this.config.threshold };

    // Candidates: every stored row except one that shares this id.
    const existing: Row[] = [];
    for (const id of this.order) {
      if (id === rowId) continue;
      const stored = this.rowsById.get(id);
      if (stored !== undefined) existing.push(stored);
    }

    const hits = matchOne(record, existing, mk);
    const matchPairs: [number, number][] = hits.map((h) => [h.rowId, h.score]);

    addToCluster(
      rowId,
      matchPairs,
      this.clusters,
      this.config.maxClusterSize ?? 100,
    );

    // Register the row; only a first-time id is appended to the order list.
    if (!this.rowsById.has(rowId)) {
      this.order.push(rowId);
    }
    this.rowsById.set(rowId, record);

    // Find the cluster id the record landed in.
    let landedCid = -1;
    for (const [cid, info] of this.clusters) {
      if (info.members.includes(rowId)) {
        landedCid = cid;
        break;
      }
    }

    return {
      rowId,
      matchedIds: hits.map((h) => h.rowId),
      clusterId: landedCid,
    };
  }

  /** Number of records ingested (distinct row ids). */
  get size(): number {
    return this.rowsById.size;
  }

  /**
   * Snapshot of current cluster state + rows.
   *
   * The returned map and array are new containers, so callers cannot add or
   * remove entries in the processor's own collections. The `ClusterInfo` and
   * row objects inside them are shared with the processor, not copied.
   */
  snapshot(): StreamSnapshot {
    const rows: Row[] = [];
    for (const id of this.order) {
      const stored = this.rowsById.get(id);
      if (stored !== undefined) rows.push(stored);
    }
    // Shallow copy: new Map, same ClusterInfo values.
    const clusters = new Map<number, ClusterInfo>(this.clusters);
    return { clusters, rows };
  }
}