goldenmatch 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162) hide show
  1. package/README.md +140 -0
  2. package/dist/cli.cjs +6079 -0
  3. package/dist/cli.cjs.map +1 -0
  4. package/dist/cli.d.cts +1 -0
  5. package/dist/cli.d.ts +1 -0
  6. package/dist/cli.js +6076 -0
  7. package/dist/cli.js.map +1 -0
  8. package/dist/core/index.cjs +8449 -0
  9. package/dist/core/index.cjs.map +1 -0
  10. package/dist/core/index.d.cts +1972 -0
  11. package/dist/core/index.d.ts +1972 -0
  12. package/dist/core/index.js +8318 -0
  13. package/dist/core/index.js.map +1 -0
  14. package/dist/index.cjs +8449 -0
  15. package/dist/index.cjs.map +1 -0
  16. package/dist/index.d.cts +2 -0
  17. package/dist/index.d.ts +2 -0
  18. package/dist/index.js +8318 -0
  19. package/dist/index.js.map +1 -0
  20. package/dist/node/backends/score-worker.cjs +934 -0
  21. package/dist/node/backends/score-worker.cjs.map +1 -0
  22. package/dist/node/backends/score-worker.d.cts +14 -0
  23. package/dist/node/backends/score-worker.d.ts +14 -0
  24. package/dist/node/backends/score-worker.js +932 -0
  25. package/dist/node/backends/score-worker.js.map +1 -0
  26. package/dist/node/index.cjs +11430 -0
  27. package/dist/node/index.cjs.map +1 -0
  28. package/dist/node/index.d.cts +554 -0
  29. package/dist/node/index.d.ts +554 -0
  30. package/dist/node/index.js +11277 -0
  31. package/dist/node/index.js.map +1 -0
  32. package/dist/types-DhUdX5Rc.d.cts +304 -0
  33. package/dist/types-DhUdX5Rc.d.ts +304 -0
  34. package/examples/01-basic-dedupe.ts +60 -0
  35. package/examples/02-match-two-datasets.ts +48 -0
  36. package/examples/03-csv-file-pipeline.ts +62 -0
  37. package/examples/04-string-scoring.ts +63 -0
  38. package/examples/05-custom-config.ts +94 -0
  39. package/examples/06-probabilistic-fs.ts +72 -0
  40. package/examples/07-pprl-privacy.ts +76 -0
  41. package/examples/08-streaming.ts +79 -0
  42. package/examples/09-llm-scorer.ts +79 -0
  43. package/examples/10-explain.ts +60 -0
  44. package/examples/11-evaluate.ts +61 -0
  45. package/examples/README.md +53 -0
  46. package/package.json +66 -0
  47. package/src/cli.ts +372 -0
  48. package/src/core/ann-blocker.ts +593 -0
  49. package/src/core/api.ts +220 -0
  50. package/src/core/autoconfig.ts +363 -0
  51. package/src/core/autofix.ts +102 -0
  52. package/src/core/blocker.ts +655 -0
  53. package/src/core/cluster.ts +699 -0
  54. package/src/core/compare-clusters.ts +176 -0
  55. package/src/core/config/loader.ts +869 -0
  56. package/src/core/cross-encoder.ts +614 -0
  57. package/src/core/data.ts +430 -0
  58. package/src/core/domain.ts +277 -0
  59. package/src/core/embedder.ts +562 -0
  60. package/src/core/evaluate.ts +156 -0
  61. package/src/core/explain.ts +352 -0
  62. package/src/core/golden.ts +524 -0
  63. package/src/core/graph-er.ts +371 -0
  64. package/src/core/index.ts +314 -0
  65. package/src/core/ingest.ts +112 -0
  66. package/src/core/learned-blocking.ts +305 -0
  67. package/src/core/lineage.ts +221 -0
  68. package/src/core/llm/budget.ts +258 -0
  69. package/src/core/llm/cluster.ts +542 -0
  70. package/src/core/llm/scorer.ts +396 -0
  71. package/src/core/match-one.ts +95 -0
  72. package/src/core/matchkey.ts +97 -0
  73. package/src/core/memory/corrections.ts +179 -0
  74. package/src/core/memory/learner.ts +218 -0
  75. package/src/core/memory/store.ts +114 -0
  76. package/src/core/pipeline.ts +366 -0
  77. package/src/core/pprl/protocol.ts +216 -0
  78. package/src/core/probabilistic.ts +511 -0
  79. package/src/core/profiler.ts +212 -0
  80. package/src/core/quality.ts +197 -0
  81. package/src/core/review-queue.ts +177 -0
  82. package/src/core/scorer.ts +855 -0
  83. package/src/core/sensitivity.ts +196 -0
  84. package/src/core/standardize.ts +279 -0
  85. package/src/core/streaming.ts +128 -0
  86. package/src/core/transforms.ts +599 -0
  87. package/src/core/types.ts +570 -0
  88. package/src/core/validate.ts +243 -0
  89. package/src/index.ts +8 -0
  90. package/src/node/a2a/server.ts +470 -0
  91. package/src/node/api/server.ts +412 -0
  92. package/src/node/backends/duckdb.ts +130 -0
  93. package/src/node/backends/score-worker.ts +41 -0
  94. package/src/node/backends/workers.ts +212 -0
  95. package/src/node/config-file.ts +66 -0
  96. package/src/node/connectors/base.ts +57 -0
  97. package/src/node/connectors/bigquery.ts +61 -0
  98. package/src/node/connectors/databricks.ts +69 -0
  99. package/src/node/connectors/file.ts +350 -0
  100. package/src/node/connectors/hubspot.ts +62 -0
  101. package/src/node/connectors/index.ts +43 -0
  102. package/src/node/connectors/salesforce.ts +93 -0
  103. package/src/node/connectors/snowflake.ts +73 -0
  104. package/src/node/db/postgres.ts +173 -0
  105. package/src/node/db/sync.ts +103 -0
  106. package/src/node/dedupe-file.ts +156 -0
  107. package/src/node/index.ts +89 -0
  108. package/src/node/mcp/server.ts +940 -0
  109. package/src/node/tui/app.ts +756 -0
  110. package/src/node/tui/index.ts +6 -0
  111. package/src/node/tui/widgets.ts +128 -0
  112. package/tests/parity/scorer-ground-truth.test.ts +118 -0
  113. package/tests/smoke.test.ts +46 -0
  114. package/tests/unit/a2a-server.test.ts +175 -0
  115. package/tests/unit/ann-blocker.test.ts +117 -0
  116. package/tests/unit/api-server.test.ts +239 -0
  117. package/tests/unit/api.test.ts +77 -0
  118. package/tests/unit/autoconfig.test.ts +103 -0
  119. package/tests/unit/autofix.test.ts +71 -0
  120. package/tests/unit/blocker.test.ts +164 -0
  121. package/tests/unit/buildBlocksAsync.test.ts +63 -0
  122. package/tests/unit/cluster.test.ts +213 -0
  123. package/tests/unit/compare-clusters.test.ts +42 -0
  124. package/tests/unit/config-loader.test.ts +301 -0
  125. package/tests/unit/connectors-base.test.ts +48 -0
  126. package/tests/unit/cross-encoder-model.test.ts +198 -0
  127. package/tests/unit/cross-encoder.test.ts +173 -0
  128. package/tests/unit/db-connectors.test.ts +37 -0
  129. package/tests/unit/domain.test.ts +80 -0
  130. package/tests/unit/embedder.test.ts +151 -0
  131. package/tests/unit/evaluate.test.ts +85 -0
  132. package/tests/unit/explain.test.ts +73 -0
  133. package/tests/unit/golden.test.ts +97 -0
  134. package/tests/unit/graph-er.test.ts +173 -0
  135. package/tests/unit/hnsw-ann.test.ts +283 -0
  136. package/tests/unit/hubspot-connector.test.ts +118 -0
  137. package/tests/unit/ingest.test.ts +97 -0
  138. package/tests/unit/learned-blocking.test.ts +134 -0
  139. package/tests/unit/lineage.test.ts +135 -0
  140. package/tests/unit/match-one.test.ts +129 -0
  141. package/tests/unit/matchkey.test.ts +97 -0
  142. package/tests/unit/mcp-server.test.ts +183 -0
  143. package/tests/unit/memory.test.ts +119 -0
  144. package/tests/unit/pipeline.test.ts +118 -0
  145. package/tests/unit/pprl-protocol.test.ts +381 -0
  146. package/tests/unit/probabilistic.test.ts +494 -0
  147. package/tests/unit/profiler.test.ts +68 -0
  148. package/tests/unit/review-queue.test.ts +68 -0
  149. package/tests/unit/salesforce-connector.test.ts +148 -0
  150. package/tests/unit/scorer.test.ts +301 -0
  151. package/tests/unit/sensitivity.test.ts +154 -0
  152. package/tests/unit/standardize.test.ts +84 -0
  153. package/tests/unit/streaming.test.ts +82 -0
  154. package/tests/unit/transforms.test.ts +208 -0
  155. package/tests/unit/tui-widgets.test.ts +42 -0
  156. package/tests/unit/tui.test.ts +24 -0
  157. package/tests/unit/validate.test.ts +145 -0
  158. package/tests/unit/workers-parallel.test.ts +99 -0
  159. package/tests/unit/workers.test.ts +74 -0
  160. package/tsconfig.json +25 -0
  161. package/tsup.config.ts +37 -0
  162. package/vitest.config.ts +11 -0
@@ -0,0 +1,220 @@
1
+ /**
2
+ * api.ts — High-level API functions wrapping the pipeline.
3
+ * Edge-safe: no `node:` imports, pure TypeScript only.
4
+ *
5
+ * Ports goldenmatch/_api.py convenience functions.
6
+ */
7
+
8
+ import type {
9
+ Row,
10
+ GoldenMatchConfig,
11
+ DedupeResult,
12
+ MatchResult,
13
+ MatchkeyConfig,
14
+ MatchkeyField,
15
+ BlockingKeyConfig,
16
+ } from "./types.js";
17
+ import {
18
+ makeConfig,
19
+ makeMatchkeyConfig,
20
+ makeMatchkeyField,
21
+ makeBlockingConfig,
22
+ } from "./types.js";
23
+ import { runDedupePipeline, runMatchPipeline } from "./pipeline.js";
24
+ import { scoreField, scorePair, asString } from "./scorer.js";
25
+ import { applyTransforms } from "./transforms.js";
26
+
27
+ // ---------------------------------------------------------------------------
28
+ // Options
29
+ // ---------------------------------------------------------------------------
30
+
/**
 * Shorthand options accepted by `dedupe()` and `match()`.
 *
 * When `config` is present it is used as-is and every other field is ignored.
 */
export interface DedupeOptions {
  /** Full config object -- takes precedence over shorthand options. */
  readonly config?: GoldenMatchConfig;
  /** Columns for exact matching (creates one exact matchkey per column). */
  readonly exact?: readonly string[];
  /**
   * Columns for fuzzy matching, each mapped to a number.
   *
   * NOTE: the shorthand config builder in this file passes each number as
   * the field's `weight` inside a single combined weighted matchkey; it is
   * not applied as a per-field cut-off. The accept/reject cut-off for that
   * matchkey is `threshold`.
   */
  readonly fuzzy?: Readonly<Record<string, number>>;
  /** Blocking key columns ("lowercase" and "strip" transforms applied). */
  readonly blocking?: readonly string[];
  /** Overall fuzzy threshold (default 0.85). */
  readonly threshold?: number;
  /** Enable LLM scorer for borderline pairs. Requires OPENAI_API_KEY or ANTHROPIC_API_KEY in env. */
  readonly llmScorer?: boolean;
}
45
+
46
+ // ---------------------------------------------------------------------------
47
+ // Build config from shorthand options
48
+ // ---------------------------------------------------------------------------
49
+
/**
 * Expand the shorthand `DedupeOptions` into a full `GoldenMatchConfig`.
 *
 * - A supplied `options.config` is returned untouched.
 * - Each `exact` column becomes its own exact matchkey named `exact_<col>`.
 * - All `fuzzy` columns are folded into one weighted matchkey named
 *   `fuzzy_combined`; the number given for a column is used as its weight.
 * - Each `blocking` column becomes a single-field blocking key.
 *
 * @param options - Shorthand options; may be omitted entirely.
 * @returns The config produced by `makeConfig` (or `options.config`).
 */
function buildConfigFromOptions(options?: DedupeOptions): GoldenMatchConfig {
  if (options?.config) return options.config;

  const threshold = options?.threshold ?? 0.85;
  const matchkeys: MatchkeyConfig[] = [];

  // Exact matchkeys come first, one per requested column.
  const exactKeys = (options?.exact ?? []).map((col) =>
    makeMatchkeyConfig({
      name: `exact_${col}`,
      type: "exact",
      fields: [
        makeMatchkeyField({
          field: col,
          transforms: ["lowercase", "strip"],
          scorer: "exact",
        }),
      ],
    }),
  );
  matchkeys.push(...exactKeys);

  // Then a single weighted matchkey covering every fuzzy column (if any).
  const weightedFields: MatchkeyField[] = Object.entries(
    options?.fuzzy ?? {},
  ).map(([col, weight]) =>
    makeMatchkeyField({
      field: col,
      transforms: ["lowercase", "strip"],
      scorer: "jaro_winkler",
      weight,
    }),
  );
  if (weightedFields.length > 0) {
    matchkeys.push(
      makeMatchkeyConfig({
        name: "fuzzy_combined",
        type: "weighted",
        fields: weightedFields,
        threshold,
      }),
    );
  }

  // Blocking: start from the factory default, replace it when columns are given.
  const blockingColumns = options?.blocking ?? [];
  let blocking = makeBlockingConfig();
  if (blockingColumns.length > 0) {
    const keys: BlockingKeyConfig[] = blockingColumns.map((col) => ({
      fields: [col],
      transforms: ["lowercase", "strip"],
    }));
    blocking = makeBlockingConfig({ keys });
  }

  const partial: Partial<GoldenMatchConfig> = { blocking, threshold };
  // Optional sections are attached through a loosely typed view of `partial`
  // so they are only present when actually requested.
  const loose = partial as Record<string, unknown>;
  if (matchkeys.length > 0) {
    loose.matchkeys = matchkeys;
  }
  if (options?.llmScorer) {
    loose.llmScorer = {
      enabled: true,
      autoThreshold: 0.9,
      candidateLo: 0.6,
      candidateHi: 0.9,
      batchSize: 10,
      maxWorkers: 4,
      mode: "pairwise",
    };
  }
  return makeConfig(partial);
}
128
+
129
+ // ---------------------------------------------------------------------------
130
+ // Public API: dedupe
131
+ // ---------------------------------------------------------------------------
132
+
/**
 * Deduplicate an array of row objects.
 *
 * The options are first expanded into a full config and the rows are then
 * handed to `runDedupePipeline`.
 *
 * Shorthand usage:
 * ```ts
 * const result = dedupe(rows, {
 *   exact: ["email"],
 *   fuzzy: { name: 0.85, address: 0.7 },
 *   blocking: ["zip"],
 *   threshold: 0.85,
 * });
 * ```
 *
 * Or provide a full config:
 * ```ts
 * const result = dedupe(rows, { config: myConfig });
 * ```
 *
 * @param rows - Rows to deduplicate.
 * @param options - Shorthand options or a full `config`.
 * @returns Whatever `runDedupePipeline` returns for the resolved config.
 */
export function dedupe(
  rows: readonly Row[],
  options?: DedupeOptions,
): DedupeResult {
  return runDedupePipeline(rows, buildConfigFromOptions(options));
}
158
+
159
+ // ---------------------------------------------------------------------------
160
+ // Public API: match
161
+ // ---------------------------------------------------------------------------
162
+
/**
 * Match target rows against reference rows.
 *
 * Accepts the same options as `dedupe()`; they are expanded into a full
 * config and passed, together with both row sets, to `runMatchPipeline`.
 *
 * @param target - Rows to look up.
 * @param reference - Rows to match against.
 * @param options - Shorthand options or a full `config`.
 * @returns Whatever `runMatchPipeline` returns for the resolved config.
 */
export function match(
  target: readonly Row[],
  reference: readonly Row[],
  options?: DedupeOptions,
): MatchResult {
  const resolved = buildConfigFromOptions(options);
  const outcome = runMatchPipeline(target, reference, resolved);
  return outcome;
}
176
+
177
+ // ---------------------------------------------------------------------------
178
+ // Public API: scoreStrings
179
+ // ---------------------------------------------------------------------------
180
+
/**
 * Score two strings using the specified scorer algorithm.
 *
 * Delegates to `scoreField`; when that yields `null` or `undefined` the
 * result is reported as 0.0 instead.
 *
 * @param a - First string.
 * @param b - Second string.
 * @param scorer - Scorer name (default: "jaro_winkler").
 *   Valid scorers: exact, jaro_winkler, levenshtein, token_sort,
 *   soundex_match, dice, jaccard, ensemble.
 * @returns Similarity score between 0.0 and 1.0.
 */
export function scoreStrings(
  a: string,
  b: string,
  scorer: string = "jaro_winkler",
): number {
  return scoreField(a, b, scorer) ?? 0.0;
}
199
+
200
+ // ---------------------------------------------------------------------------
201
+ // Public API: scorePairRecord
202
+ // ---------------------------------------------------------------------------
203
+
/**
 * Score a pair of row objects across the given fields.
 *
 * Thin public wrapper: the rows and field configs are forwarded unchanged to
 * `scorePair`, and its result is returned as-is.
 *
 * @param rowA - First row.
 * @param rowB - Second row.
 * @param fields - Field configs specifying which fields to compare,
 *   transforms to apply, scorer to use, and weight.
 * @returns Weighted similarity score between 0.0 and 1.0.
 */
export function scorePairRecord(
  rowA: Row,
  rowB: Row,
  fields: readonly MatchkeyField[],
): number {
  const pairScore = scorePair(rowA, rowB, fields);
  return pairScore;
}
@@ -0,0 +1,363 @@
1
+ /**
2
+ * autoconfig.ts — Auto-generate a GoldenMatch config from sample data.
3
+ * Edge-safe: no `node:` imports.
4
+ *
5
+ * Ports goldenmatch/core/autoconfig.py. Profiles the rows, classifies
6
+ * columns, and builds exact/weighted matchkeys + blocking config.
7
+ */
8
+
9
+ import type {
10
+ Row,
11
+ GoldenMatchConfig,
12
+ MatchkeyConfig,
13
+ MatchkeyField,
14
+ BlockingKeyConfig,
15
+ BlockingConfig,
16
+ } from "./types.js";
17
+ import {
18
+ makeConfig,
19
+ makeMatchkeyConfig,
20
+ makeMatchkeyField,
21
+ makeBlockingConfig,
22
+ makeGoldenRulesConfig,
23
+ } from "./types.js";
24
+ import { profileRows, type ColumnProfile, type DatasetProfile } from "./profiler.js";
25
+
26
+ // ---------------------------------------------------------------------------
27
+ // Options
28
+ // ---------------------------------------------------------------------------
29
+
/** Optional knobs accepted by `autoConfigureRows()` / `autoConfigure()`. */
export interface AutoconfigOptions {
  /**
   * LLM provider name.
   * NOTE(review): not read anywhere in this file -- confirm whether a caller
   * elsewhere consumes it.
   */
  readonly llmProvider?: string;
  /** When defined, forwarded as `llmAuto` to `makeConfig`. */
  readonly llmAuto?: boolean;
}
34
+
35
+ // ---------------------------------------------------------------------------
36
+ // Name-based classification patterns (authoritative over data profiling for
37
+ // some signals — matches Python's _DATE_PATTERNS / _GEO_PATTERNS behavior).
38
+ // ---------------------------------------------------------------------------
39
+
// Each list holds case-insensitive patterns that are tested against a column
// name; a column "matches" a list when any one pattern matches.

/** Column names that look like e-mail addresses. */
const EMAIL_NAME_PATTERNS = [
  /email/i,
  /e_mail/i,
  /e-mail/i,
];

/** Column names that look like phone numbers ("tel" not followed by "e"). */
const PHONE_NAME_PATTERNS = [
  /phone/i,
  /tel(?!e)/i,
  /mobile/i,
  /cell/i,
];

/** Column names that look like person names. */
const NAME_NAME_PATTERNS = [
  /name/i,
  /first/i,
  /last/i,
  /full_name/i,
  /surname/i,
];

/** Column names that look like postal codes. */
const ZIP_NAME_PATTERNS = [
  /zip/i,
  /postal/i,
  /postcode/i,
];

/** Column names that look like geographic attributes. */
const GEO_NAME_PATTERNS = [
  /^city/i,
  /city_desc/i,
  /^state/i,
  /state_cd/i,
  /county/i,
  /country/i,
  /^region/i,
  /province/i,
];

/** Column names that look like dates or timestamps. */
const DATE_NAME_PATTERNS = [
  /date/i,
  /created/i,
  /modified/i,
  /updated/i,
  /_at$/i,
  /birth/i,
  /dob/i,
];

/** Column names that look like record identifiers. */
const ID_NAME_PATTERNS = [
  /^id$/i,
  /_id$/i,
  /uuid/i,
  /guid/i,
];

// Public aliases: each export is the very same array as its *_NAME_PATTERNS
// counterpart, kept for consumers that want the spec-level constant names.
export const EMAIL_PATTERNS = EMAIL_NAME_PATTERNS;
export const PHONE_PATTERNS = PHONE_NAME_PATTERNS;
export const NAME_PATTERNS = NAME_NAME_PATTERNS;
export const ZIP_PATTERNS = ZIP_NAME_PATTERNS;
export const GEO_PATTERNS = GEO_NAME_PATTERNS;
export const DATE_PATTERNS = DATE_NAME_PATTERNS;
export const ID_PATTERNS = ID_NAME_PATTERNS;
73
+
/**
 * Report whether `name` is matched by at least one of `patterns`.
 *
 * @param name - Column name to test.
 * @param patterns - Patterns to try, in order; stops at the first match.
 * @returns `true` on the first matching pattern, `false` if none match
 *   (including when `patterns` is empty).
 */
function nameMatches(name: string, patterns: readonly RegExp[]): boolean {
  for (const pattern of patterns) {
    if (pattern.test(name)) {
      return true;
    }
  }
  return false;
}
77
+
78
+ // ---------------------------------------------------------------------------
79
+ // Column classification (authoritative: date > geo > name heuristics)
80
+ // ---------------------------------------------------------------------------
81
+
/** Semantic category assigned to a column by `classifyColumn`. */
type ClassifiedKind =
  | "email"
  | "phone"
  | "zip"
  | "geo"
  | "date"
  | "name"
  | "id"
  | "numeric"
  | "text";

/**
 * Kinds that can be recognised from the column name, in priority order.
 * Date comes first so that date-like columns never get classified as phones
 * by the profiler's value heuristic; geo comes second.
 */
const KIND_NAME_RULES: ReadonlyArray<
  readonly [ClassifiedKind, readonly RegExp[]]
> = [
  ["date", DATE_NAME_PATTERNS],
  ["geo", GEO_NAME_PATTERNS],
  ["email", EMAIL_NAME_PATTERNS],
  ["phone", PHONE_NAME_PATTERNS],
  ["zip", ZIP_NAME_PATTERNS],
  ["name", NAME_NAME_PATTERNS],
  ["id", ID_NAME_PATTERNS],
];

/**
 * Decide which kind of data a profiled column holds.
 *
 * The kinds in `KIND_NAME_RULES` are tried in order; a kind wins as soon as
 * either the column name matches its patterns or the profiler's
 * `inferredType` equals it. If none wins, the column is "numeric" when the
 * profiler says so and "text" otherwise.
 *
 * @param profile - Profile of the column (its `name` and `inferredType` are read).
 * @returns The first kind that applies.
 */
function classifyColumn(profile: ColumnProfile): ClassifiedKind {
  for (const [kind, patterns] of KIND_NAME_RULES) {
    if (nameMatches(profile.name, patterns) || profile.inferredType === kind) {
      return kind;
    }
  }
  return profile.inferredType === "numeric" ? "numeric" : "text";
}
122
+
123
+ // ---------------------------------------------------------------------------
124
+ // Heuristic builders
125
+ // ---------------------------------------------------------------------------
126
+
/**
 * Build one exact matchkey per identifier-like column.
 *
 * A column qualifies only when all of the following hold:
 * - it is classified as "email", "phone" or "id" (zip/geo are blocking
 *   signals, not identity claims; name/date/numeric/text are never exact keys);
 * - its null rate is not above 0.4;
 * - its cardinality ratio is not below 0.5.
 *
 * @param profiles - Column profiles, in column order.
 * @returns Matchkeys named `exact_<column>`, in the order the columns appear.
 */
function buildExactMatchkeys(
  profiles: readonly ColumnProfile[],
): MatchkeyConfig[] {
  // Normalisation applied before the exact comparison, per identifier kind.
  const transformsByKind: Record<"email" | "phone" | "id", readonly string[]> = {
    email: ["lowercase", "strip"],
    phone: ["digits_only"],
    id: ["strip"],
  };

  const matchkeys: MatchkeyConfig[] = [];
  profiles.forEach((column) => {
    const kind = classifyColumn(column);
    if (kind !== "email" && kind !== "phone" && kind !== "id") return;
    // Sparse or low-cardinality identifiers are not trustworthy keys.
    if (column.nullRate > 0.4 || column.cardinalityRatio < 0.5) return;

    // Fresh array per matchkey so configs never share a transforms list.
    const transforms: string[] = [...transformsByKind[kind]];
    matchkeys.push(
      makeMatchkeyConfig({
        name: `exact_${column.name}`,
        type: "exact",
        fields: [
          makeMatchkeyField({
            field: column.name,
            transforms,
            scorer: "exact",
            weight: 1.0,
          }),
        ],
        threshold: 1.0,
      }),
    );
  });
  return matchkeys;
}
173
+
/**
 * Build the single weighted matchkey (`weighted_identity`) from all columns
 * that carry a useful similarity signal.
 *
 * Columns whose null rate exceeds 0.5 are ignored. The remaining columns
 * contribute one field each depending on their kind:
 * name (jaro_winkler, 0.6), email (jaro_winkler, 0.3), phone (exact, 0.25),
 * zip (exact, 0.15), geo (exact, 0.1) and text with an average length of at
 * least 10 (token_sort, 0.2). Other kinds contribute nothing.
 *
 * @param profiles - Column profiles, in column order.
 * @returns The weighted matchkey, or `null` when no column contributed.
 */
function buildWeightedMatchkey(
  profiles: readonly ColumnProfile[],
): MatchkeyConfig | null {
  const fields: MatchkeyField[] = [];

  for (const column of profiles) {
    const kind = classifyColumn(column);
    if (column.nullRate > 0.5) continue;

    switch (kind) {
      case "name":
        fields.push(
          makeMatchkeyField({
            field: column.name,
            transforms: ["lowercase", "strip", "normalize_whitespace"],
            scorer: "jaro_winkler",
            weight: 0.6,
          }),
        );
        break;
      case "email":
        fields.push(
          makeMatchkeyField({
            field: column.name,
            transforms: ["lowercase", "strip"],
            scorer: "jaro_winkler",
            weight: 0.3,
          }),
        );
        break;
      case "phone":
        fields.push(
          makeMatchkeyField({
            field: column.name,
            transforms: ["digits_only"],
            scorer: "exact",
            weight: 0.25,
          }),
        );
        break;
      case "zip":
        fields.push(
          makeMatchkeyField({
            field: column.name,
            transforms: ["digits_only"],
            scorer: "exact",
            weight: 0.15,
          }),
        );
        break;
      case "geo":
        fields.push(
          makeMatchkeyField({
            field: column.name,
            transforms: ["lowercase", "strip"],
            scorer: "exact",
            weight: 0.1,
          }),
        );
        break;
      case "text":
        // Only long free-text columns: token_sort catches word reordering.
        if (column.avgLength >= 10) {
          fields.push(
            makeMatchkeyField({
              field: column.name,
              transforms: ["lowercase", "strip", "token_sort"],
              scorer: "token_sort",
              weight: 0.2,
            }),
          );
        }
        break;
      default:
        // id / date / numeric columns add no weighted field.
        break;
    }
  }

  if (fields.length === 0) {
    return null;
  }

  return makeMatchkeyConfig({
    name: "weighted_identity",
    type: "weighted",
    fields,
    threshold: 0.85,
    rerank: false,
  });
}
251
+
/**
 * Choose a single blocking key for the dataset.
 *
 * Candidate tiers are tried in order -- zip, then geo, then the first letter
 * of a name column, then (last resort) any column that is not near-constant.
 * Within a tier the first column (in profile order) is taken whose null rate
 * is not above 0.2 and whose cardinality ratio is below 0.95. The first tier
 * that yields a column wins; if none does, the key list stays empty.
 *
 * @param profiles - Column profiles, in column order.
 * @returns A static blocking config with at most one key.
 */
function buildBlocking(profiles: readonly ColumnProfile[]): BlockingConfig {
  interface Tier {
    /** Tier-specific admission test for a column. */
    readonly admits: (column: ColumnProfile) => boolean;
    /** Transforms attached to the blocking key built from this tier. */
    readonly transforms: BlockingKeyConfig["transforms"];
  }

  const ofKind =
    (wanted: string) =>
    (column: ColumnProfile): boolean =>
      classifyColumn(column) === wanted;

  const tiers: readonly Tier[] = [
    { admits: ofKind("zip"), transforms: ["digits_only", "substring:0:5"] },
    { admits: ofKind("geo"), transforms: ["lowercase", "strip"] },
    {
      admits: ofKind("name"),
      transforms: ["lowercase", "strip", "substring:0:1"],
    },
    {
      // Last resort: any kind, as long as the column is not near-constant.
      admits: (column) => !(column.cardinalityRatio < 0.01),
      transforms: ["lowercase", "strip"],
    },
  ];

  const keys: BlockingKeyConfig[] = [];
  for (const tier of tiers) {
    const chosen = profiles.find(
      (column) =>
        tier.admits(column) &&
        !(column.nullRate > 0.2) && // not sparse
        !(column.cardinalityRatio >= 0.95), // not near-unique
    );
    if (chosen !== undefined) {
      keys.push({ fields: [chosen.name], transforms: tier.transforms });
      break;
    }
  }

  return makeBlockingConfig({
    strategy: "static",
    keys,
    maxBlockSize: 1000,
    skipOversized: true,
  });
}
317
+
318
+ // ---------------------------------------------------------------------------
319
+ // Public entry points
320
+ // ---------------------------------------------------------------------------
321
+
/**
 * Build a GoldenMatchConfig by profiling the provided rows.
 *
 * Mirrors goldenmatch.core.autoconfig.auto_configure_df. The rows are
 * profiled once; the column profiles then drive the exact matchkeys, the
 * optional weighted matchkey and the blocking config. Golden-record rules
 * default to the "most_complete" strategy and the overall threshold is 0.85.
 * Standardization rules are not applied here -- callers can merge them onto
 * the result.
 *
 * @param rows - Sample rows to profile.
 * @param options - Only `llmAuto` is consulted here; when defined it is
 *   forwarded to `makeConfig`.
 * @returns The assembled config.
 */
export function autoConfigureRows(
  rows: readonly Row[],
  options?: AutoconfigOptions,
): GoldenMatchConfig {
  const profile: DatasetProfile = profileRows(rows);
  const columns = profile.columns;

  // Exact keys first, then the weighted key (when one could be built).
  const matchkeys: MatchkeyConfig[] = buildExactMatchkeys(columns).slice();
  const weighted = buildWeightedMatchkey(columns);
  if (weighted !== null) {
    matchkeys.push(weighted);
  }

  const blocking = buildBlocking(columns);
  const goldenRules = makeGoldenRulesConfig({ defaultStrategy: "most_complete" });

  // Only mention llmAuto when the caller actually set it.
  const llmAutoOverride =
    options?.llmAuto !== undefined ? { llmAuto: options.llmAuto } : {};

  return makeConfig({
    matchkeys,
    blocking,
    goldenRules,
    threshold: 0.85,
    ...llmAutoOverride,
  });
}
353
+
/**
 * Alias of `autoConfigureRows`, kept for API parity with the Python function
 * that starts from "files" (which, in edge-safe land, means pre-loaded row
 * arrays). Both arguments are forwarded unchanged.
 *
 * @param rows - Sample rows to profile.
 * @param options - Passed straight through to `autoConfigureRows`.
 * @returns The config produced by `autoConfigureRows`.
 */
export function autoConfigure(
  rows: readonly Row[],
  options?: AutoconfigOptions,
): GoldenMatchConfig {
  const generated = autoConfigureRows(rows, options);
  return generated;
}
@@ -0,0 +1,102 @@
1
+ /**
2
+ * autofix.ts — Lightweight row auto-fix utilities.
3
+ * Edge-safe: no Node.js imports, pure TypeScript only.
4
+ *
5
+ * Ports goldenmatch/core/autofix.py. Trims whitespace, nulls empty strings,
6
+ * and converts common "no value" tokens to null.
7
+ */
8
+
9
+ import type { Row } from "./types.js";
10
+
11
+ // ---------------------------------------------------------------------------
12
+ // Types
13
+ // ---------------------------------------------------------------------------
14
+
/** One summary line describing a kind of fix applied to a column. */
export interface AutoFixLog {
  /** Fix identifier; `autoFixRows` emits "trim_whitespace" and "null_empty_or_token". */
  readonly fixType: string;
  /** Name of the column the fix was applied to. */
  readonly column: string;
  /** How many values in that column received this fix. */
  readonly affectedRows: number;
}

/** Outcome of `autoFixRows`. */
export interface AutoFixResult {
  /** The rows after fixing, in input order. */
  readonly rows: Row[];
  /** Per-column fix summaries. */
  readonly log: AutoFixLog[];
}
25
+
26
+ // ---------------------------------------------------------------------------
27
+ // Tokens treated as null
28
+ // ---------------------------------------------------------------------------
29
+
/** Lower-cased placeholder strings that are treated as "no value". */
const NULL_TOKENS: ReadonlySet<string> = new Set([
  "n/a",
  "na",
  "none",
  "null",
  "nil",
  "unknown",
  "unk",
  "-",
  "--",
  "?",
]);

/**
 * Report whether `s` carries no real value.
 *
 * @param s - Raw or already-trimmed string.
 * @returns `true` when `s` is empty or whitespace-only, or when its trimmed,
 *   lower-cased form is one of `NULL_TOKENS`.
 */
function isNullToken(s: string): boolean {
  const lower = s.trim().toLowerCase();
  if (lower.length === 0) return true;
  return NULL_TOKENS.has(lower);
}

// ---------------------------------------------------------------------------
// autoFixRows
// ---------------------------------------------------------------------------

/**
 * Store `value` under `key` as an own property of `target`.
 *
 * Plain assignment to the key "__proto__" would call the inherited prototype
 * setter (replacing the object's prototype and dropping the key), so that one
 * key is written with `Object.defineProperty` instead.
 */
function setOwnValue(
  target: Record<string, unknown>,
  key: string,
  value: unknown,
): void {
  if (key === "__proto__") {
    Object.defineProperty(target, key, {
      value,
      writable: true,
      enumerable: true,
      configurable: true,
    });
  } else {
    target[key] = value;
  }
}

/**
 * Apply conservative fixes row-by-row:
 *   - trim string values
 *   - convert empty strings and common "no value" tokens to null
 *
 * Internal columns (prefix `__`) and non-string values are preserved
 * unchanged. A row in which nothing changed is returned as the very same
 * object; a changed row is returned as a new object and the input row is not
 * mutated.
 *
 * A whitespace-only value counts towards both log entries for its column: it
 * is trimmed and then nulled.
 *
 * @param rows - Rows to clean.
 * @returns The cleaned rows (input order) and a per-column log listing all
 *   "trim_whitespace" entries first, then all "null_empty_or_token" entries.
 */
export function autoFixRows(rows: readonly Row[]): AutoFixResult {
  const out: Row[] = [];
  const trimCounts = new Map<string, number>();
  const nullCounts = new Map<string, number>();
  const bump = (counts: Map<string, number>, column: string): void => {
    counts.set(column, (counts.get(column) ?? 0) + 1);
  };

  for (const row of rows) {
    const fixed: Record<string, unknown> = {};
    let changed = false;

    for (const [key, value] of Object.entries(row)) {
      // Internal columns and non-strings pass through untouched.
      if (key.startsWith("__") || typeof value !== "string") {
        setOwnValue(fixed, key, value);
        continue;
      }

      const trimmed = value.trim();
      if (trimmed !== value) {
        bump(trimCounts, key);
        changed = true;
      }
      if (isNullToken(trimmed)) {
        bump(nullCounts, key);
        changed = true;
        setOwnValue(fixed, key, null);
      } else {
        setOwnValue(fixed, key, trimmed);
      }
    }

    // Keep the original object identity when nothing had to be fixed.
    out.push(changed ? (fixed as Row) : row);
  }

  const log: AutoFixLog[] = [];
  for (const [col, n] of trimCounts) {
    log.push({ fixType: "trim_whitespace", column: col, affectedRows: n });
  }
  for (const [col, n] of nullCounts) {
    log.push({ fixType: "null_empty_or_token", column: col, affectedRows: n });
  }

  return { rows: out, log };
}