goldenmatch 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162) hide show
  1. package/README.md +140 -0
  2. package/dist/cli.cjs +6079 -0
  3. package/dist/cli.cjs.map +1 -0
  4. package/dist/cli.d.cts +1 -0
  5. package/dist/cli.d.ts +1 -0
  6. package/dist/cli.js +6076 -0
  7. package/dist/cli.js.map +1 -0
  8. package/dist/core/index.cjs +8449 -0
  9. package/dist/core/index.cjs.map +1 -0
  10. package/dist/core/index.d.cts +1972 -0
  11. package/dist/core/index.d.ts +1972 -0
  12. package/dist/core/index.js +8318 -0
  13. package/dist/core/index.js.map +1 -0
  14. package/dist/index.cjs +8449 -0
  15. package/dist/index.cjs.map +1 -0
  16. package/dist/index.d.cts +2 -0
  17. package/dist/index.d.ts +2 -0
  18. package/dist/index.js +8318 -0
  19. package/dist/index.js.map +1 -0
  20. package/dist/node/backends/score-worker.cjs +934 -0
  21. package/dist/node/backends/score-worker.cjs.map +1 -0
  22. package/dist/node/backends/score-worker.d.cts +14 -0
  23. package/dist/node/backends/score-worker.d.ts +14 -0
  24. package/dist/node/backends/score-worker.js +932 -0
  25. package/dist/node/backends/score-worker.js.map +1 -0
  26. package/dist/node/index.cjs +11430 -0
  27. package/dist/node/index.cjs.map +1 -0
  28. package/dist/node/index.d.cts +554 -0
  29. package/dist/node/index.d.ts +554 -0
  30. package/dist/node/index.js +11277 -0
  31. package/dist/node/index.js.map +1 -0
  32. package/dist/types-DhUdX5Rc.d.cts +304 -0
  33. package/dist/types-DhUdX5Rc.d.ts +304 -0
  34. package/examples/01-basic-dedupe.ts +60 -0
  35. package/examples/02-match-two-datasets.ts +48 -0
  36. package/examples/03-csv-file-pipeline.ts +62 -0
  37. package/examples/04-string-scoring.ts +63 -0
  38. package/examples/05-custom-config.ts +94 -0
  39. package/examples/06-probabilistic-fs.ts +72 -0
  40. package/examples/07-pprl-privacy.ts +76 -0
  41. package/examples/08-streaming.ts +79 -0
  42. package/examples/09-llm-scorer.ts +79 -0
  43. package/examples/10-explain.ts +60 -0
  44. package/examples/11-evaluate.ts +61 -0
  45. package/examples/README.md +53 -0
  46. package/package.json +66 -0
  47. package/src/cli.ts +372 -0
  48. package/src/core/ann-blocker.ts +593 -0
  49. package/src/core/api.ts +220 -0
  50. package/src/core/autoconfig.ts +363 -0
  51. package/src/core/autofix.ts +102 -0
  52. package/src/core/blocker.ts +655 -0
  53. package/src/core/cluster.ts +699 -0
  54. package/src/core/compare-clusters.ts +176 -0
  55. package/src/core/config/loader.ts +869 -0
  56. package/src/core/cross-encoder.ts +614 -0
  57. package/src/core/data.ts +430 -0
  58. package/src/core/domain.ts +277 -0
  59. package/src/core/embedder.ts +562 -0
  60. package/src/core/evaluate.ts +156 -0
  61. package/src/core/explain.ts +352 -0
  62. package/src/core/golden.ts +524 -0
  63. package/src/core/graph-er.ts +371 -0
  64. package/src/core/index.ts +314 -0
  65. package/src/core/ingest.ts +112 -0
  66. package/src/core/learned-blocking.ts +305 -0
  67. package/src/core/lineage.ts +221 -0
  68. package/src/core/llm/budget.ts +258 -0
  69. package/src/core/llm/cluster.ts +542 -0
  70. package/src/core/llm/scorer.ts +396 -0
  71. package/src/core/match-one.ts +95 -0
  72. package/src/core/matchkey.ts +97 -0
  73. package/src/core/memory/corrections.ts +179 -0
  74. package/src/core/memory/learner.ts +218 -0
  75. package/src/core/memory/store.ts +114 -0
  76. package/src/core/pipeline.ts +366 -0
  77. package/src/core/pprl/protocol.ts +216 -0
  78. package/src/core/probabilistic.ts +511 -0
  79. package/src/core/profiler.ts +212 -0
  80. package/src/core/quality.ts +197 -0
  81. package/src/core/review-queue.ts +177 -0
  82. package/src/core/scorer.ts +855 -0
  83. package/src/core/sensitivity.ts +196 -0
  84. package/src/core/standardize.ts +279 -0
  85. package/src/core/streaming.ts +128 -0
  86. package/src/core/transforms.ts +599 -0
  87. package/src/core/types.ts +570 -0
  88. package/src/core/validate.ts +243 -0
  89. package/src/index.ts +8 -0
  90. package/src/node/a2a/server.ts +470 -0
  91. package/src/node/api/server.ts +412 -0
  92. package/src/node/backends/duckdb.ts +130 -0
  93. package/src/node/backends/score-worker.ts +41 -0
  94. package/src/node/backends/workers.ts +212 -0
  95. package/src/node/config-file.ts +66 -0
  96. package/src/node/connectors/base.ts +57 -0
  97. package/src/node/connectors/bigquery.ts +61 -0
  98. package/src/node/connectors/databricks.ts +69 -0
  99. package/src/node/connectors/file.ts +350 -0
  100. package/src/node/connectors/hubspot.ts +62 -0
  101. package/src/node/connectors/index.ts +43 -0
  102. package/src/node/connectors/salesforce.ts +93 -0
  103. package/src/node/connectors/snowflake.ts +73 -0
  104. package/src/node/db/postgres.ts +173 -0
  105. package/src/node/db/sync.ts +103 -0
  106. package/src/node/dedupe-file.ts +156 -0
  107. package/src/node/index.ts +89 -0
  108. package/src/node/mcp/server.ts +940 -0
  109. package/src/node/tui/app.ts +756 -0
  110. package/src/node/tui/index.ts +6 -0
  111. package/src/node/tui/widgets.ts +128 -0
  112. package/tests/parity/scorer-ground-truth.test.ts +118 -0
  113. package/tests/smoke.test.ts +46 -0
  114. package/tests/unit/a2a-server.test.ts +175 -0
  115. package/tests/unit/ann-blocker.test.ts +117 -0
  116. package/tests/unit/api-server.test.ts +239 -0
  117. package/tests/unit/api.test.ts +77 -0
  118. package/tests/unit/autoconfig.test.ts +103 -0
  119. package/tests/unit/autofix.test.ts +71 -0
  120. package/tests/unit/blocker.test.ts +164 -0
  121. package/tests/unit/buildBlocksAsync.test.ts +63 -0
  122. package/tests/unit/cluster.test.ts +213 -0
  123. package/tests/unit/compare-clusters.test.ts +42 -0
  124. package/tests/unit/config-loader.test.ts +301 -0
  125. package/tests/unit/connectors-base.test.ts +48 -0
  126. package/tests/unit/cross-encoder-model.test.ts +198 -0
  127. package/tests/unit/cross-encoder.test.ts +173 -0
  128. package/tests/unit/db-connectors.test.ts +37 -0
  129. package/tests/unit/domain.test.ts +80 -0
  130. package/tests/unit/embedder.test.ts +151 -0
  131. package/tests/unit/evaluate.test.ts +85 -0
  132. package/tests/unit/explain.test.ts +73 -0
  133. package/tests/unit/golden.test.ts +97 -0
  134. package/tests/unit/graph-er.test.ts +173 -0
  135. package/tests/unit/hnsw-ann.test.ts +283 -0
  136. package/tests/unit/hubspot-connector.test.ts +118 -0
  137. package/tests/unit/ingest.test.ts +97 -0
  138. package/tests/unit/learned-blocking.test.ts +134 -0
  139. package/tests/unit/lineage.test.ts +135 -0
  140. package/tests/unit/match-one.test.ts +129 -0
  141. package/tests/unit/matchkey.test.ts +97 -0
  142. package/tests/unit/mcp-server.test.ts +183 -0
  143. package/tests/unit/memory.test.ts +119 -0
  144. package/tests/unit/pipeline.test.ts +118 -0
  145. package/tests/unit/pprl-protocol.test.ts +381 -0
  146. package/tests/unit/probabilistic.test.ts +494 -0
  147. package/tests/unit/profiler.test.ts +68 -0
  148. package/tests/unit/review-queue.test.ts +68 -0
  149. package/tests/unit/salesforce-connector.test.ts +148 -0
  150. package/tests/unit/scorer.test.ts +301 -0
  151. package/tests/unit/sensitivity.test.ts +154 -0
  152. package/tests/unit/standardize.test.ts +84 -0
  153. package/tests/unit/streaming.test.ts +82 -0
  154. package/tests/unit/transforms.test.ts +208 -0
  155. package/tests/unit/tui-widgets.test.ts +42 -0
  156. package/tests/unit/tui.test.ts +24 -0
  157. package/tests/unit/validate.test.ts +145 -0
  158. package/tests/unit/workers-parallel.test.ts +99 -0
  159. package/tests/unit/workers.test.ts +74 -0
  160. package/tsconfig.json +25 -0
  161. package/tsup.config.ts +37 -0
  162. package/vitest.config.ts +11 -0
@@ -0,0 +1,430 @@
1
+ /**
2
+ * data.ts — TabularData, edge-safe Polars replacement.
3
+ * Wraps readonly Row[] with column operations, joins, groupBy, sampling.
4
+ * No Node.js imports, no `process`.
5
+ */
6
+
7
+ import type { ColumnValue, Row } from "./types.js";
8
+
9
+ // ---------------------------------------------------------------------------
10
+ // Null handling
11
+ // ---------------------------------------------------------------------------
12
+
13
/**
 * Spellings that stand for "no value". Entries are stored lower-cased and
 * without surrounding whitespace; `isNullish` normalizes its input the same
 * way before looking it up.
 */
const NULL_STRINGS = new Set([
  "",
  "null",
  "none",
  "nan",
  "n/a",
  "na",
  "nil",
  "#n/a",
  "missing",
  "undefined",
]);

/**
 * Decide whether a value should be treated as missing.
 *
 * @param v - Any value.
 * @returns `true` for `null`, `undefined`, a numeric `NaN`, or a string that,
 *   once lower-cased and trimmed, is one of the `NULL_STRINGS` sentinels;
 *   `false` for everything else (including `0`, `false` and objects).
 */
export function isNullish(v: unknown): v is null | undefined {
  switch (typeof v) {
    case "string":
      return NULL_STRINGS.has(v.toLowerCase().trim());
    case "number":
      return Number.isNaN(v);
    default:
      // Covers null (typeof "object") and undefined, plus every other type.
      return v === null || v === undefined;
  }
}
34
+
35
/**
 * Coerce an arbitrary value into a `ColumnValue`.
 *
 * @param v - Any value.
 * @returns `null` when `isNullish(v)` holds; the value itself when it is a
 *   string, number or boolean; otherwise `String(v)`.
 */
export function toColumnValue(v: unknown): ColumnValue {
  if (isNullish(v)) {
    return null;
  }
  switch (typeof v) {
    case "string":
    case "number":
    case "boolean":
      return v;
    default:
      // Anything else (objects, bigint, symbols, ...) is stringified.
      return String(v);
  }
}
43
+
44
+ // ---------------------------------------------------------------------------
45
+ // Mulberry32 seedable PRNG (NOT Mersenne Twister)
46
+ // ---------------------------------------------------------------------------
47
+
48
/**
 * Create a seedable pseudo-random generator using the Mulberry32 mixing steps.
 *
 * @param seed - Seed value; it is truncated to a signed 32-bit integer.
 * @returns A function that advances the internal 32-bit state on every call
 *   and returns a number in `[0, 1)` (an unsigned 32-bit value divided by
 *   2^32). The same seed always yields the same sequence.
 */
function mulberry32(seed: number): () => number {
  let state = seed | 0;
  return function next(): number {
    // Advance the state by a fixed odd increment, wrapping at 32 bits.
    state = (state + 0x6d2b79f5) | 0;
    let mixed = Math.imul(state ^ (state >>> 15), state | 1);
    mixed ^= mixed + Math.imul(mixed ^ (mixed >>> 7), mixed | 61);
    // `>>> 0` reinterprets the result as unsigned before scaling.
    const unsigned = (mixed ^ (mixed >>> 14)) >>> 0;
    return unsigned / 4294967296;
  };
}
57
+
58
+ // ---------------------------------------------------------------------------
59
+ // TabularData
60
+ // ---------------------------------------------------------------------------
61
+
62
/**
 * Row-oriented table wrapping a `readonly Row[]`.
 *
 * The wrapped array is stored by reference and never mutated by this class;
 * every transforming method builds and returns a new `TabularData`. The column
 * list is taken from the keys of the first row only.
 */
export class TabularData {
  private readonly _rows: readonly Row[];
  /** Memoized results of `column()`, keyed by column name. */
  private _columnCache = new Map<string, readonly ColumnValue[]>();

  /** Wrap `rows` as-is (no copy is made). */
  constructor(rows: readonly Row[]) {
    this._rows = rows;
  }

  // ---- Internal helpers ----

  /**
   * True only when `key` is an own property of `row`. Unlike the `in`
   * operator this ignores inherited members such as `toString` or
   * `constructor`, which are not data columns.
   */
  private static hasOwn(row: Row, key: string): boolean {
    return Object.prototype.hasOwnProperty.call(row, key);
  }

  /**
   * String key used to bucket a cell in `groupBy`, `join` and `unique`.
   * Null-ish cells (per `toColumnValue`) all map to `"__null__"`; everything
   * else maps to `String(value)`, so `1` and `"1"` land in the same bucket.
   */
  private static bucketKey(cell: unknown): string {
    const v = toColumnValue(cell);
    return v === null ? "__null__" : String(v);
  }

  // ---- Getters ----

  /** The wrapped rows, by reference. */
  get rows(): readonly Row[] {
    return this._rows;
  }

  /** Keys of the first row, or an empty list when there are no rows. */
  get columns(): readonly string[] {
    if (this._rows.length === 0) return [];
    return Object.keys(this._rows[0]!);
  }

  /** Number of rows. */
  get rowCount(): number {
    return this._rows.length;
  }

  // ---- Column access ----

  /**
   * Get column values with null coercion (N/A, NaN, etc. become null).
   * The result is cached per column name and reused on later calls.
   */
  column(name: string): readonly ColumnValue[] {
    const cached = this._columnCache.get(name);
    if (cached) return cached;
    const values = this._rows.map((r) => toColumnValue(r[name]));
    this._columnCache.set(name, values);
    return values;
  }

  /**
   * Raw column access -- preserves original values without null coercion.
   * Use for profiling where "N/A" should remain a string, not become null.
   * Only `null`/`undefined` become `null`; non-primitive values are
   * stringified. The result is not cached.
   */
  rawColumn(name: string): readonly ColumnValue[] {
    return this._rows.map((r) => {
      const v = r[name];
      if (v === null || v === undefined) return null;
      if (
        typeof v === "string" ||
        typeof v === "number" ||
        typeof v === "boolean"
      )
        return v;
      return String(v);
    });
  }

  // ---- Null helpers ----

  /** Count the null entries of `column(col)`. */
  nullCount(col: string): number {
    let count = 0;
    for (const v of this.column(col)) {
      if (v === null) count++;
    }
    return count;
  }

  /** Values of `column(col)` with the nulls removed, in row order. */
  dropNulls(col: string): ColumnValue[] {
    return this.column(col).filter(
      (v): v is Exclude<ColumnValue, null> => v !== null,
    );
  }

  // ---- Aggregation ----

  /** Number of distinct non-null values (Set semantics, so `1` !== `"1"`). */
  nUnique(col: string): number {
    const set = new Set<ColumnValue>();
    for (const v of this.dropNulls(col)) set.add(v);
    return set.size;
  }

  /** Occurrence count for each distinct non-null value. */
  valueCounts(col: string): Map<ColumnValue, number> {
    const map = new Map<ColumnValue, number>();
    for (const v of this.dropNulls(col)) {
      map.set(v, (map.get(v) ?? 0) + 1);
    }
    return map;
  }

  /**
   * Smallest finite number in the column, or null when there is none.
   * MUST use loop -- Math.min(...array) crashes on >65K elements.
   */
  min(col: string): number | null {
    const nums = this.numericValues(col);
    if (nums.length === 0) return null;
    let m = nums[0]!;
    for (let i = 1; i < nums.length; i++) {
      if (nums[i]! < m) m = nums[i]!;
    }
    return m;
  }

  /**
   * Largest finite number in the column, or null when there is none.
   * MUST use loop -- Math.max(...array) crashes on >65K elements.
   */
  max(col: string): number | null {
    const nums = this.numericValues(col);
    if (nums.length === 0) return null;
    let m = nums[0]!;
    for (let i = 1; i < nums.length; i++) {
      if (nums[i]! > m) m = nums[i]!;
    }
    return m;
  }

  /** Arithmetic mean of the finite numbers in the column, or null if none. */
  mean(col: string): number | null {
    const nums = this.numericValues(col);
    if (nums.length === 0) return null;
    let sum = 0;
    for (const n of nums) sum += n;
    return sum / nums.length;
  }

  /**
   * Sample standard deviation (divides by n - 1) of the finite numbers in
   * the column. Returns null when fewer than two numbers are available.
   */
  std(col: string): number | null {
    const nums = this.numericValues(col);
    if (nums.length < 2) return null;
    // Derive the mean from the values already collected (same summation
    // order as mean()) instead of scanning the column a second time.
    let sum = 0;
    for (const n of nums) sum += n;
    const avg = sum / nums.length;
    let sumSq = 0;
    for (const n of nums) sumSq += (n - avg) ** 2;
    return Math.sqrt(sumSq / (nums.length - 1));
  }

  // ---- Filtering, mapping, slicing ----

  /** New table with only the rows for which `predicate` returns true. */
  filter(predicate: (row: Row) => boolean): TabularData {
    return new TabularData(this._rows.filter(predicate));
  }

  /** New table whose rows are `fn(row, index)` for each existing row. */
  map(fn: (row: Row, index: number) => Row): TabularData {
    return new TabularData(this._rows.map(fn));
  }

  /** New table over `rows.slice(start, end)`. */
  slice(start: number, end?: number): TabularData {
    return new TabularData(this._rows.slice(start, end));
  }

  // ---- Column projection ----

  /**
   * Keep only the named columns. A requested column is copied only when it
   * is an own property of the row, so inherited object members (e.g.
   * `toString`) are never materialized as data.
   */
  select(cols: readonly string[]): TabularData {
    const colSet = new Set(cols);
    const rows = this._rows.map((r) => {
      const out: Record<string, unknown> = {};
      for (const c of colSet) {
        if (TabularData.hasOwn(r, c)) out[c] = r[c];
      }
      return out as Row;
    });
    return new TabularData(rows);
  }

  /** Drop the named columns. */
  drop(cols: readonly string[]): TabularData {
    const dropSet = new Set(cols);
    const rows = this._rows.map((r) => {
      const out: Record<string, unknown> = {};
      for (const k of Object.keys(r)) {
        if (!dropSet.has(k)) out[k] = r[k];
      }
      return out as Row;
    });
    return new TabularData(rows);
  }

  // ---- Column mutation ----

  /**
   * Return a new TabularData with an added (or replaced) column.
   *
   * @throws Error when `values.length` differs from the row count.
   */
  addColumn(name: string, values: readonly ColumnValue[]): TabularData {
    if (values.length !== this._rows.length) {
      throw new Error(
        `addColumn: values length (${values.length}) != row count (${this._rows.length})`,
      );
    }
    const rows = this._rows.map((r, i) => ({
      ...r,
      [name]: values[i],
    })) as Row[];
    return new TabularData(rows);
  }

  /**
   * Add a sequential row index column (like Polars with_row_index).
   * The index key is written first and the row is spread after it, so a row
   * that already has a property called `name` keeps its own value.
   */
  withRowIndex(name = "__row_id__", offset = 0): TabularData {
    const rows = this._rows.map(
      (r, i) =>
        ({
          [name]: i + offset,
          ...r,
        }) as Row,
    );
    return new TabularData(rows);
  }

  // ---- Group by ----

  /**
   * Group rows by a column, returning Map<stringKey, TabularData>.
   * Keys are the stringified cell values; null-ish cells are grouped under
   * "__null__". Groups appear in order of first occurrence.
   */
  groupBy(key: string): Map<string, TabularData> {
    const groups = new Map<string, Row[]>();
    for (const row of this._rows) {
      const k = TabularData.bucketKey(row[key]);
      let arr = groups.get(k);
      if (!arr) {
        arr = [];
        groups.set(k, arr);
      }
      arr.push(row);
    }
    const result = new Map<string, TabularData>();
    for (const [k, rows] of groups) {
      result.set(k, new TabularData(rows));
    }
    return result;
  }

  // ---- Join ----

  /**
   * Inner join with another TabularData on a shared column.
   *
   * Right-hand columns are taken from `other.columns` (the first row's keys)
   * minus `on`. A right-hand column whose name is already an own property of
   * the left row is stored under `name + suffix`; otherwise it keeps its name.
   * Rows are matched on the stringified key, and null-ish keys on both sides
   * share the "__null__" bucket and therefore match each other.
   */
  join(
    other: TabularData,
    on: string,
    suffix = "_right",
  ): TabularData {
    // Build index on other
    const otherIndex = new Map<string, Row[]>();
    for (const row of other._rows) {
      const k = TabularData.bucketKey(row[on]);
      let arr = otherIndex.get(k);
      if (!arr) {
        arr = [];
        otherIndex.set(k, arr);
      }
      arr.push(row);
    }

    const otherCols = other.columns.filter((c) => c !== on);
    const result: Row[] = [];

    for (const leftRow of this._rows) {
      const matches = otherIndex.get(TabularData.bucketKey(leftRow[on]));
      if (!matches) continue;

      for (const rightRow of matches) {
        const merged: Record<string, unknown> = { ...leftRow };
        for (const c of otherCols) {
          // Own-property check: `in` would also see inherited members and
          // wrongly suffix columns named e.g. "constructor" or "toString".
          const key = TabularData.hasOwn(leftRow, c) ? `${c}${suffix}` : c;
          merged[key] = rightRow[c];
        }
        result.push(merged as Row);
      }
    }

    return new TabularData(result);
  }

  // ---- Sampling ----

  /**
   * Fisher-Yates partial shuffle with seedable PRNG.
   * Returns `this` unchanged when `n` is at least the row count; otherwise a
   * new table built from the tail of the partially shuffled index list. The
   * same `seed` gives the same sample.
   */
  sample(n: number, seed = 42): TabularData {
    if (n >= this._rows.length) return this;
    const rng = mulberry32(seed);
    const indices = Array.from({ length: this._rows.length }, (_, i) => i);
    // Partial Fisher-Yates: shuffle last n elements
    for (let i = indices.length - 1; i > 0 && indices.length - 1 - i < n; i--) {
      const j = Math.floor(rng() * (i + 1));
      [indices[i], indices[j]] = [indices[j]!, indices[i]!];
    }
    const sampled = indices.slice(indices.length - n).map((i) => this._rows[i]!);
    return new TabularData(sampled);
  }

  // ---- Sorting ----

  /**
   * Sort by a column (ascending). Nulls sort last.
   * Two numbers compare numerically; any other pairing compares the
   * stringified values with `localeCompare`. The rows are copied first.
   */
  sortBy(col: string): TabularData {
    const sorted = [...this._rows].sort((a, b) => {
      const va = toColumnValue(a[col]);
      const vb = toColumnValue(b[col]);
      if (va === null && vb === null) return 0;
      if (va === null) return 1;
      if (vb === null) return -1;
      if (typeof va === "number" && typeof vb === "number") return va - vb;
      return String(va).localeCompare(String(vb));
    });
    return new TabularData(sorted);
  }

  // ---- Unique ----

  /** Return rows with unique values in the given column (keeps first occurrence). */
  unique(col: string): TabularData {
    const seen = new Set<string>();
    const result: Row[] = [];
    for (const row of this._rows) {
      const k = TabularData.bucketKey(row[col]);
      if (!seen.has(k)) {
        seen.add(k);
        result.push(row);
      }
    }
    return new TabularData(result);
  }

  // ---- Serialization ----

  /** Return rows as plain dicts (a shallow copy of the row array). */
  toDicts(): Row[] {
    return [...this._rows];
  }

  // ---- Numeric / string helpers ----

  /** Finite numbers of `column(col)`; Infinity and non-numbers are skipped. */
  numericValues(col: string): number[] {
    const result: number[] = [];
    for (const v of this.column(col)) {
      if (typeof v === "number" && Number.isFinite(v)) {
        result.push(v);
      }
    }
    return result;
  }

  /** String entries of `column(col)`, in row order. */
  stringValues(col: string): string[] {
    const result: string[] = [];
    for (const v of this.column(col)) {
      if (typeof v === "string") result.push(v);
    }
    return result;
  }

  // ---- Static constructors ----

  /** Create from an array of row dicts. */
  static fromDicts(rows: readonly Row[]): TabularData {
    return new TabularData(rows);
  }

  /**
   * Create from column-oriented data: {col: values[]}.
   *
   * @throws Error when the columns do not all have the same length.
   */
  static fromColumns(
    cols: Readonly<Record<string, readonly ColumnValue[]>>,
  ): TabularData {
    const colNames = Object.keys(cols);
    if (colNames.length === 0) return new TabularData([]);

    const len = cols[colNames[0]!]!.length;
    for (const name of colNames) {
      if (cols[name]!.length !== len) {
        throw new Error(
          `fromColumns: column "${name}" length (${cols[name]!.length}) != expected (${len})`,
        );
      }
    }

    const rows: Row[] = [];
    for (let i = 0; i < len; i++) {
      const row: Record<string, unknown> = {};
      for (const name of colNames) {
        row[name] = cols[name]![i];
      }
      rows.push(row as Row);
    }
    return new TabularData(rows);
  }
}
@@ -0,0 +1,277 @@
1
+ /**
2
+ * domain.ts — Domain detection & lightweight feature extraction.
3
+ * Edge-safe: no `node:` imports.
4
+ *
5
+ * Ports goldenmatch/core/domain.py. Detects the subject area (product,
6
+ * person, bibliographic, company, generic) from column names and extracts
7
+ * per-row features (brand, model, version, etc.) as extra columns.
8
+ */
9
+
10
+ import type { Row } from "./types.js";
11
+
12
+ // ---------------------------------------------------------------------------
13
+ // Types
14
+ // ---------------------------------------------------------------------------
15
+
16
/** Result of column-name based domain detection (see `detectDomain`). */
export interface DomainProfile {
  /** Domain label; `detectDomain` produces "product", "person", "bibliographic", "company" or "generic". */
  readonly name: string;
  /** Detection confidence; `detectDomain` produces a value between 0 and 1. */
  readonly confidence: number;
  /** Columns whose names look like free text. */
  readonly textColumns: ReadonlyArray<string>;
  /** Columns that matched the detected domain's signatures. */
  readonly featureColumns: ReadonlyArray<string>;
}
22
+
23
+ // ---------------------------------------------------------------------------
24
+ // Domain signature tables
25
+ // ---------------------------------------------------------------------------
26
+
27
/** A column-name pattern and the score it contributes when it matches. */
type Signature = { readonly pattern: RegExp; readonly weight: number };

/** Expand compact `[pattern, weight]` pairs into Signature records, keeping their order. */
function toSignatures(
  pairs: ReadonlyArray<readonly [RegExp, number]>,
): readonly Signature[] {
  return pairs.map(([pattern, weight]) => ({ pattern, weight }));
}

/** Column-name hints for product catalogues. */
const PRODUCT_SIGNATURES: readonly Signature[] = toSignatures([
  [/brand|manufacturer|mfr/i, 2],
  [/model/i, 2],
  [/sku|upc|ean|asin|mpn/i, 3],
  [/price|msrp|cost/i, 1],
  [/category|dept|department/i, 1],
  [/product|item/i, 1],
]);

/** Column-name hints for person records. */
const PERSON_SIGNATURES: readonly Signature[] = toSignatures([
  [/^first|first_name|fname/i, 2],
  [/^last|last_name|lname|surname/i, 2],
  [/full_name|person_name/i, 2],
  [/email/i, 2],
  [/phone|mobile|cell/i, 1],
  [/dob|birth|birthday/i, 2],
  [/ssn|nin/i, 3],
]);

/** Column-name hints for bibliographic records. */
const BIBLIOGRAPHIC_SIGNATURES: readonly Signature[] = toSignatures([
  [/^title$|article_title/i, 2],
  [/authors?|by_line/i, 3],
  [/year|pub_year|published/i, 1],
  [/venue|journal|conference/i, 2],
  [/doi|issn|isbn/i, 3],
  [/abstract/i, 1],
]);

/** Column-name hints for company records. */
const COMPANY_SIGNATURES: readonly Signature[] = toSignatures([
  [/company|employer|org(?!anization_id)/i, 2],
  [/industry|sector/i, 2],
  [/website|domain|url/i, 1],
  [/ein|duns|cik|lei/i, 3],
  [/hq|headquarters/i, 1],
]);
64
+
65
+ // ---------------------------------------------------------------------------
66
+ // Detection
67
+ // ---------------------------------------------------------------------------
68
+
69
/**
 * Sum the signature weights earned by a set of column names.
 *
 * Each column contributes at most once: the weight of the first signature
 * (in table order) whose pattern matches it.
 *
 * @param columns - Column names to inspect.
 * @param signatures - Ordered signature table for one domain.
 * @returns The accumulated weight, 0 when nothing matches.
 */
function scoreDomain(
  columns: readonly string[],
  signatures: readonly Signature[],
): number {
  return columns.reduce((total, column) => {
    const firstHit = signatures.find((sig) => sig.pattern.test(column));
    return firstHit === undefined ? total : total + firstHit.weight;
  }, 0);
}
84
+
85
/**
 * List the columns whose name matches at least one signature pattern.
 *
 * @param columns - Column names to inspect.
 * @param signatures - Signature table for one domain.
 * @returns A new array with the matching names, in their original order.
 */
function findMatchingColumns(
  columns: readonly string[],
  signatures: readonly Signature[],
): string[] {
  return columns.filter((column) =>
    signatures.some((sig) => sig.pattern.test(column)),
  );
}
97
+
98
/** Column names containing any of these words are treated as free-text columns. */
const TEXT_NAME_RE = /name|title|description|notes|text|body/i;

/**
 * Detect the domain of a dataset based on its column names.
 *
 * Every known domain is scored against its signature table and the highest
 * score wins; on a tie the domain listed first is kept. When the best score
 * is 0 the result is the "generic" profile with no feature columns.
 *
 * @param columns - Column names of the dataset.
 * @returns The detected profile. Confidence is the winning score divided by
 *   10, capped at 1.
 */
export function detectDomain(columns: readonly string[]): DomainProfile {
  const domains: ReadonlyArray<readonly [string, readonly Signature[]]> = [
    ["product", PRODUCT_SIGNATURES],
    ["person", PERSON_SIGNATURES],
    ["bibliographic", BIBLIOGRAPHIC_SIGNATURES],
    ["company", COMPANY_SIGNATURES],
  ];

  const candidates = domains.map(([name, signatures]) => ({
    name,
    score: scoreDomain(columns, signatures),
    features: findMatchingColumns(columns, signatures),
  }));

  // Strict ">" keeps the earliest candidate when scores are equal.
  const winner = candidates.reduce((best, candidate) =>
    candidate.score > best.score ? candidate : best,
  );

  const textColumns = columns.filter((column) => TEXT_NAME_RE.test(column));

  if (winner.score === 0) {
    return {
      name: "generic",
      confidence: 0,
      textColumns,
      featureColumns: [],
    };
  }

  const MAX_SCORE = 10;
  let confidence = 0;
  if (winner.score > 0) {
    confidence = Math.min(1, winner.score / MAX_SCORE);
  }

  return {
    name: winner.name,
    confidence,
    textColumns,
    featureColumns: winner.features,
  };
}
156
+
157
+ // ---------------------------------------------------------------------------
158
+ // Feature extraction
159
+ // ---------------------------------------------------------------------------
160
+
161
/**
 * Convert a cell to a trimmed, non-empty string.
 *
 * @param value - Any cell value.
 * @returns The trimmed text, or `null` when the value is `null`/`undefined`
 *   or its text is empty after trimming.
 */
function asString(value: unknown): string | null {
  if (value === undefined || value === null) {
    return null;
  }
  const text = (typeof value === "string" ? value : String(value)).trim();
  return text.length > 0 ? text : null;
}
167
+
168
/** Lower-cased brand names recognized as the leading word of a text column. */
const KNOWN_BRANDS = new Set<string>();
for (const brand of [
  "apple",
  "samsung",
  "sony",
  "lg",
  "dell",
  "hp",
  "lenovo",
  "asus",
  "acer",
  "microsoft",
  "google",
  "amazon",
  "bose",
  "canon",
  "nikon",
  "panasonic",
  "philips",
  "toshiba",
]) {
  KNOWN_BRANDS.add(brand.toLowerCase());
}

/**
 * Model-number token: two runs of 2+ uppercase letters/digits optionally
 * joined by "-" or "_", or an uppercase letter followed by 3+ uppercase
 * letters/digits. Case-sensitive; capture group 1 is the token.
 */
const MODEL_RE = /\b([A-Z0-9]{2,}[\-_]?[A-Z0-9]{2,}|[A-Z][A-Z0-9]{3,})\b/;

/**
 * Version token: `major.minor` with an optional `.patch` and an optional
 * "-" or "+" suffix. Capture group 1 is the token.
 */
const SEMVER_RE = /\b(\d+\.\d+(?:\.\d+)?(?:[\-+][A-Za-z0-9.]+)?)\b/;
193
+
194
/**
 * Determine a lower-cased brand for a row.
 *
 * The explicit "manufacturer", "brand" and "mfr" cells are tried in that
 * order; the first non-empty one wins. Otherwise the first whitespace-
 * separated word of each text column is checked against `KNOWN_BRANDS`.
 *
 * @returns The lower-cased brand, or `null` when none is found.
 */
function extractBrand(row: Row, profile: DomainProfile): string | null {
  for (const column of ["manufacturer", "brand", "mfr"]) {
    const explicit = asString(row[column]);
    if (explicit) {
      return explicit.toLowerCase();
    }
  }

  for (const column of profile.textColumns) {
    const text = asString(row[column]);
    if (!text) continue;
    const [leadingWord] = text.split(/\s+/);
    if (!leadingWord) continue;
    const candidate = leadingWord.toLowerCase();
    if (KNOWN_BRANDS.has(candidate)) {
      return candidate;
    }
  }
  return null;
}
211
+
212
/**
 * Determine an upper-cased model identifier for a row.
 *
 * An explicit "model" (then "mpn") cell is preferred, with hyphens,
 * underscores and whitespace removed. Otherwise the first `MODEL_RE` token
 * found in a text column is used, with hyphens and underscores removed.
 *
 * @returns The normalized model, or `null` when none is found.
 */
function extractModel(row: Row, profile: DomainProfile): string | null {
  const declared = asString(row["model"]) ?? asString(row["mpn"]);
  if (declared) {
    return declared.replace(/[\-_\s]/g, "").toUpperCase();
  }

  for (const column of profile.textColumns) {
    const text = asString(row[column]);
    if (!text) continue;
    const token = text.match(MODEL_RE)?.[1];
    if (token) {
      return token.replace(/[\-_]/g, "").toUpperCase();
    }
  }
  return null;
}
225
+
226
/**
 * Determine a version string for a row.
 *
 * An explicit "version" (then "ver") cell is returned as-is after trimming.
 * Otherwise the first `SEMVER_RE` token found in a text column is used.
 *
 * @returns The version text, or `null` when none is found.
 */
function extractVersion(row: Row, profile: DomainProfile): string | null {
  const declared = asString(row["version"]) ?? asString(row["ver"]);
  if (declared) {
    return declared;
  }

  for (const column of profile.textColumns) {
    const text = asString(row[column]);
    if (!text) continue;
    const token = text.match(SEMVER_RE)?.[1];
    if (token) {
      return token;
    }
  }
  return null;
}
237
+
238
/**
 * Annotate rows with domain-specific extracted columns.
 *
 * Input rows are never mutated; each output row is a shallow copy. Only the
 * "product" domain adds columns (`__brand__`, `__model__`, `__version__`,
 * each written only when a value was found). A product row is reported as
 * low-confidence when the fraction of the three features that were found is
 * below `confidenceThreshold`. Generic or zero-confidence profiles return
 * plain copies with no low-confidence indices.
 *
 * @param rows - Rows to enrich.
 * @param profile - Detected domain profile.
 * @param confidenceThreshold - Minimum found/expected ratio (default 0.3).
 * @returns The enriched rows and the indices of low-confidence rows.
 */
export function extractFeatures(
  rows: readonly Row[],
  profile: DomainProfile,
  confidenceThreshold: number = 0.3,
): { rows: Row[]; lowConfidenceIds: readonly number[] } {
  const nothingToExtract =
    profile.name === "generic" || profile.confidence === 0;
  if (nothingToExtract) {
    return { rows: rows.map((r) => ({ ...r })), lowConfidenceIds: [] };
  }

  const isProduct = profile.name === "product";
  const EXPECTED_FEATURES = 3;
  const flagged: number[] = [];
  const enrichedRows: Row[] = [];

  for (const [index, row] of rows.entries()) {
    const enriched: Record<string, unknown> = { ...row };

    if (isProduct) {
      const extracted: ReadonlyArray<readonly [string, string | null]> = [
        ["__brand__", extractBrand(row, profile)],
        ["__model__", extractModel(row, profile)],
        ["__version__", extractVersion(row, profile)],
      ];

      let found = 0;
      for (const [key, value] of extracted) {
        if (value !== null) {
          enriched[key] = value;
          found += 1;
        }
      }

      if (found / EXPECTED_FEATURES < confidenceThreshold) {
        flagged.push(index);
      }
    }

    enrichedRows.push(enriched as Row);
  }

  return { rows: enrichedRows, lowConfidenceIds: flagged };
}