goldenflow 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.cjs +2915 -0
- package/dist/cli.cjs.map +1 -0
- package/dist/cli.d.cts +1 -0
- package/dist/cli.d.ts +1 -0
- package/dist/cli.js +2913 -0
- package/dist/cli.js.map +1 -0
- package/dist/core/index.cjs +2980 -0
- package/dist/core/index.cjs.map +1 -0
- package/dist/core/index.d.cts +360 -0
- package/dist/core/index.d.ts +360 -0
- package/dist/core/index.js +2941 -0
- package/dist/core/index.js.map +1 -0
- package/dist/index.cjs +2980 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +1 -0
- package/dist/index.d.ts +1 -0
- package/dist/index.js +2941 -0
- package/dist/index.js.map +1 -0
- package/dist/node/index.cjs +3588 -0
- package/dist/node/index.cjs.map +1 -0
- package/dist/node/index.d.cts +57 -0
- package/dist/node/index.d.ts +57 -0
- package/dist/node/index.js +3536 -0
- package/dist/node/index.js.map +1 -0
- package/package.json +83 -0
package/dist/cli.js
ADDED
|
@@ -0,0 +1,2913 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
import { Command } from 'commander';
|
|
3
|
+
import { readFileSync, mkdirSync, writeFileSync, existsSync, readdirSync, statSync } from 'fs';
|
|
4
|
+
import { extname, dirname, join } from 'path';
|
|
5
|
+
import { homedir } from 'os';
|
|
6
|
+
|
|
7
|
+
var __defProp = Object.defineProperty;
|
|
8
|
+
var __getOwnPropNames = Object.getOwnPropertyNames;
|
|
9
|
+
var __require = /* @__PURE__ */ ((x) => typeof require !== "undefined" ? require : typeof Proxy !== "undefined" ? new Proxy(x, {
|
|
10
|
+
get: (a, b) => (typeof require !== "undefined" ? require : a)[b]
|
|
11
|
+
}) : x)(function(x) {
|
|
12
|
+
if (typeof require !== "undefined") return require.apply(this, arguments);
|
|
13
|
+
throw Error('Dynamic require of "' + x + '" is not supported');
|
|
14
|
+
});
|
|
15
|
+
// Wraps a single-entry `{ "path"() {...} }` object into a run-once initializer:
// the first call executes the entry and caches its result, later calls return
// the cached value.
var __esm = (fn, res) => function __init() {
  if (fn) {
    const initializer = fn[__getOwnPropNames(fn)[0]];
    // Clear `fn` before running so re-entrant calls do not run the body twice.
    fn = 0;
    res = initializer(0);
  }
  return res;
};
|
|
18
|
+
// Defines one enumerable getter on `target` for every key of `all`, using the
// functions stored in `all` as the getters.
var __export = (target, all) => {
  for (const key in all) {
    const getter = all[key];
    __defProp(target, key, { get: getter, enumerable: true });
  }
};
|
|
22
|
+
|
|
23
|
+
// src/core/types.ts
|
|
24
|
+
/**
 * Builds a transform record, filling in empty sample arrays for any sample
 * field the caller did not supply.
 * @param {object} input - Record fields; these override the defaults.
 * @returns {object} A new object with defaults first, then `input`'s keys.
 */
function makeTransformRecord(input) {
  const defaults = { sampleBefore: [], sampleAfter: [] };
  return { ...defaults, ...input };
}
|
|
31
|
+
/**
 * Builds a column profile with an empty `sampleValues` list and a null
 * `detectedFormat` unless `input` provides them.
 * @param {object} input - Profile fields; these override the defaults.
 * @returns {object} A new object with defaults first, then `input`'s keys.
 */
function makeColumnProfile(input) {
  const defaults = { sampleValues: [], detectedFormat: null };
  return { ...defaults, ...input };
}
|
|
38
|
+
/**
 * Builds a configuration object, starting from empty defaults for every
 * section and overlaying whatever `input` specifies.
 * @param {object} input - Partial configuration; its keys override defaults.
 * @returns {object} A new configuration object (fresh default containers per call).
 */
function makeConfig(input) {
  const defaults = {
    source: null,
    output: null,
    transforms: [],
    splits: [],
    renames: {},
    drop: [],
    filters: [],
    dedup: null,
    mappings: []
  };
  return { ...defaults, ...input };
}
|
|
52
|
+
var MutableManifest;
var init_types = __esm({
  "src/core/types.ts"() {
    /**
     * Mutable log of what a run did: the transform records that were applied
     * and the per-row errors that were reported.
     */
    MutableManifest = class {
      source;
      records = [];
      errors = [];
      createdAt;
      /** @param source - Stored as-is and echoed back by `toDict()`. */
      constructor(source) {
        this.source = source;
        // Creation time is captured once, as an ISO-8601 string.
        this.createdAt = (/* @__PURE__ */ new Date()).toISOString();
      }
      /** Appends one transform record. */
      addRecord(record) {
        this.records.push(record);
      }
      /** Appends one error entry built from the four arguments. */
      addError(column, transform, row, error) {
        this.errors.push({ column, transform, row, error });
      }
      /**
       * Serializes the manifest to a plain object with snake_case keys,
       * including a summary (counts and the distinct columns touched).
       */
      toDict() {
        const records = this.records.map((record) => {
          const { column, transform, affectedRows, totalRows, sampleBefore, sampleAfter } = record;
          return {
            column,
            transform,
            affected_rows: affectedRows,
            total_rows: totalRows,
            sample_before: sampleBefore,
            sample_after: sampleAfter
          };
        });
        const errors = this.errors.map(({ column, transform, row, error }) => ({ column, transform, row, error }));
        const columnsAffected = Array.from(new Set(this.records.map((record) => record.column)));
        return {
          source: this.source,
          created_at: this.createdAt,
          records,
          errors,
          summary: {
            total_transforms: this.records.length,
            total_errors: this.errors.length,
            columns_affected: columnsAffected
          }
        };
      }
    };
  }
});
|
|
98
|
+
|
|
99
|
+
// src/core/transforms/registry.ts
|
|
100
|
+
/**
 * Stores a transform in the registry under `opts.name`, replacing any entry
 * already registered with that name.
 * @param {object} opts - `name` and `inputTypes`, plus optional `autoApply`
 *   (defaults to false), `priority` (defaults to 50) and `mode` (defaults to "series").
 * @param {Function} func - The transform implementation.
 */
function registerTransform(opts, func) {
  const entry = {
    name: opts.name,
    func,
    inputTypes: opts.inputTypes,
    autoApply: opts.autoApply ?? false,
    priority: opts.priority ?? 50,
    mode: opts.mode ?? "series"
  };
  _REGISTRY.set(entry.name, entry);
}
|
|
110
|
+
/**
 * Looks up a registered transform by name.
 * @param {string} name - Registry key.
 * @returns The registry entry, or `undefined` when nothing is registered under `name`.
 */
function getTransform(name) {
  const entry = _REGISTRY.get(name);
  return entry;
}
|
|
113
|
+
/**
 * Returns every registered transform, highest `priority` first.
 * @returns {Array} A new array; the registry itself is not reordered.
 */
function listTransforms() {
  const entries = Array.from(_REGISTRY.values());
  entries.sort((left, right) => right.priority - left.priority);
  return entries;
}
|
|
116
|
+
/**
 * Splits a transform spec on ":" into its name and argument list,
 * e.g. "truncate:10" -> ["truncate", ["10"]].
 * @param {string} raw - Spec string.
 * @returns {[string, string[]]} The first segment and the remaining segments.
 */
function parseTransformName(raw) {
  const [name, ...args] = raw.split(":");
  return [name, args];
}
|
|
120
|
+
var _REGISTRY;
|
|
121
|
+
var init_registry = __esm({
|
|
122
|
+
"src/core/transforms/registry.ts"() {
|
|
123
|
+
_REGISTRY = /* @__PURE__ */ new Map();
|
|
124
|
+
}
|
|
125
|
+
});
|
|
126
|
+
|
|
127
|
+
// src/core/domains/people-hr.ts
|
|
128
|
+
var people_hr_exports = {};
|
|
129
|
+
__export(people_hr_exports, {
|
|
130
|
+
PACK: () => PACK
|
|
131
|
+
});
|
|
132
|
+
/**
 * Validates US Social Security Numbers.
 *
 * Accepts nine digits with optional dashes after the third and fifth digit
 * (surrounding whitespace is ignored). A value is rejected when its area
 * number is 000, 666 or 900-999, its group number is 00, or its serial number
 * is 0000 - none of these are ever issued.
 *
 * @param {Array} values - Column values.
 * @returns {Array} `true`/`false` for each string; non-string values
 *   (including null) are passed through unchanged.
 */
function ssnValidate(values) {
  // Kept local so the function does not depend on module initialization order.
  const pattern = /^(\d{3})-?(\d{2})-?(\d{4})$/;
  return values.map((v) => {
    if (typeof v !== "string") return v;
    const match = pattern.exec(v.trim());
    if (!match) return false;
    const [, area, group, serial] = match;
    if (area === "000" || area === "666" || area[0] === "9") return false;
    return group !== "00" && serial !== "0000";
  });
}
|
|
141
|
+
var SSN_RE, PACK;
|
|
142
|
+
var init_people_hr = __esm({
|
|
143
|
+
"src/core/domains/people-hr.ts"() {
|
|
144
|
+
init_types();
|
|
145
|
+
init_registry();
|
|
146
|
+
SSN_RE = /^(\d{3})-?(\d{2})-?(\d{4})$/;
|
|
147
|
+
registerTransform(
|
|
148
|
+
{ name: "ssn_validate", inputTypes: ["ssn", "string"], priority: 55, mode: "series" },
|
|
149
|
+
ssnValidate
|
|
150
|
+
);
|
|
151
|
+
PACK = {
|
|
152
|
+
name: "people_hr",
|
|
153
|
+
description: "Name parsing, SSN formatting, employment dates, gender/boolean standardization",
|
|
154
|
+
transforms: [
|
|
155
|
+
"split_name",
|
|
156
|
+
"split_name_reverse",
|
|
157
|
+
"strip_titles",
|
|
158
|
+
"strip_suffixes",
|
|
159
|
+
"name_proper",
|
|
160
|
+
"ssn_mask",
|
|
161
|
+
"ssn_validate",
|
|
162
|
+
"date_iso8601",
|
|
163
|
+
"gender_standardize",
|
|
164
|
+
"boolean_normalize"
|
|
165
|
+
],
|
|
166
|
+
defaultConfig: makeConfig({
|
|
167
|
+
transforms: [
|
|
168
|
+
{ column: "name", ops: ["strip", "strip_titles", "title_case"] },
|
|
169
|
+
{ column: "ssn", ops: ["ssn_validate"] },
|
|
170
|
+
{ column: "gender", ops: ["gender_standardize"] },
|
|
171
|
+
{ column: "hire_date", ops: ["date_iso8601"] },
|
|
172
|
+
{ column: "active", ops: ["boolean_normalize"] }
|
|
173
|
+
]
|
|
174
|
+
})
|
|
175
|
+
};
|
|
176
|
+
}
|
|
177
|
+
});
|
|
178
|
+
|
|
179
|
+
// src/core/domains/healthcare.ts
|
|
180
|
+
var healthcare_exports = {};
|
|
181
|
+
__export(healthcare_exports, {
|
|
182
|
+
PACK: () => PACK2
|
|
183
|
+
});
|
|
184
|
+
/**
 * Checks NPI values: after dropping every non-digit character the value must
 * have exactly 10 digits, and "80840" followed by those digits must pass the
 * Luhn checksum.
 * @param {Array} values - Column values.
 * @returns {Array} `true`/`false` for each string; non-strings pass through.
 */
function npiValidate(values) {
  // Luhn: starting from the rightmost digit, double every second digit.
  const passesLuhn = (digits) => {
    let sum = 0;
    let doubleNext = false;
    for (let i = digits.length - 1; i >= 0; i -= 1) {
      let digit = digits.charCodeAt(i) - 48;
      if (doubleNext) {
        digit *= 2;
        if (digit > 9) digit -= 9;
      }
      sum += digit;
      doubleNext = !doubleNext;
    }
    return sum % 10 === 0;
  };
  return values.map((v) => {
    if (typeof v !== "string") return v;
    const digits = v.replace(/\D/g, "");
    return digits.length === 10 && passesLuhn("80840" + digits);
  });
}
|
|
202
|
+
/**
 * Formats codes as upper-case with a single dot after the third character.
 * Existing dots are removed first; codes of three characters or fewer get no dot.
 * @param {Array} values - Column values.
 * @returns {Array} Formatted strings; non-strings pass through.
 */
function icd10Format(values) {
  return values.map((v) => {
    if (typeof v !== "string") return v;
    const compact = v.trim().toUpperCase().replace(/\./g, "");
    if (compact.length <= 3) return compact;
    return `${compact.slice(0, 3)}.${compact.slice(3)}`;
  });
}
|
|
209
|
+
var PACK2;
|
|
210
|
+
var init_healthcare = __esm({
|
|
211
|
+
"src/core/domains/healthcare.ts"() {
|
|
212
|
+
init_types();
|
|
213
|
+
init_registry();
|
|
214
|
+
registerTransform(
|
|
215
|
+
{ name: "npi_validate", inputTypes: ["string"], priority: 50, mode: "series" },
|
|
216
|
+
npiValidate
|
|
217
|
+
);
|
|
218
|
+
registerTransform(
|
|
219
|
+
{ name: "icd10_format", inputTypes: ["string"], priority: 50, mode: "series" },
|
|
220
|
+
icd10Format
|
|
221
|
+
);
|
|
222
|
+
PACK2 = {
|
|
223
|
+
name: "healthcare",
|
|
224
|
+
description: "MRN normalization, ICD-10 formatting, NPI validation, date standardization",
|
|
225
|
+
transforms: ["npi_validate", "icd10_format", "date_iso8601", "null_standardize", "strip"],
|
|
226
|
+
defaultConfig: makeConfig({
|
|
227
|
+
transforms: [
|
|
228
|
+
{ column: "npi", ops: ["npi_validate"] },
|
|
229
|
+
{ column: "icd10_code", ops: ["icd10_format"] },
|
|
230
|
+
{ column: "service_date", ops: ["date_iso8601"] },
|
|
231
|
+
{ column: "patient_name", ops: ["strip", "title_case"] }
|
|
232
|
+
]
|
|
233
|
+
})
|
|
234
|
+
};
|
|
235
|
+
}
|
|
236
|
+
});
|
|
237
|
+
|
|
238
|
+
// src/core/domains/finance.ts
|
|
239
|
+
var finance_exports = {};
|
|
240
|
+
__export(finance_exports, {
|
|
241
|
+
PACK: () => PACK3
|
|
242
|
+
});
|
|
243
|
+
/**
 * Masks account numbers: keeps the last four digits and replaces every other
 * digit with "*". Non-digit characters are dropped from the result. Values
 * with fewer than four digits are returned untouched.
 * @param {Array} values - Column values.
 * @returns {Array} Masked strings; non-strings pass through.
 */
function accountMask(values) {
  return values.map((v) => {
    if (typeof v !== "string") return v;
    const digits = v.replace(/\D/g, "");
    if (digits.length < 4) return v;
    const visible = digits.slice(-4);
    return visible.padStart(digits.length, "*");
  });
}
|
|
251
|
+
/**
 * Trims, upper-cases and cuts each string down to its first nine characters.
 * @param {Array} values - Column values.
 * @returns {Array} Formatted strings; non-strings pass through.
 */
function cusipFormat(values) {
  const format = (text) => text.trim().toUpperCase().substring(0, 9);
  return values.map((v) => (typeof v === "string" ? format(v) : v));
}
|
|
257
|
+
var PACK3;
|
|
258
|
+
var init_finance = __esm({
|
|
259
|
+
"src/core/domains/finance.ts"() {
|
|
260
|
+
init_types();
|
|
261
|
+
init_registry();
|
|
262
|
+
registerTransform(
|
|
263
|
+
{ name: "account_mask", inputTypes: ["string"], priority: 50, mode: "series" },
|
|
264
|
+
accountMask
|
|
265
|
+
);
|
|
266
|
+
registerTransform(
|
|
267
|
+
{ name: "cusip_format", inputTypes: ["string"], priority: 50, mode: "series" },
|
|
268
|
+
cusipFormat
|
|
269
|
+
);
|
|
270
|
+
PACK3 = {
|
|
271
|
+
name: "finance",
|
|
272
|
+
description: "Account masking, currency standardization, CUSIP/ISIN formatting",
|
|
273
|
+
transforms: ["account_mask", "cusip_format", "currency_strip", "date_iso8601"],
|
|
274
|
+
defaultConfig: makeConfig({
|
|
275
|
+
transforms: [
|
|
276
|
+
{ column: "account_number", ops: ["account_mask"] },
|
|
277
|
+
{ column: "amount", ops: ["currency_strip"] },
|
|
278
|
+
{ column: "transaction_date", ops: ["date_iso8601"] }
|
|
279
|
+
]
|
|
280
|
+
})
|
|
281
|
+
};
|
|
282
|
+
}
|
|
283
|
+
});
|
|
284
|
+
|
|
285
|
+
// src/core/domains/ecommerce.ts
|
|
286
|
+
var ecommerce_exports = {};
|
|
287
|
+
__export(ecommerce_exports, {
|
|
288
|
+
PACK: () => PACK4
|
|
289
|
+
});
|
|
290
|
+
/**
 * Trims and upper-cases each string, then removes every character that is not
 * A-Z, 0-9 or "-".
 * @param {Array} values - Column values.
 * @returns {Array} Normalized strings; non-strings pass through.
 */
function skuNormalize(values) {
  const disallowed = /[^A-Z0-9-]/g;
  return values.map((v) => {
    if (typeof v !== "string") return v;
    const upper = v.trim().toUpperCase();
    return upper.replace(disallowed, "");
  });
}
|
|
296
|
+
var PACK4;
|
|
297
|
+
var init_ecommerce = __esm({
|
|
298
|
+
"src/core/domains/ecommerce.ts"() {
|
|
299
|
+
init_types();
|
|
300
|
+
init_registry();
|
|
301
|
+
registerTransform(
|
|
302
|
+
{ name: "sku_normalize", inputTypes: ["string"], priority: 50, mode: "series" },
|
|
303
|
+
skuNormalize
|
|
304
|
+
);
|
|
305
|
+
PACK4 = {
|
|
306
|
+
name: "ecommerce",
|
|
307
|
+
description: "SKU normalization, price cleaning, category standardization",
|
|
308
|
+
transforms: ["sku_normalize", "currency_strip", "category_auto_correct", "strip"],
|
|
309
|
+
defaultConfig: makeConfig({
|
|
310
|
+
transforms: [
|
|
311
|
+
{ column: "sku", ops: ["sku_normalize"] },
|
|
312
|
+
{ column: "price", ops: ["currency_strip"] },
|
|
313
|
+
{ column: "category", ops: ["strip", "title_case"] }
|
|
314
|
+
]
|
|
315
|
+
})
|
|
316
|
+
};
|
|
317
|
+
}
|
|
318
|
+
});
|
|
319
|
+
|
|
320
|
+
// src/core/domains/real-estate.ts
|
|
321
|
+
var real_estate_exports = {};
|
|
322
|
+
__export(real_estate_exports, {
|
|
323
|
+
PACK: () => PACK5
|
|
324
|
+
});
|
|
325
|
+
/**
 * Trims and upper-cases each string value.
 * @param {Array} values - Column values.
 * @returns {Array} Normalized strings; non-strings pass through.
 */
function mlsNormalize(values) {
  return values.map((v) => (typeof v === "string" ? v.trim().toUpperCase() : v));
}
|
|
331
|
+
var PACK5;
|
|
332
|
+
var init_real_estate = __esm({
|
|
333
|
+
"src/core/domains/real-estate.ts"() {
|
|
334
|
+
init_types();
|
|
335
|
+
init_registry();
|
|
336
|
+
registerTransform(
|
|
337
|
+
{ name: "mls_normalize", inputTypes: ["string"], priority: 50, mode: "series" },
|
|
338
|
+
mlsNormalize
|
|
339
|
+
);
|
|
340
|
+
PACK5 = {
|
|
341
|
+
name: "real_estate",
|
|
342
|
+
description: "Address parsing (USPS), MLS ID normalization, price cleaning",
|
|
343
|
+
transforms: ["mls_normalize", "address_standardize", "zip_normalize", "currency_strip"],
|
|
344
|
+
defaultConfig: makeConfig({
|
|
345
|
+
transforms: [
|
|
346
|
+
{ column: "mls_id", ops: ["mls_normalize"] },
|
|
347
|
+
{ column: "address", ops: ["strip", "address_standardize"] },
|
|
348
|
+
{ column: "price", ops: ["currency_strip"] },
|
|
349
|
+
{ column: "zip", ops: ["zip_normalize"] }
|
|
350
|
+
]
|
|
351
|
+
})
|
|
352
|
+
};
|
|
353
|
+
}
|
|
354
|
+
});
|
|
355
|
+
|
|
356
|
+
// src/core/domains/index.ts
|
|
357
|
+
var domains_exports = {};
|
|
358
|
+
__export(domains_exports, {
|
|
359
|
+
listDomains: () => listDomains,
|
|
360
|
+
loadDomain: () => loadDomain
|
|
361
|
+
});
|
|
362
|
+
/**
 * Loads a domain pack by name.
 *
 * The name is lower-cased and "-" / "/" are replaced with "_" before lookup,
 * so "People-HR" and "people_hr" resolve to the same loader.
 *
 * @param {string} name - Domain name as typed by the user.
 * @returns {Promise<object|null>} The module's `PACK`, or `null` when no
 *   loader is registered under the normalized name.
 */
async function loadDomain(name) {
  const key = name.toLowerCase().replace(/[-/]/g, "_");
  // Own-property check: names such as "constructor" or "__proto__" must not
  // resolve to members inherited from Object.prototype.
  if (!Object.prototype.hasOwnProperty.call(DOMAIN_LOADERS, key)) return null;
  const mod = await DOMAIN_LOADERS[key]();
  return mod.PACK;
}
|
|
369
|
+
/**
 * Lists the names of all registered domain loaders.
 * @returns {string[]} The loader table's own keys, in insertion order.
 */
function listDomains() {
  const names = [];
  for (const name of Object.keys(DOMAIN_LOADERS)) {
    names.push(name);
  }
  return names;
}
|
|
372
|
+
var DOMAIN_LOADERS;
|
|
373
|
+
var init_domains = __esm({
|
|
374
|
+
"src/core/domains/index.ts"() {
|
|
375
|
+
DOMAIN_LOADERS = {
|
|
376
|
+
people_hr: () => Promise.resolve().then(() => (init_people_hr(), people_hr_exports)),
|
|
377
|
+
healthcare: () => Promise.resolve().then(() => (init_healthcare(), healthcare_exports)),
|
|
378
|
+
finance: () => Promise.resolve().then(() => (init_finance(), finance_exports)),
|
|
379
|
+
ecommerce: () => Promise.resolve().then(() => (init_ecommerce(), ecommerce_exports)),
|
|
380
|
+
real_estate: () => Promise.resolve().then(() => (init_real_estate(), real_estate_exports))
|
|
381
|
+
};
|
|
382
|
+
}
|
|
383
|
+
});
|
|
384
|
+
/**
 * Converts a raw CSV cell to a boolean or number where that is lossless,
 * otherwise returns the string unchanged.
 *
 * - "true"/"True"/"TRUE" and "false"/"False"/"FALSE" become booleans.
 * - Values with leading or trailing whitespace stay strings.
 * - Values with a leading zero not followed by "." stay strings ("007", "0x1F").
 * - Integer literals outside the safe-integer range stay strings, because
 *   converting them to a Number would silently change their digits (long
 *   identifiers, account numbers).
 * - Anything else that `Number()` parses to a finite value becomes a number.
 *
 * @param {string} raw - Cell text.
 * @returns {boolean|number|string}
 */
function coerceValue(raw) {
  if (raw === "true" || raw === "True" || raw === "TRUE") return true;
  if (raw === "false" || raw === "False" || raw === "FALSE") return false;
  if (raw.length === 0 || raw !== raw.trim()) return raw;
  if (raw.length > 1 && raw[0] === "0" && raw[1] !== ".") return raw;
  const n = Number(raw);
  if (!Number.isFinite(n)) return raw;
  // Keep integers the Number type cannot represent exactly as text.
  if (/^[+-]?\d+$/.test(raw) && !Number.isSafeInteger(n)) return raw;
  return n;
}
|
|
394
|
+
/**
 * Parses CSV text into an array of row objects keyed by the header row.
 *
 * - A leading UTF-8 byte-order mark is ignored, so it does not become part of
 *   the first header name.
 * - Records are split on LF / CRLF only outside double quotes, so quoted
 *   fields may span several lines.
 * - Records that are empty or whitespace-only are skipped.
 * - Empty cells (and cells missing from short records) become `null`; all
 *   other cells go through `coerceValue`.
 *
 * @param {string} content - Full CSV text.
 * @returns {object[]} One object per data record; `[]` when there is no header.
 */
function parseCsv(content) {
  const text = content.charCodeAt(0) === 0xfeff ? content.slice(1) : content;
  // Split into records, tracking quote state so quoted newlines stay inside
  // their field. An escaped quote ("") toggles twice and nets out.
  const records = [];
  let start = 0;
  let inQuotes = false;
  for (let i = 0; i < text.length; i++) {
    const ch = text[i];
    if (ch === '"') {
      inQuotes = !inQuotes;
    } else if (ch === "\n" && !inQuotes) {
      const record = text.slice(start, i);
      records.push(record.endsWith("\r") ? record.slice(0, -1) : record);
      start = i + 1;
    }
  }
  records.push(text.slice(start));
  const lines = records.filter((record) => record.trim());
  if (lines.length === 0) return [];
  const headers = parseCsvLine(lines[0]);
  return lines.slice(1).map((line) => {
    const values = parseCsvLine(line);
    const row = {};
    headers.forEach((header, j) => {
      const raw = values[j] ?? "";
      row[header] = raw === "" ? null : coerceValue(raw);
    });
    return row;
  });
}
|
|
410
|
+
/**
 * Splits one CSV record into its fields.
 *
 * A double quote switches quoted mode on; inside quoted mode a doubled quote
 * ("") yields a literal quote and a single quote switches quoted mode off.
 * Commas separate fields only outside quoted mode.
 *
 * @param {string} line - One record, without its line terminator.
 * @returns {string[]} The fields; always at least one element.
 */
function parseCsvLine(line) {
  const fields = [];
  let field = "";
  let quoted = false;
  let pos = 0;
  while (pos < line.length) {
    const ch = line[pos];
    pos += 1;
    if (quoted) {
      if (ch !== '"') {
        field += ch;
      } else if (line[pos] === '"') {
        // Escaped quote: emit one and skip its partner.
        field += '"';
        pos += 1;
      } else {
        quoted = false;
      }
      continue;
    }
    switch (ch) {
      case '"':
        quoted = true;
        break;
      case ",":
        fields.push(field);
        field = "";
        break;
      default:
        field += ch;
    }
  }
  fields.push(field);
  return fields;
}
|
|
441
|
+
/**
 * Serializes row objects to CSV text.
 *
 * Columns are the keys of the first row, in that order. Header names and
 * cells are quoted when they contain a comma, a double quote, CR or LF;
 * embedded quotes are doubled. `null` and `undefined` cells are written empty.
 *
 * @param {object[]} rows - Rows to write.
 * @returns {string} CSV with "\n" line endings and a trailing newline, or ""
 *   when `rows` is empty.
 */
function rowsToCsv(rows) {
  if (rows.length === 0) return "";
  const escapeField = (value) => {
    if (value === null || value === void 0) return "";
    const text = String(value);
    return /[",\r\n]/.test(text) ? `"${text.replace(/"/g, '""')}"` : text;
  };
  const headers = Object.keys(rows[0]);
  // Headers go through the same escaping as cells so a name containing a
  // delimiter cannot shift every column.
  const lines = [headers.map(escapeField).join(",")];
  for (const row of rows) {
    lines.push(headers.map((header) => escapeField(row[header])).join(","));
  }
  return lines.join("\n") + "\n";
}
|
|
459
|
+
/**
 * Reads a data file, choosing the parser from the (case-insensitive) extension.
 * @param {string} path - File to read as UTF-8.
 * @returns The parsed JSON value for ".json"; the `parseCsv` result for ".csv".
 * @throws {Error} For any other extension, before the file is opened.
 */
function readFile(path) {
  const ext = extname(path).toLowerCase();
  switch (ext) {
    case ".json":
      return JSON.parse(readFileSync(path, "utf-8"));
    case ".csv":
      return parseCsv(readFileSync(path, "utf-8"));
    default:
      throw new Error(`Unsupported file format: ${ext}. Supported: .csv, .json`);
  }
}
|
|
471
|
+
/**
 * Writes rows to `path` as JSON (2-space indented) or CSV, chosen from the
 * (case-insensitive) extension. Missing parent directories are created.
 *
 * The extension is validated and the payload serialized before anything
 * touches the file system, so a rejected call leaves no directory behind.
 *
 * @param {object[]} rows - Rows to write.
 * @param {string} path - Destination file.
 * @throws {Error} When the extension is neither ".json" nor ".csv".
 */
function writeFile(rows, path) {
  const ext = extname(path).toLowerCase();
  if (ext !== ".json" && ext !== ".csv") {
    throw new Error(`Unsupported file format: ${ext}. Supported: .csv, .json`);
  }
  const payload = ext === ".json" ? JSON.stringify(rows, null, 2) : rowsToCsv(rows);
  mkdirSync(dirname(path), { recursive: true });
  writeFileSync(path, payload);
}
|
|
484
|
+
|
|
485
|
+
// src/core/engine/transformer.ts
|
|
486
|
+
init_types();
|
|
487
|
+
|
|
488
|
+
// src/core/transforms/text.ts
|
|
489
|
+
init_registry();
|
|
490
|
+
/**
 * Applies `fn` to every string in `values`; every other value (null, numbers,
 * booleans, ...) is copied through untouched.
 * @param {Array} values - Column values.
 * @param {(s: string) => *} fn - Called with each string value.
 * @returns {Array} A new array of the same length.
 */
function mapStrings(values, fn) {
  return values.map((value) => (typeof value === "string" ? fn(value) : value));
}
|
|
496
|
+
/**
 * Removes leading and trailing whitespace from each string value.
 * @param {Array} values - Column values.
 * @returns {Array} Trimmed strings; non-strings pass through.
 */
function strip(values) {
  const trimOne = (text) => text.trim();
  return mapStrings(values, trimOne);
}
|
|
499
|
+
registerTransform(
|
|
500
|
+
{ name: "strip", inputTypes: ["string"], autoApply: true, priority: 90, mode: "expr" },
|
|
501
|
+
strip
|
|
502
|
+
);
|
|
503
|
+
/**
 * Lower-cases each string value.
 * @param {Array} values - Column values.
 * @returns {Array} Lower-cased strings; non-strings pass through.
 */
function lowercase(values) {
  const lowerOne = (text) => text.toLowerCase();
  return mapStrings(values, lowerOne);
}
|
|
506
|
+
registerTransform(
|
|
507
|
+
{ name: "lowercase", inputTypes: ["string"], priority: 50, mode: "expr" },
|
|
508
|
+
lowercase
|
|
509
|
+
);
|
|
510
|
+
/**
 * Upper-cases each string value.
 * @param {Array} values - Column values.
 * @returns {Array} Upper-cased strings; non-strings pass through.
 */
function uppercase(values) {
  const upperOne = (text) => text.toUpperCase();
  return mapStrings(values, upperOne);
}
|
|
513
|
+
registerTransform(
|
|
514
|
+
{ name: "uppercase", inputTypes: ["string"], priority: 50, mode: "expr" },
|
|
515
|
+
uppercase
|
|
516
|
+
);
|
|
517
|
+
/**
 * Title-cases each string: lower-cases everything, then upper-cases the first
 * letter of every word.
 *
 * Word detection is Unicode-aware: a letter starts a word when it is not
 * preceded by a letter, combining mark, digit or underscore. Accented letters
 * therefore no longer act as word breaks ("maría" -> "María", not "MaríA").
 * As before, a letter following an apostrophe is capitalized ("o'neil" -> "O'Neil").
 *
 * @param {Array} values - Column values.
 * @returns {Array} Title-cased strings; non-strings pass through.
 */
function titleCase(values) {
  const wordStart = /(?<![\p{L}\p{M}\p{N}_])\p{L}/gu;
  return mapStrings(
    values,
    (s) => s.toLowerCase().replace(wordStart, (ch) => ch.toUpperCase())
  );
}
|
|
523
|
+
registerTransform(
|
|
524
|
+
{ name: "title_case", inputTypes: ["string"], priority: 50, mode: "expr" },
|
|
525
|
+
titleCase
|
|
526
|
+
);
|
|
527
|
+
/**
 * Applies NFKD normalization to each string and then deletes every combining
 * mark, e.g. "é" becomes "e".
 * @param {Array} values - Column values.
 * @returns {Array} Normalized strings; non-strings pass through.
 */
function normalizeUnicode(values) {
  const combiningMarks = /\p{M}/gu;
  const normalizeOne = (text) => {
    const decomposed = text.normalize("NFKD");
    return decomposed.replace(combiningMarks, "");
  };
  return mapStrings(values, normalizeOne);
}
|
|
533
|
+
registerTransform(
|
|
534
|
+
{ name: "normalize_unicode", inputTypes: ["string"], autoApply: true, priority: 85, mode: "series" },
|
|
535
|
+
normalizeUnicode
|
|
536
|
+
);
|
|
537
|
+
/**
 * Removes punctuation and symbols from each string, keeping letters,
 * combining marks, digits, underscores and whitespace.
 *
 * The character classes are Unicode-aware, so non-ASCII letters and digits
 * ("é", "ñ", "日本語") are kept rather than deleted along with punctuation.
 *
 * @param {Array} values - Column values.
 * @returns {Array} Cleaned strings; non-strings pass through.
 */
function removePunctuation(values) {
  const punctuation = /[^\p{L}\p{M}\p{N}_\s]/gu;
  return mapStrings(values, (s) => s.replace(punctuation, ""));
}
|
|
540
|
+
registerTransform(
|
|
541
|
+
{ name: "remove_punctuation", inputTypes: ["string"], priority: 40, mode: "series" },
|
|
542
|
+
removePunctuation
|
|
543
|
+
);
|
|
544
|
+
/**
 * Collapses every run of whitespace in a string to a single space and removes
 * leading and trailing whitespace.
 * @param {Array} values - Column values.
 * @returns {Array} Collapsed strings; non-strings pass through.
 */
function collapseWhitespace(values) {
  const collapseOne = (text) => text.split(/\s+/).filter((word) => word !== "").join(" ");
  return mapStrings(values, collapseOne);
}
|
|
547
|
+
registerTransform(
|
|
548
|
+
{ name: "collapse_whitespace", inputTypes: ["string"], autoApply: true, priority: 80, mode: "expr" },
|
|
549
|
+
collapseWhitespace
|
|
550
|
+
);
|
|
551
|
+
/**
 * Cuts each string to at most `n` UTF-16 code units via `slice(0, n)`.
 * @param {Array} values - Column values.
 * @param {number|string} [n=255] - Maximum length. Non-number arguments are
 *   converted with `Number()`; a result of 0 or NaN falls back to 255.
 * @returns {Array} Truncated strings; non-strings pass through.
 */
function truncate(values, n = 255) {
  let limit;
  if (typeof n === "number") {
    limit = n;
  } else {
    limit = Number(n) || 255;
  }
  const cut = (text) => text.slice(0, limit);
  return mapStrings(values, cut);
}
|
|
555
|
+
registerTransform(
|
|
556
|
+
{ name: "truncate", inputTypes: ["string"], priority: 30, mode: "series" },
|
|
557
|
+
truncate
|
|
558
|
+
);
|
|
559
|
+
/**
 * Replaces typographic quotes with ASCII ones: U+2018-U+201B become "'" and
 * U+201C-U+201F become '"'.
 * @param {Array} values - Column values.
 * @returns {Array} Strings with plain quotes; non-strings pass through.
 */
function normalizeQuotes(values) {
  const singleQuotes = /[\u2018\u2019\u201A\u201B]/g;
  const doubleQuotes = /[\u201C\u201D\u201E\u201F]/g;
  return mapStrings(values, (text) => {
    const withSingles = text.replace(singleQuotes, "'");
    return withSingles.replace(doubleQuotes, '"');
  });
}
|
|
565
|
+
registerTransform(
|
|
566
|
+
{ name: "normalize_quotes", inputTypes: ["string"], autoApply: true, priority: 84, mode: "series" },
|
|
567
|
+
normalizeQuotes
|
|
568
|
+
);
|
|
569
|
+
/**
 * Deletes every "<...>" span (a "<", any run of characters other than ">",
 * then ">") from each string.
 * @param {Array} values - Column values.
 * @returns {Array} Strings without those spans; non-strings pass through.
 */
function removeHtmlTags(values) {
  const tag = /<[^>]*>/g;
  const stripTags = (text) => text.replace(tag, "");
  return mapStrings(values, stripTags);
}
|
|
572
|
+
registerTransform(
|
|
573
|
+
{ name: "remove_html_tags", inputTypes: ["string"], priority: 45, mode: "series" },
|
|
574
|
+
removeHtmlTags
|
|
575
|
+
);
|
|
576
|
+
/**
 * Deletes http:// and https:// URLs (up to the next whitespace) from each
 * string, then trims the result.
 * @param {Array} values - Column values.
 * @returns {Array} Strings without URLs; non-strings pass through.
 */
function removeUrls(values) {
  const url = /https?:\/\/[^\s]+/g;
  return mapStrings(values, (text) => {
    const withoutUrls = text.replace(url, "");
    return withoutUrls.trim();
  });
}
|
|
582
|
+
registerTransform(
|
|
583
|
+
{ name: "remove_urls", inputTypes: ["string"], priority: 40, mode: "series" },
|
|
584
|
+
removeUrls
|
|
585
|
+
);
|
|
586
|
+
/**
 * Deletes every ASCII digit (0-9) from each string.
 * @param {Array} values - Column values.
 * @returns {Array} Strings without digits; non-strings pass through.
 */
function removeDigits(values) {
  const digit = /\d/g;
  const dropDigits = (text) => text.replace(digit, "");
  return mapStrings(values, dropDigits);
}
|
|
589
|
+
registerTransform(
|
|
590
|
+
{ name: "remove_digits", inputTypes: ["string"], priority: 35, mode: "series" },
|
|
591
|
+
removeDigits
|
|
592
|
+
);
|
|
593
|
+
/**
 * Left-pads each string to `width` using `char`.
 * @param {Array} values - Column values.
 * @param {number|string} [width=10] - Target length. Non-number arguments are
 *   converted with `Number()`; a result of 0 or NaN falls back to 10.
 * @param {string} [char="0"] - Padding text; any non-string falls back to "0".
 * @returns {Array} Padded strings; non-strings pass through.
 */
function padLeft(values, width = 10, char = "0") {
  let targetLength = width;
  if (typeof width !== "number") {
    targetLength = Number(width) || 10;
  }
  let filler = "0";
  if (typeof char === "string") {
    filler = char;
  }
  return mapStrings(values, (text) => text.padStart(targetLength, filler));
}
|
|
598
|
+
registerTransform(
|
|
599
|
+
{ name: "pad_left", inputTypes: ["string"], priority: 30, mode: "series" },
|
|
600
|
+
padLeft
|
|
601
|
+
);
|
|
602
|
+
/**
 * Right-pads each string to `width` using `char`.
 * @param {Array} values - Column values.
 * @param {number|string} [width=10] - Target length. Non-number arguments are
 *   converted with `Number()`; a result of 0 or NaN falls back to 10.
 * @param {string} [char=" "] - Padding text; any non-string falls back to " ".
 * @returns {Array} Padded strings; non-strings pass through.
 */
function padRight(values, width = 10, char = " ") {
  let targetLength = width;
  if (typeof width !== "number") {
    targetLength = Number(width) || 10;
  }
  let filler = " ";
  if (typeof char === "string") {
    filler = char;
  }
  return mapStrings(values, (text) => text.padEnd(targetLength, filler));
}
|
|
607
|
+
registerTransform(
|
|
608
|
+
{ name: "pad_right", inputTypes: ["string"], priority: 30, mode: "series" },
|
|
609
|
+
padRight
|
|
610
|
+
);
|
|
611
|
+
/**
 * Deletes code points in a fixed set of ranges (listed in the pattern below)
 * from each string; the set includes U+200D, U+20E3 and U+FE00-U+FE0F.
 * @param {Array} values - Column values.
 * @returns {Array} Strings without those code points; non-strings pass through.
 */
function removeEmojis(values) {
  const stripEmojis = (text) => text.replace(
    /[\u{1F600}-\u{1F64F}\u{1F300}-\u{1F5FF}\u{1F680}-\u{1F6FF}\u{1F1E0}-\u{1F1FF}\u{2600}-\u{26FF}\u{2700}-\u{27BF}\u{FE00}-\u{FE0F}\u{1F900}-\u{1F9FF}\u{1FA00}-\u{1FA6F}\u{1FA70}-\u{1FAFF}\u{200D}\u{20E3}\u{E0020}-\u{E007F}]/gu,
    ""
  );
  return mapStrings(values, stripEmojis);
}
|
|
615
|
+
registerTransform(
|
|
616
|
+
{ name: "remove_emojis", inputTypes: ["string"], priority: 38, mode: "series" },
|
|
617
|
+
removeEmojis
|
|
618
|
+
);
|
|
619
|
+
/**
 * Repairs strings whose UTF-8 bytes were mis-decoded one byte per character.
 *
 * Each character's code (0-255) is treated as a byte and the byte sequence is
 * decoded as strict UTF-8. The original string is returned unchanged when any
 * character has a code above 255 or when the bytes are not valid UTF-8.
 *
 * @param {Array} values - Column values.
 * @returns {Array} Repaired strings; non-strings pass through.
 */
function fixMojibake(values) {
  // One strict decoder serves the whole column; each non-streaming decode()
  // call starts from a clean state.
  const decoder = new TextDecoder("utf-8", { fatal: true });
  return mapStrings(values, (s) => {
    const bytes = new Uint8Array(s.length);
    for (let i = 0; i < s.length; i++) {
      const code = s.charCodeAt(i);
      if (code > 255) return s;
      bytes[i] = code;
    }
    try {
      return decoder.decode(bytes);
    } catch {
      // Not valid UTF-8, so the text was not mojibake of this kind.
      return s;
    }
  });
}
|
|
636
|
+
registerTransform(
|
|
637
|
+
{ name: "fix_mojibake", inputTypes: ["string"], priority: 86, mode: "series" },
|
|
638
|
+
fixMojibake
|
|
639
|
+
);
|
|
640
|
+
/**
 * Converts CRLF and lone CR line endings to LF in each string.
 * @param {Array} values - Column values.
 * @returns {Array} Strings with "\n" line endings; non-strings pass through.
 */
function normalizeLineEndings(values) {
  // A CR optionally followed by LF is one line break.
  const lineBreak = /\r\n?/g;
  return mapStrings(values, (text) => text.replace(lineBreak, "\n"));
}
|
|
643
|
+
registerTransform(
|
|
644
|
+
{ name: "normalize_line_endings", inputTypes: ["string"], priority: 82, mode: "series" },
|
|
645
|
+
normalizeLineEndings
|
|
646
|
+
);
|
|
647
|
+
/**
 * Pulls every number (optional leading "-", digits, optional decimal part)
 * out of each string and joins them with single spaces. Strings without any
 * number become "".
 * @param {Array} values - Column values.
 * @returns {Array} Space-joined numbers; non-strings pass through.
 */
function extractNumbers(values) {
  const numberPattern = /-?\d+(?:\.\d+)?/g;
  return mapStrings(values, (text) => {
    const found = text.match(numberPattern) ?? [];
    return found.join(" ");
  });
}
|
|
653
|
+
registerTransform(
|
|
654
|
+
{ name: "extract_numbers", inputTypes: ["string"], priority: 30, mode: "series" },
|
|
655
|
+
extractNumbers
|
|
656
|
+
);
|
|
657
|
+
|
|
658
|
+
// src/core/transforms/phone.ts
|
|
659
|
+
init_registry();
|
|
660
|
+
/**
 * Returns only the ASCII digits of `s`, in their original order.
 * @param {string} s - Input text.
 * @returns {string} The digits, or "" when there are none.
 */
function extractDigits(s) {
  const digits = s.match(/\d/g) ?? [];
  return digits.join("");
}
|
|
663
|
+
/**
 * Reduces a phone string to its 10 national digits.
 * @param {string} s - Phone text in any punctuation style.
 * @returns {string|null} The 10 digits when the text holds exactly 10 digits,
 *   or 11 digits starting with "1" (the leading "1" is dropped); otherwise null.
 */
function normalizeUsDigits(s) {
  const digits = extractDigits(s);
  switch (digits.length) {
    case 10:
      return digits;
    case 11:
      return digits[0] === "1" ? digits.slice(1) : null;
    default:
      return null;
  }
}
|
|
669
|
+
/**
 * Formats phone strings as "+1" followed by the 10 national digits.
 * Strings that `normalizeUsDigits` rejects are returned unchanged.
 * @param {Array} values - Column values.
 * @returns {Array} Formatted strings; non-strings pass through.
 */
function phoneE164(values) {
  return values.map((v) => {
    if (typeof v !== "string") return v;
    const national = normalizeUsDigits(v);
    return national === null ? v : "+1" + national;
  });
}
|
|
677
|
+
registerTransform(
|
|
678
|
+
{ name: "phone_e164", inputTypes: ["phone"], autoApply: true, priority: 50, mode: "series" },
|
|
679
|
+
phoneE164
|
|
680
|
+
);
|
|
681
|
+
/**
 * Formats phone strings as "(AAA) BBB-CCCC".
 * Strings that `normalizeUsDigits` rejects are returned unchanged.
 * @param {Array} values - Column values.
 * @returns {Array} Formatted strings; non-strings pass through.
 */
function phoneNational(values) {
  return values.map((v) => {
    if (typeof v !== "string") return v;
    const national = normalizeUsDigits(v);
    if (national === null) return v;
    const area = national.slice(0, 3);
    const exchange = national.slice(3, 6);
    const line = national.slice(6);
    return `(${area}) ${exchange}-${line}`;
  });
}
|
|
689
|
+
registerTransform(
|
|
690
|
+
{ name: "phone_national", inputTypes: ["phone"], priority: 50, mode: "series" },
|
|
691
|
+
phoneNational
|
|
692
|
+
);
|
|
693
|
+
/**
 * Reduces each phone string to its digits (via `extractDigits`), whatever
 * their count.
 * @param {Array} values - Column values.
 * @returns {Array} Digit-only strings; non-strings pass through.
 */
function phoneDigits(values) {
  return values.map((v) => (typeof v === "string" ? extractDigits(v) : v));
}
|
|
699
|
+
registerTransform(
|
|
700
|
+
{ name: "phone_digits", inputTypes: ["phone"], priority: 50, mode: "series" },
|
|
701
|
+
phoneDigits
|
|
702
|
+
);
|
|
703
|
+
/**
 * Reports whether each phone string holds exactly 10 digits, or 11 digits
 * starting with "1" - the same acceptance rule as `normalizeUsDigits`.
 * @param {Array} values - Column values.
 * @returns {Array} `true`/`false` for each string; non-strings pass through.
 */
function phoneValidate(values) {
  return values.map((v) => {
    if (typeof v !== "string") return v;
    return normalizeUsDigits(v) !== null;
  });
}
|
|
710
|
+
registerTransform(
|
|
711
|
+
{ name: "phone_validate", inputTypes: ["phone"], priority: 60, mode: "series" },
|
|
712
|
+
phoneValidate
|
|
713
|
+
);
|
|
714
|
+
/**
 * Returns the number 1 for each phone string that holds exactly 10 digits, or
 * 11 digits starting with "1" (the rule `normalizeUsDigits` applies), and
 * null for every other string.
 * @param {Array} values - Column values.
 * @returns {Array} `1` or `null` for each string; non-strings pass through.
 */
function phoneCountryCode(values) {
  return values.map((v) => {
    if (typeof v !== "string") return v;
    return normalizeUsDigits(v) === null ? null : 1;
  });
}
|
|
723
|
+
registerTransform(
|
|
724
|
+
{ name: "phone_country_code", inputTypes: ["phone"], priority: 45, mode: "series" },
|
|
725
|
+
phoneCountryCode
|
|
726
|
+
);
|
|
727
|
+
|
|
728
|
+
// src/core/transforms/names.ts
|
|
729
|
+
init_registry();
|
|
730
|
+
/**
 * Applies `fn` to every string in `values`; every other value (null, numbers,
 * booleans, ...) is copied through untouched.
 * @param {Array} values - Column values.
 * @param {(s: string) => *} fn - Called with each string value.
 * @returns {Array} A new array of the same length.
 */
function mapStrings2(values, fn) {
  return values.map((value) => (typeof value === "string" ? fn(value) : value));
}
|
|
736
|
+
var _TITLES = /^(Mr\.?|Mrs\.?|Ms\.?|Miss\.?|Dr\.?|Prof\.?|Rev\.?|Sr\.?|Sra\.?)\s+/i;
|
|
737
|
+
var _SUFFIXES = /\s+(Jr\.?|Sr\.?|II|III|IV|MD|PhD|PharmD|DDS|DVM|Esq\.?|CPA|RN|DO)$/i;
|
|
738
|
+
var _INITIAL_PATTERN = /\b[A-Z]\.\s/;
|
|
739
|
+
var _MC_PATTERN = /\bMc(\w)/g;
|
|
740
|
+
var _O_PATTERN = /\bO'(\w)/g;
|
|
741
|
+
var _NICKNAMES = {
|
|
742
|
+
bob: "Robert",
|
|
743
|
+
rob: "Robert",
|
|
744
|
+
robby: "Robert",
|
|
745
|
+
robbie: "Robert",
|
|
746
|
+
bobby: "Robert",
|
|
747
|
+
bill: "William",
|
|
748
|
+
billy: "William",
|
|
749
|
+
will: "William",
|
|
750
|
+
willy: "William",
|
|
751
|
+
jim: "James",
|
|
752
|
+
jimmy: "James",
|
|
753
|
+
jamie: "James",
|
|
754
|
+
mike: "Michael",
|
|
755
|
+
mikey: "Michael",
|
|
756
|
+
mick: "Michael",
|
|
757
|
+
dick: "Richard",
|
|
758
|
+
rick: "Richard",
|
|
759
|
+
rich: "Richard",
|
|
760
|
+
ricky: "Richard",
|
|
761
|
+
tom: "Thomas",
|
|
762
|
+
tommy: "Thomas",
|
|
763
|
+
joe: "Joseph",
|
|
764
|
+
joey: "Joseph",
|
|
765
|
+
jack: "John",
|
|
766
|
+
johnny: "John",
|
|
767
|
+
jon: "Jonathan",
|
|
768
|
+
dave: "David",
|
|
769
|
+
davy: "David",
|
|
770
|
+
steve: "Steven",
|
|
771
|
+
stevie: "Steven",
|
|
772
|
+
dan: "Daniel",
|
|
773
|
+
danny: "Daniel",
|
|
774
|
+
pat: "Patrick",
|
|
775
|
+
patty: "Patricia",
|
|
776
|
+
patsy: "Patricia",
|
|
777
|
+
chris: "Christopher",
|
|
778
|
+
kit: "Christopher",
|
|
779
|
+
tony: "Anthony",
|
|
780
|
+
ed: "Edward",
|
|
781
|
+
eddie: "Edward",
|
|
782
|
+
ted: "Edward",
|
|
783
|
+
teddy: "Edward",
|
|
784
|
+
al: "Albert",
|
|
785
|
+
bert: "Albert",
|
|
786
|
+
charlie: "Charles",
|
|
787
|
+
chuck: "Charles",
|
|
788
|
+
sam: "Samuel",
|
|
789
|
+
sammy: "Samuel",
|
|
790
|
+
ben: "Benjamin",
|
|
791
|
+
benny: "Benjamin",
|
|
792
|
+
matt: "Matthew",
|
|
793
|
+
andy: "Andrew",
|
|
794
|
+
drew: "Andrew",
|
|
795
|
+
nick: "Nicholas",
|
|
796
|
+
alex: "Alexander",
|
|
797
|
+
liz: "Elizabeth",
|
|
798
|
+
beth: "Elizabeth",
|
|
799
|
+
betty: "Elizabeth",
|
|
800
|
+
kate: "Katherine",
|
|
801
|
+
kathy: "Katherine",
|
|
802
|
+
katie: "Katherine",
|
|
803
|
+
sue: "Susan",
|
|
804
|
+
susie: "Susan",
|
|
805
|
+
meg: "Margaret",
|
|
806
|
+
maggie: "Margaret",
|
|
807
|
+
peggy: "Margaret",
|
|
808
|
+
jenny: "Jennifer",
|
|
809
|
+
jen: "Jennifer",
|
|
810
|
+
debbie: "Deborah",
|
|
811
|
+
deb: "Deborah",
|
|
812
|
+
barb: "Barbara",
|
|
813
|
+
cindy: "Cynthia",
|
|
814
|
+
sandy: "Sandra"
|
|
815
|
+
};
|
|
816
|
+
/**
 * Split a full-name column into `first_name` / `last_name` fields.
 *
 * The last whitespace-delimited token becomes `last_name`; everything before
 * it becomes `first_name`. A single-token (or empty) value yields an empty
 * `last_name`. Non-string cells yield `null` for both fields.
 *
 * @param {Array<Object>} rows - Row objects; they are copied, not mutated.
 * @param {string} column - Key of the cell holding the full name.
 * @returns {Array<Object>} New rows with `first_name` and `last_name` set.
 */
function splitName(rows, column) {
  return rows.map((row) => {
    const val = row[column];
    if (typeof val !== "string") {
      return { ...row, first_name: null, last_name: null };
    }
    const trimmed = val.trim();
    // Index where the final token starts; 0 for a single token, -1 for "".
    const lastTokenStart = trimmed.search(/\S+$/);
    if (lastTokenStart <= 0) {
      return { ...row, first_name: trimmed, last_name: "" };
    }
    return {
      ...row,
      // trimEnd drops the whole separator run, so "John  Smith" gives "John"
      // rather than "John " and tab-separated names are split as well.
      first_name: trimmed.slice(0, lastTokenStart).trimEnd(),
      last_name: trimmed.slice(lastTokenStart)
    };
  });
}
|
|
834
|
+
registerTransform(
|
|
835
|
+
{ name: "split_name", inputTypes: ["name"], priority: 50, mode: "dataframe" },
|
|
836
|
+
splitName
|
|
837
|
+
);
|
|
838
|
+
/**
 * Split a "Last, First" column into `first_name` / `last_name` fields.
 *
 * Only the first comma is treated as the separator. A value without a comma
 * is returned whole (trimmed) as `first_name` with an empty `last_name`.
 * Non-string cells yield `null` for both fields.
 *
 * @param {Array<Object>} rows - Row objects; they are copied, not mutated.
 * @param {string} column - Key of the cell holding the name.
 * @returns {Array<Object>} New rows with the two name fields set.
 */
function splitNameReverse(rows, column) {
  const toNameFields = (raw) => {
    if (typeof raw !== "string") {
      return { first_name: null, last_name: null };
    }
    const comma = raw.indexOf(",");
    if (comma < 0) {
      return { first_name: raw.trim(), last_name: "" };
    }
    // Key order (last_name first) is kept as in the comma branch of the original.
    return {
      last_name: raw.slice(0, comma).trim(),
      first_name: raw.slice(comma + 1).trim()
    };
  };
  return rows.map((row) => ({ ...row, ...toNameFields(row[column]) }));
}
|
|
855
|
+
registerTransform(
|
|
856
|
+
{ name: "split_name_reverse", inputTypes: ["name"], priority: 50, mode: "dataframe" },
|
|
857
|
+
splitNameReverse
|
|
858
|
+
);
|
|
859
|
+
/**
 * Remove whatever `_TITLES` matches from each string value, then trim.
 * Per-value dispatch is delegated to `mapStrings2`.
 *
 * @param {Array} values - Column values.
 * @returns {Array} Values as returned by `mapStrings2`.
 */
function stripTitles(values) {
  const withoutTitle = (text) => {
    const stripped = text.replace(_TITLES, "");
    return stripped.trim();
  };
  return mapStrings2(values, (s) => withoutTitle(s));
}
|
|
862
|
+
registerTransform(
|
|
863
|
+
{ name: "strip_titles", inputTypes: ["name"], autoApply: true, priority: 70, mode: "series" },
|
|
864
|
+
stripTitles
|
|
865
|
+
);
|
|
866
|
+
/**
 * Remove whatever `_SUFFIXES` matches from each string value, then trim.
 * Per-value dispatch is delegated to `mapStrings2`.
 *
 * @param {Array} values - Column values.
 * @returns {Array} Values as returned by `mapStrings2`.
 */
function stripSuffixes(values) {
  const withoutSuffix = (text) => {
    const stripped = text.replace(_SUFFIXES, "");
    return stripped.trim();
  };
  return mapStrings2(values, (s) => withoutSuffix(s));
}
|
|
869
|
+
registerTransform(
|
|
870
|
+
{ name: "strip_suffixes", inputTypes: ["name"], priority: 60, mode: "series" },
|
|
871
|
+
stripSuffixes
|
|
872
|
+
);
|
|
873
|
+
/**
 * Title-case each string value, then re-capitalise the letter captured by
 * `_MC_PATTERN` (emitting `Mc<Letter>`) and by `_O_PATTERN` (emitting
 * `O'<Letter>`), in that order.
 *
 * @param {Array} values - Column values.
 * @returns {Array} Values as returned by `mapStrings2`.
 */
function nameProper(values) {
  // Lower-case everything, then upper-case the first word character of each word.
  const capitaliseWords = (text) =>
    text.toLowerCase().replace(/\b\w/g, (ch) => ch.toUpperCase());
  const fixMc = (text) =>
    text.replace(_MC_PATTERN, (_match, letter) => `Mc${letter.toUpperCase()}`);
  const fixO = (text) =>
    text.replace(_O_PATTERN, (_match, letter) => `O'${letter.toUpperCase()}`);
  return mapStrings2(values, (s) => fixO(fixMc(capitaliseWords(s))));
}
|
|
881
|
+
registerTransform(
|
|
882
|
+
{ name: "name_proper", inputTypes: ["name"], priority: 45, mode: "series" },
|
|
883
|
+
nameProper
|
|
884
|
+
);
|
|
885
|
+
/**
 * Flag values that match `_INITIAL_PATTERN`.
 *
 * Despite the name, no value is expanded here: the values are passed through
 * (with `undefined` normalised to `null`) and the indices of matching strings
 * are collected alongside.
 *
 * @param {Array} values - Column values.
 * @returns {[Array, number[]]} Tuple of the passed-through values and the
 *   indices whose string value matched the pattern.
 */
function initialExpand(values) {
  const flagged = [];
  const passthrough = values.map((value, index) => {
    const looksLikeInitial = typeof value === "string" && _INITIAL_PATTERN.test(value);
    if (looksLikeInitial) {
      flagged.push(index);
    }
    return value === undefined ? null : value;
  });
  return [passthrough, flagged];
}
|
|
895
|
+
registerTransform(
|
|
896
|
+
{ name: "initial_expand", inputTypes: ["name"], priority: 40, mode: "series" },
|
|
897
|
+
initialExpand
|
|
898
|
+
);
|
|
899
|
+
/**
 * Replace a known nickname with its formal name from `_NICKNAMES`.
 *
 * The lookup key is the trimmed, lower-cased value; values with no entry are
 * returned unchanged.
 *
 * @param {Array} values - Column values.
 * @returns {Array} Values as returned by `mapStrings2`.
 */
function nicknameStandardize(values) {
  return mapStrings2(values, (s) => {
    const lookup = s.trim().toLowerCase();
    // `_NICKNAMES` is a plain object, so a bare index would also find inherited
    // members such as "constructor" or "__proto__" and return a non-string.
    if (!Object.prototype.hasOwnProperty.call(_NICKNAMES, lookup)) {
      return s;
    }
    return _NICKNAMES[lookup] ?? s;
  });
}
|
|
905
|
+
registerTransform(
|
|
906
|
+
{ name: "nickname_standardize", inputTypes: ["name"], priority: 42, mode: "series" },
|
|
907
|
+
nicknameStandardize
|
|
908
|
+
);
|
|
909
|
+
/**
 * Join a first-name column and a last-name column into `full_name`.
 *
 * If the first row lacks the last-name key, rows are shallow-copied and
 * returned without a `full_name` field. Otherwise each part is stringified and
 * trimmed, empty parts are dropped, and the rest are joined with one space;
 * `full_name` is `null` when no part remains.
 *
 * @param {Array<Object>} rows - Row objects; they are copied, not mutated.
 * @param {string} column - Key of the first-name cell.
 * @param {string} [lastNameCol="last_name"] - Key of the last-name cell; any
 *   non-string argument falls back to "last_name".
 * @returns {Array<Object>} New row objects.
 */
function mergeName(rows, column, lastNameCol = "last_name") {
  const surnameKey = typeof lastNameCol === "string" ? lastNameCol : "last_name";
  const surnamePresent = rows.length === 0 || surnameKey in rows[0];
  // null/undefined contribute nothing; anything else is stringified and trimmed.
  const clean = (cell) => (cell === null || cell === undefined ? "" : String(cell).trim());
  return rows.map((row) => {
    if (!surnamePresent) {
      return { ...row };
    }
    const pieces = [clean(row[column]), clean(row[surnameKey])].filter((piece) => piece !== "");
    return { ...row, full_name: pieces.length > 0 ? pieces.join(" ") : null };
  });
}
|
|
929
|
+
registerTransform(
|
|
930
|
+
{ name: "merge_name", inputTypes: ["name"], priority: 45, mode: "dataframe" },
|
|
931
|
+
mergeName
|
|
932
|
+
);
|
|
933
|
+
|
|
934
|
+
// src/core/transforms/address.ts
|
|
935
|
+
init_registry();
|
|
936
|
+
/**
 * Apply `fn` to every string in `values`; all other values pass through.
 *
 * @param {Array} values - Column values.
 * @param {(s: string) => *} fn - Called with each string value.
 * @returns {Array} New array of the same length.
 */
function mapStrings3(values, fn) {
  const isText = (candidate) => typeof candidate === "string";
  return values.map((v) => (isText(v) ? fn(v) : v));
}
|
|
942
|
+
var _STREET_ABBREV = {
|
|
943
|
+
Street: "St",
|
|
944
|
+
Avenue: "Ave",
|
|
945
|
+
Boulevard: "Blvd",
|
|
946
|
+
Drive: "Dr",
|
|
947
|
+
Lane: "Ln",
|
|
948
|
+
Road: "Rd",
|
|
949
|
+
Court: "Ct",
|
|
950
|
+
Place: "Pl",
|
|
951
|
+
Circle: "Cir",
|
|
952
|
+
Trail: "Trl",
|
|
953
|
+
Way: "Way",
|
|
954
|
+
Parkway: "Pkwy",
|
|
955
|
+
Highway: "Hwy",
|
|
956
|
+
Terrace: "Ter",
|
|
957
|
+
Square: "Sq"
|
|
958
|
+
};
|
|
959
|
+
var _STREET_EXPAND = {};
|
|
960
|
+
for (const [full, abbr] of Object.entries(_STREET_ABBREV)) {
|
|
961
|
+
_STREET_EXPAND[abbr] = full;
|
|
962
|
+
}
|
|
963
|
+
var _STATES = {
|
|
964
|
+
Alabama: "AL",
|
|
965
|
+
Alaska: "AK",
|
|
966
|
+
Arizona: "AZ",
|
|
967
|
+
Arkansas: "AR",
|
|
968
|
+
California: "CA",
|
|
969
|
+
Colorado: "CO",
|
|
970
|
+
Connecticut: "CT",
|
|
971
|
+
Delaware: "DE",
|
|
972
|
+
Florida: "FL",
|
|
973
|
+
Georgia: "GA",
|
|
974
|
+
Hawaii: "HI",
|
|
975
|
+
Idaho: "ID",
|
|
976
|
+
Illinois: "IL",
|
|
977
|
+
Indiana: "IN",
|
|
978
|
+
Iowa: "IA",
|
|
979
|
+
Kansas: "KS",
|
|
980
|
+
Kentucky: "KY",
|
|
981
|
+
Louisiana: "LA",
|
|
982
|
+
Maine: "ME",
|
|
983
|
+
Maryland: "MD",
|
|
984
|
+
Massachusetts: "MA",
|
|
985
|
+
Michigan: "MI",
|
|
986
|
+
Minnesota: "MN",
|
|
987
|
+
Mississippi: "MS",
|
|
988
|
+
Missouri: "MO",
|
|
989
|
+
Montana: "MT",
|
|
990
|
+
Nebraska: "NE",
|
|
991
|
+
Nevada: "NV",
|
|
992
|
+
"New Hampshire": "NH",
|
|
993
|
+
"New Jersey": "NJ",
|
|
994
|
+
"New Mexico": "NM",
|
|
995
|
+
"New York": "NY",
|
|
996
|
+
"North Carolina": "NC",
|
|
997
|
+
"North Dakota": "ND",
|
|
998
|
+
Ohio: "OH",
|
|
999
|
+
Oklahoma: "OK",
|
|
1000
|
+
Oregon: "OR",
|
|
1001
|
+
Pennsylvania: "PA",
|
|
1002
|
+
"Rhode Island": "RI",
|
|
1003
|
+
"South Carolina": "SC",
|
|
1004
|
+
"South Dakota": "SD",
|
|
1005
|
+
Tennessee: "TN",
|
|
1006
|
+
Texas: "TX",
|
|
1007
|
+
Utah: "UT",
|
|
1008
|
+
Vermont: "VT",
|
|
1009
|
+
Virginia: "VA",
|
|
1010
|
+
Washington: "WA",
|
|
1011
|
+
"West Virginia": "WV",
|
|
1012
|
+
Wisconsin: "WI",
|
|
1013
|
+
Wyoming: "WY",
|
|
1014
|
+
"District Of Columbia": "DC"
|
|
1015
|
+
};
|
|
1016
|
+
var _STATES_REVERSE = {};
|
|
1017
|
+
for (const [name, abbr] of Object.entries(_STATES)) {
|
|
1018
|
+
_STATES_REVERSE[abbr] = name;
|
|
1019
|
+
}
|
|
1020
|
+
var _STATES_LOWER = {};
|
|
1021
|
+
for (const [name, abbr] of Object.entries(_STATES)) {
|
|
1022
|
+
_STATES_LOWER[name.toLowerCase()] = abbr;
|
|
1023
|
+
}
|
|
1024
|
+
var _COUNTRIES = {
|
|
1025
|
+
"united states": "US",
|
|
1026
|
+
"united states of america": "US",
|
|
1027
|
+
usa: "US",
|
|
1028
|
+
us: "US",
|
|
1029
|
+
"u.s.a.": "US",
|
|
1030
|
+
"u.s.": "US",
|
|
1031
|
+
america: "US",
|
|
1032
|
+
"united kingdom": "GB",
|
|
1033
|
+
uk: "GB",
|
|
1034
|
+
"great britain": "GB",
|
|
1035
|
+
england: "GB",
|
|
1036
|
+
scotland: "GB",
|
|
1037
|
+
wales: "GB",
|
|
1038
|
+
"northern ireland": "GB",
|
|
1039
|
+
canada: "CA",
|
|
1040
|
+
ca: "CA",
|
|
1041
|
+
australia: "AU",
|
|
1042
|
+
au: "AU",
|
|
1043
|
+
germany: "DE",
|
|
1044
|
+
deutschland: "DE",
|
|
1045
|
+
de: "DE",
|
|
1046
|
+
france: "FR",
|
|
1047
|
+
fr: "FR",
|
|
1048
|
+
italy: "IT",
|
|
1049
|
+
italia: "IT",
|
|
1050
|
+
it: "IT",
|
|
1051
|
+
spain: "ES",
|
|
1052
|
+
espana: "ES",
|
|
1053
|
+
es: "ES",
|
|
1054
|
+
mexico: "MX",
|
|
1055
|
+
mx: "MX",
|
|
1056
|
+
brazil: "BR",
|
|
1057
|
+
brasil: "BR",
|
|
1058
|
+
br: "BR",
|
|
1059
|
+
japan: "JP",
|
|
1060
|
+
jp: "JP",
|
|
1061
|
+
china: "CN",
|
|
1062
|
+
cn: "CN",
|
|
1063
|
+
india: "IN",
|
|
1064
|
+
in: "IN",
|
|
1065
|
+
"south korea": "KR",
|
|
1066
|
+
korea: "KR",
|
|
1067
|
+
kr: "KR",
|
|
1068
|
+
netherlands: "NL",
|
|
1069
|
+
holland: "NL",
|
|
1070
|
+
nl: "NL",
|
|
1071
|
+
sweden: "SE",
|
|
1072
|
+
se: "SE",
|
|
1073
|
+
norway: "NO",
|
|
1074
|
+
no: "NO",
|
|
1075
|
+
denmark: "DK",
|
|
1076
|
+
dk: "DK",
|
|
1077
|
+
switzerland: "CH",
|
|
1078
|
+
ch: "CH",
|
|
1079
|
+
ireland: "IE",
|
|
1080
|
+
ie: "IE",
|
|
1081
|
+
"new zealand": "NZ",
|
|
1082
|
+
nz: "NZ",
|
|
1083
|
+
singapore: "SG",
|
|
1084
|
+
sg: "SG",
|
|
1085
|
+
portugal: "PT",
|
|
1086
|
+
pt: "PT",
|
|
1087
|
+
argentina: "AR",
|
|
1088
|
+
ar: "AR",
|
|
1089
|
+
colombia: "CO",
|
|
1090
|
+
co: "CO",
|
|
1091
|
+
philippines: "PH",
|
|
1092
|
+
ph: "PH",
|
|
1093
|
+
poland: "PL",
|
|
1094
|
+
pl: "PL",
|
|
1095
|
+
belgium: "BE",
|
|
1096
|
+
be: "BE",
|
|
1097
|
+
austria: "AT",
|
|
1098
|
+
at: "AT"
|
|
1099
|
+
};
|
|
1100
|
+
var _UNIT_PATTERNS = [
|
|
1101
|
+
[/^(?:Apt|Apartment)\.?\s+/i, "Unit "],
|
|
1102
|
+
[/^(?:Ste|Suite)\.?\s+/i, "Ste "],
|
|
1103
|
+
[/^#\s*/i, "Unit "]
|
|
1104
|
+
];
|
|
1105
|
+
var _ABBREV_PATTERNS = Object.entries(_STREET_ABBREV).map(
|
|
1106
|
+
([full, abbr]) => [new RegExp(`\\b${full}\\b`, "gi"), abbr]
|
|
1107
|
+
);
|
|
1108
|
+
var _EXPAND_PATTERNS = Object.entries(_STREET_EXPAND).map(
|
|
1109
|
+
([abbr, full]) => [new RegExp(`\\b${abbr}\\b`, "gi"), full]
|
|
1110
|
+
);
|
|
1111
|
+
/**
 * Apply every `[pattern, abbreviation]` pair in `_ABBREV_PATTERNS`, in order,
 * to each string value.
 *
 * @param {Array} values - Column values.
 * @returns {Array} Values as returned by `mapStrings3`.
 */
function addressStandardize(values) {
  const abbreviate = (text) =>
    _ABBREV_PATTERNS.reduce((acc, [pattern, abbr]) => acc.replace(pattern, abbr), text);
  return mapStrings3(values, (s) => abbreviate(s));
}
|
|
1120
|
+
registerTransform(
|
|
1121
|
+
{ name: "address_standardize", inputTypes: ["address"], priority: 50, mode: "series" },
|
|
1122
|
+
addressStandardize
|
|
1123
|
+
);
|
|
1124
|
+
/**
 * Apply every `[pattern, fullForm]` pair in `_EXPAND_PATTERNS`, in order, to
 * each string value.
 *
 * @param {Array} values - Column values.
 * @returns {Array} Values as returned by `mapStrings3`.
 */
function addressExpand(values) {
  const expand = (text) =>
    _EXPAND_PATTERNS.reduce((acc, [pattern, full]) => acc.replace(pattern, full), text);
  return mapStrings3(values, (s) => expand(s));
}
|
|
1133
|
+
registerTransform(
|
|
1134
|
+
{ name: "address_expand", inputTypes: ["address"], priority: 50, mode: "series" },
|
|
1135
|
+
addressExpand
|
|
1136
|
+
);
|
|
1137
|
+
/**
 * Convert a state name to its two-letter code.
 *
 * A two-character value that is already a known code is returned upper-cased.
 * Otherwise the trimmed, lower-cased value is looked up in `_STATES_LOWER`;
 * unknown values are returned unchanged.
 *
 * @param {Array} values - Column values.
 * @returns {Array} Values as returned by `mapStrings3`.
 */
function stateAbbreviate(values) {
  return mapStrings3(values, (s) => {
    const trimmed = s.trim();
    if (trimmed.length === 2 && _STATES_REVERSE[trimmed.toUpperCase()]) {
      return trimmed.toUpperCase();
    }
    const key = trimmed.toLowerCase();
    // Own-property check: a bare index on a plain object would resolve
    // "constructor" / "__proto__" to inherited members instead of a code.
    if (!Object.prototype.hasOwnProperty.call(_STATES_LOWER, key)) {
      return s;
    }
    return _STATES_LOWER[key] ?? s;
  });
}
|
|
1147
|
+
registerTransform(
|
|
1148
|
+
{ name: "state_abbreviate", inputTypes: ["state", "string"], priority: 50, mode: "series" },
|
|
1149
|
+
stateAbbreviate
|
|
1150
|
+
);
|
|
1151
|
+
/**
 * Convert a two-letter state code to its full name via `_STATES_REVERSE`.
 * The lookup key is the trimmed, upper-cased value; unknown values are
 * returned unchanged.
 *
 * @param {Array} values - Column values.
 * @returns {Array} Values as returned by `mapStrings3`.
 */
function stateExpand(values) {
  const expandOne = (s) => {
    const code = s.trim().toUpperCase();
    const fullName = _STATES_REVERSE[code];
    return fullName ?? s;
  };
  return mapStrings3(values, expandOne);
}
|
|
1156
|
+
registerTransform(
|
|
1157
|
+
{ name: "state_expand", inputTypes: ["state", "string"], priority: 50, mode: "series" },
|
|
1158
|
+
stateExpand
|
|
1159
|
+
);
|
|
1160
|
+
/**
 * Normalise US ZIP codes to five digits.
 *
 * The part before the first hyphen is taken as the base code. If it is all
 * digits it is left-padded with zeros to five characters and any suffix is
 * dropped. Values whose base is not numeric are returned trimmed but otherwise
 * intact.
 *
 * @param {Array} values - Column values.
 * @returns {Array} Values as returned by `mapStrings3`.
 */
function zipNormalize(values) {
  return mapStrings3(values, (s) => {
    const trimmed = s.trim();
    // Inner trim tolerates "12345 - 6789".
    const base = trimmed.split("-")[0].trim();
    if (/^\d+$/.test(base)) {
      return base.padStart(5, "0");
    }
    // Not a numeric ZIP (e.g. "K1A-0B1"): keep the whole value instead of
    // silently truncating it at the hyphen.
    return trimmed;
  });
}
|
|
1170
|
+
registerTransform(
|
|
1171
|
+
{ name: "zip_normalize", inputTypes: ["zip"], autoApply: true, priority: 55, mode: "series" },
|
|
1172
|
+
zipNormalize
|
|
1173
|
+
);
|
|
1174
|
+
// "<street>, <city>, <2-letter state> <5-digit zip, optional -4 suffix>"
var _ADDRESS_PATTERN = /^(.+?),\s*(.+?),\s*([A-Za-z]{2})\s+(\d{5}(?:-\d{4})?)$/;

/**
 * Split a one-line address into `street`, `city`, `state` and `zip` fields.
 *
 * Non-string cells yield `null` for all four fields. A string that does not
 * match `_ADDRESS_PATTERN` is placed, untrimmed, in `street` with the other
 * three fields set to `null`.
 *
 * @param {Array<Object>} rows - Row objects; they are copied, not mutated.
 * @param {string} column - Key of the cell holding the address.
 * @returns {Array<Object>} New rows with the four address fields set.
 */
function splitAddress(rows, column) {
  const parse = (raw) => {
    if (typeof raw !== "string") {
      return [null, null, null, null];
    }
    const hit = _ADDRESS_PATTERN.exec(raw.trim());
    return hit === null ? [raw, null, null, null] : hit.slice(1, 5);
  };
  return rows.map((row) => {
    const [street, city, state, zip] = parse(row[column]);
    return { ...row, street, city, state, zip };
  });
}
|
|
1188
|
+
registerTransform(
|
|
1189
|
+
{ name: "split_address", inputTypes: ["address"], priority: 45, mode: "dataframe" },
|
|
1190
|
+
splitAddress
|
|
1191
|
+
);
|
|
1192
|
+
/**
 * Map a country name or alias to the code stored in `_COUNTRIES`.
 * The lookup key is the trimmed, lower-cased value; unknown values are
 * returned unchanged.
 *
 * @param {Array} values - Column values.
 * @returns {Array} Values as returned by `mapStrings3`.
 */
function countryStandardize(values) {
  return mapStrings3(values, (s) => {
    const lookup = s.trim().toLowerCase();
    // Own-property check: a bare index on a plain object would resolve
    // "constructor" / "__proto__" to inherited members instead of a code.
    if (!Object.prototype.hasOwnProperty.call(_COUNTRIES, lookup)) {
      return s;
    }
    return _COUNTRIES[lookup] ?? s;
  });
}
|
|
1198
|
+
registerTransform(
|
|
1199
|
+
{ name: "country_standardize", inputTypes: ["country", "string"], priority: 50, mode: "series" },
|
|
1200
|
+
countryStandardize
|
|
1201
|
+
);
|
|
1202
|
+
/**
 * Trim each string value and apply every `[pattern, replacement]` pair in
 * `_UNIT_PATTERNS`, in order.
 *
 * @param {Array} values - Column values.
 * @returns {Array} Values as returned by `mapStrings3`.
 */
function unitNormalize(values) {
  const rewrite = (text) =>
    _UNIT_PATTERNS.reduce(
      (acc, [pattern, replacement]) => acc.replace(pattern, replacement),
      text.trim()
    );
  return mapStrings3(values, (s) => rewrite(s));
}
|
|
1211
|
+
registerTransform(
|
|
1212
|
+
{ name: "unit_normalize", inputTypes: ["address", "string"], priority: 45, mode: "series" },
|
|
1213
|
+
unitNormalize
|
|
1214
|
+
);
|
|
1215
|
+
|
|
1216
|
+
// src/core/transforms/dates.ts
|
|
1217
|
+
init_registry();
|
|
1218
|
+
/**
 * Parse a date string with the built-in `Date` constructor.
 *
 * NOTE(review): callers in this file read the result through `getUTC*`
 * accessors. `Date` parses date-only ISO strings as UTC but most other
 * formats as local time, so non-ISO input may shift by a day depending on the
 * host time zone — confirm whether that is acceptable.
 *
 * @param {string} val - Text to parse.
 * @returns {Date|null} The parsed date, or `null` for blank or unparseable text.
 */
function _parseDate(val) {
  const text = val.trim();
  if (text === "") {
    return null;
  }
  const parsed = new Date(text);
  return Number.isNaN(parsed.getTime()) ? null : parsed;
}
|
|
1225
|
+
/**
 * Render a number as text, prefixing a single "0" when it is below 10.
 *
 * @param {number} n - Number to format.
 * @returns {string} The formatted number.
 */
function pad(n) {
  if (n < 10) {
    return `0${n}`;
  }
  return String(n);
}
|
|
1228
|
+
var DAY_NAMES = [
|
|
1229
|
+
"Sunday",
|
|
1230
|
+
"Monday",
|
|
1231
|
+
"Tuesday",
|
|
1232
|
+
"Wednesday",
|
|
1233
|
+
"Thursday",
|
|
1234
|
+
"Friday",
|
|
1235
|
+
"Saturday"
|
|
1236
|
+
];
|
|
1237
|
+
/**
 * Format each parseable value as `YYYY-MM-DD` using its UTC components.
 * `null` stays `null`; values `_parseDate` rejects are returned unchanged.
 *
 * @param {Array} values - Column values; non-null values are stringified
 *   before parsing.
 * @returns {Array} New array of formatted or passed-through values.
 */
function dateIso8601(values) {
  const toIsoDate = (v) => {
    if (v === null) return null;
    const parsed = _parseDate(String(v));
    if (!parsed) return v;
    const parts = [
      parsed.getUTCFullYear(),
      pad(parsed.getUTCMonth() + 1),
      pad(parsed.getUTCDate())
    ];
    return parts.join("-");
  };
  return values.map((v) => toIsoDate(v));
}
|
|
1246
|
+
registerTransform(
|
|
1247
|
+
{ name: "date_iso8601", inputTypes: ["date"], autoApply: true, priority: 50, mode: "series" },
|
|
1248
|
+
dateIso8601
|
|
1249
|
+
);
|
|
1250
|
+
/**
 * Format each parseable value as `MM/DD/YYYY` using its UTC components.
 * `null` stays `null`; values `_parseDate` rejects are returned unchanged.
 *
 * @param {Array} values - Column values; non-null values are stringified
 *   before parsing.
 * @returns {Array} New array of formatted or passed-through values.
 */
function dateUs(values) {
  const toUsDate = (v) => {
    if (v === null) return null;
    const parsed = _parseDate(String(v));
    if (!parsed) return v;
    const parts = [
      pad(parsed.getUTCMonth() + 1),
      pad(parsed.getUTCDate()),
      parsed.getUTCFullYear()
    ];
    return parts.join("/");
  };
  return values.map((v) => toUsDate(v));
}
|
|
1259
|
+
registerTransform(
|
|
1260
|
+
{ name: "date_us", inputTypes: ["date"], priority: 50, mode: "series" },
|
|
1261
|
+
dateUs
|
|
1262
|
+
);
|
|
1263
|
+
/**
 * Format each parseable value as `DD/MM/YYYY` using its UTC components.
 * `null` stays `null`; values `_parseDate` rejects are returned unchanged.
 *
 * @param {Array} values - Column values; non-null values are stringified
 *   before parsing.
 * @returns {Array} New array of formatted or passed-through values.
 */
function dateEu(values) {
  const toEuDate = (v) => {
    if (v === null) return null;
    const parsed = _parseDate(String(v));
    if (!parsed) return v;
    const parts = [
      pad(parsed.getUTCDate()),
      pad(parsed.getUTCMonth() + 1),
      parsed.getUTCFullYear()
    ];
    return parts.join("/");
  };
  return values.map((v) => toEuDate(v));
}
|
|
1272
|
+
registerTransform(
|
|
1273
|
+
{ name: "date_eu", inputTypes: ["date"], priority: 50, mode: "series" },
|
|
1274
|
+
dateEu
|
|
1275
|
+
);
|
|
1276
|
+
registerTransform(
|
|
1277
|
+
{ name: "date_parse", inputTypes: ["date"], priority: 55, mode: "series" },
|
|
1278
|
+
dateIso8601
|
|
1279
|
+
);
|
|
1280
|
+
/**
 * Compute age in whole years for each date of birth, from UTC components.
 *
 * @param {Array} values - Dates of birth; non-null values are stringified
 *   before parsing. `null` stays `null`; unparseable values pass through.
 * @param {*} [referenceDate=null] - Date to measure against; when falsy the
 *   current time is used. If it cannot be parsed, a copy of `values` is
 *   returned unchanged.
 * @returns {Array} Ages, or passed-through values.
 */
function ageFromDob(values, referenceDate = null) {
  const ref = referenceDate ? _parseDate(String(referenceDate)) : /* @__PURE__ */ new Date();
  if (!ref) return values.slice();
  const refYear = ref.getUTCFullYear();
  const refMonth = ref.getUTCMonth();
  const refDay = ref.getUTCDate();
  return values.map((v) => {
    if (v === null) return null;
    const dob = _parseDate(String(v));
    if (!dob) return v;
    const dobMonth = dob.getUTCMonth();
    // True when this year's birthday has not yet occurred as of the reference date.
    const birthdayPending =
      refMonth < dobMonth || (refMonth === dobMonth && refDay < dob.getUTCDate());
    const yearSpan = refYear - dob.getUTCFullYear();
    return birthdayPending ? yearSpan - 1 : yearSpan;
  });
}
|
|
1295
|
+
registerTransform(
|
|
1296
|
+
{ name: "age_from_dob", inputTypes: ["date"], priority: 40, mode: "series" },
|
|
1297
|
+
ageFromDob
|
|
1298
|
+
);
|
|
1299
|
+
/**
 * Format each parseable value as `YYYY-MM-DDTHH:MM:SS` from its UTC
 * components (no zone suffix is emitted).
 * `null` stays `null`; values `_parseDate` rejects are returned unchanged.
 *
 * @param {Array} values - Column values; non-null values are stringified
 *   before parsing.
 * @returns {Array} New array of formatted or passed-through values.
 */
function datetimeIso8601(values) {
  const toIsoDateTime = (v) => {
    if (v === null) return null;
    const parsed = _parseDate(String(v));
    if (!parsed) return v;
    const datePart = [
      parsed.getUTCFullYear(),
      pad(parsed.getUTCMonth() + 1),
      pad(parsed.getUTCDate())
    ].join("-");
    const timePart = [
      pad(parsed.getUTCHours()),
      pad(parsed.getUTCMinutes()),
      pad(parsed.getUTCSeconds())
    ].join(":");
    return `${datePart}T${timePart}`;
  };
  return values.map((v) => toIsoDateTime(v));
}
|
|
1307
|
+
registerTransform(
|
|
1308
|
+
{ name: "datetime_iso8601", inputTypes: ["date"], priority: 50, mode: "series" },
|
|
1309
|
+
datetimeIso8601
|
|
1310
|
+
);
|
|
1311
|
+
/**
 * Extract the UTC year from each parseable value.
 * `null` stays `null`; unparseable values are returned unchanged.
 *
 * @param {Array} values - Column values; stringified before parsing.
 * @returns {Array} Years, or passed-through values.
 */
function extractYear(values) {
  return values.map((v) => {
    if (v === null) return null;
    const parsed = _parseDate(String(v));
    if (!parsed) return v;
    return parsed.getUTCFullYear();
  });
}

/**
 * Extract the UTC month (1-12) from each parseable value.
 * `null` stays `null`; unparseable values are returned unchanged.
 *
 * @param {Array} values - Column values; stringified before parsing.
 * @returns {Array} Month numbers, or passed-through values.
 */
function extractMonth(values) {
  return values.map((v) => {
    if (v === null) return null;
    const parsed = _parseDate(String(v));
    if (!parsed) return v;
    return parsed.getUTCMonth() + 1;
  });
}

/**
 * Extract the UTC day of the month from each parseable value.
 * `null` stays `null`; unparseable values are returned unchanged.
 *
 * @param {Array} values - Column values; stringified before parsing.
 * @returns {Array} Day numbers, or passed-through values.
 */
function extractDay(values) {
  return values.map((v) => {
    if (v === null) return null;
    const parsed = _parseDate(String(v));
    if (!parsed) return v;
    return parsed.getUTCDate();
  });
}

/**
 * Extract the calendar quarter (1-4) from each parseable value's UTC month.
 * `null` stays `null`; unparseable values are returned unchanged.
 *
 * @param {Array} values - Column values; stringified before parsing.
 * @returns {Array} Quarter numbers, or passed-through values.
 */
function extractQuarter(values) {
  return values.map((v) => {
    if (v === null) return null;
    const parsed = _parseDate(String(v));
    // Zero-based months 0-2 map to quarter 1, 3-5 to quarter 2, and so on.
    return parsed ? Math.floor(parsed.getUTCMonth() / 3) + 1 : v;
  });
}

/**
 * Extract the weekday name (via `DAY_NAMES`, indexed by UTC day) from each
 * parseable value.
 * `null` stays `null`; unparseable values are returned unchanged.
 *
 * @param {Array} values - Column values; stringified before parsing.
 * @returns {Array} Weekday names, or passed-through values.
 */
function extractDayOfWeek(values) {
  return values.map((v) => {
    if (v === null) return null;
    const parsed = _parseDate(String(v));
    if (!parsed) return v;
    return DAY_NAMES[parsed.getUTCDay()];
  });
}
|
|
1347
|
+
registerTransform({ name: "extract_year", inputTypes: ["date"], priority: 35, mode: "series" }, extractYear);
|
|
1348
|
+
registerTransform({ name: "extract_month", inputTypes: ["date"], priority: 35, mode: "series" }, extractMonth);
|
|
1349
|
+
registerTransform({ name: "extract_day", inputTypes: ["date"], priority: 35, mode: "series" }, extractDay);
|
|
1350
|
+
registerTransform({ name: "extract_quarter", inputTypes: ["date"], priority: 35, mode: "series" }, extractQuarter);
|
|
1351
|
+
registerTransform({ name: "extract_day_of_week", inputTypes: ["date"], priority: 35, mode: "series" }, extractDayOfWeek);
|
|
1352
|
+
/**
 * Shift each parseable date by a number of days and format it as
 * `YYYY-MM-DD` from the UTC components of the shifted instant.
 *
 * @param {Array} values - Column values; non-null values are stringified
 *   before parsing. `null` stays `null`; unparseable values pass through.
 * @param {number|string} [days=0] - Days to add (negative to subtract).
 *   Anything that does not coerce to a finite number is treated as 0.
 * @returns {Array} Shifted dates, or passed-through values.
 */
function dateShift(values, days = 0) {
  const requested = typeof days === "number" ? days : Number(days);
  // A numeric NaN/Infinity used to slip past the `|| 0` fallback and turn
  // every date into "NaN-NaN-NaN"; treat it like any other unusable shift.
  const shift = Number.isFinite(requested) ? requested : 0;
  const shiftMs = shift * 864e5;
  return values.map((v) => {
    if (v === null) return null;
    const d = _parseDate(String(v));
    if (!d) return v;
    const shifted = new Date(d.getTime() + shiftMs);
    return `${shifted.getUTCFullYear()}-${pad(shifted.getUTCMonth() + 1)}-${pad(shifted.getUTCDate())}`;
  });
}
|
|
1363
|
+
registerTransform(
|
|
1364
|
+
{ name: "date_shift", inputTypes: ["date"], priority: 30, mode: "series" },
|
|
1365
|
+
dateShift
|
|
1366
|
+
);
|
|
1367
|
+
/**
 * Report whether each value parses as a date.
 * `null` stays `null`; blank text is `false`; otherwise the result is whether
 * `_parseDate` accepted the trimmed text.
 *
 * @param {Array} values - Column values; stringified before parsing.
 * @returns {Array<boolean|null>} Validity flags.
 */
function dateValidate(values) {
  const isValid = (text) => text !== "" && _parseDate(text) !== null;
  return values.map((v) => (v === null ? null : isValid(String(v).trim())));
}
|
|
1375
|
+
registerTransform(
|
|
1376
|
+
{ name: "date_validate", inputTypes: ["date", "string"], priority: 60, mode: "series" },
|
|
1377
|
+
dateValidate
|
|
1378
|
+
);
|
|
1379
|
+
|
|
1380
|
+
// src/core/transforms/email.ts
|
|
1381
|
+
init_registry();
|
|
1382
|
+
var EMAIL_RE = /^[^\s@]+@[^\s@]+\.[^\s@]+$/;
|
|
1383
|
+
var GMAIL_DOMAINS = /* @__PURE__ */ new Set(["gmail.com", "googlemail.com"]);
|
|
1384
|
+
/**
 * Lower-case every string value; other values pass through unchanged.
 *
 * @param {Array} values - Column values.
 * @returns {Array} New array of the same length.
 */
function emailLowercase(values) {
  const lower = (v) => (typeof v === "string" ? v.toLowerCase() : v);
  return values.map((v) => lower(v));
}
|
|
1390
|
+
registerTransform(
|
|
1391
|
+
{ name: "email_lowercase", inputTypes: ["email", "string"], priority: 55, mode: "series" },
|
|
1392
|
+
emailLowercase
|
|
1393
|
+
);
|
|
1394
|
+
/**
 * Canonicalise e-mail addresses.
 *
 * Each string is lower-cased and trimmed. If it contains "@", the local part
 * is cut at the first "+", and for domains in `GMAIL_DOMAINS` its dots are
 * removed. A string without "@" is returned lower-cased and trimmed.
 * Non-string values pass through unchanged.
 *
 * @param {Array} values - Column values.
 * @returns {Array} New array of the same length.
 */
function emailNormalize(values) {
  return values.map((v) => {
    if (typeof v !== "string") return v;
    const address = v.toLowerCase().trim();
    // The last "@" separates local part from domain.
    const at = address.lastIndexOf("@");
    if (at < 0) return address;
    const domain = address.slice(at + 1);
    const [untagged] = address.slice(0, at).split("+");
    const local = GMAIL_DOMAINS.has(domain) ? untagged.replace(/\./g, "") : untagged;
    return `${local}@${domain}`;
  });
}
|
|
1412
|
+
registerTransform(
|
|
1413
|
+
{ name: "email_normalize", inputTypes: ["email"], priority: 50, mode: "series" },
|
|
1414
|
+
emailNormalize
|
|
1415
|
+
);
|
|
1416
|
+
/**
 * Return the lower-cased text after the last "@" of each string value.
 * Strings without "@" become `null`; non-string values pass through.
 *
 * @param {Array} values - Column values.
 * @returns {Array} Domains, `null`, or passed-through values.
 */
function emailExtractDomain(values) {
  const domainOf = (address) => {
    const at = address.lastIndexOf("@");
    return at < 0 ? null : address.slice(at + 1).toLowerCase();
  };
  return values.map((v) => (typeof v === "string" ? domainOf(v) : v));
}
|
|
1424
|
+
registerTransform(
|
|
1425
|
+
{ name: "email_extract_domain", inputTypes: ["email"], priority: 40, mode: "series" },
|
|
1426
|
+
emailExtractDomain
|
|
1427
|
+
);
|
|
1428
|
+
/**
 * Test each trimmed string value against `EMAIL_RE`.
 * Non-string values (including `null`) pass through unchanged rather than
 * becoming a boolean.
 *
 * @param {Array} values - Column values.
 * @returns {Array} Booleans for strings, passed-through values otherwise.
 */
function emailValidate(values) {
  const looksValid = (text) => EMAIL_RE.test(text.trim());
  return values.map((v) => (typeof v === "string" ? looksValid(v) : v));
}
|
|
1434
|
+
registerTransform(
|
|
1435
|
+
{ name: "email_validate", inputTypes: ["email", "string"], priority: 60, mode: "series" },
|
|
1436
|
+
emailValidate
|
|
1437
|
+
);
|
|
1438
|
+
|
|
1439
|
+
// src/core/transforms/numeric.ts
|
|
1440
|
+
init_registry();
|
|
1441
|
+
/**
 * Strip everything except digits, "." and "-" from each value and convert the
 * remainder to a number.
 *
 * `null` and numbers pass through. If nothing usable remains, or the remainder
 * is not a valid number, the original value is returned unchanged.
 *
 * @param {Array} values - Column values; non-numbers are stringified.
 * @returns {Array} Numbers where conversion succeeded, originals otherwise.
 */
function currencyStrip(values) {
  return values.map((v) => {
    if (v === null || typeof v === "number") return v;
    const numericText = String(v).replace(/[^0-9.\-]/g, "");
    const nothingLeft = numericText === "" || numericText === "-";
    if (nothingLeft) return v;
    const parsed = Number(numericText);
    return Number.isNaN(parsed) ? v : parsed;
  });
}
|
|
1451
|
+
registerTransform(
|
|
1452
|
+
{ name: "currency_strip", inputTypes: ["string", "numeric"], priority: 50, mode: "series" },
|
|
1453
|
+
currencyStrip
|
|
1454
|
+
);
|
|
1455
|
+
/**
 * Convert percentages to fractions (value / 100).
 *
 * Numbers are divided by 100 directly. Other values are stringified, stripped
 * of "%" and whitespace, and divided by 100 if numeric. `null` stays `null`;
 * blank or non-numeric values are returned unchanged.
 *
 * @param {Array} values - Column values.
 * @returns {Array} Fractions where conversion succeeded, originals otherwise.
 */
function percentageNormalize(values) {
  return values.map((v) => {
    if (v === null) return null;
    if (typeof v === "number") return v / 100;
    const s = String(v).replace(/%/g, "").trim();
    // Number("") is 0, so without this guard a blank cell (or a bare "%")
    // would be reported as a genuine 0% reading.
    if (s === "") return v;
    const n = Number(s);
    return Number.isNaN(n) ? v : n / 100;
  });
}
|
|
1464
|
+
registerTransform(
|
|
1465
|
+
{ name: "percentage_normalize", inputTypes: ["string", "numeric"], priority: 50, mode: "series" },
|
|
1466
|
+
percentageNormalize
|
|
1467
|
+
);
|
|
1468
|
+
/**
 * Round numeric values to `n` decimal places.
 *
 * @param {Array} values - Column values. `null` stays `null`; blank strings
 *   and values that do not coerce to a number are returned unchanged.
 * @param {number|string} [n=2] - Number of decimal places. Numeric strings
 *   (including "0") are honoured; other unusable input falls back to 2.
 * @returns {Array} Rounded numbers, or passed-through values.
 */
function roundTransform(values, n = 2) {
  let decimals;
  if (typeof n === "number") {
    decimals = n;
  } else if (typeof n === "string" && n.trim() !== "" && Number.isFinite(Number(n))) {
    // `Number(n) || 2` alone treats the string "0" as missing and rounds to
    // two places; accept any finite numeric string as given.
    decimals = Number(n);
  } else {
    decimals = Number(n) || 2;
  }
  const factor = Math.pow(10, decimals);
  return values.map((v) => {
    if (v === null) return null;
    // Number("") is 0; do not fabricate a zero from a blank cell.
    if (typeof v === "string" && v.trim() === "") return v;
    const num = typeof v === "number" ? v : Number(v);
    if (Number.isNaN(num)) return v;
    return Math.round(num * factor) / factor;
  });
}
|
|
1478
|
+
registerTransform(
|
|
1479
|
+
{ name: "round", inputTypes: ["numeric"], priority: 40, mode: "series" },
|
|
1480
|
+
roundTransform
|
|
1481
|
+
);
|
|
1482
|
+
/**
 * Clamp numeric values into the range [`minVal`, `maxVal`].
 *
 * @param {Array} values - Column values. `null` stays `null`; blank strings
 *   and values that do not coerce to a number are returned unchanged.
 * @param {number|string} [minVal=0] - Lower bound; unusable input falls back to 0.
 * @param {number|string} [maxVal=1] - Upper bound. Numeric strings (including
 *   "0") are honoured; other unusable input falls back to 1.
 * @returns {Array} Clamped numbers, or passed-through values.
 */
function clamp(values, minVal = 0, maxVal = 1) {
  // `Number(raw) || fallback` alone treats the string "0" as missing, which
  // silently turned an upper bound of "0" into 1. Accept finite numeric
  // strings as given and keep the old fallback for everything else.
  const toBound = (raw, fallback) => {
    if (typeof raw === "number") return raw;
    if (typeof raw === "string" && raw.trim() !== "" && Number.isFinite(Number(raw))) {
      return Number(raw);
    }
    return Number(raw) || fallback;
  };
  const lo = toBound(minVal, 0);
  const hi = toBound(maxVal, 1);
  return values.map((v) => {
    if (v === null) return null;
    // Number("") is 0; do not fabricate a zero from a blank cell.
    if (typeof v === "string" && v.trim() === "") return v;
    const num = typeof v === "number" ? v : Number(v);
    if (Number.isNaN(num)) return v;
    return Math.min(hi, Math.max(lo, num));
  });
}
|
|
1492
|
+
registerTransform(
|
|
1493
|
+
{ name: "clamp", inputTypes: ["numeric"], priority: 40, mode: "series" },
|
|
1494
|
+
clamp
|
|
1495
|
+
);
|
|
1496
|
+
/**
 * Convert values to integers by truncating toward zero.
 *
 * @param {Array} values - Column values. `null` stays `null`; blank strings
 *   and values that do not coerce to a number are returned unchanged.
 * @returns {Array} Truncated numbers, or passed-through values.
 */
function toInteger(values) {
  return values.map((v) => {
    if (v === null) return null;
    // Number("") is 0; do not fabricate a zero from a blank cell.
    if (typeof v === "string" && v.trim() === "") return v;
    const num = Number(v);
    if (Number.isNaN(num)) return v;
    return Math.trunc(num);
  });
}
|
|
1504
|
+
registerTransform(
|
|
1505
|
+
{ name: "to_integer", inputTypes: ["string", "numeric"], priority: 45, mode: "series" },
|
|
1506
|
+
toInteger
|
|
1507
|
+
);
|
|
1508
|
+
/**
 * Replace each numeric value with its absolute value.
 *
 * @param {Array} values - Column values. `null` stays `null`; blank strings
 *   and values that do not coerce to a number are returned unchanged.
 * @returns {Array} Absolute values, or passed-through values.
 */
function absValue(values) {
  return values.map((v) => {
    if (v === null) return null;
    // Number("") is 0; do not fabricate a zero from a blank cell.
    if (typeof v === "string" && v.trim() === "") return v;
    const num = typeof v === "number" ? v : Number(v);
    if (Number.isNaN(num)) return v;
    return Math.abs(num);
  });
}
|
|
1516
|
+
registerTransform(
|
|
1517
|
+
{ name: "abs_value", inputTypes: ["numeric"], priority: 40, mode: "series" },
|
|
1518
|
+
absValue
|
|
1519
|
+
);
|
|
1520
|
+
/**
 * Replace `null` entries with 0; every other value (including `undefined`)
 * is left as it is.
 *
 * @param {Array} values - Column values.
 * @returns {Array} New array of the same length.
 */
function fillZero(values) {
  return values.map((v) => {
    if (v === null) {
      return 0;
    }
    return v;
  });
}
|
|
1523
|
+
registerTransform(
|
|
1524
|
+
{ name: "fill_zero", inputTypes: ["numeric"], priority: 35, mode: "series" },
|
|
1525
|
+
fillZero
|
|
1526
|
+
);
|
|
1527
|
+
/**
 * Parse numbers written with "." as thousands separator and "," as decimal
 * mark (e.g. "1.234,56").
 *
 * All dots are removed and the first comma becomes the decimal point. `null`
 * and numbers pass through; blank strings and values that still do not parse
 * are returned unchanged.
 *
 * @param {Array} values - Column values; non-numbers are stringified.
 * @returns {Array} Numbers where conversion succeeded, originals otherwise.
 */
function commaDecimal(values) {
  return values.map((v) => {
    if (v === null) return null;
    if (typeof v === "number") return v;
    const s = String(v);
    // Number("") is 0; do not fabricate a zero from a blank cell.
    if (s.trim() === "") return v;
    const converted = s.replace(/\./g, "").replace(",", ".");
    const n = Number(converted);
    return Number.isNaN(n) ? v : n;
  });
}
|
|
1537
|
+
registerTransform(
|
|
1538
|
+
{ name: "comma_decimal", inputTypes: ["string", "numeric"], priority: 48, mode: "series" },
|
|
1539
|
+
commaDecimal
|
|
1540
|
+
);
|
|
1541
|
+
/**
 * Coerce each value to a number with `Number()`, which also accepts
 * scientific notation such as "1.5e3".
 *
 * @param {Array} values - Column values. `null` stays `null`; blank strings
 *   and values that do not coerce to a number are returned unchanged.
 * @returns {Array} Numbers where conversion succeeded, originals otherwise.
 */
function scientificToDecimal(values) {
  return values.map((v) => {
    if (v === null) return null;
    // Number("") is 0; do not fabricate a zero from a blank cell.
    if (typeof v === "string" && v.trim() === "") return v;
    const n = Number(v);
    return Number.isNaN(n) ? v : n;
  });
}
|
|
1548
|
+
registerTransform(
|
|
1549
|
+
{ name: "scientific_to_decimal", inputTypes: ["string", "numeric"], priority: 45, mode: "series" },
|
|
1550
|
+
scientificToDecimal
|
|
1551
|
+
);
|
|
1552
|
+
|
|
1553
|
+
// src/core/transforms/categorical.ts
|
|
1554
|
+
init_registry();
|
|
1555
|
+
// Lower-cased spellings recognised as true / false by booleanNormalize.
var TRUTHY = /* @__PURE__ */ new Set(["yes", "y", "1", "true", "t"]);
var FALSY = /* @__PURE__ */ new Set(["no", "n", "0", "false", "f"]);

/**
 * Map common yes/no spellings to real booleans.
 *
 * Each non-null value is stringified, trimmed and lower-cased before being
 * checked against `TRUTHY` and then `FALSY`. `null` stays `null`;
 * unrecognised values are returned unchanged.
 *
 * @param {Array} values - Column values.
 * @returns {Array} Booleans where recognised, originals otherwise.
 */
function booleanNormalize(values) {
  const classify = (v) => {
    const token = String(v).trim().toLowerCase();
    if (TRUTHY.has(token)) return true;
    return FALSY.has(token) ? false : v;
  };
  return values.map((v) => (v === null ? null : classify(v)));
}
|
|
1566
|
+
registerTransform(
|
|
1567
|
+
{ name: "boolean_normalize", inputTypes: ["boolean", "string"], priority: 50, mode: "series" },
|
|
1568
|
+
booleanNormalize
|
|
1569
|
+
);
|
|
1570
|
+
/**
 * Map "male"/"m" to "M" and "female"/"f" to "F" (case-insensitive, after
 * trimming). `null`, non-strings and unrecognised strings pass through.
 *
 * @param {Array} values - Column values.
 * @returns {Array} New array of the same length.
 */
function genderStandardize(values) {
  const codeFor = (text) => {
    switch (text.trim().toLowerCase()) {
      case "male":
      case "m":
        return "M";
      case "female":
      case "f":
        return "F";
      default:
        return text;
    }
  };
  return values.map((v) => (typeof v === "string" ? codeFor(v) : v));
}
|
|
1580
|
+
registerTransform(
|
|
1581
|
+
{ name: "gender_standardize", inputTypes: ["string"], priority: 50, mode: "series" },
|
|
1582
|
+
genderStandardize
|
|
1583
|
+
);
|
|
1584
|
+
// Lower-cased spellings that nullStandardize treats as a missing value.
var NULL_VARIANTS = /* @__PURE__ */ new Set([
  "n/a",
  "null",
  "none",
  "na",
  "nil",
  "nan",
  "-",
  ""
]);

/**
 * Replace placeholder strings for "no value" with `null`.
 *
 * A string is a placeholder when its trimmed, lower-cased form is in
 * `NULL_VARIANTS`. Non-string values and other strings pass through.
 *
 * @param {Array} values - Column values.
 * @returns {Array} New array of the same length.
 */
function nullStandardize(values) {
  const isPlaceholder = (text) => NULL_VARIANTS.has(text.trim().toLowerCase());
  return values.map((v) => {
    if (typeof v === "string" && isPlaceholder(v)) {
      return null;
    }
    return v;
  });
}
|
|
1603
|
+
registerTransform(
|
|
1604
|
+
{ name: "null_standardize", inputTypes: ["string"], autoApply: true, priority: 80, mode: "series" },
|
|
1605
|
+
nullStandardize
|
|
1606
|
+
);
|
|
1607
|
+
/**
 * Collapse category variants onto canonical labels.
 *
 * @param {Array} values - Column values. Strings are matched on their trimmed,
 *   lower-cased form; `null`, non-strings and unmatched strings pass through.
 * @param {Object<string, Array>|null} [mapping=null] - Canonical label mapped
 *   to its list of variants. Each canonical label also matches itself. When
 *   absent or not an object, a copy of `values` is returned.
 * @returns {Array} New array of the same length.
 */
function categoryStandardize(values, mapping = null) {
  if (!mapping || typeof mapping !== "object") return values.slice();
  const canonicalByKey = /* @__PURE__ */ new Map();
  Object.entries(mapping).forEach(([canonical, variants]) => {
    const aliases = Array.isArray(variants) ? variants : [];
    for (const alias of aliases) {
      canonicalByKey.set(String(alias).toLowerCase(), canonical);
    }
    // Registered last so the canonical label always resolves to itself.
    canonicalByKey.set(canonical.toLowerCase(), canonical);
  });
  return values.map((v) => {
    if (typeof v !== "string") return v;
    const match = canonicalByKey.get(v.trim().toLowerCase());
    return match ?? v;
  });
}
|
|
1627
|
+
registerTransform(
|
|
1628
|
+
{ name: "category_standardize", inputTypes: ["string"], priority: 45, mode: "series" },
|
|
1629
|
+
categoryStandardize
|
|
1630
|
+
);
|
|
1631
|
+
/**
 * Placeholder for file-driven category mapping: no lookup is performed.
 * A warning is logged when a path is supplied, and a shallow copy of the
 * input is returned either way.
 *
 * @param {Array} values - Column values.
 * @param {string|null} [lookupPath=null] - Path of the lookup file (unused).
 * @returns {Array} Shallow copy of `values`.
 */
function categoryFromFile(values, lookupPath = null) {
  const copy = values.slice();
  if (!lookupPath) {
    return copy;
  }
  console.warn("[goldenflow] category_from_file is not yet implemented in the JS port \u2014 returning values unchanged");
  return copy;
}
|
|
1637
|
+
registerTransform(
|
|
1638
|
+
{ name: "category_from_file", inputTypes: ["string"], priority: 45, mode: "series" },
|
|
1639
|
+
categoryFromFile
|
|
1640
|
+
);
|
|
1641
|
+
|
|
1642
|
+
// src/core/transforms/identifiers.ts
|
|
1643
|
+
init_registry();
|
|
1644
|
+
/**
 * Apply `fn` to every string in `values`; everything else (null, numbers,
 * booleans, ...) is copied through unchanged.
 *
 * @param {Array} values - Column values.
 * @param {(s: string) => *} fn - Called with each string value.
 * @returns {Array} New array of the same length.
 */
function mapStrings4(values, fn) {
  return values.map((item) => (typeof item === "string" ? fn(item) : item));
}
|
|
1650
|
+
/**
 * Strip every non-digit character from a string.
 *
 * @param {string} val - Input text.
 * @returns {string} Only the characters 0-9 of `val`, in order.
 */
function extractDigits2(val) {
  const nonDigits = /\D/g;
  const digitsOnly = val.replace(nonDigits, "");
  return digitsOnly;
}
|
|
1653
|
+
/**
 * Format strings containing exactly nine digits as `AAA-GG-SSSS`.
 *
 * Non-digit characters are ignored when counting. A string with any other
 * digit count is returned unchanged, as are nulls and non-string values.
 *
 * @param {Array} values - Column values.
 * @returns {Array} New array of the same length.
 */
function ssnFormat(values) {
  return values.map((value) => {
    if (typeof value !== "string") return value;
    const digits = value.replace(/\D/g, "");
    if (digits.length !== 9) return value;
    const area = digits.slice(0, 3);
    const group = digits.slice(3, 5);
    const serial = digits.slice(5);
    return [area, group, serial].join("-");
  });
}
|
|
1660
|
+
// Register ssnFormat under the name "ssn_format": series-mode, accepts
// "ssn" and "string" inputs, priority 50. No autoApply flag is set here.
registerTransform(
|
|
1661
|
+
{ name: "ssn_format", inputTypes: ["ssn", "string"], priority: 50, mode: "series" },
|
|
1662
|
+
ssnFormat
|
|
1663
|
+
);
|
|
1664
|
+
/**
 * Mask strings containing exactly nine digits as `***-**-SSSS`, keeping only
 * the last four digits.
 *
 * Strings with any other digit count are returned unchanged (NOT masked), as
 * are nulls and non-string values.
 *
 * @param {Array} values - Column values.
 * @returns {Array} New array of the same length.
 */
function ssnMask(values) {
  return values.map((value) => {
    if (typeof value !== "string") return value;
    const digits = value.replace(/\D/g, "");
    if (digits.length !== 9) return value;
    const lastFour = digits.slice(5);
    return "***-**-" + lastFour;
  });
}
|
|
1671
|
+
// Register ssnMask under the name "ssn_mask": series-mode, accepts "ssn"
// and "string" inputs, priority 50. No autoApply flag is set here.
registerTransform(
|
|
1672
|
+
{ name: "ssn_mask", inputTypes: ["ssn", "string"], priority: 50, mode: "series" },
|
|
1673
|
+
ssnMask
|
|
1674
|
+
);
|
|
1675
|
+
/**
 * Format strings containing exactly nine digits as `NN-NNNNNNN`.
 *
 * Non-digit characters are ignored when counting. Strings with any other
 * digit count, nulls and non-string values are returned unchanged.
 *
 * @param {Array} values - Column values.
 * @returns {Array} New array of the same length.
 */
function einFormat(values) {
  return values.map((value) => {
    if (typeof value !== "string") return value;
    const digits = value.replace(/\D/g, "");
    if (digits.length !== 9) return value;
    const prefix = digits.slice(0, 2);
    const remainder = digits.slice(2);
    return [prefix, remainder].join("-");
  });
}
|
|
1682
|
+
// Register einFormat under the name "ein_format": series-mode, accepts
// "ein" and "string" inputs, priority 50. No autoApply flag is set here.
registerTransform(
|
|
1683
|
+
{ name: "ein_format", inputTypes: ["ein", "string"], priority: 50, mode: "series" },
|
|
1684
|
+
einFormat
|
|
1685
|
+
);
|
|
1686
|
+
|
|
1687
|
+
// src/core/transforms/url.ts
|
|
1688
|
+
init_registry();
|
|
1689
|
+
/**
 * Run `fn` over the string entries of `values`, leaving null and every
 * non-string entry exactly as it was.
 *
 * @param {Array} values - Column values.
 * @param {(s: string) => *} fn - Called with each string value.
 * @returns {Array} New array of the same length.
 */
function mapStrings5(values, fn) {
  function convert(entry) {
    if (typeof entry === "string") {
      return fn(entry);
    }
    return entry;
  }
  return values.map((entry) => convert(entry));
}
|
|
1695
|
+
// Matches any leading `scheme://` (letter first, then letters, digits, "+",
// "." or "-"). It used to match http/https only, which caused other schemes
// to be prefixed a second time ("https://ftp://host").
var _SCHEME_RE = /^[a-z][a-z0-9+.-]*:\/\//i;
/**
 * Normalise URL strings.
 *
 * For each string value:
 *  - surrounding whitespace is trimmed; an empty result becomes `null`;
 *  - `https://` is prepended only when the value carries no scheme at all;
 *  - the scheme and the host part are lower-cased;
 *  - path, query and fragment keep their original case (the host ends at the
 *    first "/", "?" or "#", so a query with no path is no longer lower-cased);
 *  - trailing slashes are stripped from the part after the host only, so the
 *    scheme separator itself can never be eaten.
 *
 * Nulls and non-string values pass through unchanged.
 *
 * @param {Array} values - Column values.
 * @returns {Array} New array of the same length.
 */
function urlNormalize(values) {
  return values.map((v) => {
    if (typeof v !== "string") return v;
    let val = v.trim();
    if (!val) return null;
    if (!_SCHEME_RE.test(val)) {
      val = `https://${val}`;
    }
    // The first "://" is always the scheme separator at this point.
    const schemeEnd = val.indexOf("://") + 3;
    const scheme = val.slice(0, schemeEnd).toLowerCase();
    const rest = val.slice(schemeEnd);
    const hostEnd = rest.search(/[/?#]/);
    const host = (hostEnd === -1 ? rest : rest.slice(0, hostEnd)).toLowerCase();
    let tail = hostEnd === -1 ? "" : rest.slice(hostEnd);
    if (tail.endsWith("/")) {
      tail = tail.replace(/\/+$/, "");
    }
    return scheme + host + tail;
  });
}
|
|
1725
|
+
// Register urlNormalize under the name "url_normalize": series-mode, accepts
// "url" and "string" inputs, priority 50. No autoApply flag is set here.
registerTransform(
|
|
1726
|
+
{ name: "url_normalize", inputTypes: ["url", "string"], priority: 50, mode: "series" },
|
|
1727
|
+
urlNormalize
|
|
1728
|
+
);
|
|
1729
|
+
/**
 * Extract the lower-cased host part from URL strings.
 *
 * The scheme (anything up to the first "://") is dropped and the host is cut
 * at the first "/", "?" or "#". Previously only "/" ended the host, so
 * "example.com?x=1" came back with its query string attached. Any port or
 * user-info that is part of the authority is kept as-is.
 *
 * Blank strings (and values with an empty host) become `null`; nulls and
 * non-string values pass through unchanged.
 *
 * @param {Array} values - Column values.
 * @returns {Array} New array of the same length.
 */
function urlExtractDomain(values) {
  return values.map((v) => {
    if (typeof v !== "string") return v;
    let val = v.trim();
    if (!val) return null;
    if (val.includes("://")) {
      val = val.split("://", 2)[1];
    }
    const domain = val.split(/[/?#]/, 1)[0];
    return domain ? domain.toLowerCase() : null;
  });
}
|
|
1740
|
+
// Register urlExtractDomain under the name "url_extract_domain": series-mode,
// accepts "url" and "string" inputs, priority 40. No autoApply flag is set.
registerTransform(
|
|
1741
|
+
{ name: "url_extract_domain", inputTypes: ["url", "string"], priority: 40, mode: "series" },
|
|
1742
|
+
urlExtractDomain
|
|
1743
|
+
);
|
|
1744
|
+
|
|
1745
|
+
// src/core/transforms/auto-correct.ts
|
|
1746
|
+
init_registry();
|
|
1747
|
+
/**
 * Levenshtein edit distance between two strings, compared per UTF-16 code
 * unit (insert, delete and substitute each cost 1).
 *
 * Keeps only the previous and the current row of the distance table.
 *
 * @param {string} a - First string.
 * @param {string} b - Second string.
 * @returns {number} Minimum number of single-unit edits turning `a` into `b`.
 */
function levenshtein(a, b) {
  const rowCount = a.length;
  const colCount = b.length;
  if (rowCount === 0) return colCount;
  if (colCount === 0) return rowCount;
  // Row 0: distance from the empty prefix of `a` to each prefix of `b`.
  let above = Array.from({ length: colCount + 1 }, (_, j) => j);
  for (let i = 1; i <= rowCount; i++) {
    const current = [i];
    for (let j = 1; j <= colCount; j++) {
      if (a[i - 1] === b[j - 1]) {
        current[j] = above[j - 1];
      } else {
        const substitute = above[j - 1];
        const remove = above[j];
        const insert = current[j - 1];
        current[j] = 1 + Math.min(substitute, remove, insert);
      }
    }
    above = current;
  }
  return above[colCount];
}
|
|
1769
|
+
/**
 * Similarity score in the range 0-100 derived from the edit distance:
 * `100 * (1 - distance / longerLength)`. Two empty strings score 100.
 *
 * @param {string} a - First string.
 * @param {string} b - Second string.
 * @returns {number} 100 for identical strings, lower as they diverge.
 */
function fuzzyRatio(a, b) {
  const longest = Math.max(a.length, b.length);
  if (longest === 0) return 100;
  const distance = levenshtein(a, b);
  return 100 * (1 - distance / longest);
}
|
|
1775
|
+
/**
 * Snap rare category spellings onto frequent ones.
 *
 * Steps:
 *  1. Count string values case-insensitively and remember how often each
 *     exact spelling occurs.
 *  2. Every lower-cased value whose share of all string values is at least
 *     `frequencyThreshold` becomes canonical; its most frequent spelling
 *     (first seen wins ties) is the canonical form.
 *  3. Every other value is compared with each canonical via `fuzzyRatio`; the
 *     best match scoring at least `matchThreshold` becomes its correction.
 *  4. Values are rewritten to their correction or canonical spelling; all
 *     other values, nulls and non-strings are returned unchanged.
 *
 * @param {Array} values - Column values.
 * @param {number|string} [frequencyThreshold=0.05] - Minimum share (0-1) for
 *   a value to be canonical. Non-number input is coerced with `Number`,
 *   falling back to 0.05 when the result is falsy.
 * @param {number|string} [matchThreshold=85] - Minimum `fuzzyRatio` score;
 *   coerced the same way with fallback 85.
 * @returns {Array} New array of the same length.
 */
function categoryAutoCorrect(values, frequencyThreshold = 0.05, matchThreshold = 85) {
  const toNumber = (raw, fallback) => (typeof raw === "number" ? raw : Number(raw) || fallback);
  const minShare = toNumber(frequencyThreshold, 0.05);
  const minScore = toNumber(matchThreshold, 85);

  // Pass 1: tallies keyed by the lower-cased value.
  const countByKey = /* @__PURE__ */ new Map();
  const spellingsByKey = /* @__PURE__ */ new Map();
  let stringTotal = 0;
  for (const value of values) {
    if (typeof value !== "string") continue;
    const key = value.toLowerCase();
    stringTotal += 1;
    countByKey.set(key, (countByKey.get(key) ?? 0) + 1);
    if (!spellingsByKey.has(key)) {
      spellingsByKey.set(key, /* @__PURE__ */ new Map());
    }
    const spellings = spellingsByKey.get(key);
    spellings.set(value, (spellings.get(value) ?? 0) + 1);
  }
  if (stringTotal === 0) return values.slice();

  // Pass 2: frequent keys become canonical, spelled the way they most often appear.
  const canonicals = /* @__PURE__ */ new Map();
  for (const [key, count] of countByKey) {
    if (!(count / stringTotal >= minShare)) continue;
    let winner = key;
    let winnerUses = 0;
    for (const [spelling, uses] of spellingsByKey.get(key)) {
      if (uses > winnerUses) {
        winnerUses = uses;
        winner = spelling;
      }
    }
    canonicals.set(key, winner);
  }
  if (canonicals.size === 0) return values.slice();

  // Pass 3: pick the closest canonical for every non-canonical key.
  const corrections = /* @__PURE__ */ new Map();
  for (const key of countByKey.keys()) {
    if (canonicals.has(key)) continue;
    let target = null;
    let targetScore = 0;
    for (const [canonKey, canonSpelling] of canonicals) {
      const score = fuzzyRatio(key, canonKey);
      if (score >= minScore && score > targetScore) {
        targetScore = score;
        target = canonSpelling;
      }
    }
    if (target !== null) {
      corrections.set(key, target);
    }
  }

  // Pass 4: rewrite, preferring a fuzzy correction over a casing fix.
  return values.map((value) => {
    if (typeof value !== "string") return value;
    const key = value.toLowerCase();
    return corrections.get(key) ?? canonicals.get(key) ?? value;
  });
}
|
|
1836
|
+
// Register categoryAutoCorrect under the name "category_auto_correct":
// series-mode, string input, flagged autoApply with priority 35.
registerTransform(
|
|
1837
|
+
{ name: "category_auto_correct", inputTypes: ["string"], autoApply: true, priority: 35, mode: "series" },
|
|
1838
|
+
categoryAutoCorrect
|
|
1839
|
+
);
|
|
1840
|
+
|
|
1841
|
+
// src/core/transforms/index.ts
|
|
1842
|
+
init_registry();
|
|
1843
|
+
|
|
1844
|
+
// src/core/engine/profiler-bridge.ts
|
|
1845
|
+
init_types();
|
|
1846
|
+
|
|
1847
|
+
// src/core/data.ts
|
|
1848
|
+
// Lower-cased, trimmed spellings that are treated as "no value".
var NULL_STRINGS = /* @__PURE__ */ new Set([
  "",
  "null",
  "none",
  "nan",
  "n/a",
  "na",
  "nil",
  "#n/a",
  "missing",
  "undefined"
]);
/**
 * Decide whether a raw cell value represents "no value".
 *
 * True for `null`, `undefined`, `NaN`, and any string whose lower-cased,
 * trimmed form is listed in NULL_STRINGS. Everything else is false.
 *
 * @param {*} v - Raw cell value.
 * @returns {boolean}
 */
function isNullish(v) {
  switch (typeof v) {
    case "undefined":
      return true;
    case "string":
      return NULL_STRINGS.has(v.toLowerCase().trim());
    case "number":
      return Number.isNaN(v);
    default:
      return v === null;
  }
}
|
|
1866
|
+
/**
 * Coerce a raw cell into a column value.
 *
 * Anything `isNullish` reports as missing becomes `null`; strings, numbers
 * and booleans are kept; every other value is converted with `String`.
 *
 * @param {*} v - Raw cell value.
 * @returns {string|number|boolean|null}
 */
function toColumnValue(v) {
  if (isNullish(v)) return null;
  const kind = typeof v;
  const isPrimitive = kind === "string" || kind === "number" || kind === "boolean";
  return isPrimitive ? v : String(v);
}
|
|
1873
|
+
/**
 * Create a small seeded pseudo-random generator.
 *
 * The seed is truncated to a 32-bit integer. Each call of the returned
 * function advances the internal state and yields a number in [0, 1); the
 * sequence is fully determined by the seed.
 *
 * @param {number} seed - Seed value.
 * @returns {() => number} Generator function.
 */
function mulberry32(seed) {
  let state = seed | 0;
  return function next() {
    state = (state + 0x6d2b79f5) | 0;
    let mixed = Math.imul(state ^ (state >>> 15), 1 | state);
    mixed = (mixed + Math.imul(mixed ^ (mixed >>> 7), 61 | mixed)) ^ mixed;
    const unsigned = (mixed ^ (mixed >>> 14)) >>> 0;
    return unsigned / 4294967296;
  };
}
|
|
1882
|
+
/**
 * Thin table wrapper around an array of plain row objects.
 *
 * Column names are taken from the first row. `column()` returns null-coerced
 * values (via `toColumnValue`) and memoises them per column name;
 * `rawColumn()` returns the values without null coercion.
 */
var TabularData = class _TabularData {
  /** Row objects exactly as handed to the constructor (not copied). */
  _rows;
  /** Memo of coerced column arrays, keyed by column name. */
  _columnCache = /* @__PURE__ */ new Map();
  /** @param {Array<Object>} rows - Row objects. */
  constructor(rows) {
    this._rows = rows;
  }
  /** The underlying row array. */
  get rows() {
    return this._rows;
  }
  /** Keys of the first row, or an empty list when there are no rows. */
  get columns() {
    if (this._rows.length === 0) return [];
    return Object.keys(this._rows[0]);
  }
  /** Number of rows. */
  get rowCount() {
    return this._rows.length;
  }
  // ---- Column access ----
  /**
   * Values of one column with null-like cells coerced to `null`.
   * The same array instance is returned on later calls for the same name.
   *
   * @param {string} name - Column name.
   * @returns {Array<string|number|boolean|null>}
   */
  column(name) {
    const cached = this._columnCache.get(name);
    if (cached) return cached;
    const values = this._rows.map((r) => toColumnValue(r[name]));
    this._columnCache.set(name, values);
    return values;
  }
  /** Raw column access — preserves original values without null coercion.
   *  Use for profiling where "N/A" should remain a string, not become null.
   *  Only `null`/`undefined` map to `null`; values that are not string, number
   *  or boolean are converted with `String`. */
  rawColumn(name) {
    return this._rows.map((r) => {
      const v = r[name];
      if (v === null || v === void 0) return null;
      if (typeof v === "string" || typeof v === "number" || typeof v === "boolean") return v;
      return String(v);
    });
  }
  // ---- Null handling ----
  /** Number of `null` entries in the coerced column. */
  nullCount(col) {
    let count = 0;
    for (const v of this.column(col)) {
      if (v === null) count++;
    }
    return count;
  }
  /** Coerced column values with the `null` entries removed (new array). */
  dropNulls(col) {
    return this.column(col).filter((v) => v !== null);
  }
  // ---- Type inference ----
  /**
   * Coarse type of a column's non-null values.
   *
   * @param {string} col - Column name.
   * @returns {"null"|"string"|"boolean"|"float"|"integer"} "null" when every
   *   value is null; "string" as soon as any value is neither boolean nor
   *   number; "boolean" only when all values are booleans; otherwise "float"
   *   if any number is non-integer, else "integer".
   */
  dtype(col) {
    const values = this.dropNulls(col);
    if (values.length === 0) return "null";
    let hasInt = false;
    let hasFloat = false;
    let hasBool = false;
    let hasString = false;
    for (const v of values) {
      if (typeof v === "boolean") {
        hasBool = true;
      } else if (typeof v === "number") {
        if (Number.isInteger(v)) hasInt = true;
        else hasFloat = true;
      } else {
        hasString = true;
      }
    }
    if (hasString) return "string";
    if (hasBool && !hasInt && !hasFloat) return "boolean";
    if (hasFloat) return "float";
    if (hasInt) return "integer";
    return "string";
  }
  // ---- Aggregation ----
  /** Number of distinct non-null values. */
  nUnique(col) {
    const set = /* @__PURE__ */ new Set();
    for (const v of this.dropNulls(col)) set.add(v);
    return set.size;
  }
  /** Map of non-null value -> number of occurrences. */
  valueCounts(col) {
    const map = /* @__PURE__ */ new Map();
    for (const v of this.dropNulls(col)) {
      map.set(v, (map.get(v) ?? 0) + 1);
    }
    return map;
  }
  /** Smallest finite number in the column, or null when there is none.
   *  MUST use loop — Math.min(...array) crashes on >65K elements. */
  min(col) {
    const nums = this.numericValues(col);
    if (nums.length === 0) return null;
    let m = nums[0];
    for (let i = 1; i < nums.length; i++) {
      if (nums[i] < m) m = nums[i];
    }
    return m;
  }
  /** Largest finite number in the column, or null when there is none.
   *  MUST use loop — Math.max(...array) crashes on >65K elements. */
  max(col) {
    const nums = this.numericValues(col);
    if (nums.length === 0) return null;
    let m = nums[0];
    for (let i = 1; i < nums.length; i++) {
      if (nums[i] > m) m = nums[i];
    }
    return m;
  }
  /** Arithmetic mean of the finite numbers, or null when there are none. */
  mean(col) {
    const nums = this.numericValues(col);
    if (nums.length === 0) return null;
    let sum = 0;
    for (const n of nums) sum += n;
    return sum / nums.length;
  }
  /** Sample standard deviation (divides by n - 1); null with fewer than two numbers. */
  std(col) {
    const nums = this.numericValues(col);
    if (nums.length < 2) return null;
    const avg = this.mean(col);
    let sumSq = 0;
    for (const n of nums) sumSq += (n - avg) ** 2;
    return Math.sqrt(sumSq / (nums.length - 1));
  }
  // ---- Filtering & sampling ----
  /** New table containing the rows for which `predicate` returns truthy. */
  filter(predicate) {
    return new _TabularData(this._rows.filter(predicate));
  }
  /** New table containing the first `n` rows. */
  head(n) {
    return new _TabularData(this._rows.slice(0, n));
  }
  /**
   * Pseudo-random sample of `n` rows, reproducible for a given seed.
   * Returns this same instance when `n` is at least the row count.
   *
   * @param {number} n - Number of rows wanted.
   * @param {number} [seed=42] - Seed for the generator.
   * @returns {_TabularData}
   */
  sample(n, seed = 42) {
    if (n >= this._rows.length) return this;
    const rng = mulberry32(seed);
    const indices = Array.from({ length: this._rows.length }, (_, i) => i);
    // Partial Fisher-Yates shuffle from the end: only the last `n` slots are
    // settled, which is all the slice below needs.
    for (let i = indices.length - 1; i > 0 && indices.length - 1 - i < n; i--) {
      const j = Math.floor(rng() * (i + 1));
      [indices[i], indices[j]] = [indices[j], indices[i]];
    }
    const sampled = indices.slice(indices.length - n).map((i) => this._rows[i]);
    return new _TabularData(sampled);
  }
  // ---- String operations ----
  /**
   * Per-row flag telling whether the string value matches `pattern`.
   * Non-string values (including null) yield false.
   *
   * `lastIndex` is reset before every test: a regex with the `g` or `y` flag
   * keeps its position between `test()` calls, which would otherwise make the
   * result for one row depend on the previous row.
   *
   * @param {string} col - Column name.
   * @param {RegExp} pattern - Pattern to test.
   * @returns {boolean[]}
   */
  strContains(col, pattern) {
    const flags = this.column(col).map((v) => {
      if (typeof v !== "string") return false;
      pattern.lastIndex = 0;
      return pattern.test(v);
    });
    // Leave the caller's regex in a clean state as well.
    pattern.lastIndex = 0;
    return flags;
  }
  /** Per-row string length; non-string values (including null) count as 0. */
  strLengths(col) {
    return this.column(col).map(
      (v) => typeof v === "string" ? v.length : 0
    );
  }
  // ---- Casting ----
  /** Column converted with `Number`; null or non-finite results become null. */
  castFloat(col) {
    return this.column(col).map((v) => {
      if (v === null) return null;
      const n = Number(v);
      return Number.isFinite(n) ? n : null;
    });
  }
  /** Like castFloat, but finite results are truncated toward zero. */
  castInt(col) {
    return this.column(col).map((v) => {
      if (v === null) return null;
      const n = Number(v);
      return Number.isFinite(n) ? Math.trunc(n) : null;
    });
  }
  // ---- Helpers ----
  /** Finite numbers of the column, in row order (new array). */
  numericValues(col) {
    const result = [];
    for (const v of this.column(col)) {
      if (typeof v === "number" && Number.isFinite(v)) {
        result.push(v);
      }
    }
    return result;
  }
  /** String values of the column, in row order (new array). */
  stringValues(col) {
    const result = [];
    for (const v of this.column(col)) {
      if (typeof v === "string") result.push(v);
    }
    return result;
  }
  /** Finite numbers of the column sorted ascending (new array). */
  sortedNumeric(col) {
    return this.numericValues(col).sort((a, b) => a - b);
  }
  /**
   * Whether the finite numbers of the column are in non-decreasing order
   * (or non-increasing when `descending` is true). Equal neighbours are allowed.
   */
  isSorted(col, descending = false) {
    const nums = this.numericValues(col);
    for (let i = 1; i < nums.length; i++) {
      if (descending ? nums[i] > nums[i - 1] : nums[i] < nums[i - 1]) {
        return false;
      }
    }
    return true;
  }
};
|
|
2073
|
+
|
|
2074
|
+
// src/core/engine/profiler-bridge.ts
|
|
2075
|
+
// Value-shape patterns used when inferring a column's semantic type.
var EMAIL_RE2 = /^[^@\s]+@[^@\s]+\.[^@\s]+$/;
var PHONE_RE = /^[+(]?\d[\d()\-.\s]{6,18}\d$/;
var DATE_RE = /^(\d{4}[-/]\d{1,2}[-/]\d{1,2}|\d{1,2}[-/]\d{1,2}[-/]\d{2,4}|[A-Za-z]{3,9}\s+\d{1,2},?\s+\d{4})$/;
var NAME_RE = /^[A-Z][a-z]+(\s+[A-Z][a-z]+)+$/;
var ZIP_RE = /^\d{5}(-\d{4})?$/;
// Semantic type -> substrings that, when found in a column name, select that
// type. Entries are checked in this order; the first hit wins.
var NAME_PATTERNS = {
  zip: ["zip", "postal", "zipcode", "zip_code", "postal_code"],
  phone: ["phone", "tel", "mobile", "cell", "fax"],
  email: ["email", "e_mail", "mail"],
  date: ["date", "created", "updated", "timestamp", "dob", "birth"],
  state: ["state", "province", "region"],
  name: ["first_name", "last_name", "fname", "lname", "full_name", "fullname"]
};
/**
 * Refine a generic inferred type using hints in the column name.
 *
 * Only the generic types "string" and "numeric" can be overridden; any other
 * `currentType` is returned untouched. The column name is lower-cased and its
 * hyphens are turned into underscores before a plain substring search against
 * NAME_PATTERNS.
 *
 * @param {string} columnName - Column header.
 * @param {string} currentType - Type inferred from the values.
 * @returns {string} The matching semantic type, or `currentType`.
 */
function overrideTypeByColumnName(columnName, currentType) {
  const overridable = currentType === "string" || currentType === "numeric";
  if (!overridable) return currentType;
  const normalized = columnName.toLowerCase().replace(/-/g, "_");
  const hit = Object.entries(NAME_PATTERNS).find(
    ([, hints]) => hints.some((hint) => normalized.includes(hint))
  );
  return hit ? hit[0] : currentType;
}
|
|
2098
|
+
/**
 * Infer a semantic type for a column from its values and its name.
 *
 * - No non-null values: "string".
 * - Numbers present and no booleans: "numeric", refined by column name.
 * - Booleans present and no numbers: "boolean".
 * - Otherwise the first 100 non-blank trimmed strings are tested against the
 *   email, zip, date, phone and name patterns, in that order; the first type
 *   whose match ratio reaches its threshold wins.
 * - Falls back to "string", refined by column name.
 *
 * @param {Array} values - Column values, with `null` marking missing cells.
 * @param {string} columnName - Column header used for name-based overrides.
 * @returns {string} Inferred type name.
 */
function inferType(values, columnName) {
  const present = values.filter((v) => v !== null);
  if (present.length === 0) return "string";

  const sawNumber = present.some((v) => typeof v === "number");
  const sawBoolean = present.some((v) => typeof v === "boolean");
  if (sawNumber && !sawBoolean) return overrideTypeByColumnName(columnName, "numeric");
  if (sawBoolean && !sawNumber) return "boolean";

  const texts = present
    .filter((v) => typeof v === "string")
    .map((text) => text.trim())
    .filter((text) => text !== "");
  if (texts.length === 0) return "string";

  const sample = texts.slice(0, 100);
  // [type name, pattern, minimum share of the sample that must match]
  const detectors = [
    ["email", EMAIL_RE2, 0.7],
    ["zip", ZIP_RE, 0.7],
    ["date", DATE_RE, 0.5],
    ["phone", PHONE_RE, 0.6],
    ["name", NAME_RE, 0.5]
  ];
  for (const [typeName, pattern, threshold] of detectors) {
    const hits = sample.filter((text) => pattern.test(text)).length;
    if (hits / sample.length >= threshold) {
      return overrideTypeByColumnName(columnName, typeName);
    }
  }
  return overrideTypeByColumnName(columnName, "string");
}
|
|
2136
|
+
/**
 * Build the profile of a single column from its raw (non-coerced) values.
 *
 * Counts nulls and distinct non-null values, keeps the first five non-null
 * values as string samples, and infers the column type. Both percentages are
 * fractions of the total row count (0 when the table is empty).
 *
 * @param {Object} data - Table exposing `rawColumn(name)`.
 * @param {string} columnName - Column to profile.
 * @returns {Object} Result of `makeColumnProfile` for this column.
 */
function profileColumn(data, columnName) {
  const values = data.rawColumn(columnName);
  const rowCount = values.length;

  const present = [];
  for (const value of values) {
    if (value !== null) present.push(value);
  }
  const nullCount = rowCount - present.length;
  const uniqueCount = new Set(present).size;
  const shareOfRows = (count) => (rowCount > 0 ? count / rowCount : 0);

  const sampleValues = present.slice(0, 5).map((value) => String(value));
  const inferredType = inferType(values, columnName);
  return makeColumnProfile({
    name: columnName,
    inferredType,
    rowCount,
    nullCount,
    nullPct: shareOfRows(nullCount),
    uniqueCount,
    uniquePct: shareOfRows(uniqueCount),
    sampleValues
  });
}
|
|
2164
|
+
/**
 * Profile every column of a row set.
 *
 * @param {Array<Object>} rows - Row objects; column names come from the first row.
 * @param {string} [filePath=""] - Source label copied into the result.
 * @returns {{filePath: string, rowCount: number, columnCount: number, columns: Array}}
 *   Dataset profile with one entry per column.
 */
function profileDataframe(rows, filePath = "") {
  const table = new TabularData(rows);
  const columnNames = table.columns;
  const columns = columnNames.map((name) => profileColumn(table, name));
  return {
    filePath,
    rowCount: table.rowCount,
    columnCount: columnNames.length,
    columns
  };
}
|
|
2174
|
+
|
|
2175
|
+
// src/core/engine/selector.ts
|
|
2176
|
+
// Inferred types that are also eligible for transforms declared for "string".
var STRING_LIKE_TYPES = /* @__PURE__ */ new Set([
  "string",
  "email",
  "phone",
  "name",
  "address",
  "date"
]);
/**
 * Choose the auto-apply transforms for one profiled column.
 *
 * A registered transform is selected when it is flagged `autoApply` and
 * either lists the column's inferred type among its input types, or lists
 * "string" while the inferred type is string-like. For columns where more
 * than 10% of rows hold distinct values, "category_auto_correct" is dropped.
 * The result is ordered by descending priority.
 *
 * @param {Object} profile - Column profile (`inferredType`, `uniquePct`).
 * @param {number} [_confidenceThreshold=0.8] - Currently unused.
 * @returns {Array<Object>} Selected transform descriptors.
 */
function selectTransforms(profile, _confidenceThreshold = 0.8) {
  const columnType = profile.inferredType;
  const accepts = (t) =>
    t.inputTypes.includes(columnType) ||
    (t.inputTypes.includes("string") && STRING_LIKE_TYPES.has(columnType));
  const highCardinality = profile.uniquePct > 0.1;
  return listTransforms()
    .filter((t) => t.autoApply && accepts(t))
    .filter((t) => !(highCardinality && t.name === "category_auto_correct"))
    .sort((a, b) => b.priority - a.priority);
}
|
|
2201
|
+
|
|
2202
|
+
// src/core/engine/transformer.ts
|
|
2203
|
+
/**
 * Applies a cleaning configuration to an array of row objects and records
 * every step in a manifest.
 *
 * Pipeline order in `transformDf`: column transforms (the configured ones, or
 * auto-selected ones when none are configured), then splits, renames, drops,
 * filters and finally de-duplication.
 */
var TransformEngine = class {
  /** Normalised configuration produced by `makeConfig`. */
  config;
  /** @param {Object} config - Raw configuration, passed through `makeConfig`. */
  constructor(config) {
    this.config = makeConfig(config);
  }
  /**
   * Run the whole pipeline.
   *
   * @param {Array<Object>} rows - Input rows; the array itself is not mutated.
   * @param {string} [source="<dataframe>"] - Label stored in the manifest.
   * @returns {{rows: Array<Object>, columns: string[], manifest: Object}}
   *   Transformed rows, the keys of the first output row, and the manifest.
   */
  transformDf(rows, source = "<dataframe>") {
    const manifest = new MutableManifest(source);
    let currentRows = [...rows];
    // 1. Column transforms: explicit config wins over auto-selection.
    if (this.config.transforms.length > 0) {
      currentRows = this._applyConfigTransforms(currentRows, manifest);
    } else {
      currentRows = this._applyAutoTransforms(currentRows, manifest, source);
    }
    // 2. Splits: only dataframe-mode transforms are applied; a split whose
    //    source column is absent from the first row is skipped.
    for (const split of this.config.splits) {
      if (currentRows.length === 0 || !(split.source in currentRows[0])) continue;
      const info = getTransform(split.method);
      if (info && info.mode === "dataframe") {
        currentRows = info.func(currentRows, split.source);
      }
    }
    // 3. Renames: rebuild every row so the key order is preserved.
    for (const [oldName, newName] of Object.entries(this.config.renames)) {
      if (currentRows.length === 0 || !(oldName in currentRows[0])) continue;
      currentRows = currentRows.map((row) => {
        const newRow = {};
        for (const [k, v] of Object.entries(row)) {
          newRow[k === oldName ? newName : k] = v;
        }
        return newRow;
      });
    }
    // 4. Drops: only columns present in the first row are removed.
    const dropCols = new Set(this.config.drop);
    if (dropCols.size > 0 && currentRows.length > 0) {
      const existingDrops = [...dropCols].filter((c) => c in currentRows[0]);
      if (existingDrops.length > 0) {
        const dropSet = new Set(existingDrops);
        currentRows = currentRows.map((row) => {
          const newRow = {};
          for (const [k, v] of Object.entries(row)) {
            if (!dropSet.has(k)) newRow[k] = v;
          }
          return newRow;
        });
      }
    }
    // 5. Row filters.
    for (const filt of this.config.filters) {
      if (currentRows.length === 0 || !(filt.column in currentRows[0])) continue;
      currentRows = this._applyFilter(currentRows, filt.column, filt.condition);
    }
    // 6. De-duplication on the configured key columns.
    if (this.config.dedup) {
      const dedupCols = this.config.dedup.columns.filter(
        (c) => currentRows.length > 0 && c in currentRows[0]
      );
      if (dedupCols.length > 0) {
        const before = currentRows.length;
        const seen = /* @__PURE__ */ new Set();
        const deduped = [];
        // keep === "last": walk backwards so the last occurrence is seen
        // first, then restore the original order afterwards.
        const iterRows = this.config.dedup.keep === "last" ? [...currentRows].reverse() : currentRows;
        for (const row of iterRows) {
          const key = dedupCols.map((c) => String(row[c] ?? "")).join("\0");
          if (!seen.has(key)) {
            seen.add(key);
            deduped.push(row);
          }
        }
        if (this.config.dedup.keep === "last") deduped.reverse();
        currentRows = deduped;
        const after = currentRows.length;
        if (before !== after) {
          manifest.addRecord(
            makeTransformRecord({
              column: dedupCols.join(","),
              transform: "dedup",
              affectedRows: before - after,
              totalRows: before
            })
          );
        }
      }
    }
    const columns = currentRows.length > 0 ? Object.keys(currentRows[0]) : [];
    return { rows: currentRows, columns, manifest };
  }
  /**
   * Apply the transforms listed in the configuration, in order.
   * Specs for columns missing from the first row are skipped; unknown
   * transform names are recorded as manifest errors with row index -1.
   */
  _applyConfigTransforms(rows, manifest) {
    for (const spec of this.config.transforms) {
      if (rows.length === 0 || !(spec.column in rows[0])) continue;
      for (const opRaw of spec.ops) {
        const [name, params] = parseTransformName(opRaw);
        const info = getTransform(name);
        if (!info) {
          manifest.addError(
            spec.column,
            name,
            -1,
            `Transform '${name}' not found in registry`
          );
          continue;
        }
        rows = this._applySingleTransform(rows, spec.column, info, params, manifest);
      }
    }
    return rows;
  }
  /**
   * Profile the rows once, then apply the transforms `selectTransforms`
   * picks for each column (with no parameters).
   */
  _applyAutoTransforms(rows, manifest, source) {
    const filePath = source !== "<dataframe>" ? source : "";
    const profile = profileDataframe(rows, filePath);
    for (const colProfile of profile.columns) {
      const selected = selectTransforms(colProfile);
      for (const info of selected) {
        rows = this._applySingleTransform(
          rows,
          colProfile.name,
          info,
          [],
          manifest
        );
      }
    }
    return rows;
  }
  /**
   * Apply one transform to one column and record the outcome.
   *
   * Dataframe-mode transforms receive the rows and the column name.
   * Series-mode transforms receive the column values and return either the
   * new values, or a pair `[newValues, flaggedRowIndexes]` whose flagged rows
   * are recorded as "Flagged for review" errors. Any exception thrown by the
   * transform is recorded as a manifest error (row index -1) and the input
   * rows are returned unchanged.
   *
   * @returns {Array<Object>} Resulting rows; unchanged rows keep their identity.
   */
  _applySingleTransform(rows, column, info, params, manifest) {
    const totalRows = rows.length;
    const beforeSample = rows.slice(0, 3).map((r) => String(r[column] ?? ""));
    try {
      let newRows;
      if (info.mode === "dataframe") {
        newRows = info.func(rows, column, ...castParams(params));
      } else {
        const values = rows.map((r) => {
          const v = r[column];
          if (v === null || v === void 0) return null;
          if (typeof v === "string" || typeof v === "number" || typeof v === "boolean") return v;
          return String(v);
        });
        const typedParams = castParams(params);
        const result = typedParams.length > 0 ? info.func(values, ...typedParams) : info.func(values);
        let newValues;
        if (Array.isArray(result) && result.length === 2 && Array.isArray(result[1])) {
          newValues = result[0];
          const flagged = result[1];
          for (const rowIdx of flagged) {
            manifest.addError(column, info.name, rowIdx, "Flagged for review");
          }
        } else {
          newValues = result;
        }
        newRows = rows.map((row, i) => {
          const oldVal = row[column] ?? null;
          if (newValues[i] === oldVal) return row;
          return { ...row, [column]: newValues[i] };
        });
      }
      const afterSample = newRows.slice(0, 3).map((r) => String(r[column] ?? ""));
      let changed = 0;
      for (let i = 0; i < Math.min(rows.length, newRows.length); i++) {
        if (String(rows[i][column] ?? "") !== String(newRows[i][column] ?? "")) {
          changed++;
        }
      }
      manifest.addRecord(
        makeTransformRecord({
          column,
          transform: info.name,
          affectedRows: changed,
          totalRows,
          sampleBefore: beforeSample,
          sampleAfter: afterSample
        })
      );
      return newRows;
    } catch (e) {
      manifest.addError(
        column,
        info.name,
        -1,
        e instanceof Error ? e.message : String(e)
      );
      return rows;
    }
  }
  /**
   * Filter rows by a condition on one column.
   *
   * Supported conditions:
   *  - "not_null": keep rows whose value is neither null nor undefined;
   *  - "after:<bound>": keep rows whose stringified value sorts after `<bound>`;
   *  - "before:<bound>": keep rows whose stringified value sorts before `<bound>`.
   * Comparison is plain string comparison. Rows with a null/undefined value
   * never satisfy "after:" or "before:"; previously they were coerced to ""
   * and therefore always passed "before:".
   * Any other condition returns the rows untouched.
   *
   * @param {Array<Object>} rows - Rows to filter.
   * @param {string} column - Column to test.
   * @param {string} condition - One of the conditions above.
   * @returns {Array<Object>}
   */
  _applyFilter(rows, column, condition) {
    const hasValue = (r) => r[column] !== null && r[column] !== void 0;
    if (condition === "not_null") {
      return rows.filter(hasValue);
    }
    if (condition.startsWith("after:")) {
      const dateStr = condition.slice(6);
      return rows.filter((r) => hasValue(r) && String(r[column]) > dateStr);
    }
    if (condition.startsWith("before:")) {
      const dateStr = condition.slice(7);
      return rows.filter((r) => hasValue(r) && String(r[column]) < dateStr);
    }
    return rows;
  }
};
|
|
2397
|
+
/**
 * Convert textual transform parameters to numbers where the WHOLE parameter
 * is a number; everything else is passed through untouched.
 *
 * `parseFloat` used to be applied directly, and it accepts any numeric
 * prefix: "2024-01-01" became 2024 and "3 months" became 3. A parameter is
 * now converted only when, after trimming, it is entirely a decimal number
 * (optional sign, optional fraction, optional exponent) or a signed
 * "Infinity".
 *
 * @param {Array} params - Raw parameters (normally strings).
 * @returns {Array} Same length; numeric strings replaced by numbers.
 */
function castParams(params) {
  const numeric = /^[+-]?(?:(?:\d+\.?\d*|\.\d+)(?:[eE][+-]?\d+)?|Infinity)$/;
  return params.map((p) => {
    if (typeof p !== "string") return p;
    const text = p.trim();
    return numeric.test(text) ? Number(text) : p;
  });
}
|
|
2406
|
+
|
|
2407
|
+
// src/core/engine/differ.ts
|
|
2408
|
+
/**
 * Summarise the differences between two row sets.
 *
 * Column sets come from the first row of each side. When the row counts
 * differ, every shared column is reported as changed and the absolute row
 * difference is added to `totalChanges` once per shared column; no
 * per-column detail is recorded in that case. Otherwise rows are compared
 * position by position on their stringified values (null/undefined count as
 * the empty string).
 *
 * @param {Array<Object>} before - Rows before the transformation.
 * @param {Array<Object>} after - Rows after the transformation.
 * @returns {{totalChanges: number, changedColumns: string[], addedColumns: string[],
 *   removedColumns: string[], rowCountBefore: number, rowCountAfter: number,
 *   columnDetails: Object}} Diff summary; column lists are sorted.
 */
function diffDataframes(before, after) {
  const firstRowKeys = (rows) => new Set(rows.length > 0 ? Object.keys(rows[0]) : []);
  const beforeCols = firstRowKeys(before);
  const afterCols = firstRowKeys(after);

  const addedColumns = [...afterCols].filter((c) => !beforeCols.has(c)).sort();
  const removedColumns = [...beforeCols].filter((c) => !afterCols.has(c)).sort();
  const sharedColumns = [...beforeCols].filter((c) => afterCols.has(c)).sort();

  const asText = (value) => String(value ?? "");
  const sameLength = before.length === after.length;
  const lengthGap = Math.abs(before.length - after.length);

  const changedColumns = [];
  const columnDetails = {};
  let totalChanges = 0;
  for (const col of sharedColumns) {
    if (!sameLength) {
      changedColumns.push(col);
      totalChanges += lengthGap;
      continue;
    }
    let changedRows = 0;
    for (let i = 0; i < before.length; i++) {
      if (asText(before[i][col]) !== asText(after[i][col])) changedRows += 1;
    }
    if (changedRows === 0) continue;
    changedColumns.push(col);
    totalChanges += changedRows;
    columnDetails[col] = { changedRows };
  }

  return {
    totalChanges,
    changedColumns,
    addedColumns,
    removedColumns,
    rowCountBefore: before.length,
    rowCountAfter: after.length,
    columnDetails
  };
}
|
|
2445
|
+
|
|
2446
|
+
// src/core/config/learner.ts
|
|
2447
|
+
init_types();
|
|
2448
|
+
/**
 * Build a config by profiling the rows and collecting, per column, the names
 * of the transforms chosen by `selectTransforms`.
 *
 * Columns for which no transform is selected are left out. An empty `source`
 * is stored as null.
 *
 * @param {Array<Object>} rows - Data rows to profile.
 * @param {string} [source=""] - Source label passed to the profiler.
 * @returns {*} The value returned by `makeConfig`.
 */
function learnConfig(rows, source = "") {
  const profile = profileDataframe(rows, source);
  const transforms = [];
  profile.columns.forEach((column) => {
    const ops = selectTransforms(column).map((chosen) => chosen.name);
    if (ops.length === 0) return;
    transforms.push({ column: column.name, ops });
  });
  return makeConfig({ source: source || null, transforms });
}
|
|
2465
|
+
|
|
2466
|
+
// src/core/config/loader.ts
|
|
2467
|
+
init_types();
|
|
2468
|
+
|
|
2469
|
+
// src/core/config/schema.ts
|
|
2470
|
+
init_types();
|
|
2471
|
+
init_types();
|
|
2472
|
+
/**
 * Normalise a parsed config object into the fields passed to `makeConfig`.
 *
 * List sections (`transforms`, `splits`, `filters`, `mappings`, `drop`) that
 * are not arrays become empty lists. `renames` must be a plain mapping;
 * anything else (including an array) becomes `{}`. `dedup.keep` is "last"
 * only when explicitly set to "last", otherwise "first".
 *
 * @param {Object} raw - Parsed config data (already known to be an object).
 * @returns {*} The value returned by `makeConfig`.
 * @throws {TypeError} When a list section contains a null/undefined entry.
 */
function validateConfig(raw) {
  // Return raw[key] as a list of entries, or [] when it is not an array.
  // Null entries (an empty "- " item in YAML) are rejected with a readable
  // message instead of failing later on a property access.
  const entriesOf = (key) => {
    const value = raw[key];
    if (!Array.isArray(value)) return [];
    value.forEach((entry, index) => {
      if (entry === null || entry === void 0) {
        throw new TypeError(`Config "${key}" entry ${index} is empty; expected a mapping`);
      }
    });
    return value;
  };
  const stringList = (value) => (Array.isArray(value) ? value.map(String) : []);

  const transforms = entriesOf("transforms").map((t) => ({
    column: String(t["column"] ?? ""),
    ops: stringList(t["ops"])
  }));
  const splits = entriesOf("splits").map((s) => ({
    source: String(s["source"] ?? ""),
    target: stringList(s["target"]),
    method: String(s["method"] ?? "")
  }));

  // An array is also typeof "object"; treating it as a mapping would produce
  // renames keyed by "0", "1", ... so it is ignored like any other wrong type.
  const renamesRaw = raw["renames"];
  const renames = renamesRaw && typeof renamesRaw === "object" && !Array.isArray(renamesRaw)
    ? Object.fromEntries(Object.entries(renamesRaw).map(([k, v]) => [k, String(v)]))
    : {};

  const drop = stringList(raw["drop"]);
  const filters = entriesOf("filters").map((f) => ({
    column: String(f["column"] ?? ""),
    condition: String(f["condition"] ?? "")
  }));

  const dedupRaw = raw["dedup"];
  const dedup = dedupRaw && typeof dedupRaw === "object" ? {
    columns: stringList(dedupRaw["columns"]),
    keep: dedupRaw["keep"] === "last" ? "last" : "first"
  } : null;

  const mappings = entriesOf("mappings").map((m) => ({
    source: String(m["source"] ?? ""),
    target: m["target"],
    transform: m["transform"] ?? null
  }));

  return makeConfig({
    source: raw["source"] != null ? String(raw["source"]) : null,
    output: raw["output"] != null ? String(raw["output"]) : null,
    transforms,
    splits,
    renames,
    drop,
    filters,
    dedup,
    mappings
  });
}
|
|
2514
|
+
|
|
2515
|
+
// src/core/config/loader.ts
|
|
2516
|
+
// Cached result of loading the optional "yaml" package; stays null until a
// load succeeds.
var yamlModule = null;
/**
 * Return the "yaml" module, loading it on first use.
 *
 * A failed load is not an error here: the function returns null and tries
 * again on the next call. Callers decide how to report a missing package.
 *
 * @returns {*} The loaded module, or null when it could not be loaded.
 */
function getYaml() {
  if (!yamlModule) {
    try {
      yamlModule = __require("yaml");
    } catch {
      // Optional dependency unavailable; leave the cache empty.
    }
  }
  return yamlModule;
}
|
|
2525
|
+
/**
 * Parse YAML text into a validated config.
 *
 * Empty documents (parsed as null/undefined) yield the default config from
 * `makeConfig()`. Any parsed value that is not a plain object is rejected.
 *
 * @param {string} content - YAML source text.
 * @returns {*} The config produced by `makeConfig` or `validateConfig`.
 * @throws {Error} When the yaml package is unavailable or the document root
 *   is not an object.
 */
function loadConfigFromString(content) {
  const yaml = getYaml();
  if (!yaml) {
    throw new Error("yaml package is required for config loading. Install with: npm install yaml");
  }
  const parsed = yaml.parse(content);
  if (parsed == null) return makeConfig();

  const isList = Array.isArray(parsed);
  if (isList || typeof parsed !== "object") {
    const got = isList ? "array" : typeof parsed;
    throw new Error(`Config file is not a valid YAML object (got ${got})`);
  }
  return validateConfig(parsed);
}
|
|
2537
|
+
/**
 * Serialise a config to YAML, omitting empty sections.
 *
 * A section is written only when it is truthy (`source`, `output`, `dedup`),
 * a non-empty list, or (for `renames`) an object with at least one key.
 *
 * @param {Object} config - Config with the fields produced by `makeConfig`.
 * @returns {string} YAML text from `yaml.stringify`.
 * @throws {Error} When the yaml package is unavailable.
 */
function saveConfigToString(config) {
  const yaml = getYaml();
  if (!yaml) {
    throw new Error("yaml package is required for config saving. Install with: npm install yaml");
  }
  const data = {};
  // Copy config[key] into the output when `keep` is truthy.
  const include = (key, keep) => {
    if (keep) data[key] = config[key];
  };
  include("source", config.source);
  include("output", config.output);
  include("transforms", config.transforms.length > 0);
  include("splits", config.splits.length > 0);
  include("renames", Object.keys(config.renames).length > 0);
  include("drop", config.drop.length > 0);
  include("filters", config.filters.length > 0);
  include("dedup", config.dedup);
  include("mappings", config.mappings.length > 0);
  return yaml.stringify(data);
}
|
|
2554
|
+
|
|
2555
|
+
// src/cli.ts
|
|
2556
|
+
init_types();
|
|
2557
|
+
|
|
2558
|
+
// src/core/reporters/json-reporter.ts
|
|
2559
|
+
init_types();
|
|
2560
|
+
/**
 * Render a manifest as pretty-printed JSON (two-space indent).
 *
 * A `MutableManifest` instance is converted with `toDict()` first; any other
 * value is serialised as-is.
 *
 * @param {*} manifest - Manifest instance or plain manifest data.
 * @returns {string} JSON text.
 */
function manifestToJson(manifest) {
  const payload = manifest instanceof MutableManifest ? manifest.toDict() : manifest;
  return JSON.stringify(payload, null, 2);
}
|
|
2566
|
+
|
|
2567
|
+
// src/core/mapping/schema-mapper.ts
|
|
2568
|
+
init_types();
|
|
2569
|
+
|
|
2570
|
+
// src/core/mapping/name-similarity.ts
|
|
2571
|
+
// Canonical column names mapped to the alternative spellings treated as
// equivalent when matching column names.
var ALIASES = {
  first_name: ["fname", "first", "given_name", "first_nm"],
  last_name: ["lname", "last", "surname", "family_name", "last_nm"],
  email: ["email_address", "e_mail", "email_addr", "mail"],
  phone: ["phone_number", "ph", "telephone", "tel", "mobile", "cell"],
  address: ["addr", "street_address", "addr_line_1", "address_line_1"],
  city: ["town", "municipality"],
  state: ["st", "province", "region"],
  zip: ["zipcode", "zip_code", "postal_code", "postal"],
  name: ["full_name", "fullname", "customer_name"],
  created_at: ["signup_date", "signup_dt", "create_date", "date_created"]
};
// Reverse index: lower-cased alias or canonical name -> lower-cased canonical name.
var _ALIAS_LOOKUP = /* @__PURE__ */ new Map();
Object.keys(ALIASES).forEach((canonical) => {
  const canonicalKey = canonical.toLowerCase();
  ALIASES[canonical].forEach((alias) => {
    _ALIAS_LOOKUP.set(alias.toLowerCase(), canonicalKey);
  });
  // A canonical name also resolves to itself.
  _ALIAS_LOOKUP.set(canonicalKey, canonicalKey);
});
|
|
2590
|
+
/**
 * Score the similarity of two strings on a 0-100 scale.
 *
 * The score is `100 * (1 - editDistance / longerLength)`, where the edit
 * distance counts single-unit insertions, deletions and substitutions.
 * Identical strings score 100; if either string is empty the score is 0.
 *
 * @param {string} a - First string.
 * @param {string} b - Second string.
 * @returns {number} Similarity between 0 and 100.
 */
function fuzzyWRatio(a, b) {
  if (a === b) return 100;
  const lenA = a.length;
  const lenB = b.length;
  if (lenA === 0 || lenB === 0) return 0;

  // Two rolling rows of the distance table; `previous` starts as row 0.
  let previous = Array.from({ length: lenB + 1 }, (_, j) => j);
  let current = new Array(lenB + 1);

  for (let i = 1; i <= lenA; i++) {
    current[0] = i;
    for (let j = 1; j <= lenB; j++) {
      const substitution = previous[j - 1] + (a[i - 1] === b[j - 1] ? 0 : 1);
      current[j] = Math.min(previous[j] + 1, current[j - 1] + 1, substitution);
    }
    // Swap rows; every cell of the stale row is overwritten next iteration.
    [previous, current] = [current, previous];
  }

  const distance = previous[lenB];
  return 100 * (1 - distance / Math.max(lenA, lenB));
}
|
|
2608
|
+
/**
 * Score how alike two column names are, from 0 to 1.
 *
 * Names are lower-cased and trimmed first. Equal names score 1; names that
 * resolve to the same canonical entry in `_ALIAS_LOOKUP` score 0.95;
 * otherwise the score is `fuzzyWRatio` scaled to 0-1.
 *
 * @param {string} source - Source column name.
 * @param {string} target - Target column name.
 * @returns {number} Similarity score.
 */
function nameSimilarity(source, target) {
  const normalize = (name) => name.toLowerCase().trim();
  const left = normalize(source);
  const right = normalize(target);
  if (left === right) return 1;

  const leftCanonical = _ALIAS_LOOKUP.get(left);
  const rightCanonical = _ALIAS_LOOKUP.get(right);
  const sameCanonical = Boolean(leftCanonical) && leftCanonical === rightCanonical;

  return sameCanonical ? 0.95 : fuzzyWRatio(left, right) / 100;
}
|
|
2617
|
+
|
|
2618
|
+
// src/core/mapping/profile-similarity.ts
|
|
2619
|
+
/**
 * Score how alike two column profiles are, from 0 to 1.
 *
 * Four weighted terms are summed and divided by the total weight:
 * matching `inferredType` (0.4), closeness of `nullPct` (0.2), closeness of
 * `uniquePct` (0.2), and the smaller-to-larger ratio of `uniqueCount` (0.2,
 * contributing nothing unless both counts are positive).
 *
 * @param {Object} source - Source column profile.
 * @param {Object} target - Target column profile.
 * @returns {number} Similarity score.
 */
function profileSimilarity(source, target) {
  // 1 when the two values are equal, falling linearly to 0 at a gap of 1 or more.
  const closeness = (x, y) => Math.max(0, 1 - Math.abs(x - y));

  const typeTerm = source.inferredType === target.inferredType ? 0.4 : 0;
  const nullTerm = 0.2 * closeness(source.nullPct, target.nullPct);
  const uniqueTerm = 0.2 * closeness(source.uniquePct, target.uniquePct);

  let countTerm = 0;
  if (source.uniqueCount > 0 && target.uniqueCount > 0) {
    const smaller = Math.min(source.uniqueCount, target.uniqueCount);
    const larger = Math.max(source.uniqueCount, target.uniqueCount);
    countTerm = 0.2 * (smaller / larger);
  }

  // Summed in the same left-to-right order as the weights so that a perfect
  // match divides to exactly 1.
  const score = typeTerm + nullTerm + uniqueTerm + countTerm;
  const weights = 0.4 + 0.2 + 0.2 + 0.2;
  return weights > 0 ? score / weights : 0;
}
|
|
2637
|
+
|
|
2638
|
+
// src/core/mapping/schema-mapper.ts
|
|
2639
|
+
/**
 * Proposes source-to-target column mappings from name and profile similarity.
 */
var SchemaMapper = class {
  autoThreshold;
  suggestThreshold;
  /**
   * @param {number} [autoThreshold=0.9] - Stored on the instance; not read by `map`.
   * @param {number} [suggestThreshold=0.6] - Minimum combined score for a mapping to be proposed.
   */
  constructor(autoThreshold = 0.9, suggestThreshold = 0.6) {
    this.autoThreshold = autoThreshold;
    this.suggestThreshold = suggestThreshold;
  }
  /**
   * Match each source column to its best still-unclaimed target column.
   *
   * Columns are read from the first row of each input. Source columns are
   * processed in order; each takes the target with the highest combined score
   * (0.7 * name similarity + 0.3 * profile similarity) that reaches
   * `suggestThreshold`, and a claimed target is not offered again.
   *
   * @param {Array<Object>} sourceRows - Source data rows.
   * @param {Array<Object>} targetRows - Target data rows.
   * @returns {Array<{source: string, target: string, confidence: number, transform: null}>}
   *   Proposed mappings, confidence rounded to three decimals.
   */
  map(sourceRows, targetRows) {
    const indexByName = (profile) => new Map(profile.columns.map((column) => [column.name, column]));
    const columnNames = (rows) => (rows.length > 0 ? Object.keys(rows[0]) : []);

    const sourceProfiles = indexByName(profileDataframe(sourceRows));
    const targetProfiles = indexByName(profileDataframe(targetRows));
    const sourceCols = columnNames(sourceRows);
    const targetCols = columnNames(targetRows);

    const claimed = /* @__PURE__ */ new Set();
    const mappings = [];

    for (const sourceName of sourceCols) {
      const sourceProfile = sourceProfiles.get(sourceName);
      let winner = null;
      let winningScore = 0;

      for (const targetName of targetCols.filter((name) => !claimed.has(name))) {
        const nameScore = nameSimilarity(sourceName, targetName);
        const targetProfile = targetProfiles.get(targetName);
        const profileScore = sourceProfile && targetProfile
          ? profileSimilarity(sourceProfile, targetProfile)
          : 0;
        const combined = 0.7 * nameScore + 0.3 * profileScore;

        const improves = combined > winningScore && combined >= this.suggestThreshold;
        if (!improves) continue;

        winningScore = combined;
        winner = {
          source: sourceName,
          target: targetName,
          confidence: Math.round(combined * 1e3) / 1e3,
          transform: null
        };
      }

      if (winner) {
        mappings.push(winner);
        claimed.add(winner.target);
      }
    }
    return mappings;
  }
  /**
   * Wrap mappings in a config via `makeConfig`, keeping only the
   * `source`, `target` and `transform` fields of each mapping.
   *
   * @param {Array<Object>} mappings - Mappings as returned by `map`.
   * @returns {*} The value returned by `makeConfig`.
   */
  toConfig(mappings) {
    const trimmed = mappings.map(({ source, target, transform }) => ({ source, target, transform }));
    return makeConfig({ mappings: trimmed });
  }
};
|
|
2695
|
+
|
|
2696
|
+
// src/core/engine/streaming.ts
|
|
2697
|
+
/**
 * Runs a `TransformEngine` over records one at a time, in batches, or in
 * fixed-size chunks, and counts the batches it has processed.
 */
var StreamProcessor = class {
  engine;
  _batchCount = 0;
  /**
   * @param {*} config - Config handed to the `TransformEngine` constructor.
   */
  constructor(config) {
    this.engine = new TransformEngine(config);
  }
  /** Transform a single record. Does not count as a batch. */
  transformOne(record) {
    return this.engine.transformDf([record]);
  }
  /** Transform a batch of rows and count it. */
  transformBatch(rows) {
    this._batchCount++;
    return this.engine.transformDf(rows);
  }
  /**
   * Process rows in chunks, yielding the engine result for each chunk.
   *
   * @param {Array<*>} rows - Rows to process.
   * @param {number} [chunkSize=1e4] - Rows per chunk; must be greater than zero.
   * @throws {RangeError} On the first iteration step when `chunkSize` is not
   *   a positive number.
   */
  *streamRows(rows, chunkSize = 1e4) {
    // With a zero or negative step `start` never passes rows.length and the
    // loop below would yield forever; NaN would yield a single empty batch.
    if (!(chunkSize > 0)) {
      throw new RangeError(`chunkSize must be a positive number (got ${chunkSize})`);
    }
    for (let start = 0; start < rows.length; start += chunkSize) {
      const batch = rows.slice(start, start + chunkSize);
      this._batchCount++;
      yield this.engine.transformDf(batch);
    }
  }
  /** Number of batches processed by `transformBatch` and `streamRows`. */
  get batchesProcessed() {
    return this._batchCount;
  }
};
|
|
2724
|
+
/**
 * Location of the run-history directory: `.goldenflow/history` under the
 * current user's home directory.
 *
 * @returns {string} Directory path.
 */
function historyDir() {
  const home = homedir();
  return join(home, ".goldenflow", "history");
}
|
|
2727
|
+
/**
 * Load the most recently modified run records from the history directory.
 *
 * Only `.json` files are considered. They are ordered newest first by
 * modification time and cut to `limit` before being read, so a file that
 * fails to read or parse is skipped with a warning and reduces the number of
 * results returned.
 *
 * @param {number} [limit=20] - Maximum number of files to read.
 * @returns {Array<*>} Parsed records, newest first; empty when the directory
 *   does not exist.
 */
function listRuns(limit = 20) {
  const dir = historyDir();
  if (!existsSync(dir)) return [];

  const jsonNames = readdirSync(dir).filter((name) => name.endsWith(".json"));
  const stamped = jsonNames.map((name) => ({ name, mtime: statSync(join(dir, name)).mtimeMs }));
  stamped.sort((a, b) => b.mtime - a.mtime);
  const newest = stamped.slice(0, limit);

  return newest.flatMap(({ name }) => {
    try {
      const content = readFileSync(join(dir, name), "utf-8");
      // Wrapped so that a record which is itself an array stays one entry.
      return [JSON.parse(content)];
    } catch (e) {
      console.warn(`[goldenflow:history] Skipping corrupt history file ${name}: ${e instanceof Error ? e.message : String(e)}`);
      return [];
    }
  });
}
|
|
2742
|
+
|
|
2743
|
+
// src/cli.ts
|
|
2744
|
+
// Version string reported by `--version`.
var VERSION = "0.1.0";
// Root CLI command; subcommands are registered on it below.
var program = new Command();
program.name("goldenflow-js");
program.description("GoldenFlow: data transformation toolkit (TypeScript)");
program.version(VERSION);
|
|
2746
|
+
// `transform <file>`: run the transform engine over a data file, print a
// summary (or the manifest as JSON), and optionally write the transformed
// rows plus manifest into --output-dir.
program
  .command("transform <file>")
  .description("Transform a data file (zero-config or config-driven)")
  .option("-c, --config <path>", "YAML config file")
  .option("-o, --output-dir <dir>", "Output directory")
  .option("--domain <name>", "Domain pack to use")
  .option("--strict", "Fail if any transform errors occur")
  .option("--json", "Output manifest as JSON")
  .action(async (file, opts) => {
    const rows = readFile(file);
    let cfg = makeConfig();
    if (opts.config) {
      const { readFileSync: readFileSync3 } = await import('fs');
      const content = readFileSync3(opts.config, "utf-8");
      cfg = loadConfigFromString(content);
    }
    if (opts.domain) {
      const { loadDomain: loadDomain2 } = await Promise.resolve().then(() => (init_domains(), domains_exports));
      const pack = await loadDomain2(opts.domain);
      // When the pack is found, its default config replaces any config loaded above.
      if (pack) cfg = pack.defaultConfig;
    }
    const engine = new TransformEngine(cfg);
    const result = engine.transformDf(rows, file);

    // Commander camel-cases multi-word flags: `--output-dir` is stored as
    // `outputDir`, so the bracketed "output-dir" key is always undefined.
    const outputDir = opts.outputDir;
    if (outputDir) {
      const { basename, extname: ext_, join: join2 } = await import('path');
      const { writeFileSync: writeFileSync3, mkdirSync: mkdirSync3 } = await import('fs');
      mkdirSync3(outputDir, { recursive: true });
      const ext = ext_(file);
      const stem = basename(file, ext);
      writeFile(result.rows, join2(outputDir, `${stem}_transformed${ext}`));
      writeFileSync3(join2(outputDir, `${stem}_manifest.json`), manifestToJson(result.manifest));
    }

    if (opts.json) {
      console.log(manifestToJson(result.manifest));
    } else {
      const m = result.manifest;
      console.log(`Transforms: ${m.records.length} | Errors: ${m.errors.length} | Rows: ${result.rows.length}`);
      // Only the first ten records are listed.
      for (const r of m.records.slice(0, 10)) {
        console.log(` ${r.column}/${r.transform}: ${r.affectedRows}/${r.totalRows} affected`);
      }
    }

    if (opts.strict && result.manifest.errors.length > 0) {
      console.error(`Strict mode: ${result.manifest.errors.length} transform errors`);
      process.exit(1);
    }
  });
|
|
2785
|
+
// `validate <file>`: run the transform engine and list, per record, how many
// rows each transform affected. Nothing is written to disk by this handler.
program
  .command("validate <file>")
  .description("Dry-run: show what would change")
  .option("-c, --config <path>", "YAML config file")
  .action(async (file, opts) => {
    const rows = readFile(file);
    let config = makeConfig();
    const configPath = opts.config;
    if (configPath) {
      const fs = await import('fs');
      config = loadConfigFromString(fs.readFileSync(configPath, "utf-8"));
    }
    const result = new TransformEngine(config).transformDf(rows, file);
    console.log("Dry run \u2014 would change:");
    for (const record of result.manifest.records) {
      const { column, transform, affectedRows } = record;
      console.log(` ${column}/${transform}: ${affectedRows} rows`);
    }
  });
|
|
2799
|
+
// `profile <file>`: print row/column counts followed by one line per column
// with its inferred type, null count (and percentage) and unique count.
program
  .command("profile <file>")
  .description("Show column profiles for a data file")
  .action((file) => {
    const summary = profileDataframe(readFile(file), file);
    console.log(`${summary.rowCount} rows, ${summary.columnCount} columns\n`);
    for (const column of summary.columns) {
      // nullPct is a 0-1 fraction; shown as a whole-number percentage.
      const nullPercent = (column.nullPct * 100).toFixed(0);
      console.log(` ${column.name}: ${column.inferredType} | nulls: ${column.nullCount} (${nullPercent}%) | unique: ${column.uniqueCount}`);
    }
  });
|
|
2809
|
+
// `learn <file>`: derive a config from the data with learnConfig and save it
// as YAML to --output (default "goldenflow.yaml").
program
  .command("learn <file>")
  .description("Generate a YAML config from data patterns")
  .option("-o, --output <path>", "Output config path", "goldenflow.yaml")
  .action(async (file, opts) => {
    const learned = learnConfig(readFile(file), file);
    const serialized = saveConfigToString(learned);
    const fs = await import('fs');
    fs.writeFileSync(opts.output, serialized);
    console.log(`Config saved to ${opts.output}`);
  });
|
|
2817
|
+
// `diff <before> <after>`: print the change count, the row counts, and any
// added, removed or changed columns reported by diffDataframes.
program
  .command("diff <before> <after>")
  .description("Compare pre/post transform files")
  .action((before, after) => {
    const summary = diffDataframes(readFile(before), readFile(after));
    console.log(`Changes: ${summary.totalChanges}`);
    console.log(`Rows: ${summary.rowCountBefore} \u2192 ${summary.rowCountAfter}`);
    // Each column list is printed only when it is non-empty.
    const sections = [
      ["Added", summary.addedColumns],
      ["Removed", summary.removedColumns],
      ["Changed", summary.changedColumns]
    ];
    for (const [label, columns] of sections) {
      if (columns.length) console.log(`${label}: ${columns.join(", ")}`);
    }
  });
|
|
2827
|
+
// `map`: propose column mappings between a source and a target file, print
// each with its confidence tier, and optionally save them as a YAML config.
program
  .command("map")
  .description("Auto-map schemas between source and target")
  .requiredOption("-s, --source <file>", "Source data file")
  .requiredOption("-t, --target <file>", "Target data file")
  .option("-o, --output <path>", "Save mapping config")
  .action(async (opts) => {
    const sourceRows = readFile(opts.source);
    const targetRows = readFile(opts.target);
    const mapper = new SchemaMapper();
    const mappings = mapper.map(sourceRows, targetRows);

    mappings.forEach((mapping) => {
      // Tier labels: 0.9 and above is "auto", 0.6 and above is "suggest".
      let tier = "skip";
      if (mapping.confidence >= 0.9) {
        tier = "auto";
      } else if (mapping.confidence >= 0.6) {
        tier = "suggest";
      }
      console.log(` ${mapping.source} \u2192 ${mapping.target} (${mapping.confidence.toFixed(2)}) [${tier}]`);
    });

    if (!opts.output) return;
    const serialized = saveConfigToString(mapper.toConfig(mappings));
    const fs = await import('fs');
    fs.writeFileSync(opts.output, serialized);
    console.log(`\nMapping saved to ${opts.output}`);
  });
|
|
2845
|
+
// `stream <file>`: run the rows through StreamProcessor in chunks of
// --chunk-size and report the size of each batch plus a final total.
program
  .command("stream <file>")
  .description("Stream-process a large file in chunks")
  .option("--chunk-size <n>", "Rows per batch", "10000")
  .option("-c, --config <path>", "YAML config file")
  .action(async (file, opts) => {
    // Commander camel-cases multi-word flags: `--chunk-size` is stored as
    // `chunkSize`. Reading the bracketed "chunk-size" key always fell back to
    // the default, so the flag had no effect.
    const chunkSize = Number.parseInt(opts.chunkSize ?? "10000", 10);
    if (!Number.isInteger(chunkSize) || chunkSize <= 0) {
      // A zero or negative chunk size would make the chunking loop spin forever.
      console.error(`Invalid --chunk-size: ${opts.chunkSize} (expected a positive integer)`);
      process.exit(1);
    }

    const rows = readFile(file);
    let cfg = makeConfig();
    if (opts.config) {
      const { readFileSync: readFileSync3 } = await import('fs');
      cfg = loadConfigFromString(readFileSync3(opts.config, "utf-8"));
    }

    const processor = new StreamProcessor(cfg);
    let totalRows = 0;
    for (const result of processor.streamRows(rows, chunkSize)) {
      totalRows += result.rows.length;
      console.log(`Batch ${processor.batchesProcessed}: ${result.rows.length} rows`);
    }
    console.log(`Streamed ${processor.batchesProcessed} batches, ${totalRows} rows total`);
  });
|
|
2861
|
+
// `history`: print one line per recorded transform run, newest first.
program
  .command("history")
  .description("Show recent transform runs")
  .option("-n, --limit <n>", "Number of runs", "20")
  .action((opts) => {
    const limit = Number.parseInt(opts.limit, 10);
    if (!Number.isInteger(limit) || limit <= 0) {
      // A non-numeric limit used to yield an empty list and the misleading
      // "No transform history yet." message.
      console.error(`Invalid --limit: ${opts.limit} (expected a positive integer)`);
      process.exitCode = 1;
      return;
    }
    try {
      const runs = listRuns(limit);
      if (runs.length === 0) {
        console.log("No transform history yet.");
        return;
      }
      for (const r of runs) {
        // History files are plain JSON; skip anything that is not a record.
        if (r === null || typeof r !== "object") continue;
        // Tolerate records without a timestamp instead of aborting the listing.
        const timestamp = String(r.timestamp ?? "").slice(0, 19);
        console.log(
          ` ${r.runId} ${r.source} rows=${r.rows} transforms=${r.transformsApplied} errors=${r.errors} ${timestamp}`
        );
      }
    } catch (err) {
      // Report the real failure rather than claiming there is no history.
      console.error(`Failed to read transform history: ${err instanceof Error ? err.message : String(err)}`);
      process.exitCode = 1;
    }
  });
|
|
2877
|
+
// `demo`: write a small messy sample dataset and a matching config into
// --output-dir (default ".") so the tool can be tried out.
program
  .command("demo")
  .description("Generate sample data for trying GoldenFlow")
  .option("-o, --output-dir <dir>", "Output directory", ".")
  .action(async (opts) => {
    // Commander camel-cases multi-word flags: `--output-dir` (and its "."
    // default) is stored as `outputDir`. The bracketed "output-dir" key is
    // undefined, which made mkdirSync throw on every run.
    const dir = opts.outputDir;
    const { writeFileSync: writeFileSync3, mkdirSync: mkdirSync3 } = await import('fs');
    const { join: join2 } = await import('path');
    mkdirSync3(dir, { recursive: true });

    const demoData = [
      { name: " John Smith ", email: "JOHN@EXAMPLE.COM", phone: "(555) 123-4567", state: "Pennsylvania", signup_date: "03/15/2024", price: "$1,234.56", status: "active" },
      { name: "DR. JANE DOE", email: " jane@test.com ", phone: "555.987.6543", state: "CA", signup_date: "2024-01-20", price: "$99.99", status: "ACTIVE" },
      { name: "mcdonald, robert", email: "bob@test.com", phone: "+1-555-456-7890", state: "new york", signup_date: "Jan 5, 2023", price: "$0.50", status: "actve" },
      { name: "Mary O'Brien", email: "mary@sample.com", phone: "5554567890", state: "IL", signup_date: "12/25/2022", price: "$5,000.00", status: "inactive" }
    ];
    const dataPath = join2(dir, "demo_data.csv");
    const configPath = join2(dir, "demo_config.yaml");
    writeFile(demoData, dataPath);

    // Built line by line so the YAML nesting does not depend on source indentation.
    const demoConfig = [
      "# GoldenFlow Demo Config",
      "transforms:",
      "  - column: name",
      "    ops: [strip, title_case]",
      "  - column: email",
      "    ops: [lowercase, strip]",
      "  - column: phone",
      "    ops: [phone_e164]",
      "  - column: state",
      "    ops: [state_abbreviate]",
      "  - column: signup_date",
      "    ops: [date_iso8601]",
      "  - column: price",
      "    ops: [currency_strip]",
      ""
    ].join("\n");
    writeFileSync3(configPath, demoConfig);

    console.log("Demo files created:");
    console.log(` Data: ${dataPath}`);
    console.log(` Config: ${configPath}`);
  });
|
|
2911
|
+
// Most command handlers above are async, so the CLI must be parsed with
// parseAsync: it waits for the handler and surfaces a failure here instead of
// as an unhandled promise rejection.
program.parseAsync().catch((err) => {
  console.error(err instanceof Error ? err.message : String(err));
  process.exit(1);
});
|
|
2912
|
+
//# sourceMappingURL=cli.js.map
|
|
2913
|
+
//# sourceMappingURL=cli.js.map
|