goldenflow 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.cjs +2915 -0
- package/dist/cli.cjs.map +1 -0
- package/dist/cli.d.cts +1 -0
- package/dist/cli.d.ts +1 -0
- package/dist/cli.js +2913 -0
- package/dist/cli.js.map +1 -0
- package/dist/core/index.cjs +2980 -0
- package/dist/core/index.cjs.map +1 -0
- package/dist/core/index.d.cts +360 -0
- package/dist/core/index.d.ts +360 -0
- package/dist/core/index.js +2941 -0
- package/dist/core/index.js.map +1 -0
- package/dist/index.cjs +2980 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +1 -0
- package/dist/index.d.ts +1 -0
- package/dist/index.js +2941 -0
- package/dist/index.js.map +1 -0
- package/dist/node/index.cjs +3588 -0
- package/dist/node/index.cjs.map +1 -0
- package/dist/node/index.d.cts +57 -0
- package/dist/node/index.d.ts +57 -0
- package/dist/node/index.js +3536 -0
- package/dist/node/index.js.map +1 -0
- package/package.json +83 -0
|
@@ -0,0 +1,2941 @@
|
|
|
1
|
+
var __defProp = Object.defineProperty;
|
|
2
|
+
var __getOwnPropNames = Object.getOwnPropertyNames;
|
|
3
|
+
var __require = /* @__PURE__ */ ((x) => typeof require !== "undefined" ? require : typeof Proxy !== "undefined" ? new Proxy(x, {
|
|
4
|
+
get: (a, b) => (typeof require !== "undefined" ? require : a)[b]
|
|
5
|
+
}) : x)(function(x) {
|
|
6
|
+
if (typeof require !== "undefined") return require.apply(this, arguments);
|
|
7
|
+
throw Error('Dynamic require of "' + x + '" is not supported');
|
|
8
|
+
});
|
|
9
|
+
/**
 * Wraps a single-key module object in a run-once initialiser.
 *
 * @param fn  Object whose first own property is the module body function.
 * @param res Cached result slot (normally left undefined by callers).
 * @returns   `__init`: runs the module body on first call and returns its
 *            result; later calls return the cached result without re-running.
 */
var __esm = (fn, res) => function __init() {
  if (fn) {
    const body = fn[__getOwnPropNames(fn)[0]];
    // Clear `fn` before running so the body is never entered twice.
    fn = 0;
    res = body(0);
  }
  return res;
};
|
|
12
|
+
/**
 * Defines an enumerable getter on `target` for every enumerable key of `all`.
 *
 * @param target Object receiving the getters.
 * @param all    Map of export name -> zero-argument getter function.
 */
var __export = (target, all) => {
  for (const key in all) {
    const descriptor = { get: all[key], enumerable: true };
    __defProp(target, key, descriptor);
  }
};
|
|
16
|
+
|
|
17
|
+
// src/core/types.ts
|
|
18
|
+
/**
 * Builds a transform record, defaulting both sample arrays to empty.
 *
 * @param input Fields of the record; any key here overrides the defaults.
 * @returns A new object combining the defaults with `input`.
 */
function makeTransformRecord(input) {
  const defaults = { sampleBefore: [], sampleAfter: [] };
  return { ...defaults, ...input };
}
|
|
25
|
+
/**
 * Creates an empty manifest for the given source.
 *
 * @param source Value stored as the manifest's `source`.
 * @returns A new `MutableManifest`.
 */
function makeManifest(source) {
  const manifest = new MutableManifest(source);
  return manifest;
}
|
|
28
|
+
/**
 * Builds a column profile with `sampleValues` defaulting to `[]` and
 * `detectedFormat` defaulting to `null`.
 *
 * @param input Profile fields; any key here overrides the defaults.
 * @returns A new object combining the defaults with `input`.
 */
function makeColumnProfile(input) {
  const defaults = { sampleValues: [], detectedFormat: null };
  return { ...defaults, ...input };
}
|
|
35
|
+
/**
 * Builds a config object with every field present.
 *
 * Defaults: `source`, `output` and `dedup` are `null`; `renames` is `{}`;
 * `transforms`, `splits`, `drop`, `filters` and `mappings` are `[]`.
 *
 * @param input Partial config; any key here overrides the defaults.
 * @returns A new object combining the defaults with `input`.
 */
function makeConfig(input) {
  const defaults = {
    source: null,
    output: null,
    transforms: [],
    splits: [],
    renames: {},
    drop: [],
    filters: [],
    dedup: null,
    mappings: []
  };
  return { ...defaults, ...input };
}
|
|
49
|
+
var MutableManifest;
var init_types = __esm({
  "src/core/types.ts"() {
    /**
     * Accumulates transform records and per-row errors for one source,
     * and serialises them to a snake_case dictionary.
     */
    MutableManifest = class {
      source;
      records = [];
      errors = [];
      createdAt;
      constructor(source) {
        this.source = source;
        // Creation time as an ISO-8601 string.
        this.createdAt = (/* @__PURE__ */ new Date()).toISOString();
      }
      /** Appends one transform record. */
      addRecord(record) {
        this.records.push(record);
      }
      /** Appends one error entry built from its four fields. */
      addError(column, transform, row, error) {
        this.errors.push({ column, transform, row, error });
      }
      /**
       * Serialises the manifest.
       *
       * @returns `{ source, created_at, records, errors, summary }`, where
       *          `summary.columns_affected` lists each record column once,
       *          in first-seen order.
       */
      toDict() {
        const serializeRecord = (rec) => ({
          column: rec.column,
          transform: rec.transform,
          affected_rows: rec.affectedRows,
          total_rows: rec.totalRows,
          sample_before: rec.sampleBefore,
          sample_after: rec.sampleAfter
        });
        const serializeError = (err) => ({
          column: err.column,
          transform: err.transform,
          row: err.row,
          error: err.error
        });
        const touchedColumns = new Set(this.records.map((rec) => rec.column));
        return {
          source: this.source,
          created_at: this.createdAt,
          records: this.records.map((rec) => serializeRecord(rec)),
          errors: this.errors.map((err) => serializeError(err)),
          summary: {
            total_transforms: this.records.length,
            total_errors: this.errors.length,
            columns_affected: Array.from(touchedColumns)
          }
        };
      }
    };
  }
});
|
|
95
|
+
|
|
96
|
+
// src/core/transforms/registry.ts
|
|
97
|
+
/**
 * Stores a transform in the registry under `opts.name`, replacing any
 * existing entry with that name.
 *
 * @param opts `{ name, inputTypes, autoApply?, priority?, mode? }`; the
 *             optional fields default to `false`, `50` and `"series"` when
 *             null or undefined.
 * @param func The transform implementation.
 */
function registerTransform(opts, func) {
  const entry = {
    name: opts.name,
    func,
    inputTypes: opts.inputTypes,
    autoApply: opts.autoApply ?? false,
    priority: opts.priority ?? 50,
    mode: opts.mode ?? "series"
  };
  _REGISTRY.set(entry.name, entry);
}
|
|
107
|
+
/**
 * Looks up a registered transform by name.
 *
 * @param name Registry key.
 * @returns The registry entry, or `undefined` when no such name exists.
 */
function getTransform(name) {
  const entry = _REGISTRY.get(name);
  return entry;
}
|
|
110
|
+
/**
 * Lists every registered transform, highest `priority` first.
 *
 * @returns A new array; the registry itself is not reordered.
 */
function listTransforms() {
  const entries = Array.from(_REGISTRY.values());
  entries.sort((left, right) => right.priority - left.priority);
  return entries;
}
|
|
113
|
+
/**
 * Splits a `name:arg1:arg2` transform spec on `:`.
 *
 * @param raw Spec string.
 * @returns `[name, args]`, where `args` is the (possibly empty) array of
 *          segments after the first.
 */
function parseTransformName(raw) {
  const [name, ...args] = raw.split(":");
  return [name, args];
}
|
|
117
|
+
/**
 * Exposes the live registry map (not a copy).
 *
 * @returns The `Map` of transform name -> registry entry.
 */
function registry() {
  const liveRegistry = _REGISTRY;
  return liveRegistry;
}
|
|
120
|
+
// Transform registry storage; assigned when init_registry() first runs.
var _REGISTRY;
var init_registry = __esm({
  // Module body: allocates the empty registry map.
  "src/core/transforms/registry.ts"() {
    _REGISTRY = /* @__PURE__ */ new Map();
  }
});
|
|
126
|
+
|
|
127
|
+
// src/core/domains/people-hr.ts
|
|
128
|
+
var people_hr_exports = {};
|
|
129
|
+
__export(people_hr_exports, {
|
|
130
|
+
PACK: () => PACK
|
|
131
|
+
});
|
|
132
|
+
/**
 * Checks each string against `SSN_RE` (after trimming) and rejects all-zero
 * groups.
 *
 * @param values Column values.
 * @returns One entry per input: `true`/`false` for strings; non-string
 *          values (including null) are passed through unchanged.
 */
function ssnValidate(values) {
  const check = (text) => {
    const parts = SSN_RE.exec(text.trim());
    if (parts === null) return false;
    const [, area, group, serial] = parts;
    return area !== "000" && group !== "00" && serial !== "0000";
  };
  return values.map((v) => (typeof v === "string" ? check(v) : v));
}
|
|
141
|
+
var SSN_RE, PACK;
var init_people_hr = __esm({
  // Module body for the People/HR domain pack: registers `ssn_validate`
  // and builds the pack descriptor.
  "src/core/domains/people-hr.ts"() {
    init_types();
    init_registry();
    // 3-2-4 digit groups, each hyphen independently optional.
    SSN_RE = /^(\d{3})-?(\d{2})-?(\d{4})$/;
    registerTransform(
      { name: "ssn_validate", inputTypes: ["ssn", "string"], priority: 55, mode: "series" },
      ssnValidate
    );
    // Transform names advertised by this pack.
    const packTransforms = [
      "split_name",
      "split_name_reverse",
      "strip_titles",
      "strip_suffixes",
      "name_proper",
      "ssn_mask",
      "ssn_validate",
      "date_iso8601",
      "gender_standardize",
      "boolean_normalize"
    ];
    // Default per-column operations.
    const columnOps = [
      { column: "name", ops: ["strip", "strip_titles", "title_case"] },
      { column: "ssn", ops: ["ssn_validate"] },
      { column: "gender", ops: ["gender_standardize"] },
      { column: "hire_date", ops: ["date_iso8601"] },
      { column: "active", ops: ["boolean_normalize"] }
    ];
    PACK = {
      name: "people_hr",
      description: "Name parsing, SSN formatting, employment dates, gender/boolean standardization",
      transforms: packTransforms,
      defaultConfig: makeConfig({ transforms: columnOps })
    };
  }
});
|
|
178
|
+
|
|
179
|
+
// src/core/domains/healthcare.ts
|
|
180
|
+
var healthcare_exports = {};
|
|
181
|
+
__export(healthcare_exports, {
|
|
182
|
+
PACK: () => PACK2
|
|
183
|
+
});
|
|
184
|
+
/**
 * Validates 10-digit identifiers with a Luhn checksum computed over the
 * digits prefixed with "80840". Non-digit characters are stripped first.
 *
 * @param values Column values.
 * @returns One entry per input: `true`/`false` for strings; non-string
 *          values (including null) are passed through unchanged.
 */
function npiValidate(values) {
  // Luhn: walking right to left, double every second digit (folding
  // two-digit products back to one digit) and require sum % 10 === 0.
  const passesLuhn = (payload) => {
    let checksum = 0;
    let doubleNext = false;
    for (let idx = payload.length - 1; idx >= 0; idx -= 1) {
      let digit = parseInt(payload[idx], 10);
      if (doubleNext) {
        digit *= 2;
        if (digit > 9) digit -= 9;
      }
      checksum += digit;
      doubleNext = !doubleNext;
    }
    return checksum % 10 === 0;
  };
  return values.map((v) => {
    if (typeof v !== "string") return v;
    const digits = v.replace(/\D/g, "");
    return digits.length === 10 && passesLuhn("80840" + digits);
  });
}
|
|
202
|
+
/**
 * Normalises code strings: trim, upper-case, drop every ".", then insert a
 * single "." after the third character when more than three remain.
 *
 * @param values Column values.
 * @returns Formatted strings; non-string values are passed through.
 */
function icd10Format(values) {
  const format = (text) => {
    const compact = text.trim().toUpperCase().replace(/\./g, "");
    if (compact.length <= 3) return compact;
    return compact.slice(0, 3) + "." + compact.slice(3);
  };
  return values.map((v) => (typeof v === "string" ? format(v) : v));
}
|
|
209
|
+
var PACK2;
var init_healthcare = __esm({
  // Module body for the healthcare domain pack: registers its two
  // transforms and builds the pack descriptor.
  "src/core/domains/healthcare.ts"() {
    init_types();
    init_registry();
    registerTransform(
      { name: "npi_validate", inputTypes: ["string"], priority: 50, mode: "series" },
      npiValidate
    );
    registerTransform(
      { name: "icd10_format", inputTypes: ["string"], priority: 50, mode: "series" },
      icd10Format
    );
    // Default per-column operations.
    const columnOps = [
      { column: "npi", ops: ["npi_validate"] },
      { column: "icd10_code", ops: ["icd10_format"] },
      { column: "service_date", ops: ["date_iso8601"] },
      { column: "patient_name", ops: ["strip", "title_case"] }
    ];
    PACK2 = {
      name: "healthcare",
      description: "MRN normalization, ICD-10 formatting, NPI validation, date standardization",
      transforms: ["npi_validate", "icd10_format", "date_iso8601", "null_standardize", "strip"],
      defaultConfig: makeConfig({ transforms: columnOps })
    };
  }
});
|
|
237
|
+
|
|
238
|
+
// src/core/domains/finance.ts
|
|
239
|
+
var finance_exports = {};
|
|
240
|
+
__export(finance_exports, {
|
|
241
|
+
PACK: () => PACK3
|
|
242
|
+
});
|
|
243
|
+
/**
 * Masks account numbers: keeps the last four digits and replaces every
 * earlier digit with "*". Non-digit characters are discarded.
 *
 * @param values Column values.
 * @returns Masked strings. Strings with fewer than four digits, and
 *          non-string values, are returned unchanged.
 */
function accountMask(values) {
  const mask = (text) => {
    const digits = text.replace(/\D/g, "");
    if (digits.length < 4) return text;
    return digits.slice(-4).padStart(digits.length, "*");
  };
  return values.map((v) => (typeof v === "string" ? mask(v) : v));
}
|
|
251
|
+
/**
 * Trims, upper-cases and cuts each string to at most nine characters.
 *
 * @param values Column values.
 * @returns Formatted strings; non-string values are passed through.
 */
function cusipFormat(values) {
  const CUSIP_LENGTH = 9;
  return values.map((v) => {
    if (typeof v !== "string") return v;
    const normalized = v.trim().toUpperCase();
    return normalized.slice(0, CUSIP_LENGTH);
  });
}
|
|
257
|
+
var PACK3;
var init_finance = __esm({
  // Module body for the finance domain pack: registers its two transforms
  // and builds the pack descriptor.
  "src/core/domains/finance.ts"() {
    init_types();
    init_registry();
    registerTransform(
      { name: "account_mask", inputTypes: ["string"], priority: 50, mode: "series" },
      accountMask
    );
    registerTransform(
      { name: "cusip_format", inputTypes: ["string"], priority: 50, mode: "series" },
      cusipFormat
    );
    // Default per-column operations.
    const columnOps = [
      { column: "account_number", ops: ["account_mask"] },
      { column: "amount", ops: ["currency_strip"] },
      { column: "transaction_date", ops: ["date_iso8601"] }
    ];
    PACK3 = {
      name: "finance",
      description: "Account masking, currency standardization, CUSIP/ISIN formatting",
      transforms: ["account_mask", "cusip_format", "currency_strip", "date_iso8601"],
      defaultConfig: makeConfig({ transforms: columnOps })
    };
  }
});
|
|
284
|
+
|
|
285
|
+
// src/core/domains/ecommerce.ts
|
|
286
|
+
var ecommerce_exports = {};
|
|
287
|
+
__export(ecommerce_exports, {
|
|
288
|
+
PACK: () => PACK4
|
|
289
|
+
});
|
|
290
|
+
/**
 * Trims and upper-cases each string, then removes every character that is
 * not A-Z, 0-9 or "-".
 *
 * @param values Column values.
 * @returns Normalised strings; non-string values are passed through.
 */
function skuNormalize(values) {
  const disallowed = /[^A-Z0-9-]/g;
  return values.map((v) => {
    if (typeof v !== "string") return v;
    const upper = v.trim().toUpperCase();
    return upper.replace(disallowed, "");
  });
}
|
|
296
|
+
var PACK4;
var init_ecommerce = __esm({
  // Module body for the e-commerce domain pack: registers `sku_normalize`
  // and builds the pack descriptor.
  "src/core/domains/ecommerce.ts"() {
    init_types();
    init_registry();
    registerTransform(
      { name: "sku_normalize", inputTypes: ["string"], priority: 50, mode: "series" },
      skuNormalize
    );
    // Default per-column operations.
    const columnOps = [
      { column: "sku", ops: ["sku_normalize"] },
      { column: "price", ops: ["currency_strip"] },
      { column: "category", ops: ["strip", "title_case"] }
    ];
    PACK4 = {
      name: "ecommerce",
      description: "SKU normalization, price cleaning, category standardization",
      transforms: ["sku_normalize", "currency_strip", "category_auto_correct", "strip"],
      defaultConfig: makeConfig({ transforms: columnOps })
    };
  }
});
|
|
319
|
+
|
|
320
|
+
// src/core/domains/real-estate.ts
|
|
321
|
+
var real_estate_exports = {};
|
|
322
|
+
__export(real_estate_exports, {
|
|
323
|
+
PACK: () => PACK5
|
|
324
|
+
});
|
|
325
|
+
/**
 * Trims and upper-cases each string.
 *
 * @param values Column values.
 * @returns Normalised strings; non-string values are passed through.
 */
function mlsNormalize(values) {
  const normalize = (text) => text.trim().toUpperCase();
  return values.map((v) => (typeof v === "string" ? normalize(v) : v));
}
|
|
331
|
+
var PACK5;
var init_real_estate = __esm({
  // Module body for the real-estate domain pack: registers `mls_normalize`
  // and builds the pack descriptor.
  "src/core/domains/real-estate.ts"() {
    init_types();
    init_registry();
    registerTransform(
      { name: "mls_normalize", inputTypes: ["string"], priority: 50, mode: "series" },
      mlsNormalize
    );
    // Default per-column operations.
    const columnOps = [
      { column: "mls_id", ops: ["mls_normalize"] },
      { column: "address", ops: ["strip", "address_standardize"] },
      { column: "price", ops: ["currency_strip"] },
      { column: "zip", ops: ["zip_normalize"] }
    ];
    PACK5 = {
      name: "real_estate",
      description: "Address parsing (USPS), MLS ID normalization, price cleaning",
      transforms: ["mls_normalize", "address_standardize", "zip_normalize", "currency_strip"],
      defaultConfig: makeConfig({ transforms: columnOps })
    };
  }
});
|
|
355
|
+
|
|
356
|
+
// src/core/index.ts
|
|
357
|
+
init_types();
|
|
358
|
+
|
|
359
|
+
// src/core/data.ts
|
|
360
|
+
// Lower-case spellings that count as "no value" once a string has been
// lower-cased and trimmed (the empty string covers whitespace-only input).
var NULL_STRINGS = /* @__PURE__ */ new Set([
  "", "null", "none", "nan", "n/a",
  "na", "nil", "#n/a", "missing", "undefined"
]);
|
|
372
|
+
/**
 * Tells whether a raw cell value should be treated as missing.
 *
 * @param v Any value.
 * @returns `true` for `null`, `undefined`, `NaN`, and strings whose
 *          lower-cased, trimmed form is in `NULL_STRINGS`; `false` otherwise.
 */
function isNullish(v) {
  switch (typeof v) {
    case "undefined":
      return true;
    case "string":
      return NULL_STRINGS.has(v.toLowerCase().trim());
    case "number":
      return Number.isNaN(v);
    default:
      // typeof null is "object", so null is detected here.
      return v === null;
  }
}
|
|
378
|
+
/**
 * Coerces a raw cell into a column value.
 *
 * @param v Any value.
 * @returns `null` when `isNullish(v)`; strings, numbers and booleans as-is;
 *          anything else converted with `String(v)`.
 */
function toColumnValue(v) {
  if (isNullish(v)) return null;
  const kind = typeof v;
  const isPrimitiveCell = kind === "string" || kind === "number" || kind === "boolean";
  return isPrimitiveCell ? v : String(v);
}
|
|
385
|
+
/**
 * Creates a seeded pseudo-random generator built from 32-bit integer
 * arithmetic.
 *
 * @param seed Seed; coerced to a 32-bit integer.
 * @returns A zero-argument function yielding numbers in [0, 1); the same
 *          seed always produces the same sequence.
 */
function mulberry32(seed) {
  let state = seed | 0;
  return () => {
    state = (state + 1831565813) | 0;
    let mixed = Math.imul(state ^ (state >>> 15), 1 | state);
    mixed = (mixed + Math.imul(mixed ^ (mixed >>> 7), 61 | mixed)) ^ mixed;
    // >>> 0 makes the value unsigned before scaling by 2^32.
    return ((mixed ^ (mixed >>> 14)) >>> 0) / 4294967296;
  };
}
|
|
394
|
+
/**
 * In-memory table over an array of row objects, with per-column helpers
 * for null handling, type inference, aggregation, sampling and casting.
 *
 * `column()` results are memoised per instance; methods that return a new
 * table (`filter`, `head`, `sample`) start with an empty cache.
 */
var TabularData = class _TabularData {
  _rows;
  // Memoised output of column(), keyed by column name.
  _columnCache = /* @__PURE__ */ new Map();
  /** @param rows Array of row objects; stored by reference, not copied. */
  constructor(rows) {
    this._rows = rows;
  }
  /** The underlying row array. */
  get rows() {
    return this._rows;
  }
  /** Column names, taken from the keys of the first row only. */
  get columns() {
    if (this._rows.length === 0) return [];
    return Object.keys(this._rows[0]);
  }
  /** Number of rows. */
  get rowCount() {
    return this._rows.length;
  }
  // ---- Column access ----
  /**
   * Values of one column after `toColumnValue` coercion.
   * The returned array is the cached instance, shared between calls.
   */
  column(name) {
    const cached = this._columnCache.get(name);
    if (cached) return cached;
    const values = this._rows.map((r) => toColumnValue(r[name]));
    this._columnCache.set(name, values);
    return values;
  }
  /** Raw column access — preserves original values without null coercion.
   *  Use for profiling where "N/A" should remain a string, not become null. */
  rawColumn(name) {
    return this._rows.map((r) => {
      const v = r[name];
      if (v === null || v === void 0) return null;
      if (typeof v === "string" || typeof v === "number" || typeof v === "boolean") return v;
      return String(v);
    });
  }
  // ---- Null handling ----
  /** Number of null entries in the coerced column. */
  nullCount(col) {
    let count = 0;
    for (const v of this.column(col)) {
      if (v === null) count++;
    }
    return count;
  }
  /** Coerced column values with nulls removed (new array). */
  dropNulls(col) {
    return this.column(col).filter((v) => v !== null);
  }
  // ---- Type inference ----
  /**
   * Infers a type label from the non-null values.
   * @returns "null" when there are none; "string" if any value is neither
   *          boolean nor number; "boolean" when all are booleans; otherwise
   *          "float" if any number is non-integer, else "integer".
   */
  dtype(col) {
    const values = this.dropNulls(col);
    if (values.length === 0) return "null";
    let hasInt = false;
    let hasFloat = false;
    let hasBool = false;
    let hasString = false;
    for (const v of values) {
      if (typeof v === "boolean") {
        hasBool = true;
      } else if (typeof v === "number") {
        if (Number.isInteger(v)) hasInt = true;
        else hasFloat = true;
      } else {
        hasString = true;
      }
    }
    if (hasString) return "string";
    if (hasBool && !hasInt && !hasFloat) return "boolean";
    if (hasFloat) return "float";
    if (hasInt) return "integer";
    return "string";
  }
  // ---- Aggregation ----
  /** Number of distinct non-null values. */
  nUnique(col) {
    const set = /* @__PURE__ */ new Set();
    for (const v of this.dropNulls(col)) set.add(v);
    return set.size;
  }
  /** Map of non-null value -> occurrence count, in first-seen order. */
  valueCounts(col) {
    const map = /* @__PURE__ */ new Map();
    for (const v of this.dropNulls(col)) {
      map.set(v, (map.get(v) ?? 0) + 1);
    }
    return map;
  }
  /** MUST use loop — Math.min(...array) crashes on >65K elements.
   *  @returns Smallest finite number in the column, or null if none. */
  min(col) {
    const nums = this.numericValues(col);
    if (nums.length === 0) return null;
    let m = nums[0];
    for (let i = 1; i < nums.length; i++) {
      if (nums[i] < m) m = nums[i];
    }
    return m;
  }
  /** MUST use loop — Math.max(...array) crashes on >65K elements.
   *  @returns Largest finite number in the column, or null if none. */
  max(col) {
    const nums = this.numericValues(col);
    if (nums.length === 0) return null;
    let m = nums[0];
    for (let i = 1; i < nums.length; i++) {
      if (nums[i] > m) m = nums[i];
    }
    return m;
  }
  /** Arithmetic mean of the finite numbers, or null if there are none. */
  mean(col) {
    const nums = this.numericValues(col);
    if (nums.length === 0) return null;
    let sum = 0;
    for (const n of nums) sum += n;
    return sum / nums.length;
  }
  /** Sample standard deviation (n - 1 denominator); null with fewer than
   *  two finite numbers. */
  std(col) {
    const nums = this.numericValues(col);
    if (nums.length < 2) return null;
    // Mean computed from the values already collected instead of scanning
    // the column a second time through mean().
    let sum = 0;
    for (const n of nums) sum += n;
    const avg = sum / nums.length;
    let sumSq = 0;
    for (const n of nums) sumSq += (n - avg) ** 2;
    return Math.sqrt(sumSq / (nums.length - 1));
  }
  // ---- Filtering & sampling ----
  /** New table containing the rows for which `predicate` is truthy. */
  filter(predicate) {
    return new _TabularData(this._rows.filter(predicate));
  }
  /** New table containing the first `n` rows. */
  head(n) {
    return new _TabularData(this._rows.slice(0, n));
  }
  /**
   * Deterministic random sample of `n` rows without replacement.
   * Returns `this` (not a copy) when `n` is at least the row count.
   * @param n    Number of rows wanted.
   * @param seed Seed for the `mulberry32` generator (default 42).
   */
  sample(n, seed = 42) {
    if (n >= this._rows.length) return this;
    const rng = mulberry32(seed);
    const indices = Array.from({ length: this._rows.length }, (_, i) => i);
    // Partial Fisher-Yates from the tail: only the last n slots are shuffled.
    for (let i = indices.length - 1; i > 0 && indices.length - 1 - i < n; i--) {
      const j = Math.floor(rng() * (i + 1));
      [indices[i], indices[j]] = [indices[j], indices[i]];
    }
    const sampled = indices.slice(indices.length - n).map((i) => this._rows[i]);
    return new _TabularData(sampled);
  }
  // ---- String operations ----
  /**
   * Per-row test of `pattern` against string values; non-strings give false.
   * `lastIndex` is reset before every row so that a global (`g`) or sticky
   * (`y`) pattern does not carry match state from one row into the next.
   */
  strContains(col, pattern) {
    return this.column(col).map((v) => {
      if (typeof v !== "string") return false;
      pattern.lastIndex = 0;
      return pattern.test(v);
    });
  }
  /** Per-row string length (UTF-16 code units); non-strings give 0. */
  strLengths(col) {
    return this.column(col).map(
      (v) => typeof v === "string" ? v.length : 0
    );
  }
  // ---- Casting ----
  /** Per-row `Number(v)`; null for nulls and non-finite results. */
  castFloat(col) {
    return this.column(col).map((v) => {
      if (v === null) return null;
      const n = Number(v);
      return Number.isFinite(n) ? n : null;
    });
  }
  /** Like castFloat, but truncated toward zero. */
  castInt(col) {
    return this.column(col).map((v) => {
      if (v === null) return null;
      const n = Number(v);
      return Number.isFinite(n) ? Math.trunc(n) : null;
    });
  }
  // ---- Helpers ----
  /** Finite numbers of the column, in row order (new array). */
  numericValues(col) {
    const result = [];
    for (const v of this.column(col)) {
      if (typeof v === "number" && Number.isFinite(v)) {
        result.push(v);
      }
    }
    return result;
  }
  /** String values of the column, in row order (new array). */
  stringValues(col) {
    const result = [];
    for (const v of this.column(col)) {
      if (typeof v === "string") result.push(v);
    }
    return result;
  }
  /** Finite numbers of the column sorted ascending. */
  sortedNumeric(col) {
    return this.numericValues(col).sort((a, b) => a - b);
  }
  /** Whether the finite numbers are in non-decreasing (or, with
   *  `descending`, non-increasing) order; other values are ignored. */
  isSorted(col, descending = false) {
    const nums = this.numericValues(col);
    for (let i = 1; i < nums.length; i++) {
      if (descending ? nums[i] > nums[i - 1] : nums[i] < nums[i - 1]) {
        return false;
      }
    }
    return true;
  }
};
|
|
585
|
+
|
|
586
|
+
// src/core/transforms/text.ts
|
|
587
|
+
init_registry();
|
|
588
|
+
/**
 * Applies `fn` to every string in `values`.
 *
 * @param values Column values.
 * @param fn     Called with each string value.
 * @returns New array; non-string values (including null) are kept as-is.
 */
function mapStrings(values, fn) {
  return values.map((v) => (typeof v === "string" ? fn(v) : v));
}
|
|
594
|
+
/**
 * Trims leading and trailing whitespace from every string value.
 *
 * @param values Column values.
 * @returns New array; non-string values are passed through by `mapStrings`.
 */
function strip(values) {
  const trimText = (text) => text.trim();
  return mapStrings(values, trimText);
}
registerTransform(
  { name: "strip", inputTypes: ["string"], autoApply: true, priority: 90, mode: "expr" },
  strip
);
|
|
601
|
+
/**
 * Lower-cases every string value.
 *
 * @param values Column values.
 * @returns New array; non-string values are passed through by `mapStrings`.
 */
function lowercase(values) {
  const toLower = (text) => text.toLowerCase();
  return mapStrings(values, toLower);
}
registerTransform(
  { name: "lowercase", inputTypes: ["string"], priority: 50, mode: "expr" },
  lowercase
);
|
|
608
|
+
/**
 * Upper-cases every string value.
 *
 * @param values Column values.
 * @returns New array; non-string values are passed through by `mapStrings`.
 */
function uppercase(values) {
  const toUpper = (text) => text.toUpperCase();
  return mapStrings(values, toUpper);
}
registerTransform(
  { name: "uppercase", inputTypes: ["string"], priority: 50, mode: "expr" },
  uppercase
);
|
|
615
|
+
/**
 * Title-cases every string value: lower-cases the text, then upper-cases
 * each letter that starts a word.
 *
 * A word start is a letter not preceded by a letter, combining mark, digit
 * or underscore. The test is Unicode-aware: the previous ASCII-only `\b\w`
 * treated accented letters as word breaks and produced "RenéE" / "MaríA".
 * For ASCII-only input the result is unchanged (including "o'brien" ->
 * "O'Brien").
 *
 * @param values Column values.
 * @returns New array; non-string values are passed through by `mapStrings`.
 */
function titleCase(values) {
  return mapStrings(
    values,
    (s) => s.toLowerCase().replace(/(?<![\p{L}\p{M}\p{N}_])\p{L}/gu, (ch) => ch.toUpperCase())
  );
}
registerTransform(
  { name: "title_case", inputTypes: ["string"], priority: 50, mode: "expr" },
  titleCase
);
|
|
625
|
+
/**
 * Applies NFKD normalisation to every string value and then deletes all
 * combining-mark characters (`\p{M}`), e.g. "é" -> "e".
 *
 * @param values Column values.
 * @returns New array; non-string values are passed through by `mapStrings`.
 */
function normalizeUnicode(values) {
  const combiningMarks = /\p{M}/gu;
  return mapStrings(values, (s) => {
    const decomposed = s.normalize("NFKD");
    return decomposed.replace(combiningMarks, "");
  });
}
registerTransform(
  { name: "normalize_unicode", inputTypes: ["string"], autoApply: true, priority: 85, mode: "series" },
  normalizeUnicode
);
|
|
635
|
+
/**
 * Removes punctuation and symbols from every string value, keeping
 * letters, combining marks, digits, underscores and whitespace.
 *
 * The character classes are Unicode-aware. The previous ASCII-only
 * `[^\w\s]` also deleted every non-ASCII letter ("café" -> "caf"); results
 * for ASCII-only input are unchanged.
 *
 * @param values Column values.
 * @returns New array; non-string values are passed through by `mapStrings`.
 */
function removePunctuation(values) {
  return mapStrings(values, (s) => s.replace(/[^\p{L}\p{M}\p{N}_\s]/gu, ""));
}
registerTransform(
  { name: "remove_punctuation", inputTypes: ["string"], priority: 40, mode: "series" },
  removePunctuation
);
|
|
642
|
+
/**
 * Replaces every run of whitespace in a string with a single space and
 * removes leading/trailing whitespace.
 *
 * @param values Column values.
 * @returns New array; non-string values are passed through by `mapStrings`.
 */
function collapseWhitespace(values) {
  // Splitting on whitespace runs leaves empty pieces at the ends when the
  // text starts or ends with whitespace; dropping them performs the trim.
  const collapse = (text) => text.split(/\s+/).filter((piece) => piece !== "").join(" ");
  return mapStrings(values, collapse);
}
registerTransform(
  { name: "collapse_whitespace", inputTypes: ["string"], autoApply: true, priority: 80, mode: "expr" },
  collapseWhitespace
);
|
|
649
|
+
/**
 * Cuts every string value with `slice(0, maxLen)`.
 *
 * @param values Column values.
 * @param n      Maximum length. A number is used as given; any other type
 *               is converted with `Number()`, falling back to 255 when the
 *               conversion yields 0 or NaN.
 * @returns New array; non-string values are passed through by `mapStrings`.
 */
function truncate(values, n = 255) {
  let maxLen = n;
  if (typeof maxLen !== "number") {
    maxLen = Number(maxLen) || 255;
  }
  const cut = (text) => text.slice(0, maxLen);
  return mapStrings(values, cut);
}
registerTransform(
  { name: "truncate", inputTypes: ["string"], priority: 30, mode: "series" },
  truncate
);
|
|
657
|
+
/**
 * Replaces typographic quotes with ASCII ones: U+2018/2019/201A/201B
 * become "'" and U+201C/201D/201E/201F become '"'.
 *
 * @param values Column values.
 * @returns New array; non-string values are passed through by `mapStrings`.
 */
function normalizeQuotes(values) {
  const curlySingles = /[\u2018\u2019\u201A\u201B]/g;
  const curlyDoubles = /[\u201C\u201D\u201E\u201F]/g;
  return mapStrings(values, (s) => {
    const singlesFixed = s.replace(curlySingles, "'");
    return singlesFixed.replace(curlyDoubles, '"');
  });
}
registerTransform(
  { name: "normalize_quotes", inputTypes: ["string"], autoApply: true, priority: 84, mode: "series" },
  normalizeQuotes
);
|
|
667
|
+
/**
 * Deletes every `<...>` span (a "<", any run of non-">" characters, then
 * ">") from each string value.
 *
 * @param values Column values.
 * @returns New array; non-string values are passed through by `mapStrings`.
 */
function removeHtmlTags(values) {
  const tagPattern = /<[^>]*>/g;
  const stripTags = (text) => text.replace(tagPattern, "");
  return mapStrings(values, stripTags);
}
registerTransform(
  { name: "remove_html_tags", inputTypes: ["string"], priority: 45, mode: "series" },
  removeHtmlTags
);
|
|
674
|
+
/**
 * Deletes every `http://` / `https://` URL (up to the next whitespace)
 * from each string value, then trims the result. Interior spacing around a
 * removed URL is left as-is.
 *
 * @param values Column values.
 * @returns New array; non-string values are passed through by `mapStrings`.
 */
function removeUrls(values) {
  const urlPattern = /https?:\/\/[^\s]+/g;
  return mapStrings(values, (s) => {
    const withoutUrls = s.replace(urlPattern, "");
    return withoutUrls.trim();
  });
}
registerTransform(
  { name: "remove_urls", inputTypes: ["string"], priority: 40, mode: "series" },
  removeUrls
);
|
|
684
|
+
/**
 * Deletes every ASCII digit (0-9) from each string value.
 *
 * @param values Column values.
 * @returns New array; non-string values are passed through by `mapStrings`.
 */
function removeDigits(values) {
  const dropDigits = (text) => text.replace(/\d/g, "");
  return mapStrings(values, dropDigits);
}
registerTransform(
  { name: "remove_digits", inputTypes: ["string"], priority: 35, mode: "series" },
  removeDigits
);
|
|
691
|
+
/**
 * Left-pads every string value with `padStart`.
 *
 * @param values Column values.
 * @param width  Target length. A number is used as given; any other type
 *               is converted with `Number()`, falling back to 10 when the
 *               conversion yields 0 or NaN.
 * @param char   Pad string; anything that is not a string becomes "0".
 * @returns New array; non-string values are passed through by `mapStrings`.
 */
function padLeft(values, width = 10, char = "0") {
  let targetWidth = width;
  if (typeof targetWidth !== "number") {
    targetWidth = Number(targetWidth) || 10;
  }
  const filler = typeof char === "string" ? char : "0";
  return mapStrings(values, (s) => s.padStart(targetWidth, filler));
}
registerTransform(
  { name: "pad_left", inputTypes: ["string"], priority: 30, mode: "series" },
  padLeft
);
|
|
700
|
+
/**
 * Right-pads every string value with `padEnd`.
 *
 * @param values Column values.
 * @param width  Target length. A number is used as given; any other type
 *               is converted with `Number()`, falling back to 10 when the
 *               conversion yields 0 or NaN.
 * @param char   Pad string; anything that is not a string becomes " ".
 * @returns New array; non-string values are passed through by `mapStrings`.
 */
function padRight(values, width = 10, char = " ") {
  let targetWidth = width;
  if (typeof targetWidth !== "number") {
    targetWidth = Number(targetWidth) || 10;
  }
  const filler = typeof char === "string" ? char : " ";
  return mapStrings(values, (s) => s.padEnd(targetWidth, filler));
}
registerTransform(
  { name: "pad_right", inputTypes: ["string"], priority: 30, mode: "series" },
  padRight
);
|
|
709
|
+
/**
 * Deletes characters in a fixed set of code-point ranges and single code
 * points (see the pattern below) from every string value.
 *
 * @param values Column values.
 * @returns New array; non-string values are passed through by `mapStrings`.
 */
function removeEmojis(values) {
  const emojiPattern = /[\u{1F600}-\u{1F64F}\u{1F300}-\u{1F5FF}\u{1F680}-\u{1F6FF}\u{1F1E0}-\u{1F1FF}\u{2600}-\u{26FF}\u{2700}-\u{27BF}\u{FE00}-\u{FE0F}\u{1F900}-\u{1F9FF}\u{1FA00}-\u{1FA6F}\u{1FA70}-\u{1FAFF}\u{200D}\u{20E3}\u{E0020}-\u{E007F}]/gu;
  const dropEmojis = (text) => text.replace(emojiPattern, "");
  return mapStrings(values, dropEmojis);
}
registerTransform(
  { name: "remove_emojis", inputTypes: ["string"], priority: 38, mode: "series" },
  removeEmojis
);
|
|
717
|
+
/**
 * Repairs UTF-8 text that was mis-decoded as a single-byte encoding
 * (e.g. "cafÃ©" -> "café", "itâ€™s" -> "it’s").
 *
 * Each character is mapped back to the byte it came from: code points up
 * to 255 map to themselves, and the Windows-1252 characters that occupy
 * bytes 0x80–0x9F (such as "€" and "™") map to their Windows-1252 byte.
 * The bytes are then decoded as strict UTF-8. A string is returned
 * unchanged when any character has no byte mapping, when the bytes are
 * not valid UTF-8, or when decoding is unavailable.
 *
 * @param values Column values.
 * @returns New array; non-string values are passed through by `mapStrings`.
 */
function fixMojibake(values) {
  // Windows-1252 code points above 255, keyed to their original byte.
  const cp1252Bytes = new Map([
    [0x20ac, 0x80], [0x201a, 0x82], [0x0192, 0x83], [0x201e, 0x84], [0x2026, 0x85],
    [0x2020, 0x86], [0x2021, 0x87], [0x02c6, 0x88], [0x2030, 0x89], [0x0160, 0x8a],
    [0x2039, 0x8b], [0x0152, 0x8c], [0x017d, 0x8e], [0x2018, 0x91], [0x2019, 0x92],
    [0x201c, 0x93], [0x201d, 0x94], [0x2022, 0x95], [0x2013, 0x96], [0x2014, 0x97],
    [0x02dc, 0x98], [0x2122, 0x99], [0x0161, 0x9a], [0x203a, 0x9b], [0x0153, 0x9c],
    [0x017e, 0x9e], [0x0178, 0x9f]
  ]);
  return mapStrings(values, (s) => {
    try {
      const bytes = new Uint8Array(s.length);
      for (let i = 0; i < s.length; i++) {
        const code = s.charCodeAt(i);
        const byte = code <= 255 ? code : cp1252Bytes.get(code);
        // Not representable as a single byte: the text is not mojibake.
        if (byte === undefined) return s;
        bytes[i] = byte;
      }
      // fatal: true makes invalid UTF-8 throw instead of yielding U+FFFD.
      return new TextDecoder("utf-8", { fatal: true }).decode(bytes);
    } catch {
      // Best effort: anything that cannot be decoded is left untouched.
      return s;
    }
  });
}
registerTransform(
  { name: "fix_mojibake", inputTypes: ["string"], priority: 86, mode: "series" },
  fixMojibake
);
|
|
738
|
+
/**
 * Converts CRLF and lone CR line endings to LF in every string value.
 *
 * @param values Column values.
 * @returns New array; non-string values are passed through by `mapStrings`.
 */
function normalizeLineEndings(values) {
  // One pass: a CR optionally followed by LF becomes a single LF.
  const carriageReturns = /\r\n?/g;
  return mapStrings(values, (s) => s.replace(carriageReturns, "\n"));
}
registerTransform(
  { name: "normalize_line_endings", inputTypes: ["string"], priority: 82, mode: "series" },
  normalizeLineEndings
);
|
|
745
|
+
/**
 * Pulls every number (optional leading "-", digits, optional decimal part)
 * out of each string value and joins them with single spaces.
 *
 * @param values Column values.
 * @returns New array: the joined numbers, or "" when a string has none;
 *          non-string values are passed through by `mapStrings`.
 */
function extractNumbers(values) {
  const numberPattern = /-?\d+(?:\.\d+)?/g;
  return mapStrings(values, (s) => {
    const found = s.match(numberPattern);
    if (found === null) return "";
    return found.join(" ");
  });
}
registerTransform(
  { name: "extract_numbers", inputTypes: ["string"], priority: 30, mode: "series" },
  extractNumbers
);
|
|
755
|
+
|
|
756
|
+
// src/core/transforms/phone.ts
|
|
757
|
+
init_registry();
|
|
758
|
+
/**
 * Keeps only the ASCII digits of a string.
 *
 * @param s Input string.
 * @returns The digits of `s` in order, or "" when it has none.
 */
function extractDigits(s) {
  const found = s.match(/\d/g);
  return found === null ? "" : found.join("");
}
|
|
761
|
+
/**
 * Reduces a phone string to its 10-digit national number.
 *
 * @param s Input string.
 * @returns The 10 digits when the string contains exactly 10 digits, or
 *          11 digits starting with "1" (the leading "1" is dropped);
 *          `null` otherwise.
 */
function normalizeUsDigits(s) {
  const digits = extractDigits(s);
  const hasCountryPrefix = digits.length === 11 && digits.startsWith("1");
  if (hasCountryPrefix) return digits.slice(1);
  return digits.length === 10 ? digits : null;
}
|
|
767
|
+
/**
 * Formats phone strings as "+1" followed by the 10-digit national number.
 *
 * @param values Column values.
 * @returns New array. Strings that `normalizeUsDigits` cannot reduce to
 *          10 digits, and non-string values, are returned unchanged.
 */
function phoneE164(values) {
  const format = (text) => {
    const national = normalizeUsDigits(text);
    return national === null ? text : `+1${national}`;
  };
  return values.map((v) => (typeof v === "string" ? format(v) : v));
}
registerTransform(
  { name: "phone_e164", inputTypes: ["phone"], autoApply: true, priority: 50, mode: "series" },
  phoneE164
);
|
|
779
|
+
/**
 * Formats phone strings as "(AAA) BBB-CCCC".
 *
 * @param values Column values.
 * @returns New array. Strings that `normalizeUsDigits` cannot reduce to
 *          10 digits, and non-string values, are returned unchanged.
 */
function phoneNational(values) {
  const format = (text) => {
    const national = normalizeUsDigits(text);
    if (national === null) return text;
    const area = national.slice(0, 3);
    const exchange = national.slice(3, 6);
    const line = national.slice(6);
    return `(${area}) ${exchange}-${line}`;
  };
  return values.map((v) => (typeof v === "string" ? format(v) : v));
}
registerTransform(
  { name: "phone_national", inputTypes: ["phone"], priority: 50, mode: "series" },
  phoneNational
);
|
|
791
|
+
/**
 * Reduces every string value to its digits via `extractDigits`.
 *
 * @param values Column values.
 * @returns New array; non-string values are returned unchanged.
 */
function phoneDigits(values) {
  return values.map((v) => (typeof v === "string" ? extractDigits(v) : v));
}
registerTransform(
  { name: "phone_digits", inputTypes: ["phone"], priority: 50, mode: "series" },
  phoneDigits
);
|
|
801
|
+
/**
 * Flags strings that contain exactly 10 digits, or 11 digits starting
 * with "1".
 *
 * @param values Column values.
 * @returns New array: `true`/`false` for strings; non-string values
 *          (including null) are returned unchanged.
 */
function phoneValidate(values) {
  const isValid = (text) => {
    const digits = extractDigits(text);
    if (digits.length === 10) return true;
    return digits.length === 11 && digits.startsWith("1");
  };
  return values.map((v) => (typeof v === "string" ? isValid(v) : v));
}
registerTransform(
  { name: "phone_validate", inputTypes: ["phone"], priority: 60, mode: "series" },
  phoneValidate
);
|
|
812
|
+
/**
 * Derives a country code from phone strings.
 *
 * @param values Column values.
 * @returns New array: the number `1` for strings with exactly 10 digits,
 *          or 11 digits starting with "1"; `null` for other strings;
 *          non-string values are returned unchanged.
 */
function phoneCountryCode(values) {
  const codeFor = (text) => {
    const digits = extractDigits(text);
    const isNational = digits.length === 10;
    const isPrefixed = digits.length === 11 && digits.startsWith("1");
    return isNational || isPrefixed ? 1 : null;
  };
  return values.map((v) => (typeof v === "string" ? codeFor(v) : v));
}
registerTransform(
  { name: "phone_country_code", inputTypes: ["phone"], priority: 45, mode: "series" },
  phoneCountryCode
);
|
|
825
|
+
|
|
826
|
+
// src/core/transforms/names.ts
|
|
827
|
+
init_registry();
|
|
828
|
+
/**
 * Applies `fn` to every string entry of `values`, leaving all other entries
 * (null, numbers, objects, ...) exactly as they are.
 *
 * @param {Array<*>} values - Column values.
 * @param {(s: string) => *} fn - Called with each string entry.
 * @returns {Array<*>} New array of the same length.
 */
function mapStrings2(values, fn) {
  return values.map((entry) => (typeof entry === "string" ? fn(entry) : entry));
}
|
|
834
|
+
// Leading honorific (optionally dotted) followed by whitespace; case-insensitive.
var _TITLES = /^(Mr\.?|Mrs\.?|Ms\.?|Miss\.?|Dr\.?|Prof\.?|Rev\.?|Sr\.?|Sra\.?)\s+/i;

// Trailing generational/professional suffix preceded by whitespace; case-insensitive.
var _SUFFIXES = /\s+(Jr\.?|Sr\.?|II|III|IV|MD|PhD|PharmD|DDS|DVM|Esq\.?|CPA|RN|DO)$/i;

// A single capital letter followed by a period and whitespace, e.g. the "J. " in "J. Smith".
var _INITIAL_PATTERN = /\b[A-Z]\.\s/;

// "Mc" at a word start; captures the letter that follows it.
var _MC_PATTERN = /\bMc(\w)/g;

// "O'" at a word start; captures the letter that follows it.
var _O_PATTERN = /\bO'(\w)/g;
|
|
839
|
+
// Lower-case nickname -> formal given name. Written grouped by formal name and
// flattened into the nickname-keyed lookup object.
var _NICKNAMES = Object.fromEntries(
  [
    ["Robert", ["bob", "rob", "robby", "robbie", "bobby"]],
    ["William", ["bill", "billy", "will", "willy"]],
    ["James", ["jim", "jimmy", "jamie"]],
    ["Michael", ["mike", "mikey", "mick"]],
    ["Richard", ["dick", "rick", "rich", "ricky"]],
    ["Thomas", ["tom", "tommy"]],
    ["Joseph", ["joe", "joey"]],
    ["John", ["jack", "johnny"]],
    ["Jonathan", ["jon"]],
    ["David", ["dave", "davy"]],
    ["Steven", ["steve", "stevie"]],
    ["Daniel", ["dan", "danny"]],
    ["Patrick", ["pat"]],
    ["Patricia", ["patty", "patsy"]],
    ["Christopher", ["chris", "kit"]],
    ["Anthony", ["tony"]],
    ["Edward", ["ed", "eddie", "ted", "teddy"]],
    ["Albert", ["al", "bert"]],
    ["Charles", ["charlie", "chuck"]],
    ["Samuel", ["sam", "sammy"]],
    ["Benjamin", ["ben", "benny"]],
    ["Matthew", ["matt"]],
    ["Andrew", ["andy", "drew"]],
    ["Nicholas", ["nick"]],
    ["Alexander", ["alex"]],
    ["Elizabeth", ["liz", "beth", "betty"]],
    ["Katherine", ["kate", "kathy", "katie"]],
    ["Susan", ["sue", "susie"]],
    ["Margaret", ["meg", "maggie", "peggy"]],
    ["Jennifer", ["jenny", "jen"]],
    ["Deborah", ["debbie", "deb"]],
    ["Barbara", ["barb"]],
    ["Cynthia", ["cindy"]],
    ["Sandra", ["sandy"]]
  ].flatMap(([formal, nicknames]) => nicknames.map((nickname) => [nickname, formal]))
);
|
|
914
|
+
/**
 * Splits the full name in `row[column]` into `first_name` / `last_name`,
 * returning new row objects (input rows are not mutated).
 *
 * The last whitespace-delimited token becomes `last_name`; everything before
 * it becomes `first_name`. Any whitespace character counts as a separator and
 * runs of whitespace are collapsed at the split point, so "John  Smith" and
 * "John\tSmith" both give first_name "John". A single-token name is placed in
 * `first_name` with `last_name` set to "". Non-string cells give null for
 * both fields.
 *
 * @param {Array<Object>} rows - Row objects.
 * @param {string} column - Key holding the full name.
 * @returns {Array<Object>} Copies of the rows with the two name fields added.
 */
function splitName(rows, column) {
  return rows.map((row) => {
    const val = row[column];
    if (typeof val !== "string") {
      return { ...row, first_name: null, last_name: null };
    }
    const trimmed = val.trim();
    // Index of the last whitespace character (one followed only by non-whitespace).
    const boundary = trimmed.search(/\s\S*$/);
    if (boundary === -1) {
      return { ...row, first_name: trimmed, last_name: "" };
    }
    return {
      ...row,
      // trimEnd drops the extra separators left over from a multi-space gap.
      first_name: trimmed.slice(0, boundary).trimEnd(),
      last_name: trimmed.slice(boundary + 1)
    };
  });
}
// Register the transform under the name "split_name".
registerTransform(
  { name: "split_name", inputTypes: ["name"], priority: 50, mode: "dataframe" },
  splitName
);
|
|
936
|
+
/**
 * Splits a "Last, First" value in `row[column]` at its first comma and
 * returns new row objects carrying `last_name` (text before the comma) and
 * `first_name` (text after it), both trimmed.
 *
 * Without a comma the trimmed value goes to `first_name` and `last_name` is
 * "". Non-string cells give null for both fields.
 *
 * @param {Array<Object>} rows - Row objects.
 * @param {string} column - Key holding the name.
 * @returns {Array<Object>} Copies of the rows with the two name fields added.
 */
function splitNameReverse(rows, column) {
  return rows.map((row) => {
    const cell = row[column];
    if (typeof cell !== "string") {
      return { ...row, first_name: null, last_name: null };
    }
    const comma = cell.indexOf(",");
    if (comma < 0) {
      return { ...row, first_name: cell.trim(), last_name: "" };
    }
    const last_name = cell.slice(0, comma).trim();
    const first_name = cell.slice(comma + 1).trim();
    return { ...row, last_name, first_name };
  });
}
// Register the transform under the name "split_name_reverse".
registerTransform(
  {
    name: "split_name_reverse",
    inputTypes: ["name"],
    priority: 50,
    mode: "dataframe"
  },
  splitNameReverse
);
|
|
957
|
+
/**
 * Removes a leading match of `_TITLES` from each string entry and trims the
 * result. Non-string entries are handled by `mapStrings2`.
 *
 * @param {Array<*>} values - Column values.
 * @returns {Array<*>} New array of the same length.
 */
function stripTitles(values) {
  const dropTitle = (text) => {
    const withoutTitle = text.replace(_TITLES, "");
    return withoutTitle.trim();
  };
  return mapStrings2(values, dropTitle);
}
// Register the transform under the name "strip_titles".
registerTransform(
  {
    name: "strip_titles",
    inputTypes: ["name"],
    autoApply: true,
    priority: 70,
    mode: "series"
  },
  stripTitles
);
|
|
964
|
+
/**
 * Removes a trailing match of `_SUFFIXES` from each string entry and trims
 * the result. Non-string entries are handled by `mapStrings2`.
 *
 * @param {Array<*>} values - Column values.
 * @returns {Array<*>} New array of the same length.
 */
function stripSuffixes(values) {
  const dropSuffix = (text) => {
    const withoutSuffix = text.replace(_SUFFIXES, "");
    return withoutSuffix.trim();
  };
  return mapStrings2(values, dropSuffix);
}
// Register the transform under the name "strip_suffixes".
registerTransform(
  {
    name: "strip_suffixes",
    inputTypes: ["name"],
    priority: 60,
    mode: "series"
  },
  stripSuffixes
);
|
|
971
|
+
/**
 * Converts each string entry to proper case: lower-cases the text, then
 * upper-cases the first letter of every word, then re-capitalises the letter
 * after a leading "Mc" or "O'" via `_MC_PATTERN` / `_O_PATTERN`.
 *
 * Word starts are detected with Unicode-aware classes. The ASCII-only
 * `\b\w` form treats accented letters as word breaks, which turned "maría"
 * into "MaríA" and "élodie" into "éLodie". A letter is now capitalised only
 * when the character before it is not a letter, combining mark, digit or
 * underscore, so ASCII input is capitalised exactly as before (a letter
 * after an apostrophe or hyphen is still capitalised).
 *
 * @param {Array<*>} values - Column values.
 * @returns {Array<*>} New array of the same length.
 */
function nameProper(values) {
  return mapStrings2(values, (s) => {
    let result = s
      .toLowerCase()
      .replace(/(?<![\p{L}\p{M}\p{N}_])\p{L}/gu, (ch) => ch.toUpperCase());
    result = result.replace(_MC_PATTERN, (_match, letter) => `Mc${letter.toUpperCase()}`);
    result = result.replace(_O_PATTERN, (_match, letter) => `O'${letter.toUpperCase()}`);
    return result;
  });
}
// Register the transform under the name "name_proper".
registerTransform(
  { name: "name_proper", inputTypes: ["name"], priority: 45, mode: "series" },
  nameProper
);
|
|
983
|
+
/**
 * Flags entries that contain an initial (a match of `_INITIAL_PATTERN`).
 *
 * The values themselves are not rewritten: the first element of the returned
 * pair is a copy of `values` with `undefined` replaced by null, and the
 * second is the list of indices whose string value matched the pattern.
 *
 * @param {Array<*>} values - Column values.
 * @returns {[Array<*>, number[]]} `[values, flaggedIndices]`.
 */
function initialExpand(values) {
  const flagged = [];
  const result = values.map((entry, index) => {
    const hasInitial = typeof entry === "string" && _INITIAL_PATTERN.test(entry);
    if (hasInitial) {
      flagged.push(index);
    }
    return entry ?? null;
  });
  return [result, flagged];
}
// Register the transform under the name "initial_expand".
registerTransform(
  {
    name: "initial_expand",
    inputTypes: ["name"],
    priority: 40,
    mode: "series"
  },
  initialExpand
);
|
|
997
|
+
/**
 * Replaces a string entry with its formal name from `_NICKNAMES` when the
 * trimmed, lower-cased entry is a key of that table; otherwise the entry is
 * returned unchanged.
 *
 * Only the table's own keys are consulted. A bare `_NICKNAMES[lookup]` also
 * resolves inherited properties, so inputs such as "constructor" or
 * "__proto__" used to come back as a function/object instead of a string.
 *
 * @param {Array<*>} values - Column values.
 * @returns {Array<*>} New array of the same length.
 */
function nicknameStandardize(values) {
  const hasOwn = Object.prototype.hasOwnProperty;
  return mapStrings2(values, (s) => {
    const lookup = s.trim().toLowerCase();
    if (!hasOwn.call(_NICKNAMES, lookup)) return s;
    return _NICKNAMES[lookup] ?? s;
  });
}
// Register the transform under the name "nickname_standardize".
registerTransform(
  { name: "nickname_standardize", inputTypes: ["name"], priority: 42, mode: "series" },
  nicknameStandardize
);
|
|
1007
|
+
/**
 * Builds a `full_name` field by joining `row[column]` and the last-name
 * column with a single space.
 *
 * Each part is stringified and trimmed; null, undefined and blank parts are
 * left out, and `full_name` is null when nothing remains. If the first row
 * lacks the last-name key, the rows are returned as shallow copies with no
 * `full_name` added. A non-string `lastNameCol` falls back to "last_name".
 *
 * @param {Array<Object>} rows - Row objects.
 * @param {string} column - Key holding the first name.
 * @param {string} [lastNameCol="last_name"] - Key holding the last name.
 * @returns {Array<Object>} Copies of the rows.
 */
function mergeName(rows, column, lastNameCol = "last_name") {
  const lnCol = typeof lastNameCol === "string" ? lastNameCol : "last_name";
  const firstRowLacksLastName = rows.length > 0 && !(lnCol in rows[0]);
  if (firstRowLacksLastName) {
    return rows.map((row) => ({ ...row }));
  }
  return rows.map((row) => {
    const first = row[column];
    const last = row[lnCol];
    const parts = [first, last]
      .filter((piece) => piece != null)
      .map((piece) => String(piece).trim())
      .filter((piece) => piece !== "");
    const full_name = parts.length > 0 ? parts.join(" ") : null;
    return { ...row, full_name };
  });
}
// Register the transform under the name "merge_name".
registerTransform(
  {
    name: "merge_name",
    inputTypes: ["name"],
    priority: 45,
    mode: "dataframe"
  },
  mergeName
);
|
|
1031
|
+
|
|
1032
|
+
// src/core/transforms/address.ts
|
|
1033
|
+
init_registry();
|
|
1034
|
+
/**
 * Applies `fn` to every string entry of `values`, leaving all other entries
 * (null, numbers, objects, ...) exactly as they are.
 *
 * @param {Array<*>} values - Column values.
 * @param {(s: string) => *} fn - Called with each string entry.
 * @returns {Array<*>} New array of the same length.
 */
function mapStrings3(values, fn) {
  return values.map((entry) => (typeof entry === "string" ? fn(entry) : entry));
}
|
|
1040
|
+
// Full street-type word -> abbreviation.
var _STREET_ABBREV = Object.fromEntries([
  ["Street", "St"],
  ["Avenue", "Ave"],
  ["Boulevard", "Blvd"],
  ["Drive", "Dr"],
  ["Lane", "Ln"],
  ["Road", "Rd"],
  ["Court", "Ct"],
  ["Place", "Pl"],
  ["Circle", "Cir"],
  ["Trail", "Trl"],
  ["Way", "Way"],
  ["Parkway", "Pkwy"],
  ["Highway", "Hwy"],
  ["Terrace", "Ter"],
  ["Square", "Sq"]
]);

// Inverse table: abbreviation -> full street-type word.
var _STREET_EXPAND = Object.fromEntries(
  Object.entries(_STREET_ABBREV).map(([full, abbr]) => [abbr, full])
);
|
|
1061
|
+
// State name (title case) -> two-letter code; 50 states plus the District of Columbia.
var _STATES = Object.fromEntries([
  ["Alabama", "AL"], ["Alaska", "AK"], ["Arizona", "AZ"], ["Arkansas", "AR"],
  ["California", "CA"], ["Colorado", "CO"], ["Connecticut", "CT"], ["Delaware", "DE"],
  ["Florida", "FL"], ["Georgia", "GA"], ["Hawaii", "HI"], ["Idaho", "ID"],
  ["Illinois", "IL"], ["Indiana", "IN"], ["Iowa", "IA"], ["Kansas", "KS"],
  ["Kentucky", "KY"], ["Louisiana", "LA"], ["Maine", "ME"], ["Maryland", "MD"],
  ["Massachusetts", "MA"], ["Michigan", "MI"], ["Minnesota", "MN"], ["Mississippi", "MS"],
  ["Missouri", "MO"], ["Montana", "MT"], ["Nebraska", "NE"], ["Nevada", "NV"],
  ["New Hampshire", "NH"], ["New Jersey", "NJ"], ["New Mexico", "NM"], ["New York", "NY"],
  ["North Carolina", "NC"], ["North Dakota", "ND"], ["Ohio", "OH"], ["Oklahoma", "OK"],
  ["Oregon", "OR"], ["Pennsylvania", "PA"], ["Rhode Island", "RI"], ["South Carolina", "SC"],
  ["South Dakota", "SD"], ["Tennessee", "TN"], ["Texas", "TX"], ["Utah", "UT"],
  ["Vermont", "VT"], ["Virginia", "VA"], ["Washington", "WA"], ["West Virginia", "WV"],
  ["Wisconsin", "WI"], ["Wyoming", "WY"], ["District Of Columbia", "DC"]
]);

// Two-letter code -> state name.
var _STATES_REVERSE = Object.fromEntries(
  Object.entries(_STATES).map(([stateName, code]) => [code, stateName])
);

// Lower-cased state name -> two-letter code.
var _STATES_LOWER = Object.fromEntries(
  Object.entries(_STATES).map(([stateName, code]) => [stateName.toLowerCase(), code])
);
|
|
1122
|
+
// Lower-case country name/alias -> two-letter code. Written grouped by code
// and flattened into the alias-keyed lookup object.
var _COUNTRIES = Object.fromEntries(
  [
    ["US", ["united states", "united states of america", "usa", "us", "u.s.a.", "u.s.", "america"]],
    ["GB", ["united kingdom", "uk", "great britain", "england", "scotland", "wales", "northern ireland"]],
    ["CA", ["canada", "ca"]],
    ["AU", ["australia", "au"]],
    ["DE", ["germany", "deutschland", "de"]],
    ["FR", ["france", "fr"]],
    ["IT", ["italy", "italia", "it"]],
    ["ES", ["spain", "espana", "es"]],
    ["MX", ["mexico", "mx"]],
    ["BR", ["brazil", "brasil", "br"]],
    ["JP", ["japan", "jp"]],
    ["CN", ["china", "cn"]],
    ["IN", ["india", "in"]],
    ["KR", ["south korea", "korea", "kr"]],
    ["NL", ["netherlands", "holland", "nl"]],
    ["SE", ["sweden", "se"]],
    ["NO", ["norway", "no"]],
    ["DK", ["denmark", "dk"]],
    ["CH", ["switzerland", "ch"]],
    ["IE", ["ireland", "ie"]],
    ["NZ", ["new zealand", "nz"]],
    ["SG", ["singapore", "sg"]],
    ["PT", ["portugal", "pt"]],
    ["AR", ["argentina", "ar"]],
    ["CO", ["colombia", "co"]],
    ["PH", ["philippines", "ph"]],
    ["PL", ["poland", "pl"]],
    ["BE", ["belgium", "be"]],
    ["AT", ["austria", "at"]]
  ].flatMap(([code, aliases]) => aliases.map((alias) => [alias, code]))
);
|
|
1198
|
+
// [pattern, replacement] pairs for a leading unit designator:
// "Apt"/"Apartment" and "#" become "Unit ", "Ste"/"Suite" becomes "Ste ".
var _UNIT_PATTERNS = [
  [/^(?:Apt|Apartment)\.?\s+/i, "Unit "],
  [/^(?:Ste|Suite)\.?\s+/i, "Ste "],
  [/^#\s*/i, "Unit "]
];

// One whole-word, case-insensitive, global pattern per full street word,
// paired with its abbreviation from _STREET_ABBREV.
var _ABBREV_PATTERNS = Object.keys(_STREET_ABBREV).map((word) => [
  new RegExp(`\\b${word}\\b`, "gi"),
  _STREET_ABBREV[word]
]);

// One whole-word, case-insensitive, global pattern per abbreviation,
// paired with its full word from _STREET_EXPAND.
var _EXPAND_PATTERNS = Object.keys(_STREET_EXPAND).map((short) => [
  new RegExp(`\\b${short}\\b`, "gi"),
  _STREET_EXPAND[short]
]);
|
|
1209
|
+
/**
 * Applies every `[pattern, abbreviation]` pair of `_ABBREV_PATTERNS`, in
 * order, to each string entry. Non-string entries are handled by
 * `mapStrings3`.
 *
 * @param {Array<*>} values - Column values.
 * @returns {Array<*>} New array of the same length.
 */
function addressStandardize(values) {
  const abbreviate = (text) =>
    _ABBREV_PATTERNS.reduce((current, [pattern, abbr]) => current.replace(pattern, abbr), text);
  return mapStrings3(values, abbreviate);
}
// Register the transform under the name "address_standardize".
registerTransform(
  {
    name: "address_standardize",
    inputTypes: ["address"],
    priority: 50,
    mode: "series"
  },
  addressStandardize
);
|
|
1222
|
+
/**
 * Applies every `[pattern, fullWord]` pair of `_EXPAND_PATTERNS`, in order,
 * to each string entry. Non-string entries are handled by `mapStrings3`.
 *
 * @param {Array<*>} values - Column values.
 * @returns {Array<*>} New array of the same length.
 */
function addressExpand(values) {
  const expand = (text) =>
    _EXPAND_PATTERNS.reduce((current, [pattern, full]) => current.replace(pattern, full), text);
  return mapStrings3(values, expand);
}
// Register the transform under the name "address_expand".
registerTransform(
  {
    name: "address_expand",
    inputTypes: ["address"],
    priority: 50,
    mode: "series"
  },
  addressExpand
);
|
|
1235
|
+
/**
 * Normalises a state value to its two-letter code.
 *
 * A two-character entry that is a key of `_STATES_REVERSE` (after
 * upper-casing) is returned upper-cased; otherwise the trimmed, lower-cased
 * entry is looked up in `_STATES_LOWER`. Entries that match neither are
 * returned unchanged (untrimmed).
 *
 * Only the tables' own keys are consulted. A bare `table[key]` also resolves
 * inherited properties, so "constructor" or "__proto__" used to come back as
 * a function/object instead of the original string.
 *
 * @param {Array<*>} values - Column values.
 * @returns {Array<*>} New array of the same length.
 */
function stateAbbreviate(values) {
  const owns = (table, key) => Object.prototype.hasOwnProperty.call(table, key);
  return mapStrings3(values, (s) => {
    const trimmed = s.trim();
    const upper = trimmed.toUpperCase();
    if (trimmed.length === 2 && owns(_STATES_REVERSE, upper) && _STATES_REVERSE[upper]) {
      return upper;
    }
    const lower = trimmed.toLowerCase();
    if (!owns(_STATES_LOWER, lower)) return s;
    return _STATES_LOWER[lower] ?? s;
  });
}
// Register the transform under the name "state_abbreviate".
registerTransform(
  { name: "state_abbreviate", inputTypes: ["state", "string"], priority: 50, mode: "series" },
  stateAbbreviate
);
|
|
1249
|
+
/**
 * Replaces a string entry with the state name found in `_STATES_REVERSE`
 * under the trimmed, upper-cased entry; entries with no match are returned
 * unchanged. Non-string entries are handled by `mapStrings3`.
 *
 * @param {Array<*>} values - Column values.
 * @returns {Array<*>} New array of the same length.
 */
function stateExpand(values) {
  const expand = (text) => {
    const code = text.trim().toUpperCase();
    const stateName = _STATES_REVERSE[code];
    return stateName ?? text;
  };
  return mapStrings3(values, expand);
}
// Register the transform under the name "state_expand".
registerTransform(
  {
    name: "state_expand",
    inputTypes: ["state", "string"],
    priority: 50,
    mode: "series"
  },
  stateExpand
);
|
|
1258
|
+
/**
 * Normalises ZIP strings: trims, keeps only the text before the first "-",
 * and left-pads all-digit results with zeros to five characters. Results
 * that are not all digits are returned as the trimmed, cut text.
 * Non-string entries are handled by `mapStrings3`.
 *
 * @param {Array<*>} values - Column values.
 * @returns {Array<*>} New array of the same length.
 */
function zipNormalize(values) {
  const allDigits = /^\d+$/;
  return mapStrings3(values, (raw) => {
    const [head] = raw.trim().split("-");
    return allDigits.test(head) ? head.padStart(5, "0") : head;
  });
}
// Register the transform under the name "zip_normalize".
registerTransform(
  {
    name: "zip_normalize",
    inputTypes: ["zip"],
    autoApply: true,
    priority: 55,
    mode: "series"
  },
  zipNormalize
);
|
|
1272
|
+
// "<street>, <city>, <2-letter state> <5-digit zip, optional -4>" as four capture groups.
var _ADDRESS_PATTERN = /^(.+?),\s*(.+?),\s*([A-Za-z]{2})\s+(\d{5}(?:-\d{4})?)$/;

/**
 * Splits a one-line address in `row[column]` into `street`, `city`, `state`
 * and `zip` using `_ADDRESS_PATTERN`, returning new row objects.
 *
 * When the trimmed value does not match, the original (untrimmed) value is
 * placed in `street` and the other three fields are null. Non-string cells
 * give null for all four fields.
 *
 * @param {Array<Object>} rows - Row objects.
 * @param {string} column - Key holding the address.
 * @returns {Array<Object>} Copies of the rows with the four fields added.
 */
function splitAddress(rows, column) {
  return rows.map((row) => {
    const cell = row[column];
    if (typeof cell !== "string") {
      return { ...row, street: null, city: null, state: null, zip: null };
    }
    const match = _ADDRESS_PATTERN.exec(cell.trim());
    if (match === null) {
      return { ...row, street: cell, city: null, state: null, zip: null };
    }
    const [, street, city, state, zip] = match;
    return { ...row, street, city, state, zip };
  });
}
// Register the transform under the name "split_address".
registerTransform(
  {
    name: "split_address",
    inputTypes: ["address"],
    priority: 45,
    mode: "dataframe"
  },
  splitAddress
);
|
|
1290
|
+
/**
 * Replaces a string entry with its two-letter code from `_COUNTRIES` when the
 * trimmed, lower-cased entry is a key of that table; otherwise the entry is
 * returned unchanged.
 *
 * Only the table's own keys are consulted. A bare `_COUNTRIES[lookup]` also
 * resolves inherited properties, so "constructor" or "__proto__" used to come
 * back as a function/object instead of the original string.
 *
 * @param {Array<*>} values - Column values.
 * @returns {Array<*>} New array of the same length.
 */
function countryStandardize(values) {
  const hasOwn = Object.prototype.hasOwnProperty;
  return mapStrings3(values, (s) => {
    const lookup = s.trim().toLowerCase();
    if (!hasOwn.call(_COUNTRIES, lookup)) return s;
    return _COUNTRIES[lookup] ?? s;
  });
}
// Register the transform under the name "country_standardize".
registerTransform(
  { name: "country_standardize", inputTypes: ["country", "string"], priority: 50, mode: "series" },
  countryStandardize
);
|
|
1300
|
+
/**
 * Trims each string entry and then applies every `[pattern, replacement]`
 * pair of `_UNIT_PATTERNS` in order. Non-string entries are handled by
 * `mapStrings3`.
 *
 * @param {Array<*>} values - Column values.
 * @returns {Array<*>} New array of the same length.
 */
function unitNormalize(values) {
  const normalize = (text) =>
    _UNIT_PATTERNS.reduce(
      (current, [pattern, replacement]) => current.replace(pattern, replacement),
      text.trim()
    );
  return mapStrings3(values, normalize);
}
// Register the transform under the name "unit_normalize".
registerTransform(
  {
    name: "unit_normalize",
    inputTypes: ["address", "string"],
    priority: 45,
    mode: "series"
  },
  unitNormalize
);
|
|
1313
|
+
|
|
1314
|
+
// src/core/transforms/dates.ts
|
|
1315
|
+
init_registry();
|
|
1316
|
+
/**
 * Parses a date string into a Date whose UTC fields reflect the date/time as
 * written, or returns null for blank or unparseable input.
 *
 * The date transforms in this file read the result through the `getUTC*`
 * accessors. `new Date(string)` interprets ISO date-only forms
 * ("2024-01-15") as UTC but interprets every other string that carries no
 * zone ("01/15/2024", "Jan 15, 2024", "2024-01-15T10:00:00") in the local
 * time zone. East of UTC that made the UTC fields fall on the previous day.
 * For such zone-less inputs the locally parsed wall-clock fields are copied
 * into a UTC-based Date so the output no longer depends on the machine's
 * time zone. Inputs with an explicit zone (trailing "Z", GMT/UTC/UT, US zone
 * abbreviations, or a numeric offset after a time) keep their true instant.
 *
 * @param {string} val - Text to parse.
 * @returns {Date|null} Parsed date, or null when `val` is blank/invalid.
 */
function _parseDate(val) {
  const trimmed = val.trim();
  if (!trimmed) return null;
  const parsed = new Date(trimmed);
  if (Number.isNaN(parsed.getTime())) return null;

  // ISO year / year-month / year-month-day forms are defined to be UTC.
  const isIsoDateOnly = /^[+-]?\d{4,6}(?:-\d{2}(?:-\d{2})?)?$/.test(trimmed);
  const hasExplicitZone =
    /\d\s*Z$/i.test(trimmed) ||
    /\b(?:GMT|UTC?|[ECMP][SD]T)\b/i.test(trimmed) ||
    /\d{1,2}:\d{2}(?::\d{2}(?:\.\d+)?)?\s*[+-]\d{2}(?::?\d{2})?$/.test(trimmed);
  if (isIsoDateOnly || hasExplicitZone) return parsed;

  // Zone-less input was read as local time: re-express the same wall-clock
  // fields in UTC. setUTCFullYear avoids Date.UTC's two-digit-year remapping.
  const rebased = new Date(0);
  rebased.setUTCFullYear(parsed.getFullYear(), parsed.getMonth(), parsed.getDate());
  rebased.setUTCHours(
    parsed.getHours(),
    parsed.getMinutes(),
    parsed.getSeconds(),
    parsed.getMilliseconds()
  );
  return rebased;
}
|
|
1323
|
+
/**
 * Stringifies `n`, prefixing "0" when `n` is less than 10.
 *
 * @param {number} n - Number to format.
 * @returns {string} e.g. 7 -> "07", 12 -> "12".
 */
function pad(n) {
  if (n < 10) {
    return `0${n}`;
  }
  return String(n);
}
|
|
1326
|
+
// English weekday names indexed by Date#getUTCDay() (0 = Sunday).
var DAY_NAMES = ["Sun", "Mon", "Tues", "Wednes", "Thurs", "Fri", "Satur"].map(
  (stem) => `${stem}day`
);
|
|
1335
|
+
/**
 * Formats each parseable entry as `YYYY-MM-DD` using the UTC fields of the
 * Date returned by `_parseDate`. Null stays null; entries `_parseDate`
 * rejects are returned unchanged.
 *
 * @param {Array<*>} values - Column values (stringified before parsing).
 * @returns {Array<*>} New array of the same length.
 */
function dateIso8601(values) {
  return values.map((entry) => {
    if (entry === null) return null;
    const parsed = _parseDate(String(entry));
    if (!parsed) return entry;
    const year = parsed.getUTCFullYear();
    const month = pad(parsed.getUTCMonth() + 1);
    const day = pad(parsed.getUTCDate());
    return `${year}-${month}-${day}`;
  });
}
// Register the transform under the name "date_iso8601".
registerTransform(
  {
    name: "date_iso8601",
    inputTypes: ["date"],
    autoApply: true,
    priority: 50,
    mode: "series"
  },
  dateIso8601
);
|
|
1348
|
+
/**
 * Formats each parseable entry as `MM/DD/YYYY` using the UTC fields of the
 * Date returned by `_parseDate`. Null stays null; entries `_parseDate`
 * rejects are returned unchanged.
 *
 * @param {Array<*>} values - Column values (stringified before parsing).
 * @returns {Array<*>} New array of the same length.
 */
function dateUs(values) {
  return values.map((entry) => {
    if (entry === null) return null;
    const parsed = _parseDate(String(entry));
    if (!parsed) return entry;
    const month = pad(parsed.getUTCMonth() + 1);
    const day = pad(parsed.getUTCDate());
    const year = parsed.getUTCFullYear();
    return `${month}/${day}/${year}`;
  });
}
// Register the transform under the name "date_us".
registerTransform(
  {
    name: "date_us",
    inputTypes: ["date"],
    priority: 50,
    mode: "series"
  },
  dateUs
);
|
|
1361
|
+
/**
 * Formats each parseable entry as `DD/MM/YYYY` using the UTC fields of the
 * Date returned by `_parseDate`. Null stays null; entries `_parseDate`
 * rejects are returned unchanged.
 *
 * @param {Array<*>} values - Column values (stringified before parsing).
 * @returns {Array<*>} New array of the same length.
 */
function dateEu(values) {
  return values.map((entry) => {
    if (entry === null) return null;
    const parsed = _parseDate(String(entry));
    if (!parsed) return entry;
    const day = pad(parsed.getUTCDate());
    const month = pad(parsed.getUTCMonth() + 1);
    const year = parsed.getUTCFullYear();
    return `${day}/${month}/${year}`;
  });
}
// Register the transform under the name "date_eu".
registerTransform(
  {
    name: "date_eu",
    inputTypes: ["date"],
    priority: 50,
    mode: "series"
  },
  dateEu
);
|
|
1374
|
+
// "date_parse" is registered with the same implementation as "date_iso8601"
// (dateIso8601), under its own name and priority.
registerTransform({ name: "date_parse", inputTypes: ["date"], priority: 55, mode: "series" }, dateIso8601);
|
|
1378
|
+
/**
 * Computes age in whole years for each date-of-birth entry, comparing UTC
 * year/month/day of the parsed date with those of the reference date.
 *
 * The reference is `referenceDate` parsed with `_parseDate` when truthy,
 * otherwise the current time. If the reference cannot be parsed, a shallow
 * copy of `values` is returned. Null stays null; unparseable entries are
 * returned unchanged.
 *
 * @param {Array<*>} values - Column values (stringified before parsing).
 * @param {*} [referenceDate=null] - Date to measure against.
 * @returns {Array<*>} New array of the same length.
 */
function ageFromDob(values, referenceDate = null) {
  const ref = referenceDate ? _parseDate(String(referenceDate)) : /* @__PURE__ */ new Date();
  if (!ref) return values.slice();
  const refYear = ref.getUTCFullYear();
  const refMonth = ref.getUTCMonth();
  const refDay = ref.getUTCDate();
  return values.map((entry) => {
    if (entry === null) return null;
    const dob = _parseDate(String(entry));
    if (!dob) return entry;
    const years = refYear - dob.getUTCFullYear();
    const dobMonth = dob.getUTCMonth();
    // The birthday has not yet occurred in the reference year.
    const birthdayPending =
      refMonth < dobMonth || (refMonth === dobMonth && refDay < dob.getUTCDate());
    return birthdayPending ? years - 1 : years;
  });
}
// Register the transform under the name "age_from_dob".
registerTransform(
  {
    name: "age_from_dob",
    inputTypes: ["date"],
    priority: 40,
    mode: "series"
  },
  ageFromDob
);
|
|
1397
|
+
/**
 * Formats each parseable entry as `YYYY-MM-DDTHH:MM:SS` (no zone suffix)
 * using the UTC fields of the Date returned by `_parseDate`. Null stays
 * null; entries `_parseDate` rejects are returned unchanged.
 *
 * @param {Array<*>} values - Column values (stringified before parsing).
 * @returns {Array<*>} New array of the same length.
 */
function datetimeIso8601(values) {
  return values.map((entry) => {
    if (entry === null) return null;
    const parsed = _parseDate(String(entry));
    if (!parsed) return entry;
    const datePart = [
      parsed.getUTCFullYear(),
      pad(parsed.getUTCMonth() + 1),
      pad(parsed.getUTCDate())
    ].join("-");
    const timePart = [parsed.getUTCHours(), parsed.getUTCMinutes(), parsed.getUTCSeconds()]
      .map((part) => pad(part))
      .join(":");
    return `${datePart}T${timePart}`;
  });
}
// Register the transform under the name "datetime_iso8601".
registerTransform(
  {
    name: "datetime_iso8601",
    inputTypes: ["date"],
    priority: 50,
    mode: "series"
  },
  datetimeIso8601
);
|
|
1409
|
+
/**
 * Maps each parseable entry to its UTC full year (number). Null stays null;
 * entries `_parseDate` rejects are returned unchanged.
 *
 * @param {Array<*>} values - Column values (stringified before parsing).
 * @returns {Array<*>} New array of the same length.
 */
function extractYear(values) {
  return values.map((entry) => {
    if (entry === null) return null;
    const parsed = _parseDate(String(entry));
    if (!parsed) return entry;
    return parsed.getUTCFullYear();
  });
}
|
|
1416
|
+
/**
 * Maps each parseable entry to its UTC month as a number from 1 to 12. Null
 * stays null; entries `_parseDate` rejects are returned unchanged.
 *
 * @param {Array<*>} values - Column values (stringified before parsing).
 * @returns {Array<*>} New array of the same length.
 */
function extractMonth(values) {
  return values.map((entry) => {
    if (entry === null) return null;
    const parsed = _parseDate(String(entry));
    if (!parsed) return entry;
    return parsed.getUTCMonth() + 1;
  });
}
|
|
1423
|
+
/**
 * Maps each parseable entry to its UTC day of the month (number). Null stays
 * null; entries `_parseDate` rejects are returned unchanged.
 *
 * @param {Array<*>} values - Column values (stringified before parsing).
 * @returns {Array<*>} New array of the same length.
 */
function extractDay(values) {
  return values.map((entry) => {
    if (entry === null) return null;
    const parsed = _parseDate(String(entry));
    if (!parsed) return entry;
    return parsed.getUTCDate();
  });
}
|
|
1430
|
+
/**
 * Maps each parseable entry to its calendar quarter (1-4), derived from the
 * UTC month. Null stays null; entries `_parseDate` rejects are returned
 * unchanged.
 *
 * @param {Array<*>} values - Column values (stringified before parsing).
 * @returns {Array<*>} New array of the same length.
 */
function extractQuarter(values) {
  return values.map((entry) => {
    if (entry === null) return null;
    const parsed = _parseDate(String(entry));
    if (!parsed) return entry;
    const zeroBasedQuarter = Math.floor(parsed.getUTCMonth() / 3);
    return zeroBasedQuarter + 1;
  });
}
|
|
1438
|
+
/**
 * Maps each parseable entry to the `DAY_NAMES` element at its UTC weekday
 * index. Null stays null; entries `_parseDate` rejects are returned
 * unchanged.
 *
 * @param {Array<*>} values - Column values (stringified before parsing).
 * @returns {Array<*>} New array of the same length.
 */
function extractDayOfWeek(values) {
  return values.map((entry) => {
    if (entry === null) return null;
    const parsed = _parseDate(String(entry));
    if (!parsed) return entry;
    return DAY_NAMES[parsed.getUTCDay()];
  });
}
|
|
1445
|
+
// Register the five date-part extractors; they share input types, priority and mode.
for (const [name, fn] of [
  ["extract_year", extractYear],
  ["extract_month", extractMonth],
  ["extract_day", extractDay],
  ["extract_quarter", extractQuarter],
  ["extract_day_of_week", extractDayOfWeek]
]) {
  registerTransform({ name, inputTypes: ["date"], priority: 35, mode: "series" }, fn);
}
|
|
1450
|
+
/**
 * Shifts each parseable entry by `days` (24-hour steps; may be negative or
 * fractional) and formats the result as `YYYY-MM-DD` from its UTC fields.
 *
 * A non-number `days` is coerced with `Number`, falling back to 0 when the
 * result is falsy. Null stays null; entries `_parseDate` rejects are
 * returned unchanged.
 *
 * @param {Array<*>} values - Column values (stringified before parsing).
 * @param {number|string} [days=0] - Number of days to add.
 * @returns {Array<*>} New array of the same length.
 */
function dateShift(values, days = 0) {
  const MS_PER_DAY = 864e5;
  const dayCount = typeof days === "number" ? days : Number(days) || 0;
  const offsetMs = dayCount * MS_PER_DAY;
  return values.map((entry) => {
    if (entry === null) return null;
    const parsed = _parseDate(String(entry));
    if (!parsed) return entry;
    const moved = new Date(parsed.getTime() + offsetMs);
    return [moved.getUTCFullYear(), pad(moved.getUTCMonth() + 1), pad(moved.getUTCDate())].join("-");
  });
}
// Register the transform under the name "date_shift".
registerTransform(
  {
    name: "date_shift",
    inputTypes: ["date"],
    priority: 30,
    mode: "series"
  },
  dateShift
);
|
|
1465
|
+
/**
 * Maps each non-null entry to a boolean: true when its stringified, trimmed
 * form is non-empty and `_parseDate` accepts it. Null stays null.
 *
 * @param {Array<*>} values - Column values.
 * @returns {Array<boolean|null>} New array of the same length.
 */
function dateValidate(values) {
  return values.map((entry) => {
    if (entry === null) return null;
    const text = String(entry).trim();
    return text !== "" && _parseDate(text) !== null;
  });
}
// Register the transform under the name "date_validate".
registerTransform(
  {
    name: "date_validate",
    inputTypes: ["date", "string"],
    priority: 60,
    mode: "series"
  },
  dateValidate
);
|
|
1477
|
+
|
|
1478
|
+
// src/core/transforms/email.ts
|
|
1479
|
+
init_registry();
|
|
1480
|
+
// Shape check only: "<no spaces/@>@<no spaces/@>.<no spaces/@>".
var EMAIL_RE = /^[^\s@]+@[^\s@]+\.[^\s@]+$/;

// Domains for which dots in the local part are removed by emailNormalize.
var GMAIL_DOMAINS = /* @__PURE__ */ new Set([
  "gmail.com",
  "googlemail.com"
]);
|
|
1482
|
+
/**
 * Lower-cases every string entry; all other entries (null included) are
 * returned unchanged. No trimming is performed.
 *
 * @param {Array<*>} values - Column values.
 * @returns {Array<*>} New array of the same length.
 */
function emailLowercase(values) {
  return values.map((entry) => (typeof entry === "string" ? entry.toLowerCase() : entry));
}
// Register the transform under the name "email_lowercase".
registerTransform(
  {
    name: "email_lowercase",
    inputTypes: ["email", "string"],
    priority: 55,
    mode: "series"
  },
  emailLowercase
);
|
|
1492
|
+
/**
 * Canonicalises email strings: lower-cases and trims, splits at the last
 * "@", drops everything from the first "+" in the local part, and — when the
 * domain is in `GMAIL_DOMAINS` — removes dots from the local part.
 *
 * A string without "@" is returned lower-cased and trimmed. Non-string
 * entries are returned unchanged.
 *
 * @param {Array<*>} values - Column values.
 * @returns {Array<*>} New array of the same length.
 */
function emailNormalize(values) {
  return values.map((entry) => {
    if (typeof entry !== "string") return entry;
    const cleaned = entry.toLowerCase().trim();
    const at = cleaned.lastIndexOf("@");
    if (at < 0) return cleaned;
    const domain = cleaned.slice(at + 1);
    // Text before the first "+" (the whole local part when there is none).
    const [untagged] = cleaned.slice(0, at).split("+");
    const local = GMAIL_DOMAINS.has(domain) ? untagged.split(".").join("") : untagged;
    return `${local}@${domain}`;
  });
}
// Register the transform under the name "email_normalize".
registerTransform(
  {
    name: "email_normalize",
    inputTypes: ["email"],
    priority: 50,
    mode: "series"
  },
  emailNormalize
);
|
|
1514
|
+
/**
 * Maps each string entry to the lower-cased text after its last "@", or to
 * null when it contains no "@". Non-string entries are returned unchanged.
 *
 * @param {Array<*>} values - Column values.
 * @returns {Array<*>} New array of the same length.
 */
function emailExtractDomain(values) {
  return values.map((entry) => {
    if (typeof entry !== "string") return entry;
    const at = entry.lastIndexOf("@");
    return at < 0 ? null : entry.slice(at + 1).toLowerCase();
  });
}
// Register the transform under the name "email_extract_domain".
registerTransform(
  {
    name: "email_extract_domain",
    inputTypes: ["email"],
    priority: 40,
    mode: "series"
  },
  emailExtractDomain
);
|
|
1526
|
+
/**
 * Maps each string entry to a boolean: whether its trimmed form matches
 * `EMAIL_RE`. Non-string entries are returned unchanged (not converted to a
 * boolean).
 *
 * @param {Array<*>} values - Column values.
 * @returns {Array<*>} New array of the same length.
 */
function emailValidate(values) {
  const looksLikeEmail = (text) => EMAIL_RE.test(text.trim());
  return values.map((entry) => (typeof entry === "string" ? looksLikeEmail(entry) : entry));
}
// Register the transform under the name "email_validate".
registerTransform(
  {
    name: "email_validate",
    inputTypes: ["email", "string"],
    priority: 60,
    mode: "series"
  },
  emailValidate
);
|
|
1536
|
+
|
|
1537
|
+
// src/core/transforms/numeric.ts
|
|
1538
|
+
init_registry();
|
|
1539
|
+
/**
 * Converts currency-like text to a number by deleting every character other
 * than digits, ".", and "-" and parsing what is left.
 *
 * Null stays null and numbers pass through. The original entry is returned
 * when nothing (or only "-") remains after stripping, or when the remainder
 * does not parse as a number.
 *
 * @param {Array<*>} values - Column values.
 * @returns {Array<*>} New array of the same length.
 */
function currencyStrip(values) {
  return values.map((entry) => {
    if (entry === null) return null;
    if (typeof entry === "number") return entry;
    const kept = String(entry).replace(/[^0-9.\-]/g, "");
    const nothingNumeric = kept === "" || kept === "-";
    if (nothingNumeric) return entry;
    const parsed = Number(kept);
    return Number.isNaN(parsed) ? entry : parsed;
  });
}
// Register the transform under the name "currency_strip".
registerTransform(
  {
    name: "currency_strip",
    inputTypes: ["string", "numeric"],
    priority: 50,
    mode: "series"
  },
  currencyStrip
);
|
|
1553
|
+
/**
 * Converts percentages to fractions: numbers are divided by 100; other
 * entries are stringified, stripped of "%" characters, trimmed, parsed and
 * divided by 100.
 *
 * Null stays null. Entries that do not parse are returned unchanged, and so
 * are entries that are blank once "%" is removed. `Number("")` is 0, so
 * without that guard an empty cell (or a lone "%") would be turned into a
 * real 0 instead of being left as missing data, which is how
 * `currencyStrip` already treats blanks.
 *
 * @param {Array<*>} values - Column values.
 * @returns {Array<*>} New array of the same length.
 */
function percentageNormalize(values) {
  return values.map((v) => {
    if (v === null) return null;
    if (typeof v === "number") return v / 100;
    const s = String(v).replace(/%/g, "").trim();
    if (s === "") return v;
    const n = Number(s);
    return Number.isNaN(n) ? v : n / 100;
  });
}
// Register the transform under the name "percentage_normalize".
registerTransform(
  { name: "percentage_normalize", inputTypes: ["string", "numeric"], priority: 50, mode: "series" },
  percentageNormalize
);
|
|
1566
|
+
/**
 * Rounds numeric entries to `n` decimal places using
 * `Math.round(x * 10^n) / 10^n`.
 *
 * Null stays null; entries that do not coerce to a number are returned
 * unchanged.
 *
 * `n` may be a number or a numeric string. The `Number(n) || 2` form
 * treated the string "0" as missing and rounded to 2 places; "0" now means
 * zero decimals. A NaN or infinite `n`, a blank string, or a non-numeric
 * string falls back to the default of 2 instead of producing NaN results.
 *
 * @param {Array<*>} values - Column values.
 * @param {number|string} [n=2] - Number of decimal places.
 * @returns {Array<*>} New array of the same length.
 */
function roundTransform(values, n = 2) {
  const DEFAULT_DECIMALS = 2;
  let decimals = DEFAULT_DECIMALS;
  if (typeof n === "number") {
    if (Number.isFinite(n)) decimals = n;
  } else if (typeof n === "string") {
    const text = n.trim();
    const parsed = Number(text);
    if (text !== "" && Number.isFinite(parsed)) decimals = parsed;
  } else {
    // Other argument types keep the legacy coercion.
    decimals = Number(n) || DEFAULT_DECIMALS;
  }
  const factor = Math.pow(10, decimals);
  return values.map((v) => {
    if (v === null) return null;
    const num = typeof v === "number" ? v : Number(v);
    if (Number.isNaN(num)) return v;
    return Math.round(num * factor) / factor;
  });
}
// Register the transform under the name "round".
registerTransform(
  { name: "round", inputTypes: ["numeric"], priority: 40, mode: "series" },
  roundTransform
);
|
|
1580
|
+
/**
 * Clamps numeric entries into the range [`minVal`, `maxVal`].
 *
 * Null stays null; entries that do not coerce to a number are returned
 * unchanged.
 *
 * Bounds may be numbers or numeric strings. The `Number(maxVal) || 1` form
 * treated the string "0" as missing, so an upper bound of "0" silently
 * became 1; "0" is now honoured. A NaN or infinite bound, a blank string, or
 * a non-numeric string falls back to the default (0 for `minVal`, 1 for
 * `maxVal`).
 *
 * @param {Array<*>} values - Column values.
 * @param {number|string} [minVal=0] - Lower bound.
 * @param {number|string} [maxVal=1] - Upper bound.
 * @returns {Array<*>} New array of the same length.
 */
function clamp(values, minVal = 0, maxVal = 1) {
  const resolveBound = (bound, fallback) => {
    if (typeof bound === "number") return Number.isFinite(bound) ? bound : fallback;
    if (typeof bound === "string") {
      const text = bound.trim();
      const parsed = Number(text);
      return text !== "" && Number.isFinite(parsed) ? parsed : fallback;
    }
    // Other argument types keep the legacy coercion.
    return Number(bound) || fallback;
  };
  const lo = resolveBound(minVal, 0);
  const hi = resolveBound(maxVal, 1);
  return values.map((v) => {
    if (v === null) return null;
    const num = typeof v === "number" ? v : Number(v);
    if (Number.isNaN(num)) return v;
    return Math.min(hi, Math.max(lo, num));
  });
}
// Register the transform under the name "clamp".
registerTransform(
  { name: "clamp", inputTypes: ["numeric"], priority: 40, mode: "series" },
  clamp
);
|
|
1594
|
+
/**
 * Converts entries to integers by coercing with `Number` and truncating
 * toward zero.
 *
 * Null stays null. Entries that do not coerce to a number are returned
 * unchanged, and so are blank (empty or whitespace-only) strings.
 * `Number("")` is 0, so without that guard an empty cell would be turned
 * into a real 0 instead of being left as missing data, which is how
 * `currencyStrip` already treats blanks.
 *
 * @param {Array<*>} values - Column values.
 * @returns {Array<*>} New array of the same length.
 */
function toInteger(values) {
  return values.map((v) => {
    if (v === null) return null;
    if (typeof v === "string" && v.trim() === "") return v;
    const num = Number(v);
    if (Number.isNaN(num)) return v;
    return Math.trunc(num);
  });
}
// Register the transform under the name "to_integer".
registerTransform(
  { name: "to_integer", inputTypes: ["string", "numeric"], priority: 45, mode: "series" },
  toInteger
);
|
|
1606
|
+
/**
 * Replaces each numeric entry with its absolute value. Non-number entries
 * are coerced with `Number` first; null stays null and entries that coerce
 * to NaN are returned unchanged.
 *
 * @param {Array<*>} values - Column values.
 * @returns {Array<*>} New array of the same length.
 */
function absValue(values) {
  return values.map((entry) => {
    if (entry === null) return null;
    const asNumber = typeof entry === "number" ? entry : Number(entry);
    return Number.isNaN(asNumber) ? entry : Math.abs(asNumber);
  });
}
// Register the transform under the name "abs_value".
registerTransform(
  {
    name: "abs_value",
    inputTypes: ["numeric"],
    priority: 40,
    mode: "series"
  },
  absValue
);
|
|
1618
|
+
/**
 * Replaces null entries with 0. Every other entry (including `undefined`)
 * is returned unchanged.
 *
 * @param {Array<*>} values - Column values.
 * @returns {Array<*>} New array of the same length.
 */
function fillZero(values) {
  return values.map((entry) => {
    if (entry !== null) return entry;
    return 0;
  });
}
// Register the transform under the name "fill_zero".
registerTransform(
  {
    name: "fill_zero",
    inputTypes: ["numeric"],
    priority: 35,
    mode: "series"
  },
  fillZero
);
|
|
1625
|
+
/**
 * Parses numbers written with "." as thousands separator and "," as decimal
 * separator (e.g. "1.234,56"): every "." is removed, the first "," becomes
 * ".", and the result is parsed.
 *
 * Null stays null and numbers pass through. Entries that do not parse are
 * returned unchanged, and so are blank (empty or whitespace-only) strings.
 * `Number("")` is 0, so without that guard an empty cell would be turned
 * into a real 0 instead of being left as missing data, which is how
 * `currencyStrip` already treats blanks.
 *
 * @param {Array<*>} values - Column values.
 * @returns {Array<*>} New array of the same length.
 */
function commaDecimal(values) {
  return values.map((v) => {
    if (v === null) return null;
    if (typeof v === "number") return v;
    const s = String(v);
    if (s.trim() === "") return v;
    const converted = s.replace(/\./g, "").replace(",", ".");
    const n = Number(converted);
    return Number.isNaN(n) ? v : n;
  });
}
// Register the transform under the name "comma_decimal".
registerTransform(
  { name: "comma_decimal", inputTypes: ["string", "numeric"], priority: 48, mode: "series" },
  commaDecimal
);
|
|
1639
|
+
/**
 * Converts entries (for example scientific-notation strings such as "1e3")
 * to numbers by coercing with `Number`.
 *
 * Null stays null. Entries that do not coerce to a number are returned
 * unchanged, and so are blank (empty or whitespace-only) strings.
 * `Number("")` is 0, so without that guard an empty cell would be turned
 * into a real 0 instead of being left as missing data, which is how
 * `currencyStrip` already treats blanks.
 *
 * @param {Array<*>} values - Column values.
 * @returns {Array<*>} New array of the same length.
 */
function scientificToDecimal(values) {
  return values.map((v) => {
    if (v === null) return null;
    if (typeof v === "string" && v.trim() === "") return v;
    const n = Number(v);
    return Number.isNaN(n) ? v : n;
  });
}
// Register the transform under the name "scientific_to_decimal".
registerTransform(
  { name: "scientific_to_decimal", inputTypes: ["string", "numeric"], priority: 45, mode: "series" },
  scientificToDecimal
);
|
|
1650
|
+
|
|
1651
|
+
// src/core/transforms/categorical.ts
|
|
1652
|
+
init_registry();
|
|
1653
|
+
// Lower-case spellings recognised as true / false by booleanNormalize.
var TRUTHY = /* @__PURE__ */ new Set([
  "yes",
  "y",
  "1",
  "true",
  "t"
]);
var FALSY = /* @__PURE__ */ new Set([
  "no",
  "n",
  "0",
  "false",
  "f"
]);

/**
 * Maps each entry whose stringified, trimmed, lower-cased form is in
 * `TRUTHY` to true and in `FALSY` to false. Null stays null; anything else
 * is returned unchanged.
 *
 * @param {Array<*>} values - Column values.
 * @returns {Array<*>} New array of the same length.
 */
function booleanNormalize(values) {
  return values.map((entry) => {
    if (entry === null) return null;
    const token = String(entry).trim().toLowerCase();
    if (TRUTHY.has(token)) {
      return true;
    }
    return FALSY.has(token) ? false : entry;
  });
}
// Register the transform under the name "boolean_normalize".
registerTransform(
  {
    name: "boolean_normalize",
    inputTypes: ["boolean", "string"],
    priority: 50,
    mode: "series"
  },
  booleanNormalize
);
|
|
1668
|
+
/**
 * Collapse "male"/"m" to "M" and "female"/"f" to "F" (case-insensitive,
 * surrounding whitespace ignored). Other strings and all non-string values
 * are returned unchanged.
 *
 * @param {Array<*>} values - Column values.
 * @returns {Array<*>} New array of the same length.
 */
function genderStandardize(values) {
  return values.map((raw) => {
    // typeof null is "object", so nulls fall through here untouched as well.
    if (typeof raw !== "string") return raw;
    switch (raw.trim().toLowerCase()) {
      case "male":
      case "m":
        return "M";
      case "female":
      case "f":
        return "F";
      default:
        return raw;
    }
  });
}
|
|
1678
|
+
registerTransform(
|
|
1679
|
+
{ name: "gender_standardize", inputTypes: ["string"], priority: 50, mode: "series" },
|
|
1680
|
+
genderStandardize
|
|
1681
|
+
);
|
|
1682
|
+
// Lower-cased, trimmed spellings that are treated as "no value".
var NULL_VARIANTS = /* @__PURE__ */ new Set([
  "n/a",
  "null",
  "none",
  "na",
  "nil",
  "nan",
  "-",
  ""
]);
/**
 * Turn textual null placeholders ("N/A", "null", "-", blank, ...) into null.
 *
 * Only strings are inspected; they are trimmed and lower-cased before the
 * lookup. Everything else is returned unchanged.
 *
 * @param {Array<*>} values - Column values.
 * @returns {Array<*>} New array of the same length.
 */
function nullStandardize(values) {
  return values.map((raw) => {
    const isText = typeof raw === "string";
    if (isText && NULL_VARIANTS.has(raw.trim().toLowerCase())) return null;
    return raw;
  });
}
|
|
1701
|
+
registerTransform(
|
|
1702
|
+
{ name: "null_standardize", inputTypes: ["string"], autoApply: true, priority: 80, mode: "series" },
|
|
1703
|
+
nullStandardize
|
|
1704
|
+
);
|
|
1705
|
+
/**
 * Replace known variants of a category with its canonical spelling.
 *
 * @param {Array<*>} values - Column values.
 * @param {Object<string, string[]>|null} [mapping] - Canonical name mapped to
 *   its list of accepted variants. The canonical name always matches itself.
 *   When missing or not an object, a shallow copy of `values` is returned.
 * @returns {Array<*>} New array of the same length; values without a match
 *   (and all non-string values) are unchanged.
 */
function categoryStandardize(values, mapping = null) {
  if (!mapping || typeof mapping !== "object") return values.slice();
  // Keys and probed values go through the same normalisation. Previously only
  // the probed value was trimmed, so a variant written with surrounding
  // whitespace could never match.
  const normalize = (text) => String(text).trim().toLowerCase();
  const lookup = /* @__PURE__ */ new Map();
  for (const [canonical, variants] of Object.entries(mapping)) {
    if (Array.isArray(variants)) {
      for (const variant of variants) {
        lookup.set(normalize(variant), canonical);
      }
    }
    lookup.set(normalize(canonical), canonical);
  }
  return values.map((v) => {
    if (v === null) return null;
    if (typeof v !== "string") return v;
    return lookup.get(normalize(v)) ?? v;
  });
}
|
|
1725
|
+
registerTransform(
|
|
1726
|
+
{ name: "category_standardize", inputTypes: ["string"], priority: 45, mode: "series" },
|
|
1727
|
+
categoryStandardize
|
|
1728
|
+
);
|
|
1729
|
+
/**
 * Placeholder for file-based category lookup, which this JS port does not
 * implement. Emits a console warning when a lookup path is supplied and
 * always returns a shallow copy of the input.
 *
 * @param {Array<*>} values - Column values.
 * @param {string|null} [lookupPath] - Path to the lookup file (unused).
 * @returns {Array<*>} Shallow copy of `values`.
 */
function categoryFromFile(values, lookupPath = null) {
  const lookupRequested = Boolean(lookupPath);
  if (lookupRequested) {
    console.warn("[goldenflow] category_from_file is not yet implemented in the JS port \u2014 returning values unchanged");
  }
  return values.slice();
}
|
|
1735
|
+
registerTransform(
|
|
1736
|
+
{ name: "category_from_file", inputTypes: ["string"], priority: 45, mode: "series" },
|
|
1737
|
+
categoryFromFile
|
|
1738
|
+
);
|
|
1739
|
+
|
|
1740
|
+
// src/core/transforms/identifiers.ts
|
|
1741
|
+
init_registry();
|
|
1742
|
+
/**
 * Apply `fn` to every string in `values`; nulls and non-strings pass through.
 *
 * @param {Array<*>} values - Column values.
 * @param {(s: string) => *} fn - Called with each string value only.
 * @returns {Array<*>} New array of the same length.
 */
function mapStrings4(values, fn) {
  // typeof null is "object", so the single check also covers null.
  return values.map((item) => (typeof item === "string" ? fn(item) : item));
}
|
|
1748
|
+
/**
 * Strip every character that is not an ASCII digit.
 *
 * @param {string} val - Input text.
 * @returns {string} Only the digits of `val`, in their original order.
 */
function extractDigits2(val) {
  return val.replace(/[^0-9]+/g, "");
}
|
|
1751
|
+
/**
 * Format strings containing exactly nine digits as "AAA-GG-SSSS".
 * Strings with any other digit count, and non-strings, are left unchanged.
 *
 * @param {Array<*>} values - Column values.
 * @returns {Array<*>} New array of the same length.
 */
function ssnFormat(values) {
  const format = (raw) => {
    const digits = extractDigits2(raw);
    if (digits.length !== 9) return raw;
    const parts = [digits.slice(0, 3), digits.slice(3, 5), digits.slice(5)];
    return parts.join("-");
  };
  return mapStrings4(values, format);
}
|
|
1758
|
+
registerTransform(
|
|
1759
|
+
{ name: "ssn_format", inputTypes: ["ssn", "string"], priority: 50, mode: "series" },
|
|
1760
|
+
ssnFormat
|
|
1761
|
+
);
|
|
1762
|
+
/**
 * Mask strings containing exactly nine digits so only the last four remain
 * visible ("***-**-6789"). Other values are left unchanged.
 *
 * @param {Array<*>} values - Column values.
 * @returns {Array<*>} New array of the same length.
 */
function ssnMask(values) {
  const mask = (raw) => {
    const digits = extractDigits2(raw);
    return digits.length === 9 ? "***-**-" + digits.slice(5) : raw;
  };
  return mapStrings4(values, mask);
}
|
|
1769
|
+
registerTransform(
|
|
1770
|
+
{ name: "ssn_mask", inputTypes: ["ssn", "string"], priority: 50, mode: "series" },
|
|
1771
|
+
ssnMask
|
|
1772
|
+
);
|
|
1773
|
+
/**
 * Format strings containing exactly nine digits as "NN-NNNNNNN".
 * Other values are left unchanged.
 *
 * @param {Array<*>} values - Column values.
 * @returns {Array<*>} New array of the same length.
 */
function einFormat(values) {
  const format = (raw) => {
    const digits = extractDigits2(raw);
    if (digits.length !== 9) return raw;
    const prefix = digits.slice(0, 2);
    const serial = digits.slice(2);
    return prefix + "-" + serial;
  };
  return mapStrings4(values, format);
}
|
|
1780
|
+
registerTransform(
|
|
1781
|
+
{ name: "ein_format", inputTypes: ["ein", "string"], priority: 50, mode: "series" },
|
|
1782
|
+
einFormat
|
|
1783
|
+
);
|
|
1784
|
+
|
|
1785
|
+
// src/core/transforms/url.ts
|
|
1786
|
+
init_registry();
|
|
1787
|
+
/**
 * Apply `fn` to every string in `values`; nulls and non-strings pass through.
 *
 * @param {Array<*>} values - Column values.
 * @param {(s: string) => *} fn - Called with each string value only.
 * @returns {Array<*>} New array of the same length.
 */
function mapStrings5(values, fn) {
  const applyToText = (entry) => {
    // typeof null is "object", so nulls are returned untouched here too.
    if (typeof entry !== "string") return entry;
    return fn(entry);
  };
  return values.map(applyToText);
}
|
|
1793
|
+
// Matches an explicit http:// or https:// prefix. No longer used by
// urlNormalize itself; kept so the module-level name stays defined.
var _SCHEME_RE = /^https?:\/\//i;
// Matches any "<scheme>://" prefix (ftp://, ws://, ...), per the RFC 3986
// scheme grammar.
var _ANY_SCHEME_RE = /^[a-z][a-z0-9+.-]*:\/\//i;
/**
 * Normalise URL strings.
 *
 * For each string value:
 *  - surrounding whitespace is trimmed; blank strings become null;
 *  - "https://" is prepended when the value has no "<scheme>://" prefix
 *    (an existing scheme such as ftp:// is kept, not double-prefixed);
 *  - the scheme and the authority (everything up to the first "/", "?" or
 *    "#") are lower-cased; path, query and fragment keep their case;
 *  - trailing slashes are removed.
 * Non-string values are returned unchanged.
 *
 * @param {Array<*>} values - Column values.
 * @returns {Array<*>} New array of the same length.
 */
function urlNormalize(values) {
  return mapStrings5(values, (s) => {
    let val = s.trim();
    if (!val) return null;
    if (!_ANY_SCHEME_RE.test(val)) {
      val = "https://" + val;
    }
    const schemeEnd = val.indexOf("://") + 3;
    const scheme = val.slice(0, schemeEnd).toLowerCase();
    const rest = val.slice(schemeEnd);
    // The authority ends at the first path, query or fragment delimiter.
    // Splitting on "/" only would lower-case "?Q=A" in "example.com?Q=A".
    const authorityEnd = rest.search(/[/?#]/);
    const domain = (authorityEnd === -1 ? rest : rest.slice(0, authorityEnd)).toLowerCase();
    const tail = authorityEnd === -1 ? "" : rest.slice(authorityEnd);
    // Strip trailing slashes from the tail only, so the "//" of the scheme
    // separator can never be eaten.
    return scheme + domain + tail.replace(/\/+$/, "");
  });
}
|
|
1823
|
+
registerTransform(
|
|
1824
|
+
{ name: "url_normalize", inputTypes: ["url", "string"], priority: 50, mode: "series" },
|
|
1825
|
+
urlNormalize
|
|
1826
|
+
);
|
|
1827
|
+
/**
 * Extract the lower-cased host part of URL strings.
 *
 * The scheme (anything before the first "://") is dropped, then the text up
 * to the first "/", "?" or "#" is returned. Blank strings and URLs with an
 * empty host yield null. Non-string values are returned unchanged.
 *
 * NOTE(review): a port or "user@" prefix is kept as part of the result,
 * exactly as before — confirm whether callers expect a bare hostname.
 *
 * @param {Array<*>} values - Column values.
 * @returns {Array<*>} New array of the same length.
 */
function urlExtractDomain(values) {
  return mapStrings5(values, (s) => {
    let val = s.trim();
    if (!val) return null;
    if (val.includes("://")) {
      val = val.split("://", 2)[1];
    }
    // Stop at the query or fragment too; splitting on "/" alone returned
    // "example.com?x=1" for URLs without a path.
    const domain = val.split(/[/?#]/, 1)[0];
    return domain ? domain.toLowerCase() : null;
  });
}
|
|
1838
|
+
registerTransform(
|
|
1839
|
+
{ name: "url_extract_domain", inputTypes: ["url", "string"], priority: 40, mode: "series" },
|
|
1840
|
+
urlExtractDomain
|
|
1841
|
+
);
|
|
1842
|
+
|
|
1843
|
+
// src/core/transforms/auto-correct.ts
|
|
1844
|
+
init_registry();
|
|
1845
|
+
/**
 * Levenshtein edit distance between two strings, compared per UTF-16 code
 * unit (insert, delete and substitute each cost 1).
 *
 * @param {string} a
 * @param {string} b
 * @returns {number} Minimum number of single-unit edits turning `a` into `b`.
 */
function levenshtein(a, b) {
  const rows = a.length;
  const cols = b.length;
  if (rows === 0) return cols;
  if (cols === 0) return rows;
  // `above` holds the previous DP row; `current` is built left to right.
  let above = Array.from({ length: cols + 1 }, (_, j) => j);
  for (let i = 1; i <= rows; i++) {
    const current = [i];
    for (let j = 1; j <= cols; j++) {
      current[j] = a[i - 1] === b[j - 1]
        ? above[j - 1]
        : 1 + Math.min(above[j - 1], above[j], current[j - 1]);
    }
    above = current;
  }
  return above[cols];
}
|
|
1867
|
+
/**
 * Similarity score in the range 0-100 derived from the Levenshtein distance:
 * 100 * (1 - distance / longerLength). Two empty strings score 100.
 *
 * @param {string} a
 * @param {string} b
 * @returns {number} Similarity percentage.
 */
function fuzzyRatio(a, b) {
  const longest = Math.max(a.length, b.length);
  if (longest === 0) return 100;
  const distance = levenshtein(a, b);
  return 100 * (1 - distance / longest);
}
|
|
1873
|
+
/**
 * Auto-correct rare category spellings towards frequent ones.
 *
 * Strings are grouped case-insensitively. A group whose share of all string
 * values is at least `frequencyThreshold` is "canonical" and is represented by
 * its most common casing. Every other group is replaced by the canonical
 * spelling with the highest fuzzyRatio score, provided that score reaches
 * `matchThreshold`. Values in a canonical group are rewritten to the group's
 * preferred casing. Nulls and non-strings are never changed.
 *
 * @param {Array<*>} values - Column values.
 * @param {number|string} [frequencyThreshold=0.05] - Minimum share (0-1) for a
 *   group to be canonical; non-number input is coerced, falling back to 0.05.
 * @param {number|string} [matchThreshold=85] - Minimum similarity (0-100);
 *   non-number input is coerced, falling back to 85.
 * @returns {Array<*>} New array of the same length.
 */
function categoryAutoCorrect(values, frequencyThreshold = 0.05, matchThreshold = 85) {
  const asNumber = (raw, fallback) => (typeof raw === "number" ? raw : Number(raw) || fallback);
  const freqThresh = asNumber(frequencyThreshold, 0.05);
  const matchThresh = asNumber(matchThreshold, 85);

  // Pass 1: count each lower-cased group and each exact spelling within it.
  const tallies = /* @__PURE__ */ new Map();
  let textCount = 0;
  for (const item of values) {
    if (typeof item !== "string") continue;
    textCount += 1;
    const folded = item.toLowerCase();
    let entry = tallies.get(folded);
    if (entry === void 0) {
      entry = { total: 0, spellings: /* @__PURE__ */ new Map() };
      tallies.set(folded, entry);
    }
    entry.total += 1;
    entry.spellings.set(item, (entry.spellings.get(item) ?? 0) + 1);
  }
  if (textCount === 0) return values.slice();

  // Pass 2: frequent groups become canonical, keyed by lower-cased form.
  const canonicals = /* @__PURE__ */ new Map();
  tallies.forEach((entry, folded) => {
    // Written as a negated >= so a NaN threshold selects nothing.
    if (!(entry.total / textCount >= freqThresh)) return;
    let preferred = folded;
    let preferredCount = 0;
    entry.spellings.forEach((seen, spelling) => {
      if (seen > preferredCount) {
        preferredCount = seen;
        preferred = spelling;
      }
    });
    canonicals.set(folded, preferred);
  });
  if (canonicals.size === 0) return values.slice();

  // Pass 3: match every rare group against the canonical ones.
  const corrections = /* @__PURE__ */ new Map();
  for (const folded of tallies.keys()) {
    if (canonicals.has(folded)) continue;
    let winner = null;
    let winnerScore = 0;
    canonicals.forEach((spelling, canonFolded) => {
      const score = fuzzyRatio(folded, canonFolded);
      if (score >= matchThresh && score > winnerScore) {
        winnerScore = score;
        winner = spelling;
      }
    });
    if (winner !== null) corrections.set(folded, winner);
  }

  // Pass 4: rewrite. The two maps have disjoint keys and hold only strings.
  return values.map((item) => {
    if (typeof item !== "string") return item;
    const folded = item.toLowerCase();
    return corrections.get(folded) ?? canonicals.get(folded) ?? item;
  });
}
|
|
1934
|
+
registerTransform(
|
|
1935
|
+
{ name: "category_auto_correct", inputTypes: ["string"], autoApply: true, priority: 35, mode: "series" },
|
|
1936
|
+
categoryAutoCorrect
|
|
1937
|
+
);
|
|
1938
|
+
|
|
1939
|
+
// src/core/transforms/index.ts
|
|
1940
|
+
init_registry();
|
|
1941
|
+
|
|
1942
|
+
// src/core/engine/transformer.ts
|
|
1943
|
+
init_types();
|
|
1944
|
+
|
|
1945
|
+
// src/core/engine/profiler-bridge.ts
|
|
1946
|
+
init_types();
|
|
1947
|
+
var EMAIL_RE2 = /^[^@\s]+@[^@\s]+\.[^@\s]+$/;
|
|
1948
|
+
var PHONE_RE = /^[+(]?\d[\d()\-.\s]{6,18}\d$/;
|
|
1949
|
+
var DATE_RE = /^(\d{4}[-/]\d{1,2}[-/]\d{1,2}|\d{1,2}[-/]\d{1,2}[-/]\d{2,4}|[A-Za-z]{3,9}\s+\d{1,2},?\s+\d{4})$/;
|
|
1950
|
+
var NAME_RE = /^[A-Z][a-z]+(\s+[A-Z][a-z]+)+$/;
|
|
1951
|
+
var ZIP_RE = /^\d{5}(-\d{4})?$/;
|
|
1952
|
+
// Semantic type mapped to the substrings that, when found in a column name,
// select that type. Entry order matters: the first type with a hit wins.
var NAME_PATTERNS = {
  zip: ["zip", "postal", "zipcode", "zip_code", "postal_code"],
  phone: ["phone", "tel", "mobile", "cell", "fax"],
  email: ["email", "e_mail", "mail"],
  date: ["date", "created", "updated", "timestamp", "dob", "birth"],
  state: ["state", "province", "region"],
  name: ["first_name", "last_name", "fname", "lname", "full_name", "fullname"]
};
/**
 * Refine a generic inferred type using hints in the column name.
 *
 * Only "string" and "numeric" are ever overridden. The column name is
 * lower-cased and "-" is replaced by "_" before a plain substring test against
 * NAME_PATTERNS.
 *
 * @param {string} columnName - Column header.
 * @param {string} currentType - Type inferred from the values.
 * @returns {string} The matching semantic type, or `currentType` if none.
 */
function overrideTypeByColumnName(columnName, currentType) {
  const overridable = currentType === "string" || currentType === "numeric";
  if (!overridable) return currentType;
  const normalized = columnName.toLowerCase().replace(/-/g, "_");
  const hit = Object.entries(NAME_PATTERNS).find(
    ([, hints]) => hints.some((hint) => normalized.includes(hint))
  );
  return hit ? hit[0] : currentType;
}
|
|
1970
|
+
/**
 * Infer a semantic type for a column from its values and name.
 *
 * Decision order:
 *  1. no non-null values gives "string";
 *  2. numbers present and no booleans gives "numeric" (name override allowed);
 *  3. booleans present and no numbers gives "boolean";
 *  4. otherwise the first 100 non-blank trimmed strings are tested against the
 *     email, zip, date, phone and name patterns in that order; the first
 *     pattern whose hit ratio reaches its threshold wins;
 *  5. fallback "string" (name override allowed).
 *
 * @param {Array<*>} values - Column values (null marks missing).
 * @param {string} columnName - Column header, forwarded to
 *   overrideTypeByColumnName.
 * @returns {string} Inferred type name.
 */
function inferType(values, columnName) {
  const present = values.filter((v) => v !== null);
  if (present.length === 0) return "string";

  const sawNumber = present.some((v) => typeof v === "number");
  const sawBoolean = present.some((v) => typeof v === "boolean");
  if (sawNumber && !sawBoolean) return overrideTypeByColumnName(columnName, "numeric");
  if (sawBoolean && !sawNumber) return "boolean";

  const sample = present
    .filter((v) => typeof v === "string")
    .map((text) => text.trim())
    .filter((text) => text !== "")
    .slice(0, 100);
  if (sample.length === 0) return "string";

  // [type, pattern, minimum share of the sample that must match]
  const detectors = [
    ["email", EMAIL_RE2, 0.7],
    ["zip", ZIP_RE, 0.7],
    ["date", DATE_RE, 0.5],
    ["phone", PHONE_RE, 0.6],
    ["name", NAME_RE, 0.5]
  ];
  for (const [typeName, pattern, threshold] of detectors) {
    const hits = sample.filter((text) => pattern.test(text)).length;
    if (hits / sample.length >= threshold) {
      return overrideTypeByColumnName(columnName, typeName);
    }
  }
  return overrideTypeByColumnName(columnName, "string");
}
|
|
2008
|
+
/**
 * Build the profile of a single column: counts, ratios, a small sample and
 * the inferred type.
 *
 * @param {object} data - Table object exposing `rawColumn(name)`.
 * @param {string} columnName - Column to profile.
 * @returns {*} Whatever makeColumnProfile returns for the collected stats.
 */
function profileColumn(data, columnName) {
  const values = data.rawColumn(columnName);
  const present = values.filter((v) => v !== null);
  const rowCount = values.length;
  const nullCount = rowCount - present.length;
  // Uniqueness is measured over non-null values only.
  const uniqueCount = new Set(present).size;
  const sampleValues = present.slice(0, 5).map((v) => String(v));
  const inferredType = inferType(values, columnName);
  // Ratios are relative to all rows; an empty column reports 0, not NaN.
  const shareOfRows = (part) => (rowCount > 0 ? part / rowCount : 0);
  return makeColumnProfile({
    name: columnName,
    inferredType,
    rowCount,
    nullCount,
    nullPct: shareOfRows(nullCount),
    uniqueCount,
    uniquePct: shareOfRows(uniqueCount),
    sampleValues
  });
}
|
|
2036
|
+
/**
 * Profile every column of a row set.
 *
 * @param {Array<object>} rows - Row objects, wrapped in a TabularData.
 * @param {string} [filePath=""] - Source path, echoed back in the result.
 * @returns {{filePath: string, rowCount: number, columnCount: number, columns: Array}}
 *   Table-level counts plus one profileColumn result per column.
 */
function profileDataframe(rows, filePath = "") {
  const table = new TabularData(rows);
  const columns = table.columns.map((name) => profileColumn(table, name));
  const rowCount = table.rowCount;
  const columnCount = table.columns.length;
  return { filePath, rowCount, columnCount, columns };
}
|
|
2046
|
+
|
|
2047
|
+
// src/core/engine/selector.ts
|
|
2048
|
+
var FINDING_TRANSFORM_MAP = {
|
|
2049
|
+
type_inference: ["strip", "to_integer"],
|
|
2050
|
+
nullability: ["null_standardize"],
|
|
2051
|
+
uniqueness: ["strip", "collapse_whitespace", "email_normalize"],
|
|
2052
|
+
format_detection: ["phone_e164", "email_normalize", "date_iso8601", "zip_normalize"],
|
|
2053
|
+
range_distribution: ["clamp"],
|
|
2054
|
+
cardinality: ["category_auto_correct", "category_standardize"],
|
|
2055
|
+
pattern_consistency: ["phone_e164", "date_iso8601", "zip_normalize", "ssn_format"],
|
|
2056
|
+
encoding_detection: ["normalize_unicode", "normalize_quotes", "fix_mojibake"],
|
|
2057
|
+
sequence_detection: ["pad_left"],
|
|
2058
|
+
drift_detection: [],
|
|
2059
|
+
temporal_order: ["date_iso8601", "date_validate"],
|
|
2060
|
+
null_correlation: [],
|
|
2061
|
+
cross_column_validation: ["clamp"],
|
|
2062
|
+
cross_column: ["date_validate", "age_from_dob"]
|
|
2063
|
+
};
|
|
2064
|
+
// Inferred types that also receive transforms registered for plain "string".
var STRING_LIKE_TYPES = /* @__PURE__ */ new Set([
  "string",
  "email",
  "phone",
  "name",
  "address",
  "date"
]);
/**
 * Pick the auto-apply transforms suitable for a profiled column.
 *
 * A transform is selected when it is flagged `autoApply` and either lists the
 * column's inferred type or lists "string" while the inferred type is
 * string-like. "category_auto_correct" is dropped when more than 10% of the
 * rows hold distinct values. The result is ordered by descending priority.
 *
 * @param {{inferredType: string, uniquePct: number}} profile - Column profile.
 * @param {number} [_confidenceThreshold=0.8] - Currently unused.
 * @returns {Array<object>} Selected transform descriptors.
 */
function selectTransforms(profile, _confidenceThreshold = 0.8) {
  const columnType = profile.inferredType;
  const stringLike = STRING_LIKE_TYPES.has(columnType);
  const chosen = listTransforms().filter((t) => {
    if (!t.autoApply) return false;
    if (t.inputTypes.includes(columnType)) return true;
    return t.inputTypes.includes("string") && stringLike;
  });
  const highCardinality = profile.uniquePct > 0.1;
  const eligible = highCardinality
    ? chosen.filter((t) => t.name !== "category_auto_correct")
    : chosen;
  eligible.sort((a, b) => b.priority - a.priority);
  return eligible;
}
|
|
2089
|
+
/**
 * Translate profiler findings into per-column transform suggestions.
 *
 * Each finding's `check` is looked up in FINDING_TRANSFORM_MAP and the
 * resulting transform names are collected under the finding's `column`,
 * de-duplicated in first-seen order. Findings without a column, with an
 * unknown check, or whose check maps to no transforms are ignored.
 *
 * @param {Iterable<object>} findings - Objects with `check` and `column` keys.
 * @returns {Object<string, string[]>} Column name mapped to transform names.
 */
function selectFromFindings(findings) {
  // A Map keyed by column keeps names such as "__proto__" or "constructor"
  // from resolving to members of Object.prototype.
  const byColumn = /* @__PURE__ */ new Map();
  for (const finding of findings) {
    const check = String(finding["check"] ?? "");
    const column = String(finding["column"] ?? "");
    if (!column) continue;
    // Own-property test: a check named "constructor" would otherwise pick up
    // the inherited Object function and crash on the spread/iteration below.
    const transformNames = Object.hasOwn(FINDING_TRANSFORM_MAP, check) ? FINDING_TRANSFORM_MAP[check] : [];
    if (transformNames.length === 0) continue;
    let names = byColumn.get(column);
    if (names === void 0) {
      names = /* @__PURE__ */ new Set();
      byColumn.set(column, names);
    }
    for (const transformName of transformNames) names.add(transformName);
  }
  // Object.fromEntries defines own data properties, so even a "__proto__"
  // column becomes a regular key instead of replacing the prototype.
  return Object.fromEntries(
    [...byColumn].map(([column, names]) => [column, [...names]])
  );
}
|
|
2106
|
+
|
|
2107
|
+
// src/core/engine/transformer.ts
|
|
2108
|
+
/**
 * Applies a GoldenFlow config (or auto-selected transforms when the config
 * lists none) to an array of row objects and records what happened in a
 * manifest.
 *
 * Column existence is always decided by looking at the first row only.
 */
var TransformEngine = class {
  config;
  /**
   * @param {object} [config] - Raw config; normalised through makeConfig.
   */
  constructor(config) {
    this.config = makeConfig(config);
  }
  /**
   * Run the whole pipeline on `rows`.
   *
   * Steps, in order: column transforms (configured ones, otherwise
   * auto-selected), splits, renames, drops, filters, dedup.
   *
   * @param {Array<object>} rows - Input rows; the array itself is not mutated.
   * @param {string} [source="<dataframe>"] - Label stored in the manifest.
   * @returns {{rows: Array<object>, columns: string[], manifest: object}}
   *   Transformed rows, the keys of the first result row, and the manifest.
   */
  transformDf(rows, source = "<dataframe>") {
    const manifest = new MutableManifest(source);
    let currentRows = [...rows];
    if (this.config.transforms.length > 0) {
      currentRows = this._applyConfigTransforms(currentRows, manifest);
    } else {
      currentRows = this._applyAutoTransforms(currentRows, manifest, source);
    }
    for (const split of this.config.splits) {
      if (currentRows.length === 0 || !(split.source in currentRows[0])) continue;
      const info = getTransform(split.method);
      // Only dataframe-mode transforms can add columns; others are skipped.
      if (info && info.mode === "dataframe") {
        currentRows = info.func(currentRows, split.source);
      }
    }
    for (const [oldName, newName] of Object.entries(this.config.renames)) {
      if (currentRows.length === 0 || !(oldName in currentRows[0])) continue;
      currentRows = currentRows.map((row) => {
        const newRow = {};
        for (const [k, v] of Object.entries(row)) {
          newRow[k === oldName ? newName : k] = v;
        }
        return newRow;
      });
    }
    const dropCols = new Set(this.config.drop);
    if (dropCols.size > 0 && currentRows.length > 0) {
      const existingDrops = [...dropCols].filter((c) => c in currentRows[0]);
      if (existingDrops.length > 0) {
        const dropSet = new Set(existingDrops);
        currentRows = currentRows.map((row) => {
          const newRow = {};
          for (const [k, v] of Object.entries(row)) {
            if (!dropSet.has(k)) newRow[k] = v;
          }
          return newRow;
        });
      }
    }
    for (const filt of this.config.filters) {
      if (currentRows.length === 0 || !(filt.column in currentRows[0])) continue;
      currentRows = this._applyFilter(currentRows, filt.column, filt.condition);
    }
    if (this.config.dedup) {
      currentRows = this._applyDedup(currentRows, this.config.dedup, manifest);
    }
    const columns = currentRows.length > 0 ? Object.keys(currentRows[0]) : [];
    return { rows: currentRows, columns, manifest };
  }
  /**
   * Remove rows that repeat an earlier (or, with keep "last", a later)
   * combination of the dedup columns. Adds a "dedup" record to the manifest
   * when at least one row was removed.
   *
   * @param {Array<object>} rows - Current rows.
   * @param {{columns: string[], keep: string}} dedup - Dedup settings.
   * @param {object} manifest - Manifest receiving the record.
   * @returns {Array<object>} Rows in original order without duplicates.
   */
  _applyDedup(rows, dedup, manifest) {
    const dedupCols = dedup.columns.filter(
      (c) => rows.length > 0 && c in rows[0]
    );
    if (dedupCols.length === 0) return rows;
    const keepLast = dedup.keep === "last";
    const before = rows.length;
    const seen = /* @__PURE__ */ new Set();
    const deduped = [];
    // Walking backwards and reversing again keeps the LAST occurrence while
    // preserving the original relative order.
    const iterRows = keepLast ? [...rows].reverse() : rows;
    for (const row of iterRows) {
      const key = this._dedupKey(row, dedupCols);
      if (!seen.has(key)) {
        seen.add(key);
        deduped.push(row);
      }
    }
    if (keepLast) deduped.reverse();
    const after = deduped.length;
    if (before !== after) {
      manifest.addRecord(
        makeTransformRecord({
          column: dedupCols.join(","),
          transform: "dedup",
          affectedRows: before - after,
          totalRows: before
        })
      );
    }
    return deduped;
  }
  /**
   * Build a collision-free identity key for the dedup columns of one row.
   *
   * The previous key, String(v ?? "") joined with "\0", treated null and "",
   * or 1 and "1", as the same value. Tagging each cell with its type and
   * JSON-encoding the list keeps them distinct and makes the separator
   * unambiguous.
   *
   * @param {object} row - Row to key.
   * @param {string[]} columns - Columns taking part in the comparison.
   * @returns {string} Key that is equal only for equal cell tuples.
   */
  _dedupKey(row, columns) {
    return JSON.stringify(
      columns.map((c) => {
        const v = row[c] ?? null;
        return v === null ? null : `${typeof v}:${String(v)}`;
      })
    );
  }
  /**
   * Apply every op of every configured transform spec, in config order.
   * Unknown transform names are reported to the manifest and skipped.
   *
   * @param {Array<object>} rows - Current rows.
   * @param {object} manifest - Manifest receiving records and errors.
   * @returns {Array<object>} Rows after all configured transforms.
   */
  _applyConfigTransforms(rows, manifest) {
    for (const spec of this.config.transforms) {
      if (rows.length === 0 || !(spec.column in rows[0])) continue;
      for (const opRaw of spec.ops) {
        const [name, params] = parseTransformName(opRaw);
        const info = getTransform(name);
        if (!info) {
          manifest.addError(
            spec.column,
            name,
            -1,
            `Transform '${name}' not found in registry`
          );
          continue;
        }
        rows = this._applySingleTransform(rows, spec.column, info, params, manifest);
      }
    }
    return rows;
  }
  /**
   * Profile the rows and apply the transforms selectTransforms picks for each
   * column. The profile is computed once, before any transform runs.
   *
   * @param {Array<object>} rows - Current rows.
   * @param {object} manifest - Manifest receiving records and errors.
   * @param {string} source - Source label; "<dataframe>" means "no file".
   * @returns {Array<object>} Rows after all auto transforms.
   */
  _applyAutoTransforms(rows, manifest, source) {
    const filePath = source !== "<dataframe>" ? source : "";
    const profile = profileDataframe(rows, filePath);
    for (const colProfile of profile.columns) {
      const selected = selectTransforms(colProfile);
      for (const info of selected) {
        rows = this._applySingleTransform(
          rows,
          colProfile.name,
          info,
          [],
          manifest
        );
      }
    }
    return rows;
  }
  /**
   * Run one transform on one column and log the outcome.
   *
   * Dataframe-mode transforms receive (rows, column, ...params) and return
   * new rows. Series-mode transforms receive the column values (undefined
   * becomes null, non-primitive values are stringified) and return either the
   * new values or a pair [newValues, flaggedRowIndexes]; flagged rows are
   * reported as manifest errors. Any exception is caught, recorded with row
   * index -1, and the rows are returned unchanged.
   *
   * @param {Array<object>} rows - Current rows.
   * @param {string} column - Target column.
   * @param {object} info - Transform descriptor (name, mode, func).
   * @param {string[]} params - Raw parameters; converted by castParams.
   * @param {object} manifest - Manifest receiving records and errors.
   * @returns {Array<object>} Rows after the transform.
   */
  _applySingleTransform(rows, column, info, params, manifest) {
    const totalRows = rows.length;
    const beforeSample = rows.slice(0, 3).map((r) => String(r[column] ?? ""));
    try {
      let newRows;
      if (info.mode === "dataframe") {
        newRows = info.func(rows, column, ...castParams(params));
      } else {
        const values = rows.map((r) => {
          const v = r[column];
          if (v === null || v === void 0) return null;
          if (typeof v === "string" || typeof v === "number" || typeof v === "boolean") return v;
          return String(v);
        });
        const typedParams = castParams(params);
        const result = typedParams.length > 0 ? info.func(values, ...typedParams) : info.func(values);
        let newValues;
        if (Array.isArray(result) && result.length === 2 && Array.isArray(result[1])) {
          newValues = result[0];
          const flagged = result[1];
          for (const rowIdx of flagged) {
            manifest.addError(column, info.name, rowIdx, "Flagged for review");
          }
        } else {
          newValues = result;
        }
        newRows = rows.map((row, i) => {
          const oldVal = row[column] ?? null;
          // Untouched rows keep their identity; changed rows are copied.
          if (newValues[i] === oldVal) return row;
          return { ...row, [column]: newValues[i] };
        });
      }
      const afterSample = newRows.slice(0, 3).map((r) => String(r[column] ?? ""));
      let changed = 0;
      for (let i = 0; i < Math.min(rows.length, newRows.length); i++) {
        if (String(rows[i][column] ?? "") !== String(newRows[i][column] ?? "")) {
          changed++;
        }
      }
      manifest.addRecord(
        makeTransformRecord({
          column,
          transform: info.name,
          affectedRows: changed,
          totalRows,
          sampleBefore: beforeSample,
          sampleAfter: afterSample
        })
      );
      return newRows;
    } catch (e) {
      manifest.addError(
        column,
        info.name,
        -1,
        e instanceof Error ? e.message : String(e)
      );
      return rows;
    }
  }
  /**
   * Keep only the rows satisfying `condition` on `column`.
   *
   * Supported conditions: "not_null", "after:<text>" and "before:<text>". The
   * latter two compare the stringified cell with <text> lexicographically.
   * Unknown conditions keep all rows.
   *
   * @param {Array<object>} rows - Current rows.
   * @param {string} column - Column to test.
   * @param {string} condition - Filter expression.
   * @returns {Array<object>} Filtered rows.
   */
  _applyFilter(rows, column, condition) {
    if (condition === "not_null") {
      return rows.filter((r) => r[column] !== null && r[column] !== void 0);
    }
    if (condition.startsWith("after:")) {
      const dateStr = condition.slice(6);
      return rows.filter((r) => String(r[column] ?? "") > dateStr);
    }
    if (condition.startsWith("before:")) {
      const dateStr = condition.slice(7);
      return rows.filter((r) => String(r[column] ?? "") < dateStr);
    }
    return rows;
  }
};
|
|
2302
|
+
/**
 * Convert textual transform parameters to numbers where they ARE numbers.
 *
 * A parameter becomes an integer when it round-trips through parseInt, a
 * float when the whole (trimmed) text is a decimal literal or "Infinity", and
 * otherwise stays as it was. Non-string entries are returned unchanged.
 *
 * @param {Array<*>} params - Raw parameters, normally strings.
 * @returns {Array<*>} Parameters with numeric text converted.
 */
function castParams(params) {
  // parseFloat stops at the first invalid character, so "2024-01-01" used to
  // become 2024 and "12abc" became 12. Require the entire text to be numeric.
  const numericLiteral = /^[+-]?(?:(?:\d+\.?\d*|\.\d+)(?:[eE][+-]?\d+)?|Infinity)$/;
  return params.map((p) => {
    if (typeof p !== "string") return p;
    const asInt = Number.parseInt(p, 10);
    if (!Number.isNaN(asInt) && String(asInt) === p) return asInt;
    const text = p.trim();
    if (numericLiteral.test(text)) return Number.parseFloat(text);
    return p;
  });
}
|
|
2311
|
+
|
|
2312
|
+
// src/core/engine/differ.ts
|
|
2313
|
+
/**
 * Summarise the differences between two row sets.
 *
 * Columns are taken from the first row of each side. For columns present on
 * both sides, cells are compared after stringification (null and undefined
 * count as ""). When the row counts differ, every common column is reported
 * as changed and contributes the row-count difference to `totalChanges`,
 * without per-column details.
 *
 * @param {Array<object>} before - Rows before the transformation.
 * @param {Array<object>} after - Rows after the transformation.
 * @returns {{totalChanges: number, changedColumns: string[], addedColumns: string[],
 *   removedColumns: string[], rowCountBefore: number, rowCountAfter: number,
 *   columnDetails: Object<string, {changedRows: number}>}}
 */
function diffDataframes(before, after) {
  const columnsOf = (rows) => new Set(rows.length > 0 ? Object.keys(rows[0]) : []);
  const beforeCols = columnsOf(before);
  const afterCols = columnsOf(after);
  const addedColumns = [...afterCols].filter((c) => !beforeCols.has(c)).sort();
  const removedColumns = [...beforeCols].filter((c) => !afterCols.has(c)).sort();
  const commonCols = [...beforeCols].filter((c) => afterCols.has(c)).sort();

  const cellText = (row, col) => String(row[col] ?? "");
  const rowDelta = Math.abs(before.length - after.length);
  const changedColumns = [];
  const columnDetails = {};
  let totalChanges = 0;
  for (const col of commonCols) {
    if (rowDelta !== 0) {
      // Rows cannot be paired up; flag the column without a cell-level diff.
      changedColumns.push(col);
      totalChanges += rowDelta;
      continue;
    }
    let changes = 0;
    for (const [i, row] of before.entries()) {
      if (cellText(row, col) !== cellText(after[i], col)) changes += 1;
    }
    if (changes === 0) continue;
    changedColumns.push(col);
    totalChanges += changes;
    columnDetails[col] = { changedRows: changes };
  }
  return {
    totalChanges,
    changedColumns,
    addedColumns,
    removedColumns,
    rowCountBefore: before.length,
    rowCountAfter: after.length,
    columnDetails
  };
}
|
|
2350
|
+
|
|
2351
|
+
// src/core/engine/streaming.ts
|
|
2352
|
+
/**
 * Thin wrapper around TransformEngine for record-at-a-time and chunked use.
 * Counts how many batches went through transformBatch / streamRows.
 */
var StreamProcessor = class {
  engine;
  _batchCount = 0;
  /**
   * @param {object} [config] - Passed straight to the TransformEngine.
   */
  constructor(config) {
    this.engine = new TransformEngine(config);
  }
  /** Transform a single record. Does not count as a batch. */
  transformOne(record) {
    return this.engine.transformDf([record]);
  }
  /** Transform a batch of rows. */
  transformBatch(rows) {
    this._batchCount++;
    return this.engine.transformDf(rows);
  }
  /**
   * Process rows in chunks, yielding TransformResult per chunk.
   *
   * @param {Array<object>} rows - All rows to process.
   * @param {number} [chunkSize=1e4] - Rows per chunk; must be a number >= 1.
   * @throws {RangeError} On the first iteration step when `chunkSize` is not
   *   a number >= 1. Such a value would otherwise never advance the cursor
   *   and the generator would yield forever.
   */
  *streamRows(rows, chunkSize = 1e4) {
    // The negated comparison also rejects NaN.
    if (typeof chunkSize !== "number" || !(chunkSize >= 1)) {
      throw new RangeError(`chunkSize must be a number >= 1, got ${String(chunkSize)}`);
    }
    for (let start = 0; start < rows.length; start += chunkSize) {
      const batch = rows.slice(start, start + chunkSize);
      this._batchCount++;
      yield this.engine.transformDf(batch);
    }
  }
  /** Number of batches handled by transformBatch and streamRows so far. */
  get batchesProcessed() {
    return this._batchCount;
  }
};
|
|
2379
|
+
|
|
2380
|
+
// src/core/config/schema.ts
|
|
2381
|
+
init_types();
|
|
2382
|
+
init_types();
|
|
2383
|
+
/**
 * Coerce a parsed config document into the shape makeConfig expects.
 *
 * Lists that are missing or not arrays become []. Scalar fields are
 * stringified, with missing ones becoming "". A list entry that is null or
 * not an object (for example an empty "-" item in YAML) is treated as an
 * empty object instead of raising a TypeError.
 *
 * @param {object} raw - Parsed config document.
 * @returns {*} Result of makeConfig for the normalised fields.
 */
function validateConfig(raw) {
  const listOf = (value) => (Array.isArray(value) ? value : []);
  const stringsOf = (value) => listOf(value).map(String);
  // Guards property access on null/undefined list entries.
  const recordOf = (entry) => (entry !== null && typeof entry === "object" ? entry : {});
  const recordsOf = (value) => listOf(value).map(recordOf);

  const transforms = recordsOf(raw["transforms"]).map((t) => ({
    column: String(t["column"] ?? ""),
    ops: stringsOf(t["ops"])
  }));
  const splits = recordsOf(raw["splits"]).map((s) => ({
    source: String(s["source"] ?? ""),
    target: stringsOf(s["target"]),
    method: String(s["method"] ?? "")
  }));
  const renames = raw["renames"] && typeof raw["renames"] === "object" ? Object.fromEntries(
    Object.entries(raw["renames"]).map(
      ([k, v]) => [k, String(v)]
    )
  ) : {};
  const drop = stringsOf(raw["drop"]);
  const filters = recordsOf(raw["filters"]).map((f) => ({
    column: String(f["column"] ?? ""),
    condition: String(f["condition"] ?? "")
  }));
  const dedupRaw = raw["dedup"];
  const dedup = dedupRaw && typeof dedupRaw === "object" ? {
    columns: stringsOf(dedupRaw["columns"]),
    // Anything other than the literal "last" falls back to "first".
    keep: dedupRaw["keep"] === "last" ? "last" : "first"
  } : null;
  const mappings = recordsOf(raw["mappings"]).map((m) => ({
    source: String(m["source"] ?? ""),
    target: m["target"],
    transform: m["transform"] ?? null
  }));
  return makeConfig({
    source: raw["source"] != null ? String(raw["source"]) : null,
    output: raw["output"] != null ? String(raw["output"]) : null,
    transforms,
    splits,
    renames,
    drop,
    filters,
    dedup,
    mappings
  });
}
|
|
2425
|
+
|
|
2426
|
+
// src/core/config/loader.ts
|
|
2427
|
+
init_types();
|
|
2428
|
+
// Cached handle to the optional "yaml" package; stays null until a load succeeds.
var yamlModule = null;
/**
 * Returns the `yaml` package, loading it on first use.
 *
 * @returns {object|null} The loaded module, or `null` when it cannot be loaded.
 */
function getYaml() {
  if (!yamlModule) {
    try {
      yamlModule = __require("yaml");
    } catch {
      // Best effort: callers check for null and report a descriptive error.
    }
  }
  return yamlModule;
}
|
|
2437
|
+
/**
 * Parses YAML text into a validated config.
 *
 * @param {string} content - YAML document text.
 * @returns {object} `makeConfig()` for an empty document, otherwise the
 *   result of `validateConfig` on the parsed mapping.
 * @throws {Error} If the `yaml` package is unavailable, or the document's
 *   top-level value is not a mapping.
 */
function loadConfigFromString(content) {
  const yaml = getYaml();
  if (!yaml) {
    throw new Error("yaml package is required for config loading. Install with: npm install yaml");
  }
  const parsed = yaml.parse(content);
  // An empty document parses to null/undefined: use the default config.
  if (parsed === null || parsed === undefined) {
    return makeConfig();
  }
  const isList = Array.isArray(parsed);
  if (isList || typeof parsed !== "object") {
    const got = isList ? "array" : typeof parsed;
    throw new Error(`Config file is not a valid YAML object (got ${got})`);
  }
  return validateConfig(parsed);
}
|
|
2449
|
+
/**
 * Serializes a config to YAML, omitting empty sections.
 *
 * @param {object} config - Config object with the fields written below.
 * @returns {string} YAML text produced by `yaml.stringify`.
 * @throws {Error} If the `yaml` package is unavailable.
 */
function saveConfigToString(config) {
  const yaml = getYaml();
  if (!yaml) {
    throw new Error("yaml package is required for config saving. Install with: npm install yaml");
  }
  // [key, shouldEmit] pairs, in the order the keys appear in the output.
  const sections = [
    ["source", Boolean(config.source)],
    ["output", Boolean(config.output)],
    ["transforms", config.transforms.length > 0],
    ["splits", config.splits.length > 0],
    ["renames", Object.keys(config.renames).length > 0],
    ["drop", config.drop.length > 0],
    ["filters", config.filters.length > 0],
    ["dedup", Boolean(config.dedup)],
    ["mappings", config.mappings.length > 0]
  ];
  const data = {};
  for (const [key, shouldEmit] of sections) {
    if (shouldEmit) data[key] = config[key];
  }
  return yaml.stringify(data);
}
|
|
2466
|
+
/**
 * Combines a file-based config with CLI overrides.
 *
 * @param {object} fileConfig - Base config values.
 * @param {object} cliOverrides - Values that win over `fileConfig` on key clashes.
 * @returns {object} The result of `makeConfig` on the shallow-merged fields.
 */
function mergeConfigs(fileConfig, cliOverrides) {
  // Shallow merge: later spread wins, so CLI values override file values.
  const merged = { ...fileConfig, ...cliOverrides };
  return makeConfig(merged);
}
|
|
2469
|
+
|
|
2470
|
+
// src/core/config/learner.ts
|
|
2471
|
+
init_types();
|
|
2472
|
+
/**
 * Builds a config from data by profiling each column and picking transforms.
 *
 * @param {Array<object>} rows - Row data handed to `profileDataframe`.
 * @param {string} [source=""] - Source label; an empty value is stored as `null`.
 * @returns {object} Config holding one transform entry per column for which
 *   `selectTransforms` returned at least one transform.
 */
function learnConfig(rows, source = "") {
  const { columns } = profileDataframe(rows, source);
  const transforms = columns.flatMap((colProfile) => {
    const chosen = selectTransforms(colProfile);
    // Columns with nothing selected contribute no entry.
    if (chosen.length === 0) return [];
    return [{ column: colProfile.name, ops: chosen.map((t) => t.name) }];
  });
  return makeConfig({ source: source || null, transforms });
}
|
|
2489
|
+
|
|
2490
|
+
// src/core/mapping/name-similarity.ts
|
|
2491
|
+
// Canonical column name -> known alternative spellings.
var ALIASES = {
  first_name: ["fname", "first", "given_name", "first_nm"],
  last_name: ["lname", "last", "surname", "family_name", "last_nm"],
  email: ["email_address", "e_mail", "email_addr", "mail"],
  phone: ["phone_number", "ph", "telephone", "tel", "mobile", "cell"],
  address: ["addr", "street_address", "addr_line_1", "address_line_1"],
  city: ["town", "municipality"],
  state: ["st", "province", "region"],
  zip: ["zipcode", "zip_code", "postal_code", "postal"],
  name: ["full_name", "fullname", "customer_name"],
  created_at: ["signup_date", "signup_dt", "create_date", "date_created"]
};
// Reverse index: lower-cased alias (or canonical name) -> lower-cased canonical name.
var _ALIAS_LOOKUP = /* @__PURE__ */ new Map();
Object.entries(ALIASES).forEach(([canonical, aliases]) => {
  const canonicalKey = canonical.toLowerCase();
  aliases.forEach((alias) => {
    _ALIAS_LOOKUP.set(alias.toLowerCase(), canonicalKey);
  });
  // The canonical name maps to itself so it matches its own aliases.
  _ALIAS_LOOKUP.set(canonicalKey, canonicalKey);
});
|
|
2510
|
+
/**
 * Scores two strings from 0 to 100 using edit distance.
 *
 * The score is `100 * (1 - distance / longerLength)`, where `distance` is the
 * number of single-unit insertions, deletions and substitutions needed to turn
 * `a` into `b`. Strings are compared per indexed element (`a[i]`).
 *
 * @param {string} a - First string.
 * @param {string} b - Second string.
 * @returns {number} 100 for identical inputs, 0 when exactly one is empty.
 */
function fuzzyWRatio(a, b) {
  if (a === b) return 100;
  const lenA = a.length;
  const lenB = b.length;
  if (lenA === 0 || lenB === 0) return 0;

  // Two rolling rows of the distance table: `above` is row i-1, `row` is row i.
  let above = Array.from({ length: lenB + 1 }, (_, j) => j);
  let row = new Array(lenB + 1);
  for (let i = 1; i <= lenA; i++) {
    row[0] = i;
    const unit = a[i - 1];
    for (let j = 1; j <= lenB; j++) {
      const substitute = above[j - 1] + (unit === b[j - 1] ? 0 : 1);
      row[j] = Math.min(above[j] + 1, row[j - 1] + 1, substitute);
    }
    // Swap instead of copying; the stale row is fully overwritten next pass.
    [above, row] = [row, above];
  }
  return 100 * (1 - above[lenB] / Math.max(lenA, lenB));
}
|
|
2528
|
+
/**
 * Scores how alike two column names are, in the range 0 to 1.
 *
 * @param {string} source - Source column name.
 * @param {string} target - Target column name.
 * @returns {number} 1 for a case/whitespace-insensitive exact match, 0.95 when
 *   both names resolve to the same canonical alias, otherwise the
 *   `fuzzyWRatio` score scaled to 0..1.
 */
function nameSimilarity(source, target) {
  const normalize = (name) => name.toLowerCase().trim();
  const left = normalize(source);
  const right = normalize(target);
  if (left === right) {
    return 1;
  }
  const leftCanonical = _ALIAS_LOOKUP.get(left);
  const rightCanonical = _ALIAS_LOOKUP.get(right);
  const sameAliasGroup = Boolean(leftCanonical && rightCanonical && leftCanonical === rightCanonical);
  if (sameAliasGroup) {
    return 0.95;
  }
  return fuzzyWRatio(left, right) / 100;
}
|
|
2537
|
+
|
|
2538
|
+
// src/core/mapping/profile-similarity.ts
|
|
2539
|
+
/**
 * Scores how alike two column profiles are, as a weighted average.
 *
 * Components (weight): equal `inferredType` (0.4), closeness of `nullPct`
 * (0.2), closeness of `uniquePct` (0.2), and the smaller/larger ratio of
 * `uniqueCount` (0.2, scored only when both counts are positive).
 *
 * @param {object} source - Source column profile.
 * @param {object} target - Target column profile.
 * @returns {number} Weighted score divided by the total weight.
 */
function profileSimilarity(source, target) {
  const closeness = (x, y) => Math.max(0, 1 - Math.abs(x - y));
  const bothCounted = source.uniqueCount > 0 && target.uniqueCount > 0;
  const countRatio = bothCounted ? Math.min(source.uniqueCount, target.uniqueCount) / Math.max(source.uniqueCount, target.uniqueCount) : 0;

  // [weight, contribution] pairs, accumulated in this exact order.
  const components = [
    [0.4, source.inferredType === target.inferredType ? 0.4 : 0],
    [0.2, 0.2 * closeness(source.nullPct, target.nullPct)],
    [0.2, 0.2 * closeness(source.uniquePct, target.uniquePct)],
    [0.2, 0.2 * countRatio]
  ];
  let score = 0;
  let weights = 0;
  for (const [weight, contribution] of components) {
    score += contribution;
    weights += weight;
  }
  return weights > 0 ? score / weights : 0;
}
|
|
2557
|
+
|
|
2558
|
+
// src/core/mapping/schema-mapper.ts
|
|
2559
|
+
init_types();
|
|
2560
|
+
/**
 * Suggests source-to-target column mappings by combining name similarity
 * (weight 0.7) with profile similarity (weight 0.3).
 */
var SchemaMapper = class {
  autoThreshold;
  suggestThreshold;
  /**
   * @param {number} [autoThreshold=0.9] - Stored on the instance; not read by `map`.
   * @param {number} [suggestThreshold=0.6] - Minimum combined score for a mapping.
   */
  constructor(autoThreshold = 0.9, suggestThreshold = 0.6) {
    this.autoThreshold = autoThreshold;
    this.suggestThreshold = suggestThreshold;
  }
  /**
   * Greedily pairs each source column with its best-scoring unused target column.
   *
   * Column names are taken from the keys of the first row of each dataset.
   * Source columns are processed in order, and a target is claimed by the
   * first source column that selects it.
   *
   * @param {Array<object>} sourceRows - Rows of the source dataset.
   * @param {Array<object>} targetRows - Rows of the target dataset.
   * @returns {Array<object>} Mappings `{ source, target, confidence, transform }`
   *   with `confidence` rounded to three decimals and `transform` set to null.
   */
  map(sourceRows, targetRows) {
    const indexByName = (profile) => new Map(profile.columns.map((c) => [c.name, c]));
    const firstRowKeys = (rows) => rows.length > 0 ? Object.keys(rows[0]) : [];

    const sourceProfile = profileDataframe(sourceRows);
    const targetProfile = profileDataframe(targetRows);
    const sourceProfiles = indexByName(sourceProfile);
    const targetProfiles = indexByName(targetProfile);
    const sourceCols = firstRowKeys(sourceRows);
    const targetCols = firstRowKeys(targetRows);

    const mappings = [];
    const claimed = /* @__PURE__ */ new Set();
    for (const sCol of sourceCols) {
      let best = null;
      let bestScore = 0;
      for (const tCol of targetCols) {
        if (claimed.has(tCol)) continue;
        const nameScore = nameSimilarity(sCol, tCol);
        const sp = sourceProfiles.get(sCol);
        const tp = targetProfiles.get(tCol);
        // Profile score counts only when both columns were profiled.
        const profileScore = sp && tp ? profileSimilarity(sp, tp) : 0;
        const combined = 0.7 * nameScore + 0.3 * profileScore;
        if (combined > bestScore && combined >= this.suggestThreshold) {
          bestScore = combined;
          best = {
            source: sCol,
            target: tCol,
            confidence: Math.round(combined * 1e3) / 1e3,
            transform: null
          };
        }
      }
      if (best) {
        mappings.push(best);
        claimed.add(best.target);
      }
    }
    return mappings;
  }
  /**
   * Wraps mappings in a config, keeping only source, target and transform.
   *
   * @param {Array<object>} mappings - Mappings as returned by `map`.
   * @returns {object} The result of `makeConfig` with a `mappings` section.
   */
  toConfig(mappings) {
    const entries = mappings.map(({ source, target, transform }) => ({ source, target, transform }));
    return makeConfig({ mappings: entries });
  }
};
|
|
2616
|
+
|
|
2617
|
+
// src/core/domains/index.ts
|
|
2618
|
+
// Domain key -> lazy loader. Each loader defers module initialization to a
// microtask and resolves to that domain's exports object.
var DOMAIN_LOADERS = {
  people_hr: () => Promise.resolve().then(() => (init_people_hr(), people_hr_exports)),
  healthcare: () => Promise.resolve().then(() => (init_healthcare(), healthcare_exports)),
  finance: () => Promise.resolve().then(() => (init_finance(), finance_exports)),
  ecommerce: () => Promise.resolve().then(() => (init_ecommerce(), ecommerce_exports)),
  real_estate: () => Promise.resolve().then(() => (init_real_estate(), real_estate_exports))
};
/**
 * Loads a domain pack by name.
 *
 * The name is lower-cased and `-` / `/` are replaced by `_` before lookup, so
 * "People-HR" and "people/hr" both resolve to `people_hr`.
 *
 * @param {string} name - Domain name.
 * @returns {Promise<*>} The loaded module's `PACK` export, or `null` when the
 *   name is not a registered domain.
 */
async function loadDomain(name) {
  const key = name.toLowerCase().replace(/[-/]/g, "_");
  // Own-property check: a plain `DOMAIN_LOADERS[key]` lookup would also resolve
  // inherited members such as "constructor" or "__proto__".
  if (!Object.prototype.hasOwnProperty.call(DOMAIN_LOADERS, key)) return null;
  const mod = await DOMAIN_LOADERS[key]();
  return mod.PACK;
}
/**
 * Lists the registered domain keys.
 *
 * @returns {string[]} Keys of `DOMAIN_LOADERS`, in declaration order.
 */
function listDomains() {
  return Object.keys(DOMAIN_LOADERS);
}
|
|
2635
|
+
|
|
2636
|
+
// src/core/reporters/json-reporter.ts
|
|
2637
|
+
init_types();
|
|
2638
|
+
/**
 * Serializes a manifest as pretty-printed JSON (two-space indent).
 *
 * @param {object} manifest - A `MutableManifest` (converted via `toDict()`)
 *   or any other value, which is serialized as-is.
 * @returns {string} JSON text.
 */
function manifestToJson(manifest) {
  const payload = manifest instanceof MutableManifest ? manifest.toDict() : manifest;
  return JSON.stringify(payload, null, 2);
}
|
|
2644
|
+
|
|
2645
|
+
// src/core/reporters/console.ts
|
|
2646
|
+
// ANSI SGR escape sequences used by the console reporters.
// Text attributes.
var BOLD = "\x1B[1m";
var DIM = "\x1B[2m";
// Foreground colours.
var RED = "\x1B[31m";
var GREEN = "\x1B[32m";
var YELLOW = "\x1B[33m";
var CYAN = "\x1B[36m";
var MAGENTA = "\x1B[35m";
// Clears every attribute set above.
var RESET = "\x1B[0m";
|
|
2654
|
+
/**
 * Prints a dataset profile to the console as a coloured table.
 *
 * @param {object} profile - Profile with `filePath`, `columns`, `rowCount`
 *   and `columnCount`; each column supplies `name`, `inferredType`,
 *   `nullPct`, `nullCount`, `uniqueCount` and `sampleValues`.
 * @returns {void}
 */
function printProfile(profile) {
  const rule = (width) => "\u2500".repeat(width);
  const headings = ["Column".padEnd(20), "Type".padEnd(12), "Nulls".padEnd(15), "Unique".padEnd(10), "Sample"];

  console.log(`\n${BOLD}Profile: ${profile.filePath || "<dataframe>"}${RESET}\n`);
  console.log(` ${headings.join(" ")}`);
  console.log(` ${[rule(20), rule(12), rule(15), rule(10), rule(20)].join(" ")}`);

  for (const col of profile.columns) {
    const pct = (col.nullPct * 100).toFixed(0);
    const cells = [
      `${CYAN}${col.name.padEnd(20)}${RESET}`,
      `${GREEN}${col.inferredType.padEnd(12)}${RESET}`,
      `${YELLOW}${`${col.nullCount} (${pct}%)`.padEnd(15)}${RESET}`,
      `${MAGENTA}${String(col.uniqueCount).padEnd(10)}${RESET}`,
      // At most three sample values are shown.
      `${DIM}${col.sampleValues.slice(0, 3).join(", ")}${RESET}`
    ];
    console.log(` ${cells.join(" ")}`);
  }

  console.log(`\n${BOLD}${profile.rowCount}${RESET} rows, ${BOLD}${profile.columnCount}${RESET} columns`);
}
|
|
2669
|
+
/**
 * Prints a transform manifest to the console: one table row per applied
 * transform, followed by an error list when errors are present.
 *
 * @param {object} manifest - Manifest with `records` and `errors` arrays.
 * @returns {void}
 */
function printManifest(manifest) {
  const nothingToReport = manifest.records.length === 0 && manifest.errors.length === 0;
  if (nothingToReport) {
    console.log(`${DIM}No transforms applied.${RESET}`);
    return;
  }

  const rule = (width) => "\u2500".repeat(width);
  const headings = ["Column".padEnd(20), "Transform".padEnd(22), "Affected".padEnd(12), "Before".padEnd(20), "After"];
  console.log(`\n${BOLD}Transforms Applied${RESET}\n`);
  console.log(` ${headings.join(" ")}`);
  console.log(` ${[rule(20), rule(22), rule(12), rule(20), rule(20)].join(" ")}`);

  for (const r of manifest.records) {
    // Only the first two samples of each side are shown.
    const before = r.sampleBefore.slice(0, 2).join(", ");
    const after = r.sampleAfter.slice(0, 2).join(", ");
    const cells = [
      `${CYAN}${r.column.padEnd(20)}${RESET}`,
      `${GREEN}${r.transform.padEnd(22)}${RESET}`,
      `${YELLOW}${`${r.affectedRows}/${r.totalRows}`.padEnd(12)}${RESET}`,
      `${DIM}${before.padEnd(20)}${RESET}`,
      `${BOLD}${after}${RESET}`
    ];
    console.log(` ${cells.join(" ")}`);
  }

  if (manifest.errors.length > 0) {
    console.log(`\n${RED}${BOLD}${manifest.errors.length} errors:${RESET}`);
    manifest.errors.forEach((e) => {
      console.log(` ${RED}${e.column}${RESET} / ${e.transform}: ${e.error}`);
    });
  }
}
|
|
2694
|
+
/**
 * Prints a dataframe diff summary to the console.
 *
 * @param {object} diff - Diff with `rowCountBefore`, `rowCountAfter`,
 *   `totalChanges`, and the `addedColumns`, `removedColumns` and
 *   `changedColumns` arrays. Empty column lists are not printed.
 * @returns {void}
 */
function printDiff(diff) {
  console.log(`Rows: ${diff.rowCountBefore} \u2192 ${diff.rowCountAfter}`);
  console.log(`Total changes: ${BOLD}${diff.totalChanges}${RESET}`);
  const sections = [
    ["Added columns", GREEN, diff.addedColumns],
    ["Removed columns", RED, diff.removedColumns],
    ["Changed columns", YELLOW, diff.changedColumns]
  ];
  for (const [label, colour, columns] of sections) {
    if (columns.length) {
      console.log(`${label}: ${colour}${columns.join(", ")}${RESET}`);
    }
  }
}
|
|
2701
|
+
|
|
2702
|
+
// src/core/llm/corrector.ts
|
|
2703
|
+
init_registry();
|
|
2704
|
+
// Column name -> Map(incorrect value -> corrected value).
var _correctionsCache = /* @__PURE__ */ new Map();
/**
 * Counts trimmed, non-empty string values and keeps the most frequent ones.
 *
 * @param {Array<*>} values - Column values; non-strings are ignored.
 * @param {number} [max=30] - Maximum number of distinct values to keep.
 * @returns {Object<string, number>} Value -> count, most frequent first;
 *   ties keep first-seen order.
 */
function getValueSummary(values, max = 30) {
  const tally = /* @__PURE__ */ new Map();
  for (const raw of values) {
    // Non-strings collapse to "" and are skipped together with blank strings.
    const key = typeof raw === "string" ? raw.trim() : "";
    if (key) {
      tally.set(key, (tally.get(key) ?? 0) + 1);
    }
  }
  const ranked = Array.from(tally).sort(([, countA], [, countB]) => countB - countA);
  return Object.fromEntries(ranked.slice(0, max));
}
|
|
2716
|
+
/**
 * Builds the LLM prompt asking for corrections to a column's values.
 *
 * @param {string} columnName - Name of the column being analysed.
 * @param {object} valueSummary - Value -> count map, embedded as indented JSON.
 * @returns {string} The full prompt text.
 */
function buildPrompt(columnName, valueSummary) {
  const frequencies = JSON.stringify(valueSummary, null, 2);
  const lines = [
    "You are a data quality expert. Analyze this column and identify values that appear to be misspellings, abbreviations, or variants of other values in the same column.",
    "",
    `Column name: ${columnName}`,
    "Value frequencies (value: count):",
    frequencies,
    "",
    "For each incorrect value, provide the corrected canonical form. Only include values that need correction. Return JSON object mapping incorrect values to their corrections.",
    "",
    "Example response:",
    '{"actve": "active", "ACTIVE": "active", "pendng": "pending"}',
    "",
    "Return ONLY the JSON object, no other text."
  ];
  return lines.join("\n");
}
|
|
2730
|
+
/**
 * Reads an environment variable where a `process.env` is available.
 *
 * @param {string} key - Variable name.
 * @returns {string|undefined} The value, or `undefined` when the variable is
 *   unset or `process`/`process.env` does not exist.
 */
function envVar(key) {
  const hasEnv = typeof process !== "undefined" && process.env;
  return hasEnv ? process.env[key] : void 0;
}
|
|
2736
|
+
/**
 * Reduces a parsed LLM response to its string-to-string entries.
 *
 * @param {*} parsed - Parsed JSON value.
 * @returns {Object<string, string>} Entries of `parsed` whose values are
 *   strings; `{}` when `parsed` is falsy, an array, or not an object.
 */
function validateCorrections(parsed) {
  const isPlainObject = Boolean(parsed) && typeof parsed === "object" && !Array.isArray(parsed);
  if (!isPlainObject) {
    return {};
  }
  const corrections = {};
  Object.keys(parsed).forEach((key) => {
    const value = parsed[key];
    // Keys from Object.keys are always strings, so only the value needs checking.
    if (typeof value === "string") {
      corrections[key] = value;
    }
  });
  return corrections;
}
|
|
2744
|
+
/**
 * Extracts the JSON payload from an LLM text reply.
 *
 * Handles replies wrapped in a Markdown code fence, and replies with prose
 * around the object, by taking the span from the first "{" to the last "}".
 * Text that already starts with "{" or "[" is returned trimmed but unchanged.
 *
 * @param {string} text - Raw reply text.
 * @returns {string} Text to hand to `JSON.parse`.
 */
function _extractJsonText(text) {
  let body = text.trim();
  const fenced = /^```(?:json)?\s*([\s\S]*?)\s*```$/i.exec(body);
  if (fenced) body = fenced[1];
  if (!body.startsWith("{") && !body.startsWith("[")) {
    const start = body.indexOf("{");
    const end = body.lastIndexOf("}");
    if (start !== -1 && end > start) body = body.slice(start, end + 1);
  }
  return body;
}
/**
 * Asks an LLM which values of a column are variants of other values.
 *
 * Uses Anthropic when `ANTHROPIC_API_KEY` is set, otherwise OpenAI when
 * `OPENAI_API_KEY` is set. Never throws for request, HTTP or parse failures:
 * those are logged with `console.warn` and yield `{}`.
 *
 * @param {string} columnName - Column being analysed.
 * @param {object} valueSummary - Value -> count map for the column.
 * @returns {Promise<Object<string, string>>} Incorrect value -> correction;
 *   `{}` when no key is configured or the request fails.
 */
async function askLlmForCorrections(columnName, valueSummary) {
  const prompt = buildPrompt(columnName, valueSummary);
  const anthropicKey = envVar("ANTHROPIC_API_KEY");
  const openaiKey = envVar("OPENAI_API_KEY");
  try {
    if (anthropicKey) {
      const resp = await fetch("https://api.anthropic.com/v1/messages", {
        method: "POST",
        headers: {
          "Content-Type": "application/json",
          "x-api-key": anthropicKey,
          "anthropic-version": "2023-06-01"
        },
        body: JSON.stringify({
          // NOTE(review): confirm this model ID against Anthropic's published
          // model list; an unknown ID makes every request fail with an API error.
          model: "claude-sonnet-4-5-20250514",
          max_tokens: 1024,
          messages: [{ role: "user", content: prompt }]
        })
      });
      if (!resp.ok) {
        console.warn(`[goldenflow:llm] Anthropic API error: ${resp.status} ${resp.statusText}`);
        return {};
      }
      const data = await resp.json();
      const text = data.content?.[0]?.text ?? "";
      if (!text) {
        console.warn("[goldenflow:llm] Anthropic returned empty response");
        return {};
      }
      // Nothing in this request forces JSON-only output, so unwrap fences/prose first.
      return validateCorrections(JSON.parse(_extractJsonText(text)));
    }
    if (openaiKey) {
      const resp = await fetch("https://api.openai.com/v1/chat/completions", {
        method: "POST",
        headers: {
          "Content-Type": "application/json",
          Authorization: `Bearer ${openaiKey}`
        },
        body: JSON.stringify({
          model: "gpt-4o-mini",
          messages: [{ role: "user", content: prompt }],
          response_format: { type: "json_object" }
        })
      });
      if (!resp.ok) {
        console.warn(`[goldenflow:llm] OpenAI API error: ${resp.status} ${resp.statusText}`);
        return {};
      }
      const data = await resp.json();
      const text = data.choices?.[0]?.message?.content ?? "";
      if (!text) {
        console.warn("[goldenflow:llm] OpenAI returned empty response");
        return {};
      }
      return validateCorrections(JSON.parse(_extractJsonText(text)));
    }
  } catch (e) {
    // Deliberate best effort: network and parse failures degrade to "no corrections".
    console.warn(
      `[goldenflow:llm] LLM correction failed: ${e instanceof Error ? e.message : String(e)}`
    );
  }
  return {};
}
|
|
2807
|
+
/**
 * Fetches LLM corrections for a column and caches any that come back.
 *
 * @param {string} columnName - Column name, also used as the cache key.
 * @param {Array<*>} values - Column values to summarize.
 * @returns {Promise<Object<string, string>>} The corrections; `{}` when the
 *   column has no summarizable values. Empty results are not cached.
 */
async function prepareLlmCorrections(columnName, values) {
  const isEmpty = (obj) => Object.keys(obj).length === 0;
  const summary = getValueSummary(values);
  if (isEmpty(summary)) {
    return {};
  }
  const corrections = await askLlmForCorrections(columnName, summary);
  if (!isEmpty(corrections)) {
    _correctionsCache.set(columnName, new Map(Object.entries(corrections)));
  }
  return corrections;
}
|
|
2816
|
+
/**
 * Applies cached LLM corrections to a column, fetching them first when the
 * column has no cache entry.
 *
 * @param {string} columnName - Column name / cache key.
 * @param {Array<*>} values - Column values.
 * @returns {Promise<Array<*>>} A new array in which each string whose trimmed
 *   form has a correction is replaced; all other values are kept as-is.
 */
async function applyLlmCorrections(columnName, values) {
  if (!_correctionsCache.has(columnName)) {
    await prepareLlmCorrections(columnName, values);
  }
  const lookup = _correctionsCache.get(columnName);
  if (!lookup || lookup.size === 0) {
    return [...values];
  }
  // Lookup uses the trimmed value; an unmatched value is returned untrimmed.
  return values.map((v) => typeof v === "string" ? lookup.get(v.trim()) ?? v : v);
}
|
|
2828
|
+
/**
 * Series transform that applies already-cached LLM corrections.
 *
 * Unlike `applyLlmCorrections`, it never contacts the LLM: without a cache
 * entry for the column the values are returned as a copy.
 *
 * @param {Array<*>} values - Column values.
 * @param {...*} params - The first parameter, when a string, is the cache
 *   key; otherwise the key "__default__" is used.
 * @returns {Array<*>} A new array with corrected strings.
 */
function categoryLlmCorrect(values, ...params) {
  const [first] = params;
  const columnName = typeof first === "string" ? first : "__default__";
  const lookup = _correctionsCache.get(columnName);
  if (!lookup || lookup.size === 0) {
    return [...values];
  }
  return values.map((v) => typeof v === "string" ? lookup.get(v.trim()) ?? v : v);
}
// Registered with autoApply disabled.
registerTransform(
  { name: "category_llm_correct", inputTypes: ["string"], autoApply: false, priority: 34, mode: "series" },
  categoryLlmCorrect
);
|
|
2848
|
+
|
|
2849
|
+
// src/core/notebook.ts
|
|
2850
|
+
/**
 * Escapes a value for safe inclusion in HTML text or attribute content.
 *
 * @param {*} value - Any value; it is stringified first.
 * @returns {string} The text with `& < > " '` replaced by entities.
 */
function _escapeHtml(value) {
  const entities = { "&": "&amp;", "<": "&lt;", ">": "&gt;", '"': "&quot;", "'": "&#39;" };
  return String(value).replace(/[&<>"']/g, (ch) => entities[ch]);
}
/**
 * Renders a transform result as an HTML summary card.
 *
 * Shows row/column/transform/error counts and a table of at most ten
 * transform records. Values originating from the data are HTML-escaped.
 *
 * @param {object} result - Object with `rows`, `columns` and
 *   `manifest` (`records`, `errors`).
 * @returns {string} HTML markup.
 */
function transformResultToHtml(result) {
  const rows = result.rows.length;
  const cols = result.columns.length;
  const transforms = result.manifest.records.length;
  const errors = result.manifest.errors.length;
  let html = `<div style="font-family: monospace; padding: 10px; border: 1px solid #ddd; border-radius: 5px;">
<h3 style="margin: 0 0 10px 0;">GoldenFlow TransformResult</h3>
<table style="border-collapse: collapse; width: 100%;">
<tr><td style="padding: 4px 8px; font-weight: bold;">Rows</td><td>${rows.toLocaleString()}</td></tr>
<tr><td style="padding: 4px 8px; font-weight: bold;">Columns</td><td>${cols}</td></tr>
<tr><td style="padding: 4px 8px; font-weight: bold;">Transforms Applied</td><td>${transforms}</td></tr>
<tr><td style="padding: 4px 8px; font-weight: bold;">Errors</td><td style="color: ${errors ? "red" : "green"};">${errors}</td></tr>
</table>`;
  if (transforms > 0) {
    html += `<h4 style="margin: 10px 0 5px 0;">Transforms</h4>
<table style="border-collapse: collapse; width: 100%; font-size: 0.9em;">
<tr style="background: #f5f5f5;">
<th style="padding: 4px 8px; text-align: left;">Column</th>
<th style="padding: 4px 8px; text-align: left;">Transform</th>
<th style="padding: 4px 8px; text-align: left;">Affected</th>
</tr>`;
    // Only the first ten records are listed; the rest are summarized below.
    for (const r of result.manifest.records.slice(0, 10)) {
      html += `<tr>
<td style="padding: 4px 8px;">${_escapeHtml(r.column)}</td>
<td style="padding: 4px 8px;">${_escapeHtml(r.transform)}</td>
<td style="padding: 4px 8px;">${_escapeHtml(r.affectedRows)}/${_escapeHtml(r.totalRows)}</td>
</tr>`;
    }
    if (transforms > 10) {
      html += `<tr><td colspan="3" style="padding: 4px 8px; color: #888;">... and ${transforms - 10} more</td></tr>`;
    }
    html += "</table>";
  }
  html += "</div>";
  return html;
}
/**
 * Renders a manifest as an HTML table with before/after samples.
 *
 * Values originating from the data (source, column names, transform names,
 * samples) are HTML-escaped.
 *
 * @param {object} manifest - Manifest with `source`, `records` and `errors`.
 * @returns {string} HTML markup.
 */
function manifestToHtml(manifest) {
  let html = `<div style="font-family: monospace; padding: 10px; border: 1px solid #ddd; border-radius: 5px;">
<h3>GoldenFlow Manifest</h3>
<p>Source: ${_escapeHtml(manifest.source)} | Transforms: ${manifest.records.length} | Errors: ${manifest.errors.length}</p>
<table style="border-collapse: collapse; width: 100%; font-size: 0.9em;">
<tr style="background: #f5f5f5;">
<th style="padding: 4px 8px; text-align: left;">Column</th>
<th style="padding: 4px 8px; text-align: left;">Transform</th>
<th style="padding: 4px 8px; text-align: left;">Affected</th>
<th style="padding: 4px 8px; text-align: left;">Before</th>
<th style="padding: 4px 8px; text-align: left;">After</th>
</tr>`;
  for (const r of manifest.records) {
    // Only the first two samples of each side are shown.
    const before = r.sampleBefore.slice(0, 2).join(", ");
    const after = r.sampleAfter.slice(0, 2).join(", ");
    html += `<tr>
<td style="padding: 4px 8px;">${_escapeHtml(r.column)}</td>
<td style="padding: 4px 8px;">${_escapeHtml(r.transform)}</td>
<td style="padding: 4px 8px;">${_escapeHtml(r.affectedRows)}/${_escapeHtml(r.totalRows)}</td>
<td style="padding: 4px 8px; color: #c00;">${_escapeHtml(before)}</td>
<td style="padding: 4px 8px; color: #0a0;">${_escapeHtml(after)}</td>
</tr>`;
  }
  html += "</table></div>";
  return html;
}
/**
 * Renders a dataset profile as an HTML table, one row per column.
 *
 * Column names, inferred types and sample values are HTML-escaped.
 *
 * @param {object} profile - Profile with `rowCount`, `columnCount` and
 *   `columns`.
 * @returns {string} HTML markup.
 */
function profileToHtml(profile) {
  let html = `<div style="font-family: monospace; padding: 10px; border: 1px solid #ddd; border-radius: 5px;">
<h3>GoldenFlow Profile</h3>
<p>${profile.rowCount.toLocaleString()} rows, ${_escapeHtml(profile.columnCount)} columns</p>
<table style="border-collapse: collapse; width: 100%; font-size: 0.9em;">
<tr style="background: #f5f5f5;">
<th style="padding: 4px 8px; text-align: left;">Column</th>
<th style="padding: 4px 8px; text-align: left;">Type</th>
<th style="padding: 4px 8px; text-align: left;">Nulls</th>
<th style="padding: 4px 8px; text-align: left;">Unique</th>
<th style="padding: 4px 8px; text-align: left;">Sample</th>
</tr>`;
  for (const c of profile.columns) {
    const pct = (c.nullPct * 100).toFixed(0);
    html += `<tr>
<td style="padding: 4px 8px;">${_escapeHtml(c.name)}</td>
<td style="padding: 4px 8px;">${_escapeHtml(c.inferredType)}</td>
<td style="padding: 4px 8px;">${_escapeHtml(c.nullCount)} (${pct}%)</td>
<td style="padding: 4px 8px;">${_escapeHtml(c.uniqueCount)}</td>
<td style="padding: 4px 8px; color: #888;">${_escapeHtml(c.sampleValues.slice(0, 3).join(", "))}</td>
</tr>`;
  }
  html += "</table></div>";
  return html;
}
|
|
2938
|
+
|
|
2939
|
+
export { FINDING_TRANSFORM_MAP, MutableManifest, SchemaMapper, StreamProcessor, TabularData, TransformEngine, applyLlmCorrections, diffDataframes, getTransform, isNullish, learnConfig, listDomains, listTransforms, loadConfigFromString, loadDomain, makeColumnProfile, makeConfig, makeManifest, makeTransformRecord, manifestToHtml, manifestToJson, mergeConfigs, nameSimilarity, parseTransformName, prepareLlmCorrections, printDiff, printManifest, printProfile, profileDataframe, profileSimilarity, profileToHtml, registerTransform, registry, saveConfigToString, selectFromFindings, selectTransforms, toColumnValue, transformResultToHtml, validateConfig };
|
|
2940
|
+
//# sourceMappingURL=index.js.map
|
|
2941
|
+
//# sourceMappingURL=index.js.map
|