@forzalabs/remora 0.1.3-nasco.3 → 0.1.4-nasco.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/Constants.js +1 -1
- package/definitions/json_schemas/consumer-schema.json +4 -0
- package/drivers/DriverHelper.js +1 -1
- package/drivers/LocalDriver.js +1 -0
- package/drivers/S3Driver.js +1 -0
- package/engines/consumer/PostProcessor.js +5 -3
- package/engines/dataset/ParallelDataset.js +16 -4
- package/engines/transform/TransformationEngine.js +5 -2
- package/engines/transform/TypeCaster.js +20 -9
- package/engines/validation/Validator.js +0 -3
- package/package.json +2 -1
package/definitions/json_schemas/consumer-schema.json
CHANGED
@@ -464,6 +464,10 @@
           "type": "string",
           "description": "Cast the value to a specific type",
           "enum": ["string", "number", "date", "boolean"]
+        },
+        "format": {
+          "type": "string",
+          "description": "Optional format for date parsing or string formatting (supports tokens: yyyy, mm, dd)"
         }
       },
       "required": ["cast"],
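
The new optional "format" property sits next to "cast" in a field's transformation object. A minimal sketch of a consumer field exercising it (the field key is made up; only "cast", "format", and the yyyy/mm/dd tokens come from the schema above):

    // Hypothetical consumer field definition using the new "format" option
    const field = {
        key: 'birth_date',
        transform: { cast: 'date', format: 'yyyy-mm-dd' }
    };
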
package/drivers/DriverHelper.js
CHANGED
@@ -109,7 +109,7 @@ const DriverHelper = {
             return keys.map(k => parsed[k]).join(delimiter);
         }
         catch (error) {
-            Logger_1.default.log(`Failed parsing
+            Logger_1.default.log(`Failed parsing in JSON line -> file: ${fileKey}; index: ${globalIndex}; line: ${line}; err: ${error === null || error === void 0 ? void 0 : error.name}`);
             throw error;
         }
     }
package/drivers/LocalDriver.js
CHANGED
@@ -143,6 +143,7 @@ class LocalSourceDriver {
         if (fileKey.includes('%')) {
             const allFileKeys = this.listFiles(fileKey);
             Logger_1.default.log(`Matched ${allFileKeys.length} files, copying locally and creating unified dataset.`);
+            Affirm_1.default.hasItems(allFileKeys, `The file key "${fileKey}" doesn't have any matches in path "${this._path}".`);
             // Get header line from the first file
             const headerLine = (yield DriverHelper_1.default.quickReadFile(path_1.default.join(this._path, allFileKeys[0]), 1))[0];
             dataset.setFirstLine(headerLine);
package/drivers/S3Driver.js
CHANGED
@@ -216,6 +216,7 @@ class S3SourceDriver {
         if (fileKey.includes('%')) {
             const allFileKeys = yield this.listFiles(fileKey);
             Logger_1.default.log(`Matched ${allFileKeys.length} files, copying locally and creating unified dataset.`);
+            Affirm_1.default.hasItems(allFileKeys, `The file key "${fileKey}" doesn't have any matches in bucket "${this._bucketName}".`);
             // Get header line from the first file
             const firstFileCommand = new client_s3_1.GetObjectCommand({
                 Bucket: this._bucketName,
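
Both drivers now fail fast when a '%' wildcard key matches no files, instead of blowing up later on allFileKeys[0]. A minimal sketch of the guard's semantics, assuming Affirm.hasItems(list, message) throws when the list is empty (as the call sites imply; this stand-in is not remora's actual implementation):

    // Stand-in for Affirm_1.default.hasItems -- an assumption based on how it is called
    function hasItems(list, message) {
        if (!Array.isArray(list) || list.length === 0)
            throw new Error(message);
    }

    const matches = []; // e.g. listFiles('export_%.csv') found nothing
    hasItems(matches, `The file key "export_%.csv" doesn't have any matches in bucket "my-bucket".`);
    // -> throws here with a descriptive message instead of failing on matches[0] below
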
package/engines/consumer/PostProcessor.js
CHANGED
@@ -214,17 +214,19 @@ class PostProcessorClass {
                 return defaultValue;
             const fieldKey = alias !== null && alias !== void 0 ? alias : key;
             const fieldValue = record.getValue(fieldKey);
+            const fieldType = (_b = (_a = field.dimension) === null || _a === void 0 ? void 0 : _a.type) !== null && _b !== void 0 ? _b : 'string';
             if (Algo_1.default.hasVal(fieldValue) && !isNaN(fieldValue)) {
-                const fieldType = (_b = (_a = field.dimension) === null || _a === void 0 ? void 0 : _a.type) !== null && _b !== void 0 ? _b : 'string';
                 if (fieldType === 'number' && typeof fieldValue === 'string' && fieldValue.length === 0)
                     return (_c = field.cField.default) !== null && _c !== void 0 ? _c : fieldValue;
                 else
                     return fieldValue;
             }
-            else if ((!Algo_1.default.hasVal(fieldValue) || isNaN(fieldValue)) && Algo_1.default.hasVal(field.cField.default))
+            else if ((!Algo_1.default.hasVal(fieldValue) || (isNaN(fieldValue) && fieldType === 'number')) && Algo_1.default.hasVal(field.cField.default)) {
                 return field.cField.default;
-
+            }
+            else {
                 return fieldValue;
+            }
         };
         this.distinct = (dataset) => __awaiter(this, void 0, void 0, function* () {
             (0, Affirm_1.default)(dataset, 'Invalid dataset');
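
The reordering matters because JavaScript's isNaN coerces its argument: isNaN('N/A') is true, so non-numeric strings in string-typed fields previously fell into the default branch. A standalone illustration of the difference (values are made up):

    const fieldValue = 'N/A';    // legitimate string data
    const fieldType = 'string';  // from field.dimension.type

    // Old guard: any non-numeric string triggered the default
    const oldUsesDefault = isNaN(fieldValue);                            // true  -> default wrongly applied

    // New guard: isNaN only counts for number-typed fields
    const newUsesDefault = isNaN(fieldValue) && fieldType === 'number';  // false -> 'N/A' passes through
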
package/engines/dataset/ParallelDataset.js
CHANGED
@@ -20,9 +20,22 @@ const DatasetManager_1 = __importDefault(require("./DatasetManager"));
 const path_1 = __importDefault(require("path"));
 class ParallelDatasetClass {
     constructor() {
+        this.init = () => {
+            /**
+             * I need the init to be called after all the setup has been completed because I need the .env to be loaded
+             */
+            if (!this._filterPool || !this._projectionPool || !this._transformPool) {
+                const workerPath = this._getWorkerPath();
+                this._filterPool = workerpool_1.default.pool(path_1.default.join(workerPath, 'FilterWorker.js'));
+                this._projectionPool = workerpool_1.default.pool(path_1.default.join(workerPath, 'ProjectionWorker.js'));
+                this._transformPool = workerpool_1.default.pool(path_1.default.join(workerPath, 'TransformWorker.js'));
+            }
+        };
         this._getWorkerPath = () => {
             // Get the current file's directory
             const currentDir = __dirname;
+            if (process.env.NODE_ENV === 'dev' || process.env.NODE_ENV === 'development')
+                return path_1.default.resolve('./.build/workers');
             // Check if we're in a published npm package (no .build in path)
             if (!currentDir.includes('.build')) {
                 // We're in the published package, workers are relative to package root
@@ -47,6 +60,7 @@ class ParallelDatasetClass {
         this.filter = (dataset, filters) => __awaiter(this, void 0, void 0, function* () {
             (0, Affirm_1.default)(dataset, `Invalid dataset`);
             (0, Affirm_1.default)(filters, `Invalid filters`);
+            this.init();
             // Distribute the work of the filter among the various workers, trying to have them match the batch size
             const { adjustedWorkerCount, workerCount } = this._scopeWork(dataset);
             dataset._startOperation('filter-parallel', { workerCount });
@@ -84,6 +98,7 @@ class ParallelDatasetClass {
         this.projection = (dataset, consumer) => __awaiter(this, void 0, void 0, function* () {
             (0, Affirm_1.default)(dataset, `Invalid dataset`);
             (0, Affirm_1.default)(consumer, `Invalid consumer`);
+            this.init();
             const { adjustedWorkerCount, workerCount } = this._scopeWork(dataset);
             dataset._startOperation('projection-parallel', { workerCount });
             const threads = [];
@@ -118,6 +133,7 @@ class ParallelDatasetClass {
         this.transform = (dataset, consumer) => __awaiter(this, void 0, void 0, function* () {
             (0, Affirm_1.default)(dataset, `Invalid dataset`);
             (0, Affirm_1.default)(consumer, `Invalid consumer`);
+            this.init();
             const { adjustedWorkerCount, workerCount } = this._scopeWork(dataset);
             dataset._startOperation('transform-parallel', { workerCount });
             const threads = [];
@@ -149,10 +165,6 @@ class ParallelDatasetClass {
             dataset._finishOperation('transform-parallel');
             return dataset;
         });
-        const workerPath = this._getWorkerPath();
-        this._filterPool = workerpool_1.default.pool(path_1.default.join(workerPath, 'FilterWorker.js'));
-        this._projectionPool = workerpool_1.default.pool(path_1.default.join(workerPath, 'ProjectionWorker.js'));
-        this._transformPool = workerpool_1.default.pool(path_1.default.join(workerPath, 'TransformWorker.js'));
     }
 }
 const ParallelDataset = new ParallelDatasetClass();
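
Pool creation moved out of the constructor into an idempotent init() invoked at the top of filter/projection/transform, so workerpool spins up only after dotenv has populated process.env. The pattern in isolation, as a sketch (the class and method names here are illustrative, not remora's):

    const workerpool = require('workerpool');
    const path = require('path');

    class LazyPools {
        init() {
            // First call creates the pool; later calls are no-ops.
            if (!this._filterPool)
                this._filterPool = workerpool.pool(path.join(__dirname, 'FilterWorker.js'));
        }
        async filter(dataset) {
            this.init(); // cheap to call on every entry point
            // ...dispatch batches to this._filterPool.exec(...)...
        }
    }
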
package/engines/transform/TransformationEngine.js
CHANGED
@@ -73,9 +73,12 @@ class TransformationEngineClass {
         }
         // Single transformation
         if ('cast' in transformations) {
-            const
-
+            const { cast, format } = transformations;
+            const casted = TypeCaster_1.default.cast(value, cast, format);
+            if (cast === 'number' && isNaN(casted))
                 throw new Error(`Cannot cast non-numeric value in field '${field.key}'`);
+            if (cast === 'date' && casted instanceof Date && isNaN(casted.getTime()))
+                throw new Error(`Cannot cast value to date in field '${field.key}'`);
             return casted;
         }
         if ('multiply' in transformations) {
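
With these checks, a failed cast now surfaces as an error naming the field rather than a NaN or Invalid Date leaking downstream. What trips each check, as a sketch ('amount' and 'created_at' are made-up field keys):

    Number('12.50');                 // 12.5 -> passes the cast: 'number' check
    Number('abc');                   // NaN  -> "Cannot cast non-numeric value in field 'amount'"
    new Date('nope').getTime();      // NaN  -> "Cannot cast value to date in field 'created_at'"
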
package/engines/transform/TypeCaster.js
CHANGED
@@ -4,34 +4,45 @@ var __importDefault = (this && this.__importDefault) || function (mod) {
 };
 Object.defineProperty(exports, "__esModule", { value: true });
 const Algo_1 = __importDefault(require("../../core/Algo"));
+const dayjs_1 = __importDefault(require("dayjs"));
+const customParseFormat_1 = __importDefault(require("dayjs/plugin/customParseFormat"));
+const utc_1 = __importDefault(require("dayjs/plugin/utc"));
+dayjs_1.default.extend(customParseFormat_1.default);
+dayjs_1.default.extend(utc_1.default);
 class TypeCasterClass {
     /**
      * Casts the value to the requested type (only if needed)
+     * Optional format parameter currently supports:
+     * - Parsing dates (type 'date'/'datetime') from string with tokens: yyyy, mm, dd
+     * - Formatting dates when casting to string with same tokens
      */
-    cast(value, type) {
+    cast(value, type, format) {
         if (!Algo_1.default.hasVal(value))
             return value;
         switch (type) {
             case 'boolean': {
                 if (typeof value === 'boolean')
                     return value;
-
-                return Boolean(value);
+                return Boolean(value);
             }
             case 'datetime':
-            case 'date':
-
+            case 'date': {
+                let dateValue = null;
+                if (format && typeof value === 'string')
+                    dateValue = dayjs_1.default.utc(value, format, true).toDate();
+                else
+                    dateValue = new Date(value);
+                return dateValue.toISOString();
+            }
             case 'number': {
                 if (typeof value === 'number')
                     return value;
-
-                return Number(value);
+                return Number(value);
             }
             case 'string': {
                 if (typeof value === 'string')
                     return value;
-
-                return String(value);
+                return String(value);
             }
         }
     }
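
The date branch defers to dayjs strict UTC parsing when a format is supplied. A self-contained sketch of that behavior; note that dayjs's own customParseFormat tokens are uppercase (YYYY/MM/DD) while the schema documents lowercase yyyy/mm/dd, and whether remora maps between the two is not visible in this diff:

    const dayjs = require('dayjs');
    const customParseFormat = require('dayjs/plugin/customParseFormat');
    const utc = require('dayjs/plugin/utc');
    dayjs.extend(customParseFormat);
    dayjs.extend(utc);

    // Third argument true = strict: the input must match the format exactly.
    dayjs.utc('2024-03-07', 'YYYY-MM-DD', true).toDate().toISOString(); // '2024-03-07T00:00:00.000Z'
    dayjs.utc('07/03/2024', 'YYYY-MM-DD', true).isValid();              // false -> would become an Invalid Date
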
package/engines/validation/Validator.js
CHANGED
@@ -154,9 +154,6 @@ class ValidatorClass {
         else
             trxToValidate.push(field.transform);
         for (const trans of trxToValidate) {
-            const trxKeys = Object.keys(trans);
-            if (trxKeys.length !== 1)
-                errors.push(`There can only be 1 transformation type in your transformation pipeline. Field "${field.key}" got ${trxKeys.length}.`);
             if ('combine_fields' in trans) {
                 const { combine_fields } = trans;
                 if (!combine_fields.fields || combine_fields.fields.length === 0)
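
Dropping the one-key-per-object rule is what lets "format" accompany "cast" in a single transformation, which the old check would have counted as two types. For example, this now validates (hypothetical field):

    // Two keys, one logical transformation -- previously rejected with
    // "There can only be 1 transformation type in your transformation pipeline."
    const transform = { cast: 'date', format: 'yyyy-mm-dd' };
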
package/package.json
CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "@forzalabs/remora",
-  "version": "0.1.3-nasco.3",
+  "version": "0.1.4-nasco.3",
   "description": "A powerful CLI tool for seamless data translation.",
   "main": "index.js",
   "private": false,
@@ -43,6 +43,7 @@
     "chalk": "^4.1.2",
     "commander": "^10.0.0",
     "cross-env": "^7.0.3",
+    "dayjs": "^1.11.13",
     "dotenv": "^16.0.3",
     "fast-xml-parser": "^5.2.3",
     "fs-extra": "^11.1.0",