@forzalabs/remora 0.1.3-nasco.3 → 0.1.5-nasco.3
This diff shows the changes between publicly released versions of the package, as they appear in their public registry. The information is provided for informational purposes only.
- package/Constants.js +1 -1
- package/definitions/json_schemas/consumer-schema.json +9 -1
- package/definitions/json_schemas/producer-schema.json +2 -1
- package/definitions/json_schemas/source-schema.json +14 -1
- package/documentation/README.md +1 -0
- package/documentation/default_resources/consumer.json +7 -7
- package/drivers/DeltaShareDriver.js +178 -0
- package/drivers/DriverFactory.js +6 -0
- package/drivers/DriverHelper.js +16 -1
- package/drivers/LocalDriver.js +1 -0
- package/drivers/S3Driver.js +1 -0
- package/engines/ai/DeveloperEngine.js +90 -1
- package/engines/consumer/ConsumerEngine.js +1 -1
- package/engines/consumer/PostProcessor.js +27 -18
- package/engines/dataset/Dataset.js +18 -7
- package/engines/dataset/DatasetManager.js +58 -12
- package/engines/dataset/DatasetRecord.js +17 -4
- package/engines/dataset/ParallelDataset.js +29 -7
- package/engines/execution/ExecutionEnvironment.js +13 -4
- package/engines/execution/ExecutionPlanner.js +2 -1
- package/engines/file/FileCompiler.js +2 -1
- package/engines/file/FileExporter.js +12 -3
- package/engines/parsing/ParseManager.js +7 -2
- package/engines/producer/ProducerEngine.js +4 -2
- package/engines/transform/JoinEngine.js +10 -6
- package/engines/transform/TransformationEngine.js +35 -3
- package/engines/transform/TypeCaster.js +20 -9
- package/engines/usage/UsageDataManager.js +110 -0
- package/engines/validation/Validator.js +0 -3
- package/package.json +3 -1
- package/workers/FilterWorker.js +3 -3
- package/workers/ProjectionWorker.js +3 -3
- package/workers/TransformWorker.js +3 -3
package/engines/dataset/Dataset.js

@@ -35,7 +35,7 @@ const Helper_1 = __importDefault(require("../../helper/Helper"));
 const Algo_1 = __importDefault(require("../../core/Algo"));
 const Environment_1 = __importDefault(require("../Environment"));
 class Dataset {
-    constructor(name, file, batchSize) {
+    constructor(name, file, batchSize, executionId) {
         var _a;
         this.getPath = () => this._path;
         this.setPath = (path) => {
@@ -43,6 +43,7 @@ class Dataset {
             return this;
         };
         this.getFile = () => this._file;
+        this.getExecutionId = () => this._executionId;
         this.getBatchSize = () => this._batchSize;
         this.setFirstLine = (firstLine) => {
             this._firstLine = firstLine;
@@ -170,7 +171,7 @@ class Dataset {
                 }
             }
             catch (error) {
-                console.warn(`Error parsing line ${lineCount}: ${error}`);
+                console.warn(`Error parsing line ${line}\n${lineCount}: ${error}`);
             }
         }
     }
@@ -536,7 +537,7 @@ class Dataset {
                 }
             }
             catch (error) {
-                console.warn(`Error parsing line ${lineCount}: ${error}`);
+                console.warn(`Error parsing line ${line}\n${lineCount}: ${error}`);
             }
         }
     }
@@ -627,7 +628,7 @@ class Dataset {
                 }
             }
             catch (error) {
-                console.warn(`Error parsing line ${lineCount}: ${error}`);
+                console.warn(`Error parsing line ${line}\n${lineCount}: ${error}`);
             }
         }
     }
@@ -703,10 +704,18 @@ class Dataset {
             return this;
         });
         this.getDimensions = () => this._dimensions;
-        this.
+        this.setDimensions = (dimensions) => {
             this._dimensions = dimensions;
             return this;
         };
+        this.setSingleDimension = (newDimension, oldDimension) => {
+            (0, Affirm_1.default)(newDimension, `Invalid new dimension`);
+            (0, Affirm_1.default)(oldDimension, 'Invalid old dimension');
+            const current = this._dimensions.findIndex(x => x.index === oldDimension.index);
+            (0, Affirm_1.default)(current, `Trying to updata a dataset dimension that doesn't exist: ${oldDimension.name} index ${oldDimension.index}`);
+            this._dimensions.splice(current, 1, newDimension);
+            return this;
+        };
         /**
          * Update the record pool when dimensions change
         */
@@ -829,6 +838,7 @@ class Dataset {
         this._computeSize = () => fs_1.default.statSync(this._path).size / (1024 * 1024);
         this.name = name;
         this._file = file;
+        this._executionId = executionId;
         this._batchSize = (_a = batchSize !== null && batchSize !== void 0 ? batchSize : parseInt(Environment_1.default.get('MAX_ITEMS_IN_MEMORY'))) !== null && _a !== void 0 ? _a : Constants_1.default.defaults.MAX_ITEMS_IN_MEMORY;
         this._dimensions = [];
         this._firstLine = '';
@@ -843,8 +853,9 @@ class Dataset {
             .replace(/_{2,}/g, '_')
             .replace(/^_+|_+$/g, '')
             .toLowerCase();
-
-        this.
+        const execFolder = executionId ? path_1.default.join(datasetName, executionId) : datasetName;
+        this._path = path_1.default.join('./remora', Constants_1.default.defaults.PRODUCER_TEMP_FOLDER, execFolder, '.dataset');
+        this._tempPath = path_1.default.join('./remora/', Constants_1.default.defaults.PRODUCER_TEMP_FOLDER, execFolder, '.dataset_tmp');
         this.ensureFile(this._path);
     }
 }
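For orientation, the practical effect of the new executionId argument is on the temp file layout: when an id is passed, the dataset files land in a per-execution sub-folder, so two concurrent runs of the same producer no longer share a path. A minimal sketch of the derivation (the PRODUCER_TEMP_FOLDER value here is assumed for illustration):

const path = require('path');

// Stand-in for Constants.defaults.PRODUCER_TEMP_FOLDER; the real value lives in Constants.js.
const PRODUCER_TEMP_FOLDER = 'producer_tmp';

// Mirrors the path logic added to the Dataset constructor (not the package API itself).
function datasetPath(datasetName, executionId) {
    const execFolder = executionId ? path.join(datasetName, executionId) : datasetName;
    return path.join('./remora', PRODUCER_TEMP_FOLDER, execFolder, '.dataset');
}

console.log(datasetPath('orders'));         // remora/producer_tmp/orders/.dataset
console.log(datasetPath('orders', 'a1b2')); // remora/producer_tmp/orders/a1b2/.dataset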
package/engines/dataset/DatasetManager.js

@@ -21,9 +21,15 @@ const ParseManager_1 = __importDefault(require("../parsing/ParseManager"));
 const Dataset_1 = __importDefault(require("./Dataset"));
 const promises_1 = require("stream/promises");
 const fs_1 = require("fs");
+const DeveloperEngine_1 = __importDefault(require("../ai/DeveloperEngine"));
 class DatasetManagerClass {
     constructor() {
-
+        /**
+         * Create a new Dataset for a producer. If an executionId is provided, the dataset files will
+         * be isolated inside a sub-folder specific to that execution to avoid concurrency conflicts
+         * when the same producer / consumer is executed multiple times in parallel.
+         */
+        this.create = (producer, executionId) => {
            (0, Affirm_1.default)(producer, 'Invalid producer');
            const { name, settings: { delimiter, fileKey, fileType, hasHeaderRow, sheetName } } = producer;
            const dataset = new Dataset_1.default(name, {
@@ -32,7 +38,7 @@ class DatasetManagerClass {
                hasHeaderRow,
                sheetName,
                delimiter
-            });
+            }, undefined, executionId);
            return dataset;
        };
        this.buildDimensions = (dataset_1, producer_1, ...args_1) => __awaiter(this, [dataset_1, producer_1, ...args_1], void 0, function* (dataset, producer, discover = false) {
@@ -43,7 +49,7 @@ class DatasetManagerClass {
            return this.buildDimensionsFromFirstLine(firstLine, dataset.getFile(), producer, discover);
        });
        this.buildDimensionsFromFirstLine = (firstLine_1, dsFile_1, producer_1, ...args_1) => __awaiter(this, [firstLine_1, dsFile_1, producer_1, ...args_1], void 0, function* (firstLine, dsFile, producer, discover = false) {
-            var _a, _b, _c, _d, _e, _f;
+            var _a, _b, _c, _d, _e, _f, _g, _h;
            (0, Affirm_1.default)(firstLine, `Invalid first line`);
            (0, Affirm_1.default)(dsFile, `Invalid dataset file`);
            (0, Affirm_1.default)(producer, `Invalid producer`);
@@ -54,10 +60,17 @@ class DatasetManagerClass {
                const headerLine = firstLine;
                const rawDimensions = ParseManager_1.default._extractHeader(headerLine, delimiterChar, producer, discover);
                return {
-                    dimensions: rawDimensions.map(x => ({
+                    dimensions: rawDimensions.map(x => ({
+                        key: x.name,
+                        name: x.saveAs,
+                        index: x.index,
+                        hidden: null,
+                        type: x.type
+                    })),
                    delimiter: delimiterChar
                };
            }
+            case 'PARQUET':
            case 'JSONL':
            case 'JSON': {
                const source = Environment_1.default.getSource(producer.source);
@@ -67,7 +80,13 @@ class DatasetManagerClass {
                if (discover) {
                    return {
                        delimiter: (_b = file.delimiter) !== null && _b !== void 0 ? _b : ',',
-                        dimensions: keys.map((x, i) => ({
+                        dimensions: keys.map((x, i) => ({
+                            hidden: false,
+                            index: i,
+                            key: x,
+                            name: x,
+                            type: DeveloperEngine_1.default.inferDimensionType(firstObject === null || firstObject === void 0 ? void 0 : firstObject[x])
+                        }))
                    };
                }
                const dimensions = [];
@@ -75,34 +94,61 @@ class DatasetManagerClass {
                const columnKey = (_c = pColumn.aliasInProducer) !== null && _c !== void 0 ? _c : pColumn.nameInProducer;
                const csvColumnIndex = keys.findIndex(x => x === columnKey);
                (0, Affirm_1.default)(csvColumnIndex > -1, `The column "${pColumn.nameInProducer}" (with key "${columnKey}") of producer "${producer.name}" doesn't exist in the underlying dataset.`);
-                dimensions.push({
+                dimensions.push({
+                    index: csvColumnIndex,
+                    key: columnKey,
+                    name: pColumn.nameInProducer,
+                    hidden: null,
+                    type: (_e = (_d = pColumn.dimension) === null || _d === void 0 ? void 0 : _d.type) !== null && _e !== void 0 ? _e : 'string'
+                });
                }
-                const delimiterChar = (
+                const delimiterChar = (_f = file.delimiter) !== null && _f !== void 0 ? _f : ',';
                return { dimensions, delimiter: delimiterChar };
            }
            case 'TXT': {
                if (!file.hasHeaderRow) {
                    // If the file is a TXT and there isn't an header row, then I add a fake one that maps directly to the producer
-                    const delimiterChar = (
+                    const delimiterChar = (_g = file.delimiter) !== null && _g !== void 0 ? _g : ',';
                    const source = Environment_1.default.getSource(producer.source);
                    const columns = FileCompiler_1.default.compileProducer(producer, source);
                    if (discover) {
                        // Since I don't have an header, and I'm discovering, I just create placeholder dimensions based on the same number of columns of the txt
                        return {
                            delimiter: delimiterChar,
-                            dimensions: firstLine.split(delimiterChar).map((x, i) => ({
+                            dimensions: firstLine.split(delimiterChar).map((x, i) => ({
+                                hidden: false,
+                                index: i,
+                                key: `Col ${i + 1}`,
+                                name: `Col ${i + 1}`,
+                                type: 'string'
+                            }))
                        };
                    }
                    return {
-                        dimensions: columns.map((x, i) => {
+                        dimensions: columns.map((x, i) => {
+                            var _a, _b, _c;
+                            return ({
+                                key: (_a = x.aliasInProducer) !== null && _a !== void 0 ? _a : x.nameInProducer,
+                                name: x.nameInProducer,
+                                index: i,
+                                hidden: null,
+                                type: (_c = (_b = x.dimension) === null || _b === void 0 ? void 0 : _b.type) !== null && _c !== void 0 ? _c : 'string'
+                            });
+                        }),
                        delimiter: delimiterChar
                    };
                }
                else {
-                    const delimiterChar = (
+                    const delimiterChar = (_h = producer.settings.delimiter) !== null && _h !== void 0 ? _h : ',';
                    const rawDimensions = ParseManager_1.default._extractHeader(firstLine, delimiterChar, producer, discover);
                    return {
-                        dimensions: rawDimensions.map(x => ({
+                        dimensions: rawDimensions.map(x => ({
+                            key: x.name,
+                            name: x.saveAs,
+                            index: x.index,
+                            hidden: null,
+                            type: x.type
+                        })),
                        delimiter: delimiterChar
                    };
                }
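The recurring change in these branches is that every dimension is now materialised as an explicit object with key, name, index, hidden and type fields. A sketch of that shape, using the field names visible in the diff (the sample values are invented):

// Builds the dimension object the manager now produces for one extracted header column.
function toDimension(rawColumn) {
    return {
        key: rawColumn.name,     // the column key as it appears in the file header
        name: rawColumn.saveAs,  // the name the producer exposes it under
        index: rawColumn.index,  // position in the row
        hidden: null,
        type: rawColumn.type     // e.g. 'string' or 'number', defaulting to 'string' upstream
    };
}

console.log(toDimension({ name: 'AMT', saveAs: 'amount', index: 2, type: 'number' }));
// { key: 'AMT', name: 'amount', index: 2, hidden: null, type: 'number' }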
package/engines/dataset/DatasetRecord.js

@@ -4,6 +4,7 @@ var __importDefault = (this && this.__importDefault) || function (mod) {
 };
 Object.defineProperty(exports, "__esModule", { value: true });
 const Algo_1 = __importDefault(require("../../core/Algo"));
+const TypeCaster_1 = __importDefault(require("../transform/TypeCaster"));
 class DatasetRecord {
     constructor(row, dimensions, delimiter) {
         this.parse = (row, delimiter, dimensions) => {
@@ -11,7 +12,7 @@ class DatasetRecord {
             const parts = row.split(delimiter);
             for (let i = 0; i < dimensions.length; i++) {
                 const dim = dimensions[i];
-                this._value[dim.name] = parts[i];
+                this._value[dim.name] = TypeCaster_1.default.cast(parts[i], dim.type);
             }
         }
        };
@@ -35,7 +36,7 @@ class DatasetRecord {
             this.parse(row, delimiter, this._dimensions);
         };
         this.wholeUpdateDimension = (update) => {
-            var _a;
+            var _a, _b, _c, _d, _e;
             if (update.toDelete) {
                 // To remove
                 delete this._value[update.currentDimension.name];
@@ -46,7 +47,13 @@ class DatasetRecord {
             }
             else if (!update.currentDimension) {
                 // To create (at the right position)
-                const newDimension = {
+                const newDimension = {
+                    index: update.newPosition,
+                    key: update.newName,
+                    name: update.newName,
+                    hidden: update.newHidden,
+                    type: (_b = (_a = update.currentDimension) === null || _a === void 0 ? void 0 : _a.type) !== null && _b !== void 0 ? _b : 'string'
+                };
                 this._value[newDimension.name] = null;
                 this._dimensions = [...this._dimensions, newDimension];
             }
@@ -56,7 +63,13 @@ class DatasetRecord {
             if (index < 0)
                 index = this._dimensions.findIndex(x => x.key === update.currentDimension.key);
             const currentDim = this._dimensions[index];
-            const updatedDim = {
+            const updatedDim = {
+                name: update.newName,
+                key: (_c = currentDim.key) !== null && _c !== void 0 ? _c : update.newName,
+                hidden: update.newHidden,
+                index: update.newPosition,
+                type: (_e = (_d = update.currentDimension) === null || _d === void 0 ? void 0 : _d.type) !== null && _e !== void 0 ? _e : 'string'
+            };
             this._value[updatedDim.name] = this._value[currentDim.name];
             if (updatedDim.name !== currentDim.name)
                 delete this._value[currentDim.name];
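With TypeCaster wired into parse, a delimited row is no longer kept as raw strings: each part is cast according to its dimension's type. A simplified, self-contained sketch of that behaviour (the real cast goes through TypeCaster.cast):

// Simplified stand-in for TypeCaster.cast: only the number case is shown.
const cast = (value, type) => (type === 'number' ? Number(value) : value);

function parseRow(row, delimiter, dimensions) {
    const parts = row.split(delimiter);
    const value = {};
    for (let i = 0; i < dimensions.length; i++) {
        const dim = dimensions[i];
        value[dim.name] = cast(parts[i], dim.type); // was: value[dim.name] = parts[i]
    }
    return value;
}

const dims = [{ name: 'qty', type: 'number' }, { name: 'sku', type: 'string' }];
console.log(parseRow('3,ABC-1', ',', dims)); // { qty: 3, sku: 'ABC-1' }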
package/engines/dataset/ParallelDataset.js

@@ -20,9 +20,29 @@ const DatasetManager_1 = __importDefault(require("./DatasetManager"));
 const path_1 = __importDefault(require("path"));
 class ParallelDatasetClass {
     constructor() {
+        this.init = () => {
+            /**
+             * I need the init to be called after all the setup has been completed because I need the .env to be loaded
+             */
+            if (!this._filterPool || !this._projectionPool || !this._transformPool) {
+                const options = {
+                    workerThreadOpts: {
+                        resourceLimits: {
+                            maxOldGenerationSizeMb: Constants_1.default.defaults.MIN_RUNTIME_HEAP_MB
+                        }
+                    }
+                };
+                const workerPath = this._getWorkerPath();
+                this._filterPool = workerpool_1.default.pool(path_1.default.join(workerPath, 'FilterWorker.js'), options);
+                this._projectionPool = workerpool_1.default.pool(path_1.default.join(workerPath, 'ProjectionWorker.js'), options);
+                this._transformPool = workerpool_1.default.pool(path_1.default.join(workerPath, 'TransformWorker.js'), options);
+            }
+        };
         this._getWorkerPath = () => {
             // Get the current file's directory
             const currentDir = __dirname;
+            if (process.env.NODE_ENV === 'dev' || process.env.NODE_ENV === 'development')
+                return path_1.default.resolve('./.build/workers');
             // Check if we're in a published npm package (no .build in path)
             if (!currentDir.includes('.build')) {
                 // We're in the published package, workers are relative to package root
@@ -47,6 +67,7 @@ class ParallelDatasetClass {
         this.filter = (dataset, filters) => __awaiter(this, void 0, void 0, function* () {
             (0, Affirm_1.default)(dataset, `Invalid dataset`);
             (0, Affirm_1.default)(filters, `Invalid filters`);
+            this.init();
             // Distribute the work of the filter among the various workers, trying to have them match the batch size
             const { adjustedWorkerCount, workerCount } = this._scopeWork(dataset);
             dataset._startOperation('filter-parallel', { workerCount });
@@ -60,6 +81,7 @@ class ParallelDatasetClass {
             const workerData = {
                 datasetDimensions: dataset.getDimensions(),
                 datasetFile: dataset.getFile(),
+                executionId: dataset.getExecutionId(),
                 datasetName: dataset.name,
                 datasetDelimiter: dataset.getDelimiter(),
                 fromLine: fromLine,
@@ -77,13 +99,14 @@ class ParallelDatasetClass {
             yield DatasetManager_1.default.mergeWorkersPaths(results.map(x => x.datasetPath), dataset);
             dataset
                 .setDelimiter(results[0].datasetDelimiter)
-                .
+                .setDimensions(results[0].datasetDimensions);
             dataset._finishOperation('filter-parallel');
             return dataset;
         });
         this.projection = (dataset, consumer) => __awaiter(this, void 0, void 0, function* () {
             (0, Affirm_1.default)(dataset, `Invalid dataset`);
             (0, Affirm_1.default)(consumer, `Invalid consumer`);
+            this.init();
             const { adjustedWorkerCount, workerCount } = this._scopeWork(dataset);
             dataset._startOperation('projection-parallel', { workerCount });
             const threads = [];
@@ -96,6 +119,7 @@ class ParallelDatasetClass {
             const workerData = {
                 datasetDimensions: dataset.getDimensions(),
                 datasetFile: dataset.getFile(),
+                executionId: dataset.getExecutionId(),
                 datasetName: dataset.name,
                 datasetDelimiter: dataset.getDelimiter(),
                 fromLine: fromLine,
@@ -111,13 +135,14 @@ class ParallelDatasetClass {
             yield DatasetManager_1.default.mergeWorkersPaths(results.map(x => x.datasetPath), dataset);
             dataset
                 .setDelimiter(results[0].datasetDelimiter)
-                .
+                .setDimensions(results[0].datasetDimensions);
             dataset._finishOperation('projection-parallel');
             return dataset;
         });
         this.transform = (dataset, consumer) => __awaiter(this, void 0, void 0, function* () {
             (0, Affirm_1.default)(dataset, `Invalid dataset`);
             (0, Affirm_1.default)(consumer, `Invalid consumer`);
+            this.init();
             const { adjustedWorkerCount, workerCount } = this._scopeWork(dataset);
             dataset._startOperation('transform-parallel', { workerCount });
             const threads = [];
@@ -130,6 +155,7 @@ class ParallelDatasetClass {
             const workerData = {
                 datasetDimensions: dataset.getDimensions(),
                 datasetFile: dataset.getFile(),
+                executionId: dataset.getExecutionId(),
                 datasetName: dataset.name,
                 datasetDelimiter: dataset.getDelimiter(),
                 fromLine: fromLine,
@@ -145,14 +171,10 @@ class ParallelDatasetClass {
             yield DatasetManager_1.default.mergeWorkersPaths(results.map(x => x.datasetPath), dataset);
             dataset
                 .setDelimiter(results[0].datasetDelimiter)
-                .
+                .setDimensions(results[0].datasetDimensions);
             dataset._finishOperation('transform-parallel');
             return dataset;
         });
-        const workerPath = this._getWorkerPath();
-        this._filterPool = workerpool_1.default.pool(path_1.default.join(workerPath, 'FilterWorker.js'));
-        this._projectionPool = workerpool_1.default.pool(path_1.default.join(workerPath, 'ProjectionWorker.js'));
-        this._transformPool = workerpool_1.default.pool(path_1.default.join(workerPath, 'TransformWorker.js'));
     }
 }
 const ParallelDataset = new ParallelDatasetClass();
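The constructor no longer builds the worker pools eagerly; init() creates them lazily on the first filter/projection/transform call, after the .env has been loaded, and caps each worker's heap via resourceLimits. A dependency-free sketch of the same lazy-init pattern (the WORKER_HEAP_MB variable is invented for the example):

class LazyPools {
    constructor() {
        this._pool = null; // nothing is created here any more
    }
    init() {
        if (this._pool !== null)
            return; // idempotent: safe to call at the top of every public method
        // Stand-in for workerpool.pool(script, { workerThreadOpts: { resourceLimits: { maxOldGenerationSizeMb } } })
        this._pool = { maxOldGenerationSizeMb: Number(process.env.WORKER_HEAP_MB || 512) };
    }
    filter(items, predicate) {
        this.init(); // pools exist only from the first real use onward
        return items.filter(predicate);
    }
}

console.log(new LazyPools().filter([1, 2, 3], x => x > 1)); // [ 2, 3 ]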
package/engines/execution/ExecutionEnvironment.js

@@ -28,7 +28,7 @@ const Algo_1 = __importDefault(require("../../core/Algo"));
 const Logger_1 = __importDefault(require("../../helper/Logger"));
 const ParallelDataset_1 = __importDefault(require("../dataset/ParallelDataset"));
 class ExecutionEnvironment {
-    constructor(consumer) {
+    constructor(consumer, executionId) {
         this.run = (options) => __awaiter(this, void 0, void 0, function* () {
             var _a, _b, _c, _d;
             (0, Affirm_1.default)(this._consumer, 'Invalid consumer');
@@ -64,7 +64,7 @@ class ExecutionEnvironment {
                     (0, Affirm_1.default)(planStep.producer, `Invalid producer in execute-SQL step`);
                     const driver = yield DriverFactory_1.default.instantiateSource(planStep.source);
                     const queryData = (yield driver.query(this._envData.finalSQL)).rows;
-                    let dataset = DatasetManager_1.default.create(planStep.producer);
+                    let dataset = DatasetManager_1.default.create(planStep.producer, this._executionId);
                     dataset = yield dataset.loadFromMemory(queryData, planStep.producer);
                     this._storeIntermidiate(planStep, dataset);
                     break;
@@ -74,7 +74,7 @@ class ExecutionEnvironment {
                     const { producer } = planStep;
                     const source = Environment_1.default.getSource(producer.source);
                     (0, Affirm_1.default)(source, `Source "${producer.source}" of producer "${producer.name}" not found.`);
-                    let dataset = DatasetManager_1.default.create(producer);
+                    let dataset = DatasetManager_1.default.create(producer, this._executionId);
                     dataset = yield dataset.load(source);
                     this._storeIntermidiate(planStep, dataset);
                     break;
@@ -102,7 +102,7 @@ class ExecutionEnvironment {
                 case 'export-file': {
                     (0, Affirm_1.default)(planStep.output, `Invalid output in export-file step`);
                     (0, Affirm_1.default)(this._resultingDataset, 'Invalid resulting dataset in export-file step');
-                    const res = yield FileExporter_1.default.export(this._consumer, planStep.output, this._resultingDataset);
+                    const res = yield FileExporter_1.default.export(this._consumer, planStep.output, this._resultingDataset, this._executionId);
                     result.fileUri = res;
                     break;
                 }
@@ -156,6 +156,13 @@ class ExecutionEnvironment {
             if (ds)
                 Logger_1.default.log(`Failed execution of consumer at step ${currentStep.type}:\n\tSize: ${ds.getCount()}\n\tCycles: ${ds.getCycles()}\n\tOperations: ${Logger_1.default.formatList(ds.getOperations())}`);
             Logger_1.default.log(`\tFailed step: ${currentStep.type}->\n\t${error}`);
+            // IMPORTANT: cleanup all the datasets to not leave any data around and to avoid memory leaks
+            const datasets = [
+                ...this._producedData.map(x => x.dataset),
+                this._resultingDataset
+            ].filter(Algo_1.default.hasVal);
+            const promises = datasets.map(x => x.destroy());
+            yield Promise.all(promises);
             throw error;
         }
         Logger_1.default.log(`Completed execution of consumer:\n\tSize: ${result._stats.size}\n\tCycles: ${result._stats.cycles}\n\tTime: ${result._stats.elapsedMS}\n\tOperations: ${Logger_1.default.formatList(result._stats.operations)}`);
@@ -184,6 +191,8 @@ class ExecutionEnvironment {
         this._envData = { consumerSQL: null, executionRequestSQL: null, finalSQL: null };
         this._producedData = [];
         this._resultingDataset = null;
+        // A short unique id to isolate temp dataset files & output names
+        this._executionId = executionId;
     }
 }
 exports.default = ExecutionEnvironment;
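Two things happen here: the environment carries an executionId that it threads into DatasetManager.create and FileExporter.export, and the failure path now destroys every dataset it produced before rethrowing. A compact sketch of that cleanup-on-failure pattern (names simplified from the diff):

// Runs the plan steps and guarantees the temp datasets are destroyed if anything throws.
async function runWithCleanup(steps, producedDatasets) {
    try {
        for (const step of steps)
            await step();
    }
    catch (error) {
        const datasets = producedDatasets.filter(ds => ds != null);
        await Promise.all(datasets.map(ds => ds.destroy()));
        throw error; // the caller still sees the original failure
    }
}

// Usage sketch: the fake dataset just records that destroy() was called.
const fake = { destroyed: false, destroy: async function () { this.destroyed = true; } };
runWithCleanup([async () => { throw new Error('boom'); }], [fake])
    .catch(() => console.log('cleaned up:', fake.destroyed)); // cleaned up: true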
package/engines/execution/ExecutionPlanner.js

@@ -108,7 +108,8 @@ class ExecutionPlannerClas {
                 break;
             }
             case 'local':
-            case 'aws-s3':
+            case 'aws-s3':
+            case 'delta-share': {
                 plan.push({ type: 'load-dataset', producer });
                 plan.push({ type: 'prepare-dataset', producer });
                 if (producer.dimensions.some(x => { var _a, _b; return ((_a = x.alias) === null || _a === void 0 ? void 0 : _a.includes('{')) || ((_b = x.alias) === null || _b === void 0 ? void 0 : _b.includes('[')); }))
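delta-share sources are now planned exactly like local and aws-s3 ones: the planner emits a load-dataset step followed by a prepare-dataset step for the producer. A tiny sketch of the resulting plan shape (the template-alias special case from the diff is omitted):

// Minimal reproduction of the file-source branch of the planner.
function planFileProducer(producer) {
    const plan = [];
    plan.push({ type: 'load-dataset', producer });
    plan.push({ type: 'prepare-dataset', producer });
    return plan;
}

console.log(planFileProducer({ name: 'sales' }).map(s => s.type));
// [ 'load-dataset', 'prepare-dataset' ]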
package/engines/file/FileCompiler.js

@@ -7,9 +7,10 @@ const Affirm_1 = __importDefault(require("../../core/Affirm"));
 class FileCompilerClass {
     constructor() {
         this.compileProducer = (producer, source) => {
+            var _a;
             (0, Affirm_1.default)(producer, `Invalid producer`);
             (0, Affirm_1.default)(source, `Invalid source`);
-            (0, Affirm_1.default)(producer.settings.fileKey, `Missing required file key in producer settings`);
+            (0, Affirm_1.default)((_a = producer.settings.fileKey) !== null && _a !== void 0 ? _a : producer.settings.sqlTable, `Missing required file key in producer settings`);
             (0, Affirm_1.default)(producer.settings.fileType, `Missing required file type in producer settings`);
             (0, Affirm_1.default)(!producer.measures || producer.measures.length === 0, `Cannot use "measure" with a producer linked to a file (only dimensions are allowed).`);
             const columns = producer.dimensions.map(x => ({
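The only behavioural change is in the first assertion: a producer may now reference either a fileKey or a sqlTable. A sketch of the relaxed check (the sample settings values are invented):

// Equivalent of the updated Affirm call: either reference satisfies the check.
function assertFileReference(settings) {
    if (!(settings.fileKey ?? settings.sqlTable))
        throw new Error('Missing required file key in producer settings');
}

assertFileReference({ fileKey: 'exports/orders.csv' }); // ok, as before
assertFileReference({ sqlTable: 'sales.orders' });      // ok now; used to throw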
package/engines/file/FileExporter.js

@@ -20,7 +20,7 @@ const DriverFactory_1 = __importDefault(require("../../drivers/DriverFactory"));
 const Environment_1 = __importDefault(require("../Environment"));
 class FileExporterClass {
     constructor() {
-        this.export = (consumer, output, dataset) => __awaiter(this, void 0, void 0, function* () {
+        this.export = (consumer, output, dataset, executionId) => __awaiter(this, void 0, void 0, function* () {
             (0, Affirm_1.default)(consumer, `Invalid consumer`);
             (0, Affirm_1.default)(output, `Invalid output`);
             (0, Affirm_1.default)(dataset, `Invalid export dataset`);
@@ -32,7 +32,7 @@ class FileExporterClass {
                 : output.format === 'JSON'
                     ? 'jsonl'
                     : 'txt';
-            const name = this._composeFileName(consumer, extension);
+            const name = this._composeFileName(consumer, output, extension, executionId);
             const uploadRes = yield driver.uploadStream({
                 dataset,
                 name,
@@ -67,7 +67,16 @@ class FileExporterClass {
             }
             return chunks;
         };
-        this._composeFileName = (consumer, extension) =>
+        this._composeFileName = (consumer, output, extension, executionId) => {
+            if (output.exportName && output.exportName.trim().length > 0) {
+                // Ensure no extension duplication
+                const sanitized = output.exportName.replace(/\.[^.]+$/, '');
+                return `${sanitized}.${extension}`;
+            }
+            const baseTs = Algo_1.default.replaceAll(DSTE_1.default.now().toISOString().split('.')[0], ':', '-');
+            const suffix = executionId ? `_${executionId}` : '';
+            return `${consumer.name}_${baseTs}${suffix}.${extension}`;
+        };
     }
 }
 const FileExporter = new FileExporterClass();
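Export naming now has two paths: an explicit output.exportName wins (any extension it carries is stripped and replaced), otherwise the consumer name plus a timestamp is used, with the executionId appended when present. A simplified sketch using plain Date in place of the package's DSTE/Algo helpers:

function composeFileName(consumer, output, extension, executionId) {
    if (output.exportName && output.exportName.trim().length > 0) {
        const sanitized = output.exportName.replace(/\.[^.]+$/, ''); // avoid "report.csv.csv"
        return `${sanitized}.${extension}`;
    }
    const baseTs = new Date().toISOString().split('.')[0].replace(/:/g, '-');
    const suffix = executionId ? `_${executionId}` : '';
    return `${consumer.name}_${baseTs}${suffix}.${extension}`;
}

console.log(composeFileName({ name: 'daily' }, { exportName: 'report.xlsx' }, 'csv'));
// report.csv
console.log(composeFileName({ name: 'daily' }, {}, 'csv', 'a1b2'));
// e.g. daily_2025-06-01T09-30-00_a1b2.csv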
package/engines/parsing/ParseManager.js

@@ -9,7 +9,7 @@ const FileCompiler_1 = __importDefault(require("../file/FileCompiler"));
 class ParseManagerClass {
     constructor() {
         this._extractHeader = (headerLine, delimiter, producer, discover) => {
-            var _a;
+            var _a, _b, _c;
             (0, Affirm_1.default)(headerLine, `Invalid CSV header line for producer "${producer.name}"`);
             (0, Affirm_1.default)(delimiter, 'Invalid CSV delimiter');
             (0, Affirm_1.default)(producer, 'Invalid producer');
@@ -24,7 +24,12 @@ class ParseManagerClass {
                 const columnKey = (_a = pColumn.aliasInProducer) !== null && _a !== void 0 ? _a : pColumn.nameInProducer;
                 const csvColumnIndex = headerColumns.findIndex(x => x === columnKey);
                 (0, Affirm_1.default)(csvColumnIndex > -1, `The column "${pColumn.nameInProducer}" (with key "${columnKey}") of producer "${producer.name}" doesn't exist in the underlying dataset.`);
-                csvColumns.push({
+                csvColumns.push({
+                    index: csvColumnIndex,
+                    name: columnKey,
+                    saveAs: pColumn.nameInProducer,
+                    type: (_c = (_b = pColumn.dimension) === null || _b === void 0 ? void 0 : _b.type) !== null && _c !== void 0 ? _c : 'string'
+                });
             }
             return csvColumns;
         };
package/engines/producer/ProducerEngine.js

@@ -34,7 +34,8 @@ class ProducerEngineClass {
                 (0, Affirm_1.default)(sql, `Invalid SQL from compilation for producer "${producer.name}"`);
                 return sql;
             }
-            case 'aws-s3':
+            case 'aws-s3':
+            case 'delta-share': {
                 const columns = FileCompiler_1.default.compileProducer(producer, source);
                 (0, Affirm_1.default)(columns, `Invalid columns from compilation for producer "${producer.name}"`);
                 break;
@@ -141,7 +142,8 @@ class ProducerEngineClass {
                 break;
             }
             case 'local':
-            case 'aws-s3':
+            case 'aws-s3':
+            case 'delta-share': {
                 const fileData = yield this.readFile(producer, { readmode: 'lines', lines: { from: 0, to: sampleSize } });
                 dataset = yield dataset.loadFromMemory(fileData.data, producer, discover);
                 break;
package/engines/transform/JoinEngine.js

@@ -102,12 +102,16 @@ class JoinEngineClass {
             fileType: 'CSV'
         });
         // Get dimensions for the result dataset based on consumer columns
-        const resultDimensions = consumerColumns.map((col, index) =>
-
-
-
-
-
+        const resultDimensions = consumerColumns.map((col, index) => {
+            var _a, _b;
+            return ({
+                name: col.consumerAlias || col.consumerKey,
+                key: col.consumerAlias || col.consumerKey,
+                index,
+                type: (_b = (_a = col.dimension) === null || _a === void 0 ? void 0 : _a.type) !== null && _b !== void 0 ? _b : 'string',
+                hidden: null
+            });
+        });
         // Initialize the result dataset with proper dimensions
         resultDataset.getDimensions().length = 0;
         resultDataset.getDimensions().push(...resultDimensions);
package/engines/transform/TransformationEngine.js

@@ -22,7 +22,7 @@ class TransformationEngineClass {
             (0, Affirm_1.default)(dataset, 'Invalid data');
             const fieldsToTransform = consumer.fields.filter(field => Algo_1.default.hasVal(field.transform));
             Affirm_1.default.hasItems(fieldsToTransform, 'No fields with transformations');
-
+            yield dataset.map(record => {
                 var _a;
                 for (const field of fieldsToTransform) {
                     if (!field.transform)
@@ -54,6 +54,17 @@ class TransformationEngineClass {
                 }
                 return record;
             }, options);
+            /**
+             * Some transformations (for now only "cast") change the underlying type of the dataset dimension
+             * Here I update the dimension type of the dataset.
+             * TODO: I think that we may have a bug if you cast AND then do an operation on the number, since it reverts back to being a string in the same trnasformation chain, since the dimension type update is applied only at the end of all the transformations
+             */
+            for (const field of fieldsToTransform) {
+                if (!field.transform)
+                    continue;
+                this.applyDimensionsChanges(field.transform, field, dataset);
+            }
+            return dataset;
         });
         this.isFieldCombinationTransformation = (transformation) => {
             if (Array.isArray(transformation)) {
@@ -73,9 +84,12 @@ class TransformationEngineClass {
             }
             // Single transformation
             if ('cast' in transformations) {
-                const
-
+                const { cast, format } = transformations;
+                const casted = TypeCaster_1.default.cast(value, cast, format);
+                if (cast === 'number' && isNaN(casted))
                     throw new Error(`Cannot cast non-numeric value in field '${field.key}'`);
+                if (cast === 'datetime' && casted instanceof Date && isNaN(casted.getTime()))
+                    throw new Error(`Cannot cast value to date in field '${field.key}'`);
                 return casted;
             }
             if ('multiply' in transformations) {
@@ -260,6 +274,24 @@ class TransformationEngineClass {
             }
             return false;
         };
+        this.applyDimensionsChanges = (transformations, field, dataset) => {
+            if (Array.isArray(transformations)) {
+                for (const transform of transformations) {
+                    this.applyDimensionsChanges(transform, field, dataset);
+                }
+                return dataset;
+            }
+            // Single transformation
+            if ('cast' in transformations) {
+                const { cast } = transformations;
+                let oldDimension = dataset.getDimensions().find(x => x.name === field.key);
+                if (!oldDimension)
+                    oldDimension = dataset.getDimensions().find(x => x.key === field.key);
+                const newDimension = Object.assign(Object.assign({}, structuredClone(oldDimension)), { type: cast });
+                dataset.setSingleDimension(newDimension, oldDimension);
+            }
+            return dataset;
+        };
     }
 }
 const TransformationEngine = new TransformationEngineClass();
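applyDimensionsChanges closes the loop on cast transforms: after the values have been cast, the matching dataset dimension has its type metadata updated through setSingleDimension. A self-contained sketch of the same update applied directly to a dimensions array:

// Mirrors the cast branch: clone the old dimension, change its type, swap it in place.
function applyCastToDimensions(dimensions, fieldKey, cast) {
    const oldDimension = dimensions.find(x => x.name === fieldKey) || dimensions.find(x => x.key === fieldKey);
    const newDimension = { ...structuredClone(oldDimension), type: cast };
    const at = dimensions.findIndex(x => x.index === oldDimension.index);
    dimensions.splice(at, 1, newDimension);
    return dimensions;
}

const dims = [{ key: 'qty', name: 'qty', index: 0, hidden: null, type: 'string' }];
console.log(applyCastToDimensions(dims, 'qty', 'number')[0].type); // number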