@forzalabs/remora 0.1.4-nasco.3 → 0.1.5-nasco.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/Constants.js +1 -1
- package/definitions/json_schemas/consumer-schema.json +6 -2
- package/definitions/json_schemas/producer-schema.json +2 -1
- package/definitions/json_schemas/source-schema.json +14 -1
- package/documentation/README.md +1 -0
- package/documentation/default_resources/consumer.json +7 -7
- package/drivers/DeltaShareDriver.js +178 -0
- package/drivers/DriverFactory.js +6 -0
- package/drivers/DriverHelper.js +15 -0
- package/engines/ai/DeveloperEngine.js +90 -1
- package/engines/consumer/ConsumerEngine.js +1 -1
- package/engines/consumer/PostProcessor.js +22 -15
- package/engines/dataset/Dataset.js +18 -7
- package/engines/dataset/DatasetManager.js +58 -12
- package/engines/dataset/DatasetRecord.js +17 -4
- package/engines/dataset/ParallelDataset.js +16 -6
- package/engines/execution/ExecutionEnvironment.js +13 -4
- package/engines/execution/ExecutionPlanner.js +2 -1
- package/engines/file/FileCompiler.js +2 -1
- package/engines/file/FileExporter.js +12 -3
- package/engines/parsing/ParseManager.js +7 -2
- package/engines/producer/ProducerEngine.js +4 -2
- package/engines/transform/JoinEngine.js +10 -6
- package/engines/transform/TransformationEngine.js +31 -2
- package/engines/usage/UsageDataManager.js +110 -0
- package/package.json +2 -1
- package/workers/FilterWorker.js +3 -3
- package/workers/ProjectionWorker.js +3 -3
- package/workers/TransformWorker.js +3 -3
package/engines/dataset/DatasetManager.js
CHANGED
@@ -21,9 +21,15 @@ const ParseManager_1 = __importDefault(require("../parsing/ParseManager"));
 const Dataset_1 = __importDefault(require("./Dataset"));
 const promises_1 = require("stream/promises");
 const fs_1 = require("fs");
+const DeveloperEngine_1 = __importDefault(require("../ai/DeveloperEngine"));
 class DatasetManagerClass {
     constructor() {
-
+        /**
+         * Create a new Dataset for a producer. If an executionId is provided, the dataset files will
+         * be isolated inside a sub-folder specific to that execution to avoid concurrency conflicts
+         * when the same producer / consumer is executed multiple times in parallel.
+         */
+        this.create = (producer, executionId) => {
             (0, Affirm_1.default)(producer, 'Invalid producer');
             const { name, settings: { delimiter, fileKey, fileType, hasHeaderRow, sheetName } } = producer;
             const dataset = new Dataset_1.default(name, {
@@ -32,7 +38,7 @@ class DatasetManagerClass {
                 hasHeaderRow,
                 sheetName,
                 delimiter
-            });
+            }, undefined, executionId);
             return dataset;
         };
         this.buildDimensions = (dataset_1, producer_1, ...args_1) => __awaiter(this, [dataset_1, producer_1, ...args_1], void 0, function* (dataset, producer, discover = false) {
@@ -43,7 +49,7 @@
             return this.buildDimensionsFromFirstLine(firstLine, dataset.getFile(), producer, discover);
         });
         this.buildDimensionsFromFirstLine = (firstLine_1, dsFile_1, producer_1, ...args_1) => __awaiter(this, [firstLine_1, dsFile_1, producer_1, ...args_1], void 0, function* (firstLine, dsFile, producer, discover = false) {
-            var _a, _b, _c, _d, _e, _f;
+            var _a, _b, _c, _d, _e, _f, _g, _h;
             (0, Affirm_1.default)(firstLine, `Invalid first line`);
             (0, Affirm_1.default)(dsFile, `Invalid dataset file`);
             (0, Affirm_1.default)(producer, `Invalid producer`);
@@ -54,10 +60,17 @@
                     const headerLine = firstLine;
                     const rawDimensions = ParseManager_1.default._extractHeader(headerLine, delimiterChar, producer, discover);
                     return {
-                        dimensions: rawDimensions.map(x => ({
+                        dimensions: rawDimensions.map(x => ({
+                            key: x.name,
+                            name: x.saveAs,
+                            index: x.index,
+                            hidden: null,
+                            type: x.type
+                        })),
                         delimiter: delimiterChar
                     };
                 }
+                case 'PARQUET':
                 case 'JSONL':
                 case 'JSON': {
                     const source = Environment_1.default.getSource(producer.source);
@@ -67,7 +80,13 @@
                     if (discover) {
                         return {
                             delimiter: (_b = file.delimiter) !== null && _b !== void 0 ? _b : ',',
-                            dimensions: keys.map((x, i) => ({
+                            dimensions: keys.map((x, i) => ({
+                                hidden: false,
+                                index: i,
+                                key: x,
+                                name: x,
+                                type: DeveloperEngine_1.default.inferDimensionType(firstObject === null || firstObject === void 0 ? void 0 : firstObject[x])
+                            }))
                         };
                     }
                     const dimensions = [];
@@ -75,34 +94,61 @@
                         const columnKey = (_c = pColumn.aliasInProducer) !== null && _c !== void 0 ? _c : pColumn.nameInProducer;
                         const csvColumnIndex = keys.findIndex(x => x === columnKey);
                         (0, Affirm_1.default)(csvColumnIndex > -1, `The column "${pColumn.nameInProducer}" (with key "${columnKey}") of producer "${producer.name}" doesn't exist in the underlying dataset.`);
-                        dimensions.push({
+                        dimensions.push({
+                            index: csvColumnIndex,
+                            key: columnKey,
+                            name: pColumn.nameInProducer,
+                            hidden: null,
+                            type: (_e = (_d = pColumn.dimension) === null || _d === void 0 ? void 0 : _d.type) !== null && _e !== void 0 ? _e : 'string'
+                        });
                     }
-                    const delimiterChar = (
+                    const delimiterChar = (_f = file.delimiter) !== null && _f !== void 0 ? _f : ',';
                     return { dimensions, delimiter: delimiterChar };
                 }
                 case 'TXT': {
                     if (!file.hasHeaderRow) {
                         // If the file is a TXT and there isn't an header row, then I add a fake one that maps directly to the producer
-                        const delimiterChar = (
+                        const delimiterChar = (_g = file.delimiter) !== null && _g !== void 0 ? _g : ',';
                         const source = Environment_1.default.getSource(producer.source);
                         const columns = FileCompiler_1.default.compileProducer(producer, source);
                         if (discover) {
                             // Since I don't have an header, and I'm discovering, I just create placeholder dimensions based on the same number of columns of the txt
                             return {
                                 delimiter: delimiterChar,
-                                dimensions: firstLine.split(delimiterChar).map((x, i) => ({
+                                dimensions: firstLine.split(delimiterChar).map((x, i) => ({
+                                    hidden: false,
+                                    index: i,
+                                    key: `Col ${i + 1}`,
+                                    name: `Col ${i + 1}`,
+                                    type: 'string'
+                                }))
                             };
                         }
                         return {
-                            dimensions: columns.map((x, i) => {
+                            dimensions: columns.map((x, i) => {
+                                var _a, _b, _c;
+                                return ({
+                                    key: (_a = x.aliasInProducer) !== null && _a !== void 0 ? _a : x.nameInProducer,
+                                    name: x.nameInProducer,
+                                    index: i,
+                                    hidden: null,
+                                    type: (_c = (_b = x.dimension) === null || _b === void 0 ? void 0 : _b.type) !== null && _c !== void 0 ? _c : 'string'
+                                });
+                            }),
                             delimiter: delimiterChar
                         };
                     }
                     else {
-                        const delimiterChar = (
+                        const delimiterChar = (_h = producer.settings.delimiter) !== null && _h !== void 0 ? _h : ',';
                         const rawDimensions = ParseManager_1.default._extractHeader(firstLine, delimiterChar, producer, discover);
                         return {
-                            dimensions: rawDimensions.map(x => ({
+                            dimensions: rawDimensions.map(x => ({
+                                key: x.name,
+                                name: x.saveAs,
+                                index: x.index,
+                                hidden: null,
+                                type: x.type
+                            })),
                             delimiter: delimiterChar
                         };
                     }
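
Note: the new create(producer, executionId) signature above is what lets concurrent runs of the same producer write to separate locations. As a rough illustration of the idea (this helper is hypothetical, not part of the package), an executionId can simply be folded into the temp path:

const path = require('path');

// Hypothetical helper: namespace a dataset file by executionId so parallel
// executions of the same producer never collide on disk.
function datasetFilePath(baseDir, datasetName, executionId) {
    const folder = executionId ? path.join(baseDir, executionId) : baseDir;
    return path.join(folder, `${datasetName}.csv`);
}

console.log(datasetFilePath('/tmp/remora', 'orders'));            // /tmp/remora/orders.csv
console.log(datasetFilePath('/tmp/remora', 'orders', 'exec-42'));  // /tmp/remora/exec-42/orders.csv
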
package/engines/dataset/DatasetRecord.js
CHANGED
@@ -4,6 +4,7 @@ var __importDefault = (this && this.__importDefault) || function (mod) {
 };
 Object.defineProperty(exports, "__esModule", { value: true });
 const Algo_1 = __importDefault(require("../../core/Algo"));
+const TypeCaster_1 = __importDefault(require("../transform/TypeCaster"));
 class DatasetRecord {
     constructor(row, dimensions, delimiter) {
         this.parse = (row, delimiter, dimensions) => {
@@ -11,7 +12,7 @@ class DatasetRecord {
                 const parts = row.split(delimiter);
                 for (let i = 0; i < dimensions.length; i++) {
                     const dim = dimensions[i];
-                    this._value[dim.name] = parts[i];
+                    this._value[dim.name] = TypeCaster_1.default.cast(parts[i], dim.type);
                 }
             }
         };
@@ -35,7 +36,7 @@ class DatasetRecord {
             this.parse(row, delimiter, this._dimensions);
         };
         this.wholeUpdateDimension = (update) => {
-            var _a;
+            var _a, _b, _c, _d, _e;
             if (update.toDelete) {
                 // To remove
                 delete this._value[update.currentDimension.name];
@@ -46,7 +47,13 @@
             }
             else if (!update.currentDimension) {
                 // To create (at the right position)
-                const newDimension = {
+                const newDimension = {
+                    index: update.newPosition,
+                    key: update.newName,
+                    name: update.newName,
+                    hidden: update.newHidden,
+                    type: (_b = (_a = update.currentDimension) === null || _a === void 0 ? void 0 : _a.type) !== null && _b !== void 0 ? _b : 'string'
+                };
                 this._value[newDimension.name] = null;
                 this._dimensions = [...this._dimensions, newDimension];
             }
@@ -56,7 +63,13 @@
                 if (index < 0)
                     index = this._dimensions.findIndex(x => x.key === update.currentDimension.key);
                 const currentDim = this._dimensions[index];
-                const updatedDim = {
+                const updatedDim = {
+                    name: update.newName,
+                    key: (_c = currentDim.key) !== null && _c !== void 0 ? _c : update.newName,
+                    hidden: update.newHidden,
+                    index: update.newPosition,
+                    type: (_e = (_d = update.currentDimension) === null || _d === void 0 ? void 0 : _d.type) !== null && _e !== void 0 ? _e : 'string'
+                };
                 this._value[updatedDim.name] = this._value[currentDim.name];
                 if (updatedDim.name !== currentDim.name)
                     delete this._value[currentDim.name];
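
Note: the switch from parts[i] to TypeCaster_1.default.cast(parts[i], dim.type) means parsed rows now carry typed values instead of raw strings. A minimal sketch of that idea, with a stand-in simpleCast in place of the package's TypeCaster:

// Stand-in for the package's TypeCaster: cast a raw string to the dimension's type.
function simpleCast(value, type) {
    switch (type) {
        case 'number': return Number(value);
        case 'datetime': return new Date(value);
        default: return value; // 'string' and anything unknown stay as-is
    }
}

const dimensions = [
    { name: 'id', type: 'number' },
    { name: 'createdAt', type: 'datetime' },
    { name: 'status', type: 'string' }
];

const value = {};
'42|2024-01-31|open'.split('|').forEach((part, i) => {
    value[dimensions[i].name] = simpleCast(part, dimensions[i].type);
});
console.log(value); // { id: 42, createdAt: 2024-01-31T00:00:00.000Z, status: 'open' }
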
package/engines/dataset/ParallelDataset.js
CHANGED
@@ -25,10 +25,17 @@ class ParallelDatasetClass {
          * I need the init to be called after all the setup has been completed because I need the .env to be loaded
          */
         if (!this._filterPool || !this._projectionPool || !this._transformPool) {
+            const options = {
+                workerThreadOpts: {
+                    resourceLimits: {
+                        maxOldGenerationSizeMb: Constants_1.default.defaults.MIN_RUNTIME_HEAP_MB
+                    }
+                }
+            };
             const workerPath = this._getWorkerPath();
-            this._filterPool = workerpool_1.default.pool(path_1.default.join(workerPath, 'FilterWorker.js'));
-            this._projectionPool = workerpool_1.default.pool(path_1.default.join(workerPath, 'ProjectionWorker.js'));
-            this._transformPool = workerpool_1.default.pool(path_1.default.join(workerPath, 'TransformWorker.js'));
+            this._filterPool = workerpool_1.default.pool(path_1.default.join(workerPath, 'FilterWorker.js'), options);
+            this._projectionPool = workerpool_1.default.pool(path_1.default.join(workerPath, 'ProjectionWorker.js'), options);
+            this._transformPool = workerpool_1.default.pool(path_1.default.join(workerPath, 'TransformWorker.js'), options);
         }
     };
     this._getWorkerPath = () => {
@@ -74,6 +81,7 @@ class ParallelDatasetClass {
         const workerData = {
             datasetDimensions: dataset.getDimensions(),
             datasetFile: dataset.getFile(),
+            executionId: dataset.getExecutionId(),
             datasetName: dataset.name,
             datasetDelimiter: dataset.getDelimiter(),
             fromLine: fromLine,
@@ -91,7 +99,7 @@
         yield DatasetManager_1.default.mergeWorkersPaths(results.map(x => x.datasetPath), dataset);
         dataset
             .setDelimiter(results[0].datasetDelimiter)
-            .
+            .setDimensions(results[0].datasetDimensions);
         dataset._finishOperation('filter-parallel');
         return dataset;
     });
@@ -111,6 +119,7 @@
         const workerData = {
             datasetDimensions: dataset.getDimensions(),
             datasetFile: dataset.getFile(),
+            executionId: dataset.getExecutionId(),
             datasetName: dataset.name,
             datasetDelimiter: dataset.getDelimiter(),
             fromLine: fromLine,
@@ -126,7 +135,7 @@
         yield DatasetManager_1.default.mergeWorkersPaths(results.map(x => x.datasetPath), dataset);
         dataset
             .setDelimiter(results[0].datasetDelimiter)
-            .
+            .setDimensions(results[0].datasetDimensions);
         dataset._finishOperation('projection-parallel');
         return dataset;
     });
@@ -146,6 +155,7 @@
         const workerData = {
             datasetDimensions: dataset.getDimensions(),
             datasetFile: dataset.getFile(),
+            executionId: dataset.getExecutionId(),
             datasetName: dataset.name,
             datasetDelimiter: dataset.getDelimiter(),
             fromLine: fromLine,
@@ -161,7 +171,7 @@
         yield DatasetManager_1.default.mergeWorkersPaths(results.map(x => x.datasetPath), dataset);
         dataset
             .setDelimiter(results[0].datasetDelimiter)
-            .
+            .setDimensions(results[0].datasetDimensions);
         dataset._finishOperation('transform-parallel');
         return dataset;
     });
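
Note: the options object above uses workerpool's workerThreadOpts, whose resourceLimits are the standard Node worker_threads limits. A sketch of the same cap with a placeholder heap size (the package takes the real value from Constants.defaults.MIN_RUNTIME_HEAP_MB; the worker path is illustrative):

const path = require('path');
const workerpool = require('workerpool');

// Placeholder: the package reads Constants.defaults.MIN_RUNTIME_HEAP_MB here.
const MAX_WORKER_HEAP_MB = 512;

const options = {
    workerThreadOpts: {
        resourceLimits: {
            // Cap the old-generation heap of each worker thread, in MB.
            maxOldGenerationSizeMb: MAX_WORKER_HEAP_MB
        }
    }
};

const pool = workerpool.pool(path.join(__dirname, 'FilterWorker.js'), options);
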
package/engines/execution/ExecutionEnvironment.js
CHANGED
@@ -28,7 +28,7 @@ const Algo_1 = __importDefault(require("../../core/Algo"));
 const Logger_1 = __importDefault(require("../../helper/Logger"));
 const ParallelDataset_1 = __importDefault(require("../dataset/ParallelDataset"));
 class ExecutionEnvironment {
-    constructor(consumer) {
+    constructor(consumer, executionId) {
         this.run = (options) => __awaiter(this, void 0, void 0, function* () {
             var _a, _b, _c, _d;
             (0, Affirm_1.default)(this._consumer, 'Invalid consumer');
@@ -64,7 +64,7 @@
                 (0, Affirm_1.default)(planStep.producer, `Invalid producer in execute-SQL step`);
                 const driver = yield DriverFactory_1.default.instantiateSource(planStep.source);
                 const queryData = (yield driver.query(this._envData.finalSQL)).rows;
-                let dataset = DatasetManager_1.default.create(planStep.producer);
+                let dataset = DatasetManager_1.default.create(planStep.producer, this._executionId);
                 dataset = yield dataset.loadFromMemory(queryData, planStep.producer);
                 this._storeIntermidiate(planStep, dataset);
                 break;
@@ -74,7 +74,7 @@
                 const { producer } = planStep;
                 const source = Environment_1.default.getSource(producer.source);
                 (0, Affirm_1.default)(source, `Source "${producer.source}" of producer "${producer.name}" not found.`);
-                let dataset = DatasetManager_1.default.create(producer);
+                let dataset = DatasetManager_1.default.create(producer, this._executionId);
                 dataset = yield dataset.load(source);
                 this._storeIntermidiate(planStep, dataset);
                 break;
@@ -102,7 +102,7 @@
             case 'export-file': {
                 (0, Affirm_1.default)(planStep.output, `Invalid output in export-file step`);
                 (0, Affirm_1.default)(this._resultingDataset, 'Invalid resulting dataset in export-file step');
-                const res = yield FileExporter_1.default.export(this._consumer, planStep.output, this._resultingDataset);
+                const res = yield FileExporter_1.default.export(this._consumer, planStep.output, this._resultingDataset, this._executionId);
                 result.fileUri = res;
                 break;
             }
@@ -156,6 +156,13 @@
             if (ds)
                 Logger_1.default.log(`Failed execution of consumer at step ${currentStep.type}:\n\tSize: ${ds.getCount()}\n\tCycles: ${ds.getCycles()}\n\tOperations: ${Logger_1.default.formatList(ds.getOperations())}`);
             Logger_1.default.log(`\tFailed step: ${currentStep.type}->\n\t${error}`);
+            // IMPORTANT: cleanup all the datasets to not leave any data around and to avoid memory leaks
+            const datasets = [
+                ...this._producedData.map(x => x.dataset),
+                this._resultingDataset
+            ].filter(Algo_1.default.hasVal);
+            const promises = datasets.map(x => x.destroy());
+            yield Promise.all(promises);
             throw error;
         }
         Logger_1.default.log(`Completed execution of consumer:\n\tSize: ${result._stats.size}\n\tCycles: ${result._stats.cycles}\n\tTime: ${result._stats.elapsedMS}\n\tOperations: ${Logger_1.default.formatList(result._stats.operations)}`);
@@ -184,6 +191,8 @@
         this._envData = { consumerSQL: null, executionRequestSQL: null, finalSQL: null };
         this._producedData = [];
         this._resultingDataset = null;
+        // A short unique id to isolate temp dataset files & output names
+        this._executionId = executionId;
     }
 }
 exports.default = ExecutionEnvironment;
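
Note: the failure path now destroys every intermediate dataset before re-throwing, so aborted runs do not leave temp files or retained memory behind. The same pattern reduced to plain async/await, with purely illustrative names:

// Illustrative only: run steps and guarantee dataset cleanup on failure.
async function runWithCleanup(steps, datasets) {
    try {
        for (const step of steps) {
            await step();
        }
    }
    catch (error) {
        // Destroy every produced/resulting dataset, then surface the original error.
        await Promise.all(datasets.filter(Boolean).map(ds => ds.destroy()));
        throw error;
    }
}
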
package/engines/execution/ExecutionPlanner.js
CHANGED
@@ -108,7 +108,8 @@ class ExecutionPlannerClas {
                 break;
             }
             case 'local':
-            case 'aws-s3':
+            case 'aws-s3':
+            case 'delta-share': {
                 plan.push({ type: 'load-dataset', producer });
                 plan.push({ type: 'prepare-dataset', producer });
                 if (producer.dimensions.some(x => { var _a, _b; return ((_a = x.alias) === null || _a === void 0 ? void 0 : _a.includes('{')) || ((_b = x.alias) === null || _b === void 0 ? void 0 : _b.includes('[')); }))
package/engines/file/FileCompiler.js
CHANGED
@@ -7,9 +7,10 @@ const Affirm_1 = __importDefault(require("../../core/Affirm"));
 class FileCompilerClass {
     constructor() {
         this.compileProducer = (producer, source) => {
+            var _a;
             (0, Affirm_1.default)(producer, `Invalid producer`);
             (0, Affirm_1.default)(source, `Invalid source`);
-            (0, Affirm_1.default)(producer.settings.fileKey, `Missing required file key in producer settings`);
+            (0, Affirm_1.default)((_a = producer.settings.fileKey) !== null && _a !== void 0 ? _a : producer.settings.sqlTable, `Missing required file key in producer settings`);
             (0, Affirm_1.default)(producer.settings.fileType, `Missing required file type in producer settings`);
             (0, Affirm_1.default)(!producer.measures || producer.measures.length === 0, `Cannot use "measure" with a producer linked to a file (only dimensions are allowed).`);
             const columns = producer.dimensions.map(x => ({
package/engines/file/FileExporter.js
CHANGED
@@ -20,7 +20,7 @@ const DriverFactory_1 = __importDefault(require("../../drivers/DriverFactory"));
 const Environment_1 = __importDefault(require("../Environment"));
 class FileExporterClass {
     constructor() {
-        this.export = (consumer, output, dataset) => __awaiter(this, void 0, void 0, function* () {
+        this.export = (consumer, output, dataset, executionId) => __awaiter(this, void 0, void 0, function* () {
             (0, Affirm_1.default)(consumer, `Invalid consumer`);
             (0, Affirm_1.default)(output, `Invalid output`);
             (0, Affirm_1.default)(dataset, `Invalid export dataset`);
@@ -32,7 +32,7 @@ class FileExporterClass {
                 : output.format === 'JSON'
                     ? 'jsonl'
                     : 'txt';
-            const name = this._composeFileName(consumer, extension);
+            const name = this._composeFileName(consumer, output, extension, executionId);
             const uploadRes = yield driver.uploadStream({
                 dataset,
                 name,
@@ -67,7 +67,16 @@
             }
             return chunks;
         };
-        this._composeFileName = (consumer, extension) =>
+        this._composeFileName = (consumer, output, extension, executionId) => {
+            if (output.exportName && output.exportName.trim().length > 0) {
+                // Ensure no extension duplication
+                const sanitized = output.exportName.replace(/\.[^.]+$/, '');
+                return `${sanitized}.${extension}`;
+            }
+            const baseTs = Algo_1.default.replaceAll(DSTE_1.default.now().toISOString().split('.')[0], ':', '-');
+            const suffix = executionId ? `_${executionId}` : '';
+            return `${consumer.name}_${baseTs}${suffix}.${extension}`;
+        };
     }
 }
 const FileExporter = new FileExporterClass();
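
Note: the new _composeFileName gives an explicit output.exportName priority (stripping any extension it already carries) and otherwise falls back to the consumer name plus a timestamp, suffixed with the executionId when one is present. The same rule restated as a standalone function with hypothetical inputs (not the package's own export):

// Standalone restatement of the naming rule shown above.
function composeFileName(consumerName, output, extension, executionId) {
    if (output.exportName && output.exportName.trim().length > 0) {
        // Drop any extension the user already supplied so it is not duplicated.
        return `${output.exportName.replace(/\.[^.]+$/, '')}.${extension}`;
    }
    const baseTs = new Date().toISOString().split('.')[0].replace(/:/g, '-');
    const suffix = executionId ? `_${executionId}` : '';
    return `${consumerName}_${baseTs}${suffix}.${extension}`;
}

console.log(composeFileName('claims', { exportName: 'claims_latest.csv' }, 'csv'));
// -> claims_latest.csv
console.log(composeFileName('claims', {}, 'csv', 'exec-42'));
// -> claims_<timestamp>_exec-42.csv
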
package/engines/parsing/ParseManager.js
CHANGED
@@ -9,7 +9,7 @@ const FileCompiler_1 = __importDefault(require("../file/FileCompiler"));
 class ParseManagerClass {
     constructor() {
         this._extractHeader = (headerLine, delimiter, producer, discover) => {
-            var _a;
+            var _a, _b, _c;
             (0, Affirm_1.default)(headerLine, `Invalid CSV header line for producer "${producer.name}"`);
             (0, Affirm_1.default)(delimiter, 'Invalid CSV delimiter');
             (0, Affirm_1.default)(producer, 'Invalid producer');
@@ -24,7 +24,12 @@ class ParseManagerClass {
                 const columnKey = (_a = pColumn.aliasInProducer) !== null && _a !== void 0 ? _a : pColumn.nameInProducer;
                 const csvColumnIndex = headerColumns.findIndex(x => x === columnKey);
                 (0, Affirm_1.default)(csvColumnIndex > -1, `The column "${pColumn.nameInProducer}" (with key "${columnKey}") of producer "${producer.name}" doesn't exist in the underlying dataset.`);
-                csvColumns.push({
+                csvColumns.push({
+                    index: csvColumnIndex,
+                    name: columnKey,
+                    saveAs: pColumn.nameInProducer,
+                    type: (_c = (_b = pColumn.dimension) === null || _b === void 0 ? void 0 : _b.type) !== null && _c !== void 0 ? _c : 'string'
+                });
             }
             return csvColumns;
         };
package/engines/producer/ProducerEngine.js
CHANGED
@@ -34,7 +34,8 @@ class ProducerEngineClass {
                 (0, Affirm_1.default)(sql, `Invalid SQL from compilation for producer "${producer.name}"`);
                 return sql;
             }
-            case 'aws-s3':
+            case 'aws-s3':
+            case 'delta-share': {
                 const columns = FileCompiler_1.default.compileProducer(producer, source);
                 (0, Affirm_1.default)(columns, `Invalid columns from compilation for producer "${producer.name}"`);
                 break;
@@ -141,7 +142,8 @@
                 break;
             }
             case 'local':
-            case 'aws-s3':
+            case 'aws-s3':
+            case 'delta-share': {
                 const fileData = yield this.readFile(producer, { readmode: 'lines', lines: { from: 0, to: sampleSize } });
                 dataset = yield dataset.loadFromMemory(fileData.data, producer, discover);
                 break;
package/engines/transform/JoinEngine.js
CHANGED
@@ -102,12 +102,16 @@ class JoinEngineClass {
             fileType: 'CSV'
         });
         // Get dimensions for the result dataset based on consumer columns
-        const resultDimensions = consumerColumns.map((col, index) =>
-
-
-
-
-
+        const resultDimensions = consumerColumns.map((col, index) => {
+            var _a, _b;
+            return ({
+                name: col.consumerAlias || col.consumerKey,
+                key: col.consumerAlias || col.consumerKey,
+                index,
+                type: (_b = (_a = col.dimension) === null || _a === void 0 ? void 0 : _a.type) !== null && _b !== void 0 ? _b : 'string',
+                hidden: null
+            });
+        });
         // Initialize the result dataset with proper dimensions
         resultDataset.getDimensions().length = 0;
         resultDataset.getDimensions().push(...resultDimensions);
package/engines/transform/TransformationEngine.js
CHANGED
@@ -22,7 +22,7 @@ class TransformationEngineClass {
             (0, Affirm_1.default)(dataset, 'Invalid data');
             const fieldsToTransform = consumer.fields.filter(field => Algo_1.default.hasVal(field.transform));
             Affirm_1.default.hasItems(fieldsToTransform, 'No fields with transformations');
-
+            yield dataset.map(record => {
                 var _a;
                 for (const field of fieldsToTransform) {
                     if (!field.transform)
@@ -54,6 +54,17 @@
                 }
                 return record;
             }, options);
+            /**
+             * Some transformations (for now only "cast") change the underlying type of the dataset dimension
+             * Here I update the dimension type of the dataset.
+             * TODO: I think that we may have a bug if you cast AND then do an operation on the number, since it reverts back to being a string in the same trnasformation chain, since the dimension type update is applied only at the end of all the transformations
+             */
+            for (const field of fieldsToTransform) {
+                if (!field.transform)
+                    continue;
+                this.applyDimensionsChanges(field.transform, field, dataset);
+            }
+            return dataset;
         });
         this.isFieldCombinationTransformation = (transformation) => {
             if (Array.isArray(transformation)) {
@@ -77,7 +88,7 @@
             const casted = TypeCaster_1.default.cast(value, cast, format);
             if (cast === 'number' && isNaN(casted))
                 throw new Error(`Cannot cast non-numeric value in field '${field.key}'`);
-            if (cast === '
+            if (cast === 'datetime' && casted instanceof Date && isNaN(casted.getTime()))
                 throw new Error(`Cannot cast value to date in field '${field.key}'`);
             return casted;
         }
@@ -263,6 +274,24 @@
             }
             return false;
         };
+        this.applyDimensionsChanges = (transformations, field, dataset) => {
+            if (Array.isArray(transformations)) {
+                for (const transform of transformations) {
+                    this.applyDimensionsChanges(transform, field, dataset);
+                }
+                return dataset;
+            }
+            // Single transformation
+            if ('cast' in transformations) {
+                const { cast } = transformations;
+                let oldDimension = dataset.getDimensions().find(x => x.name === field.key);
+                if (!oldDimension)
+                    oldDimension = dataset.getDimensions().find(x => x.key === field.key);
+                const newDimension = Object.assign(Object.assign({}, structuredClone(oldDimension)), { type: cast });
+                dataset.setSingleDimension(newDimension, oldDimension);
+            }
+            return dataset;
+        };
     }
 }
 const TransformationEngine = new TransformationEngineClass();
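
Note: applyDimensionsChanges keeps the dataset's dimension metadata in sync after a "cast" transform, so downstream readers parse the column with its new type. A reduced sketch of that bookkeeping (illustrative shapes, not the package's API):

// Illustrative: after casting a field, update the matching dimension's type.
function applyCastToDimensions(dimensions, fieldKey, cast) {
    return dimensions.map(dim =>
        (dim.name === fieldKey || dim.key === fieldKey) ? { ...dim, type: cast } : dim
    );
}

const dims = [{ key: 'amount', name: 'amount', index: 0, hidden: null, type: 'string' }];
console.log(applyCastToDimensions(dims, 'amount', 'number'));
// -> [{ key: 'amount', name: 'amount', index: 0, hidden: null, type: 'number' }]
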
package/engines/usage/UsageDataManager.js
ADDED
@@ -0,0 +1,110 @@
+"use strict";
+var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
+    function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
+    return new (P || (P = Promise))(function (resolve, reject) {
+        function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
+        function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
+        function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
+        step((generator = generator.apply(thisArg, _arguments || [])).next());
+    });
+};
+var __importDefault = (this && this.__importDefault) || function (mod) {
+    return (mod && mod.__esModule) ? mod : { "default": mod };
+};
+Object.defineProperty(exports, "__esModule", { value: true });
+const DSTE_1 = __importDefault(require("../../core/dste/DSTE"));
+const DatabaseEngine_1 = __importDefault(require("../../database/DatabaseEngine"));
+const DataframeManager_1 = __importDefault(require("./DataframeManager"));
+class UsageDataManager {
+    getUsageDetails() {
+        return __awaiter(this, void 0, void 0, function* () {
+            const now = DSTE_1.default.now();
+            const from = new Date(now.getTime() - 30 * 24 * 60 * 60 * 1000);
+            const prevMonthFrom = new Date(now.getTime() - 60 * 24 * 60 * 60 * 1000);
+            const yearAgo = new Date(now.getFullYear(), now.getMonth() - 11, 1);
+            const collection = 'usage';
+            // Aggregate status counts for current and previous month
+            const getStatusCounts = (start, end) => __awaiter(this, void 0, void 0, function* () {
+                const results = yield DatabaseEngine_1.default.aggregate(collection, [
+                    { $match: { startedAt: { $gte: start, $lte: end } } },
+                    { $group: { _id: '$status', count: { $sum: 1 } } }
+                ]);
+                let success = 0, failed = 0, total = 0;
+                results.forEach(r => {
+                    total += r.count;
+                    if (r._id === 'success')
+                        success = r.count;
+                    if (r._id === 'failed')
+                        failed = r.count;
+                });
+                return { total, success, failed };
+            });
+            const statusesRequests = yield getStatusCounts(from, now);
+            const prevStatusesRequests = yield getStatusCounts(prevMonthFrom, from);
+            // Monthly success and fails for last 12 months
+            const monthlySuccessPipeline = [
+                { $match: { status: 'success', startedAt: { $gte: yearAgo, $lte: now } } },
+                { $addFields: { year: { $year: '$startedAt' }, month: { $month: '$startedAt' } } },
+                { $group: { _id: { year: '$year', month: '$month' }, count: { $sum: 1 } } },
+                { $project: { _id: 0, x: { $concat: [{ $toString: '$_id.year' }, '-', { $toString: '$_id.month' }] }, y: '$count' } },
+                { $sort: { x: 1 } }
+            ];
+            const monthlyFailsPipeline = [
+                { $match: { status: 'failed', startedAt: { $gte: yearAgo, $lte: now } } },
+                { $addFields: { year: { $year: '$startedAt' }, month: { $month: '$startedAt' } } },
+                { $group: { _id: { year: '$year', month: '$month' }, count: { $sum: 1 } } },
+                { $project: { _id: 0, x: { $concat: [{ $toString: '$_id.year' }, '-', { $toString: '$_id.month' }] }, y: '$count' } },
+                { $sort: { x: 1 } }
+            ];
+            const rawMonthlySuccess = yield DatabaseEngine_1.default.aggregate(collection, monthlySuccessPipeline);
+            const rawMonthlyFails = yield DatabaseEngine_1.default.aggregate(collection, monthlyFailsPipeline);
+            // Top lines per month for last 12 months
+            const topLinesPipeline = [
+                { $match: { startedAt: { $gte: yearAgo, $lte: now } } },
+                { $addFields: { year: { $year: '$startedAt' }, month: { $month: '$startedAt' } } },
+                { $group: { _id: { year: '$year', month: '$month' }, itemsCount: { $max: '$itemsCount' } } },
+                { $project: { _id: 0, x: { $concat: [{ $toString: '$_id.year' }, '-', { $toString: '$_id.month' }] }, y: '$itemsCount' } },
+                { $sort: { x: 1 } }
+            ];
+            const topLines = yield DatabaseEngine_1.default.aggregate(collection, topLinesPipeline);
+            // Top times per month for last 12 months
+            const topTimePipeline = [
+                { $match: { startedAt: { $gte: yearAgo, $lte: now } } },
+                { $addFields: { durationMs: { $subtract: ['$finishedAt', '$startedAt'] }, year: { $year: '$startedAt' }, month: { $month: '$startedAt' } } },
+                { $group: { _id: { year: '$year', month: '$month' }, maxDuration: { $max: '$durationMs' } } },
+                { $project: { _id: 0, x: { $concat: [{ $toString: '$_id.year' }, '-', { $toString: '$_id.month' }] }, y: '$maxDuration' } },
+                { $sort: { x: 1 } }
+            ];
+            const topTime = yield DatabaseEngine_1.default.aggregate(collection, topTimePipeline);
+            // Monthly consumers: for each consumer, per month count
+            const consumerPipeline = [
+                { $match: { startedAt: { $gte: yearAgo, $lte: now } } },
+                { $addFields: { year: { $year: '$startedAt' }, month: { $month: '$startedAt' } } },
+                { $group: { _id: { consumer: '$consumer', year: '$year', month: '$month' }, count: { $sum: 1 } } },
+                { $project: { _id: 0, consumer: '$_id.consumer', x: { $concat: [{ $toString: '$_id.year' }, '-', { $toString: '$_id.month' }] }, y: '$count' } },
+                { $sort: { consumer: 1, x: 1 } }
+            ];
+            const consumersData = yield DatabaseEngine_1.default.aggregate(collection, consumerPipeline);
+            // transform to consumer array
+            const consumerMap = {};
+            consumersData.forEach(r => {
+                consumerMap[r.consumer] = consumerMap[r.consumer] || [];
+                consumerMap[r.consumer].push({ x: r.x, y: r.y });
+            });
+            const consumers = Object.entries(consumerMap).map(([name, data]) => ({ name, data: DataframeManager_1.default.fill(data !== null && data !== void 0 ? data : [], yearAgo, now) }));
+            // Recent executions
+            const recentExecution = yield DatabaseEngine_1.default.query(collection, { startedAt: { $gte: from, $lte: now } }, { sort: { startedAt: -1 }, limit: 10 });
+            return {
+                statusesRequests,
+                prevStatusesRequests,
+                monthlySuccess: DataframeManager_1.default.fill(rawMonthlySuccess !== null && rawMonthlySuccess !== void 0 ? rawMonthlySuccess : [], yearAgo, now),
+                monthlyFails: DataframeManager_1.default.fill(rawMonthlyFails !== null && rawMonthlyFails !== void 0 ? rawMonthlyFails : [], yearAgo, now),
+                consumers: consumers,
+                topLine: DataframeManager_1.default.fill(topLines !== null && topLines !== void 0 ? topLines : [], yearAgo, now),
+                topTime: DataframeManager_1.default.fill(topTime !== null && topTime !== void 0 ? topTime : [], yearAgo, now),
+                recentExecution
+            };
+        });
+    }
+}
+exports.default = new UsageDataManager();
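
Note: the pipelines above are MongoDB-style aggregation stages ($match -> $addFields -> $group -> $project -> $sort) run through the package's DatabaseEngine. For reference, the monthly-success pipeline yields the same { x: 'YYYY-M', y: count } rows when run directly with the official mongodb driver; the connection details below are placeholders, not anything the package exposes:

const { MongoClient } = require('mongodb');

// Placeholder connection details; the package goes through DatabaseEngine instead.
async function monthlySuccess(uri, dbName, from, to) {
    const client = await MongoClient.connect(uri);
    try {
        return await client.db(dbName).collection('usage').aggregate([
            { $match: { status: 'success', startedAt: { $gte: from, $lte: to } } },
            { $addFields: { year: { $year: '$startedAt' }, month: { $month: '$startedAt' } } },
            { $group: { _id: { year: '$year', month: '$month' }, count: { $sum: 1 } } },
            { $project: { _id: 0, x: { $concat: [{ $toString: '$_id.year' }, '-', { $toString: '$_id.month' }] }, y: '$count' } },
            { $sort: { x: 1 } }
        ]).toArray();
    }
    finally {
        await client.close();
    }
}
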
package/package.json
CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "@forzalabs/remora",
-  "version": "0.1.4-nasco.3",
+  "version": "0.1.5-nasco.3",
   "description": "A powerful CLI tool for seamless data translation.",
   "main": "index.js",
   "private": false,
@@ -47,6 +47,7 @@
     "dotenv": "^16.0.3",
     "fast-xml-parser": "^5.2.3",
     "fs-extra": "^11.1.0",
+    "hyparquet": "^1.17.4",
     "inquirer": "^8.2.5",
     "json-schema": "^0.4.0",
     "jsonwebtoken": "^9.0.2",