@forzalabs/remora 0.1.3-nasco.3 → 0.1.5-nasco.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. package/Constants.js +1 -1
  2. package/definitions/json_schemas/consumer-schema.json +9 -1
  3. package/definitions/json_schemas/producer-schema.json +2 -1
  4. package/definitions/json_schemas/source-schema.json +14 -1
  5. package/documentation/README.md +1 -0
  6. package/documentation/default_resources/consumer.json +7 -7
  7. package/drivers/DeltaShareDriver.js +178 -0
  8. package/drivers/DriverFactory.js +6 -0
  9. package/drivers/DriverHelper.js +16 -1
  10. package/drivers/LocalDriver.js +1 -0
  11. package/drivers/S3Driver.js +1 -0
  12. package/engines/ai/DeveloperEngine.js +90 -1
  13. package/engines/consumer/ConsumerEngine.js +1 -1
  14. package/engines/consumer/PostProcessor.js +27 -18
  15. package/engines/dataset/Dataset.js +18 -7
  16. package/engines/dataset/DatasetManager.js +58 -12
  17. package/engines/dataset/DatasetRecord.js +17 -4
  18. package/engines/dataset/ParallelDataset.js +29 -7
  19. package/engines/execution/ExecutionEnvironment.js +13 -4
  20. package/engines/execution/ExecutionPlanner.js +2 -1
  21. package/engines/file/FileCompiler.js +2 -1
  22. package/engines/file/FileExporter.js +12 -3
  23. package/engines/parsing/ParseManager.js +7 -2
  24. package/engines/producer/ProducerEngine.js +4 -2
  25. package/engines/transform/JoinEngine.js +10 -6
  26. package/engines/transform/TransformationEngine.js +35 -3
  27. package/engines/transform/TypeCaster.js +20 -9
  28. package/engines/usage/UsageDataManager.js +110 -0
  29. package/engines/validation/Validator.js +0 -3
  30. package/package.json +3 -1
  31. package/workers/FilterWorker.js +3 -3
  32. package/workers/ProjectionWorker.js +3 -3
  33. package/workers/TransformWorker.js +3 -3

package/engines/dataset/Dataset.js

@@ -35,7 +35,7 @@ const Helper_1 = __importDefault(require("../../helper/Helper"));
  const Algo_1 = __importDefault(require("../../core/Algo"));
  const Environment_1 = __importDefault(require("../Environment"));
  class Dataset {
- constructor(name, file, batchSize) {
+ constructor(name, file, batchSize, executionId) {
  var _a;
  this.getPath = () => this._path;
  this.setPath = (path) => {
@@ -43,6 +43,7 @@ class Dataset {
  return this;
  };
  this.getFile = () => this._file;
+ this.getExecutionId = () => this._executionId;
  this.getBatchSize = () => this._batchSize;
  this.setFirstLine = (firstLine) => {
  this._firstLine = firstLine;
@@ -170,7 +171,7 @@ class Dataset {
  }
  }
  catch (error) {
- console.warn(`Error parsing line ${lineCount}: ${error}`);
+ console.warn(`Error parsing line ${line}\n${lineCount}: ${error}`);
  }
  }
  }
@@ -536,7 +537,7 @@ class Dataset {
  }
  }
  catch (error) {
- console.warn(`Error parsing line ${lineCount}: ${error}`);
+ console.warn(`Error parsing line ${line}\n${lineCount}: ${error}`);
  }
  }
  }
@@ -627,7 +628,7 @@ class Dataset {
  }
  }
  catch (error) {
- console.warn(`Error parsing line ${lineCount}: ${error}`);
+ console.warn(`Error parsing line ${line}\n${lineCount}: ${error}`);
  }
  }
  }
@@ -703,10 +704,18 @@ class Dataset {
  return this;
  });
  this.getDimensions = () => this._dimensions;
- this.setDimensinons = (dimensions) => {
+ this.setDimensions = (dimensions) => {
  this._dimensions = dimensions;
  return this;
  };
+ this.setSingleDimension = (newDimension, oldDimension) => {
+ (0, Affirm_1.default)(newDimension, `Invalid new dimension`);
+ (0, Affirm_1.default)(oldDimension, 'Invalid old dimension');
+ const current = this._dimensions.findIndex(x => x.index === oldDimension.index);
+ (0, Affirm_1.default)(current, `Trying to updata a dataset dimension that doesn't exist: ${oldDimension.name} index ${oldDimension.index}`);
+ this._dimensions.splice(current, 1, newDimension);
+ return this;
+ };
  /**
  * Update the record pool when dimensions change
  */
@@ -829,6 +838,7 @@ class Dataset {
  this._computeSize = () => fs_1.default.statSync(this._path).size / (1024 * 1024);
  this.name = name;
  this._file = file;
+ this._executionId = executionId;
  this._batchSize = (_a = batchSize !== null && batchSize !== void 0 ? batchSize : parseInt(Environment_1.default.get('MAX_ITEMS_IN_MEMORY'))) !== null && _a !== void 0 ? _a : Constants_1.default.defaults.MAX_ITEMS_IN_MEMORY;
  this._dimensions = [];
  this._firstLine = '';
@@ -843,8 +853,9 @@ class Dataset {
  .replace(/_{2,}/g, '_')
  .replace(/^_+|_+$/g, '')
  .toLowerCase();
- this._path = path_1.default.join('./remora', Constants_1.default.defaults.PRODUCER_TEMP_FOLDER, datasetName, '.dataset');
- this._tempPath = path_1.default.join('./remora/', Constants_1.default.defaults.PRODUCER_TEMP_FOLDER, datasetName, '.dataset_tmp');
+ const execFolder = executionId ? path_1.default.join(datasetName, executionId) : datasetName;
+ this._path = path_1.default.join('./remora', Constants_1.default.defaults.PRODUCER_TEMP_FOLDER, execFolder, '.dataset');
+ this._tempPath = path_1.default.join('./remora/', Constants_1.default.defaults.PRODUCER_TEMP_FOLDER, execFolder, '.dataset_tmp');
  this.ensureFile(this._path);
  }
  }
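
The hunks above are where the new executionId lands on disk: when one is passed, the temp dataset path gains an extra folder level named after the execution. A minimal sketch of the resulting layout, using an illustrative temp-folder name and a made-up id rather than the package's actual constants:

    const path = require('path');

    // Mirrors the constructor logic above.
    const buildDatasetPath = (datasetName, executionId) => {
        const tempFolder = 'producer_tmp'; // stand-in for Constants.defaults.PRODUCER_TEMP_FOLDER
        const execFolder = executionId ? path.join(datasetName, executionId) : datasetName;
        return path.join('./remora', tempFolder, execFolder, '.dataset');
    };

    console.log(buildDatasetPath('orders'));              // remora/producer_tmp/orders/.dataset
    console.log(buildDatasetPath('orders', 'exec-a1b2'));  // remora/producer_tmp/orders/exec-a1b2/.dataset

Two runs of the same producer with different execution ids therefore write to different .dataset files instead of sharing one, which is what the concurrency note in DatasetManager below refers to.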

package/engines/dataset/DatasetManager.js

@@ -21,9 +21,15 @@ const ParseManager_1 = __importDefault(require("../parsing/ParseManager"));
  const Dataset_1 = __importDefault(require("./Dataset"));
  const promises_1 = require("stream/promises");
  const fs_1 = require("fs");
+ const DeveloperEngine_1 = __importDefault(require("../ai/DeveloperEngine"));
  class DatasetManagerClass {
  constructor() {
- this.create = (producer) => {
+ /**
+ * Create a new Dataset for a producer. If an executionId is provided, the dataset files will
+ * be isolated inside a sub-folder specific to that execution to avoid concurrency conflicts
+ * when the same producer / consumer is executed multiple times in parallel.
+ */
+ this.create = (producer, executionId) => {
  (0, Affirm_1.default)(producer, 'Invalid producer');
  const { name, settings: { delimiter, fileKey, fileType, hasHeaderRow, sheetName } } = producer;
  const dataset = new Dataset_1.default(name, {
@@ -32,7 +38,7 @@ class DatasetManagerClass {
  hasHeaderRow,
  sheetName,
  delimiter
- });
+ }, undefined, executionId);
  return dataset;
  };
  this.buildDimensions = (dataset_1, producer_1, ...args_1) => __awaiter(this, [dataset_1, producer_1, ...args_1], void 0, function* (dataset, producer, discover = false) {
@@ -43,7 +49,7 @@ class DatasetManagerClass {
  return this.buildDimensionsFromFirstLine(firstLine, dataset.getFile(), producer, discover);
  });
  this.buildDimensionsFromFirstLine = (firstLine_1, dsFile_1, producer_1, ...args_1) => __awaiter(this, [firstLine_1, dsFile_1, producer_1, ...args_1], void 0, function* (firstLine, dsFile, producer, discover = false) {
- var _a, _b, _c, _d, _e, _f;
+ var _a, _b, _c, _d, _e, _f, _g, _h;
  (0, Affirm_1.default)(firstLine, `Invalid first line`);
  (0, Affirm_1.default)(dsFile, `Invalid dataset file`);
  (0, Affirm_1.default)(producer, `Invalid producer`);
@@ -54,10 +60,17 @@ class DatasetManagerClass {
  const headerLine = firstLine;
  const rawDimensions = ParseManager_1.default._extractHeader(headerLine, delimiterChar, producer, discover);
  return {
- dimensions: rawDimensions.map(x => ({ key: x.name, name: x.saveAs, index: x.index, hidden: null })),
+ dimensions: rawDimensions.map(x => ({
+ key: x.name,
+ name: x.saveAs,
+ index: x.index,
+ hidden: null,
+ type: x.type
+ })),
  delimiter: delimiterChar
  };
  }
+ case 'PARQUET':
  case 'JSONL':
  case 'JSON': {
  const source = Environment_1.default.getSource(producer.source);
@@ -67,7 +80,13 @@ class DatasetManagerClass {
  if (discover) {
  return {
  delimiter: (_b = file.delimiter) !== null && _b !== void 0 ? _b : ',',
- dimensions: keys.map((x, i) => ({ hidden: false, index: i, key: x, name: x }))
+ dimensions: keys.map((x, i) => ({
+ hidden: false,
+ index: i,
+ key: x,
+ name: x,
+ type: DeveloperEngine_1.default.inferDimensionType(firstObject === null || firstObject === void 0 ? void 0 : firstObject[x])
+ }))
  };
  }
  const dimensions = [];
@@ -75,34 +94,61 @@ class DatasetManagerClass {
  const columnKey = (_c = pColumn.aliasInProducer) !== null && _c !== void 0 ? _c : pColumn.nameInProducer;
  const csvColumnIndex = keys.findIndex(x => x === columnKey);
  (0, Affirm_1.default)(csvColumnIndex > -1, `The column "${pColumn.nameInProducer}" (with key "${columnKey}") of producer "${producer.name}" doesn't exist in the underlying dataset.`);
- dimensions.push({ index: csvColumnIndex, key: columnKey, name: pColumn.nameInProducer, hidden: null });
+ dimensions.push({
+ index: csvColumnIndex,
+ key: columnKey,
+ name: pColumn.nameInProducer,
+ hidden: null,
+ type: (_e = (_d = pColumn.dimension) === null || _d === void 0 ? void 0 : _d.type) !== null && _e !== void 0 ? _e : 'string'
+ });
  }
- const delimiterChar = (_d = file.delimiter) !== null && _d !== void 0 ? _d : ',';
+ const delimiterChar = (_f = file.delimiter) !== null && _f !== void 0 ? _f : ',';
  return { dimensions, delimiter: delimiterChar };
  }
  case 'TXT': {
  if (!file.hasHeaderRow) {
  // If the file is a TXT and there isn't an header row, then I add a fake one that maps directly to the producer
- const delimiterChar = (_e = file.delimiter) !== null && _e !== void 0 ? _e : ',';
+ const delimiterChar = (_g = file.delimiter) !== null && _g !== void 0 ? _g : ',';
  const source = Environment_1.default.getSource(producer.source);
  const columns = FileCompiler_1.default.compileProducer(producer, source);
  if (discover) {
  // Since I don't have an header, and I'm discovering, I just create placeholder dimensions based on the same number of columns of the txt
  return {
  delimiter: delimiterChar,
- dimensions: firstLine.split(delimiterChar).map((x, i) => ({ hidden: false, index: i, key: `Col ${i + 1}`, name: `Col ${i + 1}` }))
+ dimensions: firstLine.split(delimiterChar).map((x, i) => ({
+ hidden: false,
+ index: i,
+ key: `Col ${i + 1}`,
+ name: `Col ${i + 1}`,
+ type: 'string'
+ }))
  };
  }
  return {
- dimensions: columns.map((x, i) => { var _a; return ({ key: (_a = x.aliasInProducer) !== null && _a !== void 0 ? _a : x.nameInProducer, name: x.nameInProducer, index: i, hidden: null }); }),
+ dimensions: columns.map((x, i) => {
+ var _a, _b, _c;
+ return ({
+ key: (_a = x.aliasInProducer) !== null && _a !== void 0 ? _a : x.nameInProducer,
+ name: x.nameInProducer,
+ index: i,
+ hidden: null,
+ type: (_c = (_b = x.dimension) === null || _b === void 0 ? void 0 : _b.type) !== null && _c !== void 0 ? _c : 'string'
+ });
+ }),
  delimiter: delimiterChar
  };
  }
  else {
- const delimiterChar = (_f = producer.settings.delimiter) !== null && _f !== void 0 ? _f : ',';
+ const delimiterChar = (_h = producer.settings.delimiter) !== null && _h !== void 0 ? _h : ',';
  const rawDimensions = ParseManager_1.default._extractHeader(firstLine, delimiterChar, producer, discover);
  return {
- dimensions: rawDimensions.map(x => ({ key: x.name, name: x.saveAs, index: x.index, hidden: null })),
+ dimensions: rawDimensions.map(x => ({
+ key: x.name,
+ name: x.saveAs,
+ index: x.index,
+ hidden: null,
+ type: x.type
+ })),
  delimiter: delimiterChar
  };
  }

package/engines/dataset/DatasetRecord.js

@@ -4,6 +4,7 @@ var __importDefault = (this && this.__importDefault) || function (mod) {
  };
  Object.defineProperty(exports, "__esModule", { value: true });
  const Algo_1 = __importDefault(require("../../core/Algo"));
+ const TypeCaster_1 = __importDefault(require("../transform/TypeCaster"));
  class DatasetRecord {
  constructor(row, dimensions, delimiter) {
  this.parse = (row, delimiter, dimensions) => {
@@ -11,7 +12,7 @@ class DatasetRecord {
  const parts = row.split(delimiter);
  for (let i = 0; i < dimensions.length; i++) {
  const dim = dimensions[i];
- this._value[dim.name] = parts[i];
+ this._value[dim.name] = TypeCaster_1.default.cast(parts[i], dim.type);
  }
  }
  };
@@ -35,7 +36,7 @@ class DatasetRecord {
  this.parse(row, delimiter, this._dimensions);
  };
  this.wholeUpdateDimension = (update) => {
- var _a;
+ var _a, _b, _c, _d, _e;
  if (update.toDelete) {
  // To remove
  delete this._value[update.currentDimension.name];
@@ -46,7 +47,13 @@ class DatasetRecord {
  }
  else if (!update.currentDimension) {
  // To create (at the right position)
- const newDimension = { index: update.newPosition, key: update.newName, name: update.newName, hidden: update.newHidden };
+ const newDimension = {
+ index: update.newPosition,
+ key: update.newName,
+ name: update.newName,
+ hidden: update.newHidden,
+ type: (_b = (_a = update.currentDimension) === null || _a === void 0 ? void 0 : _a.type) !== null && _b !== void 0 ? _b : 'string'
+ };
  this._value[newDimension.name] = null;
  this._dimensions = [...this._dimensions, newDimension];
  }
@@ -56,7 +63,13 @@ class DatasetRecord {
  if (index < 0)
  index = this._dimensions.findIndex(x => x.key === update.currentDimension.key);
  const currentDim = this._dimensions[index];
- const updatedDim = { name: update.newName, key: (_a = currentDim.key) !== null && _a !== void 0 ? _a : update.newName, hidden: update.newHidden, index: update.newPosition };
+ const updatedDim = {
+ name: update.newName,
+ key: (_c = currentDim.key) !== null && _c !== void 0 ? _c : update.newName,
+ hidden: update.newHidden,
+ index: update.newPosition,
+ type: (_e = (_d = update.currentDimension) === null || _d === void 0 ? void 0 : _d.type) !== null && _e !== void 0 ? _e : 'string'
+ };
  this._value[updatedDim.name] = this._value[currentDim.name];
  if (updatedDim.name !== currentDim.name)
  delete this._value[currentDim.name];
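
With dimensions now carrying a type, DatasetRecord.parse converts each raw CSV part through TypeCaster instead of keeping everything as strings. The real TypeCaster handles more cases (see the TypeCaster.js entry in the file list); the stand-in below only illustrates the shape of the per-dimension cast:

    // Hypothetical stand-in for TypeCaster.cast(value, type), for illustration only.
    const cast = (value, type) => {
        if (type === 'number') return Number(value);
        if (type === 'boolean') return value === 'true';
        if (type === 'datetime') return new Date(value);
        return value; // 'string' and unknown types pass through unchanged
    };

    const dimensions = [
        { name: 'id', type: 'number' },
        { name: 'created_at', type: 'datetime' },
        { name: 'label', type: 'string' }
    ];

    const parts = '42,2024-01-31,foo'.split(',');
    const record = {};
    dimensions.forEach((dim, i) => { record[dim.name] = cast(parts[i], dim.type); });
    // record.id === 42 (a number), record.created_at is a Date, record.label stays 'foo'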

package/engines/dataset/ParallelDataset.js

@@ -20,9 +20,29 @@ const DatasetManager_1 = __importDefault(require("./DatasetManager"));
  const path_1 = __importDefault(require("path"));
  class ParallelDatasetClass {
  constructor() {
+ this.init = () => {
+ /**
+ * I need the init to be called after all the setup has been completed because I need the .env to be loaded
+ */
+ if (!this._filterPool || !this._projectionPool || !this._transformPool) {
+ const options = {
+ workerThreadOpts: {
+ resourceLimits: {
+ maxOldGenerationSizeMb: Constants_1.default.defaults.MIN_RUNTIME_HEAP_MB
+ }
+ }
+ };
+ const workerPath = this._getWorkerPath();
+ this._filterPool = workerpool_1.default.pool(path_1.default.join(workerPath, 'FilterWorker.js'), options);
+ this._projectionPool = workerpool_1.default.pool(path_1.default.join(workerPath, 'ProjectionWorker.js'), options);
+ this._transformPool = workerpool_1.default.pool(path_1.default.join(workerPath, 'TransformWorker.js'), options);
+ }
+ };
  this._getWorkerPath = () => {
  // Get the current file's directory
  const currentDir = __dirname;
+ if (process.env.NODE_ENV === 'dev' || process.env.NODE_ENV === 'development')
+ return path_1.default.resolve('./.build/workers');
  // Check if we're in a published npm package (no .build in path)
  if (!currentDir.includes('.build')) {
  // We're in the published package, workers are relative to package root
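
Pool creation has moved out of the constructor into an idempotent init(), as the comment above explains, so the pools are only built after the .env has been loaded and the heap limit is known. A generic sketch of the same lazy-initialization pattern, with an illustrative heap cap and worker path rather than the package's constants:

    const workerpool = require('workerpool');
    const path = require('path');

    class LazyPools {
        init() {
            // Safe to call before every operation: it only builds the pool once.
            if (this._filterPool) return;
            const options = {
                workerThreadOpts: {
                    resourceLimits: { maxOldGenerationSizeMb: 512 } // illustrative heap cap
                }
            };
            this._filterPool = workerpool.pool(path.join(__dirname, 'FilterWorker.js'), options);
        }
    }

Each public operation calls init() before touching a pool, which is why this.init() now appears at the top of filter, projection and transform in the hunks below.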
@@ -47,6 +67,7 @@ class ParallelDatasetClass {
  this.filter = (dataset, filters) => __awaiter(this, void 0, void 0, function* () {
  (0, Affirm_1.default)(dataset, `Invalid dataset`);
  (0, Affirm_1.default)(filters, `Invalid filters`);
+ this.init();
  // Distribute the work of the filter among the various workers, trying to have them match the batch size
  const { adjustedWorkerCount, workerCount } = this._scopeWork(dataset);
  dataset._startOperation('filter-parallel', { workerCount });
@@ -60,6 +81,7 @@ class ParallelDatasetClass {
  const workerData = {
  datasetDimensions: dataset.getDimensions(),
  datasetFile: dataset.getFile(),
+ executionId: dataset.getExecutionId(),
  datasetName: dataset.name,
  datasetDelimiter: dataset.getDelimiter(),
  fromLine: fromLine,
@@ -77,13 +99,14 @@ class ParallelDatasetClass {
  yield DatasetManager_1.default.mergeWorkersPaths(results.map(x => x.datasetPath), dataset);
  dataset
  .setDelimiter(results[0].datasetDelimiter)
- .setDimensinons(results[0].datasetDimensions);
+ .setDimensions(results[0].datasetDimensions);
  dataset._finishOperation('filter-parallel');
  return dataset;
  });
  this.projection = (dataset, consumer) => __awaiter(this, void 0, void 0, function* () {
  (0, Affirm_1.default)(dataset, `Invalid dataset`);
  (0, Affirm_1.default)(consumer, `Invalid consumer`);
+ this.init();
  const { adjustedWorkerCount, workerCount } = this._scopeWork(dataset);
  dataset._startOperation('projection-parallel', { workerCount });
  const threads = [];
@@ -96,6 +119,7 @@ class ParallelDatasetClass {
  const workerData = {
  datasetDimensions: dataset.getDimensions(),
  datasetFile: dataset.getFile(),
+ executionId: dataset.getExecutionId(),
  datasetName: dataset.name,
  datasetDelimiter: dataset.getDelimiter(),
  fromLine: fromLine,
@@ -111,13 +135,14 @@ class ParallelDatasetClass {
  yield DatasetManager_1.default.mergeWorkersPaths(results.map(x => x.datasetPath), dataset);
  dataset
  .setDelimiter(results[0].datasetDelimiter)
- .setDimensinons(results[0].datasetDimensions);
+ .setDimensions(results[0].datasetDimensions);
  dataset._finishOperation('projection-parallel');
  return dataset;
  });
  this.transform = (dataset, consumer) => __awaiter(this, void 0, void 0, function* () {
  (0, Affirm_1.default)(dataset, `Invalid dataset`);
  (0, Affirm_1.default)(consumer, `Invalid consumer`);
+ this.init();
  const { adjustedWorkerCount, workerCount } = this._scopeWork(dataset);
  dataset._startOperation('transform-parallel', { workerCount });
  const threads = [];
@@ -130,6 +155,7 @@ class ParallelDatasetClass {
  const workerData = {
  datasetDimensions: dataset.getDimensions(),
  datasetFile: dataset.getFile(),
+ executionId: dataset.getExecutionId(),
  datasetName: dataset.name,
  datasetDelimiter: dataset.getDelimiter(),
  fromLine: fromLine,
@@ -145,14 +171,10 @@ class ParallelDatasetClass {
  yield DatasetManager_1.default.mergeWorkersPaths(results.map(x => x.datasetPath), dataset);
  dataset
  .setDelimiter(results[0].datasetDelimiter)
- .setDimensinons(results[0].datasetDimensions);
+ .setDimensions(results[0].datasetDimensions);
  dataset._finishOperation('transform-parallel');
  return dataset;
  });
- const workerPath = this._getWorkerPath();
- this._filterPool = workerpool_1.default.pool(path_1.default.join(workerPath, 'FilterWorker.js'));
- this._projectionPool = workerpool_1.default.pool(path_1.default.join(workerPath, 'ProjectionWorker.js'));
- this._transformPool = workerpool_1.default.pool(path_1.default.join(workerPath, 'TransformWorker.js'));
  }
  }
  const ParallelDataset = new ParallelDatasetClass();

package/engines/execution/ExecutionEnvironment.js

@@ -28,7 +28,7 @@ const Algo_1 = __importDefault(require("../../core/Algo"));
  const Logger_1 = __importDefault(require("../../helper/Logger"));
  const ParallelDataset_1 = __importDefault(require("../dataset/ParallelDataset"));
  class ExecutionEnvironment {
- constructor(consumer) {
+ constructor(consumer, executionId) {
  this.run = (options) => __awaiter(this, void 0, void 0, function* () {
  var _a, _b, _c, _d;
  (0, Affirm_1.default)(this._consumer, 'Invalid consumer');
@@ -64,7 +64,7 @@ class ExecutionEnvironment {
  (0, Affirm_1.default)(planStep.producer, `Invalid producer in execute-SQL step`);
  const driver = yield DriverFactory_1.default.instantiateSource(planStep.source);
  const queryData = (yield driver.query(this._envData.finalSQL)).rows;
- let dataset = DatasetManager_1.default.create(planStep.producer);
+ let dataset = DatasetManager_1.default.create(planStep.producer, this._executionId);
  dataset = yield dataset.loadFromMemory(queryData, planStep.producer);
  this._storeIntermidiate(planStep, dataset);
  break;
@@ -74,7 +74,7 @@ class ExecutionEnvironment {
  const { producer } = planStep;
  const source = Environment_1.default.getSource(producer.source);
  (0, Affirm_1.default)(source, `Source "${producer.source}" of producer "${producer.name}" not found.`);
- let dataset = DatasetManager_1.default.create(producer);
+ let dataset = DatasetManager_1.default.create(producer, this._executionId);
  dataset = yield dataset.load(source);
  this._storeIntermidiate(planStep, dataset);
  break;
@@ -102,7 +102,7 @@ class ExecutionEnvironment {
  case 'export-file': {
  (0, Affirm_1.default)(planStep.output, `Invalid output in export-file step`);
  (0, Affirm_1.default)(this._resultingDataset, 'Invalid resulting dataset in export-file step');
- const res = yield FileExporter_1.default.export(this._consumer, planStep.output, this._resultingDataset);
+ const res = yield FileExporter_1.default.export(this._consumer, planStep.output, this._resultingDataset, this._executionId);
  result.fileUri = res;
  break;
  }
@@ -156,6 +156,13 @@ class ExecutionEnvironment {
  if (ds)
  Logger_1.default.log(`Failed execution of consumer at step ${currentStep.type}:\n\tSize: ${ds.getCount()}\n\tCycles: ${ds.getCycles()}\n\tOperations: ${Logger_1.default.formatList(ds.getOperations())}`);
  Logger_1.default.log(`\tFailed step: ${currentStep.type}->\n\t${error}`);
+ // IMPORTANT: cleanup all the datasets to not leave any data around and to avoid memory leaks
+ const datasets = [
+ ...this._producedData.map(x => x.dataset),
+ this._resultingDataset
+ ].filter(Algo_1.default.hasVal);
+ const promises = datasets.map(x => x.destroy());
+ yield Promise.all(promises);
  throw error;
  }
  Logger_1.default.log(`Completed execution of consumer:\n\tSize: ${result._stats.size}\n\tCycles: ${result._stats.cycles}\n\tTime: ${result._stats.elapsedMS}\n\tOperations: ${Logger_1.default.formatList(result._stats.operations)}`);
@@ -184,6 +191,8 @@ class ExecutionEnvironment {
  this._envData = { consumerSQL: null, executionRequestSQL: null, finalSQL: null };
  this._producedData = [];
  this._resultingDataset = null;
+ // A short unique id to isolate temp dataset files & output names
+ this._executionId = executionId;
  }
  }
  exports.default = ExecutionEnvironment;
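
Taken together, these changes thread one executionId from the ExecutionEnvironment down into dataset temp folders, worker payloads and exported file names. A hedged sketch of how a caller could now run the same consumer twice in parallel; the import path and run options are assumptions, not the package's documented API:

    const crypto = require('crypto');
    // Assumed import path; the package may expose ExecutionEnvironment differently.
    const ExecutionEnvironment = require('@forzalabs/remora/engines/execution/ExecutionEnvironment').default;

    const runTwice = async (consumer) => {
        // Each run gets its own id, so temp datasets and output names never collide.
        const envA = new ExecutionEnvironment(consumer, crypto.randomUUID());
        const envB = new ExecutionEnvironment(consumer, crypto.randomUUID());
        await Promise.all([envA.run({}), envB.run({})]);
    };

The new error path above also destroys every intermediate dataset before rethrowing, so a failed run no longer leaves its per-execution folder full of partial files.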

package/engines/execution/ExecutionPlanner.js

@@ -108,7 +108,8 @@ class ExecutionPlannerClas {
  break;
  }
  case 'local':
- case 'aws-s3': {
+ case 'aws-s3':
+ case 'delta-share': {
  plan.push({ type: 'load-dataset', producer });
  plan.push({ type: 'prepare-dataset', producer });
  if (producer.dimensions.some(x => { var _a, _b; return ((_a = x.alias) === null || _a === void 0 ? void 0 : _a.includes('{')) || ((_b = x.alias) === null || _b === void 0 ? void 0 : _b.includes('[')); }))

package/engines/file/FileCompiler.js

@@ -7,9 +7,10 @@ const Affirm_1 = __importDefault(require("../../core/Affirm"));
  class FileCompilerClass {
  constructor() {
  this.compileProducer = (producer, source) => {
+ var _a;
  (0, Affirm_1.default)(producer, `Invalid producer`);
  (0, Affirm_1.default)(source, `Invalid source`);
- (0, Affirm_1.default)(producer.settings.fileKey, `Missing required file key in producer settings`);
+ (0, Affirm_1.default)((_a = producer.settings.fileKey) !== null && _a !== void 0 ? _a : producer.settings.sqlTable, `Missing required file key in producer settings`);
  (0, Affirm_1.default)(producer.settings.fileType, `Missing required file type in producer settings`);
  (0, Affirm_1.default)(!producer.measures || producer.measures.length === 0, `Cannot use "measure" with a producer linked to a file (only dimensions are allowed).`);
  const columns = producer.dimensions.map(x => ({

package/engines/file/FileExporter.js

@@ -20,7 +20,7 @@ const DriverFactory_1 = __importDefault(require("../../drivers/DriverFactory"));
  const Environment_1 = __importDefault(require("../Environment"));
  class FileExporterClass {
  constructor() {
- this.export = (consumer, output, dataset) => __awaiter(this, void 0, void 0, function* () {
+ this.export = (consumer, output, dataset, executionId) => __awaiter(this, void 0, void 0, function* () {
  (0, Affirm_1.default)(consumer, `Invalid consumer`);
  (0, Affirm_1.default)(output, `Invalid output`);
  (0, Affirm_1.default)(dataset, `Invalid export dataset`);
@@ -32,7 +32,7 @@ class FileExporterClass {
  : output.format === 'JSON'
  ? 'jsonl'
  : 'txt';
- const name = this._composeFileName(consumer, extension);
+ const name = this._composeFileName(consumer, output, extension, executionId);
  const uploadRes = yield driver.uploadStream({
  dataset,
  name,
@@ -67,7 +67,16 @@ class FileExporterClass {
  }
  return chunks;
  };
- this._composeFileName = (consumer, extension) => `${consumer.name}_${Algo_1.default.replaceAll(DSTE_1.default.now().toISOString().split('.')[0], ':', '-')}.${extension}`;
+ this._composeFileName = (consumer, output, extension, executionId) => {
+ if (output.exportName && output.exportName.trim().length > 0) {
+ // Ensure no extension duplication
+ const sanitized = output.exportName.replace(/\.[^.]+$/, '');
+ return `${sanitized}.${extension}`;
+ }
+ const baseTs = Algo_1.default.replaceAll(DSTE_1.default.now().toISOString().split('.')[0], ':', '-');
+ const suffix = executionId ? `_${executionId}` : '';
+ return `${consumer.name}_${baseTs}${suffix}.${extension}`;
+ };
  }
  }
  const FileExporter = new FileExporterClass();
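
The new _composeFileName prefers an explicit output.exportName (stripping any extension it already carries so the real one isn't duplicated) and otherwise falls back to the timestamped name, now suffixed with the execution id. A small sketch of that naming logic in isolation, with a hard-coded date standing in for DSTE.now():

    const composeFileName = (consumerName, output, extension, executionId) => {
        if (output.exportName && output.exportName.trim().length > 0) {
            const sanitized = output.exportName.replace(/\.[^.]+$/, ''); // drop a trailing extension, if any
            return `${sanitized}.${extension}`;
        }
        const baseTs = new Date('2024-01-31T10:30:00Z').toISOString().split('.')[0].replace(/:/g, '-');
        const suffix = executionId ? `_${executionId}` : '';
        return `${consumerName}_${baseTs}${suffix}.${extension}`;
    };

    composeFileName('sales', { exportName: 'report.csv' }, 'csv');   // 'report.csv'
    composeFileName('sales', {}, 'csv', 'exec-a1b2');                // 'sales_2024-01-31T10-30-00_exec-a1b2.csv'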

package/engines/parsing/ParseManager.js

@@ -9,7 +9,7 @@ const FileCompiler_1 = __importDefault(require("../file/FileCompiler"));
  class ParseManagerClass {
  constructor() {
  this._extractHeader = (headerLine, delimiter, producer, discover) => {
- var _a;
+ var _a, _b, _c;
  (0, Affirm_1.default)(headerLine, `Invalid CSV header line for producer "${producer.name}"`);
  (0, Affirm_1.default)(delimiter, 'Invalid CSV delimiter');
  (0, Affirm_1.default)(producer, 'Invalid producer');
@@ -24,7 +24,12 @@ class ParseManagerClass {
  const columnKey = (_a = pColumn.aliasInProducer) !== null && _a !== void 0 ? _a : pColumn.nameInProducer;
  const csvColumnIndex = headerColumns.findIndex(x => x === columnKey);
  (0, Affirm_1.default)(csvColumnIndex > -1, `The column "${pColumn.nameInProducer}" (with key "${columnKey}") of producer "${producer.name}" doesn't exist in the underlying dataset.`);
- csvColumns.push({ index: csvColumnIndex, name: columnKey, saveAs: pColumn.nameInProducer });
+ csvColumns.push({
+ index: csvColumnIndex,
+ name: columnKey,
+ saveAs: pColumn.nameInProducer,
+ type: (_c = (_b = pColumn.dimension) === null || _b === void 0 ? void 0 : _b.type) !== null && _c !== void 0 ? _c : 'string'
+ });
  }
  return csvColumns;
  };

package/engines/producer/ProducerEngine.js

@@ -34,7 +34,8 @@ class ProducerEngineClass {
  (0, Affirm_1.default)(sql, `Invalid SQL from compilation for producer "${producer.name}"`);
  return sql;
  }
- case 'aws-s3': {
+ case 'aws-s3':
+ case 'delta-share': {
  const columns = FileCompiler_1.default.compileProducer(producer, source);
  (0, Affirm_1.default)(columns, `Invalid columns from compilation for producer "${producer.name}"`);
  break;
@@ -141,7 +142,8 @@ class ProducerEngineClass {
  break;
  }
  case 'local':
- case 'aws-s3': {
+ case 'aws-s3':
+ case 'delta-share': {
  const fileData = yield this.readFile(producer, { readmode: 'lines', lines: { from: 0, to: sampleSize } });
  dataset = yield dataset.loadFromMemory(fileData.data, producer, discover);
  break;

package/engines/transform/JoinEngine.js

@@ -102,12 +102,16 @@ class JoinEngineClass {
  fileType: 'CSV'
  });
  // Get dimensions for the result dataset based on consumer columns
- const resultDimensions = consumerColumns.map((col, index) => ({
- name: col.consumerAlias || col.consumerKey,
- key: col.consumerAlias || col.consumerKey,
- index,
- hidden: null
- }));
+ const resultDimensions = consumerColumns.map((col, index) => {
+ var _a, _b;
+ return ({
+ name: col.consumerAlias || col.consumerKey,
+ key: col.consumerAlias || col.consumerKey,
+ index,
+ type: (_b = (_a = col.dimension) === null || _a === void 0 ? void 0 : _a.type) !== null && _b !== void 0 ? _b : 'string',
+ hidden: null
+ });
+ });
  // Initialize the result dataset with proper dimensions
  resultDataset.getDimensions().length = 0;
  resultDataset.getDimensions().push(...resultDimensions);

package/engines/transform/TransformationEngine.js

@@ -22,7 +22,7 @@ class TransformationEngineClass {
  (0, Affirm_1.default)(dataset, 'Invalid data');
  const fieldsToTransform = consumer.fields.filter(field => Algo_1.default.hasVal(field.transform));
  Affirm_1.default.hasItems(fieldsToTransform, 'No fields with transformations');
- return yield dataset.map(record => {
+ yield dataset.map(record => {
  var _a;
  for (const field of fieldsToTransform) {
  if (!field.transform)
@@ -54,6 +54,17 @@ class TransformationEngineClass {
  }
  return record;
  }, options);
+ /**
+ * Some transformations (for now only "cast") change the underlying type of the dataset dimension
+ * Here I update the dimension type of the dataset.
+ * TODO: I think that we may have a bug if you cast AND then do an operation on the number, since it reverts back to being a string in the same trnasformation chain, since the dimension type update is applied only at the end of all the transformations
+ */
+ for (const field of fieldsToTransform) {
+ if (!field.transform)
+ continue;
+ this.applyDimensionsChanges(field.transform, field, dataset);
+ }
+ return dataset;
  });
  this.isFieldCombinationTransformation = (transformation) => {
  if (Array.isArray(transformation)) {
@@ -73,9 +84,12 @@ class TransformationEngineClass {
  }
  // Single transformation
  if ('cast' in transformations) {
- const casted = TypeCaster_1.default.cast(value, transformations.cast);
- if (isNaN(casted) && transformations.cast === 'number')
+ const { cast, format } = transformations;
+ const casted = TypeCaster_1.default.cast(value, cast, format);
+ if (cast === 'number' && isNaN(casted))
  throw new Error(`Cannot cast non-numeric value in field '${field.key}'`);
+ if (cast === 'datetime' && casted instanceof Date && isNaN(casted.getTime()))
+ throw new Error(`Cannot cast value to date in field '${field.key}'`);
  return casted;
  }
  if ('multiply' in transformations) {
@@ -260,6 +274,24 @@ class TransformationEngineClass {
  }
  return false;
  };
+ this.applyDimensionsChanges = (transformations, field, dataset) => {
+ if (Array.isArray(transformations)) {
+ for (const transform of transformations) {
+ this.applyDimensionsChanges(transform, field, dataset);
+ }
+ return dataset;
+ }
+ // Single transformation
+ if ('cast' in transformations) {
+ const { cast } = transformations;
+ let oldDimension = dataset.getDimensions().find(x => x.name === field.key);
+ if (!oldDimension)
+ oldDimension = dataset.getDimensions().find(x => x.key === field.key);
+ const newDimension = Object.assign(Object.assign({}, structuredClone(oldDimension)), { type: cast });
+ dataset.setSingleDimension(newDimension, oldDimension);
+ }
+ return dataset;
+ };
  }
  }
  const TransformationEngine = new TransformationEngineClass();
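
As the TODO above notes, a cast transformation now also rewrites the matching dataset dimension's type once all transforms for a field have run. A compact sketch of that bookkeeping in isolation, using the dimension shape seen throughout this diff (roughly what applyDimensionsChanges plus Dataset.setSingleDimension do together):

    // Swap in a copy of the cast field's dimension with its type updated.
    const applyCastToDimensions = (dimensions, fieldKey, castType) => {
        const oldDimension = dimensions.find(x => x.name === fieldKey)
            ?? dimensions.find(x => x.key === fieldKey);
        if (!oldDimension) return dimensions;
        const newDimension = { ...structuredClone(oldDimension), type: castType };
        const i = dimensions.findIndex(x => x.index === oldDimension.index);
        dimensions.splice(i, 1, newDimension);
        return dimensions;
    };

    const dims = [{ index: 0, key: 'amount', name: 'amount', hidden: null, type: 'string' }];
    applyCastToDimensions(dims, 'amount', 'number');
    // dims[0].type is now 'number', matching the values TypeCaster produces for that field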