@forzalabs/remora 0.1.4-nasco.3 → 0.1.5-nasco.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -21,9 +21,15 @@ const ParseManager_1 = __importDefault(require("../parsing/ParseManager"));
  const Dataset_1 = __importDefault(require("./Dataset"));
  const promises_1 = require("stream/promises");
  const fs_1 = require("fs");
+ const DeveloperEngine_1 = __importDefault(require("../ai/DeveloperEngine"));
  class DatasetManagerClass {
  constructor() {
- this.create = (producer) => {
+ /**
+ * Create a new Dataset for a producer. If an executionId is provided, the dataset files will
+ * be isolated inside a sub-folder specific to that execution to avoid concurrency conflicts
+ * when the same producer / consumer is executed multiple times in parallel.
+ */
+ this.create = (producer, executionId) => {
  (0, Affirm_1.default)(producer, 'Invalid producer');
  const { name, settings: { delimiter, fileKey, fileType, hasHeaderRow, sheetName } } = producer;
  const dataset = new Dataset_1.default(name, {
@@ -32,7 +38,7 @@ class DatasetManagerClass {
  hasHeaderRow,
  sheetName,
  delimiter
- });
+ }, undefined, executionId);
  return dataset;
  };
  this.buildDimensions = (dataset_1, producer_1, ...args_1) => __awaiter(this, [dataset_1, producer_1, ...args_1], void 0, function* (dataset, producer, discover = false) {
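
The executionId threads from DatasetManager.create into the Dataset constructor so that temp files for concurrent runs land in separate folders. A minimal sketch of the isolation idea; the folder layout and helper name here are illustrative, not the package's actual internals:

// Illustrative only: execution-scoped folder isolation, assuming a layout
// like <tmpRoot>/<datasetName>/<executionId>/ for temp dataset files.
const path = require('path');

const datasetFolder = (tmpRoot, datasetName, executionId) =>
    executionId
        ? path.join(tmpRoot, datasetName, executionId) // isolated per execution
        : path.join(tmpRoot, datasetName);             // shared (previous behavior)

// Two parallel runs of the same producer no longer collide:
// datasetFolder('/tmp/remora', 'orders', 'a1b2c3') !== datasetFolder('/tmp/remora', 'orders', 'd4e5f6')
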
@@ -43,7 +49,7 @@ class DatasetManagerClass {
  return this.buildDimensionsFromFirstLine(firstLine, dataset.getFile(), producer, discover);
  });
  this.buildDimensionsFromFirstLine = (firstLine_1, dsFile_1, producer_1, ...args_1) => __awaiter(this, [firstLine_1, dsFile_1, producer_1, ...args_1], void 0, function* (firstLine, dsFile, producer, discover = false) {
- var _a, _b, _c, _d, _e, _f;
+ var _a, _b, _c, _d, _e, _f, _g, _h;
  (0, Affirm_1.default)(firstLine, `Invalid first line`);
  (0, Affirm_1.default)(dsFile, `Invalid dataset file`);
  (0, Affirm_1.default)(producer, `Invalid producer`);
@@ -54,10 +60,17 @@ class DatasetManagerClass {
  const headerLine = firstLine;
  const rawDimensions = ParseManager_1.default._extractHeader(headerLine, delimiterChar, producer, discover);
  return {
- dimensions: rawDimensions.map(x => ({ key: x.name, name: x.saveAs, index: x.index, hidden: null })),
+ dimensions: rawDimensions.map(x => ({
+ key: x.name,
+ name: x.saveAs,
+ index: x.index,
+ hidden: null,
+ type: x.type
+ })),
  delimiter: delimiterChar
  };
  }
+ case 'PARQUET':
  case 'JSONL':
  case 'JSON': {
  const source = Environment_1.default.getSource(producer.source);
@@ -67,7 +80,13 @@ class DatasetManagerClass {
  if (discover) {
  return {
  delimiter: (_b = file.delimiter) !== null && _b !== void 0 ? _b : ',',
- dimensions: keys.map((x, i) => ({ hidden: false, index: i, key: x, name: x }))
+ dimensions: keys.map((x, i) => ({
+ hidden: false,
+ index: i,
+ key: x,
+ name: x,
+ type: DeveloperEngine_1.default.inferDimensionType(firstObject === null || firstObject === void 0 ? void 0 : firstObject[x])
+ }))
  };
  }
  const dimensions = [];
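
DeveloperEngine.inferDimensionType is only shown being called here; its implementation is not part of this diff. A plausible sketch of what sample-based inference could look like; the function body, rules, and ordering are assumptions:

// Hypothetical sketch of sample-based type inference; the real
// DeveloperEngine.inferDimensionType may use different rules entirely.
const inferDimensionType = (sample) => {
    if (sample === null || sample === undefined) return 'string'; // safe fallback
    if (typeof sample === 'number') return 'number';
    if (typeof sample === 'boolean') return 'boolean';
    if (typeof sample === 'string') {
        // Check numeric strings before dates: Date.parse accepts bare numbers.
        if (sample.trim() !== '' && !isNaN(Number(sample))) return 'number';
        if (!isNaN(Date.parse(sample))) return 'datetime';
    }
    return 'string';
};
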
@@ -75,34 +94,61 @@ class DatasetManagerClass {
  const columnKey = (_c = pColumn.aliasInProducer) !== null && _c !== void 0 ? _c : pColumn.nameInProducer;
  const csvColumnIndex = keys.findIndex(x => x === columnKey);
  (0, Affirm_1.default)(csvColumnIndex > -1, `The column "${pColumn.nameInProducer}" (with key "${columnKey}") of producer "${producer.name}" doesn't exist in the underlying dataset.`);
- dimensions.push({ index: csvColumnIndex, key: columnKey, name: pColumn.nameInProducer, hidden: null });
+ dimensions.push({
+ index: csvColumnIndex,
+ key: columnKey,
+ name: pColumn.nameInProducer,
+ hidden: null,
+ type: (_e = (_d = pColumn.dimension) === null || _d === void 0 ? void 0 : _d.type) !== null && _e !== void 0 ? _e : 'string'
+ });
  }
- const delimiterChar = (_d = file.delimiter) !== null && _d !== void 0 ? _d : ',';
+ const delimiterChar = (_f = file.delimiter) !== null && _f !== void 0 ? _f : ',';
  return { dimensions, delimiter: delimiterChar };
  }
  case 'TXT': {
  if (!file.hasHeaderRow) {
  // If the file is a TXT and there isn't a header row, then I add a fake one that maps directly to the producer
- const delimiterChar = (_e = file.delimiter) !== null && _e !== void 0 ? _e : ',';
+ const delimiterChar = (_g = file.delimiter) !== null && _g !== void 0 ? _g : ',';
  const source = Environment_1.default.getSource(producer.source);
  const columns = FileCompiler_1.default.compileProducer(producer, source);
  if (discover) {
  // Since I don't have a header, and I'm discovering, I just create placeholder dimensions based on the same number of columns as the txt
  return {
  delimiter: delimiterChar,
- dimensions: firstLine.split(delimiterChar).map((x, i) => ({ hidden: false, index: i, key: `Col ${i + 1}`, name: `Col ${i + 1}` }))
+ dimensions: firstLine.split(delimiterChar).map((x, i) => ({
+ hidden: false,
+ index: i,
+ key: `Col ${i + 1}`,
+ name: `Col ${i + 1}`,
+ type: 'string'
+ }))
  };
  }
  return {
- dimensions: columns.map((x, i) => { var _a; return ({ key: (_a = x.aliasInProducer) !== null && _a !== void 0 ? _a : x.nameInProducer, name: x.nameInProducer, index: i, hidden: null }); }),
+ dimensions: columns.map((x, i) => {
+ var _a, _b, _c;
+ return ({
+ key: (_a = x.aliasInProducer) !== null && _a !== void 0 ? _a : x.nameInProducer,
+ name: x.nameInProducer,
+ index: i,
+ hidden: null,
+ type: (_c = (_b = x.dimension) === null || _b === void 0 ? void 0 : _b.type) !== null && _c !== void 0 ? _c : 'string'
+ });
+ }),
  delimiter: delimiterChar
  };
  }
  else {
- const delimiterChar = (_f = producer.settings.delimiter) !== null && _f !== void 0 ? _f : ',';
+ const delimiterChar = (_h = producer.settings.delimiter) !== null && _h !== void 0 ? _h : ',';
  const rawDimensions = ParseManager_1.default._extractHeader(firstLine, delimiterChar, producer, discover);
  return {
- dimensions: rawDimensions.map(x => ({ key: x.name, name: x.saveAs, index: x.index, hidden: null })),
+ dimensions: rawDimensions.map(x => ({
+ key: x.name,
+ name: x.saveAs,
+ index: x.index,
+ hidden: null,
+ type: x.type
+ })),
  delimiter: delimiterChar
  };
  }
@@ -4,6 +4,7 @@ var __importDefault = (this && this.__importDefault) || function (mod) {
  };
  Object.defineProperty(exports, "__esModule", { value: true });
  const Algo_1 = __importDefault(require("../../core/Algo"));
+ const TypeCaster_1 = __importDefault(require("../transform/TypeCaster"));
  class DatasetRecord {
  constructor(row, dimensions, delimiter) {
  this.parse = (row, delimiter, dimensions) => {
@@ -11,7 +12,7 @@ class DatasetRecord {
  const parts = row.split(delimiter);
  for (let i = 0; i < dimensions.length; i++) {
  const dim = dimensions[i];
- this._value[dim.name] = parts[i];
+ this._value[dim.name] = TypeCaster_1.default.cast(parts[i], dim.type);
  }
  }
  };
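
TypeCaster.cast itself is not shown in this diff; conceptually it turns a raw string cell into the dimension's declared type. A rough sketch under assumed semantics (the real cast also accepts an optional format argument, per the TransformationEngine call further below):

// Rough sketch of what such a cast helper could look like; the package's
// actual TypeCaster.cast and its supported type names may differ.
const cast = (raw, type) => {
    if (raw === null || raw === undefined) return raw;
    switch (type) {
        case 'number': return Number(raw);          // NaN signals a failed cast
        case 'boolean': return raw === 'true' || raw === '1';
        case 'datetime': return new Date(raw);      // invalid dates have NaN getTime()
        default: return raw;                        // 'string' and unknown types pass through
    }
};
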
@@ -35,7 +36,7 @@ class DatasetRecord {
  this.parse(row, delimiter, this._dimensions);
  };
  this.wholeUpdateDimension = (update) => {
- var _a;
+ var _a, _b, _c, _d, _e;
  if (update.toDelete) {
  // To remove
  delete this._value[update.currentDimension.name];
@@ -46,7 +47,13 @@ class DatasetRecord {
  }
  else if (!update.currentDimension) {
  // To create (at the right position)
- const newDimension = { index: update.newPosition, key: update.newName, name: update.newName, hidden: update.newHidden };
+ const newDimension = {
+ index: update.newPosition,
+ key: update.newName,
+ name: update.newName,
+ hidden: update.newHidden,
+ type: (_b = (_a = update.currentDimension) === null || _a === void 0 ? void 0 : _a.type) !== null && _b !== void 0 ? _b : 'string'
+ };
  this._value[newDimension.name] = null;
  this._dimensions = [...this._dimensions, newDimension];
  }
@@ -56,7 +63,13 @@ class DatasetRecord {
  if (index < 0)
  index = this._dimensions.findIndex(x => x.key === update.currentDimension.key);
  const currentDim = this._dimensions[index];
- const updatedDim = { name: update.newName, key: (_a = currentDim.key) !== null && _a !== void 0 ? _a : update.newName, hidden: update.newHidden, index: update.newPosition };
+ const updatedDim = {
+ name: update.newName,
+ key: (_c = currentDim.key) !== null && _c !== void 0 ? _c : update.newName,
+ hidden: update.newHidden,
+ index: update.newPosition,
+ type: (_e = (_d = update.currentDimension) === null || _d === void 0 ? void 0 : _d.type) !== null && _e !== void 0 ? _e : 'string'
+ };
  this._value[updatedDim.name] = this._value[currentDim.name];
  if (updatedDim.name !== currentDim.name)
  delete this._value[currentDim.name];
@@ -25,10 +25,17 @@ class ParallelDatasetClass {
  * I need the init to be called after all the setup has been completed because I need the .env to be loaded
  */
  if (!this._filterPool || !this._projectionPool || !this._transformPool) {
+ const options = {
+ workerThreadOpts: {
+ resourceLimits: {
+ maxOldGenerationSizeMb: Constants_1.default.defaults.MIN_RUNTIME_HEAP_MB
+ }
+ }
+ };
  const workerPath = this._getWorkerPath();
- this._filterPool = workerpool_1.default.pool(path_1.default.join(workerPath, 'FilterWorker.js'));
- this._projectionPool = workerpool_1.default.pool(path_1.default.join(workerPath, 'ProjectionWorker.js'));
- this._transformPool = workerpool_1.default.pool(path_1.default.join(workerPath, 'TransformWorker.js'));
+ this._filterPool = workerpool_1.default.pool(path_1.default.join(workerPath, 'FilterWorker.js'), options);
+ this._projectionPool = workerpool_1.default.pool(path_1.default.join(workerPath, 'ProjectionWorker.js'), options);
+ this._transformPool = workerpool_1.default.pool(path_1.default.join(workerPath, 'TransformWorker.js'), options);
  }
  };
  this._getWorkerPath = () => {
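
workerpool forwards workerThreadOpts to Node's worker_threads Worker constructor, so resourceLimits.maxOldGenerationSizeMb caps each worker's V8 old-generation heap. A standalone sketch; the pool size and 512 MB figure are illustrative, while the package reads its floor from Constants.defaults.MIN_RUNTIME_HEAP_MB:

// Illustrative only: cap each worker's heap so one runaway task cannot
// exhaust host memory; a worker exceeding the limit is terminated by Node.
const workerpool = require('workerpool');
const path = require('path');

const pool = workerpool.pool(path.join(__dirname, 'FilterWorker.js'), {
    maxWorkers: 4,                      // illustrative pool size
    workerThreadOpts: {
        resourceLimits: {
            maxOldGenerationSizeMb: 512 // illustrative heap ceiling, in MB
        }
    }
});
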
@@ -74,6 +81,7 @@ class ParallelDatasetClass {
  const workerData = {
  datasetDimensions: dataset.getDimensions(),
  datasetFile: dataset.getFile(),
+ executionId: dataset.getExecutionId(),
  datasetName: dataset.name,
  datasetDelimiter: dataset.getDelimiter(),
  fromLine: fromLine,
@@ -91,7 +99,7 @@ class ParallelDatasetClass {
  yield DatasetManager_1.default.mergeWorkersPaths(results.map(x => x.datasetPath), dataset);
  dataset
  .setDelimiter(results[0].datasetDelimiter)
- .setDimensinons(results[0].datasetDimensions);
+ .setDimensions(results[0].datasetDimensions);
  dataset._finishOperation('filter-parallel');
  return dataset;
  });
@@ -111,6 +119,7 @@ class ParallelDatasetClass {
  const workerData = {
  datasetDimensions: dataset.getDimensions(),
  datasetFile: dataset.getFile(),
+ executionId: dataset.getExecutionId(),
  datasetName: dataset.name,
  datasetDelimiter: dataset.getDelimiter(),
  fromLine: fromLine,
@@ -126,7 +135,7 @@ class ParallelDatasetClass {
  yield DatasetManager_1.default.mergeWorkersPaths(results.map(x => x.datasetPath), dataset);
  dataset
  .setDelimiter(results[0].datasetDelimiter)
- .setDimensinons(results[0].datasetDimensions);
+ .setDimensions(results[0].datasetDimensions);
  dataset._finishOperation('projection-parallel');
  return dataset;
  });
@@ -146,6 +155,7 @@ class ParallelDatasetClass {
  const workerData = {
  datasetDimensions: dataset.getDimensions(),
  datasetFile: dataset.getFile(),
+ executionId: dataset.getExecutionId(),
  datasetName: dataset.name,
  datasetDelimiter: dataset.getDelimiter(),
  fromLine: fromLine,
@@ -161,7 +171,7 @@ class ParallelDatasetClass {
  yield DatasetManager_1.default.mergeWorkersPaths(results.map(x => x.datasetPath), dataset);
  dataset
  .setDelimiter(results[0].datasetDelimiter)
- .setDimensinons(results[0].datasetDimensions);
+ .setDimensions(results[0].datasetDimensions);
  dataset._finishOperation('transform-parallel');
  return dataset;
  });
@@ -28,7 +28,7 @@ const Algo_1 = __importDefault(require("../../core/Algo"));
  const Logger_1 = __importDefault(require("../../helper/Logger"));
  const ParallelDataset_1 = __importDefault(require("../dataset/ParallelDataset"));
  class ExecutionEnvironment {
- constructor(consumer) {
+ constructor(consumer, executionId) {
  this.run = (options) => __awaiter(this, void 0, void 0, function* () {
  var _a, _b, _c, _d;
  (0, Affirm_1.default)(this._consumer, 'Invalid consumer');
@@ -64,7 +64,7 @@ class ExecutionEnvironment {
  (0, Affirm_1.default)(planStep.producer, `Invalid producer in execute-SQL step`);
  const driver = yield DriverFactory_1.default.instantiateSource(planStep.source);
  const queryData = (yield driver.query(this._envData.finalSQL)).rows;
- let dataset = DatasetManager_1.default.create(planStep.producer);
+ let dataset = DatasetManager_1.default.create(planStep.producer, this._executionId);
  dataset = yield dataset.loadFromMemory(queryData, planStep.producer);
  this._storeIntermidiate(planStep, dataset);
  break;
@@ -74,7 +74,7 @@ class ExecutionEnvironment {
  const { producer } = planStep;
  const source = Environment_1.default.getSource(producer.source);
  (0, Affirm_1.default)(source, `Source "${producer.source}" of producer "${producer.name}" not found.`);
- let dataset = DatasetManager_1.default.create(producer);
+ let dataset = DatasetManager_1.default.create(producer, this._executionId);
  dataset = yield dataset.load(source);
  this._storeIntermidiate(planStep, dataset);
  break;
@@ -102,7 +102,7 @@ class ExecutionEnvironment {
  case 'export-file': {
  (0, Affirm_1.default)(planStep.output, `Invalid output in export-file step`);
  (0, Affirm_1.default)(this._resultingDataset, 'Invalid resulting dataset in export-file step');
- const res = yield FileExporter_1.default.export(this._consumer, planStep.output, this._resultingDataset);
+ const res = yield FileExporter_1.default.export(this._consumer, planStep.output, this._resultingDataset, this._executionId);
  result.fileUri = res;
  break;
  }
@@ -156,6 +156,13 @@ class ExecutionEnvironment {
  if (ds)
  Logger_1.default.log(`Failed execution of consumer at step ${currentStep.type}:\n\tSize: ${ds.getCount()}\n\tCycles: ${ds.getCycles()}\n\tOperations: ${Logger_1.default.formatList(ds.getOperations())}`);
  Logger_1.default.log(`\tFailed step: ${currentStep.type}->\n\t${error}`);
+ // IMPORTANT: clean up all the datasets so no data is left around and to avoid memory leaks
+ const datasets = [
+ ...this._producedData.map(x => x.dataset),
+ this._resultingDataset
+ ].filter(Algo_1.default.hasVal);
+ const promises = datasets.map(x => x.destroy());
+ yield Promise.all(promises);
  throw error;
  }
  Logger_1.default.log(`Completed execution of consumer:\n\tSize: ${result._stats.size}\n\tCycles: ${result._stats.cycles}\n\tTime: ${result._stats.elapsedMS}\n\tOperations: ${Logger_1.default.formatList(result._stats.operations)}`);
@@ -184,6 +191,8 @@ class ExecutionEnvironment {
  this._envData = { consumerSQL: null, executionRequestSQL: null, finalSQL: null };
  this._producedData = [];
  this._resultingDataset = null;
+ // A short unique id to isolate temp dataset files & output names
+ this._executionId = executionId;
  }
  }
  exports.default = ExecutionEnvironment;
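
The diff never shows where executionId is minted, only that it flows from the ExecutionEnvironment constructor into dataset creation and file export. A hypothetical caller; the import path, generator, and id format are all assumptions:

// Hypothetical wiring; the real entry point and id format may differ.
const crypto = require('crypto');
// const ExecutionEnvironment = require('./engines/execution/ExecutionEnvironment').default; // path is an assumption

const executionId = crypto.randomBytes(4).toString('hex'); // e.g. 'a1b2c3d4'
// const env = new ExecutionEnvironment(consumer, executionId);
// Every dataset this run creates, and the exported file name, now carries the id,
// so two concurrent runs of the same consumer cannot clobber each other's files.
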
@@ -108,7 +108,8 @@ class ExecutionPlannerClas {
  break;
  }
  case 'local':
- case 'aws-s3': {
+ case 'aws-s3':
+ case 'delta-share': {
  plan.push({ type: 'load-dataset', producer });
  plan.push({ type: 'prepare-dataset', producer });
  if (producer.dimensions.some(x => { var _a, _b; return ((_a = x.alias) === null || _a === void 0 ? void 0 : _a.includes('{')) || ((_b = x.alias) === null || _b === void 0 ? void 0 : _b.includes('[')); }))
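
A Delta Sharing source is now planned exactly like a local or S3 file. Sketch of the plan prefix this branch emits for such a producer, simplified from the pushes shown above:

// Illustrative shape of the plan prefix for local, aws-s3 and
// delta-share producers alike (simplified):
const planPrefixFor = (producer) => ([
    { type: 'load-dataset', producer },
    { type: 'prepare-dataset', producer }
    // plus an alias-expansion step when an alias contains '{' or '['
]);
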
@@ -7,9 +7,10 @@ const Affirm_1 = __importDefault(require("../../core/Affirm"));
  class FileCompilerClass {
  constructor() {
  this.compileProducer = (producer, source) => {
+ var _a;
  (0, Affirm_1.default)(producer, `Invalid producer`);
  (0, Affirm_1.default)(source, `Invalid source`);
- (0, Affirm_1.default)(producer.settings.fileKey, `Missing required file key in producer settings`);
+ (0, Affirm_1.default)((_a = producer.settings.fileKey) !== null && _a !== void 0 ? _a : producer.settings.sqlTable, `Missing required file key in producer settings`);
  (0, Affirm_1.default)(producer.settings.fileType, `Missing required file type in producer settings`);
  (0, Affirm_1.default)(!producer.measures || producer.measures.length === 0, `Cannot use "measure" with a producer linked to a file (only dimensions are allowed).`);
  const columns = producer.dimensions.map(x => ({
@@ -20,7 +20,7 @@ const DriverFactory_1 = __importDefault(require("../../drivers/DriverFactory"));
  const Environment_1 = __importDefault(require("../Environment"));
  class FileExporterClass {
  constructor() {
- this.export = (consumer, output, dataset) => __awaiter(this, void 0, void 0, function* () {
+ this.export = (consumer, output, dataset, executionId) => __awaiter(this, void 0, void 0, function* () {
  (0, Affirm_1.default)(consumer, `Invalid consumer`);
  (0, Affirm_1.default)(output, `Invalid output`);
  (0, Affirm_1.default)(dataset, `Invalid export dataset`);
@@ -32,7 +32,7 @@ class FileExporterClass {
  : output.format === 'JSON'
  ? 'jsonl'
  : 'txt';
- const name = this._composeFileName(consumer, extension);
+ const name = this._composeFileName(consumer, output, extension, executionId);
  const uploadRes = yield driver.uploadStream({
  dataset,
  name,
@@ -67,7 +67,16 @@ class FileExporterClass {
  }
  return chunks;
  };
- this._composeFileName = (consumer, extension) => `${consumer.name}_${Algo_1.default.replaceAll(DSTE_1.default.now().toISOString().split('.')[0], ':', '-')}.${extension}`;
+ this._composeFileName = (consumer, output, extension, executionId) => {
+ if (output.exportName && output.exportName.trim().length > 0) {
+ // Ensure no extension duplication
+ const sanitized = output.exportName.replace(/\.[^.]+$/, '');
+ return `${sanitized}.${extension}`;
+ }
+ const baseTs = Algo_1.default.replaceAll(DSTE_1.default.now().toISOString().split('.')[0], ':', '-');
+ const suffix = executionId ? `_${executionId}` : '';
+ return `${consumer.name}_${baseTs}${suffix}.${extension}`;
+ };
  }
  }
  const FileExporter = new FileExporterClass();
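
Worked examples of the new naming logic, derived from the branches above; the consumer name, timestamp, and id are illustrative:

// _composeFileName behavior, with illustrative inputs:
// output.exportName = 'monthly_report.csv', extension = 'csv'
//   -> 'monthly_report.csv'   (trailing '.csv' stripped by the regex, then re-added once)
// no exportName, consumer 'orders', executionId 'a1b2c3d4'
//   -> 'orders_2025-01-15T10-30-00_a1b2c3d4.csv'
// no exportName, no executionId
//   -> 'orders_2025-01-15T10-30-00.csv'   (previous behavior)
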
@@ -9,7 +9,7 @@ const FileCompiler_1 = __importDefault(require("../file/FileCompiler"));
  class ParseManagerClass {
  constructor() {
  this._extractHeader = (headerLine, delimiter, producer, discover) => {
- var _a;
+ var _a, _b, _c;
  (0, Affirm_1.default)(headerLine, `Invalid CSV header line for producer "${producer.name}"`);
  (0, Affirm_1.default)(delimiter, 'Invalid CSV delimiter');
  (0, Affirm_1.default)(producer, 'Invalid producer');
@@ -24,7 +24,12 @@ class ParseManagerClass {
  const columnKey = (_a = pColumn.aliasInProducer) !== null && _a !== void 0 ? _a : pColumn.nameInProducer;
  const csvColumnIndex = headerColumns.findIndex(x => x === columnKey);
  (0, Affirm_1.default)(csvColumnIndex > -1, `The column "${pColumn.nameInProducer}" (with key "${columnKey}") of producer "${producer.name}" doesn't exist in the underlying dataset.`);
- csvColumns.push({ index: csvColumnIndex, name: columnKey, saveAs: pColumn.nameInProducer });
+ csvColumns.push({
+ index: csvColumnIndex,
+ name: columnKey,
+ saveAs: pColumn.nameInProducer,
+ type: (_c = (_b = pColumn.dimension) === null || _b === void 0 ? void 0 : _b.type) !== null && _c !== void 0 ? _c : 'string'
+ });
  }
  return csvColumns;
  };
@@ -34,7 +34,8 @@ class ProducerEngineClass {
  (0, Affirm_1.default)(sql, `Invalid SQL from compilation for producer "${producer.name}"`);
  return sql;
  }
- case 'aws-s3': {
+ case 'aws-s3':
+ case 'delta-share': {
  const columns = FileCompiler_1.default.compileProducer(producer, source);
  (0, Affirm_1.default)(columns, `Invalid columns from compilation for producer "${producer.name}"`);
  break;
@@ -141,7 +142,8 @@ class ProducerEngineClass {
  break;
  }
  case 'local':
- case 'aws-s3': {
+ case 'aws-s3':
+ case 'delta-share': {
  const fileData = yield this.readFile(producer, { readmode: 'lines', lines: { from: 0, to: sampleSize } });
  dataset = yield dataset.loadFromMemory(fileData.data, producer, discover);
  break;
@@ -102,12 +102,16 @@ class JoinEngineClass {
  fileType: 'CSV'
  });
  // Get dimensions for the result dataset based on consumer columns
- const resultDimensions = consumerColumns.map((col, index) => ({
- name: col.consumerAlias || col.consumerKey,
- key: col.consumerAlias || col.consumerKey,
- index,
- hidden: null
- }));
+ const resultDimensions = consumerColumns.map((col, index) => {
+ var _a, _b;
+ return ({
+ name: col.consumerAlias || col.consumerKey,
+ key: col.consumerAlias || col.consumerKey,
+ index,
+ type: (_b = (_a = col.dimension) === null || _a === void 0 ? void 0 : _a.type) !== null && _b !== void 0 ? _b : 'string',
+ hidden: null
+ });
+ });
  // Initialize the result dataset with proper dimensions
  resultDataset.getDimensions().length = 0;
  resultDataset.getDimensions().push(...resultDimensions);
@@ -22,7 +22,7 @@ class TransformationEngineClass {
  (0, Affirm_1.default)(dataset, 'Invalid data');
  const fieldsToTransform = consumer.fields.filter(field => Algo_1.default.hasVal(field.transform));
  Affirm_1.default.hasItems(fieldsToTransform, 'No fields with transformations');
- return yield dataset.map(record => {
+ yield dataset.map(record => {
  var _a;
  for (const field of fieldsToTransform) {
  if (!field.transform)
@@ -54,6 +54,17 @@ class TransformationEngineClass {
  }
  return record;
  }, options);
+ /**
+ * Some transformations (for now only "cast") change the underlying type of the dataset dimension.
+ * Here I update the dimension type of the dataset.
+ * TODO: I think we may have a bug if you cast AND then do an operation on the number, since it reverts back to being a string within the same transformation chain, because the dimension type update is applied only at the end of all the transformations.
+ */
+ for (const field of fieldsToTransform) {
+ if (!field.transform)
+ continue;
+ this.applyDimensionsChanges(field.transform, field, dataset);
+ }
+ return dataset;
  });
  this.isFieldCombinationTransformation = (transformation) => {
  if (Array.isArray(transformation)) {
@@ -77,7 +88,7 @@ class TransformationEngineClass {
  const casted = TypeCaster_1.default.cast(value, cast, format);
  if (cast === 'number' && isNaN(casted))
  throw new Error(`Cannot cast non-numeric value in field '${field.key}'`);
- if (cast === 'date' && casted instanceof Date && isNaN(casted.getTime()))
+ if (cast === 'datetime' && casted instanceof Date && isNaN(casted.getTime()))
  throw new Error(`Cannot cast value to date in field '${field.key}'`);
  return casted;
  }
@@ -263,6 +274,24 @@ class TransformationEngineClass {
  }
  return false;
  };
+ this.applyDimensionsChanges = (transformations, field, dataset) => {
+ if (Array.isArray(transformations)) {
+ for (const transform of transformations) {
+ this.applyDimensionsChanges(transform, field, dataset);
+ }
+ return dataset;
+ }
+ // Single transformation
+ if ('cast' in transformations) {
+ const { cast } = transformations;
+ let oldDimension = dataset.getDimensions().find(x => x.name === field.key);
+ if (!oldDimension)
+ oldDimension = dataset.getDimensions().find(x => x.key === field.key);
+ const newDimension = Object.assign(Object.assign({}, structuredClone(oldDimension)), { type: cast });
+ dataset.setSingleDimension(newDimension, oldDimension);
+ }
+ return dataset;
+ };
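
The TODO above is easiest to see with a concrete chain. A sketch of the suspected ordering issue; the field and transform shapes are assumptions based on the code above:

// Suspected ordering issue, sketched with an assumed transform chain:
// field.transform = [{ cast: 'number' }, { add: 10 }]   // shapes are illustrative
//
// 1. dataset.map(...) runs BOTH steps per record: cast '5' -> 5, then 5 + 10 -> 15.
// 2. Only afterwards does applyDimensionsChanges flip the dimension type to 'number'.
// Any step between 1 and 2 that re-serializes records using the still-'string'
// dimension type treats the already-casted value as a string again.
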
@@ -0,0 +1,110 @@
+ "use strict";
+ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
+ function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
+ return new (P || (P = Promise))(function (resolve, reject) {
+ function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
+ function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
+ function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
+ step((generator = generator.apply(thisArg, _arguments || [])).next());
+ });
+ };
+ var __importDefault = (this && this.__importDefault) || function (mod) {
+ return (mod && mod.__esModule) ? mod : { "default": mod };
+ };
+ Object.defineProperty(exports, "__esModule", { value: true });
+ const DSTE_1 = __importDefault(require("../../core/dste/DSTE"));
+ const DatabaseEngine_1 = __importDefault(require("../../database/DatabaseEngine"));
+ const DataframeManager_1 = __importDefault(require("./DataframeManager"));
+ class UsageDataManager {
+ getUsageDetails() {
+ return __awaiter(this, void 0, void 0, function* () {
+ const now = DSTE_1.default.now();
+ const from = new Date(now.getTime() - 30 * 24 * 60 * 60 * 1000);
+ const prevMonthFrom = new Date(now.getTime() - 60 * 24 * 60 * 60 * 1000);
+ const yearAgo = new Date(now.getFullYear(), now.getMonth() - 11, 1);
+ const collection = 'usage';
+ // Aggregate status counts for current and previous month
+ const getStatusCounts = (start, end) => __awaiter(this, void 0, void 0, function* () {
+ const results = yield DatabaseEngine_1.default.aggregate(collection, [
+ { $match: { startedAt: { $gte: start, $lte: end } } },
+ { $group: { _id: '$status', count: { $sum: 1 } } }
+ ]);
+ let success = 0, failed = 0, total = 0;
+ results.forEach(r => {
+ total += r.count;
+ if (r._id === 'success')
+ success = r.count;
+ if (r._id === 'failed')
+ failed = r.count;
+ });
+ return { total, success, failed };
+ });
+ const statusesRequests = yield getStatusCounts(from, now);
+ const prevStatusesRequests = yield getStatusCounts(prevMonthFrom, from);
+ // Monthly success and fails for last 12 months
+ const monthlySuccessPipeline = [
+ { $match: { status: 'success', startedAt: { $gte: yearAgo, $lte: now } } },
+ { $addFields: { year: { $year: '$startedAt' }, month: { $month: '$startedAt' } } },
+ { $group: { _id: { year: '$year', month: '$month' }, count: { $sum: 1 } } },
+ { $project: { _id: 0, x: { $concat: [{ $toString: '$_id.year' }, '-', { $toString: '$_id.month' }] }, y: '$count' } },
+ { $sort: { x: 1 } }
+ ];
+ const monthlyFailsPipeline = [
+ { $match: { status: 'failed', startedAt: { $gte: yearAgo, $lte: now } } },
+ { $addFields: { year: { $year: '$startedAt' }, month: { $month: '$startedAt' } } },
+ { $group: { _id: { year: '$year', month: '$month' }, count: { $sum: 1 } } },
+ { $project: { _id: 0, x: { $concat: [{ $toString: '$_id.year' }, '-', { $toString: '$_id.month' }] }, y: '$count' } },
+ { $sort: { x: 1 } }
+ ];
+ const rawMonthlySuccess = yield DatabaseEngine_1.default.aggregate(collection, monthlySuccessPipeline);
+ const rawMonthlyFails = yield DatabaseEngine_1.default.aggregate(collection, monthlyFailsPipeline);
+ // Top lines per month for last 12 months
+ const topLinesPipeline = [
+ { $match: { startedAt: { $gte: yearAgo, $lte: now } } },
+ { $addFields: { year: { $year: '$startedAt' }, month: { $month: '$startedAt' } } },
+ { $group: { _id: { year: '$year', month: '$month' }, itemsCount: { $max: '$itemsCount' } } },
+ { $project: { _id: 0, x: { $concat: [{ $toString: '$_id.year' }, '-', { $toString: '$_id.month' }] }, y: '$itemsCount' } },
+ { $sort: { x: 1 } }
+ ];
+ const topLines = yield DatabaseEngine_1.default.aggregate(collection, topLinesPipeline);
+ // Top times per month for last 12 months
+ const topTimePipeline = [
+ { $match: { startedAt: { $gte: yearAgo, $lte: now } } },
+ { $addFields: { durationMs: { $subtract: ['$finishedAt', '$startedAt'] }, year: { $year: '$startedAt' }, month: { $month: '$startedAt' } } },
+ { $group: { _id: { year: '$year', month: '$month' }, maxDuration: { $max: '$durationMs' } } },
+ { $project: { _id: 0, x: { $concat: [{ $toString: '$_id.year' }, '-', { $toString: '$_id.month' }] }, y: '$maxDuration' } },
+ { $sort: { x: 1 } }
+ ];
+ const topTime = yield DatabaseEngine_1.default.aggregate(collection, topTimePipeline);
+ // Monthly consumers: for each consumer, per month count
+ const consumerPipeline = [
+ { $match: { startedAt: { $gte: yearAgo, $lte: now } } },
+ { $addFields: { year: { $year: '$startedAt' }, month: { $month: '$startedAt' } } },
+ { $group: { _id: { consumer: '$consumer', year: '$year', month: '$month' }, count: { $sum: 1 } } },
+ { $project: { _id: 0, consumer: '$_id.consumer', x: { $concat: [{ $toString: '$_id.year' }, '-', { $toString: '$_id.month' }] }, y: '$count' } },
+ { $sort: { consumer: 1, x: 1 } }
+ ];
+ const consumersData = yield DatabaseEngine_1.default.aggregate(collection, consumerPipeline);
+ // transform to consumer array
+ const consumerMap = {};
+ consumersData.forEach(r => {
+ consumerMap[r.consumer] = consumerMap[r.consumer] || [];
+ consumerMap[r.consumer].push({ x: r.x, y: r.y });
+ });
+ const consumers = Object.entries(consumerMap).map(([name, data]) => ({ name, data: DataframeManager_1.default.fill(data !== null && data !== void 0 ? data : [], yearAgo, now) }));
+ // Recent executions
+ const recentExecution = yield DatabaseEngine_1.default.query(collection, { startedAt: { $gte: from, $lte: now } }, { sort: { startedAt: -1 }, limit: 10 });
+ return {
+ statusesRequests,
+ prevStatusesRequests,
+ monthlySuccess: DataframeManager_1.default.fill(rawMonthlySuccess !== null && rawMonthlySuccess !== void 0 ? rawMonthlySuccess : [], yearAgo, now),
+ monthlyFails: DataframeManager_1.default.fill(rawMonthlyFails !== null && rawMonthlyFails !== void 0 ? rawMonthlyFails : [], yearAgo, now),
+ consumers: consumers,
+ topLine: DataframeManager_1.default.fill(topLines !== null && topLines !== void 0 ? topLines : [], yearAgo, now),
+ topTime: DataframeManager_1.default.fill(topTime !== null && topTime !== void 0 ? topTime : [], yearAgo, now),
+ recentExecution
+ };
+ });
+ }
+ }
+ exports.default = new UsageDataManager();
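
DataframeManager.fill is not included in this diff; judging from its call sites it presumably pads the sparse monthly series with zero-valued points between yearAgo and now. An illustration under that assumption, using the month-key format produced by the $concat stages above (note the unpadded month number):

// Assuming fill pads missing months with y = 0 (behavior not shown in this diff):
// input  (from the $project stage):  [{ x: '2025-1', y: 4 }, { x: '2025-3', y: 1 }]
// output (filled 2024-4 .. 2025-3):  [{ x: '2024-4', y: 0 }, ..., { x: '2025-1', y: 4 },
//                                     { x: '2025-2', y: 0 }, { x: '2025-3', y: 1 }]
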
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
  "name": "@forzalabs/remora",
- "version": "0.1.4-nasco.3",
+ "version": "0.1.5-nasco.3",
  "description": "A powerful CLI tool for seamless data translation.",
  "main": "index.js",
  "private": false,
@@ -47,6 +47,7 @@
  "dotenv": "^16.0.3",
  "fast-xml-parser": "^5.2.3",
  "fs-extra": "^11.1.0",
+ "hyparquet": "^1.17.4",
  "inquirer": "^8.2.5",
  "json-schema": "^0.4.0",
  "jsonwebtoken": "^9.0.2",