@forzalabs/remora 0.0.49-nasco.3 → 0.0.51-nasco.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/Constants.js CHANGED
@@ -1,7 +1,7 @@
  "use strict";
  Object.defineProperty(exports, "__esModule", { value: true });
  const CONSTANTS = {
- cliVersion: '0.0.49-nasco',
+ cliVersion: '0.0.51-nasco',
  lambdaVersion: 1,
  port: 5069,
  defaults: {
package/core/Algo.js CHANGED
@@ -150,6 +150,7 @@ const algo = {
  (0, Affirm_1.default)(arr.length > 0, 'Array must be non-empty');
  return Math.max(...arr);
  },
- replaceAll: (text, search, replace) => text.replace(new RegExp(search, 'g'), replace)
+ replaceAll: (text, search, replace) => text.replace(new RegExp(search, 'g'), replace),
+ deepClone: (data) => JSON.parse(JSON.stringify(data))
  };
  exports.default = algo;
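The new `deepClone` helper relies on a JSON round-trip. That is sufficient for the plain dimension objects it is applied to (see the `Dataset` hunks below), but it silently drops values JSON cannot represent. A minimal standalone sketch of the trade-off, not part of the package:

```js
const deepClone = (data) => JSON.parse(JSON.stringify(data));

// Plain data comes back as an independent copy:
const dims = [{ name: 'id', index: 0, hidden: false }];
const copy = deepClone(dims);
copy[0].name = 'renamed';
console.log(dims[0].name); // 'id' — the original is untouched

// Values without a JSON representation are lost or coerced:
deepClone({ when: new Date(), fn: () => 1, miss: undefined });
// { when: '2024-…' } — the Date becomes an ISO string; fn and miss disappear
```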
@@ -79,6 +79,10 @@
  "code"
  ],
  "additionalProperties": false
+ },
+ "union": {
+ "type": "boolean",
+ "description": "Merges the data from the various producers in a single dataset. They must have the same output dimensions. If true, then you can't set any joins on any producer, since all producers are merged in a single dataset."
  }
  },
  "required": [
@@ -33,9 +33,7 @@ class PostProcessorClass {
  (0, Affirm_1.default)(consumer, 'Invalid consumer');
  (0, Affirm_1.default)(dataset, 'Invalid dataset');
  const fields = ConsumerManager_1.default.getExpandedFields(consumer);
- let newDataset = yield this.dropDimensions(dataset, consumer);
- newDataset = this.updateDimensions(newDataset, consumer);
- newDataset = yield this.reorderDimensions(newDataset, consumer);
+ let newDataset = yield dataset.wholeUpdateDimensions(fields);
  newDataset = yield newDataset.map(record => {
  var _a, _b;
  for (const field of fields) {
@@ -53,38 +51,6 @@ class PostProcessorClass {
  });
  return newDataset;
  });
- this.updateDimensions = (dataset, consumer) => {
- const fields = ConsumerManager_1.default.getExpandedFields(consumer);
- dataset.updateDimensions(fields);
- return dataset;
- };
- this.dropDimensions = (dataset, consumer) => __awaiter(this, void 0, void 0, function* () {
- const initialDimensions = dataset.getDimensions();
- const fields = ConsumerManager_1.default.getExpandedFields(consumer);
- const dimensionsToKeep = new Set();
- // First, identify which dimensions the consumer actually wants
- for (const field of fields) {
- const { key } = field.cField;
- dimensionsToKeep.add(key);
- }
- // Create a copy to iterate over since we'll be modifying the original
- const dimensionsCopy = [...initialDimensions];
- const dimensionsToDrop = [];
- for (const dim of dimensionsCopy) {
- if (!dimensionsToKeep.has(dim.name)) {
- // This dimension is not wanted by the consumer, collect it for dropping
- dimensionsToDrop.push(dim.name);
- }
- }
- // Drop all unwanted dimensions in a single optimized operation
- if (dimensionsToDrop.length > 0)
- yield dataset.dropDimensions(dimensionsToDrop);
- return dataset;
- });
- this.reorderDimensions = (dataset, consumer) => __awaiter(this, void 0, void 0, function* () {
- const fields = ConsumerManager_1.default.getExpandedFields(consumer);
- return yield dataset.reorderDimensions(fields.map(x => { var _a; return ((_a = x.cField.alias) !== null && _a !== void 0 ? _a : x.cField.key); }));
- });
  /**
  * Gets an array of objects (with potentially nested fields) and unpacks them to an array of objects with no nested fields
  * If some nested keys are lists, then a logic similar to a SQL JOIN is used and rows are duplicated
@@ -34,6 +34,7 @@ const Helper_1 = __importDefault(require("../../helper/Helper"));
  const Algo_1 = __importDefault(require("../../core/Algo"));
  class Dataset {
  constructor(name, file, batchSize = Constants_1.default.defaults.MAX_ITEMS_IN_MEMORY) {
+ this._pipeline = [];
  this.getPath = () => this._path;
  this.setPath = (path) => {
  this._path = path;
@@ -122,13 +123,14 @@ class Dataset {
  const rl = (0, readline_1.createInterface)({ input: readStream, crlfDelay: Infinity });
  let batch = [];
  let lineCount = 0;
+ const dimensions = Algo_1.default.deepClone(this._dimensions);
  try {
  for (var _d = true, rl_1 = __asyncValues(rl), rl_1_1; rl_1_1 = yield rl_1.next(), _a = rl_1_1.done, !_a; _d = true) {
  _c = rl_1_1.value;
  _d = false;
  const line = _c;
  try {
- const record = new DatasetRecord_1.default(line, this._dimensions, this._delimiter);
+ const record = new DatasetRecord_1.default(line, dimensions, this._delimiter);
  batch.push(record);
  lineCount++;
  if (batch.length >= this._batchSize) {
@@ -520,6 +522,7 @@ class Dataset {
  if (batch.length > 0) {
  yield processor(batch, batchIndex);
  }
+ this._iterations++;
  this._finishOperation('stream-batches');
  });
  /**
@@ -679,68 +682,52 @@ class Dataset {
  return this;
  });
  this.getDimensions = () => this._dimensions;
- this.updateDimensions = (fields) => {
- this._startOperation('update-dimensions');
- for (const field of fields) {
- const { cField: { key, alias, hidden, fixed, default: defaultValue } } = field;
- const currentDim = this._dimensions.find(x => x.name === key);
- if (currentDim) {
- currentDim.name = alias !== null && alias !== void 0 ? alias : key;
- currentDim.hidden = hidden;
- }
- else if (fixed && Algo_1.default.hasVal(defaultValue)) {
- this._dimensions.push({
- hidden: hidden,
- index: this._dimensions.length,
- key: key,
- name: alias !== null && alias !== void 0 ? alias : key
- });
- }
- else {
- throw new Error(`Trying to update the dataset dimension "${(alias !== null && alias !== void 0 ? alias : key)}", but none was found (${this._dimensions.map(x => x.name).join(', ')})`);
- }
+ /**
+ * - remove dimension
+ * - rename a dimension
+ * - change hidden flag
+ * - move a dimension
+ */
+ this.wholeUpdateDimensions = (fields) => __awaiter(this, void 0, void 0, function* () {
+ var _a;
+ let updates = [];
+ // Add all the updates
+ for (let i = 0; i < fields.length; i++) {
+ const { cField } = fields[i];
+ const currentMatch = structuredClone(this._dimensions.find(x => x.name === cField.key));
+ updates.push({
+ currentDimension: currentMatch,
+ newName: (_a = cField.alias) !== null && _a !== void 0 ? _a : cField.key,
+ newHidden: cField.hidden,
+ newPosition: i,
+ toDelete: false
+ });
  }
- this._finishOperation('update-dimensions');
- return this;
- };
- this.dropDimensions = (dimensionNames) => __awaiter(this, void 0, void 0, function* () {
- if (dimensionNames.length === 0)
- return this;
- this._startOperation('drop-dimensions');
- const toRemove = this._dimensions
- .filter(x => dimensionNames.includes(x.name))
- .sort((a, b) => b.index - a.index);
- const toRemoveNames = toRemove.map(x => x.name);
- yield this.map(record => record.dropDimensions(toRemoveNames));
- this._dimensions = this._dimensions
- .filter(x => !dimensionNames.includes(x.name))
- .map((x, i) => (Object.assign(Object.assign({}, x), { index: i })));
- this._finishOperation('drop-dimensions');
- return this;
- });
- this.reorderDimensions = (dimensionNames) => __awaiter(this, void 0, void 0, function* () {
- if (dimensionNames.length === 0)
+ // Add all the updates to remove dimensions
+ for (const dim of this._dimensions) {
+ if (!updates.find(x => { var _a; return ((_a = x.currentDimension) === null || _a === void 0 ? void 0 : _a.name) === dim.name; }))
+ updates.push({ currentDimension: dim, toDelete: true });
+ }
+ // Now keep only the updates that actually change something
+ updates = updates.filter(x => x.toDelete
+ || !x.currentDimension
+ || (x.currentDimension && (x.currentDimension.name !== x.newName
+ || (Algo_1.default.hasVal(x.newHidden) && x.newHidden !== x.currentDimension.hidden)
+ || x.newPosition !== x.currentDimension.index)));
+ if (updates.length === 0)
  return this;
- this._startOperation('reorder-dimensions');
- // Validate that all provided dimension names exist
- const existingNames = this._dimensions.map(d => d.name);
- const missingDimensions = dimensionNames.filter(name => !existingNames.includes(name));
- (0, Affirm_1.default)(missingDimensions.length === 0, `Cannot reorder dimensions. Missing dimensions: ${missingDimensions.join(', ')}`);
- // Validate that all existing dimensions are included
- const extraDimensions = existingNames.filter(name => !dimensionNames.includes(name));
- (0, Affirm_1.default)(extraDimensions.length === 0, `Cannot reorder dimensions. All existing dimensions must be included. Missing: ${extraDimensions.join(', ')}`);
- const movements = dimensionNames.map((name, index) => {
- const dim = this._dimensions.find(x => x.name === name);
- const newDim = structuredClone(dim);
- newDim.index = index;
- return { newDimension: newDim, oldDimension: dim };
+ let updatedDimensions = null;
+ const newDataset = yield this.map(record => {
+ for (const update of updates) {
+ record.wholeUpdateDimension(update);
+ }
+ record._dimensions.sort((a, b) => a.index - b.index);
+ if (!updatedDimensions)
+ updatedDimensions = record._dimensions;
+ return record;
  });
- // Update metadata
- this._dimensions = movements.map(x => x.newDimension);
- // Reorder the data in the file using streaming approach
- yield this.map(record => record.reorderDimensions(movements));
- this._finishOperation('reorder-dimensions');
- return this;
+ this._dimensions = updatedDimensions;
+ return newDataset;
  });
  this.print = (...args_1) => __awaiter(this, [...args_1], void 0, function* (count = 3, full = false) {
  console.log(`DS ${this._name} (${this._size} | ${this._iterations})`);
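The consolidated `wholeUpdateDimensions` replaces the former `updateDimensions`/`dropDimensions`/`reorderDimensions` trio with a single streaming pass: it builds one update descriptor per consumer field plus one per dimension no field references, filters out descriptors that change nothing, and applies the survivors record by record in one `map`. Reading from the code above, each descriptor has roughly this shape (a reading aid with illustrative values, not part of the package):

```js
// Shape of one entry in `updates`, as constructed above
const update = {
    currentDimension: { name: 'customer_id', key: 'customer_id', index: 2, hidden: false },
    //                 ^ undefined when the consumer field has no matching dimension yet
    newName: 'customerId', // cField.alias ?? cField.key
    newHidden: false,      // cField.hidden
    newPosition: 0,        // the field's position in the consumer output
    toDelete: false        // true for dimensions no consumer field references
};
```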
@@ -862,6 +849,7 @@ class Dataset {
  this._size = 0;
  this._iterations = 0;
  this._operations = [];
+ this._pipeline = [];
  const datasetName = this._name
  .replace(/[^a-zA-Z0-9_-]/g, '_')
  .replace(/_{2,}/g, '_')
@@ -6,44 +6,45 @@ Object.defineProperty(exports, "__esModule", { value: true });
  const Algo_1 = __importDefault(require("../../core/Algo"));
  class DatasetRecord {
  constructor(row, dimensions, delimiter) {
+ this.parse = (row, delimiter, dimensions) => {
+ if (!this.isEmpty() && dimensions.length > 0) {
+ const parts = row.split(delimiter);
+ for (let i = 0; i < dimensions.length; i++) {
+ const dim = dimensions[i];
+ this._value[dim.name] = parts[i];
+ }
+ }
+ };
  this.stringify = () => this._dimensions.map(x => this._value[x.name]).join(this._delimiter);
  this.isEmpty = () => { var _a; return ((_a = this._row) === null || _a === void 0 ? void 0 : _a.trim().length) === 0; };
  this.getRaw = () => this._row;
  this.getValue = (dimension) => this._value[dimension];
  this.setValue = (dimension, value) => this._value[dimension] = value;
- this.dropDimensions = (dimensionNames) => {
- if (dimensionNames.length === 0)
- return this;
- // Remove the dimensions from the internal value storage
- for (const dimensionName of dimensionNames)
- delete this._value[dimensionName];
- // Filter out the dropped dimensions from the dimensions array
- this._dimensions = this._dimensions.filter(dim => !dimensionNames.includes(dim.name));
- // Update the indices of remaining dimensions
- this._dimensions.forEach((dim, index) => {
- dim.index = index;
- });
- return this;
- };
- this.reorderDimensions = (movements) => {
- if (movements.length === 0)
- return this;
- // Parse the raw row again with the new dimension order
- const parts = this._row.split(this._delimiter);
- const newValue = {};
- // Create the new dimensions array and remap values
- this._dimensions = [];
- for (const move of movements) {
- const { newDimension, oldDimension } = move;
- // Add the dimension with its new index
- this._dimensions.push(newDimension);
- // Remap the value from the old position to the new dimension name
- if (parts.length > oldDimension.index) {
- newValue[newDimension.name] = parts[oldDimension.index];
- }
+ this.wholeUpdateDimension = (update) => {
+ var _a;
+ if (update.toDelete) {
+ // To remove
+ delete this._value[update.currentDimension.name];
+ this._dimensions = this._dimensions.filter(x => x.key !== update.currentDimension.name);
+ }
+ else if (!update.currentDimension) {
+ // To create (at the right position)
+ const newDimension = { index: update.newPosition, key: update.newName, name: update.newName, hidden: update.newHidden };
+ this._value[newDimension.name] = null;
+ this._dimensions = [...this._dimensions, newDimension];
+ }
+ else {
+ // Change: name, hidden, position
+ const index = this._dimensions.findIndex(x => x.key === update.currentDimension.name);
+ const currentDim = this._dimensions[index];
+ const updatedDim = { name: update.newName, key: (_a = currentDim.key) !== null && _a !== void 0 ? _a : update.newName, hidden: update.newHidden, index: update.newPosition };
+ this._value[updatedDim.name] = this._value[currentDim.name];
+ if (updatedDim.name !== currentDim.name)
+ delete this._value[currentDim.name];
+ const newDimensions = [...this._dimensions];
+ newDimensions.splice(index, 1, updatedDim);
+ this._dimensions = newDimensions;
  }
- // Update the value mapping
- this._value = newValue;
  return this;
  };
  this.toJSON = () => {
@@ -70,16 +71,10 @@ class DatasetRecord {
  return line;
  };
  this._row = row;
- this._dimensions = dimensions.sort((a, b) => a.index - b.index);
+ this._dimensions = dimensions;
  this._delimiter = delimiter;
  this._value = {};
- if (!this.isEmpty() && this._dimensions.length > 0) {
- const parts = row.split(delimiter);
- for (let i = 0; i < dimensions.length; i++) {
- const dim = dimensions[i];
- this._value[dim.name] = parts[i];
- }
- }
+ this.parse(row, delimiter, this._dimensions);
  }
  }
  exports.default = DatasetRecord;
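On the record side, `wholeUpdateDimension` applies one descriptor at a time. A rename-and-move then behaves roughly as follows (hypothetical values; `Dataset.wholeUpdateDimensions` normally drives this and re-sorts dimensions by index in its `map` pass):

```js
// Record parsed from 'A;B' with dimensions id (index 0) and amount (index 1):
// record._value === { id: 'A', amount: 'B' }
record.wholeUpdateDimension({
    currentDimension: { name: 'amount', key: 'amount', index: 1, hidden: false },
    newName: 'total',
    newHidden: false,
    newPosition: 0,
    toDelete: false
});
// record._value is now { id: 'A', total: 'B' }; the 'total' dimension
// carries index 0, so it moves ahead of 'id' after the sort in Dataset.map.
```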
@@ -17,7 +17,6 @@ const DriverFactory_1 = __importDefault(require("../../drivers/DriverFactory"));
  const ConsumerEngine_1 = __importDefault(require("../consumer/ConsumerEngine"));
  const PostProcessor_1 = __importDefault(require("../consumer/PostProcessor"));
  const FileExporter_1 = __importDefault(require("../file/FileExporter"));
- const ProducerEngine_1 = __importDefault(require("../producer/ProducerEngine"));
  const SQLBuilder_1 = __importDefault(require("../sql/SQLBuilder"));
  const SQLCompiler_1 = __importDefault(require("../sql/SQLCompiler"));
  const ExecutionPlanner_1 = __importDefault(require("./ExecutionPlanner"));
@@ -70,27 +69,6 @@ class ExecutionEnvironment {
  this._storeIntermidiate(planStep, dataset);
  break;
  }
- case 'read-file-whole': {
- (0, Affirm_1.default)(planStep.producer, `Invalid producer in read-file-whole step`);
- const fileData = yield ProducerEngine_1.default.readFile(planStep.producer, { readmode: 'all' });
- this._storeIntermidiate(planStep, fileData.dataset);
- break;
- }
- case 'read-file-lines': {
- (0, Affirm_1.default)(planStep.lines, `Invalid lines in read-file-lines step`);
- (0, Affirm_1.default)(planStep.producer, `Invalid producer in read-file-lines step`);
- const { producer, lines: { from, to } } = planStep;
- const fileData = yield ProducerEngine_1.default.readFile(producer, { readmode: 'lines', lines: { from, to } });
- this._storeIntermidiate(planStep, fileData.dataset);
- break;
- }
- case 'download-file-locally': {
- (0, Affirm_1.default)(planStep.producer, `Invalid producer in download-file-locally step`);
- const { producer } = planStep;
- const readRes = yield ProducerEngine_1.default.readFile(producer, { readmode: 'download' });
- this._storeIntermidiate(planStep, readRes.dataset);
- break;
- }
  case 'load-dataset': {
  (0, Affirm_1.default)(planStep.producer, `Invalid producer in read-file-lines step`);
  const { producer } = planStep;
@@ -171,14 +149,14 @@ class ExecutionEnvironment {
  }
  default: throw new Error(`Invalid execution plan step type "${planStep.type}"`);
  }
- Logger_1.default.log(`Completed step: ${planStep.type}`);
+ Logger_1.default.log(`\tCompleted step: ${planStep.type}`);
  }
  }
  catch (error) {
  const ds = (_a = this._resultingDataset) !== null && _a !== void 0 ? _a : this._getIntermidiate(currentStep);
  if (ds)
  Logger_1.default.log(`Failed execution of consumer at step ${currentStep.type}:\n\tSize: ${ds.getSize()}\n\tCycles: ${ds.getCycles()}\n\tOperations: ${Logger_1.default.formatList(ds.getOperations())}`);
- Logger_1.default.log(`Error at step ${currentStep.type}:\n\t${error}`);
+ Logger_1.default.log(`\tFailed step: ${currentStep.type}->\n\t${error}`);
  throw error;
  }
  Logger_1.default.log(`Completed execution of consumer:\n\tSize: ${result._stats.size}\n\tCycles: ${result._stats.cycles}\n\tTime: ${result._stats.elapsedMS}\n\tOperations: ${Logger_1.default.formatList(result._stats.operations)}`);
@@ -92,6 +92,8 @@ class JoinEngineClass {
  (0, Affirm_1.default)(producedData, 'Invalid produced data');
  if (consumer.producers.length <= 1)
  return this.findProducerData(consumer.producers[0].name, producedData);
+ if (consumer.producers.some(x => x.union))
+ return yield this.union(consumer, producedData);
  const consumerShape = ConsumerEngine_1.default.getOutputShape(consumer);
  const consumerColumns = ConsumerEngine_1.default.compile(consumer);
  // Create a new dataset for the joined result
@@ -132,6 +134,21 @@ class JoinEngineClass {
  }
  return resultDataset;
  });
+ this.union = (consumer, producedData) => __awaiter(this, void 0, void 0, function* () {
+ const getDimensionsKey = (ds) => ds.getDimensions().map(x => x.name.trim()).join(';').trim();
+ const mainDataset = producedData[0].dataset;
+ const mainDimKey = getDimensionsKey(mainDataset);
+ const otherProducedData = producedData.slice(1);
+ for (const prodData of otherProducedData) {
+ const prodDimKey = getDimensionsKey(prodData.dataset);
+ if (mainDimKey !== prodDimKey)
+ throw new Error(`On consumer "${consumer.name}", can't union the dataset "${prodData.dataset['_name']}" (producer: ${prodData.producerKey}) because the dimensions are different from the main dataset "${mainDataset['_name']}" (producer: ${producedData[0].producerKey}). "${mainDimKey}" != "${prodDimKey}"`);
+ yield prodData.dataset.streamBatches((batch) => __awaiter(this, void 0, void 0, function* () {
+ yield mainDataset.append(batch);
+ }));
+ }
+ return mainDataset;
+ });
  this.performStreamingJoin = (leftDataset, rightLookup, condition, relationship, consumerColumns, resultDataset) => __awaiter(this, void 0, void 0, function* () {
  const joinedRecords = [];
  const batchSize = leftDataset.getBatchSize();
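The union guard only compares dimension names, joined into a single key, so order matters as well as spelling. A standalone sketch of the check (simplified from the streaming code above):

```js
const getDimensionsKey = (ds) => ds.getDimensions().map(x => x.name.trim()).join(';').trim();

// Same names, same order  -> keys match, batches are appended:
//   ['id', 'amount'] vs ['id', 'amount']  => 'id;amount' === 'id;amount'
// Same names, other order -> keys differ, the union throws:
//   ['id', 'amount'] vs ['amount', 'id']  => 'id;amount' !== 'amount;id'
// Producers must therefore emit the same dimensions in the same order.
```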
@@ -90,6 +90,12 @@ class ValidatorClass {
  errors.push('No producers found');
  if (producers.some(x => !x))
  errors.push(`Invalid producer found in consumer "${consumer.name}"`);
+ if (consumer.producers.length > 0) {
+ const withJoins = consumer.producers.filter(x => (Algo_1.default.hasVal(x.joins) && x.joins.length > 0) || !x.union);
+ const withUnions = consumer.producers.filter(x => x.union === true);
+ if (withJoins.length > 0 && withUnions.length)
+ errors.push(`Multiple producers in consumer have mixed "joins" and "union": you can either have multiple producers with "joins" or multiple producers with "union", but not both (joins: ${withJoins.map(x => x.name).join(', ')}; unions: ${withUnions.map(x => x.name).join(', ')})`);
+ }
  // Validation on sources
  const sources = producers.map(x => Environment_1.default.getSource(x.source));
  if (sources.length === 0)
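With the new check, the producer mix below would be rejected (illustrative names). Note that the `withJoins` filter also matches any producer without `union`, so a union producer cannot be combined with a plain join-less producer either: every producer must opt in.

```js
// Rejected: one producer joins while another unions
const producers = [
    { name: 'orders', source: 'db', joins: [/* … */] },
    { name: 'orders_archive', source: 'db', union: true }
];
// Also rejected: { name: 'orders', source: 'db' } next to a union producer,
// because a producer with no "union" flag falls into the "withJoins" bucket.
// Accepted: every producer declares "union: true" and none declares "joins".
```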
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
  "name": "@forzalabs/remora",
- "version": "0.0.49-nasco.3",
+ "version": "0.0.51-nasco.3",
  "description": "A powerful CLI tool for seamless data translation.",
  "main": "index.js",
  "private": false,
@@ -9,6 +9,7 @@
  },
  "scripts": {
  "sync": "cd ../dev_ops && npm run sync",
+ "dev": "clear && npx tsx scripts/dev.ts",
  "tsc-check": "npx tsc --noemit",
  "init": "npx tsx ./src/index.ts init",
  "version": "npx tsx ./src/index.ts -v",