@forzalabs/remora 0.0.63-nasco.3 → 0.1.2-nasco.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/Constants.js CHANGED
@@ -1,7 +1,7 @@
  "use strict";
  Object.defineProperty(exports, "__esModule", { value: true });
  const CONSTANTS = {
- cliVersion: '0.0.63-nasco',
+ cliVersion: '0.1.2-nasco',
  lambdaVersion: 1,
  port: 5069,
  defaults: {
@@ -10,7 +10,9 @@ const CONSTANTS = {
  STRING_MAX_CHARACTERS_LENGTH: 10000000,
  MAX_ITEMS_IN_MEMORY: 200000,
  MIN_RUNTIME_HEAP_MB: 4000,
- RECOMMENDED_RUNTIME_HEAP_MB: 8000
+ RECOMMENDED_RUNTIME_HEAP_MB: 8000,
+ INDICATIVE_THREAD_LINE_COUNT: 750000,
+ MAX_THREAD_COUNT: 8
  }
  };
  exports.default = CONSTANTS;
@@ -38,6 +38,7 @@ const DriverHelper = {
  let hasValidatedHeader = shouldValidateHeader ? false : true;
  let leftoverData = '';
  let globalIndex = 0;
+ let lineCount = 0;
  const headerValidationTransform = new stream_1.Transform({
  transform(chunk, encoding, callback) {
  const chunkStr = leftoverData + chunk.toString();
@@ -99,6 +100,7 @@ const DriverHelper = {
  return true;
  };
  const processLine = (line) => {
+ lineCount++;
  switch (fileType) {
  case 'JSON':
  case 'JSONL': {
@@ -118,6 +120,7 @@ const DriverHelper = {
  const writeOptions = append ? { flags: 'a' } : {};
  const writeStream = (0, fs_1.createWriteStream)(destinationPath, writeOptions);
  yield (0, promises_1.pipeline)(stream, headerValidationTransform, writeStream);
+ return lineCount;
  }),
  quickReadFile: (filePath, lineCount) => __awaiter(void 0, void 0, void 0, function* () {
  var _a, e_1, _b, _c;
@@ -146,17 +146,20 @@ class LocalSourceDriver {
  // Get header line from the first file
  const headerLine = (yield DriverHelper_1.default.quickReadFile(path_1.default.join(this._path, allFileKeys[0]), 1))[0];
  dataset.setFirstLine(headerLine);
+ let totalLineCount = 0;
  // Copy files sequentially to avoid file conflicts
  for (let i = 0; i < allFileKeys.length; i++) {
- yield copyLocally(allFileKeys[i], headerLine, i > 0); // Append mode for subsequent files
+ totalLineCount += yield copyLocally(allFileKeys[i], headerLine, i > 0); // Append mode for subsequent files
  }
+ dataset.setCount(totalLineCount);
  return dataset;
  }
  else {
  // For single file, no header validation needed
  const headerLine = (yield DriverHelper_1.default.quickReadFile(path_1.default.join(this._path, fileKey), 1))[0];
  dataset.setFirstLine(headerLine);
- yield copyLocally(fileKey, headerLine, false);
+ const totalLineCount = yield copyLocally(fileKey, headerLine, false);
+ dataset.setCount(totalLineCount);
  return dataset;
  }
  });
@@ -226,10 +226,12 @@ class S3SourceDriver {
  const firstFileStream = firstFileResponse.Body;
  const headerLine = yield this.getFirstLineFromStream(firstFileStream);
  dataset.setFirstLine(headerLine);
+ let totalLineCount = 0;
  // Download files sequentially to avoid file conflicts
  for (let i = 0; i < allFileKeys.length; i++) {
- yield downloadLocally(allFileKeys[i], headerLine, i > 0); // Append mode for subsequent files
+ totalLineCount += yield downloadLocally(allFileKeys[i], headerLine, i > 0); // Append mode for subsequent files
  }
+ dataset.setCount(totalLineCount);
  return dataset;
  }
  else {
@@ -244,7 +246,8 @@ class S3SourceDriver {
  const headerLine = yield this.getFirstLineFromStream(firstFileStream);
  dataset.setFirstLine(headerLine);
  // For single file, no header validation needed
- yield downloadLocally(fileKey, headerLine);
+ const totalLineCount = yield downloadLocally(fileKey, headerLine);
+ dataset.setCount(totalLineCount);
  return dataset;
  }
  });
@@ -31,7 +31,7 @@ class PostProcessorClass {
  * - default field values
  * - masking/hashing of data
  */
- this.doProjection = (consumer, dataset) => __awaiter(this, void 0, void 0, function* () {
+ this.doProjection = (consumer, dataset, options) => __awaiter(this, void 0, void 0, function* () {
  (0, Affirm_1.default)(consumer, 'Invalid consumer');
  (0, Affirm_1.default)(dataset, 'Invalid dataset');
  const fields = ConsumerManager_1.default.getExpandedFields(consumer);
@@ -61,7 +61,7 @@ class PostProcessorClass {
  record.setValue(fieldKey, TypeCaster_1.default.cast(fieldValue, fieldType));
  }
  return record;
- });
+ }, options);
  newDataset.setDimensinons(updatedDimensions);
  return newDataset;
  });
@@ -49,15 +49,24 @@ class Dataset {
  return this;
  };
  this.getFirstLine = () => this._firstLine;
- this.getSize = () => this._size;
+ this.getCount = () => this._count;
+ this.setCount = (count) => {
+ this._count = count;
+ return this;
+ };
  this.getCycles = () => this._iterations;
  this.getDelimiter = () => this._delimiter;
+ this.setDelimiter = (delimiter) => {
+ this._delimiter = delimiter;
+ return this;
+ };
  this.getOperations = () => this._operations;
  this.load = (source) => __awaiter(this, void 0, void 0, function* () {
  (0, Affirm_1.default)(source, 'Invalid source');
  this._startOperation('load', { source: source.engine });
  const driver = yield DriverFactory_1.default.instantiateSource(source);
  yield driver.download(this);
+ this._size = this._computeSize();
  this._finishOperation('load');
  return this;
  });
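Note: the hunks above split the old `_size` field into two separate notions: `_count`, the number of records in the dataset (set explicitly via the new `setCount`, or derived while streaming), and `_size`, the size of the backing file on disk, recomputed via the `_computeSize` helper introduced later in this diff. A minimal TypeScript sketch of that split, with an illustrative file path and count (not taken from the package):

    import { statSync } from 'fs';

    const count = 1_250_000;                                              // number of data lines, as reported by a driver
    const sizeMb = statSync('/tmp/ds_example.csv').size / (1024 * 1024);  // what _computeSize() computes
    console.log(`count=${count} records, size=${sizeMb.toFixed(1)} MB`);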
@@ -101,7 +110,8 @@ class Dataset {
  }
  // Write all records to the dataset file
  yield this.append(records);
- this._size = data.length;
+ this._size = this._computeSize();
+ this._count = data.length;
  this._finishOperation('load-from-memory');
  return this;
  }
@@ -115,12 +125,14 @@ class Dataset {
  */
  this.transformStream = (transformer_1, ...args_1) => __awaiter(this, [transformer_1, ...args_1], void 0, function* (transformer, options = {}) {
  var _a, e_1, _b, _c;
+ var _d, _e, _f, _g;
  const inputPath = options.inputPath || this._path;
  const outputPath = options.outputPath || this._tempPath;
+ const fromLine = (_e = (_d = options.range) === null || _d === void 0 ? void 0 : _d.fromLine) !== null && _e !== void 0 ? _e : -1;
+ const toLine = (_g = (_f = options.range) === null || _f === void 0 ? void 0 : _f.toLine) !== null && _g !== void 0 ? _g : Infinity;
  this.ensureFile(outputPath);
- if (!fs_1.default.existsSync(inputPath)) {
+ if (!fs_1.default.existsSync(inputPath))
  throw new Error(`Input file does not exist: ${inputPath}`);
- }
  this._startOperation('transform-stream');
  const readStream = (0, fs_2.createReadStream)(inputPath);
  const writeStream = (0, fs_2.createWriteStream)(outputPath);
@@ -128,12 +140,22 @@ class Dataset {
  const dimensions = Algo_1.default.deepClone(this._dimensions);
  let batch = [];
  let lineCount = 0;
+ let index = 0;
  try {
- for (var _d = true, rl_1 = __asyncValues(rl), rl_1_1; rl_1_1 = yield rl_1.next(), _a = rl_1_1.done, !_a; _d = true) {
+ for (var _h = true, rl_1 = __asyncValues(rl), rl_1_1; rl_1_1 = yield rl_1.next(), _a = rl_1_1.done, !_a; _h = true) {
  _c = rl_1_1.value;
- _d = false;
+ _h = false;
  const line = _c;
  try {
+ if (index < fromLine) {
+ index++;
+ continue;
+ }
+ else if (index >= toLine) {
+ index++;
+ break;
+ }
+ index++;
  // Reuse record from pool and reinitialize it with new line data
  const record = this._recordPool.getNext(line, dimensions, this._delimiter);
  batch.push(record);
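Note: the new `index` counter and the `fromLine`/`toLine` bounds turn `transformStream` into a windowed pass over the file: lines before `fromLine` are skipped, and the loop stops once `toLine` is reached. This is the mechanism the parallel workers added later in this diff use so that each one processes only its slice. A minimal TypeScript sketch of the same windowing rule (not the package API), assuming the defaults `fromLine = -1` and `toLine = Infinity` when no range is given:

    function* window<T>(lines: Iterable<T>, fromLine = -1, toLine = Infinity): Generator<T> {
        let index = 0;
        for (const line of lines) {
            if (index < fromLine) { index++; continue; } // before the window: skip
            if (index >= toLine) break;                  // past the window: stop reading
            index++;
            yield line;
        }
    }

    // e.g. [...window(['a', 'b', 'c', 'd'], 1, 3)] yields ['b', 'c']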
@@ -144,7 +166,7 @@ class Dataset {
  writeStream.write(transformedRecord.stringify() + '\n');
  }
  batch = [];
- this._recordPool.reset(); // Reset pool index for next batch
+ this._recordPool.reset();
  }
  }
  catch (error) {
@@ -155,7 +177,7 @@ class Dataset {
  catch (e_1_1) { e_1 = { error: e_1_1 }; }
  finally {
  try {
- if (!_d && !_a && (_b = rl_1.return)) yield _b.call(rl_1);
+ if (!_h && !_a && (_b = rl_1.return)) yield _b.call(rl_1);
  }
  finally { if (e_1) throw e_1.error; }
  }
@@ -176,14 +198,15 @@ class Dataset {
  if (outputPath === this._tempPath) {
  fs_1.default.renameSync(this._tempPath, this._path);
  }
- this._size = lineCount;
+ this._count = lineCount;
+ this._size = this._computeSize();
  this._iterations++;
  this._finishOperation('transform-stream');
  });
  /**
  * Filter items in the file using batched streaming
  */
- this.filter = (predicate) => __awaiter(this, void 0, void 0, function* () {
+ this.filter = (predicate_1, ...args_1) => __awaiter(this, [predicate_1, ...args_1], void 0, function* (predicate, options = {}) {
  this._startOperation('filter');
  let globalIndex = 0;
  yield this.transformStream((batch) => __awaiter(this, void 0, void 0, function* () {
@@ -195,7 +218,7 @@ class Dataset {
  globalIndex++;
  }
  return filteredBatch;
- }));
+ }), options);
  this._finishOperation('filter');
  return this;
  });
@@ -617,7 +640,6 @@ class Dataset {
  }
  rl.close();
  readStream.close();
- this._size = lineCount;
  this._finishOperation('read-lines');
  return results;
  });
@@ -628,7 +650,14 @@ class Dataset {
  */
  this.prepare = (producer) => __awaiter(this, void 0, void 0, function* () {
  this._startOperation('prepare');
- const { delimiter, dimensions } = yield DatasetManager_1.default.buildDimensions(this, producer);
+ const dimsRes = yield DatasetManager_1.default.buildDimensions(this, producer);
+ yield this.prepareWithDimensions(dimsRes);
+ this._finishOperation('prepare');
+ return this;
+ });
+ this.prepareWithDimensions = (dimResult) => __awaiter(this, void 0, void 0, function* () {
+ this._startOperation('prepare-with-dimensions');
+ const { delimiter, dimensions } = dimResult;
  this._delimiter = delimiter;
  this._dimensions = dimensions;
  switch (this._file.fileType) {
@@ -670,7 +699,7 @@ class Dataset {
  break;
  }
  }
- this._finishOperation('prepare');
+ this._finishOperation('prepare-with-dimensions');
  return this;
  });
  this.getDimensions = () => this._dimensions;
@@ -686,7 +715,7 @@ class Dataset {
  this._recordPool.updateDimensions(this._dimensions, this._delimiter);
  };
  this.print = (...args_1) => __awaiter(this, [...args_1], void 0, function* (count = 3, full = false) {
- console.log(`DS ${this._name} (${this._size} | ${this._iterations})`);
+ console.log(`DS ${this.name} (${this._count} | ${this._iterations})`);
  console.log(this._dimensions.map(x => x.name).join(this._delimiter));
  const records = yield this.readLines(count);
  records.forEach(x => console.log(full ? x : x.stringify()));
@@ -695,7 +724,7 @@ class Dataset {
  this.printStats = () => {
  var _a, _b;
  const total = ((_b = (_a = this._operations) === null || _a === void 0 ? void 0 : _a.map(x => x.elapsedMs)) !== null && _b !== void 0 ? _b : []).reduce((sum, ms) => sum + ms, 0);
- console.log(`DS[stats] ${this._name} (size: ${this._size} | cycles: ${this._iterations} | ms: ${Helper_1.default.formatDuration(total)})`);
+ console.log(`DS[stats] ${this.name} (size: ${this._count} | cycles: ${this._iterations} | ms: ${Helper_1.default.formatDuration(total)})`);
  console.log(`Operations: ${this._operations.length}`);
  console.log(JSON.stringify(this._operations, null, 4));
  };
@@ -760,7 +789,7 @@ class Dataset {
  const finishedOperation = this._findRunningOperation(name);
  if (finishedOperation) {
  finishedOperation.status = 'completed';
- finishedOperation.count = this._size;
+ finishedOperation.count = this._count;
  finishedOperation.elapsedMs = performance.now() - finishedOperation.elapsedMs;
  }
  else {
@@ -797,18 +826,19 @@ class Dataset {
  };
  return searchInOperations(this._operations);
  };
- this._name = name;
+ this._computeSize = () => fs_1.default.statSync(this._path).size / (1024 * 1024);
+ this.name = name;
  this._file = file;
  this._batchSize = (_a = batchSize !== null && batchSize !== void 0 ? batchSize : parseInt(Environment_1.default.get('MAX_ITEMS_IN_MEMORY'))) !== null && _a !== void 0 ? _a : Constants_1.default.defaults.MAX_ITEMS_IN_MEMORY;
  this._dimensions = [];
  this._firstLine = '';
  this._delimiter = ',';
- this._size = 0;
+ this._count = 0;
  this._iterations = 0;
  this._operations = [];
  // Initialize record pool for optimization
  this._recordPool = new DatasetRecordPool_1.default(this._batchSize);
- const datasetName = this._name
+ const datasetName = this.name
  .replace(/[^a-zA-Z0-9_-]/g, '_')
  .replace(/_{2,}/g, '_')
  .replace(/^_+|_+$/g, '')
@@ -19,6 +19,8 @@ const Environment_1 = __importDefault(require("../Environment"));
  const FileCompiler_1 = __importDefault(require("../file/FileCompiler"));
  const ParseManager_1 = __importDefault(require("../parsing/ParseManager"));
  const Dataset_1 = __importDefault(require("./Dataset"));
+ const promises_1 = require("stream/promises");
+ const fs_1 = require("fs");
  class DatasetManagerClass {
  constructor() {
  this.create = (producer) => {
@@ -29,7 +31,7 @@ class DatasetManagerClass {
  fileType,
  hasHeaderRow,
  sheetName,
- delimiter: delimiter
+ delimiter
  });
  return dataset;
  };
@@ -125,7 +127,7 @@ class DatasetManagerClass {
  const { cField } = fields[i];
  const currentMatch = structuredClone(dimensions.find(x => x.name === cField.key));
  if (!currentMatch && !cField.fixed)
- throw new Error(`The requested field "${cField.key}" from the consumer is not present in the underlying dataset "${dataset['_name']}" (${dimensions.map(x => x.name).join(', ')})`);
+ throw new Error(`The requested field "${cField.key}" from the consumer is not present in the underlying dataset "${dataset.name}" (${dimensions.map(x => x.name).join(', ')})`);
  updates.push({
  currentDimension: currentMatch,
  newName: (_a = cField.alias) !== null && _a !== void 0 ? _a : cField.key,
@@ -147,6 +149,24 @@ class DatasetManagerClass {
  || x.newPosition !== x.currentDimension.index)));
  return updates;
  };
+ /**
+ * Each worker thread writes to its own dataset file to avoid concurrency issues and data loss;
+ * once the workers finish, their results are merged into a single file.
+ */
+ this.mergeWorkersPaths = (threadPaths, dataset) => __awaiter(this, void 0, void 0, function* () {
+ dataset.clear();
+ const datasetPath = dataset.getPath();
+ for (let i = 0; i < threadPaths.length; i++) {
+ const path = threadPaths[i];
+ const readStream = (0, fs_1.createReadStream)(path);
+ // For the first file, create a new write stream
+ // For subsequent files, append to the existing file
+ const writeStream = (0, fs_1.createWriteStream)(datasetPath, { flags: i === 0 ? 'w' : 'a' });
+ yield (0, promises_1.pipeline)(readStream, writeStream);
+ (0, fs_1.unlinkSync)(path);
+ }
+ return dataset;
+ });
  }
  }
  const DatasetManager = new DatasetManagerClass();
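Note: `mergeWorkersPaths` stitches the per-worker output files back into the dataset's own file and removes the partials. A standalone TypeScript sketch of the same append-by-pipeline pattern (the function and file names here are illustrative, not part of the package):

    import { createReadStream, createWriteStream, unlinkSync } from 'fs';
    import { pipeline } from 'stream/promises';

    async function mergeParts(partPaths: string[], targetPath: string): Promise<void> {
        for (let i = 0; i < partPaths.length; i++) {
            // the first part overwrites the target file, later parts append to it
            const writeStream = createWriteStream(targetPath, { flags: i === 0 ? 'w' : 'a' });
            await pipeline(createReadStream(partPaths[i]), writeStream);
            unlinkSync(partPaths[i]); // the partial file is no longer needed
        }
    }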
@@ -0,0 +1,158 @@
+ "use strict";
+ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
+ function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
+ return new (P || (P = Promise))(function (resolve, reject) {
+ function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
+ function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
+ function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
+ step((generator = generator.apply(thisArg, _arguments || [])).next());
+ });
+ };
+ var __importDefault = (this && this.__importDefault) || function (mod) {
+ return (mod && mod.__esModule) ? mod : { "default": mod };
+ };
+ Object.defineProperty(exports, "__esModule", { value: true });
+ const Constants_1 = __importDefault(require("../../Constants"));
+ const Affirm_1 = __importDefault(require("../../core/Affirm"));
+ const Environment_1 = __importDefault(require("../Environment"));
+ const workerpool_1 = __importDefault(require("workerpool"));
+ const DatasetManager_1 = __importDefault(require("./DatasetManager"));
+ const path_1 = __importDefault(require("path"));
+ class ParallelDatasetClass {
+ constructor() {
+ this._getWorkerPath = () => {
+ // Get the current file's directory
+ const currentDir = __dirname;
+ // Check if we're already in the .build directory (production)
+ if (currentDir.includes('.build')) {
+ // We're in production (.build/engines/dataset), go to .build/workers
+ const buildDir = currentDir.split('.build')[0] + '.build';
+ return path_1.default.join(buildDir, 'workers');
+ }
+ else {
+ // We're in development, workers are in ./.build/workers
+ return path_1.default.resolve('./.build/workers');
+ }
+ };
+ this._scopeWork = (dataset) => {
+ var _a;
+ const datasetCount = dataset.getCount();
+ const batchSize = (_a = parseInt(Environment_1.default.get('MAX_ITEMS_IN_MEMORY'))) !== null && _a !== void 0 ? _a : Constants_1.default.defaults.MAX_ITEMS_IN_MEMORY;
+ const workerChunkSize = batchSize * Math.round(Constants_1.default.defaults.INDICATIVE_THREAD_LINE_COUNT / batchSize);
+ const workerCount = Math.min(Math.ceil(datasetCount / workerChunkSize), Constants_1.default.defaults.MAX_THREAD_COUNT);
+ const adjustedWorkerCount = Math.ceil(datasetCount / workerCount);
+ return { workerCount, adjustedWorkerCount };
+ };
+ this.filter = (dataset, filters) => __awaiter(this, void 0, void 0, function* () {
+ (0, Affirm_1.default)(dataset, `Invalid dataset`);
+ (0, Affirm_1.default)(filters, `Invalid filters`);
+ // Distribute the work of the filter among the various workers, trying to have them match the batch size
+ const { adjustedWorkerCount, workerCount } = this._scopeWork(dataset);
+ dataset._startOperation('filter-parallel', { workerCount });
+ const threads = [];
+ for (let i = 0; i < workerCount; i++) {
+ const workerId = `worker_filter_${i}`;
+ const fromLine = adjustedWorkerCount * i;
+ const toLine = (i === workerCount - 1)
+ ? Infinity
+ : (adjustedWorkerCount * i) + adjustedWorkerCount;
+ const workerData = {
+ datasetDimensions: dataset.getDimensions(),
+ datasetFile: dataset.getFile(),
+ datasetName: dataset.name,
+ datasetDelimiter: dataset.getDelimiter(),
+ fromLine: fromLine,
+ toLine: toLine,
+ workerId: workerId,
+ filterData: {
+ rules: filters
+ }
+ };
+ threads.push(this._filterPool.exec('filter', [workerData]));
+ }
+ const results = yield Promise.all(threads);
+ yield this._filterPool.terminate();
+ (0, Affirm_1.default)(results.every(x => x.success), `Error in processing the dataset on multiple threads: filter ${dataset.name}`);
+ yield DatasetManager_1.default.mergeWorkersPaths(results.map(x => x.datasetPath), dataset);
+ dataset
+ .setDelimiter(results[0].datasetDelimiter)
+ .setDimensinons(results[0].datasetDimensions);
+ dataset._finishOperation('filter-parallel');
+ return dataset;
+ });
+ this.projection = (dataset, consumer) => __awaiter(this, void 0, void 0, function* () {
+ (0, Affirm_1.default)(dataset, `Invalid dataset`);
+ (0, Affirm_1.default)(consumer, `Invalid consumer`);
+ const { adjustedWorkerCount, workerCount } = this._scopeWork(dataset);
+ dataset._startOperation('projection-parallel', { workerCount });
+ const threads = [];
+ for (let i = 0; i < workerCount; i++) {
+ const workerId = `worker_projection_${i}`;
+ const fromLine = adjustedWorkerCount * i;
+ const toLine = (i === workerCount - 1)
+ ? Infinity
+ : (adjustedWorkerCount * i) + adjustedWorkerCount;
+ const workerData = {
+ datasetDimensions: dataset.getDimensions(),
+ datasetFile: dataset.getFile(),
+ datasetName: dataset.name,
+ datasetDelimiter: dataset.getDelimiter(),
+ fromLine: fromLine,
+ toLine: toLine,
+ workerId: workerId,
+ projectionData: { consumerName: consumer.name }
+ };
+ threads.push(this._projectionPool.exec('projection', [workerData]));
+ }
+ const results = yield Promise.all(threads);
+ yield this._projectionPool.terminate();
+ (0, Affirm_1.default)(results.every(x => x.success), `Error in processing the dataset on multiple threads: projection ${dataset.name}`);
+ yield DatasetManager_1.default.mergeWorkersPaths(results.map(x => x.datasetPath), dataset);
+ dataset
+ .setDelimiter(results[0].datasetDelimiter)
+ .setDimensinons(results[0].datasetDimensions);
+ dataset._finishOperation('projection-parallel');
+ return dataset;
+ });
+ this.transform = (dataset, consumer) => __awaiter(this, void 0, void 0, function* () {
+ (0, Affirm_1.default)(dataset, `Invalid dataset`);
+ (0, Affirm_1.default)(consumer, `Invalid consumer`);
+ const { adjustedWorkerCount, workerCount } = this._scopeWork(dataset);
+ dataset._startOperation('transform-parallel', { workerCount });
+ const threads = [];
+ for (let i = 0; i < workerCount; i++) {
+ const workerId = `worker_transform_${i}`;
+ const fromLine = adjustedWorkerCount * i;
+ const toLine = (i === workerCount - 1)
+ ? Infinity
+ : (adjustedWorkerCount * i) + adjustedWorkerCount;
+ const workerData = {
+ datasetDimensions: dataset.getDimensions(),
+ datasetFile: dataset.getFile(),
+ datasetName: dataset.name,
+ datasetDelimiter: dataset.getDelimiter(),
+ fromLine: fromLine,
+ toLine: toLine,
+ workerId: workerId,
+ transformData: { consumerName: consumer.name }
+ };
+ threads.push(this._transformPool.exec('transform', [workerData]));
+ }
+ const results = yield Promise.all(threads);
+ yield this._transformPool.terminate();
+ (0, Affirm_1.default)(results.every(x => x.success), `Error in processing the dataset on multiple threads: projection ${dataset.name}`);
+ yield DatasetManager_1.default.mergeWorkersPaths(results.map(x => x.datasetPath), dataset);
+ dataset
+ .setDelimiter(results[0].datasetDelimiter)
+ .setDimensinons(results[0].datasetDimensions);
+ dataset._finishOperation('transform-parallel');
+ return dataset;
+ });
+ const workerPath = this._getWorkerPath();
+ this._filterPool = workerpool_1.default.pool(path_1.default.join(workerPath, 'FilterWorker.js'));
+ this._projectionPool = workerpool_1.default.pool(path_1.default.join(workerPath, 'ProjectionWorker.js'));
+ this._transformPool = workerpool_1.default.pool(path_1.default.join(workerPath, 'TransformWorker.js'));
+ }
+ }
+ const ParallelDataset = new ParallelDatasetClass();
+ exports.default = ParallelDataset;
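Note: `_scopeWork` sizes the thread pool from the dataset's line count and the two new constants (`INDICATIVE_THREAD_LINE_COUNT`, `MAX_THREAD_COUNT`), and each worker then receives a `[fromLine, toLine)` slice. A worked TypeScript example of that arithmetic with assumed inputs (the dataset size below is illustrative, not from the package):

    const datasetCount = 3_000_000;   // assumed dataset line count
    const batchSize = 200_000;        // MAX_ITEMS_IN_MEMORY default
    const indicative = 750_000;       // INDICATIVE_THREAD_LINE_COUNT
    const maxThreads = 8;             // MAX_THREAD_COUNT

    const workerChunkSize = batchSize * Math.round(indicative / batchSize);              // 200000 * 4 = 800000
    const workerCount = Math.min(Math.ceil(datasetCount / workerChunkSize), maxThreads); // min(4, 8) = 4
    const adjustedWorkerCount = Math.ceil(datasetCount / workerCount);                   // 750000 lines per worker

    // Worker i then gets fromLine = adjustedWorkerCount * i and
    // toLine = adjustedWorkerCount * (i + 1), with the last worker's toLine set to Infinity.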
@@ -21,12 +21,12 @@ const SQLBuilder_1 = __importDefault(require("../sql/SQLBuilder"));
  const SQLCompiler_1 = __importDefault(require("../sql/SQLCompiler"));
  const ExecutionPlanner_1 = __importDefault(require("./ExecutionPlanner"));
  const RequestExecutor_1 = __importDefault(require("./RequestExecutor"));
- const TransformationEngine_1 = __importDefault(require("../transform/TransformationEngine"));
  const JoinEngine_1 = __importDefault(require("../transform/JoinEngine"));
  const DatasetManager_1 = __importDefault(require("../dataset/DatasetManager"));
  const Environment_1 = __importDefault(require("../Environment"));
  const Algo_1 = __importDefault(require("../../core/Algo"));
  const Logger_1 = __importDefault(require("../../helper/Logger"));
+ const ParallelDataset_1 = __importDefault(require("../dataset/ParallelDataset"));
  class ExecutionEnvironment {
  constructor(consumer) {
  this.run = (options) => __awaiter(this, void 0, void 0, function* () {
@@ -95,7 +95,7 @@ class ExecutionEnvironment {
  }
  case 'post-process-json': {
  const dataset = this._getIntermidiate(planStep);
- const newDataset = yield PostProcessor_1.default.doProjection(this._consumer, dataset);
+ const newDataset = yield ParallelDataset_1.default.projection(dataset, this._consumer);
  this._storeIntermidiate(planStep, newDataset);
  break;
  }
@@ -111,12 +111,11 @@ class ExecutionEnvironment {
  break;
  }
  case 'apply-consumer-filters-on-JSON': {
- const rules = this._consumer.filters.map(x => x.rule);
- this._resultingDataset = yield RequestExecutor_1.default.applyFilters(this._resultingDataset, rules);
+ this._resultingDataset = yield ParallelDataset_1.default.filter(this._resultingDataset, this._consumer.filters);
  break;
  }
  case 'apply-transformations': {
- this._resultingDataset = yield TransformationEngine_1.default.apply(this._consumer, this._resultingDataset);
+ this._resultingDataset = yield ParallelDataset_1.default.transform(this._resultingDataset, this._consumer);
  break;
  }
  case 'join-producers-data': {
@@ -134,7 +133,7 @@ class ExecutionEnvironment {
  cycles: this._resultingDataset.getCycles(),
  elapsedMS: performance.now() - start,
  operations: structuredClone(this._resultingDataset.getOperations()),
- size: this._resultingDataset.getSize()
+ size: this._resultingDataset.getCount()
  };
  break;
  }
@@ -155,7 +154,7 @@ class ExecutionEnvironment {
  catch (error) {
  const ds = (_c = this._resultingDataset) !== null && _c !== void 0 ? _c : (_d = this._producedData.at(-1)) === null || _d === void 0 ? void 0 : _d.dataset;
  if (ds)
- Logger_1.default.log(`Failed execution of consumer at step ${currentStep.type}:\n\tSize: ${ds.getSize()}\n\tCycles: ${ds.getCycles()}\n\tOperations: ${Logger_1.default.formatList(ds.getOperations())}`);
+ Logger_1.default.log(`Failed execution of consumer at step ${currentStep.type}:\n\tSize: ${ds.getCount()}\n\tCycles: ${ds.getCycles()}\n\tOperations: ${Logger_1.default.formatList(ds.getOperations())}`);
  Logger_1.default.log(`\tFailed step: ${currentStep.type}->\n\t${error}`);
  throw error;
  }
@@ -27,10 +27,10 @@ class RequestExecutorClass {
  dataset = yield this._applyOrdering(dataset, request.order);
  return dataset;
  });
- this.applyFilters = (dataset, filters) => __awaiter(this, void 0, void 0, function* () {
+ this.applyFilters = (dataset_1, filters_1, ...args_1) => __awaiter(this, [dataset_1, filters_1, ...args_1], void 0, function* (dataset, filters, options = {}) {
  return yield dataset.filter(record => {
  return filters.every(filter => this._evaluateFilter(record, filter));
- });
+ }, options);
  });
  this._evaluateFilter = (record, filter) => {
  const evaluate = (baseRecord, baseFilter) => {
@@ -142,7 +142,7 @@ class JoinEngineClass {
  for (const prodData of otherProducedData) {
  const prodDimKey = getDimensionsKey(prodData.dataset);
  if (mainDimKey !== prodDimKey)
- throw new Error(`On consumer "${consumer.name}", can't union the dataset "${prodData.dataset['_name']}" (producer: ${prodData.producerKey}) because the dimensions are different from the main dataset "${mainDataset['_name']}" (producer: ${producedData[0].producerKey}). "${mainDimKey}" != "${prodDimKey}"`);
+ throw new Error(`On consumer "${consumer.name}", can't union the dataset "${prodData.dataset.name}" (producer: ${prodData.producerKey}) because the dimensions are different from the main dataset "${mainDataset.name}" (producer: ${producedData[0].producerKey}). "${mainDimKey}" != "${prodDimKey}"`);
  yield prodData.dataset.streamBatches((batch) => __awaiter(this, void 0, void 0, function* () {
  yield mainDataset.append(batch);
  }));
@@ -17,7 +17,7 @@ const Algo_1 = __importDefault(require("../../core/Algo"));
  const TypeCaster_1 = __importDefault(require("./TypeCaster"));
  class TransformationEngineClass {
  constructor() {
- this.apply = (consumer, dataset) => __awaiter(this, void 0, void 0, function* () {
+ this.apply = (consumer, dataset, options) => __awaiter(this, void 0, void 0, function* () {
  (0, Affirm_1.default)(consumer, 'Invalid consumer');
  (0, Affirm_1.default)(dataset, 'Invalid data');
  const fieldsToTransform = consumer.fields.filter(field => Algo_1.default.hasVal(field.transform));
@@ -53,7 +53,7 @@ class TransformationEngineClass {
  }
  }
  return record;
- });
+ }, options);
  });
  this.isFieldCombinationTransformation = (transformation) => {
  if (Array.isArray(transformation)) {
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
  "name": "@forzalabs/remora",
- "version": "0.0.63-nasco.3",
+ "version": "0.1.2-nasco.3",
  "description": "A powerful CLI tool for seamless data translation.",
  "main": "index.js",
  "private": false,
@@ -9,7 +9,7 @@
  },
  "scripts": {
  "sync": "cd ../dev_ops && npm run sync",
- "dev": "clear && npx tsx scripts/threaded.ts",
+ "dev": "clear && npm run fast-build && clear && npx tsx scripts/dev.ts",
  "tsc-check": "npx tsc --noemit",
  "init": "npx tsx ./src/index.ts init",
  "version": "npx tsx ./src/index.ts -v",
@@ -21,6 +21,7 @@
  "create-producer": "npx tsx ./src/index.ts create-producer",
  "copy-static-file": "npx tsx ./scripts/CopyStaticFile.js",
  "build": "npm i && npm run sync && tsc --outDir .build && npm run copy-static-file",
+ "fast-build": "tsc --outDir .build",
  "upload": "npm run build && cd .build && npm publish --tag nasco --access=public"
  },
  "keywords": [
@@ -55,6 +56,7 @@
  "react": "^18.2.0",
  "react-dom": "^18.2.0",
  "seedrandom": "^3.0.5",
+ "workerpool": "^9.3.3",
  "xlsx": "^0.18.5",
  "zod": "^3.24.2"
  },
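Note: the new `workerpool` dependency backs `ParallelDataset` and the worker files added below: the main process creates a pool per worker script and invokes named methods with `pool.exec`, while each worker script registers its methods with `workerpool.worker`. A minimal TypeScript sketch of that round trip (the payload below is a trimmed-down illustration, not the full workerData the real workers expect):

    import workerpool from 'workerpool';

    async function runFilterWorker(): Promise<void> {
        const pool = workerpool.pool('./.build/workers/FilterWorker.js'); // dev path used by _getWorkerPath
        const result = await pool.exec('filter', [{ fromLine: 0, toLine: 750_000 }]); // single argument object, as in ParallelDataset
        await pool.terminate();
        console.log(result.success);
    }

    // Worker side (shown in FilterWorker.js below): workerpool.worker({ filter: run });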
@@ -0,0 +1,62 @@
+ "use strict";
+ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
+ function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
+ return new (P || (P = Promise))(function (resolve, reject) {
+ function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
+ function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
+ function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
+ step((generator = generator.apply(thisArg, _arguments || [])).next());
+ });
+ };
+ var __importDefault = (this && this.__importDefault) || function (mod) {
+ return (mod && mod.__esModule) ? mod : { "default": mod };
+ };
+ Object.defineProperty(exports, "__esModule", { value: true });
+ const workerpool_1 = __importDefault(require("workerpool"));
+ const dotenv_1 = __importDefault(require("dotenv"));
+ const Affirm_1 = __importDefault(require("../core/Affirm"));
+ const Dataset_1 = __importDefault(require("../engines/dataset/Dataset"));
+ const Environment_1 = __importDefault(require("../engines/Environment"));
+ const RequestExecutor_1 = __importDefault(require("../engines/execution/RequestExecutor"));
+ dotenv_1.default.configDotenv();
+ const run = (workerData) => __awaiter(void 0, void 0, void 0, function* () {
+ Environment_1.default.load('./');
+ try {
+ const { datasetName, fromLine, toLine, workerId, datasetFile, datasetDimensions, datasetDelimiter, filterData: filter } = workerData;
+ Affirm_1.default.hasValue(fromLine, `Invalid from line`);
+ Affirm_1.default.hasValue(toLine, `Invalid to line`);
+ (0, Affirm_1.default)(datasetName, `Invalid dataset name`);
+ (0, Affirm_1.default)(workerId, `Invalid worker id`);
+ (0, Affirm_1.default)(datasetFile, `Invalid dataset file`);
+ (0, Affirm_1.default)(datasetDimensions, `Invalid dataset dimensions`);
+ (0, Affirm_1.default)(filter, `Invalid filter data`);
+ (0, Affirm_1.default)(datasetDelimiter, `Invalid dataset delimter`);
+ const dataset = new Dataset_1.default(datasetName, datasetFile);
+ dataset
+ .setDimensinons(datasetDimensions)
+ .setDelimiter(datasetDelimiter);
+ const outputPath = dataset['_tempPath'] + workerId;
+ const rules = filter.rules.map(x => x.rule);
+ yield RequestExecutor_1.default.applyFilters(dataset, rules, { outputPath, range: { fromLine, toLine } });
+ const result = {
+ success: true,
+ datasetDelimiter: dataset.getDelimiter(),
+ datasetDimensions: dataset.getDimensions(),
+ datasetFile: dataset.getFile(),
+ datasetName: dataset.name,
+ datasetPath: outputPath
+ };
+ return result;
+ }
+ catch (error) {
+ console.error(error);
+ const result = {
+ success: false,
+ error
+ };
+ return result;
+ }
+ });
+ workerpool_1.default.worker({
+ filter: run
+ });
@@ -0,0 +1,63 @@
+ "use strict";
+ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
+ function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
+ return new (P || (P = Promise))(function (resolve, reject) {
+ function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
+ function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
+ function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
+ step((generator = generator.apply(thisArg, _arguments || [])).next());
+ });
+ };
+ var __importDefault = (this && this.__importDefault) || function (mod) {
+ return (mod && mod.__esModule) ? mod : { "default": mod };
+ };
+ Object.defineProperty(exports, "__esModule", { value: true });
+ const workerpool_1 = __importDefault(require("workerpool"));
+ const dotenv_1 = __importDefault(require("dotenv"));
+ const Affirm_1 = __importDefault(require("../core/Affirm"));
+ const Dataset_1 = __importDefault(require("../engines/dataset/Dataset"));
+ const Environment_1 = __importDefault(require("../engines/Environment"));
+ const PostProcessor_1 = __importDefault(require("../engines/consumer/PostProcessor"));
+ dotenv_1.default.configDotenv();
+ const run = (workerData) => __awaiter(void 0, void 0, void 0, function* () {
+ Environment_1.default.load('./');
+ try {
+ const { datasetName, fromLine, toLine, workerId, datasetFile, datasetDimensions, datasetDelimiter, projectionData } = workerData;
+ Affirm_1.default.hasValue(fromLine, `Invalid from line`);
+ Affirm_1.default.hasValue(toLine, `Invalid to line`);
+ (0, Affirm_1.default)(datasetName, `Invalid dataset name`);
+ (0, Affirm_1.default)(workerId, `Invalid worker id`);
+ (0, Affirm_1.default)(datasetFile, `Invalid dataset file`);
+ (0, Affirm_1.default)(datasetDimensions, `Invalid dataset dimensions`);
+ (0, Affirm_1.default)(projectionData, `Invalid projection data`);
+ (0, Affirm_1.default)(datasetDelimiter, `Invalid dataset delimter`);
+ const consumer = Environment_1.default.getConsumer(projectionData.consumerName);
+ (0, Affirm_1.default)(consumer, `Wrong consumer name sent to projection worker: "${projectionData.consumerName}" not found.`);
+ const dataset = new Dataset_1.default(datasetName, datasetFile);
+ dataset
+ .setDimensinons(datasetDimensions)
+ .setDelimiter(datasetDelimiter);
+ const outputPath = dataset['_tempPath'] + workerId;
+ yield PostProcessor_1.default.doProjection(consumer, dataset, { outputPath, range: { fromLine, toLine } });
+ const result = {
+ success: true,
+ datasetDelimiter: dataset.getDelimiter(),
+ datasetDimensions: dataset.getDimensions(),
+ datasetFile: dataset.getFile(),
+ datasetName: dataset.name,
+ datasetPath: outputPath
+ };
+ return result;
+ }
+ catch (error) {
+ console.error(error);
+ const result = {
+ success: false,
+ error
+ };
+ return result;
+ }
+ });
+ workerpool_1.default.worker({
+ projection: run
+ });
@@ -0,0 +1,63 @@
+ "use strict";
+ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
+ function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
+ return new (P || (P = Promise))(function (resolve, reject) {
+ function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
+ function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
+ function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
+ step((generator = generator.apply(thisArg, _arguments || [])).next());
+ });
+ };
+ var __importDefault = (this && this.__importDefault) || function (mod) {
+ return (mod && mod.__esModule) ? mod : { "default": mod };
+ };
+ Object.defineProperty(exports, "__esModule", { value: true });
+ const workerpool_1 = __importDefault(require("workerpool"));
+ const dotenv_1 = __importDefault(require("dotenv"));
+ const Affirm_1 = __importDefault(require("../core/Affirm"));
+ const Dataset_1 = __importDefault(require("../engines/dataset/Dataset"));
+ const Environment_1 = __importDefault(require("../engines/Environment"));
+ const TransformationEngine_1 = __importDefault(require("../engines/transform/TransformationEngine"));
+ dotenv_1.default.configDotenv();
+ const run = (workerData) => __awaiter(void 0, void 0, void 0, function* () {
+ Environment_1.default.load('./');
+ try {
+ const { datasetName, fromLine, toLine, workerId, datasetFile, datasetDimensions, datasetDelimiter, transformData } = workerData;
+ Affirm_1.default.hasValue(fromLine, `Invalid from line`);
+ Affirm_1.default.hasValue(toLine, `Invalid to line`);
+ (0, Affirm_1.default)(datasetName, `Invalid dataset name`);
+ (0, Affirm_1.default)(workerId, `Invalid worker id`);
+ (0, Affirm_1.default)(datasetFile, `Invalid dataset file`);
+ (0, Affirm_1.default)(datasetDimensions, `Invalid dataset dimensions`);
+ (0, Affirm_1.default)(transformData, `Invalid transform data`);
+ (0, Affirm_1.default)(datasetDelimiter, `Invalid dataset delimter`);
+ const consumer = Environment_1.default.getConsumer(transformData.consumerName);
+ (0, Affirm_1.default)(consumer, `Wrong consumer name sent to projection worker: "${transformData.consumerName}" not found.`);
+ const dataset = new Dataset_1.default(datasetName, datasetFile);
+ dataset
+ .setDimensinons(datasetDimensions)
+ .setDelimiter(datasetDelimiter);
+ const outputPath = dataset['_tempPath'] + workerId;
+ yield TransformationEngine_1.default.apply(consumer, dataset, { outputPath, range: { fromLine, toLine } });
+ const result = {
+ success: true,
+ datasetDelimiter: dataset.getDelimiter(),
+ datasetDimensions: dataset.getDimensions(),
+ datasetFile: dataset.getFile(),
+ datasetName: dataset.name,
+ datasetPath: outputPath
+ };
+ return result;
+ }
+ catch (error) {
+ console.error(error);
+ const result = {
+ success: false,
+ error
+ };
+ return result;
+ }
+ });
+ workerpool_1.default.worker({
+ transform: run
+ });
@@ -0,0 +1,14 @@
+ "use strict";
+ var __importDefault = (this && this.__importDefault) || function (mod) {
+ return (mod && mod.__esModule) ? mod : { "default": mod };
+ };
+ Object.defineProperty(exports, "__esModule", { value: true });
+ const workerpool_1 = __importDefault(require("workerpool"));
+ const DatasetRecord_1 = __importDefault(require("../engines/dataset/DatasetRecord"));
+ workerpool_1.default.worker({
+ ts: () => {
+ const tt = new DatasetRecord_1.default('bububub,bububbu', [{ hidden: false, index: 0, key: '11', name: '11' }], ',');
+ console.log(tt);
+ console.log('hello form typescript', tt.stringify());
+ }
+ });
@@ -0,0 +1,2 @@
+ "use strict";
+ Object.defineProperty(exports, "__esModule", { value: true });