@forzalabs/remora 0.2.6 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. package/Constants.js +10 -2
  2. package/actions/debug.js +1 -0
  3. package/actions/deploy.js +1 -0
  4. package/actions/run.js +17 -13
  5. package/actions/sample.js +1 -1
  6. package/core/Algo.js +8 -4
  7. package/definitions/ExecutorDefinitions.js +2 -0
  8. package/definitions/json_schemas/consumer-schema.json +1 -1
  9. package/definitions/json_schemas/producer-schema.json +1 -1
  10. package/definitions/temp.js +2 -0
  11. package/drivers/DeltaShareDriver.js +4 -0
  12. package/drivers/DriverFactory.js +10 -10
  13. package/drivers/DriverHelper.js +33 -10
  14. package/drivers/HttpApiDriver.js +4 -0
  15. package/drivers/LocalDriver.js +73 -6
  16. package/drivers/RedshiftDriver.js +4 -0
  17. package/drivers/S3Driver.js +36 -52
  18. package/drivers/files/LocalDestinationDriver.js +200 -0
  19. package/drivers/files/LocalSourceDriver.js +394 -0
  20. package/drivers/s3/S3DestinationDriver.js +159 -0
  21. package/drivers/s3/S3SourceDriver.js +455 -0
  22. package/engines/ai/LLM.js +0 -11
  23. package/engines/consumer/ConsumerEngine.js +0 -77
  24. package/engines/consumer/ConsumerManager.js +61 -36
  25. package/engines/consumer/ConsumerOnFinishManager.js +14 -0
  26. package/engines/consumer/PostProcessor.js +1 -7
  27. package/engines/dataset/Dataset.js +0 -61
  28. package/engines/dataset/DatasetManager.js +16 -76
  29. package/engines/dataset/DatasetRecord.js +4 -3
  30. package/engines/deployment/DeploymentPlanner.js +0 -7
  31. package/engines/execution/ExecutionPlanner.js +2 -2
  32. package/engines/execution/RequestExecutor.js +4 -45
  33. package/engines/file/FileExporter.js +7 -32
  34. package/engines/parsing/CSVParser.js +27 -26
  35. package/engines/parsing/LineParser.js +52 -0
  36. package/engines/parsing/XMLParser.js +1 -1
  37. package/engines/producer/ProducerEngine.js +0 -45
  38. package/engines/scheduler/CronScheduler.js +12 -4
  39. package/engines/scheduler/QueueManager.js +11 -4
  40. package/engines/sql/SQLCompiler.js +4 -4
  41. package/engines/transform/JoinEngine.js +3 -3
  42. package/engines/transform/TransformationEngine.js +3 -89
  43. package/engines/usage/UsageManager.js +8 -6
  44. package/engines/validation/Validator.js +12 -18
  45. package/executors/ConsumerExecutor.js +152 -0
  46. package/executors/Executor.js +168 -0
  47. package/executors/ExecutorOrchestrator.js +315 -0
  48. package/executors/ExecutorPerformance.js +17 -0
  49. package/executors/ExecutorProgress.js +52 -0
  50. package/executors/OutputExecutor.js +118 -0
  51. package/executors/ProducerExecutor.js +108 -0
  52. package/package.json +3 -3
  53. package/workers/ExecutorWorker.js +48 -0
@@ -0,0 +1,152 @@
1
+ "use strict";
2
+ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
3
+ function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
4
+ return new (P || (P = Promise))(function (resolve, reject) {
5
+ function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
6
+ function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
7
+ function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
8
+ step((generator = generator.apply(thisArg, _arguments || [])).next());
9
+ });
10
+ };
11
+ var __asyncValues = (this && this.__asyncValues) || function (o) {
12
+ if (!Symbol.asyncIterator) throw new TypeError("Symbol.asyncIterator is not defined.");
13
+ var m = o[Symbol.asyncIterator], i;
14
+ return m ? m.call(o) : (o = typeof __values === "function" ? __values(o) : o[Symbol.iterator](), i = {}, verb("next"), verb("throw"), verb("return"), i[Symbol.asyncIterator] = function () { return this; }, i);
15
+ function verb(n) { i[n] = o[n] && function (v) { return new Promise(function (resolve, reject) { v = o[n](v), settle(resolve, reject, v.done, v.value); }); }; }
16
+ function settle(resolve, reject, d, v) { Promise.resolve(v).then(function(v) { resolve({ value: v, done: d }); }, reject); }
17
+ };
18
+ var __importDefault = (this && this.__importDefault) || function (mod) {
19
+ return (mod && mod.__esModule) ? mod : { "default": mod };
20
+ };
21
+ Object.defineProperty(exports, "__esModule", { value: true });
22
+ const path_1 = __importDefault(require("path"));
23
+ const fs_1 = __importDefault(require("fs"));
24
+ const readline_1 = __importDefault(require("readline"));
25
+ const promises_1 = __importDefault(require("fs/promises"));
26
+ const TransformationEngine_1 = __importDefault(require("../engines/transform/TransformationEngine"));
27
+ const RequestExecutor_1 = __importDefault(require("../engines/execution/RequestExecutor"));
28
+ const Affirm_1 = __importDefault(require("../core/Affirm"));
29
+ const Constants_1 = __importDefault(require("../Constants"));
30
class ConsumerExecutorClass {
    constructor() {
        /**
         * Builds the per-execution path of the intermediate ".dataset" work file
         * for `consumer` under the given execution id.
         */
        this._getWorkPath = (consumer, executionId) => {
            const execFolder = path_1.default.join(consumer.name, executionId);
            const workPath = path_1.default.join('./remora', Constants_1.default.defaults.PRODUCER_TEMP_FOLDER, execFolder, '.dataset');
            return workPath;
        };
        /**
         * Best-effort removal of a work file and (if empty) its parent folder.
         * Errors are deliberately swallowed: cleanup must never fail a run.
         */
        this._clearWorkPath = (workPath) => __awaiter(this, void 0, void 0, function* () {
            try {
                if (fs_1.default.existsSync(workPath)) {
                    yield promises_1.default.unlink(workPath);
                }
                // eslint-disable-next-line @typescript-eslint/no-unused-vars
            }
            catch (error) {
                // Ignore file deletion errors
            }
            try {
                const dir = path_1.default.dirname(workPath);
                if (fs_1.default.existsSync(dir)) {
                    // Non-recursive rmdir: only removes the folder when already empty.
                    yield promises_1.default.rmdir(dir);
                }
                // eslint-disable-next-line @typescript-eslint/no-unused-vars
            }
            catch (error) {
                // Ignore directory deletion errors
            }
        });
        /** Creates the directory chain and an empty file at `pathUri` if missing. */
        this._ensurePath = (pathUri) => {
            // make sure that the workpath exists
            const dir = path_1.default.dirname(pathUri);
            if (!fs_1.default.existsSync(dir))
                fs_1.default.mkdirSync(dir, { recursive: true });
            if (!fs_1.default.existsSync(pathUri))
                fs_1.default.writeFileSync(pathUri, '');
        };
        /** Prepares the consumer's work file and returns a write stream onto it. */
        this.ready = (consumer, executionId) => {
            (0, Affirm_1.default)(consumer, 'Invalid consumer');
            const workPath = this._getWorkPath(consumer, executionId);
            this._ensurePath(workPath);
            return fs_1.default.createWriteStream(workPath);
        };
        /**
         * Maps one producer record onto the consumer's fields: applies fixed
         * defaults, renames aliased dimensions, applies per-field transformations,
         * strips unrequested producer dimensions and evaluates consumer/request
         * filters. Returns the mapped record, or null when a filter rejects it.
         */
        this.processRecord = (options) => {
            var _a;
            const { consumer, fields, dimensions, producer, record, requestOptions } = options;
            // map to consumer fields and apply consumer field logic
            for (const field of fields) {
                const { cField } = field;
                const fieldKey = (_a = cField.alias) !== null && _a !== void 0 ? _a : cField.key;
                // set the fixed default value for the field, or throw error if not present in the producer
                const dimension = dimensions.find(x => x.name === cField.key);
                if (!dimension) {
                    if (cField.fixed && cField.default)
                        record[fieldKey] = cField.default;
                    else
                        throw new Error(`The requested field "${cField.key}" from the consumer is not present in the underlying producer "${producer.name}" (${dimensions.map(x => x.name).join(', ')})`);
                }
                // change the name of the dimension
                // BUGFIX: guard on `dimension` — when the field is satisfied by a fixed
                // default the dimension is absent and `dimension.name` used to throw a
                // TypeError here instead of keeping the defaulted value.
                if (dimension && cField.alias && cField.alias !== dimension.name) {
                    record[cField.alias] = record[dimension.name];
                    delete record[dimension.name];
                }
                // apply transformations
                if (cField.transform)
                    record[fieldKey] = TransformationEngine_1.default.applyTransformations(record[fieldKey], cField.transform, cField, record);
            }
            // remove un-wanted producer dimensions
            for (const dimension of dimensions) {
                const field = fields.find(x => x.cField.key === dimension.name);
                if (!field)
                    delete record[dimension.name];
            }
            // apply consumer filters
            if (consumer.filters && consumer.filters.length > 0) {
                const isKept = consumer.filters.every(x => RequestExecutor_1.default.evaluateFilter(record, x.rule));
                if (!isKept)
                    return null;
            }
            // apply request custom filters
            if (requestOptions && requestOptions.filters) {
                const isKept = requestOptions.filters.every(x => RequestExecutor_1.default.evaluateFilter(record, x));
                if (!isKept)
                    return null;
            }
            return record;
        };
        /**
         * De-duplicates the work file line-by-line (in-memory Set of seen lines),
         * then replaces the work file with the distinct version.
         * Returns the number of distinct lines kept.
         * NOTE(review): the whole distinct set is held in memory — assumes the
         * dataset fits in the worker heap; confirm against expected data sizes.
         */
        this.processDistinct = (consumer, executionId) => __awaiter(this, void 0, void 0, function* () {
            var _a, e_1, _b, _c;
            const workPath = this._getWorkPath(consumer, executionId);
            const reader = fs_1.default.createReadStream(workPath);
            const lineReader = readline_1.default.createInterface({ input: reader, crlfDelay: Infinity });
            const tempWorkPath = workPath + '_tmp';
            const writer = fs_1.default.createWriteStream(tempWorkPath);
            let newLineCount = 0;
            const seen = new Set();
            try {
                for (var _d = true, lineReader_1 = __asyncValues(lineReader), lineReader_1_1; lineReader_1_1 = yield lineReader_1.next(), _a = lineReader_1_1.done, !_a; _d = true) {
                    _c = lineReader_1_1.value;
                    _d = false;
                    const line = _c;
                    if (!seen.has(line)) {
                        seen.add(line);
                        writer.write(line + '\n');
                        newLineCount++;
                    }
                }
            }
            catch (e_1_1) { e_1 = { error: e_1_1 }; }
            finally {
                try {
                    if (!_d && !_a && (_b = lineReader_1.return)) yield _b.call(lineReader_1);
                }
                finally { if (e_1) throw e_1.error; }
            }
            // BUGFIX: wait for the writer to flush and close ('finish') before the
            // rename. The previous writer.close() + renameSync raced the async
            // flush and fails with EPERM on Windows while the handle is open.
            yield new Promise((resolve, reject) => writer.end(err => (err ? reject(err) : resolve())));
            reader.close();
            fs_1.default.renameSync(tempWorkPath, workPath);
            return newLineCount;
        });
    }
}
const ConsumerExecutor = new ConsumerExecutorClass();
exports.default = ConsumerExecutor;
@@ -0,0 +1,168 @@
1
+ "use strict";
2
+ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
3
+ function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
4
+ return new (P || (P = Promise))(function (resolve, reject) {
5
+ function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
6
+ function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
7
+ function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
8
+ step((generator = generator.apply(thisArg, _arguments || [])).next());
9
+ });
10
+ };
11
+ var __asyncValues = (this && this.__asyncValues) || function (o) {
12
+ if (!Symbol.asyncIterator) throw new TypeError("Symbol.asyncIterator is not defined.");
13
+ var m = o[Symbol.asyncIterator], i;
14
+ return m ? m.call(o) : (o = typeof __values === "function" ? __values(o) : o[Symbol.iterator](), i = {}, verb("next"), verb("throw"), verb("return"), i[Symbol.asyncIterator] = function () { return this; }, i);
15
+ function verb(n) { i[n] = o[n] && function (v) { return new Promise(function (resolve, reject) { v = o[n](v), settle(resolve, reject, v.done, v.value); }); }; }
16
+ function settle(resolve, reject, d, v) { Promise.resolve(v).then(function(v) { resolve({ value: v, done: d }); }, reject); }
17
+ };
18
+ var __importDefault = (this && this.__importDefault) || function (mod) {
19
+ return (mod && mod.__esModule) ? mod : { "default": mod };
20
+ };
21
+ Object.defineProperty(exports, "__esModule", { value: true });
22
+ const fs_1 = __importDefault(require("fs"));
23
+ const readline_1 = __importDefault(require("readline"));
24
+ const ProducerExecutor_1 = __importDefault(require("./ProducerExecutor"));
25
+ const ConsumerExecutor_1 = __importDefault(require("./ConsumerExecutor"));
26
+ const Affirm_1 = __importDefault(require("../core/Affirm"));
27
+ const OutputExecutor_1 = __importDefault(require("./OutputExecutor"));
28
+ const ConsumerManager_1 = __importDefault(require("../engines/consumer/ConsumerManager"));
29
+ const ExecutorPerformance_1 = __importDefault(require("./ExecutorPerformance"));
30
class Executor {
    constructor() {
        // Emit a progress packet to the orchestrator every N processed lines.
        this._REPORT_WORK_AFTER_LINES = 1000;
        /**
         * Processes one file chunk for a consumer worker:
         * 1. check and ready the local file for processing
         * 2. open read stream and write stream
         * 3. process the file line by line (producer parse -> consumer map -> output)
         * 4. run multi-line operations (distinct) and return the worker result
         */
        this.run = (request) => __awaiter(this, void 0, void 0, function* () {
            var _a, e_1, _b, _c;
            var _d;
            (0, Affirm_1.default)(request, 'Invalid request');
            const { consumer, producer, prodDimensions, workerId, chunk, options, reportWork } = request;
            const counter = performance.now();
            const result = {
                executionId: workerId,
                cycles: 1,
                elapsedMS: -1,
                inputCount: -1,
                outputCount: -1,
                resultUri: ConsumerExecutor_1.default._getWorkPath(consumer, workerId),
                operations: {}
            };
            let totalOutputCount = 0, totalCycles = 1, perf = 0, lineIndex = 0;
            const readStream = this.openReadStream(chunk);
            const writeStream = this.openWriteStream(consumer, workerId);
            const fields = ConsumerManager_1.default.getExpandedFields(consumer);
            const { isFirstChunk, start, end } = chunk;
            const totalBytes = end - start;
            let processedBytes = 0;
            // Process all the line-independent operations of the consumer in a single pass
            const lineStream = readline_1.default.createInterface({ input: readStream, crlfDelay: Infinity });
            try {
                for (var _e = true, lineStream_1 = __asyncValues(lineStream), lineStream_1_1; lineStream_1_1 = yield lineStream_1.next(), _a = lineStream_1_1.done, !_a; _e = true) {
                    _c = lineStream_1_1.value;
                    _e = false;
                    const line = _c;
                    // Only the first chunk of the file can contain a header row.
                    if (lineIndex === 0 && isFirstChunk) {
                        if (!this.shouldProcessFirstLine(producer)) {
                            lineIndex++;
                            continue;
                        }
                    }
                    perf = performance.now();
                    let record = ProducerExecutor_1.default.processLine({
                        dimensions: prodDimensions,
                        index: lineIndex,
                        line,
                        producer,
                        tracker: this._performance
                    });
                    this._performance.measure('process-line', performance.now() - perf);
                    if (!record) {
                        lineIndex++;
                        continue;
                    }
                    perf = performance.now();
                    record = ConsumerExecutor_1.default.processRecord({
                        record,
                        index: lineIndex,
                        consumer: consumer,
                        fields,
                        producer,
                        dimensions: prodDimensions,
                        requestOptions: options
                    });
                    this._performance.measure('process-record', performance.now() - perf);
                    if (!record) {
                        // Filtered out by consumer/request filters
                        lineIndex++;
                        continue;
                    }
                    perf = performance.now();
                    const outputLine = OutputExecutor_1.default.outputRecord(record, consumer, fields);
                    this._performance.measure('output-record', performance.now() - perf);
                    writeStream.write(outputLine + '\n');
                    totalOutputCount++;
                    lineIndex++;
                    // Report progress to the main thread
                    if (reportWork && lineIndex % this._REPORT_WORK_AFTER_LINES === 0) {
                        processedBytes = Math.min(readStream.bytesRead, totalBytes);
                        reportWork({ processed: processedBytes, total: totalBytes, workerId: workerId });
                    }
                }
            }
            catch (e_1_1) { e_1 = { error: e_1_1 }; }
            finally {
                try {
                    if (!_e && !_a && (_b = lineStream_1.return)) yield _b.call(lineStream_1);
                }
                finally { if (e_1) throw e_1.error; }
            }
            // BUGFIX: flush and close the worker's write stream before any follow-up
            // step reads the work file; previously buffered bytes could still be in
            // flight when processDistinct / the orchestrator merge read the file.
            yield new Promise((resolve, reject) => writeStream.end(err => (err ? reject(err) : resolve())));
            // Process the operations that work on multiple lines
            if (((_d = consumer.options) === null || _d === void 0 ? void 0 : _d.distinct) === true) {
                perf = performance.now();
                totalOutputCount = yield ConsumerExecutor_1.default.processDistinct(consumer, workerId);
                this._performance.measure('process-distinct', performance.now() - perf);
                totalCycles++;
            }
            result.elapsedMS = performance.now() - counter;
            result.cycles = totalCycles;
            result.inputCount = lineIndex;
            result.outputCount = totalOutputCount;
            result.resultUri = ConsumerExecutor_1.default._getWorkPath(consumer, workerId);
            result.operations = this._performance.getOperations();
            return result;
        });
        /**
         * Opens a read stream over exactly the bytes of this worker's chunk.
         * BUGFIX: fs.createReadStream treats `end` as INCLUSIVE, while chunk
         * boundaries are exclusive (the next chunk starts at `end`). Passing
         * `end: end` read one byte of the next chunk and readline emitted it as
         * a spurious extra "line" at every chunk boundary. Use `end - 1`.
         */
        this.openReadStream = (chunk) => {
            const { end, fileUri, start } = chunk;
            return fs_1.default.createReadStream(fileUri, { start, end: end - 1 });
        };
        /** Opens the worker's destination write stream via the consumer executor. */
        this.openWriteStream = (consumer, executionId) => {
            return ConsumerExecutor_1.default.ready(consumer, executionId);
        };
        /**
         * Decides whether the first physical line of the file carries data
         * (true) or is a header/irrelevant line to skip (false).
         */
        this.shouldProcessFirstLine = (producer) => {
            (0, Affirm_1.default)(producer, 'Invalid producer');
            const { settings: { fileType, hasHeaderRow } } = producer;
            switch (fileType) {
                case 'PARQUET':
                case 'XML':
                case 'XLS':
                case 'XLSX':
                case 'CSV':
                    return false;
                case 'TXT': {
                    if (hasHeaderRow)
                        return false;
                    else
                        return true;
                }
                case 'JSON':
                case 'JSONL':
                    return true;
                default:
                    // Unknown file types previously fell through to `undefined`
                    // (falsy, i.e. "skip the first line"); make that explicit.
                    return false;
            }
        };
        // Per-worker timing accumulator reported back in result.operations.
        this._performance = new ExecutorPerformance_1.default();
    }
}
exports.default = Executor;
@@ -0,0 +1,315 @@
1
+ "use strict";
2
+ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
3
+ function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
4
+ return new (P || (P = Promise))(function (resolve, reject) {
5
+ function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
6
+ function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
7
+ function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
8
+ step((generator = generator.apply(thisArg, _arguments || [])).next());
9
+ });
10
+ };
11
+ var __importDefault = (this && this.__importDefault) || function (mod) {
12
+ return (mod && mod.__esModule) ? mod : { "default": mod };
13
+ };
14
+ Object.defineProperty(exports, "__esModule", { value: true });
15
+ const os_1 = __importDefault(require("os"));
16
+ const fs_1 = __importDefault(require("fs"));
17
+ const promises_1 = __importDefault(require("fs/promises"));
18
+ const path_1 = __importDefault(require("path"));
19
+ const promises_2 = require("stream/promises");
20
+ const workerpool_1 = __importDefault(require("workerpool"));
21
+ const Affirm_1 = __importDefault(require("../core/Affirm"));
22
+ const UsageManager_1 = __importDefault(require("../engines/usage/UsageManager"));
23
+ const Helper_1 = __importDefault(require("../helper/Helper"));
24
+ const Environment_1 = __importDefault(require("../engines/Environment"));
25
+ const ProducerExecutor_1 = __importDefault(require("./ProducerExecutor"));
26
+ const Constants_1 = __importDefault(require("../Constants"));
27
+ const DriverHelper_1 = __importDefault(require("../drivers/DriverHelper"));
28
+ const ConsumerExecutor_1 = __importDefault(require("./ConsumerExecutor"));
29
+ const OutputExecutor_1 = __importDefault(require("./OutputExecutor"));
30
+ const ConsumerManager_1 = __importDefault(require("../engines/consumer/ConsumerManager"));
31
+ const ExecutorPerformance_1 = __importDefault(require("./ExecutorPerformance"));
32
+ const ExecutorProgress_1 = __importDefault(require("./ExecutorProgress"));
33
+ const Algo_1 = __importDefault(require("../core/Algo"));
34
+ const ConsumerOnFinishManager_1 = __importDefault(require("../engines/consumer/ConsumerOnFinishManager"));
35
class ExecutorOrchestratorClass {
    constructor() {
        /** Lazily creates the worker-thread pool that runs ExecutorWorker.js. */
        this.init = () => {
            if (!this._executorPool) {
                const options = {
                    workerThreadOpts: {
                        resourceLimits: {
                            maxOldGenerationSizeMb: Constants_1.default.defaults.MIN_RUNTIME_HEAP_MB
                        }
                    }
                };
                const workerPath = this._getWorkerPath();
                this._executorPool = workerpool_1.default.pool(path_1.default.join(workerPath, 'ExecutorWorker.js'), options);
            }
        };
        /**
         * Runs a full consumer execution: readies source files, fans chunks out
         * to the worker pool, merges worker outputs, applies multi-line
         * operations (distinct), exports the result and records usage.
         * Throws (after cleanup + usage failure) on any error.
         */
        this.launch = (request) => __awaiter(this, void 0, void 0, function* () {
            var _a;
            (0, Affirm_1.default)(request, 'Invalid options');
            const { consumer, details, logProgress, options } = request;
            (0, Affirm_1.default)(consumer, 'Invalid consumer');
            (0, Affirm_1.default)(details, 'Invalid execution details');
            const tracker = new ExecutorPerformance_1.default();
            const _progress = new ExecutorProgress_1.default(logProgress);
            const { usageId } = UsageManager_1.default.startUsage(consumer, details);
            const workersId = [];
            try {
                const start = performance.now();
                this.init();
                const executorResults = [];
                const sourceFilesByProducer = yield this.readySourceFiles(consumer);
                for (const pair of sourceFilesByProducer) {
                    const { prod, cProd, response } = pair;
                    // Make sure that the data files are there, if missing and isOptional = true, then skip
                    if (!fs_1.default.existsSync(response.files[0].fullUri)) {
                        if (!cProd.isOptional)
                            throw new Error(`Expected data file ${response.files[0].fullUri} of producer ${prod.name} in consumer ${consumer.name} is missing.`);
                        else if (cProd.isOptional === true)
                            continue;
                    }
                    // Extract the dimensions for this producer just once
                    const firstLine = (yield DriverHelper_1.default.quickReadFile(response.files[0].fullUri, 1))[0];
                    const header = ProducerExecutor_1.default.processHeader(firstLine, prod);
                    const prodDimensions = ProducerExecutor_1.default.reconcileHeader(header, prod);
                    for (const file of response.files) {
                        const chunks = ExecutorOrchestrator.scopeWork(file.fullUri);
                        const workerThreads = [];
                        for (const [index, chunk] of chunks.entries()) {
                            // Spawn off thread
                            const workerId = `${usageId}_${index}`;
                            const workerData = {
                                chunk,
                                consumer,
                                prodDimensions,
                                producer: prod,
                                workerId: workerId,
                                options: options
                            };
                            _progress.register((index + 1).toString());
                            workersId.push(workerId);
                            workerThreads.push(this._executorPool.exec('executor', [workerData], {
                                on: payload => this.onWorkAdvanced(payload, index, _progress)
                            }));
                        }
                        executorResults.push(...yield Promise.all(workerThreads));
                    }
                }
                // BUGFIX: terminate the pool only after ALL files of ALL producers
                // have been processed. The pool used to be terminated inside the
                // per-file loop, so any consumer with more than one file/producer
                // dispatched work to an already-terminated pool. Resetting the
                // reference lets init() build a fresh pool on the next launch,
                // which addresses the old "singleton reuse" WARNING as well.
                yield this._executorPool.terminate();
                this._executorPool = undefined;
                _progress.complete();
                yield this.reconcileExecutorThreadsResults(consumer, usageId, executorResults, tracker);
                // If there is more than one worker, then I need to redo the operations that are done on multiple lines (cause now the worker files have been merged together)
                const postOperation = { totalOutputCount: null };
                if (executorResults.length > 1) {
                    if (((_a = consumer.options) === null || _a === void 0 ? void 0 : _a.distinct) === true) {
                        const perf = performance.now();
                        const unifiedOutputCount = yield ConsumerExecutor_1.default.processDistinct(consumer, usageId);
                        tracker.measure('process-distinct:main', performance.now() - perf);
                        postOperation.totalOutputCount = unifiedOutputCount;
                    }
                }
                // Export to the destination
                let perf = performance.now();
                const exportRes = yield OutputExecutor_1.default.exportResult(consumer, usageId, ConsumerManager_1.default.getExpandedFields(consumer));
                tracker.measure('export-result', performance.now() - perf);
                // Perform on-success actions if any
                if (consumer.outputs.some(x => x.onSuccess)) {
                    perf = performance.now();
                    yield ConsumerOnFinishManager_1.default.onConsumerSuccess(consumer, usageId);
                    tracker.measure('on-success-actions', performance.now() - perf);
                }
                yield this.performCleanupOperations(consumer, usageId, executorResults.map(x => x.resultUri), tracker);
                const finalResult = this.computeFinalResult(tracker, executorResults, usageId, exportRes.key);
                finalResult.elapsedMS = performance.now() - start;
                if (Algo_1.default.hasVal(postOperation.totalOutputCount))
                    finalResult.outputCount = postOperation.totalOutputCount;
                UsageManager_1.default.endUsage(usageId, finalResult.outputCount, finalResult);
                return finalResult;
            }
            catch (error) {
                // Best-effort pool teardown so a failed run does not leak worker threads.
                try {
                    if (this._executorPool) {
                        yield this._executorPool.terminate(true);
                        this._executorPool = undefined;
                    }
                }
                catch (terminateError) {
                    // Ignore teardown failures; the original error is what matters.
                }
                yield ConsumerOnFinishManager_1.default.onConsumerError(consumer, usageId);
                yield this.performCleanupOperations(consumer, usageId, workersId.map(x => ConsumerExecutor_1.default._getWorkPath(consumer, x)), tracker);
                UsageManager_1.default.failUsage(usageId, Helper_1.default.asError(error).message);
                throw error;
            }
        });
        /**
         * Calculates line-aligned chunk offsets for parallel file processing.
         * Each chunk boundary is adjusted to the next newline to avoid breaking lines.
         * Returns a single chunk for small files where parallelism overhead isn't worth it.
         */
        this.scopeWork = (fileUri, numChunks) => {
            const fileSize = fs_1.default.statSync(fileUri).size;
            if (fileSize === 0)
                return [];
            // Small files: single chunk, parallelism overhead not worth it
            if (fileSize < Constants_1.default.defaults.MIN_FILE_SIZE_FOR_PARALLEL) {
                return [{ start: 0, end: fileSize, isFirstChunk: true, fileUri }];
            }
            // Calculate optimal chunk count based on file size and CPU cores
            const cpus = numChunks !== null && numChunks !== void 0 ? numChunks : (os_1.default.cpus().length - 1);
            const maxChunksBySize = Math.floor(fileSize / Constants_1.default.defaults.MIN_CHUNK_SIZE);
            const effectiveChunks = Math.min(cpus, maxChunksBySize);
            if (effectiveChunks <= 1)
                return [{ start: 0, end: fileSize, isFirstChunk: true, fileUri }];
            const targetChunkSize = Math.floor(fileSize / effectiveChunks);
            const fd = fs_1.default.openSync(fileUri, 'r');
            try {
                const offsets = [];
                let currentStart = 0;
                // BUGFIX: iterate up to effectiveChunks - 1 (was cpus - 1) so the
                // number of chunks never exceeds the size-capped optimum computed
                // above; with cpus > effectiveChunks the old bound could append an
                // extra undersized trailing chunk.
                for (let i = 0; i < effectiveChunks - 1; i++) {
                    const targetEnd = currentStart + targetChunkSize;
                    // Don't overshoot file size
                    if (targetEnd >= fileSize) {
                        break;
                    }
                    // Find next newline after target boundary
                    const alignedEnd = this.findNextNewline(fd, targetEnd, fileSize);
                    offsets.push({ start: currentStart, end: alignedEnd, isFirstChunk: i === 0, fileUri });
                    currentStart = alignedEnd;
                }
                // Final chunk goes to end of file
                if (currentStart < fileSize) {
                    offsets.push({ start: currentStart, end: fileSize, isFirstChunk: offsets.length === 0, fileUri });
                }
                return offsets;
            }
            finally {
                fs_1.default.closeSync(fd);
            }
        };
        /**
         * Efficiently finds the position just after the next '\n' at or beyond
         * `position`, scanning with small buffered reads. Returns `fileSize`
         * when no newline is found.
         */
        this.findNextNewline = (fd, position, fileSize) => {
            const BUFFER_SIZE = 8192; // 8KB buffer for scanning
            const buffer = Buffer.allocUnsafe(BUFFER_SIZE);
            let currentPos = position;
            while (currentPos < fileSize) {
                const bytesToRead = Math.min(BUFFER_SIZE, fileSize - currentPos);
                const bytesRead = fs_1.default.readSync(fd, buffer, 0, bytesToRead, currentPos);
                if (bytesRead === 0)
                    break;
                // Scan buffer for newline
                for (let i = 0; i < bytesRead; i++) {
                    if (buffer[i] === 0x0A) { // \n
                        return currentPos + i + 1; // Position after the newline
                    }
                }
                currentPos += bytesRead;
            }
            // No newline found, return file end
            return fileSize;
        };
        /** Readies every producer of the consumer and pairs it with its source files. */
        this.readySourceFiles = (consumer) => __awaiter(this, void 0, void 0, function* () {
            const results = [];
            for (let i = 0; i < consumer.producers.length; i++) {
                const cProd = consumer.producers[i];
                const prod = Environment_1.default.getProducer(cProd.name);
                results.push({ prod, cProd, response: yield ProducerExecutor_1.default.ready(prod) });
            }
            return results;
        });
        /**
         * Resolves the folder that contains ExecutorWorker.js.
         * NOTE(review): when NODE_ENV is dev/development the REMORA_WORKERS_PATH
         * override is never consulted (the dev branch returns first) — confirm
         * that precedence is intended.
         */
        this._getWorkerPath = () => {
            // Get the current file's directory
            const currentDir = __dirname;
            if (process.env.NODE_ENV === 'dev' || process.env.NODE_ENV === 'development')
                return path_1.default.resolve('./.build/workers');
            const forcedPath = process.env.REMORA_WORKERS_PATH;
            if (forcedPath && forcedPath.length > 0)
                return path_1.default.join(__dirname, forcedPath);
            // Check if we're in a published npm package (no .build in path)
            if (!currentDir.includes('.build')) {
                // We're in the published package, workers are relative to package root
                // __dirname is something like: /path/to/package/engines/dataset
                // We need to go up to package root and then to workers
                return path_1.default.join(__dirname, '../../workers');
            }
            else {
                // We're in development, workers are in ./.build/workers
                return path_1.default.resolve('./.build/workers');
            }
        };
        /**
         * Merges all worker result files into the consumer's single work file
         * (or renames the lone worker file when there was only one).
         */
        this.reconcileExecutorThreadsResults = (consumer, executionId, executorResults, tracker) => __awaiter(this, void 0, void 0, function* () {
            const workPath = ConsumerExecutor_1.default._getWorkPath(consumer, executionId);
            ConsumerExecutor_1.default._ensurePath(workPath);
            // Merge all the various files into a single one
            if (executorResults.length > 1) {
                const perf = performance.now();
                const output = fs_1.default.createWriteStream(workPath);
                output.setMaxListeners(executorResults.length + 1);
                for (const workerResult of executorResults) {
                    yield (0, promises_2.pipeline)(fs_1.default.createReadStream(workerResult.resultUri), output, { end: false });
                }
                // BUGFIX: await the stream's 'finish' before returning; the old
                // end() + close() returned while buffered bytes could still be in
                // flight, letting later readers see a truncated merged file.
                yield new Promise((resolve, reject) => output.end(err => (err ? reject(err) : resolve())));
                tracker.measure('merge-workers', performance.now() - perf);
            }
            else if (executorResults.length === 1) {
                // If there is only one worker, then just rename the worker .dataset to the general consumer one
                yield promises_1.default.rename(executorResults[0].resultUri, workPath);
            }
        });
        /**
         * Deletes worker work files and the execution folder.
         * NOTE(review): the final rmdir assumes the consumer's work folder is
         * empty by now (i.e. exportResult consumed/removed the merged file) —
         * confirm against OutputExecutor.
         */
        this.performCleanupOperations = (consumer, executionId, workersPath, tracker) => __awaiter(this, void 0, void 0, function* () {
            const workPath = ConsumerExecutor_1.default._getWorkPath(consumer, executionId);
            const start = performance.now();
            yield Promise.all(workersPath.map(x => ConsumerExecutor_1.default._clearWorkPath(x)));
            yield promises_1.default.rmdir(path_1.default.dirname(workPath));
            tracker.measure('cleanup-operations', performance.now() - start);
        });
        /**
         * Aggregates worker results (sums counts, max cycles) and folds the
         * per-operation timings plus the orchestrator's own tracker into a
         * single result object.
         */
        this.computeFinalResult = (tracker, executorResults, executionId, resultUri) => {
            const result = {
                cycles: Algo_1.default.max(executorResults.map(x => x.cycles)),
                elapsedMS: Algo_1.default.sum(executorResults.map(x => x.elapsedMS)),
                inputCount: Algo_1.default.sum(executorResults.map(x => x.inputCount)),
                outputCount: Algo_1.default.sum(executorResults.map(x => x.outputCount)),
                workerCount: executorResults.length,
                executionId,
                resultUri,
                operations: {}
            };
            for (const res of executorResults) {
                for (const opKey of Object.keys(res.operations)) {
                    const op = res.operations[opKey];
                    let label = result.operations[opKey];
                    if (!label) {
                        result.operations[opKey] = { avg: -1, max: -1, min: -1, elapsedMS: [] };
                        label = result.operations[opKey];
                    }
                    label.elapsedMS.push(op.elapsedMS);
                }
            }
            // Calculate min, max, avg for all operations after collecting all data.
            // (Hoisted out of the per-result loop above — it used to be recomputed
            // for every worker result with identical final values.)
            for (const opKey of Object.keys(result.operations)) {
                const operation = result.operations[opKey];
                if (operation.elapsedMS.length > 0) {
                    operation.min = Math.min(...operation.elapsedMS);
                    operation.max = Math.max(...operation.elapsedMS);
                    operation.avg = Algo_1.default.mean(operation.elapsedMS);
                }
            }
            // Add tracker operations to result
            const trackerOperations = tracker.getOperations();
            for (const opKey of Object.keys(trackerOperations)) {
                const trackerOp = trackerOperations[opKey];
                const value = trackerOp.elapsedMS;
                if (!result.operations[opKey]) {
                    result.operations[opKey] = { avg: value, max: value, min: value, elapsedMS: [] };
                }
                result.operations[opKey].elapsedMS.push(value);
            }
            return result;
        };
        /** Forwards a worker progress packet to the shared progress display. */
        this.onWorkAdvanced = (packet, index, progress) => {
            const { processed, total } = packet;
            progress.update((index + 1).toString(), processed / total);
        };
    }
}
const ExecutorOrchestrator = new ExecutorOrchestratorClass();
exports.default = ExecutorOrchestrator;
@@ -0,0 +1,17 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
class ExecutorPerformance {
    constructor() {
        /**
         * Accumulates `elapsedMS` onto the running total for the operation
         * identified by `name`, creating the entry on first use.
         */
        this.measure = (name, elapsedMS) => {
            if (!this._operations[name]) {
                this._operations[name] = { elapsedMS: 0 };
            }
            this._operations[name].elapsedMS += elapsedMS;
        };
        /** Returns the map of accumulated per-operation timings. */
        this.getOperations = () => this._operations;
        // Operation name -> { elapsedMS: runningTotal }
        this._operations = {};
    }
}
17
+ exports.default = ExecutorPerformance;