@forzalabs/remora 0.2.6 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/Constants.js +10 -2
- package/actions/debug.js +1 -0
- package/actions/deploy.js +1 -0
- package/actions/run.js +17 -13
- package/actions/sample.js +1 -1
- package/core/Algo.js +8 -4
- package/definitions/ExecutorDefinitions.js +2 -0
- package/definitions/json_schemas/consumer-schema.json +1 -1
- package/definitions/json_schemas/producer-schema.json +1 -1
- package/definitions/temp.js +2 -0
- package/drivers/DeltaShareDriver.js +4 -0
- package/drivers/DriverFactory.js +10 -10
- package/drivers/DriverHelper.js +33 -10
- package/drivers/HttpApiDriver.js +4 -0
- package/drivers/LocalDriver.js +73 -6
- package/drivers/RedshiftDriver.js +4 -0
- package/drivers/S3Driver.js +36 -52
- package/drivers/files/LocalDestinationDriver.js +200 -0
- package/drivers/files/LocalSourceDriver.js +394 -0
- package/drivers/s3/S3DestinationDriver.js +159 -0
- package/drivers/s3/S3SourceDriver.js +455 -0
- package/engines/ai/LLM.js +0 -11
- package/engines/consumer/ConsumerEngine.js +0 -77
- package/engines/consumer/ConsumerManager.js +61 -36
- package/engines/consumer/ConsumerOnFinishManager.js +14 -0
- package/engines/consumer/PostProcessor.js +1 -7
- package/engines/dataset/Dataset.js +0 -61
- package/engines/dataset/DatasetManager.js +16 -76
- package/engines/dataset/DatasetRecord.js +4 -3
- package/engines/deployment/DeploymentPlanner.js +0 -7
- package/engines/execution/ExecutionPlanner.js +2 -2
- package/engines/execution/RequestExecutor.js +4 -45
- package/engines/file/FileExporter.js +7 -32
- package/engines/parsing/CSVParser.js +27 -26
- package/engines/parsing/LineParser.js +52 -0
- package/engines/parsing/XMLParser.js +1 -1
- package/engines/producer/ProducerEngine.js +0 -45
- package/engines/scheduler/CronScheduler.js +12 -4
- package/engines/scheduler/QueueManager.js +11 -4
- package/engines/sql/SQLCompiler.js +4 -4
- package/engines/transform/JoinEngine.js +3 -3
- package/engines/transform/TransformationEngine.js +3 -89
- package/engines/usage/UsageManager.js +8 -6
- package/engines/validation/Validator.js +12 -18
- package/executors/ConsumerExecutor.js +152 -0
- package/executors/Executor.js +168 -0
- package/executors/ExecutorOrchestrator.js +315 -0
- package/executors/ExecutorPerformance.js +17 -0
- package/executors/ExecutorProgress.js +52 -0
- package/executors/OutputExecutor.js +118 -0
- package/executors/ProducerExecutor.js +108 -0
- package/package.json +3 -3
- package/workers/ExecutorWorker.js +48 -0

package/executors/ConsumerExecutor.js
@@ -0,0 +1,152 @@
+"use strict";
+var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
+    function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
+    return new (P || (P = Promise))(function (resolve, reject) {
+        function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
+        function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
+        function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
+        step((generator = generator.apply(thisArg, _arguments || [])).next());
+    });
+};
+var __asyncValues = (this && this.__asyncValues) || function (o) {
+    if (!Symbol.asyncIterator) throw new TypeError("Symbol.asyncIterator is not defined.");
+    var m = o[Symbol.asyncIterator], i;
+    return m ? m.call(o) : (o = typeof __values === "function" ? __values(o) : o[Symbol.iterator](), i = {}, verb("next"), verb("throw"), verb("return"), i[Symbol.asyncIterator] = function () { return this; }, i);
+    function verb(n) { i[n] = o[n] && function (v) { return new Promise(function (resolve, reject) { v = o[n](v), settle(resolve, reject, v.done, v.value); }); }; }
+    function settle(resolve, reject, d, v) { Promise.resolve(v).then(function(v) { resolve({ value: v, done: d }); }, reject); }
+};
+var __importDefault = (this && this.__importDefault) || function (mod) {
+    return (mod && mod.__esModule) ? mod : { "default": mod };
+};
+Object.defineProperty(exports, "__esModule", { value: true });
+const path_1 = __importDefault(require("path"));
+const fs_1 = __importDefault(require("fs"));
+const readline_1 = __importDefault(require("readline"));
+const promises_1 = __importDefault(require("fs/promises"));
+const TransformationEngine_1 = __importDefault(require("../engines/transform/TransformationEngine"));
+const RequestExecutor_1 = __importDefault(require("../engines/execution/RequestExecutor"));
+const Affirm_1 = __importDefault(require("../core/Affirm"));
+const Constants_1 = __importDefault(require("../Constants"));
+class ConsumerExecutorClass {
+    constructor() {
+        this._getWorkPath = (consumer, executionId) => {
+            const execFolder = path_1.default.join(consumer.name, executionId);
+            const workPath = path_1.default.join('./remora', Constants_1.default.defaults.PRODUCER_TEMP_FOLDER, execFolder, '.dataset');
+            return workPath;
+        };
+        this._clearWorkPath = (workPath) => __awaiter(this, void 0, void 0, function* () {
+            try {
+                if (fs_1.default.existsSync(workPath)) {
+                    yield promises_1.default.unlink(workPath);
+                }
+                // eslint-disable-next-line @typescript-eslint/no-unused-vars
+            }
+            catch (error) {
+                // Ignore file deletion errors
+            }
+            try {
+                const dir = path_1.default.dirname(workPath);
+                if (fs_1.default.existsSync(dir)) {
+                    yield promises_1.default.rmdir(dir);
+                }
+                // eslint-disable-next-line @typescript-eslint/no-unused-vars
+            }
+            catch (error) {
+                // Ignore directory deletion errors
+            }
+        });
+        this._ensurePath = (pathUri) => {
+            // make sure that the workpath exists
+            const dir = path_1.default.dirname(pathUri);
+            if (!fs_1.default.existsSync(dir))
+                fs_1.default.mkdirSync(dir, { recursive: true });
+            if (!fs_1.default.existsSync(pathUri))
+                fs_1.default.writeFileSync(pathUri, '');
+        };
+        this.ready = (consumer, executionId) => {
+            (0, Affirm_1.default)(consumer, 'Invalid consumer');
+            const workPath = this._getWorkPath(consumer, executionId);
+            this._ensurePath(workPath);
+            return fs_1.default.createWriteStream(workPath);
+        };
+        this.processRecord = (options) => {
+            var _a;
+            const { consumer, fields, dimensions, producer, record, requestOptions } = options;
+            // map to consumer fields and apply consumer field logic
+            for (const field of fields) {
+                const { cField } = field;
+                const fieldKey = (_a = cField.alias) !== null && _a !== void 0 ? _a : cField.key;
+                // set the fixed default value for the field, or throw error if not present in the producer
+                const dimension = dimensions.find(x => x.name === cField.key);
+                if (!dimension) {
+                    if (cField.fixed && cField.default)
+                        record[fieldKey] = cField.default;
+                    else
+                        throw new Error(`The requested field "${cField.key}" from the consumer is not present in the underlying producer "${producer.name}" (${dimensions.map(x => x.name).join(', ')})`);
+                }
+                // change the name of the dimension
+                if (cField.alias && cField.alias !== dimension.name) {
+                    record[cField.alias] = record[dimension.name];
+                    delete record[dimension.name];
+                }
+                // apply transformations
+                if (cField.transform)
+                    record[fieldKey] = TransformationEngine_1.default.applyTransformations(record[fieldKey], cField.transform, cField, record);
+            }
+            // remove un-wanted producer dimensions
+            for (const dimension of dimensions) {
+                const field = fields.find(x => x.cField.key === dimension.name);
+                if (!field)
+                    delete record[dimension.name];
+            }
+            // apply consumer filters
+            if (consumer.filters && consumer.filters.length > 0) {
+                const isKept = consumer.filters.every(x => RequestExecutor_1.default.evaluateFilter(record, x.rule));
+                if (!isKept)
+                    return null;
+            }
+            // apply request custom filters
+            if (requestOptions && requestOptions.filters) {
+                const isKept = requestOptions.filters.every(x => RequestExecutor_1.default.evaluateFilter(record, x));
+                if (!isKept)
+                    return null;
+            }
+            return record;
+        };
+        this.processDistinct = (consumer, executionId) => __awaiter(this, void 0, void 0, function* () {
+            var _a, e_1, _b, _c;
+            const workPath = this._getWorkPath(consumer, executionId);
+            const reader = fs_1.default.createReadStream(workPath);
+            const lineReader = readline_1.default.createInterface({ input: reader, crlfDelay: Infinity });
+            const tempWorkPath = workPath + '_tmp';
+            const writer = fs_1.default.createWriteStream(tempWorkPath);
+            let newLineCount = 0;
+            const seen = new Set();
+            try {
+                for (var _d = true, lineReader_1 = __asyncValues(lineReader), lineReader_1_1; lineReader_1_1 = yield lineReader_1.next(), _a = lineReader_1_1.done, !_a; _d = true) {
+                    _c = lineReader_1_1.value;
+                    _d = false;
+                    const line = _c;
+                    if (!seen.has(line)) {
+                        seen.add(line);
+                        writer.write(line + '\n');
+                        newLineCount++;
+                    }
+                }
+            }
+            catch (e_1_1) { e_1 = { error: e_1_1 }; }
+            finally {
+                try {
+                    if (!_d && !_a && (_b = lineReader_1.return)) yield _b.call(lineReader_1);
+                }
+                finally { if (e_1) throw e_1.error; }
+            }
+            writer.close();
+            reader.close();
+            fs_1.default.renameSync(tempWorkPath, workPath);
+            return newLineCount;
+        });
+    }
+}
+const ConsumerExecutor = new ConsumerExecutorClass();
+exports.default = ConsumerExecutor;
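
The processDistinct method above is the compiled form of a streaming de-duplication pass: read the worker's .dataset file line by line, track lines already seen in a Set, write unseen lines to a _tmp sibling, then rename the temp file over the original. A minimal hand-written sketch of the same idea (standalone, not the package's API; dedupeFile is a hypothetical name):

    // Streaming line de-dup: Set of seen lines, temp file, rename into place.
    const fs = require('fs');
    const readline = require('readline');

    async function dedupeFile(workPath) {
        const tempPath = workPath + '_tmp';
        const reader = readline.createInterface({
            input: fs.createReadStream(workPath),
            crlfDelay: Infinity
        });
        const writer = fs.createWriteStream(tempPath);
        const seen = new Set(); // grows with the number of distinct lines
        let kept = 0;
        for await (const line of reader) {
            if (!seen.has(line)) {
                seen.add(line);
                writer.write(line + '\n');
                kept++;
            }
        }
        writer.close();
        fs.renameSync(tempPath, workPath); // swap the deduped file into place
        return kept;
    }

The trade-off is memory: the Set holds every distinct line, so this works only while the distinct output fits in the worker's heap (which the orchestrator below caps via maxOldGenerationSizeMb).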

package/executors/Executor.js
@@ -0,0 +1,168 @@
+"use strict";
+var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
+    function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
+    return new (P || (P = Promise))(function (resolve, reject) {
+        function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
+        function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
+        function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
+        step((generator = generator.apply(thisArg, _arguments || [])).next());
+    });
+};
+var __asyncValues = (this && this.__asyncValues) || function (o) {
+    if (!Symbol.asyncIterator) throw new TypeError("Symbol.asyncIterator is not defined.");
+    var m = o[Symbol.asyncIterator], i;
+    return m ? m.call(o) : (o = typeof __values === "function" ? __values(o) : o[Symbol.iterator](), i = {}, verb("next"), verb("throw"), verb("return"), i[Symbol.asyncIterator] = function () { return this; }, i);
+    function verb(n) { i[n] = o[n] && function (v) { return new Promise(function (resolve, reject) { v = o[n](v), settle(resolve, reject, v.done, v.value); }); }; }
+    function settle(resolve, reject, d, v) { Promise.resolve(v).then(function(v) { resolve({ value: v, done: d }); }, reject); }
+};
+var __importDefault = (this && this.__importDefault) || function (mod) {
+    return (mod && mod.__esModule) ? mod : { "default": mod };
+};
+Object.defineProperty(exports, "__esModule", { value: true });
+const fs_1 = __importDefault(require("fs"));
+const readline_1 = __importDefault(require("readline"));
+const ProducerExecutor_1 = __importDefault(require("./ProducerExecutor"));
+const ConsumerExecutor_1 = __importDefault(require("./ConsumerExecutor"));
+const Affirm_1 = __importDefault(require("../core/Affirm"));
+const OutputExecutor_1 = __importDefault(require("./OutputExecutor"));
+const ConsumerManager_1 = __importDefault(require("../engines/consumer/ConsumerManager"));
+const ExecutorPerformance_1 = __importDefault(require("./ExecutorPerformance"));
+class Executor {
+    constructor() {
+        this._REPORT_WORK_AFTER_LINES = 1000;
+        /**
+         * 1. check and ready the local file for processing
+         * 2. open read stream and write stream
+         * 3. process the file
+         * 4. cleanup and after execution actions
+         */
+        this.run = (request) => __awaiter(this, void 0, void 0, function* () {
+            var _a, e_1, _b, _c;
+            var _d;
+            (0, Affirm_1.default)(request, 'Invalid request');
+            const { consumer, producer, prodDimensions, workerId, chunk, options, reportWork } = request;
+            const counter = performance.now();
+            const result = {
+                executionId: workerId,
+                cycles: 1,
+                elapsedMS: -1,
+                inputCount: -1,
+                outputCount: -1,
+                resultUri: ConsumerExecutor_1.default._getWorkPath(consumer, workerId),
+                operations: {}
+            };
+            let totalOutputCount = 0, totalCycles = 1, perf = 0, lineIndex = 0;
+            const readStream = this.openReadStream(chunk);
+            const writeStream = this.openWriteStream(consumer, workerId);
+            const fields = ConsumerManager_1.default.getExpandedFields(consumer);
+            const { isFirstChunk, start, end } = chunk;
+            const totalBytes = end - start;
+            let processedBytes = 0;
+            // Process all the line-independent operations of the consumer in a single pass
+            const lineStream = readline_1.default.createInterface({ input: readStream, crlfDelay: Infinity });
+            try {
+                for (var _e = true, lineStream_1 = __asyncValues(lineStream), lineStream_1_1; lineStream_1_1 = yield lineStream_1.next(), _a = lineStream_1_1.done, !_a; _e = true) {
+                    _c = lineStream_1_1.value;
+                    _e = false;
+                    const line = _c;
+                    if (lineIndex === 0 && isFirstChunk) {
+                        if (!this.shouldProcessFirstLine(producer)) {
+                            lineIndex++;
+                            continue;
+                        }
+                    }
+                    perf = performance.now();
+                    let record = ProducerExecutor_1.default.processLine({
+                        dimensions: prodDimensions,
+                        index: lineIndex,
+                        line,
+                        producer,
+                        tracker: this._performance
+                    });
+                    this._performance.measure('process-line', performance.now() - perf);
+                    if (!record) {
+                        lineIndex++;
+                        continue;
+                    }
+                    perf = performance.now();
+                    record = ConsumerExecutor_1.default.processRecord({
+                        record,
+                        index: lineIndex,
+                        consumer: consumer,
+                        fields,
+                        producer,
+                        dimensions: prodDimensions,
+                        requestOptions: options
+                    });
+                    this._performance.measure('process-record', performance.now() - perf);
+                    if (!record) {
+                        lineIndex++;
+                        continue;
+                    }
+                    perf = performance.now();
+                    const outputLine = OutputExecutor_1.default.outputRecord(record, consumer, fields);
+                    this._performance.measure('output-record', performance.now() - perf);
+                    writeStream.write(outputLine + '\n');
+                    totalOutputCount++;
+                    lineIndex++;
+                    // Report progress to the main thread
+                    if (reportWork && lineIndex % this._REPORT_WORK_AFTER_LINES === 0) {
+                        processedBytes = Math.min(readStream.bytesRead, totalBytes);
+                        reportWork({ processed: processedBytes, total: totalBytes, workerId: workerId });
+                    }
+                }
+            }
+            catch (e_1_1) { e_1 = { error: e_1_1 }; }
+            finally {
+                try {
+                    if (!_e && !_a && (_b = lineStream_1.return)) yield _b.call(lineStream_1);
+                }
+                finally { if (e_1) throw e_1.error; }
+            }
+            // Process the operations that work on multiple lines
+            if (((_d = consumer.options) === null || _d === void 0 ? void 0 : _d.distinct) === true) {
+                perf = performance.now();
+                totalOutputCount = yield ConsumerExecutor_1.default.processDistinct(consumer, workerId);
+                this._performance.measure('process-distinct', performance.now() - perf);
+                totalCycles++;
+            }
+            result.elapsedMS = performance.now() - counter;
+            result.cycles = totalCycles;
+            result.inputCount = lineIndex;
+            result.outputCount = totalOutputCount;
+            result.resultUri = ConsumerExecutor_1.default._getWorkPath(consumer, workerId);
+            result.operations = this._performance.getOperations();
+            return result;
+        });
+        this.openReadStream = (chunk) => {
+            const { end, fileUri, start } = chunk;
+            return fs_1.default.createReadStream(fileUri, { start, end: end });
+        };
+        this.openWriteStream = (consumer, executionId) => {
+            return ConsumerExecutor_1.default.ready(consumer, executionId);
+        };
+        this.shouldProcessFirstLine = (producer) => {
+            (0, Affirm_1.default)(producer, 'Invalid producer');
+            const { settings: { fileType, hasHeaderRow } } = producer;
+            switch (fileType) {
+                case 'PARQUET':
+                case 'XML':
+                case 'XLS':
+                case 'XLSX':
+                case 'CSV':
+                    return false;
+                case 'TXT': {
+                    if (hasHeaderRow)
+                        return false;
+                    else
+                        return true;
+                }
+                case 'JSON':
+                case 'JSONL':
+                    return true;
+            }
+        };
+        this._performance = new ExecutorPerformance_1.default();
+    }
+}
+exports.default = Executor;
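
run is each worker's hot loop: every line in its assigned byte range flows through ProducerExecutor.processLine (parse), ConsumerExecutor.processRecord (map, transform, filter; null means dropped) and OutputExecutor.outputRecord (serialize), with progress posted to the main thread every 1000 lines. Stripped of the __awaiter/__asyncValues compilation artifacts, the control flow reduces to roughly the following (a simplified sketch, not the shipped code; runChunk is a hypothetical name and the performance bookkeeping is omitted):

    // Simplified control flow of Executor.run's per-line loop (illustrative only;
    // ProducerExecutor, ConsumerExecutor, OutputExecutor and shouldProcessFirstLine
    // refer to the modules/method in the compiled file above).
    const fs = require('fs');
    const readline = require('readline');

    async function runChunk({ chunk, producer, consumer, prodDimensions, fields, writeStream }) {
        const input = fs.createReadStream(chunk.fileUri, { start: chunk.start, end: chunk.end });
        const lines = readline.createInterface({ input, crlfDelay: Infinity });
        let lineIndex = 0;
        let outputCount = 0;
        for await (const line of lines) {
            // Only the worker holding the start of the file can see a header row.
            if (lineIndex === 0 && chunk.isFirstChunk && !shouldProcessFirstLine(producer)) {
                lineIndex++;
                continue;
            }
            // 1. parse the raw line into a record keyed by producer dimensions
            let record = ProducerExecutor.processLine({ line, index: lineIndex, producer, dimensions: prodDimensions });
            // 2. map to consumer fields; transforms and filters may drop the record (null)
            if (record)
                record = ConsumerExecutor.processRecord({ record, index: lineIndex, consumer, fields, producer, dimensions: prodDimensions });
            // 3. serialize and append to this worker's .dataset file
            if (record) {
                writeStream.write(OutputExecutor.outputRecord(record, consumer, fields) + '\n');
                outputCount++;
            }
            lineIndex++;
        }
        return { inputCount: lineIndex, outputCount };
    }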

package/executors/ExecutorOrchestrator.js
@@ -0,0 +1,315 @@
+"use strict";
+var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
+    function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
+    return new (P || (P = Promise))(function (resolve, reject) {
+        function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
+        function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
+        function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
+        step((generator = generator.apply(thisArg, _arguments || [])).next());
+    });
+};
+var __importDefault = (this && this.__importDefault) || function (mod) {
+    return (mod && mod.__esModule) ? mod : { "default": mod };
+};
+Object.defineProperty(exports, "__esModule", { value: true });
+const os_1 = __importDefault(require("os"));
+const fs_1 = __importDefault(require("fs"));
+const promises_1 = __importDefault(require("fs/promises"));
+const path_1 = __importDefault(require("path"));
+const promises_2 = require("stream/promises");
+const workerpool_1 = __importDefault(require("workerpool"));
+const Affirm_1 = __importDefault(require("../core/Affirm"));
+const UsageManager_1 = __importDefault(require("../engines/usage/UsageManager"));
+const Helper_1 = __importDefault(require("../helper/Helper"));
+const Environment_1 = __importDefault(require("../engines/Environment"));
+const ProducerExecutor_1 = __importDefault(require("./ProducerExecutor"));
+const Constants_1 = __importDefault(require("../Constants"));
+const DriverHelper_1 = __importDefault(require("../drivers/DriverHelper"));
+const ConsumerExecutor_1 = __importDefault(require("./ConsumerExecutor"));
+const OutputExecutor_1 = __importDefault(require("./OutputExecutor"));
+const ConsumerManager_1 = __importDefault(require("../engines/consumer/ConsumerManager"));
+const ExecutorPerformance_1 = __importDefault(require("./ExecutorPerformance"));
+const ExecutorProgress_1 = __importDefault(require("./ExecutorProgress"));
+const Algo_1 = __importDefault(require("../core/Algo"));
+const ConsumerOnFinishManager_1 = __importDefault(require("../engines/consumer/ConsumerOnFinishManager"));
+class ExecutorOrchestratorClass {
+    constructor() {
+        this.init = () => {
+            if (!this._executorPool) {
+                const options = {
+                    workerThreadOpts: {
+                        resourceLimits: {
+                            maxOldGenerationSizeMb: Constants_1.default.defaults.MIN_RUNTIME_HEAP_MB
+                        }
+                    }
+                };
+                const workerPath = this._getWorkerPath();
+                this._executorPool = workerpool_1.default.pool(path_1.default.join(workerPath, 'ExecutorWorker.js'), options);
+            }
+        };
+        this.launch = (request) => __awaiter(this, void 0, void 0, function* () {
+            var _a;
+            (0, Affirm_1.default)(request, 'Invalid options');
+            const { consumer, details, logProgress, options } = request;
+            (0, Affirm_1.default)(consumer, 'Invalid consumer');
+            (0, Affirm_1.default)(details, 'Invalid execution details');
+            const tracker = new ExecutorPerformance_1.default();
+            const _progress = new ExecutorProgress_1.default(logProgress);
+            const { usageId } = UsageManager_1.default.startUsage(consumer, details);
+            const workersId = [];
+            try {
+                const start = performance.now();
+                this.init();
+                const executorResults = [];
+                const sourceFilesByProducer = yield this.readySourceFiles(consumer);
+                for (const pair of sourceFilesByProducer) {
+                    const { prod, cProd, response } = pair;
+                    // Make sure that the data files are there, if missing and isOptional = true, then skip
+                    if (!fs_1.default.existsSync(response.files[0].fullUri)) {
+                        if (!cProd.isOptional)
+                            throw new Error(`Expected data file ${response.files[0].fullUri} of producer ${prod.name} in consumer ${consumer.name} is missing.`);
+                        else if (cProd.isOptional === true)
+                            continue;
+                    }
+                    // Extract the dimensions for this producer just once
+                    const firstLine = (yield DriverHelper_1.default.quickReadFile(response.files[0].fullUri, 1))[0];
+                    const header = ProducerExecutor_1.default.processHeader(firstLine, prod);
+                    const prodDimensions = ProducerExecutor_1.default.reconcileHeader(header, prod);
+                    for (const file of response.files) {
+                        const chunks = ExecutorOrchestrator.scopeWork(file.fullUri);
+                        const workerThreads = [];
+                        for (const [index, chunk] of chunks.entries()) {
+                            // Spawn off thread
+                            const workerId = `${usageId}_${index}`;
+                            const workerData = {
+                                chunk,
+                                consumer,
+                                prodDimensions,
+                                producer: prod,
+                                workerId: workerId,
+                                options: options
+                            };
+                            _progress.register((index + 1).toString());
+                            workersId.push(workerId);
+                            workerThreads.push(this._executorPool.exec('executor', [workerData], {
+                                on: payload => this.onWorkAdvanced(payload, index, _progress)
+                            }));
+                        }
+                        executorResults.push(...yield Promise.all(workerThreads));
+                        // WARNING: will this not create problems when multiple are executed together at the same time since this is a singleton?!?
+                        yield this._executorPool.terminate();
+                    }
+                }
+                _progress.complete();
+                yield this.reconcileExecutorThreadsResults(consumer, usageId, executorResults, tracker);
+                // If there is more than one worker, then I need to redo the operations that are done on multiple lines (cause now the worker files have been merged together)
+                const postOperation = { totalOutputCount: null };
+                if (executorResults.length > 1) {
+                    if (((_a = consumer.options) === null || _a === void 0 ? void 0 : _a.distinct) === true) {
+                        const perf = performance.now();
+                        const unifiedOutputCount = yield ConsumerExecutor_1.default.processDistinct(consumer, usageId);
+                        tracker.measure('process-distinct:main', performance.now() - perf);
+                        postOperation.totalOutputCount = unifiedOutputCount;
+                    }
+                }
+                // Export to the destination
+                let perf = performance.now();
+                const exportRes = yield OutputExecutor_1.default.exportResult(consumer, usageId, ConsumerManager_1.default.getExpandedFields(consumer));
+                tracker.measure('export-result', performance.now() - perf);
+                // Perform on-success actions if any
+                if (consumer.outputs.some(x => x.onSuccess)) {
+                    perf = performance.now();
+                    yield ConsumerOnFinishManager_1.default.onConsumerSuccess(consumer, usageId);
+                    tracker.measure('on-success-actions', performance.now() - perf);
+                }
+                yield this.performCleanupOperations(consumer, usageId, executorResults.map(x => x.resultUri), tracker);
+                const finalResult = this.computeFinalResult(tracker, executorResults, usageId, exportRes.key);
+                finalResult.elapsedMS = performance.now() - start;
+                if (Algo_1.default.hasVal(postOperation.totalOutputCount))
+                    finalResult.outputCount = postOperation.totalOutputCount;
+                UsageManager_1.default.endUsage(usageId, finalResult.outputCount, finalResult);
+                return finalResult;
+            }
+            catch (error) {
+                yield ConsumerOnFinishManager_1.default.onConsumerError(consumer, usageId);
+                yield this.performCleanupOperations(consumer, usageId, workersId.map(x => ConsumerExecutor_1.default._getWorkPath(consumer, x)), tracker);
+                UsageManager_1.default.failUsage(usageId, Helper_1.default.asError(error).message);
+                throw error;
+            }
+        });
+        /**
+         * Calculates line-aligned chunk offsets for parallel file processing.
+         * Each chunk boundary is adjusted to the next newline to avoid breaking lines.
+         * Returns a single chunk for small files where parallelism overhead isn't worth it.
+         */
+        this.scopeWork = (fileUri, numChunks) => {
+            const fileSize = fs_1.default.statSync(fileUri).size;
+            if (fileSize === 0)
+                return [];
+            // Small files: single chunk, parallelism overhead not worth it
+            if (fileSize < Constants_1.default.defaults.MIN_FILE_SIZE_FOR_PARALLEL) {
+                return [{ start: 0, end: fileSize, isFirstChunk: true, fileUri }];
+            }
+            // Calculate optimal chunk count based on file size and CPU cores
+            const cpus = numChunks !== null && numChunks !== void 0 ? numChunks : (os_1.default.cpus().length - 1);
+            const maxChunksBySize = Math.floor(fileSize / Constants_1.default.defaults.MIN_CHUNK_SIZE);
+            const effectiveChunks = Math.min(cpus, maxChunksBySize);
+            if (effectiveChunks <= 1)
+                return [{ start: 0, end: fileSize, isFirstChunk: true, fileUri }];
+            const targetChunkSize = Math.floor(fileSize / effectiveChunks);
+            const fd = fs_1.default.openSync(fileUri, 'r');
+            try {
+                const offsets = [];
+                let currentStart = 0;
+                for (let i = 0; i < cpus - 1; i++) {
+                    const targetEnd = currentStart + targetChunkSize;
+                    // Don't overshoot file size
+                    if (targetEnd >= fileSize) {
+                        break;
+                    }
+                    // Find next newline after target boundary
+                    const alignedEnd = this.findNextNewline(fd, targetEnd, fileSize);
+                    offsets.push({ start: currentStart, end: alignedEnd, isFirstChunk: i === 0, fileUri });
+                    currentStart = alignedEnd;
+                }
+                // Final chunk goes to end of file
+                if (currentStart < fileSize) {
+                    offsets.push({ start: currentStart, end: fileSize, isFirstChunk: offsets.length === 0, fileUri });
+                }
+                return offsets;
+            }
+            finally {
+                fs_1.default.closeSync(fd);
+            }
+        };
+        /**
+         * Efficiently finds the next newline character starting from a position.
+         * Uses small buffer reads for speed.
+         */
+        this.findNextNewline = (fd, position, fileSize) => {
+            const BUFFER_SIZE = 8192; // 8KB buffer for scanning
+            const buffer = Buffer.allocUnsafe(BUFFER_SIZE);
+            let currentPos = position;
+            while (currentPos < fileSize) {
+                const bytesToRead = Math.min(BUFFER_SIZE, fileSize - currentPos);
+                const bytesRead = fs_1.default.readSync(fd, buffer, 0, bytesToRead, currentPos);
+                if (bytesRead === 0)
+                    break;
+                // Scan buffer for newline
+                for (let i = 0; i < bytesRead; i++) {
+                    if (buffer[i] === 0x0A) { // \n
+                        return currentPos + i + 1; // Position after the newline
+                    }
+                }
+                currentPos += bytesRead;
+            }
+            // No newline found, return file end
+            return fileSize;
+        };
+        this.readySourceFiles = (consumer) => __awaiter(this, void 0, void 0, function* () {
+            const results = [];
+            for (let i = 0; i < consumer.producers.length; i++) {
+                const cProd = consumer.producers[i];
+                const prod = Environment_1.default.getProducer(cProd.name);
+                results.push({ prod, cProd, response: yield ProducerExecutor_1.default.ready(prod) });
+            }
+            return results;
+        });
+        this._getWorkerPath = () => {
+            // Get the current file's directory
+            const currentDir = __dirname;
+            if (process.env.NODE_ENV === 'dev' || process.env.NODE_ENV === 'development')
+                return path_1.default.resolve('./.build/workers');
+            const forcedPath = process.env.REMORA_WORKERS_PATH;
+            if (forcedPath && forcedPath.length > 0)
+                return path_1.default.join(__dirname, forcedPath);
+            // Check if we're in a published npm package (no .build in path)
+            if (!currentDir.includes('.build')) {
+                // We're in the published package, workers are relative to package root
+                // __dirname is something like: /path/to/package/engines/dataset
+                // We need to go up to package root and then to workers
+                return path_1.default.join(__dirname, '../../workers');
+            }
+            else {
+                // We're in development, workers are in ./.build/workers
+                return path_1.default.resolve('./.build/workers');
+            }
+        };
+        this.reconcileExecutorThreadsResults = (consumer, executionId, executorResults, tracker) => __awaiter(this, void 0, void 0, function* () {
+            const workPath = ConsumerExecutor_1.default._getWorkPath(consumer, executionId);
+            ConsumerExecutor_1.default._ensurePath(workPath);
+            // Merge all the various files into a single one
+            if (executorResults.length > 1) {
+                const perf = performance.now();
+                const output = fs_1.default.createWriteStream(workPath);
+                output.setMaxListeners(executorResults.length + 1);
+                for (const workerResult of executorResults) {
+                    yield (0, promises_2.pipeline)(fs_1.default.createReadStream(workerResult.resultUri), output, { end: false });
+                }
+                output.end();
+                output.close();
+                tracker.measure('merge-workers', performance.now() - perf);
+            }
+            else if (executorResults.length === 1) {
+                // If there is only one worker, then just rename the worker .dataset to the general consumer one
+                yield promises_1.default.rename(executorResults[0].resultUri, workPath);
+            }
+        });
+        this.performCleanupOperations = (consumer, executionId, workersPath, tracker) => __awaiter(this, void 0, void 0, function* () {
+            const workPath = ConsumerExecutor_1.default._getWorkPath(consumer, executionId);
+            const start = performance.now();
+            yield Promise.all(workersPath.map(x => ConsumerExecutor_1.default._clearWorkPath(x)));
+            yield promises_1.default.rmdir(path_1.default.dirname(workPath));
+            tracker.measure('cleanup-operations', performance.now() - start);
+        });
+        this.computeFinalResult = (tracker, executorResults, executionId, resultUri) => {
+            const result = {
+                cycles: Algo_1.default.max(executorResults.map(x => x.cycles)),
+                elapsedMS: Algo_1.default.sum(executorResults.map(x => x.elapsedMS)),
+                inputCount: Algo_1.default.sum(executorResults.map(x => x.inputCount)),
+                outputCount: Algo_1.default.sum(executorResults.map(x => x.outputCount)),
+                workerCount: executorResults.length,
+                executionId,
+                resultUri,
+                operations: {}
+            };
+            for (const res of executorResults) {
+                for (const opKey of Object.keys(res.operations)) {
+                    const op = res.operations[opKey];
+                    let label = result.operations[opKey];
+                    if (!label) {
+                        result.operations[opKey] = { avg: -1, max: -1, min: -1, elapsedMS: [] };
+                        label = result.operations[opKey];
+                    }
+                    label.elapsedMS.push(op.elapsedMS);
+                }
+                // Calculate min, max, avg for all operations after collecting all data
+                for (const opKey of Object.keys(result.operations)) {
+                    const operation = result.operations[opKey];
+                    if (operation.elapsedMS.length > 0) {
+                        operation.min = Math.min(...operation.elapsedMS);
+                        operation.max = Math.max(...operation.elapsedMS);
+                        operation.avg = Algo_1.default.mean(operation.elapsedMS);
+                    }
+                }
+            }
+            // Add tracker operations to result
+            const trackerOperations = tracker.getOperations();
+            for (const opKey of Object.keys(trackerOperations)) {
+                const trackerOp = trackerOperations[opKey];
+                const value = trackerOp.elapsedMS;
+                if (!result.operations[opKey]) {
+                    result.operations[opKey] = { avg: value, max: value, min: value, elapsedMS: [] };
+                }
+                result.operations[opKey].elapsedMS.push(value);
+            }
+            return result;
+        };
+        this.onWorkAdvanced = (packet, index, progress) => {
+            const { processed, total } = packet;
+            progress.update((index + 1).toString(), processed / total);
+        };
+    }
+}
+const ExecutorOrchestrator = new ExecutorOrchestratorClass();
+exports.default = ExecutorOrchestrator;
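
Two pieces above carry the parallelism: scopeWork splits the input file into byte ranges (roughly one per available core, subject to MIN_CHUNK_SIZE and MIN_FILE_SIZE_FOR_PARALLEL), and findNextNewline pushes each tentative boundary forward to the byte just past the next '\n' so no worker ever starts or ends mid-record. A self-contained demonstration of that alignment (splitFile is a hypothetical standalone analogue, not the package's API):

    // Line-aligned chunking over an arbitrary text file: each boundary is
    // advanced to the byte after the next '\n' before the chunk is emitted.
    const fs = require('fs');

    function splitFile(fileUri, numChunks) {
        const fileSize = fs.statSync(fileUri).size;
        const target = Math.floor(fileSize / numChunks);
        const fd = fs.openSync(fileUri, 'r');
        const buffer = Buffer.allocUnsafe(8192);
        const chunks = [];
        try {
            let start = 0;
            for (let i = 0; i < numChunks - 1 && start + target < fileSize; i++) {
                // Scan forward from the tentative boundary to just past the next '\n'.
                let pos = start + target;
                let end = fileSize;
                scan: while (pos < fileSize) {
                    const read = fs.readSync(fd, buffer, 0, Math.min(8192, fileSize - pos), pos);
                    if (read === 0) break;
                    for (let j = 0; j < read; j++) {
                        if (buffer[j] === 0x0A) { end = pos + j + 1; break scan; }
                    }
                    pos += read;
                }
                chunks.push({ start, end, isFirstChunk: i === 0 });
                start = end;
            }
            if (start < fileSize)
                chunks.push({ start, end: fileSize, isFirstChunk: chunks.length === 0 });
            return chunks;
        } finally {
            fs.closeSync(fd);
        }
    }

For example, splitFile('./data.jsonl', 4) returns up to four [start, end) byte ranges, each ending just past a newline, which is what lets each worker attach a readline interface mid-file without seeing a torn record.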

package/executors/ExecutorPerformance.js
@@ -0,0 +1,17 @@
+"use strict";
+Object.defineProperty(exports, "__esModule", { value: true });
+class ExecutorPerformance {
+    constructor() {
+        this.measure = (name, elapsedMS) => {
+            let tracker = this._operations[name];
+            if (!tracker) {
+                this._operations[name] = { elapsedMS: 0 };
+                tracker = this._operations[name];
+            }
+            tracker.elapsedMS += elapsedMS;
+        };
+        this.getOperations = () => this._operations;
+        this._operations = {};
+    }
+}
+exports.default = ExecutorPerformance;
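
ExecutorPerformance is a plain accumulator: measure(name, ms) adds onto a per-name elapsedMS counter, getOperations() hands the map back, and computeFinalResult in the orchestrator folds those maps across workers into min/max/avg. A usage sketch (assuming the compiled module above; the require path is hypothetical):

    // Accumulate named timings, then read them back as a plain object.
    const ExecutorPerformance = require('./ExecutorPerformance').default;

    const perf = new ExecutorPerformance();
    const t = performance.now();
    // ... some unit of work ...
    perf.measure('process-line', performance.now() - t); // creates the counter
    perf.measure('process-line', 2.5);                   // adds onto the same counter
    console.log(perf.getOperations());                   // { 'process-line': { elapsedMS: <total> } }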