@forzalabs/remora 0.2.5 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/Constants.js +10 -2
- package/actions/debug.js +1 -0
- package/actions/deploy.js +1 -0
- package/actions/run.js +17 -13
- package/actions/sample.js +1 -1
- package/core/Algo.js +8 -4
- package/definitions/ExecutorDefinitions.js +2 -0
- package/definitions/json_schemas/consumer-schema.json +1 -1
- package/definitions/json_schemas/producer-schema.json +1 -1
- package/definitions/temp.js +2 -0
- package/drivers/DeltaShareDriver.js +4 -0
- package/drivers/DriverFactory.js +10 -10
- package/drivers/DriverHelper.js +33 -10
- package/drivers/HttpApiDriver.js +4 -0
- package/drivers/LocalDriver.js +72 -5
- package/drivers/RedshiftDriver.js +4 -0
- package/drivers/S3Driver.js +36 -52
- package/drivers/files/LocalDestinationDriver.js +200 -0
- package/drivers/files/LocalSourceDriver.js +394 -0
- package/drivers/s3/S3DestinationDriver.js +159 -0
- package/drivers/s3/S3SourceDriver.js +455 -0
- package/engines/ai/LLM.js +0 -11
- package/engines/consumer/ConsumerEngine.js +0 -77
- package/engines/consumer/ConsumerManager.js +61 -36
- package/engines/consumer/ConsumerOnFinishManager.js +14 -0
- package/engines/consumer/PostProcessor.js +1 -7
- package/engines/dataset/Dataset.js +0 -61
- package/engines/dataset/DatasetManager.js +16 -76
- package/engines/dataset/DatasetRecord.js +4 -3
- package/engines/deployment/DeploymentPlanner.js +0 -7
- package/engines/execution/ExecutionPlanner.js +2 -2
- package/engines/execution/RequestExecutor.js +4 -45
- package/engines/file/FileExporter.js +7 -32
- package/engines/parsing/CSVParser.js +27 -26
- package/engines/parsing/LineParser.js +52 -0
- package/engines/parsing/XMLParser.js +1 -1
- package/engines/producer/ProducerEngine.js +0 -45
- package/engines/scheduler/CronScheduler.js +12 -4
- package/engines/scheduler/QueueManager.js +11 -4
- package/engines/sql/SQLCompiler.js +4 -4
- package/engines/transform/JoinEngine.js +3 -3
- package/engines/transform/TransformationEngine.js +3 -86
- package/engines/usage/UsageManager.js +8 -6
- package/engines/validation/Validator.js +12 -18
- package/executors/ConsumerExecutor.js +152 -0
- package/executors/Executor.js +168 -0
- package/executors/ExecutorOrchestrator.js +315 -0
- package/executors/ExecutorPerformance.js +17 -0
- package/executors/ExecutorProgress.js +52 -0
- package/executors/OutputExecutor.js +118 -0
- package/executors/ProducerExecutor.js +108 -0
- package/package.json +3 -3
- package/workers/ExecutorWorker.js +48 -0
package/Constants.js
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
"use strict";
|
|
2
2
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
3
|
const CONSTANTS = {
|
|
4
|
-
cliVersion: '0.
|
|
4
|
+
cliVersion: '1.0.0',
|
|
5
5
|
backendVersion: 1,
|
|
6
6
|
backendPort: 5088,
|
|
7
7
|
workerVersion: 2,
|
|
@@ -19,7 +19,15 @@ const CONSTANTS = {
|
|
|
19
19
|
MIN_RUNTIME_HEAP_MB: 4000,
|
|
20
20
|
RECOMMENDED_RUNTIME_HEAP_MB: 8000,
|
|
21
21
|
INDICATIVE_THREAD_LINE_COUNT: 750000,
|
|
22
|
-
MAX_THREAD_COUNT: 8
|
|
22
|
+
MAX_THREAD_COUNT: 8,
|
|
23
|
+
/**
|
|
24
|
+
* Minimum file size to consider parallel processing (10 MB)
|
|
25
|
+
*/
|
|
26
|
+
MIN_FILE_SIZE_FOR_PARALLEL: 10 * 1024 * 1024,
|
|
27
|
+
/**
|
|
28
|
+
* Minimum chunk size per worker to justify overhead (2 MB)
|
|
29
|
+
*/
|
|
30
|
+
MIN_CHUNK_SIZE: 2 * 1024 * 1024
|
|
23
31
|
}
|
|
24
32
|
};
|
|
25
33
|
exports.default = CONSTANTS;
|
package/actions/debug.js
CHANGED
|
@@ -19,6 +19,7 @@ const DriverFactory_1 = __importDefault(require("../drivers/DriverFactory"));
|
|
|
19
19
|
const Environment_1 = __importDefault(require("../engines/Environment"));
|
|
20
20
|
const compile_1 = require("./compile");
|
|
21
21
|
const debug = (options) => __awaiter(void 0, void 0, void 0, function* () {
|
|
22
|
+
void options;
|
|
22
23
|
try {
|
|
23
24
|
(0, compile_1.compile)();
|
|
24
25
|
console.log('\n');
|
package/actions/deploy.js
CHANGED
|
@@ -54,6 +54,7 @@ const deploy = (options) => __awaiter(void 0, void 0, void 0, function* () {
|
|
|
54
54
|
const version = Constants_1.default.workerVersion;
|
|
55
55
|
const workerAPI = `${host}/cli/v${version}/uploaddeployment`;
|
|
56
56
|
const formData = new FormData();
|
|
57
|
+
// @ts-ignore
|
|
57
58
|
const blob = new Blob([zipBuffer], { type: 'application/zip' });
|
|
58
59
|
formData.append('remora_config', blob, 'temp_deployment.zip'); // Updated to match the actual file name
|
|
59
60
|
const apiKey = process.env.REMORA_LICENCE_KEY;
|
package/actions/run.js
CHANGED
|
@@ -14,17 +14,15 @@ var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
|
14
14
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
15
15
|
exports.run = void 0;
|
|
16
16
|
const chalk_1 = __importDefault(require("chalk"));
|
|
17
|
-
const ora_1 = __importDefault(require("ora"));
|
|
18
17
|
const Environment_1 = __importDefault(require("../engines/Environment"));
|
|
19
|
-
const ConsumerEngine_1 = __importDefault(require("../engines/consumer/ConsumerEngine"));
|
|
20
18
|
const compile_1 = require("./compile");
|
|
21
19
|
const Helper_1 = __importDefault(require("../helper/Helper"));
|
|
22
20
|
const LicenceManager_1 = __importDefault(require("../licencing/LicenceManager"));
|
|
21
|
+
const ExecutorOrchestrator_1 = __importDefault(require("../executors/ExecutorOrchestrator"));
|
|
23
22
|
const run = (consumerName, options) => __awaiter(void 0, void 0, void 0, function* () {
|
|
24
23
|
try {
|
|
25
24
|
(0, compile_1.compile)();
|
|
26
|
-
console.log(); // needed for newline
|
|
27
|
-
const spinner = (0, ora_1.default)(chalk_1.default.blue('Running consumer(s)...\n')).start();
|
|
25
|
+
console.log(chalk_1.default.blue('Running consumer(s)...\n')); // needed for newline
|
|
28
26
|
const consumersToExecute = [];
|
|
29
27
|
if (consumerName && consumerName.length > 0) {
|
|
30
28
|
const cons = Environment_1.default.getConsumer(consumerName);
|
|
@@ -52,7 +50,14 @@ const run = (consumerName, options) => __awaiter(void 0, void 0, void 0, functio
|
|
|
52
50
|
console.error(`Invalid Remora licence key, the product is not active: remember to set "REMORA_LICENCE_KEY" environment variable.`);
|
|
53
51
|
process.exit(1);
|
|
54
52
|
}
|
|
55
|
-
const response = yield
|
|
53
|
+
const response = yield ExecutorOrchestrator_1.default.launch({
|
|
54
|
+
consumer,
|
|
55
|
+
details: {
|
|
56
|
+
invokedBy: 'CLI',
|
|
57
|
+
user: { _id: check.customer, name: check.customer, type: 'licence' }
|
|
58
|
+
},
|
|
59
|
+
logProgress: true
|
|
60
|
+
});
|
|
56
61
|
results.push({ success: true, consumer, response });
|
|
57
62
|
}
|
|
58
63
|
catch (error) {
|
|
@@ -62,15 +67,14 @@ const run = (consumerName, options) => __awaiter(void 0, void 0, void 0, functio
|
|
|
62
67
|
console.log(myErr.stack);
|
|
63
68
|
}
|
|
64
69
|
}
|
|
65
|
-
spinner.succeed('All consumers have been executed:');
|
|
66
70
|
results.forEach(({ response, consumer, success, error }) => {
|
|
67
71
|
if (success) {
|
|
68
|
-
const {
|
|
69
|
-
const rowCount =
|
|
72
|
+
const { elapsedMS, outputCount, resultUri } = response;
|
|
73
|
+
const rowCount = outputCount;
|
|
70
74
|
const duration = Helper_1.default.formatDuration(elapsedMS);
|
|
71
75
|
const performanceInfo = chalk_1.default.gray(` (${rowCount} rows, ${duration})`);
|
|
72
|
-
if (
|
|
73
|
-
console.log(chalk_1.default.green(`• Consumer ${consumer.name} -> ${
|
|
76
|
+
if (resultUri)
|
|
77
|
+
console.log(chalk_1.default.green(`• Consumer ${consumer.name} -> ${resultUri}`) + performanceInfo);
|
|
74
78
|
else
|
|
75
79
|
console.log(chalk_1.default.green(`• Consumer ${consumer.name} `) + performanceInfo);
|
|
76
80
|
}
|
|
@@ -82,11 +86,11 @@ const run = (consumerName, options) => __awaiter(void 0, void 0, void 0, functio
|
|
|
82
86
|
const successfulResults = results.filter(x => x.success);
|
|
83
87
|
const totalRows = successfulResults.reduce((sum, result) => {
|
|
84
88
|
var _a, _b;
|
|
85
|
-
return sum + ((_b = (_a = result.response) === null || _a === void 0 ? void 0 : _a.
|
|
89
|
+
return sum + ((_b = (_a = result.response) === null || _a === void 0 ? void 0 : _a.outputCount) !== null && _b !== void 0 ? _b : 0);
|
|
86
90
|
}, 0);
|
|
87
91
|
const totalDuration = successfulResults.reduce((sum, result) => {
|
|
88
|
-
var _a
|
|
89
|
-
return sum + (((
|
|
92
|
+
var _a;
|
|
93
|
+
return sum + (((_a = result.response) === null || _a === void 0 ? void 0 : _a.elapsedMS) || 0);
|
|
90
94
|
}, 0);
|
|
91
95
|
const totalsInfo = chalk_1.default.gray(` (${totalRows} rows, ${Helper_1.default.formatDuration(totalDuration)})`);
|
|
92
96
|
if (results.some(x => !x.success))
|
package/actions/sample.js
CHANGED
|
@@ -93,7 +93,7 @@ const sampleFromConsumer = (consumer, sampleSize) => __awaiter(void 0, void 0, v
|
|
|
93
93
|
const mappedData = rawSampleData.map(record => {
|
|
94
94
|
const mappedRecord = new DatasetRecord_1.default('', [], record._delimiter);
|
|
95
95
|
consumer.fields.forEach(field => {
|
|
96
|
-
if (field.key !== '*'
|
|
96
|
+
if (field.key !== '*') {
|
|
97
97
|
const sourceValue = record.getValue(field.key);
|
|
98
98
|
const outputKey = field.alias || field.key;
|
|
99
99
|
mappedRecord.setValue(outputKey, sourceValue);
|
package/core/Algo.js
CHANGED
|
@@ -122,13 +122,15 @@ const algo = {
|
|
|
122
122
|
},
|
|
123
123
|
mean: (numbers) => {
|
|
124
124
|
(0, Affirm_1.default)(algo.hasVal(numbers), 'Array must not be null or undefined');
|
|
125
|
-
|
|
125
|
+
if (numbers.length === 0)
|
|
126
|
+
return 0;
|
|
126
127
|
const total = algo.sum(numbers);
|
|
127
128
|
return total / numbers.length;
|
|
128
129
|
},
|
|
129
130
|
sum: (numbers) => {
|
|
130
131
|
(0, Affirm_1.default)(algo.hasVal(numbers), 'Array must not be null or undefined');
|
|
131
|
-
|
|
132
|
+
if (numbers.length === 0)
|
|
133
|
+
return 0;
|
|
132
134
|
let total = 0;
|
|
133
135
|
for (let i = 0; i < numbers.length; i++) {
|
|
134
136
|
total += numbers[i];
|
|
@@ -142,12 +144,14 @@ const algo = {
|
|
|
142
144
|
},
|
|
143
145
|
min: (arr) => {
|
|
144
146
|
(0, Affirm_1.default)(algo.hasVal(arr), 'Array must not be null or undefined');
|
|
145
|
-
|
|
147
|
+
if (arr.length === 0)
|
|
148
|
+
return 0;
|
|
146
149
|
return Math.min(...arr);
|
|
147
150
|
},
|
|
148
151
|
max: (arr) => {
|
|
149
152
|
(0, Affirm_1.default)(algo.hasVal(arr), 'Array must not be null or undefined');
|
|
150
|
-
|
|
153
|
+
if (arr.length === 0)
|
|
154
|
+
return 0;
|
|
151
155
|
return Math.max(...arr);
|
|
152
156
|
},
|
|
153
157
|
replaceAll: (text, search, replace) => text.replace(new RegExp(search, 'g'), replace),
|
|
@@ -845,7 +845,7 @@
|
|
|
845
845
|
},
|
|
846
846
|
{
|
|
847
847
|
"type": "object",
|
|
848
|
-
"description": "Apply conditional logic to transform values based on comparison conditions",
|
|
848
|
+
"description": "Apply conditional logic to transform values based on comparison conditions.",
|
|
849
849
|
"properties": {
|
|
850
850
|
"conditional": {
|
|
851
851
|
"type": "object",
|
|
@@ -177,6 +177,10 @@ class DeltaShareSourceDriver {
|
|
|
177
177
|
.map(x => JSON.parse(x));
|
|
178
178
|
return deltaLines;
|
|
179
179
|
});
|
|
180
|
+
this.ready = (producer) => __awaiter(this, void 0, void 0, function* () {
|
|
181
|
+
void producer;
|
|
182
|
+
throw new Error('DeltaShareSourceDriver.ready is not supported: Delta Sharing does not support readiness checks');
|
|
183
|
+
});
|
|
180
184
|
}
|
|
181
185
|
}
|
|
182
186
|
exports.default = DeltaShareSourceDriver;
|
package/drivers/DriverFactory.js
CHANGED
|
@@ -12,23 +12,24 @@ var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
|
12
12
|
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
13
13
|
};
|
|
14
14
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
15
|
-
const LocalDriver_1 = require("./LocalDriver");
|
|
16
15
|
const RedshiftDriver_1 = __importDefault(require("./RedshiftDriver"));
|
|
17
|
-
const S3Driver_1 = require("./S3Driver");
|
|
18
16
|
const DeltaShareDriver_1 = __importDefault(require("./DeltaShareDriver"));
|
|
19
|
-
const HttpApiDriver_1 = require("./HttpApiDriver");
|
|
17
|
+
const HttpApiDriver_1 = __importDefault(require("./HttpApiDriver"));
|
|
18
|
+
const LocalSourceDriver_1 = __importDefault(require("./files/LocalSourceDriver"));
|
|
19
|
+
const LocalDestinationDriver_1 = __importDefault(require("./files/LocalDestinationDriver"));
|
|
20
|
+
const S3SourceDriver_1 = __importDefault(require("./s3/S3SourceDriver"));
|
|
21
|
+
const S3DestinationDriver_1 = __importDefault(require("./s3/S3DestinationDriver"));
|
|
20
22
|
class DriverFactoryClass {
|
|
21
23
|
constructor() {
|
|
22
24
|
this.instantiateSource = (source) => __awaiter(this, void 0, void 0, function* () {
|
|
23
25
|
switch (source.engine) {
|
|
24
|
-
// TODO: implement all the other engines
|
|
25
26
|
case 'aws-redshift': {
|
|
26
27
|
const driver = new RedshiftDriver_1.default();
|
|
27
28
|
yield driver.init(source);
|
|
28
29
|
return driver;
|
|
29
30
|
}
|
|
30
31
|
case 'aws-s3': {
|
|
31
|
-
const driver = new
|
|
32
|
+
const driver = new S3SourceDriver_1.default();
|
|
32
33
|
yield driver.init(source);
|
|
33
34
|
return driver;
|
|
34
35
|
}
|
|
@@ -38,12 +39,12 @@ class DriverFactoryClass {
|
|
|
38
39
|
return driver;
|
|
39
40
|
}
|
|
40
41
|
case 'local': {
|
|
41
|
-
const driver = new
|
|
42
|
+
const driver = new LocalSourceDriver_1.default();
|
|
42
43
|
yield driver.init(source);
|
|
43
44
|
return driver;
|
|
44
45
|
}
|
|
45
46
|
case 'http-api': {
|
|
46
|
-
const driver = new HttpApiDriver_1.
|
|
47
|
+
const driver = new HttpApiDriver_1.default();
|
|
47
48
|
yield driver.init(source);
|
|
48
49
|
return driver;
|
|
49
50
|
}
|
|
@@ -52,14 +53,13 @@ class DriverFactoryClass {
|
|
|
52
53
|
});
|
|
53
54
|
this.instantiateDestination = (source) => __awaiter(this, void 0, void 0, function* () {
|
|
54
55
|
switch (source.engine) {
|
|
55
|
-
// TODO: implement all the other engines
|
|
56
56
|
case 'aws-s3': {
|
|
57
|
-
const driver = new
|
|
57
|
+
const driver = new S3DestinationDriver_1.default();
|
|
58
58
|
yield driver.init(source);
|
|
59
59
|
return driver;
|
|
60
60
|
}
|
|
61
61
|
case 'local': {
|
|
62
|
-
const driver = new
|
|
62
|
+
const driver = new LocalDestinationDriver_1.default();
|
|
63
63
|
yield driver.init(source);
|
|
64
64
|
return driver;
|
|
65
65
|
}
|
package/drivers/DriverHelper.js
CHANGED
|
@@ -23,12 +23,12 @@ const stream_1 = require("stream");
|
|
|
23
23
|
const readline_1 = require("readline");
|
|
24
24
|
const promises_1 = require("stream/promises");
|
|
25
25
|
const fs_1 = require("fs");
|
|
26
|
+
const path_1 = __importDefault(require("path"));
|
|
26
27
|
const Logger_1 = __importDefault(require("../helper/Logger"));
|
|
27
28
|
const Affirm_1 = __importDefault(require("../core/Affirm"));
|
|
28
29
|
const XLSParser_1 = __importDefault(require("../engines/parsing/XLSParser"));
|
|
29
|
-
const path_1 = __importDefault(require("path"));
|
|
30
|
-
const Constants_1 = __importDefault(require("../Constants"));
|
|
31
30
|
const XMLParser_1 = __importDefault(require("../engines/parsing/XMLParser"));
|
|
31
|
+
const Constants_1 = __importDefault(require("../Constants"));
|
|
32
32
|
const DriverHelper = {
|
|
33
33
|
appendToUnifiedFile: (options) => __awaiter(void 0, void 0, void 0, function* () {
|
|
34
34
|
(0, Affirm_1.default)(options, 'Invalid options');
|
|
@@ -37,7 +37,7 @@ const DriverHelper = {
|
|
|
37
37
|
const keys = (fileType === 'JSON' || fileType === 'JSONL')
|
|
38
38
|
? Object.keys(JSON.parse(headerLine))
|
|
39
39
|
: [];
|
|
40
|
-
const shouldValidateHeader = fileType === 'CSV' ||
|
|
40
|
+
const shouldValidateHeader = fileType === 'CSV' || (fileType === 'TXT' && hasHeaderRow === true);
|
|
41
41
|
// When sourceFilename is set, the headerLine includes $source_filename at the end.
|
|
42
42
|
// For validation, we need to compare against the original header without this suffix.
|
|
43
43
|
const originalHeaderLine = sourceFilename
|
|
@@ -154,14 +154,9 @@ const DriverHelper = {
|
|
|
154
154
|
return lineCount;
|
|
155
155
|
}),
|
|
156
156
|
quickReadFile: (filePath, lineCount) => __awaiter(void 0, void 0, void 0, function* () {
|
|
157
|
-
const fileStream = (0, fs_1.createReadStream)(filePath);
|
|
158
|
-
const lines = yield DriverHelper.quickReadStream(fileStream, lineCount);
|
|
159
|
-
fileStream.close();
|
|
160
|
-
return lines;
|
|
161
|
-
}),
|
|
162
|
-
quickReadStream: (stream, lineCount) => __awaiter(void 0, void 0, void 0, function* () {
|
|
163
157
|
var _a, e_1, _b, _c;
|
|
164
|
-
const
|
|
158
|
+
const fileStream = (0, fs_1.createReadStream)(filePath);
|
|
159
|
+
const rl = (0, readline_1.createInterface)({ input: fileStream, crlfDelay: Infinity });
|
|
165
160
|
const lines = [];
|
|
166
161
|
let counter = 0;
|
|
167
162
|
try {
|
|
@@ -184,6 +179,7 @@ const DriverHelper = {
|
|
|
184
179
|
finally { if (e_1) throw e_1.error; }
|
|
185
180
|
}
|
|
186
181
|
rl.close();
|
|
182
|
+
fileStream.close();
|
|
187
183
|
return lines;
|
|
188
184
|
}),
|
|
189
185
|
setHeaderFromFile: (fileKey, file, filePath, dataset) => __awaiter(void 0, void 0, void 0, function* () {
|
|
@@ -220,6 +216,33 @@ const DriverHelper = {
|
|
|
220
216
|
default:
|
|
221
217
|
throw new Error(`the fileType "${file.fileType}" is not implemented yet`);
|
|
222
218
|
}
|
|
219
|
+
}),
|
|
220
|
+
quickReadStream: (stream, lineCount) => __awaiter(void 0, void 0, void 0, function* () {
|
|
221
|
+
var _a, e_2, _b, _c;
|
|
222
|
+
const rl = (0, readline_1.createInterface)({ input: stream, crlfDelay: Infinity });
|
|
223
|
+
const lines = [];
|
|
224
|
+
let counter = 0;
|
|
225
|
+
try {
|
|
226
|
+
for (var _d = true, rl_2 = __asyncValues(rl), rl_2_1; rl_2_1 = yield rl_2.next(), _a = rl_2_1.done, !_a; _d = true) {
|
|
227
|
+
_c = rl_2_1.value;
|
|
228
|
+
_d = false;
|
|
229
|
+
const line = _c;
|
|
230
|
+
lines.push(line);
|
|
231
|
+
counter++;
|
|
232
|
+
if (counter >= lineCount) {
|
|
233
|
+
break;
|
|
234
|
+
}
|
|
235
|
+
}
|
|
236
|
+
}
|
|
237
|
+
catch (e_2_1) { e_2 = { error: e_2_1 }; }
|
|
238
|
+
finally {
|
|
239
|
+
try {
|
|
240
|
+
if (!_d && !_a && (_b = rl_2.return)) yield _b.call(rl_2);
|
|
241
|
+
}
|
|
242
|
+
finally { if (e_2) throw e_2.error; }
|
|
243
|
+
}
|
|
244
|
+
rl.close();
|
|
245
|
+
return lines;
|
|
223
246
|
})
|
|
224
247
|
};
|
|
225
248
|
exports.default = DriverHelper;
|
package/drivers/HttpApiDriver.js
CHANGED
package/drivers/LocalDriver.js
CHANGED
|
@@ -59,13 +59,14 @@ const readline_1 = __importDefault(require("readline"));
|
|
|
59
59
|
const Affirm_1 = __importDefault(require("../core/Affirm"));
|
|
60
60
|
const Algo_1 = __importDefault(require("../core/Algo"));
|
|
61
61
|
const xlsx_1 = __importDefault(require("xlsx"));
|
|
62
|
-
const XMLParser_1 = __importDefault(require("../engines/parsing/XMLParser"));
|
|
62
|
+
const XMLParser_1 = __importDefault(require("../engines/parsing/XMLParser"));
|
|
63
63
|
const XLSParser_1 = __importDefault(require("../engines/parsing/XLSParser"));
|
|
64
64
|
const Helper_1 = __importDefault(require("../helper/Helper"));
|
|
65
65
|
const ParseHelper_1 = __importDefault(require("../engines/parsing/ParseHelper"));
|
|
66
66
|
const FileExporter_1 = __importDefault(require("../engines/file/FileExporter"));
|
|
67
67
|
const Logger_1 = __importDefault(require("../helper/Logger"));
|
|
68
68
|
const DriverHelper_1 = __importDefault(require("./DriverHelper"));
|
|
69
|
+
const stream_1 = require("stream");
|
|
69
70
|
class LocalSourceDriver {
|
|
70
71
|
constructor() {
|
|
71
72
|
this.init = (source) => __awaiter(this, void 0, void 0, function* () {
|
|
@@ -147,8 +148,7 @@ class LocalSourceDriver {
|
|
|
147
148
|
sourceFilename
|
|
148
149
|
});
|
|
149
150
|
});
|
|
150
|
-
|
|
151
|
-
const handleFileAndGetLineCount = (fileKey, appendMode, fileType, sourceFilename) => __awaiter(this, void 0, void 0, function* () {
|
|
151
|
+
const getTotalLineCount = (fileKey, appendMode, fileType, sourceFilename) => __awaiter(this, void 0, void 0, function* () {
|
|
152
152
|
let totalLineCount;
|
|
153
153
|
let streamXLS;
|
|
154
154
|
switch (fileType) {
|
|
@@ -177,7 +177,7 @@ class LocalSourceDriver {
|
|
|
177
177
|
const currentFileKey = allFileKeys[i];
|
|
178
178
|
// Pass the filename (just the basename) if includeSourceFilename is enabled
|
|
179
179
|
const sourceFilename = includeSourceFilename ? path_1.default.basename(currentFileKey) : undefined;
|
|
180
|
-
totalLineCount += yield
|
|
180
|
+
totalLineCount += yield getTotalLineCount(currentFileKey, true, file.fileType, sourceFilename); // Append mode for subsequent files
|
|
181
181
|
}
|
|
182
182
|
dataset.setCount(totalLineCount);
|
|
183
183
|
return dataset;
|
|
@@ -185,7 +185,7 @@ class LocalSourceDriver {
|
|
|
185
185
|
else {
|
|
186
186
|
sourceFilename = includeSourceFilename ? path_1.default.basename(fileKey) : undefined;
|
|
187
187
|
yield DriverHelper_1.default.setHeaderFromFile(fileKey, file, this._path, dataset);
|
|
188
|
-
totalLineCount = (yield
|
|
188
|
+
totalLineCount = (yield getTotalLineCount(fileKey, false, file.fileType, sourceFilename));
|
|
189
189
|
dataset.setCount(totalLineCount);
|
|
190
190
|
return dataset;
|
|
191
191
|
}
|
|
@@ -376,6 +376,24 @@ class LocalSourceDriver {
|
|
|
376
376
|
}
|
|
377
377
|
fs.renameSync(sourceFilePath, destinationFilePath);
|
|
378
378
|
};
|
|
379
|
+
this.ready = (producer) => __awaiter(this, void 0, void 0, function* () {
|
|
380
|
+
(0, Affirm_1.default)(producer, 'Invalid producer');
|
|
381
|
+
const { fileKey } = producer.settings;
|
|
382
|
+
if (fileKey.includes('%')) {
|
|
383
|
+
const allFileKeys = this.listFiles(fileKey);
|
|
384
|
+
const allFilePaths = allFileKeys.map(x => path_1.default.join(this._path, x));
|
|
385
|
+
const readStreams = allFilePaths.map(x => fs.createReadStream(x));
|
|
386
|
+
let pass = new stream_1.PassThrough();
|
|
387
|
+
for (const [index, stream] of readStreams.entries())
|
|
388
|
+
pass = stream.pipe(pass, { end: index === readStreams.length - 1 });
|
|
389
|
+
return pass;
|
|
390
|
+
}
|
|
391
|
+
else {
|
|
392
|
+
const sourceFilePath = path_1.default.join(this._path, fileKey);
|
|
393
|
+
const readStream = fs.createReadStream(sourceFilePath);
|
|
394
|
+
return readStream;
|
|
395
|
+
}
|
|
396
|
+
});
|
|
379
397
|
}
|
|
380
398
|
}
|
|
381
399
|
exports.LocalSourceDriver = LocalSourceDriver;
|
|
@@ -470,6 +488,55 @@ class LocalDestinationDriver {
|
|
|
470
488
|
const fileContent = yield s3Driver.downloadFile(sourceFileKey);
|
|
471
489
|
yield this.saveFile(destinationFileKey, fileContent);
|
|
472
490
|
});
|
|
491
|
+
this.ready = (destinationPath) => __awaiter(this, void 0, void 0, function* () {
|
|
492
|
+
return fs.createWriteStream(destinationPath);
|
|
493
|
+
});
|
|
494
|
+
this.move = (fromPath, toName) => __awaiter(this, void 0, void 0, function* () {
|
|
495
|
+
try {
|
|
496
|
+
const toFilePath = path_1.default.join(this._path, toName);
|
|
497
|
+
fs.renameSync(fromPath, toFilePath);
|
|
498
|
+
return { bucket: '', key: toFilePath, res: true };
|
|
499
|
+
}
|
|
500
|
+
catch (error) {
|
|
501
|
+
Logger_1.default.error(error);
|
|
502
|
+
return { bucket: '', key: '', res: false };
|
|
503
|
+
}
|
|
504
|
+
});
|
|
505
|
+
this.transformAndMove = (fromPath, transform, toName) => __awaiter(this, void 0, void 0, function* () {
|
|
506
|
+
var _a, e_2, _b, _c;
|
|
507
|
+
try {
|
|
508
|
+
const toFilePath = path_1.default.join(this._path, toName);
|
|
509
|
+
const decoder = new TextDecoder();
|
|
510
|
+
const reader = fs.createReadStream(fromPath);
|
|
511
|
+
const writer = fs.createWriteStream(toFilePath);
|
|
512
|
+
try {
|
|
513
|
+
for (var _d = true, reader_2 = __asyncValues(reader), reader_2_1; reader_2_1 = yield reader_2.next(), _a = reader_2_1.done, !_a; _d = true) {
|
|
514
|
+
_c = reader_2_1.value;
|
|
515
|
+
_d = false;
|
|
516
|
+
const chunk = _c;
|
|
517
|
+
const decoded = decoder.decode(chunk);
|
|
518
|
+
const lines = decoded.split('\n');
|
|
519
|
+
for (const line of lines) {
|
|
520
|
+
writer.write(transform(line) + '\n');
|
|
521
|
+
}
|
|
522
|
+
}
|
|
523
|
+
}
|
|
524
|
+
catch (e_2_1) { e_2 = { error: e_2_1 }; }
|
|
525
|
+
finally {
|
|
526
|
+
try {
|
|
527
|
+
if (!_d && !_a && (_b = reader_2.return)) yield _b.call(reader_2);
|
|
528
|
+
}
|
|
529
|
+
finally { if (e_2) throw e_2.error; }
|
|
530
|
+
}
|
|
531
|
+
writer.close();
|
|
532
|
+
reader.close();
|
|
533
|
+
return { bucket: '', key: toFilePath, res: true };
|
|
534
|
+
}
|
|
535
|
+
catch (error) {
|
|
536
|
+
Logger_1.default.error(error);
|
|
537
|
+
return { bucket: '', key: '', res: false };
|
|
538
|
+
}
|
|
539
|
+
});
|
|
473
540
|
}
|
|
474
541
|
}
|
|
475
542
|
exports.LocalDestinationDriver = LocalDestinationDriver;
|
|
@@ -175,6 +175,10 @@ class RedshiftDriver {
|
|
|
175
175
|
}
|
|
176
176
|
return records;
|
|
177
177
|
};
|
|
178
|
+
this.ready = (producer) => __awaiter(this, void 0, void 0, function* () {
|
|
179
|
+
void producer;
|
|
180
|
+
throw new Error('Not implemented yet');
|
|
181
|
+
});
|
|
178
182
|
}
|
|
179
183
|
}
|
|
180
184
|
exports.default = RedshiftDriver;
|
package/drivers/S3Driver.js
CHANGED
|
@@ -34,7 +34,6 @@ const FileExporter_1 = __importDefault(require("../engines/file/FileExporter"));
|
|
|
34
34
|
const DriverHelper_1 = __importDefault(require("./DriverHelper"));
|
|
35
35
|
const Logger_1 = __importDefault(require("../helper/Logger"));
|
|
36
36
|
const Constants_1 = __importDefault(require("../Constants"));
|
|
37
|
-
const XLSParser_1 = __importDefault(require("../engines/parsing/XLSParser"));
|
|
38
37
|
class S3DestinationDriver {
|
|
39
38
|
constructor() {
|
|
40
39
|
this.init = (source) => __awaiter(this, void 0, void 0, function* () {
|
|
@@ -225,16 +224,7 @@ class S3SourceDriver {
|
|
|
225
224
|
});
|
|
226
225
|
const response = yield this._client.send(command);
|
|
227
226
|
(0, Affirm_1.default)(response.Body, 'Failed to fetch object from S3');
|
|
228
|
-
|
|
229
|
-
switch (file.fileType) {
|
|
230
|
-
case 'XLS':
|
|
231
|
-
case 'XLSX':
|
|
232
|
-
stream = yield XLSParser_1.default.parseXLSStream(response.Body, file.sheetName);
|
|
233
|
-
break;
|
|
234
|
-
default:
|
|
235
|
-
stream = response.Body;
|
|
236
|
-
break;
|
|
237
|
-
}
|
|
227
|
+
const stream = response.Body;
|
|
238
228
|
return DriverHelper_1.default.appendToUnifiedFile({
|
|
239
229
|
stream,
|
|
240
230
|
fileKey: fileUrl,
|
|
@@ -248,45 +238,6 @@ class S3SourceDriver {
|
|
|
248
238
|
});
|
|
249
239
|
});
|
|
250
240
|
const { fileKey } = file;
|
|
251
|
-
const setFirstLineFromStream = (stream) => __awaiter(this, void 0, void 0, function* () {
|
|
252
|
-
var _a, e_1, _b, _c;
|
|
253
|
-
const rl = readline_1.default.createInterface({ input: stream, crlfDelay: Infinity });
|
|
254
|
-
let firstLine = '';
|
|
255
|
-
switch (file.fileType) {
|
|
256
|
-
case 'XLSX':
|
|
257
|
-
case 'XLS':
|
|
258
|
-
firstLine = yield XLSParser_1.default.getHeaderXlsFromStream(stream, file.sheetName);
|
|
259
|
-
break;
|
|
260
|
-
case 'CSV':
|
|
261
|
-
case 'JSON':
|
|
262
|
-
case 'JSONL':
|
|
263
|
-
case 'TXT':
|
|
264
|
-
try {
|
|
265
|
-
for (var _d = true, rl_1 = __asyncValues(rl), rl_1_1; rl_1_1 = yield rl_1.next(), _a = rl_1_1.done, !_a; _d = true) {
|
|
266
|
-
_c = rl_1_1.value;
|
|
267
|
-
_d = false;
|
|
268
|
-
const line = _c;
|
|
269
|
-
firstLine = line;
|
|
270
|
-
break;
|
|
271
|
-
}
|
|
272
|
-
}
|
|
273
|
-
catch (e_1_1) { e_1 = { error: e_1_1 }; }
|
|
274
|
-
finally {
|
|
275
|
-
try {
|
|
276
|
-
if (!_d && !_a && (_b = rl_1.return)) yield _b.call(rl_1);
|
|
277
|
-
}
|
|
278
|
-
finally { if (e_1) throw e_1.error; }
|
|
279
|
-
}
|
|
280
|
-
rl.close();
|
|
281
|
-
break;
|
|
282
|
-
}
|
|
283
|
-
// If including source filename, append a placeholder column name to the header
|
|
284
|
-
if (file.includeSourceFilename) {
|
|
285
|
-
firstLine = firstLine + dataset.getDelimiter() + Constants_1.default.SOURCE_FILENAME_COLUMN;
|
|
286
|
-
}
|
|
287
|
-
dataset.setFirstLine(firstLine);
|
|
288
|
-
return firstLine;
|
|
289
|
-
});
|
|
290
241
|
if (fileKey.includes('%')) {
|
|
291
242
|
const allFileKeys = yield this.listFiles(fileKey);
|
|
292
243
|
Logger_1.default.log(`Matched ${allFileKeys.length} files, copying locally and creating unified dataset.`);
|
|
@@ -299,7 +250,12 @@ class S3SourceDriver {
|
|
|
299
250
|
const firstFileResponse = yield this._client.send(firstFileCommand);
|
|
300
251
|
(0, Affirm_1.default)(firstFileResponse.Body, 'Failed to fetch first file from S3');
|
|
301
252
|
const firstFileStream = firstFileResponse.Body;
|
|
302
|
-
|
|
253
|
+
let headerLine = yield this.getFirstLineFromStream(firstFileStream);
|
|
254
|
+
// If including source filename, append a placeholder column name to the header
|
|
255
|
+
if (includeSourceFilename) {
|
|
256
|
+
headerLine = headerLine + dataset.getDelimiter() + Constants_1.default.SOURCE_FILENAME_COLUMN;
|
|
257
|
+
}
|
|
258
|
+
dataset.setFirstLine(headerLine);
|
|
303
259
|
let totalLineCount = 0;
|
|
304
260
|
// Download files sequentially to avoid file conflicts
|
|
305
261
|
for (let i = 0; i < allFileKeys.length; i++) {
|
|
@@ -320,7 +276,12 @@ class S3SourceDriver {
|
|
|
320
276
|
const firstFileResponse = yield this._client.send(firstFileCommand);
|
|
321
277
|
(0, Affirm_1.default)(firstFileResponse.Body, 'Failed to fetch first file from S3');
|
|
322
278
|
const firstFileStream = firstFileResponse.Body;
|
|
323
|
-
|
|
279
|
+
let headerLine = yield this.getFirstLineFromStream(firstFileStream);
|
|
280
|
+
// If including source filename, append a placeholder column name to the header
|
|
281
|
+
if (includeSourceFilename) {
|
|
282
|
+
headerLine = headerLine + dataset.getDelimiter() + Constants_1.default.SOURCE_FILENAME_COLUMN;
|
|
283
|
+
}
|
|
284
|
+
dataset.setFirstLine(headerLine);
|
|
324
285
|
// Pass the filename if includeSourceFilename is enabled
|
|
325
286
|
const sourceFilename = includeSourceFilename ? path_1.default.basename(fileKey) : undefined;
|
|
326
287
|
const totalLineCount = yield downloadLocally(fileKey, headerLine, false, sourceFilename);
|
|
@@ -328,6 +289,29 @@ class S3SourceDriver {
|
|
|
328
289
|
return dataset;
|
|
329
290
|
}
|
|
330
291
|
});
|
|
292
|
+
this.getFirstLineFromStream = (stream) => __awaiter(this, void 0, void 0, function* () {
|
|
293
|
+
var _a, e_1, _b, _c;
|
|
294
|
+
const rl = readline_1.default.createInterface({ input: stream, crlfDelay: Infinity });
|
|
295
|
+
let firstLine = '';
|
|
296
|
+
try {
|
|
297
|
+
for (var _d = true, rl_1 = __asyncValues(rl), rl_1_1; rl_1_1 = yield rl_1.next(), _a = rl_1_1.done, !_a; _d = true) {
|
|
298
|
+
_c = rl_1_1.value;
|
|
299
|
+
_d = false;
|
|
300
|
+
const line = _c;
|
|
301
|
+
firstLine = line;
|
|
302
|
+
break;
|
|
303
|
+
}
|
|
304
|
+
}
|
|
305
|
+
catch (e_1_1) { e_1 = { error: e_1_1 }; }
|
|
306
|
+
finally {
|
|
307
|
+
try {
|
|
308
|
+
if (!_d && !_a && (_b = rl_1.return)) yield _b.call(rl_1);
|
|
309
|
+
}
|
|
310
|
+
finally { if (e_1) throw e_1.error; }
|
|
311
|
+
}
|
|
312
|
+
rl.close();
|
|
313
|
+
return firstLine;
|
|
314
|
+
});
|
|
331
315
|
this.exist = (producer) => __awaiter(this, void 0, void 0, function* () {
|
|
332
316
|
var _a;
|
|
333
317
|
(0, Affirm_1.default)(this._client, 'S3 client not yet initialized, call "connect()" first');
|