@forzalabs/remora 0.0.47-nasco.3 → 0.0.48-nasco.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/Constants.js +1 -1
- package/drivers/DriverHelper.js +86 -0
- package/drivers/LocalDriver.js +9 -11
- package/drivers/S3Driver.js +48 -16
- package/engines/execution/ExecutionPlanner.js +2 -0
- package/package.json +1 -1
package/drivers/DriverHelper.js
ADDED
@@ -0,0 +1,86 @@
+"use strict";
+var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
+    function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
+    return new (P || (P = Promise))(function (resolve, reject) {
+        function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
+        function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
+        function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
+        step((generator = generator.apply(thisArg, _arguments || [])).next());
+    });
+};
+var __asyncValues = (this && this.__asyncValues) || function (o) {
+    if (!Symbol.asyncIterator) throw new TypeError("Symbol.asyncIterator is not defined.");
+    var m = o[Symbol.asyncIterator], i;
+    return m ? m.call(o) : (o = typeof __values === "function" ? __values(o) : o[Symbol.iterator](), i = {}, verb("next"), verb("throw"), verb("return"), i[Symbol.asyncIterator] = function () { return this; }, i);
+    function verb(n) { i[n] = o[n] && function (v) { return new Promise(function (resolve, reject) { v = o[n](v), settle(resolve, reject, v.done, v.value); }); }; }
+    function settle(resolve, reject, d, v) { Promise.resolve(v).then(function(v) { resolve({ value: v, done: d }); }, reject); }
+};
+var __importDefault = (this && this.__importDefault) || function (mod) {
+    return (mod && mod.__esModule) ? mod : { "default": mod };
+};
+Object.defineProperty(exports, "__esModule", { value: true });
+const stream_1 = require("stream");
+const readline_1 = require("readline");
+const promises_1 = require("stream/promises");
+const fs_1 = require("fs");
+const Logger_1 = __importDefault(require("../helper/Logger"));
+const DriverHelper = {
+    appendToUnifiedFile: (stream, fileKey, destinationPath, append, headerLine, fileType) => __awaiter(void 0, void 0, void 0, function* () {
+        const shouldValidateHeader = fileType === 'CSV' || fileType === 'TXT';
+        let isFirstLine = true;
+        let hasValidatedHeader = shouldValidateHeader ? false : true;
+        const headerValidationTransform = new stream_1.Transform({
+            transform(chunk, encoding, callback) {
+                if (!hasValidatedHeader) {
+                    const chunkStr = chunk.toString();
+                    const lines = chunkStr.split('\n');
+                    if (isFirstLine && lines.length > 0) {
+                        const firstLine = lines[0];
+                        // Validate header only for CSV and TXT files
+                        if (shouldValidateHeader && headerLine && headerLine.trim() !== '' && firstLine.trim() !== headerLine.trim()) {
+                            const msg = `Error creating unified dataset: file "${fileKey}" has a different header line than the other files in this dataset\n\t-${fileKey}: ${firstLine}\n\t-main: ${headerLine}`;
+                            Logger_1.default.log(msg);
+                            return callback(new Error(msg));
+                        }
+                        hasValidatedHeader = true;
+                        isFirstLine = false;
+                    }
+                }
+                callback(null, chunk);
+            }
+        });
+        const writeOptions = append ? { flags: 'a' } : {};
+        const writeStream = (0, fs_1.createWriteStream)(destinationPath, writeOptions);
+        yield (0, promises_1.pipeline)(stream, headerValidationTransform, writeStream);
+    }),
+    quickReadFile: (filePath, lineCount) => __awaiter(void 0, void 0, void 0, function* () {
+        var _a, e_1, _b, _c;
+        const fileStream = (0, fs_1.createReadStream)(filePath);
+        const rl = (0, readline_1.createInterface)({ input: fileStream, crlfDelay: Infinity });
+        const lines = [];
+        let counter = 0;
+        try {
+            for (var _d = true, rl_1 = __asyncValues(rl), rl_1_1; rl_1_1 = yield rl_1.next(), _a = rl_1_1.done, !_a; _d = true) {
+                _c = rl_1_1.value;
+                _d = false;
+                const line = _c;
+                lines.push(line);
+                counter++;
+                if (counter >= lineCount) {
+                    break;
+                }
+            }
+        }
+        catch (e_1_1) { e_1 = { error: e_1_1 }; }
+        finally {
+            try {
+                if (!_d && !_a && (_b = rl_1.return)) yield _b.call(rl_1);
+            }
+            finally { if (e_1) throw e_1.error; }
+        }
+        rl.close();
+        fileStream.close();
+        return lines;
+    })
+};
+exports.default = DriverHelper;
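The new DriverHelper module centralizes the unified-dataset logic the two drivers now share: `appendToUnifiedFile` streams a file into the destination while validating its header in the same pass, and `quickReadFile` reads the first `lineCount` lines of a local file. As a reading aid, here is a TypeScript sketch of what `appendToUnifiedFile` does, reconstructed from the compiled output above (not the package's actual source; names follow the compiled code):

```ts
import { Transform, Readable } from 'stream';
import { pipeline } from 'stream/promises';
import { createWriteStream } from 'fs';

// Reconstructed sketch: validate the first line of the incoming stream
// against the expected header, then write the whole stream to the
// unified file. Only CSV/TXT files carry a header worth checking.
async function appendToUnifiedFile(
    stream: Readable,
    fileKey: string,
    destinationPath: string,
    append: boolean,
    headerLine: string,
    fileType: string
): Promise<void> {
    const shouldValidateHeader = fileType === 'CSV' || fileType === 'TXT';
    let hasValidatedHeader = !shouldValidateHeader;
    const headerValidationTransform = new Transform({
        transform(chunk, _encoding, callback) {
            if (!hasValidatedHeader) {
                // Assumes the first chunk contains the complete first line.
                const firstLine = chunk.toString().split('\n')[0];
                if (headerLine.trim() !== '' && firstLine.trim() !== headerLine.trim()) {
                    return callback(new Error(
                        `file "${fileKey}" has a different header line than the other files in this dataset`));
                }
                hasValidatedHeader = true;
            }
            callback(null, chunk); // data passes through unchanged
        }
    });
    // 'a' appends to an existing unified file; the default flags overwrite it
    const writeStream = createWriteStream(destinationPath, append ? { flags: 'a' } : {});
    await pipeline(stream, headerValidationTransform, writeStream);
}
```

Note that the transform only validates: every chunk passes through unchanged, so the header line of each appended file is also written to the unified file, and the check relies on the first chunk containing the complete first line.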
package/drivers/LocalDriver.js
CHANGED
@@ -64,6 +64,7 @@ const Helper_1 = __importDefault(require("../helper/Helper"));
 const ParseHelper_1 = __importDefault(require("../engines/parsing/ParseHelper"));
 const FileExporter_1 = __importDefault(require("../engines/file/FileExporter"));
 const Logger_1 = __importDefault(require("../helper/Logger"));
+const DriverHelper_1 = __importDefault(require("./DriverHelper"));
 class LocalSourceDriver {
     constructor() {
         this.init = (source) => __awaiter(this, void 0, void 0, function* () {
@@ -116,31 +117,28 @@
             (0, Affirm_1.default)(file, 'Invalid dataset file');
             (0, Affirm_1.default)(file.fileKey, 'Invalid file key');
             (0, Affirm_1.default)(file.fileType, `Invalid file type`);
-            const copyLocally = (fileKey_1, ...args_1) => __awaiter(this, [fileKey_1, ...args_1], void 0, function* (fileKey, appendMode = false) {
+            const copyLocally = (fileKey_1, headerLine_1, ...args_1) => __awaiter(this, [fileKey_1, headerLine_1, ...args_1], void 0, function* (fileKey, headerLine, appendMode = false) {
                 const sourceFilePath = path_1.default.join(this._path, fileKey);
                 (0, Affirm_1.default)(fs.existsSync(sourceFilePath), `Source file does not exist: ${sourceFilePath}`);
-
+                // Copy and validate header in a single stream pass
                 const readStream = fs.createReadStream(sourceFilePath);
-
-                return new Promise((resolve, reject) => {
-                    readStream.pipe(writeStream);
-                    writeStream.on('finish', resolve);
-                    writeStream.on('error', reject);
-                    readStream.on('error', reject);
-                });
+                return DriverHelper_1.default.appendToUnifiedFile(readStream, fileKey, dataset.getPath(), appendMode, headerLine, file.fileType);
             });
             const { fileKey } = file;
             if (fileKey.includes('%')) {
                 const allFileKeys = this.listFiles(fileKey);
                 Logger_1.default.log(`Matched ${allFileKeys.length} files, copying to locally and creating unified dataset.`);
+                // Get header line from the first file
+                const headerLine = (yield DriverHelper_1.default.quickReadFile(path_1.default.join(this._path, allFileKeys[0]), 1))[0];
                 // Copy files sequentially to avoid file conflicts
                 for (let i = 0; i < allFileKeys.length; i++) {
-                    yield copyLocally(allFileKeys[i], i > 0); // Append mode for subsequent files
+                    yield copyLocally(allFileKeys[i], headerLine, i > 0); // Append mode for subsequent files
                 }
                 return dataset;
             }
             else {
-
+                // For single file, no header validation needed
+                yield copyLocally(fileKey, '', false);
                 return dataset;
             }
         });
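In LocalDriver, `copyLocally` now takes the reference header line and delegates to `DriverHelper.appendToUnifiedFile` instead of piping to a write stream by hand. For a wildcard file key (`%`), the header of the first matched file is read with `quickReadFile`, then each matched file is streamed into the destination sequentially, with append mode after the first. A minimal sketch of that flow, assuming the DriverHelper API shown above (`mergeMatchedFiles` is an illustrative name, not a function in the package):

```ts
import path from 'path';
import { createReadStream } from 'fs';
import DriverHelper from './DriverHelper'; // assumed import path, as in the diff

// Illustrative flow for a wildcard fileKey; the real logic lives inside
// LocalSourceDriver and works with its own dataset/file objects.
async function mergeMatchedFiles(
    baseDir: string,
    matchedKeys: string[],
    destinationPath: string,
    fileType: string
): Promise<void> {
    // The first matched file's header becomes the reference header.
    const [headerLine] = await DriverHelper.quickReadFile(
        path.join(baseDir, matchedKeys[0]), 1);
    // Copy sequentially to avoid conflicts; append for every file after the first.
    for (let i = 0; i < matchedKeys.length; i++) {
        const readStream = createReadStream(path.join(baseDir, matchedKeys[i]));
        await DriverHelper.appendToUnifiedFile(
            readStream, matchedKeys[i], destinationPath, i > 0, headerLine, fileType);
    }
}
```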
package/drivers/S3Driver.js
CHANGED
@@ -29,9 +29,8 @@ const xlsx_1 = __importDefault(require("xlsx"));
 const XMLParser_1 = __importDefault(require("../engines/parsing/XMLParser")); // Added XMLParser import
 const Helper_1 = __importDefault(require("../helper/Helper"));
 const ParseHelper_1 = __importDefault(require("../engines/parsing/ParseHelper"));
-const promises_1 = require("stream/promises");
-const fs_1 = require("fs");
 const FileExporter_1 = __importDefault(require("../engines/file/FileExporter"));
+const DriverHelper_1 = __importDefault(require("./DriverHelper"));
 class S3DestinationDriver {
     constructor() {
         this.init = (source) => __awaiter(this, void 0, void 0, function* () {
@@ -227,7 +226,8 @@ class S3SourceDriver {
             (0, Affirm_1.default)(file, 'Invalid dataset file');
             (0, Affirm_1.default)(file.fileKey, 'Invalid file key');
             (0, Affirm_1.default)(file.fileType, `Invalid file type`);
-            const downloadLocally = (fileUrl_1, ...args_1) => __awaiter(this, [fileUrl_1, ...args_1], void 0, function* (fileUrl, appendMode = false) {
+            const downloadLocally = (fileUrl_1, headerLine_1, ...args_1) => __awaiter(this, [fileUrl_1, headerLine_1, ...args_1], void 0, function* (fileUrl, headerLine, appendMode = false) {
+                // Download and validate header in a single stream pass
                 const command = new client_s3_1.GetObjectCommand({
                     Bucket: this._bucketName,
                     Key: fileUrl
@@ -235,23 +235,55 @@
                 const response = yield this._client.send(command);
                 (0, Affirm_1.default)(response.Body, 'Failed to fetch object from S3');
                 const stream = response.Body;
-
-                yield (0, promises_1.pipeline)(stream, (0, fs_1.createWriteStream)(dataset.getPath(), writeOptions));
+                return DriverHelper_1.default.appendToUnifiedFile(stream, fileUrl, dataset.getPath(), appendMode, headerLine, file.fileType);
             });
             const { fileKey } = file;
             if (fileKey.includes('%')) {
                 const allFileKeys = yield this.listFiles(fileKey);
+                // Get header line from the first file
+                const firstFileCommand = new client_s3_1.GetObjectCommand({
+                    Bucket: this._bucketName,
+                    Key: allFileKeys[0]
+                });
+                const firstFileResponse = yield this._client.send(firstFileCommand);
+                (0, Affirm_1.default)(firstFileResponse.Body, 'Failed to fetch first file from S3');
+                const firstFileStream = firstFileResponse.Body;
+                const headerLine = yield this.getFirstLineFromStream(firstFileStream);
                 // Download files sequentially to avoid file conflicts
                 for (let i = 0; i < allFileKeys.length; i++) {
-                    yield downloadLocally(allFileKeys[i], i > 0); // Append mode for subsequent files
+                    yield downloadLocally(allFileKeys[i], headerLine, i > 0); // Append mode for subsequent files
                 }
                 return dataset;
             }
             else {
-
+                // For single file, no header validation needed
+                yield downloadLocally(fileKey, '');
                 return dataset;
             }
         });
+        this.getFirstLineFromStream = (stream) => __awaiter(this, void 0, void 0, function* () {
+            var _a, e_1, _b, _c;
+            const rl = readline_1.default.createInterface({ input: stream, crlfDelay: Infinity });
+            let firstLine = '';
+            try {
+                for (var _d = true, rl_1 = __asyncValues(rl), rl_1_1; rl_1_1 = yield rl_1.next(), _a = rl_1_1.done, !_a; _d = true) {
+                    _c = rl_1_1.value;
+                    _d = false;
+                    const line = _c;
+                    firstLine = line;
+                    break;
+                }
+            }
+            catch (e_1_1) { e_1 = { error: e_1_1 }; }
+            finally {
+                try {
+                    if (!_d && !_a && (_b = rl_1.return)) yield _b.call(rl_1);
+                }
+                finally { if (e_1) throw e_1.error; }
+            }
+            rl.close();
+            return firstLine;
+        });
         this.exist = (producer) => __awaiter(this, void 0, void 0, function* () {
             var _a;
             (0, Affirm_1.default)(this._client, 'S3 client not yet initialized, call "connect()" first');
@@ -276,7 +308,7 @@ class S3SourceDriver {
             }
         });
         this._readLines = (stream, lineFrom, lineTo) => __awaiter(this, void 0, void 0, function* () {
-            var _a, e_1, _b, _c;
+            var _a, e_2, _b, _c;
             const reader = readline_1.default.createInterface({ input: stream, crlfDelay: Infinity });
             const lines = [];
             let lineCounter = 0;
@@ -300,19 +332,19 @@ class S3SourceDriver {
                     }
                 }
             }
-            catch (e_1_1) { e_1 = { error: e_1_1 }; }
+            catch (e_2_1) { e_2 = { error: e_2_1 }; }
             finally {
                 try {
                     if (!_d && !_a && (_b = reader_1.return)) yield _b.call(reader_1);
                 }
-                finally { if (e_1) throw e_1.error; }
+                finally { if (e_2) throw e_2.error; }
             }
             reader.close();
             return lines;
         });
         this._readExcelLines = (stream, sheetName, lineFrom, lineTo) => __awaiter(this, void 0, void 0, function* () {
             var _a, stream_1, stream_1_1;
-            var _b, e_2, _c, _d;
+            var _b, e_3, _c, _d;
             (0, Affirm_1.default)(sheetName, `Invalid sheetname`);
             const chunks = [];
             try {
@@ -323,12 +355,12 @@ class S3SourceDriver {
                     chunks.push(chunk);
                 }
             }
-            catch (e_2_1) { e_2 = { error: e_2_1 }; }
+            catch (e_3_1) { e_3 = { error: e_3_1 }; }
             finally {
                 try {
                     if (!_a && !_b && (_c = stream_1.return)) yield _c.call(stream_1);
                 }
-                finally { if (e_2) throw e_2.error; }
+                finally { if (e_3) throw e_3.error; }
             }
             const buffer = Buffer.concat(chunks);
             const excel = xlsx_1.default.read(buffer, { type: 'buffer' });
@@ -343,7 +375,7 @@ class S3SourceDriver {
         });
         this._readXmlLines = (stream, lineFrom, lineTo) => __awaiter(this, void 0, void 0, function* () {
             var _a, stream_2, stream_2_1;
-            var _b, e_3, _c, _d;
+            var _b, e_4, _c, _d;
             const chunks = [];
             try {
                 for (_a = true, stream_2 = __asyncValues(stream); stream_2_1 = yield stream_2.next(), _b = stream_2_1.done, !_b; _a = true) {
@@ -353,12 +385,12 @@ class S3SourceDriver {
                     chunks.push(chunk);
                 }
             }
-            catch (e_3_1) { e_3 = { error: e_3_1 }; }
+            catch (e_4_1) { e_4 = { error: e_4_1 }; }
             finally {
                 try {
                     if (!_a && !_b && (_c = stream_2.return)) yield _c.call(stream_2);
                 }
-                finally { if (e_3) throw e_3.error; }
+                finally { if (e_4) throw e_4.error; }
             }
             const buffer = Buffer.concat(chunks);
             const jsonData = XMLParser_1.default.xmlToJson(buffer);
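S3Driver follows the same pattern, except the reference header comes from streaming the first matched object and stopping after one line. The compiled `getFirstLineFromStream` above is the down-leveled form of a `for await` loop over a readline interface; at source level it is roughly (reconstructed sketch; `Readable` stands in for the SDK's response body type):

```ts
import readline from 'readline';
import { Readable } from 'stream';

// Reconstructed sketch of getFirstLineFromStream: read lines from the
// object's body stream and stop after the first one (the header line).
async function getFirstLineFromStream(stream: Readable): Promise<string> {
    const rl = readline.createInterface({ input: stream, crlfDelay: Infinity });
    let firstLine = '';
    for await (const line of rl) {
        firstLine = line;
        break; // only the header line is needed
    }
    rl.close();
    return firstLine;
}
```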
package/engines/execution/ExecutionPlanner.js
CHANGED
@@ -85,6 +85,8 @@ class ExecutionPlannerClas {
             plan.push(...this._planProducer(producers[0], options));
         else
             plan.push(...(producers.flatMap(x => this._planProducer(x, options))));
+        // I technically don't need this, but I keep it to merge all the datasets to a single one
+        // so the other steps of the plan can work with a single dataset variable
         plan.push({ type: 'join-producers-data' });
         if (consumer.filters && consumer.filters.length > 0)
             plan.push({ type: 'apply-consumer-filters-on-JSON' });