@forzalabs/remora 0.0.47-nasco.3 → 0.0.48-nasco.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/Constants.js CHANGED
@@ -1,7 +1,7 @@
1
1
  "use strict";
2
2
  Object.defineProperty(exports, "__esModule", { value: true });
3
3
  const CONSTANTS = {
4
- cliVersion: '0.0.47-nasco',
4
+ cliVersion: '0.0.48-nasco',
5
5
  lambdaVersion: 1,
6
6
  port: 5069,
7
7
  defaults: {
@@ -0,0 +1,86 @@
1
+ "use strict";
2
// Compiler-emitted helper that runs a generator function as an async function.
// Reuses an existing `this.__awaiter` when one is present; otherwise defines it here.
//   thisArg / _arguments: receiver and argument list the generator is applied with.
//   P: promise constructor to use; falls back to the global Promise when falsy.
//   generator: generator function whose yielded values are awaited.
// Returns a promise that resolves with the generator's return value, or rejects with
// whatever the generator (or an awaited value) throws.
+ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
3
// Wrap a non-P value in a P that resolves to it, so `.then` can always be called.
+ function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
4
+ return new (P || (P = Promise))(function (resolve, reject) {
5
// Resume the generator with the fulfilled value; a synchronous throw rejects the outer promise.
+ function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
6
// Re-throw the rejection reason inside the generator so its own try/catch can handle it.
+ function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
7
// Finish when the generator is done, otherwise wait on the yielded value and continue.
+ function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
8
// Instantiate the generator and kick off the first step.
+ step((generator = generator.apply(thisArg, _arguments || [])).next());
9
+ });
10
+ };
11
// Compiler-emitted helper backing down-levelled `for await` loops.
// Reuses an existing `this.__asyncValues` when one is present; otherwise defines it here.
// Given an object `o`, returns an async iterator over it.
+ var __asyncValues = (this && this.__asyncValues) || function (o) {
12
// Throws a TypeError when the runtime has no Symbol.asyncIterator.
+ if (!Symbol.asyncIterator) throw new TypeError("Symbol.asyncIterator is not defined.");
13
+ var m = o[Symbol.asyncIterator], i;
14
// Use the object's own async iterator when it has one. Otherwise obtain a sync iterator
// (via `__values` if that helper is defined, else `o[Symbol.iterator]()`) and wrap its
// next/throw/return methods so each returns a promise.
+ return m ? m.call(o) : (o = typeof __values === "function" ? __values(o) : o[Symbol.iterator](), i = {}, verb("next"), verb("throw"), verb("return"), i[Symbol.asyncIterator] = function () { return this; }, i);
15
// Installs the promise-returning wrapper for method `n`, only if the sync iterator has that method.
+ function verb(n) { i[n] = o[n] && function (v) { return new Promise(function (resolve, reject) { v = o[n](v), settle(resolve, reject, v.done, v.value); }); }; }
16
// Resolves with `{ value, done }` after the (possibly promise) value `v` settles.
+ function settle(resolve, reject, d, v) { Promise.resolve(v).then(function(v) { resolve({ value: v, done: d }); }, reject); }
17
+ };
18
// Interop helper for default imports: a module flagged with `__esModule` is returned
// unchanged, anything else is wrapped so that it is reachable through `.default`.
// Reuses an existing `this.__importDefault` when one is present.
var __importDefault = (this && this.__importDefault) || function (mod) {
    if (mod && mod.__esModule) {
        return mod;
    }
    return { "default": mod };
};
21
+ Object.defineProperty(exports, "__esModule", { value: true });
22
+ const stream_1 = require("stream");
23
+ const readline_1 = require("readline");
24
+ const promises_1 = require("stream/promises");
25
+ const fs_1 = require("fs");
26
+ const Logger_1 = __importDefault(require("../helper/Logger"));
27
/**
 * Stream helpers shared by the source drivers when building a unified local dataset file.
 */
const DriverHelper = {
    /**
     * Pipes `stream` into the file at `destinationPath`, optionally checking that the
     * stream's first line matches `headerLine`.
     *
     * The header check runs only for the 'CSV' and 'TXT' file types and only when
     * `headerLine` is a non-blank string. Both sides are compared after `trim()`.
     * The first line is assembled across chunk boundaries, so a header that arrives
     * split over several chunks is compared in full rather than by its first fragment.
     *
     * @param stream Readable source of the file content.
     * @param fileKey Name of the file, used only in the mismatch message.
     * @param destinationPath Path of the file to write to.
     * @param append When truthy the destination is opened with the 'a' flag.
     * @param headerLine Expected first line; blank or missing disables the check.
     * @param fileType File type label; only 'CSV' and 'TXT' are header-checked.
     * @returns Promise that resolves once the pipeline has finished.
     * @throws Error (as a rejection) when the first line differs from `headerLine`.
     */
    appendToUnifiedFile: (stream, fileKey, destinationPath, append, headerLine, fileType) => __awaiter(void 0, void 0, void 0, function* () {
        const NEWLINE_BYTE = 0x0a;
        // Upper bound on how much is buffered while waiting for the first newline; past this
        // the buffered bytes are compared as-is instead of growing without limit.
        const MAX_HEADER_BYTES = 1024 * 1024;
        const expectedHeader = typeof headerLine === 'string' ? headerLine.trim() : '';
        const shouldValidateHeader = (fileType === 'CSV' || fileType === 'TXT') && expectedHeader !== '';
        let hasValidatedHeader = !shouldValidateHeader;
        let pendingChunks = [];
        let pendingBytes = 0;
        // Compares the buffered first line with the expected header and hands back either
        // the error to fail the pipeline with, or the buffered bytes to forward downstream.
        const validatePending = () => {
            const buffered = Buffer.concat(pendingChunks, pendingBytes);
            pendingChunks = [];
            pendingBytes = 0;
            hasValidatedHeader = true;
            // 0x0A never appears inside a multi-byte UTF-8 sequence, so slicing on it is safe.
            const newlineIndex = buffered.indexOf(NEWLINE_BYTE);
            const firstLine = (newlineIndex === -1 ? buffered : buffered.subarray(0, newlineIndex)).toString();
            if (firstLine.trim() !== expectedHeader) {
                const msg = `Error creating unified dataset: file "${fileKey}" has a different header line than the other files in this dataset\n\t-${fileKey}: ${firstLine}\n\t-main: ${headerLine}`;
                Logger_1.default.log(msg);
                return { error: new Error(msg), data: undefined };
            }
            return { error: null, data: buffered };
        };
        const headerValidationTransform = new stream_1.Transform({
            transform(chunk, encoding, callback) {
                if (hasValidatedHeader) {
                    return callback(null, chunk);
                }
                pendingChunks.push(chunk);
                pendingBytes += chunk.length;
                // Hold the data back until the whole first line has arrived.
                if (chunk.indexOf(NEWLINE_BYTE) === -1 && pendingBytes < MAX_HEADER_BYTES) {
                    return callback();
                }
                const { error, data } = validatePending();
                callback(error, data);
            },
            flush(callback) {
                // Stream ended before any newline: the buffered bytes are the first (and only) line.
                if (hasValidatedHeader || pendingBytes === 0) {
                    return callback();
                }
                const { error, data } = validatePending();
                callback(error, data);
            }
        });
        const writeOptions = append ? { flags: 'a' } : {};
        const writeStream = (0, fs_1.createWriteStream)(destinationPath, writeOptions);
        yield (0, promises_1.pipeline)(stream, headerValidationTransform, writeStream);
    }),
    /**
     * Reads at most `lineCount` lines from the start of the file at `filePath`.
     *
     * @param filePath Path of the file to read.
     * @param lineCount Maximum number of lines to return; zero or negative yields an
     *                  empty array, and a missing value reads every line.
     * @returns Promise of the lines read, without their line terminators.
     * @throws (as a rejection) any error raised while opening or reading the file.
     */
    quickReadFile: (filePath, lineCount) => __awaiter(void 0, void 0, void 0, function* () {
        if (lineCount <= 0) {
            return [];
        }
        const fileStream = (0, fs_1.createReadStream)(filePath);
        const rl = (0, readline_1.createInterface)({ input: fileStream, crlfDelay: Infinity });
        try {
            return yield new Promise((resolve, reject) => {
                const lines = [];
                let isDone = false;
                // Listen on both emitters: depending on the runtime the input error may or may
                // not be re-emitted on the interface, and an unhandled 'error' event would throw.
                rl.on('error', reject);
                fileStream.on('error', reject);
                rl.on('line', (line) => {
                    // 'line' can still fire for already-buffered input after close() was requested.
                    if (isDone) {
                        return;
                    }
                    lines.push(line);
                    if (lines.length >= lineCount) {
                        isDone = true;
                        rl.close();
                    }
                });
                rl.once('close', () => resolve(lines));
            });
        }
        finally {
            // Release the interface and the file descriptor on success and on failure alike.
            rl.close();
            fileStream.destroy();
        }
    })
};
86
+ exports.default = DriverHelper;
@@ -64,6 +64,7 @@ const Helper_1 = __importDefault(require("../helper/Helper"));
64
64
  const ParseHelper_1 = __importDefault(require("../engines/parsing/ParseHelper"));
65
65
  const FileExporter_1 = __importDefault(require("../engines/file/FileExporter"));
66
66
  const Logger_1 = __importDefault(require("../helper/Logger"));
67
+ const DriverHelper_1 = __importDefault(require("./DriverHelper"));
67
68
  class LocalSourceDriver {
68
69
  constructor() {
69
70
  this.init = (source) => __awaiter(this, void 0, void 0, function* () {
@@ -116,31 +117,28 @@ class LocalSourceDriver {
116
117
  (0, Affirm_1.default)(file, 'Invalid dataset file');
117
118
  (0, Affirm_1.default)(file.fileKey, 'Invalid file key');
118
119
  (0, Affirm_1.default)(file.fileType, `Invalid file type`);
119
- const copyLocally = (fileKey_1, ...args_1) => __awaiter(this, [fileKey_1, ...args_1], void 0, function* (fileKey, appendMode = false) {
120
+ const copyLocally = (fileKey_1, headerLine_1, ...args_1) => __awaiter(this, [fileKey_1, headerLine_1, ...args_1], void 0, function* (fileKey, headerLine, appendMode = false) {
120
121
  const sourceFilePath = path_1.default.join(this._path, fileKey);
121
122
  (0, Affirm_1.default)(fs.existsSync(sourceFilePath), `Source file does not exist: ${sourceFilePath}`);
122
- const writeOptions = appendMode ? { flags: 'a' } : {};
123
+ // Copy and validate header in a single stream pass
123
124
  const readStream = fs.createReadStream(sourceFilePath);
124
- const writeStream = fs.createWriteStream(dataset.getPath(), writeOptions);
125
- return new Promise((resolve, reject) => {
126
- readStream.pipe(writeStream);
127
- writeStream.on('finish', resolve);
128
- writeStream.on('error', reject);
129
- readStream.on('error', reject);
130
- });
125
+ return DriverHelper_1.default.appendToUnifiedFile(readStream, fileKey, dataset.getPath(), appendMode, headerLine, file.fileType);
131
126
  });
132
127
  const { fileKey } = file;
133
128
  if (fileKey.includes('%')) {
134
129
  const allFileKeys = this.listFiles(fileKey);
135
130
  Logger_1.default.log(`Matched ${allFileKeys.length} files, copying to locally and creating unified dataset.`);
131
+ // Get header line from the first file
132
+ const headerLine = (yield DriverHelper_1.default.quickReadFile(path_1.default.join(this._path, allFileKeys[0]), 1))[0];
136
133
  // Copy files sequentially to avoid file conflicts
137
134
  for (let i = 0; i < allFileKeys.length; i++) {
138
- yield copyLocally(allFileKeys[i], i > 0); // Append mode for subsequent files
135
+ yield copyLocally(allFileKeys[i], headerLine, i > 0); // Append mode for subsequent files
139
136
  }
140
137
  return dataset;
141
138
  }
142
139
  else {
143
- yield copyLocally(fileKey);
140
+ // For single file, no header validation needed
141
+ yield copyLocally(fileKey, '', false);
144
142
  return dataset;
145
143
  }
146
144
  });
@@ -29,9 +29,8 @@ const xlsx_1 = __importDefault(require("xlsx"));
29
29
  const XMLParser_1 = __importDefault(require("../engines/parsing/XMLParser")); // Added XMLParser import
30
30
  const Helper_1 = __importDefault(require("../helper/Helper"));
31
31
  const ParseHelper_1 = __importDefault(require("../engines/parsing/ParseHelper"));
32
- const promises_1 = require("stream/promises");
33
- const fs_1 = require("fs");
34
32
  const FileExporter_1 = __importDefault(require("../engines/file/FileExporter"));
33
+ const DriverHelper_1 = __importDefault(require("./DriverHelper"));
35
34
  class S3DestinationDriver {
36
35
  constructor() {
37
36
  this.init = (source) => __awaiter(this, void 0, void 0, function* () {
@@ -227,7 +226,8 @@ class S3SourceDriver {
227
226
  (0, Affirm_1.default)(file, 'Invalid dataset file');
228
227
  (0, Affirm_1.default)(file.fileKey, 'Invalid file key');
229
228
  (0, Affirm_1.default)(file.fileType, `Invalid file type`);
230
- const downloadLocally = (fileUrl_1, ...args_1) => __awaiter(this, [fileUrl_1, ...args_1], void 0, function* (fileUrl, appendMode = false) {
229
+ const downloadLocally = (fileUrl_1, headerLine_1, ...args_1) => __awaiter(this, [fileUrl_1, headerLine_1, ...args_1], void 0, function* (fileUrl, headerLine, appendMode = false) {
230
+ // Download and validate header in a single stream pass
231
231
  const command = new client_s3_1.GetObjectCommand({
232
232
  Bucket: this._bucketName,
233
233
  Key: fileUrl
@@ -235,23 +235,55 @@ class S3SourceDriver {
235
235
  const response = yield this._client.send(command);
236
236
  (0, Affirm_1.default)(response.Body, 'Failed to fetch object from S3');
237
237
  const stream = response.Body;
238
- const writeOptions = appendMode ? { flags: 'a' } : {};
239
- yield (0, promises_1.pipeline)(stream, (0, fs_1.createWriteStream)(dataset.getPath(), writeOptions));
238
+ return DriverHelper_1.default.appendToUnifiedFile(stream, fileUrl, dataset.getPath(), appendMode, headerLine, file.fileType);
240
239
  });
241
240
  const { fileKey } = file;
242
241
  if (fileKey.includes('%')) {
243
242
  const allFileKeys = yield this.listFiles(fileKey);
243
+ // Get header line from the first file
244
+ const firstFileCommand = new client_s3_1.GetObjectCommand({
245
+ Bucket: this._bucketName,
246
+ Key: allFileKeys[0]
247
+ });
248
+ const firstFileResponse = yield this._client.send(firstFileCommand);
249
+ (0, Affirm_1.default)(firstFileResponse.Body, 'Failed to fetch first file from S3');
250
+ const firstFileStream = firstFileResponse.Body;
251
+ const headerLine = yield this.getFirstLineFromStream(firstFileStream);
244
252
  // Download files sequentially to avoid file conflicts
245
253
  for (let i = 0; i < allFileKeys.length; i++) {
246
- yield downloadLocally(allFileKeys[i], i > 0); // Append mode for subsequent files
254
+ yield downloadLocally(allFileKeys[i], headerLine, i > 0); // Append mode for subsequent files
247
255
  }
248
256
  return dataset;
249
257
  }
250
258
  else {
251
- yield downloadLocally(fileKey);
259
+ // For single file, no header validation needed
260
+ yield downloadLocally(fileKey, '');
252
261
  return dataset;
253
262
  }
254
263
  });
264
/**
 * Reads and returns the first line of `stream`, then releases the stream.
 *
 * The stream is consumed through a readline interface. Once the first line has been
 * read (or reading fails) the interface is closed and the stream is destroyed, so an
 * underlying connection is not left open with the rest of the body unread.
 * The stream must not be reused by the caller afterwards.
 *
 * @param stream Readable stream to take the first line from.
 * @returns Promise of the first line without its terminator, or '' when the stream is empty.
 */
this.getFirstLineFromStream = (stream) => __awaiter(this, void 0, void 0, function* () {
    var _a, e_1, _b, _c;
    const rl = readline_1.default.createInterface({ input: stream, crlfDelay: Infinity });
    let firstLine = '';
    try {
        try {
            for (var _d = true, rl_1 = __asyncValues(rl), rl_1_1; rl_1_1 = yield rl_1.next(), _a = rl_1_1.done, !_a; _d = true) {
                _c = rl_1_1.value;
                _d = false;
                const line = _c;
                firstLine = line;
                break;
            }
        }
        catch (e_1_1) { e_1 = { error: e_1_1 }; }
        finally {
            try {
                if (!_d && !_a && (_b = rl_1.return)) yield _b.call(rl_1);
            }
            finally { if (e_1) throw e_1.error; }
        }
    }
    finally {
        // Always release both the interface and the source, including when iteration threw.
        rl.close();
        if (stream && typeof stream.destroy === 'function') {
            stream.destroy();
        }
    }
    return firstLine;
});
255
287
  this.exist = (producer) => __awaiter(this, void 0, void 0, function* () {
256
288
  var _a;
257
289
  (0, Affirm_1.default)(this._client, 'S3 client not yet initialized, call "connect()" first');
@@ -276,7 +308,7 @@ class S3SourceDriver {
276
308
  }
277
309
  });
278
310
  this._readLines = (stream, lineFrom, lineTo) => __awaiter(this, void 0, void 0, function* () {
279
- var _a, e_1, _b, _c;
311
+ var _a, e_2, _b, _c;
280
312
  const reader = readline_1.default.createInterface({ input: stream, crlfDelay: Infinity });
281
313
  const lines = [];
282
314
  let lineCounter = 0;
@@ -300,19 +332,19 @@ class S3SourceDriver {
300
332
  }
301
333
  }
302
334
  }
303
- catch (e_1_1) { e_1 = { error: e_1_1 }; }
335
+ catch (e_2_1) { e_2 = { error: e_2_1 }; }
304
336
  finally {
305
337
  try {
306
338
  if (!_d && !_a && (_b = reader_1.return)) yield _b.call(reader_1);
307
339
  }
308
- finally { if (e_1) throw e_1.error; }
340
+ finally { if (e_2) throw e_2.error; }
309
341
  }
310
342
  reader.close();
311
343
  return lines;
312
344
  });
313
345
  this._readExcelLines = (stream, sheetName, lineFrom, lineTo) => __awaiter(this, void 0, void 0, function* () {
314
346
  var _a, stream_1, stream_1_1;
315
- var _b, e_2, _c, _d;
347
+ var _b, e_3, _c, _d;
316
348
  (0, Affirm_1.default)(sheetName, `Invalid sheetname`);
317
349
  const chunks = [];
318
350
  try {
@@ -323,12 +355,12 @@ class S3SourceDriver {
323
355
  chunks.push(chunk);
324
356
  }
325
357
  }
326
- catch (e_2_1) { e_2 = { error: e_2_1 }; }
358
+ catch (e_3_1) { e_3 = { error: e_3_1 }; }
327
359
  finally {
328
360
  try {
329
361
  if (!_a && !_b && (_c = stream_1.return)) yield _c.call(stream_1);
330
362
  }
331
- finally { if (e_2) throw e_2.error; }
363
+ finally { if (e_3) throw e_3.error; }
332
364
  }
333
365
  const buffer = Buffer.concat(chunks);
334
366
  const excel = xlsx_1.default.read(buffer, { type: 'buffer' });
@@ -343,7 +375,7 @@ class S3SourceDriver {
343
375
  });
344
376
  this._readXmlLines = (stream, lineFrom, lineTo) => __awaiter(this, void 0, void 0, function* () {
345
377
  var _a, stream_2, stream_2_1;
346
- var _b, e_3, _c, _d;
378
+ var _b, e_4, _c, _d;
347
379
  const chunks = [];
348
380
  try {
349
381
  for (_a = true, stream_2 = __asyncValues(stream); stream_2_1 = yield stream_2.next(), _b = stream_2_1.done, !_b; _a = true) {
@@ -353,12 +385,12 @@ class S3SourceDriver {
353
385
  chunks.push(chunk);
354
386
  }
355
387
  }
356
- catch (e_3_1) { e_3 = { error: e_3_1 }; }
388
+ catch (e_4_1) { e_4 = { error: e_4_1 }; }
357
389
  finally {
358
390
  try {
359
391
  if (!_a && !_b && (_c = stream_2.return)) yield _c.call(stream_2);
360
392
  }
361
- finally { if (e_3) throw e_3.error; }
393
+ finally { if (e_4) throw e_4.error; }
362
394
  }
363
395
  const buffer = Buffer.concat(chunks);
364
396
  const jsonData = XMLParser_1.default.xmlToJson(buffer);
@@ -85,6 +85,8 @@ class ExecutionPlannerClas {
85
85
  plan.push(...this._planProducer(producers[0], options));
86
86
  else
87
87
  plan.push(...(producers.flatMap(x => this._planProducer(x, options))));
88
+ // Not strictly required, but kept so that all producer datasets are merged into a single one,
89
+ // which lets the remaining steps of the plan work with a single dataset variable
88
90
  plan.push({ type: 'join-producers-data' });
89
91
  if (consumer.filters && consumer.filters.length > 0)
90
92
  plan.push({ type: 'apply-consumer-filters-on-JSON' });
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@forzalabs/remora",
3
- "version": "0.0.47-nasco.3",
3
+ "version": "0.0.48-nasco.3",
4
4
  "description": "A powerful CLI tool for seamless data translation.",
5
5
  "main": "index.js",
6
6
  "private": false,