@forzalabs/remora 0.0.47-nasco.3 → 0.0.49-nasco.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/Constants.js CHANGED
@@ -1,7 +1,7 @@
1
1
  "use strict";
2
2
  Object.defineProperty(exports, "__esModule", { value: true });
3
3
  const CONSTANTS = {
4
- cliVersion: '0.0.47-nasco',
4
+ cliVersion: '0.0.49-nasco',
5
5
  lambdaVersion: 1,
6
6
  port: 5069,
7
7
  defaults: {
@@ -0,0 +1,88 @@
1
+ "use strict";
2
// TypeScript-emitted helper that runs a generator function as if it were an async function.
// Reuses an existing `this.__awaiter` when one is present; otherwise defines the helper below.
//   thisArg / _arguments: receiver and argument list the generator is applied with.
//   P: promise constructor to use; falls back to the global Promise when falsy.
//   generator: generator function whose yielded values are awaited one by one.
// Returns a promise that resolves with the generator's final return value and rejects
// when stepping the generator throws or when a yielded promise rejects without being
// caught inside the generator.
+ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
3
+ function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
4
+ return new (P || (P = Promise))(function (resolve, reject) {
5
+ function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
6
+ function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
7
+ function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
8
+ step((generator = generator.apply(thisArg, _arguments || [])).next());
9
+ });
10
+ };
11
// TypeScript-emitted helper backing `for await (... of o)`.
// Throws a TypeError when the runtime has no Symbol.asyncIterator.
// If `o` exposes a Symbol.asyncIterator method, its result is returned directly.
// Otherwise a synchronous iterator is obtained from `o` (through `__values` when that
// helper exists, else through o[Symbol.iterator]()) and wrapped so that `next`, `throw`
// and `return` each return a promise of `{ value, done }` with `value` awaited first.
+ var __asyncValues = (this && this.__asyncValues) || function (o) {
12
+ if (!Symbol.asyncIterator) throw new TypeError("Symbol.asyncIterator is not defined.");
13
+ var m = o[Symbol.asyncIterator], i;
14
+ return m ? m.call(o) : (o = typeof __values === "function" ? __values(o) : o[Symbol.iterator](), i = {}, verb("next"), verb("throw"), verb("return"), i[Symbol.asyncIterator] = function () { return this; }, i);
15
+ function verb(n) { i[n] = o[n] && function (v) { return new Promise(function (resolve, reject) { v = o[n](v), settle(resolve, reject, v.done, v.value); }); }; }
16
+ function settle(resolve, reject, d, v) { Promise.resolve(v).then(function(v) { resolve({ value: v, done: d }); }, reject); }
17
+ };
18
// TypeScript-emitted interop helper for default imports.
// Returns `mod` unchanged when it is flagged with `__esModule`; otherwise wraps it as
// `{ default: mod }` so callers can always read `.default`.
+ var __importDefault = (this && this.__importDefault) || function (mod) {
19
+ return (mod && mod.__esModule) ? mod : { "default": mod };
20
+ };
21
+ Object.defineProperty(exports, "__esModule", { value: true });
22
+ const stream_1 = require("stream");
23
+ const readline_1 = require("readline");
24
+ const promises_1 = require("stream/promises");
25
+ const fs_1 = require("fs");
26
+ const Logger_1 = __importDefault(require("../helper/Logger"));
27
+ const Affirm_1 = __importDefault(require("../core/Affirm"));
28
const DriverHelper = {
    /**
     * Streams one source file into the unified dataset file, optionally checking that its
     * first line matches the header of the dataset.
     *
     * The header is validated only when the file type carries a header (CSV, or TXT with
     * `hasHeaderRow === true`) AND a non-blank `headerLine` is supplied. Incoming chunks are
     * held back until the first newline has been seen (or the stream ends), so a header that
     * is split across several chunks is compared as a whole line instead of being reported
     * as a mismatch. All bytes are forwarded to the destination unchanged.
     *
     * NOTE(review): the header row of appended files is written as-is, so the destination
     * receives one header line per source file — confirm the consumer of the unified file
     * expects that.
     *
     * @param {object} options
     * @param {import('stream').Readable} options.stream - source of the file content.
     * @param {string} options.fileKey - name of the source file, used in the error message.
     * @param {string} options.destinationPath - path of the unified file to write to.
     * @param {boolean} [options.append] - append to the destination instead of truncating it.
     * @param {string} [options.headerLine] - expected header; blank/empty disables validation.
     * @param {string} options.fileType - file type; 'CSV' and 'TXT' can carry a header.
     * @param {boolean} [options.hasHeaderRow] - whether a TXT file starts with a header row.
     * @returns {Promise<void>} resolves once the destination stream has finished.
     * @throws {Error} when the first line differs from `headerLine`, or on any stream error.
     */
    appendToUnifiedFile: async (options) => {
        (0, Affirm_1.default)(options, 'Invalid options');
        const { append, destinationPath, fileKey, headerLine, stream, fileType, hasHeaderRow } = options;
        const shouldValidateHeader = fileType === 'CSV' || (fileType === 'TXT' && hasHeaderRow === true);
        const expectedHeader = headerLine ? headerLine.trim() : '';
        // Upper bound on what is held in memory while waiting for the first newline, so a
        // file without any line break cannot be buffered entirely.
        const maxHeaderBytes = 1024 * 1024;
        const NEWLINE = 0x0a;
        let headerPending = shouldValidateHeader && expectedHeader !== '';
        let pendingChunks = [];
        let pendingBytes = 0;
        // Compares the buffered first line with the expected header, then either fails the
        // stream or releases everything buffered so far downstream.
        const releaseValidated = (callback) => {
            const buffered = Buffer.concat(pendingChunks);
            pendingChunks = [];
            pendingBytes = 0;
            headerPending = false;
            const newlineAt = buffered.indexOf(NEWLINE);
            // Decode only after slicing at the newline byte: 0x0A never occurs inside a
            // multi-byte UTF-8 sequence, so the line is never cut mid-character.
            const firstLine = (newlineAt === -1 ? buffered : buffered.subarray(0, newlineAt)).toString();
            if (firstLine.trim() !== expectedHeader) {
                const msg = `Error creating unified dataset: file "${fileKey}" has a different header line than the other files in this dataset\n\t-${fileKey}: ${firstLine}\n\t-main: ${headerLine}`;
                Logger_1.default.log(msg);
                return callback(new Error(msg));
            }
            return callback(null, buffered);
        };
        const headerValidationTransform = new stream_1.Transform({
            transform(chunk, encoding, callback) {
                if (!headerPending) {
                    return callback(null, chunk);
                }
                pendingChunks.push(chunk);
                pendingBytes += chunk.length;
                if (chunk.indexOf(NEWLINE) !== -1 || pendingBytes >= maxHeaderBytes) {
                    return releaseValidated(callback);
                }
                // First line still incomplete: wait for more data before emitting anything.
                return callback();
            },
            flush(callback) {
                // Stream ended before any newline was seen: the whole content is the first line.
                if (headerPending && pendingBytes > 0) {
                    return releaseValidated(callback);
                }
                return callback();
            }
        });
        const writeOptions = append ? { flags: 'a' } : {};
        const writeStream = fs_1.createWriteStream(destinationPath, writeOptions);
        await promises_1.pipeline(stream, headerValidationTransform, writeStream);
    },
    /**
     * Reads at most `lineCount` lines from the beginning of a text file.
     *
     * @param {string} filePath - path of the file to read.
     * @param {number} [lineCount] - maximum number of lines to return; when omitted the
     *   whole file is read, and a value <= 0 yields an empty array without opening the file.
     * @returns {Promise<string[]>} the lines read, without line terminators.
     * @throws {Error} when the file cannot be opened or read (e.g. ENOENT).
     */
    quickReadFile: async (filePath, lineCount) => {
        const limit = lineCount ?? Infinity;
        if (limit <= 0) {
            return [];
        }
        const fileStream = fs_1.createReadStream(filePath);
        const rl = readline_1.createInterface({ input: fileStream, crlfDelay: Infinity });
        // Not every Node version forwards input-stream errors to the readline iterator;
        // without a listener an open/read failure would surface as an unhandled 'error' event.
        let streamError = null;
        fileStream.once('error', (err) => {
            streamError = err;
            rl.close();
        });
        const lines = [];
        try {
            for await (const line of rl) {
                lines.push(line);
                if (lines.length >= limit) {
                    break;
                }
            }
        }
        finally {
            // Always release the reader and the file descriptor, also when iteration throws.
            rl.close();
            fileStream.destroy();
        }
        if (streamError) {
            throw streamError;
        }
        return lines;
    }
};
88
+ exports.default = DriverHelper;
@@ -64,6 +64,7 @@ const Helper_1 = __importDefault(require("../helper/Helper"));
64
64
  const ParseHelper_1 = __importDefault(require("../engines/parsing/ParseHelper"));
65
65
  const FileExporter_1 = __importDefault(require("../engines/file/FileExporter"));
66
66
  const Logger_1 = __importDefault(require("../helper/Logger"));
67
+ const DriverHelper_1 = __importDefault(require("./DriverHelper"));
67
68
  class LocalSourceDriver {
68
69
  constructor() {
69
70
  this.init = (source) => __awaiter(this, void 0, void 0, function* () {
@@ -82,12 +83,15 @@ class LocalSourceDriver {
82
83
  const { fileKey } = request;
83
84
  if (fileKey.includes('%')) {
84
85
  const allFileKeys = this.listFiles(fileKey);
85
- const promises = allFileKeys.map((x, i) => this._get(Object.assign(Object.assign({}, request), { fileKey: x }), i));
86
+ Logger_1.default.log(`Matched ${allFileKeys.length} files, copying to locally and creating unified dataset.`);
87
+ const firstPath = path_1.default.join(this._path, allFileKeys[0]);
88
+ const headerLine = (yield DriverHelper_1.default.quickReadFile(firstPath, 1))[0];
89
+ const promises = allFileKeys.map((x, i) => this._get(Object.assign(Object.assign({}, request), { fileKey: x }), headerLine, i));
86
90
  const results = yield Promise.all(promises);
87
91
  return results.flat();
88
92
  }
89
93
  else {
90
- return yield this._get(request);
94
+ return yield this._get(request, '');
91
95
  }
92
96
  });
93
97
  this.readLinesInRange = (request) => __awaiter(this, void 0, void 0, function* () {
@@ -101,12 +105,15 @@ class LocalSourceDriver {
101
105
  const { fileKey } = request;
102
106
  if (fileKey.includes('%')) {
103
107
  const allFileKeys = this.listFiles(fileKey);
104
- const promises = allFileKeys.map((x, i) => this._get(Object.assign(Object.assign({}, request), { fileKey: x }), i));
108
+ Logger_1.default.log(`Matched ${allFileKeys.length} files, copying to locally and creating unified dataset.`);
109
+ const firstPath = path_1.default.join(this._path, allFileKeys[0]);
110
+ const headerLine = (yield DriverHelper_1.default.quickReadFile(firstPath, 1))[0];
111
+ const promises = allFileKeys.map((x, i) => this._get(Object.assign(Object.assign({}, request), { fileKey: x }), headerLine, i));
105
112
  const results = yield Promise.all(promises);
106
113
  return results.flat();
107
114
  }
108
115
  else {
109
- return yield this._get(request);
116
+ return yield this._get(request, '');
110
117
  }
111
118
  });
112
119
  this.download = (dataset) => __awaiter(this, void 0, void 0, function* () {
@@ -116,31 +123,36 @@ class LocalSourceDriver {
116
123
  (0, Affirm_1.default)(file, 'Invalid dataset file');
117
124
  (0, Affirm_1.default)(file.fileKey, 'Invalid file key');
118
125
  (0, Affirm_1.default)(file.fileType, `Invalid file type`);
119
- const copyLocally = (fileKey_1, ...args_1) => __awaiter(this, [fileKey_1, ...args_1], void 0, function* (fileKey, appendMode = false) {
126
+ const copyLocally = (fileKey_1, headerLine_1, ...args_1) => __awaiter(this, [fileKey_1, headerLine_1, ...args_1], void 0, function* (fileKey, headerLine, appendMode = false) {
120
127
  const sourceFilePath = path_1.default.join(this._path, fileKey);
121
128
  (0, Affirm_1.default)(fs.existsSync(sourceFilePath), `Source file does not exist: ${sourceFilePath}`);
122
- const writeOptions = appendMode ? { flags: 'a' } : {};
129
+ // Copy and validate header in a single stream pass
123
130
  const readStream = fs.createReadStream(sourceFilePath);
124
- const writeStream = fs.createWriteStream(dataset.getPath(), writeOptions);
125
- return new Promise((resolve, reject) => {
126
- readStream.pipe(writeStream);
127
- writeStream.on('finish', resolve);
128
- writeStream.on('error', reject);
129
- readStream.on('error', reject);
131
+ return DriverHelper_1.default.appendToUnifiedFile({
132
+ stream: readStream,
133
+ fileKey,
134
+ destinationPath: dataset.getPath(),
135
+ append: appendMode,
136
+ headerLine,
137
+ fileType: file.fileType,
138
+ hasHeaderRow: file.hasHeaderRow
130
139
  });
131
140
  });
132
141
  const { fileKey } = file;
133
142
  if (fileKey.includes('%')) {
134
143
  const allFileKeys = this.listFiles(fileKey);
135
- Logger_1.default.log(`Matched ${allFileKeys.length} files, copying to locally and creating unified dataset.`);
144
+ Logger_1.default.log(`Matched ${allFileKeys.length} files, copying locally and creating unified dataset.`);
145
+ // Get header line from the first file
146
+ const headerLine = (yield DriverHelper_1.default.quickReadFile(path_1.default.join(this._path, allFileKeys[0]), 1))[0];
136
147
  // Copy files sequentially to avoid file conflicts
137
148
  for (let i = 0; i < allFileKeys.length; i++) {
138
- yield copyLocally(allFileKeys[i], i > 0); // Append mode for subsequent files
149
+ yield copyLocally(allFileKeys[i], headerLine, i > 0); // Append mode for subsequent files
139
150
  }
140
151
  return dataset;
141
152
  }
142
153
  else {
143
- yield copyLocally(fileKey);
154
+ // For single file, no header validation needed
155
+ yield copyLocally(fileKey, '', false);
144
156
  return dataset;
145
157
  }
146
158
  });
@@ -224,7 +236,7 @@ class LocalSourceDriver {
224
236
  }
225
237
  return lines;
226
238
  });
227
- this._get = (request, index) => __awaiter(this, void 0, void 0, function* () {
239
+ this._get = (request, headerLine, index) => __awaiter(this, void 0, void 0, function* () {
228
240
  const { fileKey, fileType, options } = request;
229
241
  let lineFrom, lineTo, sheetName, hasHeaderRow;
230
242
  if (options) {
@@ -259,6 +271,12 @@ class LocalSourceDriver {
259
271
  lines = yield this._readXmlLines(fileUrl);
260
272
  break;
261
273
  }
274
+ const firstLine = lines[0];
275
+ if (headerLine && headerLine.trim() !== '' && firstLine.trim() !== headerLine.trim()) {
276
+ const msg = `Error creating unified dataset: file "${fileKey}" has a different header line than the other files in this dataset\n\t-${fileKey}: ${firstLine}\n\t-main: ${headerLine}`;
277
+ Logger_1.default.log(msg);
278
+ throw new Error(msg);
279
+ }
262
280
  // If this is not the first file read in a pattern match AND the file type has an header,
263
281
  // then I need to remove the header from the resulting lines or the header will be duplicated
264
282
  if (index > 0 && ParseHelper_1.default.shouldHaveHeader(fileType, hasHeaderRow)) {
@@ -29,9 +29,9 @@ const xlsx_1 = __importDefault(require("xlsx"));
29
29
  const XMLParser_1 = __importDefault(require("../engines/parsing/XMLParser")); // Added XMLParser import
30
30
  const Helper_1 = __importDefault(require("../helper/Helper"));
31
31
  const ParseHelper_1 = __importDefault(require("../engines/parsing/ParseHelper"));
32
- const promises_1 = require("stream/promises");
33
- const fs_1 = require("fs");
34
32
  const FileExporter_1 = __importDefault(require("../engines/file/FileExporter"));
33
+ const DriverHelper_1 = __importDefault(require("./DriverHelper"));
34
+ const Logger_1 = __importDefault(require("../helper/Logger"));
35
35
  class S3DestinationDriver {
36
36
  constructor() {
37
37
  this.init = (source) => __awaiter(this, void 0, void 0, function* () {
@@ -227,7 +227,8 @@ class S3SourceDriver {
227
227
  (0, Affirm_1.default)(file, 'Invalid dataset file');
228
228
  (0, Affirm_1.default)(file.fileKey, 'Invalid file key');
229
229
  (0, Affirm_1.default)(file.fileType, `Invalid file type`);
230
- const downloadLocally = (fileUrl_1, ...args_1) => __awaiter(this, [fileUrl_1, ...args_1], void 0, function* (fileUrl, appendMode = false) {
230
+ const downloadLocally = (fileUrl_1, headerLine_1, ...args_1) => __awaiter(this, [fileUrl_1, headerLine_1, ...args_1], void 0, function* (fileUrl, headerLine, appendMode = false) {
231
+ // Download and validate header in a single stream pass
231
232
  const command = new client_s3_1.GetObjectCommand({
232
233
  Bucket: this._bucketName,
233
234
  Key: fileUrl
@@ -235,23 +236,64 @@ class S3SourceDriver {
235
236
  const response = yield this._client.send(command);
236
237
  (0, Affirm_1.default)(response.Body, 'Failed to fetch object from S3');
237
238
  const stream = response.Body;
238
- const writeOptions = appendMode ? { flags: 'a' } : {};
239
- yield (0, promises_1.pipeline)(stream, (0, fs_1.createWriteStream)(dataset.getPath(), writeOptions));
239
+ return DriverHelper_1.default.appendToUnifiedFile({
240
+ stream,
241
+ fileKey: fileUrl,
242
+ destinationPath: dataset.getPath(),
243
+ append: appendMode,
244
+ headerLine,
245
+ fileType: file.fileType,
246
+ hasHeaderRow: file.hasHeaderRow
247
+ });
240
248
  });
241
249
  const { fileKey } = file;
242
250
  if (fileKey.includes('%')) {
243
251
  const allFileKeys = yield this.listFiles(fileKey);
252
+ Logger_1.default.log(`Matched ${allFileKeys.length} files, copying locally and creating unified dataset.`);
253
+ // Get header line from the first file
254
+ const firstFileCommand = new client_s3_1.GetObjectCommand({
255
+ Bucket: this._bucketName,
256
+ Key: allFileKeys[0]
257
+ });
258
+ const firstFileResponse = yield this._client.send(firstFileCommand);
259
+ (0, Affirm_1.default)(firstFileResponse.Body, 'Failed to fetch first file from S3');
260
+ const firstFileStream = firstFileResponse.Body;
261
+ const headerLine = yield this.getFirstLineFromStream(firstFileStream);
244
262
  // Download files sequentially to avoid file conflicts
245
263
  for (let i = 0; i < allFileKeys.length; i++) {
246
- yield downloadLocally(allFileKeys[i], i > 0); // Append mode for subsequent files
264
+ yield downloadLocally(allFileKeys[i], headerLine, i > 0); // Append mode for subsequent files
247
265
  }
248
266
  return dataset;
249
267
  }
250
268
  else {
251
- yield downloadLocally(fileKey);
269
+ // For single file, no header validation needed
270
+ yield downloadLocally(fileKey, '');
252
271
  return dataset;
253
272
  }
254
273
  });
274
/**
 * Reads the first line of a readable stream and then releases the stream.
 *
 * The stream is destroyed once the first line has been read (or reading failed), so the
 * remainder of the content is not left pending; readline has already buffered part of
 * the input at that point, so the stream could not be reliably reused anyway.
 * NOTE(review): at the visible call site the stream is an S3 GetObject body fetched only
 * for this purpose — confirm no other caller expects to keep reading from it.
 *
 * @param {import('stream').Readable} stream - stream to read the first line from.
 * @returns {Promise<string>} the first line without its terminator, or '' for an empty stream.
 * @throws {Error} when the stream emits an error before the first line is available.
 */
this.getFirstLineFromStream = async (stream) => {
    const rl = readline_1.default.createInterface({ input: stream, crlfDelay: Infinity });
    // Not every Node version forwards input-stream errors to the readline iterator;
    // capture them here so they reject this call instead of going unhandled.
    let streamError = null;
    stream.once('error', (err) => {
        streamError = err;
        rl.close();
    });
    let firstLine = '';
    try {
        for await (const line of rl) {
            firstLine = line;
            break;
        }
    }
    finally {
        rl.close();
        // Closing the readline interface does not close its input: destroy it explicitly
        // so the underlying download/connection is released.
        if (typeof stream.destroy === 'function') {
            stream.destroy();
        }
    }
    if (streamError) {
        throw streamError;
    }
    return firstLine;
};
255
297
  this.exist = (producer) => __awaiter(this, void 0, void 0, function* () {
256
298
  var _a;
257
299
  (0, Affirm_1.default)(this._client, 'S3 client not yet initialized, call "connect()" first');
@@ -276,7 +318,7 @@ class S3SourceDriver {
276
318
  }
277
319
  });
278
320
  this._readLines = (stream, lineFrom, lineTo) => __awaiter(this, void 0, void 0, function* () {
279
- var _a, e_1, _b, _c;
321
+ var _a, e_2, _b, _c;
280
322
  const reader = readline_1.default.createInterface({ input: stream, crlfDelay: Infinity });
281
323
  const lines = [];
282
324
  let lineCounter = 0;
@@ -300,19 +342,19 @@ class S3SourceDriver {
300
342
  }
301
343
  }
302
344
  }
303
- catch (e_1_1) { e_1 = { error: e_1_1 }; }
345
+ catch (e_2_1) { e_2 = { error: e_2_1 }; }
304
346
  finally {
305
347
  try {
306
348
  if (!_d && !_a && (_b = reader_1.return)) yield _b.call(reader_1);
307
349
  }
308
- finally { if (e_1) throw e_1.error; }
350
+ finally { if (e_2) throw e_2.error; }
309
351
  }
310
352
  reader.close();
311
353
  return lines;
312
354
  });
313
355
  this._readExcelLines = (stream, sheetName, lineFrom, lineTo) => __awaiter(this, void 0, void 0, function* () {
314
356
  var _a, stream_1, stream_1_1;
315
- var _b, e_2, _c, _d;
357
+ var _b, e_3, _c, _d;
316
358
  (0, Affirm_1.default)(sheetName, `Invalid sheetname`);
317
359
  const chunks = [];
318
360
  try {
@@ -323,12 +365,12 @@ class S3SourceDriver {
323
365
  chunks.push(chunk);
324
366
  }
325
367
  }
326
- catch (e_2_1) { e_2 = { error: e_2_1 }; }
368
+ catch (e_3_1) { e_3 = { error: e_3_1 }; }
327
369
  finally {
328
370
  try {
329
371
  if (!_a && !_b && (_c = stream_1.return)) yield _c.call(stream_1);
330
372
  }
331
- finally { if (e_2) throw e_2.error; }
373
+ finally { if (e_3) throw e_3.error; }
332
374
  }
333
375
  const buffer = Buffer.concat(chunks);
334
376
  const excel = xlsx_1.default.read(buffer, { type: 'buffer' });
@@ -343,7 +385,7 @@ class S3SourceDriver {
343
385
  });
344
386
  this._readXmlLines = (stream, lineFrom, lineTo) => __awaiter(this, void 0, void 0, function* () {
345
387
  var _a, stream_2, stream_2_1;
346
- var _b, e_3, _c, _d;
388
+ var _b, e_4, _c, _d;
347
389
  const chunks = [];
348
390
  try {
349
391
  for (_a = true, stream_2 = __asyncValues(stream); stream_2_1 = yield stream_2.next(), _b = stream_2_1.done, !_b; _a = true) {
@@ -353,12 +395,12 @@ class S3SourceDriver {
353
395
  chunks.push(chunk);
354
396
  }
355
397
  }
356
- catch (e_3_1) { e_3 = { error: e_3_1 }; }
398
+ catch (e_4_1) { e_4 = { error: e_4_1 }; }
357
399
  finally {
358
400
  try {
359
401
  if (!_a && !_b && (_c = stream_2.return)) yield _c.call(stream_2);
360
402
  }
361
- finally { if (e_3) throw e_3.error; }
403
+ finally { if (e_4) throw e_4.error; }
362
404
  }
363
405
  const buffer = Buffer.concat(chunks);
364
406
  const jsonData = XMLParser_1.default.xmlToJson(buffer);
@@ -85,6 +85,8 @@ class ExecutionPlannerClas {
85
85
  plan.push(...this._planProducer(producers[0], options));
86
86
  else
87
87
  plan.push(...(producers.flatMap(x => this._planProducer(x, options))));
88
+ // I technically don't need this, but I keep it to merge all the datasets to a single one
89
+ // so the other steps of the plan can work with a single dataset variable
88
90
  plan.push({ type: 'join-producers-data' });
89
91
  if (consumer.filters && consumer.filters.length > 0)
90
92
  plan.push({ type: 'apply-consumer-filters-on-JSON' });
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@forzalabs/remora",
3
- "version": "0.0.47-nasco.3",
3
+ "version": "0.0.49-nasco.3",
4
4
  "description": "A powerful CLI tool for seamless data translation.",
5
5
  "main": "index.js",
6
6
  "private": false,