@forzalabs/remora 0.0.57-nasco.3 → 0.0.58-nasco.3

This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in their public registries.
package/Constants.js CHANGED
@@ -1,7 +1,7 @@
  "use strict";
  Object.defineProperty(exports, "__esModule", { value: true });
  const CONSTANTS = {
- cliVersion: '0.0.57-nasco',
+ cliVersion: '0.0.58-nasco',
  lambdaVersion: 1,
  port: 5069,
  defaults: {
@@ -28,29 +28,84 @@ const Affirm_1 = __importDefault(require("../core/Affirm"));
  const DriverHelper = {
  appendToUnifiedFile: (options) => __awaiter(void 0, void 0, void 0, function* () {
  (0, Affirm_1.default)(options, 'Invalid options');
- const { append, destinationPath, fileKey, headerLine, stream, fileType, hasHeaderRow } = options;
+ const { append, destinationPath, fileKey, headerLine, stream, fileType, hasHeaderRow, delimiter } = options;
+ const keys = (fileType === 'JSON' || fileType === 'JSONL') ? Object.keys(headerLine) : [];
  const shouldValidateHeader = fileType === 'CSV' || (fileType === 'TXT' && hasHeaderRow === true);
  let isFirstLine = true;
  let hasValidatedHeader = shouldValidateHeader ? false : true;
+ let leftoverData = '';
+ let globalIndex = 0;
  const headerValidationTransform = new stream_1.Transform({
  transform(chunk, encoding, callback) {
- if (!hasValidatedHeader) {
- const chunkStr = chunk.toString();
- const lines = chunkStr.split('\n');
- if (isFirstLine && lines.length > 0) {
- const firstLine = lines[0];
- if (shouldValidateHeader && headerLine && headerLine.trim() !== '' && firstLine.trim() !== headerLine.trim()) {
- const msg = `Error creating unified dataset: file "${fileKey}" has a different header line than the other files in this dataset\n\t-${fileKey}: ${firstLine}\n\t-main: ${headerLine}`;
+ const chunkStr = leftoverData + chunk.toString();
+ const lines = chunkStr.split('\n');
+ // Keep the last line as leftover if it doesn't end with newline
+ leftoverData = lines.pop() || '';
+ const filteredLines = [];
+ for (let i = 0; i < lines.length; i++) {
+ const line = lines[i];
+ // Header validation for first line
+ if (!hasValidatedHeader && isFirstLine && i === 0) {
+ if (shouldValidateHeader && headerLine && headerLine.trim() !== '' && line.trim() !== headerLine.trim()) {
+ const msg = `Error creating unified dataset: file "${fileKey}" has a different header line than the other files in this dataset\n\t-${fileKey}: ${line}\n\t-main: ${headerLine}`;
  Logger_1.default.log(msg);
  return callback(new Error(msg));
  }
  hasValidatedHeader = true;
  isFirstLine = false;
  }
+ // Apply your filtering logic here
+ if (shouldIncludeLine(line, globalIndex)) {
+ filteredLines.push(processLine(line));
+ }
+ globalIndex++;
+ }
+ // Output filtered lines
+ if (filteredLines.length > 0) {
+ const output = filteredLines.join('\n') + '\n';
+ callback(null, Buffer.from(output));
+ }
+ else {
+ callback(null, null); // No data to output
  }
- callback(null, chunk);
+ },
+ flush(callback) {
+ // Process any remaining data
+ if (leftoverData.trim()) {
+ if (shouldIncludeLine(leftoverData, -1)) {
+ callback(null, Buffer.from(processLine(leftoverData)));
+ }
+ else {
+ callback(null, null);
+ }
+ }
+ else {
+ callback(null, null);
+ }
+ globalIndex++;
  }
  });
+ // Helper function to determine if a line should be included
+ const shouldIncludeLine = (line, lineIndex) => {
+ // For flat files (csv, txt) ignore the first line of the header (I already saved that line)
+ if (lineIndex === 0 && shouldValidateHeader)
+ return false;
+ // Skip empty lines
+ if (line.trim() === '')
+ return false;
+ return true;
+ };
+ const processLine = (line) => {
+ switch (fileType) {
+ case 'JSON':
+ case 'JSONL': {
+ const parsed = JSON.parse(line);
+ return keys.map(k => parsed[k]).join(delimiter);
+ }
+ default:
+ return line;
+ }
+ };
  const writeOptions = append ? { flags: 'a' } : {};
  const writeStream = (0, fs_1.createWriteStream)(destinationPath, writeOptions);
  yield (0, promises_1.pipeline)(stream, headerValidationTransform, writeStream);
@@ -135,7 +135,8 @@ class LocalSourceDriver {
  append: appendMode,
  headerLine,
  fileType: file.fileType,
- hasHeaderRow: file.hasHeaderRow
+ hasHeaderRow: file.hasHeaderRow,
+ delimiter: dataset.getDelimiter()
  });
  });
  const { fileKey } = file;
@@ -144,6 +145,7 @@ class LocalSourceDriver {
  Logger_1.default.log(`Matched ${allFileKeys.length} files, copying locally and creating unified dataset.`);
  // Get header line from the first file
  const headerLine = (yield DriverHelper_1.default.quickReadFile(path_1.default.join(this._path, allFileKeys[0]), 1))[0];
+ dataset.setFirstLine(headerLine);
  // Copy files sequentially to avoid file conflicts
  for (let i = 0; i < allFileKeys.length; i++) {
  yield copyLocally(allFileKeys[i], headerLine, i > 0); // Append mode for subsequent files
@@ -152,6 +154,8 @@ class LocalSourceDriver {
  }
  else {
  // For single file, no header validation needed
+ const headerLine = (yield DriverHelper_1.default.quickReadFile(path_1.default.join(this._path, fileKey), 1))[0];
+ dataset.setFirstLine(headerLine);
  yield copyLocally(fileKey, '', false);
  return dataset;
  }
@@ -243,7 +243,8 @@ class S3SourceDriver {
  append: appendMode,
  headerLine,
  fileType: file.fileType,
- hasHeaderRow: file.hasHeaderRow
+ hasHeaderRow: file.hasHeaderRow,
+ delimiter: dataset.getDelimiter()
  });
  });
  const { fileKey } = file;
@@ -259,6 +260,7 @@ class S3SourceDriver {
  (0, Affirm_1.default)(firstFileResponse.Body, 'Failed to fetch first file from S3');
  const firstFileStream = firstFileResponse.Body;
  const headerLine = yield this.getFirstLineFromStream(firstFileStream);
+ dataset.setFirstLine(headerLine);
  // Download files sequentially to avoid file conflicts
  for (let i = 0; i < allFileKeys.length; i++) {
  yield downloadLocally(allFileKeys[i], headerLine, i > 0); // Append mode for subsequent files
@@ -266,6 +268,16 @@ class S3SourceDriver {
  return dataset;
  }
  else {
+ // Get header line from the first file
+ const firstFileCommand = new client_s3_1.GetObjectCommand({
+ Bucket: this._bucketName,
+ Key: fileKey
+ });
+ const firstFileResponse = yield this._client.send(firstFileCommand);
+ (0, Affirm_1.default)(firstFileResponse.Body, 'Failed to fetch first file from S3');
+ const firstFileStream = firstFileResponse.Body;
+ const headerLine = yield this.getFirstLineFromStream(firstFileStream);
+ dataset.setFirstLine(headerLine);
  // For single file, no header validation needed
  yield downloadLocally(fileKey, '');
  return dataset;
@@ -15,6 +15,7 @@ Object.defineProperty(exports, "__esModule", { value: true });
  const Affirm_1 = __importDefault(require("../../core/Affirm"));
  const Algo_1 = __importDefault(require("../../core/Algo"));
  const CryptoEngine_1 = __importDefault(require("../CryptoEngine"));
+ const DatasetManager_1 = __importDefault(require("../dataset/DatasetManager"));
  const DatasetRecord_1 = __importDefault(require("../dataset/DatasetRecord"));
  const Environment_1 = __importDefault(require("../Environment"));
  const FileCompiler_1 = __importDefault(require("../file/FileCompiler"));
@@ -25,6 +26,7 @@ class PostProcessorClass {
  constructor() {
  /**
  * Maps an array of objects and projects it to another array of objects but with different shape:
+ * - updates the dimensions of the dataset (drop, rename, reorder, hide)
  * - type casting
  * - default field values
  * - masking/hashing of data
@@ -33,9 +35,20 @@ class PostProcessorClass {
  (0, Affirm_1.default)(consumer, 'Invalid consumer');
  (0, Affirm_1.default)(dataset, 'Invalid dataset');
  const fields = ConsumerManager_1.default.getExpandedFields(consumer);
- let newDataset = yield dataset.wholeUpdateDimensions(fields);
- newDataset = yield newDataset.map(record => {
+ const dimensionsUpdates = DatasetManager_1.default.computeDimensionsUpdates(dataset, consumer);
+ let updatedDimensions = null;
+ const newDataset = yield dataset.map(record => {
  var _a, _b;
+ // First apply the updates to the dimensions of this record
+ if (dimensionsUpdates.length > 0) {
+ for (const update of dimensionsUpdates) {
+ record.wholeUpdateDimension(update);
+ }
+ record.sortDimensions();
+ }
+ if (!updatedDimensions)
+ updatedDimensions = record._dimensions;
+ // Finally apply the rules and changes of the consumer fields to the record
  for (const field of fields) {
  const { key, alias } = field.cField;
  const fieldKey = alias !== null && alias !== void 0 ? alias : key;
@@ -49,6 +62,7 @@ class PostProcessorClass {
  }
  return record;
  });
+ newDataset.setDimensinons(updatedDimensions);
  return newDataset;
  });
  /**
@@ -37,7 +37,6 @@ const Environment_1 = __importDefault(require("../Environment"));
  class Dataset {
  constructor(name, file, batchSize) {
  var _a;
- this._pipeline = [];
  this.getPath = () => this._path;
  this.setPath = (path) => {
  this._path = path;
@@ -45,11 +44,11 @@ class Dataset {
  };
  this.getFile = () => this._file;
  this.getBatchSize = () => this._batchSize;
- this.setBatchSize = (size) => {
- this._batchSize = size;
- this._recordPool.resize(size);
+ this.setFirstLine = (firstLine) => {
+ this._firstLine = firstLine;
  return this;
  };
+ this.getFirstLine = () => this._firstLine;
  this.getSize = () => this._size;
  this.getCycles = () => this._iterations;
  this.getDelimiter = () => this._delimiter;
@@ -633,25 +632,11 @@ class Dataset {
  this._delimiter = delimiter;
  this._dimensions = dimensions;
  switch (this._file.fileType) {
- case 'TXT': {
- if (this._file.hasHeaderRow)
- yield this.filter((x, i) => i > 0 && !x.isEmpty());
- break;
- }
- case 'CSV': {
- yield this.filter((x, i) => i > 0 && !x.isEmpty());
- break;
- }
+ case 'TXT':
+ case 'CSV':
  case 'JSON':
- case 'JSONL': {
- // Convert the JSON to the internal CSV format
- yield this.map(record => {
- const parsed = JSON.parse(record.getRaw());
- const preparedRow = this._dimensions.map(d => parsed[d.key]).join(this._delimiter);
- return new DatasetRecord_1.default(preparedRow, this._dimensions, this._delimiter);
- });
+ case 'JSONL':
  break;
- }
  case 'XLS':
  case 'XLSX': {
  const excel = xlsx_1.default.readFile(this._path);
@@ -689,6 +674,10 @@ class Dataset {
  return this;
  });
  this.getDimensions = () => this._dimensions;
+ this.setDimensinons = (dimensions) => {
+ this._dimensions = dimensions;
+ return this;
+ };
  /**
  * Update the record pool when dimensions change
  */
@@ -696,55 +685,6 @@ class Dataset {
  // Update all pooled records with current dimensions
  this._recordPool.updateDimensions(this._dimensions, this._delimiter);
  };
- /**
- * - remove dimension
- * - rename a dimension
- * - change hidden flag
- * - move a dimension
- */
- this.wholeUpdateDimensions = (fields) => __awaiter(this, void 0, void 0, function* () {
- var _a;
- let updates = [];
- // Add all the updates
- for (let i = 0; i < fields.length; i++) {
- const { cField } = fields[i];
- const currentMatch = structuredClone(this._dimensions.find(x => x.name === cField.key));
- if (!currentMatch && !cField.fixed)
- throw new Error(`The requested field "${cField.key}" from the consumer is not present in the underlying dataset "${this._name}" (${this._dimensions.map(x => x.name).join(', ')})`);
- updates.push({
- currentDimension: currentMatch,
- newName: (_a = cField.alias) !== null && _a !== void 0 ? _a : cField.key,
- newHidden: cField.hidden,
- newPosition: i,
- toDelete: false
- });
- }
- // Add all the updates to remove dimensions
- for (const dim of this._dimensions) {
- if (!updates.find(x => { var _a; return ((_a = x.currentDimension) === null || _a === void 0 ? void 0 : _a.name) === dim.name; }))
- updates.push({ currentDimension: dim, toDelete: true });
- }
- // Now keep only the updates that actually change something
- updates = updates.filter(x => x.toDelete
- || !x.currentDimension
- || (x.currentDimension && (x.currentDimension.name !== x.newName
- || (Algo_1.default.hasVal(x.newHidden) && x.newHidden !== x.currentDimension.hidden)
- || x.newPosition !== x.currentDimension.index)));
- if (updates.length === 0)
- return this;
- let updatedDimensions = null;
- const newDataset = yield this.map(record => {
- for (const update of updates) {
- record.wholeUpdateDimension(update);
- }
- record._dimensions.sort((a, b) => a.index - b.index);
- if (!updatedDimensions)
- updatedDimensions = record._dimensions;
- return record;
- });
- this._dimensions = updatedDimensions;
- return newDataset;
- });
  this.print = (...args_1) => __awaiter(this, [...args_1], void 0, function* (count = 3, full = false) {
  console.log(`DS ${this._name} (${this._size} | ${this._iterations})`);
  console.log(this._dimensions.map(x => x.name).join(this._delimiter));
@@ -861,11 +801,11 @@ class Dataset {
  this._file = file;
  this._batchSize = (_a = batchSize !== null && batchSize !== void 0 ? batchSize : parseInt(Environment_1.default.get('MAX_ITEMS_IN_MEMORY'))) !== null && _a !== void 0 ? _a : Constants_1.default.defaults.MAX_ITEMS_IN_MEMORY;
  this._dimensions = [];
+ this._firstLine = '';
  this._delimiter = ',';
  this._size = 0;
  this._iterations = 0;
  this._operations = [];
- this._pipeline = [];
  // Initialize record pool for optimization
  this._recordPool = new DatasetRecordPool_1.default(this._batchSize);
  const datasetName = this._name
@@ -13,6 +13,8 @@ var __importDefault = (this && this.__importDefault) || function (mod) {
  };
  Object.defineProperty(exports, "__esModule", { value: true });
  const Affirm_1 = __importDefault(require("../../core/Affirm"));
+ const Algo_1 = __importDefault(require("../../core/Algo"));
+ const ConsumerManager_1 = __importDefault(require("../consumer/ConsumerManager"));
  const Environment_1 = __importDefault(require("../Environment"));
  const FileCompiler_1 = __importDefault(require("../file/FileCompiler"));
  const ParseManager_1 = __importDefault(require("../parsing/ParseManager"));
@@ -34,7 +36,8 @@ class DatasetManagerClass {
  this.buildDimensions = (dataset_1, producer_1, ...args_1) => __awaiter(this, [dataset_1, producer_1, ...args_1], void 0, function* (dataset, producer, discover = false) {
  (0, Affirm_1.default)(dataset, `Invalid dataset`);
  (0, Affirm_1.default)(producer, `Invalid producer`);
- const firstLine = (yield dataset.readLines(1))[0].getRaw();
+ const firstLine = dataset.getFirstLine();
+ (0, Affirm_1.default)(firstLine, `The first line of the dataset was not set.`);
  return this.buildDimensionsFromFirstLine(firstLine, dataset.getFile(), producer, discover);
  });
  this.buildDimensionsFromFirstLine = (firstLine_1, dsFile_1, producer_1, ...args_1) => __awaiter(this, [firstLine_1, dsFile_1, producer_1, ...args_1], void 0, function* (firstLine, dsFile, producer, discover = false) {
@@ -97,6 +100,40 @@ class DatasetManagerClass {
  break;
  }
  });
+ this.computeDimensionsUpdates = (dataset, consumer) => {
+ var _a;
+ (0, Affirm_1.default)(dataset, 'Invalid dataset');
+ (0, Affirm_1.default)(consumer, 'Invalid consumer');
+ const fields = ConsumerManager_1.default.getExpandedFields(consumer);
+ const dimensions = dataset.getDimensions();
+ let updates = [];
+ // Add all the updates
+ for (let i = 0; i < fields.length; i++) {
+ const { cField } = fields[i];
+ const currentMatch = structuredClone(dimensions.find(x => x.name === cField.key));
+ if (!currentMatch && !cField.fixed)
+ throw new Error(`The requested field "${cField.key}" from the consumer is not present in the underlying dataset "${dataset['_name']}" (${dimensions.map(x => x.name).join(', ')})`);
+ updates.push({
+ currentDimension: currentMatch,
+ newName: (_a = cField.alias) !== null && _a !== void 0 ? _a : cField.key,
+ newHidden: cField.hidden,
+ newPosition: i,
+ toDelete: false
+ });
+ }
+ // Add all the updates to remove dimensions
+ for (const dim of dimensions) {
+ if (!updates.find(x => { var _a; return ((_a = x.currentDimension) === null || _a === void 0 ? void 0 : _a.name) === dim.name; }))
+ updates.push({ currentDimension: dim, toDelete: true });
+ }
+ // Now keep only the updates that actually change something
+ updates = updates.filter(x => x.toDelete
+ || !x.currentDimension
+ || (x.currentDimension && (x.currentDimension.name !== x.newName
+ || (Algo_1.default.hasVal(x.newHidden) && x.newHidden !== x.currentDimension.hidden)
+ || x.newPosition !== x.currentDimension.index)));
+ return updates;
+ };
  }
  }
  const DatasetManager = new DatasetManagerClass();
@@ -63,6 +63,12 @@ class DatasetRecord {
  }
  return this;
  };
+ this.sortDimensions = () => {
+ const isOutOfOrder = this._dimensions.some((dim, index) => dim.index !== index);
+ if (isOutOfOrder) {
+ this._dimensions.sort((a, b) => a.index - b.index);
+ }
+ };
  this.toJSON = () => {
  if (this._dimensions.some(x => x.hidden)) {
  // remove the not wanted dimension
@@ -0,0 +1,2 @@
+ "use strict";
+ Object.defineProperty(exports, "__esModule", { value: true });
@@ -0,0 +1,2 @@
+ "use strict";
+ Object.defineProperty(exports, "__esModule", { value: true });
@@ -0,0 +1,2 @@
+ "use strict";
+ Object.defineProperty(exports, "__esModule", { value: true });
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
  "name": "@forzalabs/remora",
- "version": "0.0.57-nasco.3",
+ "version": "0.0.58-nasco.3",
  "description": "A powerful CLI tool for seamless data translation.",
  "main": "index.js",
  "private": false,