@forzalabs/remora 0.0.57-nasco.3 → 0.0.58-nasco.3

This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in their public registries.
package/Constants.js CHANGED
@@ -1,7 +1,7 @@
  "use strict";
  Object.defineProperty(exports, "__esModule", { value: true });
  const CONSTANTS = {
- cliVersion: '0.0.57-nasco',
+ cliVersion: '0.0.58-nasco',
  lambdaVersion: 1,
  port: 5069,
  defaults: {
@@ -28,29 +28,84 @@ const Affirm_1 = __importDefault(require("../core/Affirm"));
  const DriverHelper = {
  appendToUnifiedFile: (options) => __awaiter(void 0, void 0, void 0, function* () {
  (0, Affirm_1.default)(options, 'Invalid options');
- const { append, destinationPath, fileKey, headerLine, stream, fileType, hasHeaderRow } = options;
+ const { append, destinationPath, fileKey, headerLine, stream, fileType, hasHeaderRow, delimiter } = options;
+ const keys = (fileType === 'JSON' || fileType === 'JSONL') ? Object.keys(headerLine) : [];
  const shouldValidateHeader = fileType === 'CSV' || (fileType === 'TXT' && hasHeaderRow === true);
  let isFirstLine = true;
  let hasValidatedHeader = shouldValidateHeader ? false : true;
+ let leftoverData = '';
+ let globalIndex = 0;
  const headerValidationTransform = new stream_1.Transform({
  transform(chunk, encoding, callback) {
- if (!hasValidatedHeader) {
- const chunkStr = chunk.toString();
- const lines = chunkStr.split('\n');
- if (isFirstLine && lines.length > 0) {
- const firstLine = lines[0];
- if (shouldValidateHeader && headerLine && headerLine.trim() !== '' && firstLine.trim() !== headerLine.trim()) {
- const msg = `Error creating unified dataset: file "${fileKey}" has a different header line than the other files in this dataset\n\t-${fileKey}: ${firstLine}\n\t-main: ${headerLine}`;
+ const chunkStr = leftoverData + chunk.toString();
+ const lines = chunkStr.split('\n');
+ // Keep the last line as leftover if it doesn't end with newline
+ leftoverData = lines.pop() || '';
+ const filteredLines = [];
+ for (let i = 0; i < lines.length; i++) {
+ const line = lines[i];
+ // Header validation for first line
+ if (!hasValidatedHeader && isFirstLine && i === 0) {
+ if (shouldValidateHeader && headerLine && headerLine.trim() !== '' && line.trim() !== headerLine.trim()) {
+ const msg = `Error creating unified dataset: file "${fileKey}" has a different header line than the other files in this dataset\n\t-${fileKey}: ${line}\n\t-main: ${headerLine}`;
  Logger_1.default.log(msg);
  return callback(new Error(msg));
  }
  hasValidatedHeader = true;
  isFirstLine = false;
  }
+ // Apply your filtering logic here
+ if (shouldIncludeLine(line, globalIndex)) {
+ filteredLines.push(processLine(line));
+ }
+ globalIndex++;
+ }
+ // Output filtered lines
+ if (filteredLines.length > 0) {
+ const output = filteredLines.join('\n') + '\n';
+ callback(null, Buffer.from(output));
+ }
+ else {
+ callback(null, null); // No data to output
  }
- callback(null, chunk);
+ },
+ flush(callback) {
+ // Process any remaining data
+ if (leftoverData.trim()) {
+ if (shouldIncludeLine(leftoverData, -1)) {
+ callback(null, Buffer.from(processLine(leftoverData)));
+ }
+ else {
+ callback(null, null);
+ }
+ }
+ else {
+ callback(null, null);
+ }
+ globalIndex++;
  }
  });
+ // Helper function to determine if a line should be included
+ const shouldIncludeLine = (line, lineIndex) => {
+ // For flat files (csv, txt) ignore the first line of the header (I already saved that line)
+ if (lineIndex === 0 && shouldValidateHeader)
+ return false;
+ // Skip empty lines
+ if (line.trim() === '')
+ return false;
+ return true;
+ };
+ const processLine = (line) => {
+ switch (fileType) {
+ case 'JSON':
+ case 'JSONL': {
+ const parsed = JSON.parse(line);
+ return keys.map(k => parsed[k]).join(delimiter);
+ }
+ default:
+ return line;
+ }
+ };
  const writeOptions = append ? { flags: 'a' } : {};
  const writeStream = (0, fs_1.createWriteStream)(destinationPath, writeOptions);
  yield (0, promises_1.pipeline)(stream, headerValidationTransform, writeStream);
@@ -135,7 +135,8 @@ class LocalSourceDriver {
  append: appendMode,
  headerLine,
  fileType: file.fileType,
- hasHeaderRow: file.hasHeaderRow
+ hasHeaderRow: file.hasHeaderRow,
+ delimiter: dataset.getDelimiter()
  });
  });
  const { fileKey } = file;
@@ -144,6 +145,7 @@ class LocalSourceDriver {
  Logger_1.default.log(`Matched ${allFileKeys.length} files, copying locally and creating unified dataset.`);
  // Get header line from the first file
  const headerLine = (yield DriverHelper_1.default.quickReadFile(path_1.default.join(this._path, allFileKeys[0]), 1))[0];
+ dataset.setFirstLine(headerLine);
  // Copy files sequentially to avoid file conflicts
  for (let i = 0; i < allFileKeys.length; i++) {
  yield copyLocally(allFileKeys[i], headerLine, i > 0); // Append mode for subsequent files
@@ -152,6 +154,8 @@ class LocalSourceDriver {
  }
  else {
  // For single file, no header validation needed
+ const headerLine = (yield DriverHelper_1.default.quickReadFile(path_1.default.join(this._path, fileKey), 1))[0];
+ dataset.setFirstLine(headerLine);
  yield copyLocally(fileKey, '', false);
  return dataset;
  }
@@ -243,7 +243,8 @@ class S3SourceDriver {
  append: appendMode,
  headerLine,
  fileType: file.fileType,
- hasHeaderRow: file.hasHeaderRow
+ hasHeaderRow: file.hasHeaderRow,
+ delimiter: dataset.getDelimiter()
  });
  });
  const { fileKey } = file;
@@ -259,6 +260,7 @@ class S3SourceDriver {
  (0, Affirm_1.default)(firstFileResponse.Body, 'Failed to fetch first file from S3');
  const firstFileStream = firstFileResponse.Body;
  const headerLine = yield this.getFirstLineFromStream(firstFileStream);
+ dataset.setFirstLine(headerLine);
  // Download files sequentially to avoid file conflicts
  for (let i = 0; i < allFileKeys.length; i++) {
  yield downloadLocally(allFileKeys[i], headerLine, i > 0); // Append mode for subsequent files
@@ -266,6 +268,16 @@ class S3SourceDriver {
  return dataset;
  }
  else {
+ // Get header line from the first file
+ const firstFileCommand = new client_s3_1.GetObjectCommand({
+ Bucket: this._bucketName,
+ Key: fileKey
+ });
+ const firstFileResponse = yield this._client.send(firstFileCommand);
+ (0, Affirm_1.default)(firstFileResponse.Body, 'Failed to fetch first file from S3');
+ const firstFileStream = firstFileResponse.Body;
+ const headerLine = yield this.getFirstLineFromStream(firstFileStream);
+ dataset.setFirstLine(headerLine);
  // For single file, no header validation needed
  yield downloadLocally(fileKey, '');
  return dataset;
@@ -15,6 +15,7 @@ Object.defineProperty(exports, "__esModule", { value: true });
  const Affirm_1 = __importDefault(require("../../core/Affirm"));
  const Algo_1 = __importDefault(require("../../core/Algo"));
  const CryptoEngine_1 = __importDefault(require("../CryptoEngine"));
+ const DatasetManager_1 = __importDefault(require("../dataset/DatasetManager"));
  const DatasetRecord_1 = __importDefault(require("../dataset/DatasetRecord"));
  const Environment_1 = __importDefault(require("../Environment"));
  const FileCompiler_1 = __importDefault(require("../file/FileCompiler"));
@@ -25,6 +26,7 @@ class PostProcessorClass {
  constructor() {
  /**
  * Maps an array of objects and projects it to another array of objects but with different shape:
+ * - updates the dimensions of the dataset (drop, rename, reorder, hide)
  * - type casting
  * - default field values
  * - masking/hashing of data
@@ -33,9 +35,20 @@ class PostProcessorClass {
  (0, Affirm_1.default)(consumer, 'Invalid consumer');
  (0, Affirm_1.default)(dataset, 'Invalid dataset');
  const fields = ConsumerManager_1.default.getExpandedFields(consumer);
- let newDataset = yield dataset.wholeUpdateDimensions(fields);
- newDataset = yield newDataset.map(record => {
+ const dimensionsUpdates = DatasetManager_1.default.computeDimensionsUpdates(dataset, consumer);
+ let updatedDimensions = null;
+ const newDataset = yield dataset.map(record => {
  var _a, _b;
+ // First apply the updates to the dimensions of this record
+ if (dimensionsUpdates.length > 0) {
+ for (const update of dimensionsUpdates) {
+ record.wholeUpdateDimension(update);
+ }
+ record.sortDimensions();
+ }
+ if (!updatedDimensions)
+ updatedDimensions = record._dimensions;
+ // Finally apply the rules and changes of the consumer fields to the record
  for (const field of fields) {
  const { key, alias } = field.cField;
  const fieldKey = alias !== null && alias !== void 0 ? alias : key;
@@ -49,6 +62,7 @@ class PostProcessorClass {
  }
  return record;
  });
+ newDataset.setDimensinons(updatedDimensions);
  return newDataset;
  });
  /**
@@ -37,7 +37,6 @@ const Environment_1 = __importDefault(require("../Environment"));
  class Dataset {
  constructor(name, file, batchSize) {
  var _a;
- this._pipeline = [];
  this.getPath = () => this._path;
  this.setPath = (path) => {
  this._path = path;
@@ -45,11 +44,11 @@ class Dataset {
  };
  this.getFile = () => this._file;
  this.getBatchSize = () => this._batchSize;
- this.setBatchSize = (size) => {
- this._batchSize = size;
- this._recordPool.resize(size);
+ this.setFirstLine = (firstLine) => {
+ this._firstLine = firstLine;
  return this;
  };
+ this.getFirstLine = () => this._firstLine;
  this.getSize = () => this._size;
  this.getCycles = () => this._iterations;
  this.getDelimiter = () => this._delimiter;
@@ -633,25 +632,11 @@ class Dataset {
  this._delimiter = delimiter;
  this._dimensions = dimensions;
  switch (this._file.fileType) {
- case 'TXT': {
- if (this._file.hasHeaderRow)
- yield this.filter((x, i) => i > 0 && !x.isEmpty());
- break;
- }
- case 'CSV': {
- yield this.filter((x, i) => i > 0 && !x.isEmpty());
- break;
- }
+ case 'TXT':
+ case 'CSV':
  case 'JSON':
- case 'JSONL': {
- // Convert the JSON to the internal CSV format
- yield this.map(record => {
- const parsed = JSON.parse(record.getRaw());
- const preparedRow = this._dimensions.map(d => parsed[d.key]).join(this._delimiter);
- return new DatasetRecord_1.default(preparedRow, this._dimensions, this._delimiter);
- });
+ case 'JSONL':
  break;
- }
  case 'XLS':
  case 'XLSX': {
  const excel = xlsx_1.default.readFile(this._path);
@@ -689,6 +674,10 @@ class Dataset {
  return this;
  });
  this.getDimensions = () => this._dimensions;
+ this.setDimensinons = (dimensions) => {
+ this._dimensions = dimensions;
+ return this;
+ };
  /**
  * Update the record pool when dimensions change
  */
@@ -696,55 +685,6 @@ class Dataset {
  // Update all pooled records with current dimensions
  this._recordPool.updateDimensions(this._dimensions, this._delimiter);
  };
- /**
- * - remove dimension
- * - rename a dimension
- * - change hidden flag
- * - move a dimension
- */
- this.wholeUpdateDimensions = (fields) => __awaiter(this, void 0, void 0, function* () {
- var _a;
- let updates = [];
- // Add all the updates
- for (let i = 0; i < fields.length; i++) {
- const { cField } = fields[i];
- const currentMatch = structuredClone(this._dimensions.find(x => x.name === cField.key));
- if (!currentMatch && !cField.fixed)
- throw new Error(`The requested field "${cField.key}" from the consumer is not present in the underlying dataset "${this._name}" (${this._dimensions.map(x => x.name).join(', ')})`);
- updates.push({
- currentDimension: currentMatch,
- newName: (_a = cField.alias) !== null && _a !== void 0 ? _a : cField.key,
- newHidden: cField.hidden,
- newPosition: i,
- toDelete: false
- });
- }
- // Add all the updates to remove dimensions
- for (const dim of this._dimensions) {
- if (!updates.find(x => { var _a; return ((_a = x.currentDimension) === null || _a === void 0 ? void 0 : _a.name) === dim.name; }))
- updates.push({ currentDimension: dim, toDelete: true });
- }
- // Now keep only the updates that actually change something
- updates = updates.filter(x => x.toDelete
- || !x.currentDimension
- || (x.currentDimension && (x.currentDimension.name !== x.newName
- || (Algo_1.default.hasVal(x.newHidden) && x.newHidden !== x.currentDimension.hidden)
- || x.newPosition !== x.currentDimension.index)));
- if (updates.length === 0)
- return this;
- let updatedDimensions = null;
- const newDataset = yield this.map(record => {
- for (const update of updates) {
- record.wholeUpdateDimension(update);
- }
- record._dimensions.sort((a, b) => a.index - b.index);
- if (!updatedDimensions)
- updatedDimensions = record._dimensions;
- return record;
- });
- this._dimensions = updatedDimensions;
- return newDataset;
- });
  this.print = (...args_1) => __awaiter(this, [...args_1], void 0, function* (count = 3, full = false) {
  console.log(`DS ${this._name} (${this._size} | ${this._iterations})`);
  console.log(this._dimensions.map(x => x.name).join(this._delimiter));
@@ -861,11 +801,11 @@ class Dataset {
  this._file = file;
  this._batchSize = (_a = batchSize !== null && batchSize !== void 0 ? batchSize : parseInt(Environment_1.default.get('MAX_ITEMS_IN_MEMORY'))) !== null && _a !== void 0 ? _a : Constants_1.default.defaults.MAX_ITEMS_IN_MEMORY;
  this._dimensions = [];
+ this._firstLine = '';
  this._delimiter = ',';
  this._size = 0;
  this._iterations = 0;
  this._operations = [];
- this._pipeline = [];
  // Initialize record pool for optimization
  this._recordPool = new DatasetRecordPool_1.default(this._batchSize);
  const datasetName = this._name
@@ -13,6 +13,8 @@ var __importDefault = (this && this.__importDefault) || function (mod) {
  };
  Object.defineProperty(exports, "__esModule", { value: true });
  const Affirm_1 = __importDefault(require("../../core/Affirm"));
+ const Algo_1 = __importDefault(require("../../core/Algo"));
+ const ConsumerManager_1 = __importDefault(require("../consumer/ConsumerManager"));
  const Environment_1 = __importDefault(require("../Environment"));
  const FileCompiler_1 = __importDefault(require("../file/FileCompiler"));
  const ParseManager_1 = __importDefault(require("../parsing/ParseManager"));
@@ -34,7 +36,8 @@ class DatasetManagerClass {
  this.buildDimensions = (dataset_1, producer_1, ...args_1) => __awaiter(this, [dataset_1, producer_1, ...args_1], void 0, function* (dataset, producer, discover = false) {
  (0, Affirm_1.default)(dataset, `Invalid dataset`);
  (0, Affirm_1.default)(producer, `Invalid producer`);
- const firstLine = (yield dataset.readLines(1))[0].getRaw();
+ const firstLine = dataset.getFirstLine();
+ (0, Affirm_1.default)(firstLine, `The first line of the dataset was not set.`);
  return this.buildDimensionsFromFirstLine(firstLine, dataset.getFile(), producer, discover);
  });
  this.buildDimensionsFromFirstLine = (firstLine_1, dsFile_1, producer_1, ...args_1) => __awaiter(this, [firstLine_1, dsFile_1, producer_1, ...args_1], void 0, function* (firstLine, dsFile, producer, discover = false) {
@@ -97,6 +100,40 @@ class DatasetManagerClass {
  break;
  }
  });
+ this.computeDimensionsUpdates = (dataset, consumer) => {
+ var _a;
+ (0, Affirm_1.default)(dataset, 'Invalid dataset');
+ (0, Affirm_1.default)(consumer, 'Invalid consumer');
+ const fields = ConsumerManager_1.default.getExpandedFields(consumer);
+ const dimensions = dataset.getDimensions();
+ let updates = [];
+ // Add all the updates
+ for (let i = 0; i < fields.length; i++) {
+ const { cField } = fields[i];
+ const currentMatch = structuredClone(dimensions.find(x => x.name === cField.key));
+ if (!currentMatch && !cField.fixed)
+ throw new Error(`The requested field "${cField.key}" from the consumer is not present in the underlying dataset "${dataset['_name']}" (${dimensions.map(x => x.name).join(', ')})`);
+ updates.push({
+ currentDimension: currentMatch,
+ newName: (_a = cField.alias) !== null && _a !== void 0 ? _a : cField.key,
+ newHidden: cField.hidden,
+ newPosition: i,
+ toDelete: false
+ });
+ }
+ // Add all the updates to remove dimensions
+ for (const dim of dimensions) {
+ if (!updates.find(x => { var _a; return ((_a = x.currentDimension) === null || _a === void 0 ? void 0 : _a.name) === dim.name; }))
+ updates.push({ currentDimension: dim, toDelete: true });
+ }
+ // Now keep only the updates that actually change something
+ updates = updates.filter(x => x.toDelete
+ || !x.currentDimension
+ || (x.currentDimension && (x.currentDimension.name !== x.newName
+ || (Algo_1.default.hasVal(x.newHidden) && x.newHidden !== x.currentDimension.hidden)
+ || x.newPosition !== x.currentDimension.index)));
+ return updates;
+ };
  }
  }
  const DatasetManager = new DatasetManagerClass();
@@ -63,6 +63,12 @@ class DatasetRecord {
  }
  return this;
  };
+ this.sortDimensions = () => {
+ const isOutOfOrder = this._dimensions.some((dim, index) => dim.index !== index);
+ if (isOutOfOrder) {
+ this._dimensions.sort((a, b) => a.index - b.index);
+ }
+ };
  this.toJSON = () => {
  if (this._dimensions.some(x => x.hidden)) {
  // remove the not wanted dimension
@@ -0,0 +1,2 @@
+ "use strict";
+ Object.defineProperty(exports, "__esModule", { value: true });
@@ -0,0 +1,2 @@
+ "use strict";
+ Object.defineProperty(exports, "__esModule", { value: true });
@@ -0,0 +1,2 @@
+ "use strict";
+ Object.defineProperty(exports, "__esModule", { value: true });
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
  "name": "@forzalabs/remora",
- "version": "0.0.57-nasco.3",
+ "version": "0.0.58-nasco.3",
  "description": "A powerful CLI tool for seamless data translation.",
  "main": "index.js",
  "private": false,