@forzalabs/remora 0.0.57-nasco.3 → 0.0.59-nasco.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/Constants.js +4 -2
- package/drivers/DriverHelper.js +64 -9
- package/drivers/LocalDriver.js +8 -3
- package/drivers/S3Driver.js +31 -16
- package/engines/consumer/PostProcessor.js +16 -2
- package/engines/dataset/Dataset.js +11 -71
- package/engines/dataset/DatasetManager.js +38 -1
- package/engines/dataset/DatasetRecord.js +6 -0
- package/engines/dataset/TransformWorker.js +2 -0
- package/engines/dataset/example-parallel-transform.js +2 -0
- package/engines/dataset/test-parallel.js +2 -0
- package/engines/file/FileExporter.js +20 -4
- package/helper/Runtime.js +20 -0
- package/index.js +9 -0
- package/package.json +1 -1
package/Constants.js
CHANGED
@@ -1,14 +1,16 @@
 "use strict";
 Object.defineProperty(exports, "__esModule", { value: true });
 const CONSTANTS = {
-    cliVersion: '0.0.
+    cliVersion: '0.0.59-nasco',
     lambdaVersion: 1,
     port: 5069,
     defaults: {
         PRODUCER_TEMP_FOLDER: '.temp',
         SQL_MAX_QUERY_ROWS: 10000,
         STRING_MAX_CHARACTERS_LENGTH: 10000000,
-        MAX_ITEMS_IN_MEMORY: 200000
+        MAX_ITEMS_IN_MEMORY: 200000,
+        MIN_RUNTIME_HEAP_MB: 4000,
+        RECOMMENDED_RUNTIME_HEAP_MB: 8000
     }
 };
 exports.default = CONSTANTS;
package/drivers/DriverHelper.js
CHANGED
@@ -28,29 +28,84 @@ const Affirm_1 = __importDefault(require("../core/Affirm"));
 const DriverHelper = {
     appendToUnifiedFile: (options) => __awaiter(void 0, void 0, void 0, function* () {
         (0, Affirm_1.default)(options, 'Invalid options');
-        const { append, destinationPath, fileKey, headerLine, stream, fileType, hasHeaderRow } = options;
+        const { append, destinationPath, fileKey, headerLine, stream, fileType, hasHeaderRow, delimiter } = options;
+        const keys = (fileType === 'JSON' || fileType === 'JSONL') ? Object.keys(headerLine) : [];
         const shouldValidateHeader = fileType === 'CSV' || (fileType === 'TXT' && hasHeaderRow === true);
         let isFirstLine = true;
         let hasValidatedHeader = shouldValidateHeader ? false : true;
+        let leftoverData = '';
+        let globalIndex = 0;
         const headerValidationTransform = new stream_1.Transform({
             transform(chunk, encoding, callback) {
-
-
-
-
-
-
-
+                const chunkStr = leftoverData + chunk.toString();
+                const lines = chunkStr.split('\n');
+                // Keep the last line as leftover if it doesn't end with newline
+                leftoverData = lines.pop() || '';
+                const filteredLines = [];
+                for (let i = 0; i < lines.length; i++) {
+                    const line = lines[i];
+                    // Header validation for first line
+                    if (!hasValidatedHeader && isFirstLine && i === 0) {
+                        if (shouldValidateHeader && headerLine && headerLine.trim() !== '' && line.trim() !== headerLine.trim()) {
+                            const msg = `Error creating unified dataset: file "${fileKey}" has a different header line than the other files in this dataset\n\t-${fileKey}: ${line}\n\t-main: ${headerLine}`;
                             Logger_1.default.log(msg);
                             return callback(new Error(msg));
                         }
                         hasValidatedHeader = true;
                         isFirstLine = false;
                     }
+                    // Apply your filtering logic here
+                    if (shouldIncludeLine(line, globalIndex)) {
+                        filteredLines.push(processLine(line));
+                    }
+                    globalIndex++;
+                }
+                // Output filtered lines
+                if (filteredLines.length > 0) {
+                    const output = filteredLines.join('\n') + '\n';
+                    callback(null, Buffer.from(output));
+                }
+                else {
+                    callback(null, null); // No data to output
                 }
-
+            },
+            flush(callback) {
+                // Process any remaining data
+                if (leftoverData.trim()) {
+                    if (shouldIncludeLine(leftoverData, -1)) {
+                        callback(null, Buffer.from(processLine(leftoverData)));
+                    }
+                    else {
+                        callback(null, null);
+                    }
+                }
+                else {
+                    callback(null, null);
+                }
+                globalIndex++;
             }
         });
+        // Helper function to determine if a line should be included
+        const shouldIncludeLine = (line, lineIndex) => {
+            // For flat files (csv, txt) ignore the first line of the header (I already saved that line)
+            if (lineIndex === 0 && shouldValidateHeader)
+                return false;
+            // Skip empty lines
+            if (line.trim() === '')
+                return false;
+            return true;
+        };
+        const processLine = (line) => {
+            switch (fileType) {
+                case 'JSON':
+                case 'JSONL': {
+                    const parsed = JSON.parse(line);
+                    return keys.map(k => parsed[k]).join(delimiter);
+                }
+                default:
+                    return line;
+            }
+        };
         const writeOptions = append ? { flags: 'a' } : {};
         const writeStream = (0, fs_1.createWriteStream)(destinationPath, writeOptions);
         yield (0, promises_1.pipeline)(stream, headerValidationTransform, writeStream);
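Note: the reworked appendToUnifiedFile above relies on a leftover-buffer pattern: a chunk may end mid-line, so the trailing partial line is carried over to the next chunk and flushed once the source ends. A minimal standalone sketch of that pattern (the helper name and file paths here are hypothetical, not part of the package's API):

    // Sketch: split a byte stream into lines, carrying the trailing partial line
    // between chunks and dropping empty lines, as the driver helper above does.
    const { Transform } = require('node:stream');

    const lineFilter = () => {
        let leftover = '';
        return new Transform({
            transform(chunk, _encoding, callback) {
                const lines = (leftover + chunk.toString()).split('\n');
                leftover = lines.pop() || ''; // partial last line waits for the next chunk
                const kept = lines.filter(line => line.trim() !== '');
                callback(null, kept.length > 0 ? kept.join('\n') + '\n' : null);
            },
            flush(callback) {
                // emit whatever remains once the source stream ends
                callback(null, leftover.trim() !== '' ? leftover + '\n' : null);
            }
        });
    };

    // Hypothetical usage with the same pipeline helper the driver uses:
    // const { createReadStream, createWriteStream } = require('node:fs');
    // const { pipeline } = require('node:stream/promises');
    // await pipeline(createReadStream('in.csv'), lineFilter(), createWriteStream('out.csv'));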
package/drivers/LocalDriver.js
CHANGED
@@ -135,7 +135,8 @@ class LocalSourceDriver {
             append: appendMode,
             headerLine,
             fileType: file.fileType,
-            hasHeaderRow: file.hasHeaderRow
+            hasHeaderRow: file.hasHeaderRow,
+            delimiter: dataset.getDelimiter()
         });
     });
     const { fileKey } = file;
@@ -144,6 +145,7 @@ class LocalSourceDriver {
         Logger_1.default.log(`Matched ${allFileKeys.length} files, copying locally and creating unified dataset.`);
         // Get header line from the first file
         const headerLine = (yield DriverHelper_1.default.quickReadFile(path_1.default.join(this._path, allFileKeys[0]), 1))[0];
+        dataset.setFirstLine(headerLine);
         // Copy files sequentially to avoid file conflicts
         for (let i = 0; i < allFileKeys.length; i++) {
             yield copyLocally(allFileKeys[i], headerLine, i > 0); // Append mode for subsequent files
@@ -152,6 +154,8 @@ class LocalSourceDriver {
     }
     else {
         // For single file, no header validation needed
+        const headerLine = (yield DriverHelper_1.default.quickReadFile(path_1.default.join(this._path, fileKey), 1))[0];
+        dataset.setFirstLine(headerLine);
         yield copyLocally(fileKey, '', false);
         return dataset;
     }
@@ -392,8 +396,9 @@ class LocalDestinationDriver {
         const filePath = path_1.default.join(folder, options.name);
         fs.writeFileSync(filePath, '');
         yield dataset.streamBatches((batch) => __awaiter(this, void 0, void 0, function* () {
-            const
-
+            const chunks = FileExporter_1.default.prepareBatch(batch, options);
+            for (const chunk of chunks)
+                fs.appendFileSync(filePath, chunk);
         }));
         return { bucket: folder, key: filePath, res: true };
     }
package/drivers/S3Driver.js
CHANGED
@@ -129,21 +129,24 @@ class S3DestinationDriver {
         const uploadId = createMultipartUploadRes.UploadId;
         (0, Affirm_1.default)(uploadId, 'Failed to initiate multipart upload');
         const uploadedParts = [];
-
-
-        const
-        const
-
-
-
-
-
-
-
-
-
-
+        let partNumber = 1;
+        yield dataset.streamBatches((batch) => __awaiter(this, void 0, void 0, function* () {
+            const chunks = FileExporter_1.default.prepareBatch(batch, options);
+            for (const chunk of chunks) {
+                const body = Buffer.from(chunk);
+                const uploadPartRes = yield this._client.send(new client_s3_1.UploadPartCommand({
+                    Bucket: this._bucketName,
+                    Key: name,
+                    UploadId: uploadId,
+                    PartNumber: partNumber,
+                    Body: body
+                }));
+                uploadedParts.push({
+                    PartNumber: partNumber,
+                    ETag: uploadPartRes.ETag
+                });
+                partNumber++;
+            }
         }));
         // Complete the multipart upload
         const completeRes = yield this._client.send(new client_s3_1.CompleteMultipartUploadCommand({
@@ -243,7 +246,8 @@ class S3SourceDriver {
             append: appendMode,
             headerLine,
             fileType: file.fileType,
-            hasHeaderRow: file.hasHeaderRow
+            hasHeaderRow: file.hasHeaderRow,
+            delimiter: dataset.getDelimiter()
         });
     });
     const { fileKey } = file;
@@ -259,6 +263,7 @@ class S3SourceDriver {
         (0, Affirm_1.default)(firstFileResponse.Body, 'Failed to fetch first file from S3');
         const firstFileStream = firstFileResponse.Body;
         const headerLine = yield this.getFirstLineFromStream(firstFileStream);
+        dataset.setFirstLine(headerLine);
         // Download files sequentially to avoid file conflicts
         for (let i = 0; i < allFileKeys.length; i++) {
             yield downloadLocally(allFileKeys[i], headerLine, i > 0); // Append mode for subsequent files
@@ -266,6 +271,16 @@ class S3SourceDriver {
         return dataset;
     }
     else {
+        // Get header line from the first file
+        const firstFileCommand = new client_s3_1.GetObjectCommand({
+            Bucket: this._bucketName,
+            Key: fileKey
+        });
+        const firstFileResponse = yield this._client.send(firstFileCommand);
+        (0, Affirm_1.default)(firstFileResponse.Body, 'Failed to fetch first file from S3');
+        const firstFileStream = firstFileResponse.Body;
+        const headerLine = yield this.getFirstLineFromStream(firstFileStream);
+        dataset.setFirstLine(headerLine);
         // For single file, no header validation needed
         yield downloadLocally(fileKey, '');
         return dataset;
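Note: the new streaming export in S3DestinationDriver follows the standard S3 multipart sequence (create the upload, send numbered parts, then complete with the collected ETags). A minimal sketch of that sequence with @aws-sdk/client-s3, detached from the driver (the bucket, key and chunks argument are placeholders):

    // Sketch of the multipart upload sequence used above (placeholder bucket/key).
    const { S3Client, CreateMultipartUploadCommand, UploadPartCommand,
        CompleteMultipartUploadCommand } = require('@aws-sdk/client-s3');

    async function uploadInParts(chunks) {
        const client = new S3Client({});
        const Bucket = 'my-bucket';
        const Key = 'export.csv';
        const { UploadId } = await client.send(new CreateMultipartUploadCommand({ Bucket, Key }));
        const parts = [];
        let partNumber = 1;
        for (const chunk of chunks) {
            const res = await client.send(new UploadPartCommand({
                Bucket, Key, UploadId, PartNumber: partNumber, Body: Buffer.from(chunk)
            }));
            parts.push({ PartNumber: partNumber, ETag: res.ETag });
            partNumber++;
        }
        await client.send(new CompleteMultipartUploadCommand({
            Bucket, Key, UploadId, MultipartUpload: { Parts: parts }
        }));
    }

One caveat worth keeping in mind: S3 rejects multipart uploads whose non-final parts are smaller than 5 MiB, so the chunk size produced by FileExporter.prepareBatch has to stay above that threshold whenever more than one part is sent.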
package/engines/consumer/PostProcessor.js
CHANGED
@@ -15,6 +15,7 @@ Object.defineProperty(exports, "__esModule", { value: true });
 const Affirm_1 = __importDefault(require("../../core/Affirm"));
 const Algo_1 = __importDefault(require("../../core/Algo"));
 const CryptoEngine_1 = __importDefault(require("../CryptoEngine"));
+const DatasetManager_1 = __importDefault(require("../dataset/DatasetManager"));
 const DatasetRecord_1 = __importDefault(require("../dataset/DatasetRecord"));
 const Environment_1 = __importDefault(require("../Environment"));
 const FileCompiler_1 = __importDefault(require("../file/FileCompiler"));
@@ -25,6 +26,7 @@ class PostProcessorClass {
     constructor() {
         /**
         * Maps an array of objects and projects it to another array of objects but with different shape:
+        * - updates the dimensions of the dataset (drop, rename, reorder, hide)
         * - type casting
         * - default field values
         * - masking/hashing of data
@@ -33,9 +35,20 @@ class PostProcessorClass {
         (0, Affirm_1.default)(consumer, 'Invalid consumer');
         (0, Affirm_1.default)(dataset, 'Invalid dataset');
         const fields = ConsumerManager_1.default.getExpandedFields(consumer);
-
-
+        const dimensionsUpdates = DatasetManager_1.default.computeDimensionsUpdates(dataset, consumer);
+        let updatedDimensions = null;
+        const newDataset = yield dataset.map(record => {
             var _a, _b;
+            // First apply the updates to the dimensions of this record
+            if (dimensionsUpdates.length > 0) {
+                for (const update of dimensionsUpdates) {
+                    record.wholeUpdateDimension(update);
+                }
+                record.sortDimensions();
+            }
+            if (!updatedDimensions)
+                updatedDimensions = record._dimensions;
+            // Finally apply the rules and changes of the consumer fields to the record
             for (const field of fields) {
                 const { key, alias } = field.cField;
                 const fieldKey = alias !== null && alias !== void 0 ? alias : key;
@@ -49,6 +62,7 @@ class PostProcessorClass {
             }
             return record;
         });
+        newDataset.setDimensinons(updatedDimensions);
         return newDataset;
     });
     /**
package/engines/dataset/Dataset.js
CHANGED
@@ -37,7 +37,6 @@ const Environment_1 = __importDefault(require("../Environment"));
 class Dataset {
     constructor(name, file, batchSize) {
         var _a;
-        this._pipeline = [];
         this.getPath = () => this._path;
         this.setPath = (path) => {
             this._path = path;
@@ -45,11 +44,11 @@ class Dataset {
         };
         this.getFile = () => this._file;
         this.getBatchSize = () => this._batchSize;
-        this.
-        this.
-            this._recordPool.resize(size);
+        this.setFirstLine = (firstLine) => {
+            this._firstLine = firstLine;
             return this;
         };
+        this.getFirstLine = () => this._firstLine;
         this.getSize = () => this._size;
         this.getCycles = () => this._iterations;
         this.getDelimiter = () => this._delimiter;
@@ -633,25 +632,11 @@ class Dataset {
         this._delimiter = delimiter;
         this._dimensions = dimensions;
         switch (this._file.fileType) {
-            case 'TXT':
-
-                yield this.filter((x, i) => i > 0 && !x.isEmpty());
-                break;
-            }
-            case 'CSV': {
-                yield this.filter((x, i) => i > 0 && !x.isEmpty());
-                break;
-            }
+            case 'TXT':
+            case 'CSV':
             case 'JSON':
-            case 'JSONL':
-                // Convert the JSON to the internal CSV format
-                yield this.map(record => {
-                    const parsed = JSON.parse(record.getRaw());
-                    const preparedRow = this._dimensions.map(d => parsed[d.key]).join(this._delimiter);
-                    return new DatasetRecord_1.default(preparedRow, this._dimensions, this._delimiter);
-                });
+            case 'JSONL':
                 break;
-            }
             case 'XLS':
             case 'XLSX': {
                 const excel = xlsx_1.default.readFile(this._path);
@@ -689,6 +674,10 @@ class Dataset {
             return this;
         });
         this.getDimensions = () => this._dimensions;
+        this.setDimensinons = (dimensions) => {
+            this._dimensions = dimensions;
+            return this;
+        };
         /**
         * Update the record pool when dimensions change
         */
@@ -696,55 +685,6 @@ class Dataset {
             // Update all pooled records with current dimensions
            this._recordPool.updateDimensions(this._dimensions, this._delimiter);
         };
-        /**
-        * - remove dimension
-        * - rename a dimension
-        * - change hidden flag
-        * - move a dimension
-        */
-        this.wholeUpdateDimensions = (fields) => __awaiter(this, void 0, void 0, function* () {
-            var _a;
-            let updates = [];
-            // Add all the updates
-            for (let i = 0; i < fields.length; i++) {
-                const { cField } = fields[i];
-                const currentMatch = structuredClone(this._dimensions.find(x => x.name === cField.key));
-                if (!currentMatch && !cField.fixed)
-                    throw new Error(`The requested field "${cField.key}" from the consumer is not present in the underlying dataset "${this._name}" (${this._dimensions.map(x => x.name).join(', ')})`);
-                updates.push({
-                    currentDimension: currentMatch,
-                    newName: (_a = cField.alias) !== null && _a !== void 0 ? _a : cField.key,
-                    newHidden: cField.hidden,
-                    newPosition: i,
-                    toDelete: false
-                });
-            }
-            // Add all the updates to remove dimensions
-            for (const dim of this._dimensions) {
-                if (!updates.find(x => { var _a; return ((_a = x.currentDimension) === null || _a === void 0 ? void 0 : _a.name) === dim.name; }))
-                    updates.push({ currentDimension: dim, toDelete: true });
-            }
-            // Now keep only the updates that actually change something
-            updates = updates.filter(x => x.toDelete
-                || !x.currentDimension
-                || (x.currentDimension && (x.currentDimension.name !== x.newName
-                    || (Algo_1.default.hasVal(x.newHidden) && x.newHidden !== x.currentDimension.hidden)
-                    || x.newPosition !== x.currentDimension.index)));
-            if (updates.length === 0)
-                return this;
-            let updatedDimensions = null;
-            const newDataset = yield this.map(record => {
-                for (const update of updates) {
-                    record.wholeUpdateDimension(update);
-                }
-                record._dimensions.sort((a, b) => a.index - b.index);
-                if (!updatedDimensions)
-                    updatedDimensions = record._dimensions;
-                return record;
-            });
-            this._dimensions = updatedDimensions;
-            return newDataset;
-        });
         this.print = (...args_1) => __awaiter(this, [...args_1], void 0, function* (count = 3, full = false) {
             console.log(`DS ${this._name} (${this._size} | ${this._iterations})`);
             console.log(this._dimensions.map(x => x.name).join(this._delimiter));
@@ -861,11 +801,11 @@ class Dataset {
         this._file = file;
         this._batchSize = (_a = batchSize !== null && batchSize !== void 0 ? batchSize : parseInt(Environment_1.default.get('MAX_ITEMS_IN_MEMORY'))) !== null && _a !== void 0 ? _a : Constants_1.default.defaults.MAX_ITEMS_IN_MEMORY;
         this._dimensions = [];
+        this._firstLine = '';
         this._delimiter = ',';
         this._size = 0;
         this._iterations = 0;
         this._operations = [];
-        this._pipeline = [];
         // Initialize record pool for optimization
         this._recordPool = new DatasetRecordPool_1.default(this._batchSize);
         const datasetName = this._name
package/engines/dataset/DatasetManager.js
CHANGED
@@ -13,6 +13,8 @@ var __importDefault = (this && this.__importDefault) || function (mod) {
 };
 Object.defineProperty(exports, "__esModule", { value: true });
 const Affirm_1 = __importDefault(require("../../core/Affirm"));
+const Algo_1 = __importDefault(require("../../core/Algo"));
+const ConsumerManager_1 = __importDefault(require("../consumer/ConsumerManager"));
 const Environment_1 = __importDefault(require("../Environment"));
 const FileCompiler_1 = __importDefault(require("../file/FileCompiler"));
 const ParseManager_1 = __importDefault(require("../parsing/ParseManager"));
@@ -34,7 +36,8 @@ class DatasetManagerClass {
         this.buildDimensions = (dataset_1, producer_1, ...args_1) => __awaiter(this, [dataset_1, producer_1, ...args_1], void 0, function* (dataset, producer, discover = false) {
             (0, Affirm_1.default)(dataset, `Invalid dataset`);
             (0, Affirm_1.default)(producer, `Invalid producer`);
-            const firstLine =
+            const firstLine = dataset.getFirstLine();
+            (0, Affirm_1.default)(firstLine, `The first line of the dataset was not set.`);
             return this.buildDimensionsFromFirstLine(firstLine, dataset.getFile(), producer, discover);
         });
         this.buildDimensionsFromFirstLine = (firstLine_1, dsFile_1, producer_1, ...args_1) => __awaiter(this, [firstLine_1, dsFile_1, producer_1, ...args_1], void 0, function* (firstLine, dsFile, producer, discover = false) {
@@ -97,6 +100,40 @@ class DatasetManagerClass {
                 break;
             }
         });
+        this.computeDimensionsUpdates = (dataset, consumer) => {
+            var _a;
+            (0, Affirm_1.default)(dataset, 'Invalid dataset');
+            (0, Affirm_1.default)(consumer, 'Invalid consumer');
+            const fields = ConsumerManager_1.default.getExpandedFields(consumer);
+            const dimensions = dataset.getDimensions();
+            let updates = [];
+            // Add all the updates
+            for (let i = 0; i < fields.length; i++) {
+                const { cField } = fields[i];
+                const currentMatch = structuredClone(dimensions.find(x => x.name === cField.key));
+                if (!currentMatch && !cField.fixed)
+                    throw new Error(`The requested field "${cField.key}" from the consumer is not present in the underlying dataset "${dataset['_name']}" (${dimensions.map(x => x.name).join(', ')})`);
+                updates.push({
+                    currentDimension: currentMatch,
+                    newName: (_a = cField.alias) !== null && _a !== void 0 ? _a : cField.key,
+                    newHidden: cField.hidden,
+                    newPosition: i,
+                    toDelete: false
+                });
+            }
+            // Add all the updates to remove dimensions
+            for (const dim of dimensions) {
+                if (!updates.find(x => { var _a; return ((_a = x.currentDimension) === null || _a === void 0 ? void 0 : _a.name) === dim.name; }))
+                    updates.push({ currentDimension: dim, toDelete: true });
+            }
+            // Now keep only the updates that actually change something
+            updates = updates.filter(x => x.toDelete
+                || !x.currentDimension
+                || (x.currentDimension && (x.currentDimension.name !== x.newName
+                    || (Algo_1.default.hasVal(x.newHidden) && x.newHidden !== x.currentDimension.hidden)
+                    || x.newPosition !== x.currentDimension.index)));
+            return updates;
+        };
     }
 }
 const DatasetManager = new DatasetManagerClass();
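For orientation, each entry returned by the new computeDimensionsUpdates is a plain object describing one change to apply to every record's dimensions. A hypothetical example (the field and dimension names are invented for illustration; the real shape is whatever the code above builds):

    // Dataset dimensions: name (0), age (1), ssn (2). The consumer renames "name"
    // to "full_name", hides "ssn", and does not request "age" at all.
    const updates = [
        { currentDimension: { name: 'name', index: 0 }, newName: 'full_name', newHidden: false, newPosition: 0, toDelete: false },
        { currentDimension: { name: 'ssn', index: 2 }, newName: 'ssn', newHidden: true, newPosition: 1, toDelete: false },
        { currentDimension: { name: 'age', index: 1 }, toDelete: true }
    ];
    // PostProcessor.js above applies these per record via record.wholeUpdateDimension(update)
    // and then calls record.sortDimensions() once per record.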
package/engines/dataset/DatasetRecord.js
CHANGED
@@ -63,6 +63,12 @@ class DatasetRecord {
            }
            return this;
        };
+        this.sortDimensions = () => {
+            const isOutOfOrder = this._dimensions.some((dim, index) => dim.index !== index);
+            if (isOutOfOrder) {
+                this._dimensions.sort((a, b) => a.index - b.index);
+            }
+        };
        this.toJSON = () => {
            if (this._dimensions.some(x => x.hidden)) {
                // remove the not wanted dimension
package/engines/file/FileExporter.js
CHANGED
@@ -12,6 +12,7 @@ var __importDefault = (this && this.__importDefault) || function (mod) {
     return (mod && mod.__esModule) ? mod : { "default": mod };
 };
 Object.defineProperty(exports, "__esModule", { value: true });
+const Constants_1 = __importDefault(require("../../Constants"));
 const Affirm_1 = __importDefault(require("../../core/Affirm"));
 const Algo_1 = __importDefault(require("../../core/Algo"));
 const DSTE_1 = __importDefault(require("../../core/dste/DSTE"));
@@ -42,15 +43,30 @@ class FileExporterClass {
         this.prepareBatch = (batch, options) => {
             switch (options.recordProjection.format) {
                 case 'JSON': {
-                    const
-                    return
+                    const jsonRecords = batch.map(x => x.toJSON());
+                    return this._splitIntoChunks(jsonRecords, '\n');
                 }
                 case 'CSV': {
-                    const
-                    return
+                    const csvRecords = batch.map(x => x.toCSV(options.recordProjection.delimiter));
+                    return this._splitIntoChunks(csvRecords, '\n');
                 }
             }
         };
+        this._splitIntoChunks = (records, separator) => {
+            if (records.length === 0)
+                return [''];
+            const sampleRecord = records[0];
+            const sampleLength = sampleRecord.length + separator.length; // Include separator in calculation
+            const recordsPerChunk = Math.floor(Constants_1.default.defaults.STRING_MAX_CHARACTERS_LENGTH / sampleLength);
+            // Ensure at least 1 record per chunk
+            const chunkSize = Math.max(1, recordsPerChunk);
+            const chunks = [];
+            for (let i = 0; i < records.length; i += chunkSize) {
+                const chunk = records.slice(i, i + chunkSize);
+                chunks.push(chunk.join(separator));
+            }
+            return chunks;
+        };
         this._composeFileName = (consumer, extension) => `${consumer.name}_${Algo_1.default.replaceAll(DSTE_1.default.now().toISOString().split('.')[0], ':', '-')}.${extension}`;
     }
 }
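The new _splitIntoChunks keeps every joined string below STRING_MAX_CHARACTERS_LENGTH (10,000,000 characters in Constants.js) by estimating how many records fit per chunk from the first record's length. A standalone sketch of the same idea, with the limit passed in explicitly rather than read from Constants:

    // Sketch: split records into chunks so that no joined string exceeds maxChars.
    // The per-chunk record count is estimated from the first record, as above.
    function splitIntoChunks(records, separator, maxChars) {
        if (records.length === 0)
            return [''];
        const sampleLength = records[0].length + separator.length;
        const chunkSize = Math.max(1, Math.floor(maxChars / sampleLength));
        const chunks = [];
        for (let i = 0; i < records.length; i += chunkSize) {
            chunks.push(records.slice(i, i + chunkSize).join(separator));
        }
        return chunks;
    }

    // Example: 5 two-character records with a limit of 8 characters -> 2 records per chunk
    console.log(splitIntoChunks(['aa', 'bb', 'cc', 'dd', 'ee'], '\n', 8));
    // [ 'aa\nbb', 'cc\ndd', 'ee' ]

Since the estimate is based on the first record only, a chunk can still overshoot the limit when later records are much longer than the first; the 10,000,000-character constant presumably leaves headroom for that.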
package/helper/Runtime.js
ADDED
@@ -0,0 +1,20 @@
+"use strict";
+var __importDefault = (this && this.__importDefault) || function (mod) {
+    return (mod && mod.__esModule) ? mod : { "default": mod };
+};
+Object.defineProperty(exports, "__esModule", { value: true });
+const node_v8_1 = __importDefault(require("node:v8"));
+class RuntimeClass {
+    constructor() {
+        this.getHeap = () => {
+            const { heap_size_limit, used_heap_size } = node_v8_1.default.getHeapStatistics();
+            return {
+                heapSizeMB: this._toMB(heap_size_limit),
+                usedHeapMB: this._toMB(used_heap_size)
+            };
+        };
+        this._toMB = (bytes) => Math.round(bytes / (1024 * 1024) * 100) / 100;
+    }
+}
+const Runtime = new RuntimeClass();
+exports.default = Runtime;
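Runtime.getHeap is a thin wrapper around Node's v8.getHeapStatistics(). A quick way to see the numbers the startup warning in index.js is based on, and to raise the limit when it is too low:

    // Sketch: inspect the V8 heap limit that the CLI checks at startup.
    const v8 = require('node:v8');
    const { heap_size_limit, used_heap_size } = v8.getHeapStatistics();
    console.log(`limit: ${Math.round(heap_size_limit / (1024 * 1024))} MB`);
    console.log(`used:  ${Math.round(used_heap_size / (1024 * 1024))} MB`);

    // The limit can be raised (value in MB) when starting Node, for example:
    //   node --max-old-space-size=8192 index.js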
package/index.js
CHANGED
@@ -16,14 +16,23 @@ const create_producer_1 = require("./actions/create_producer");
 const create_consumer_1 = require("./actions/create_consumer");
 const Constants_1 = __importDefault(require("./Constants"));
 const LicenceManager_1 = __importDefault(require("./licencing/LicenceManager"));
+const Runtime_1 = __importDefault(require("./helper/Runtime"));
 dotenv_1.default.configDotenv();
 const program = new commander_1.Command();
+// Validate the remora licence
 const remoraLicenceKey = process.env.REMORA_LICENCE_KEY;
 const check = LicenceManager_1.default.validate(remoraLicenceKey);
 if (!check.valid) {
     console.error(`Invalid Remora licence key, the product is not active: remember to set "REMORA_LICENCE_KEY" environment variable.`);
     process.exit(1);
 }
+// Runtime check on heap size to warn user of insufficent runtime resources
+const { heapSizeMB } = Runtime_1.default.getHeap();
+if (heapSizeMB < Constants_1.default.defaults.MIN_RUNTIME_HEAP_MB)
+    console.warn(`Remora is running with ${heapSizeMB}MB of runtime heap, which is below the bare minimum of ${Constants_1.default.defaults.MIN_RUNTIME_HEAP_MB}MB (Recommended: ${Constants_1.default.defaults.RECOMMENDED_RUNTIME_HEAP_MB}MB).`);
+else if (heapSizeMB < Constants_1.default.defaults.RECOMMENDED_RUNTIME_HEAP_MB)
+    console.warn(`Remora is running with ${heapSizeMB} MB of runtime heap, which is below the recommended of ${Constants_1.default.defaults.RECOMMENDED_RUNTIME_HEAP_MB} MB.`);
+// Initialize all commands
 program
     .version(Constants_1.default.cliVersion + '', '-v, --version', 'Display the version of the CLI')
     .description('CLI tool for setting up and managing Data-Remora');