@forzalabs/remora 0.0.54-nasco.3 → 0.0.56-nasco.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/Constants.js
CHANGED
|
@@ -26,14 +26,17 @@ const readline_1 = require("readline");
|
|
|
26
26
|
const Constants_1 = __importDefault(require("../../Constants"));
|
|
27
27
|
const DatasetManager_1 = __importDefault(require("./DatasetManager"));
|
|
28
28
|
const DatasetRecord_1 = __importDefault(require("./DatasetRecord"));
|
|
29
|
+
const DatasetRecordPool_1 = __importDefault(require("./DatasetRecordPool"));
|
|
29
30
|
const xlsx_1 = __importDefault(require("xlsx"));
|
|
30
31
|
const Affirm_1 = __importDefault(require("../../core/Affirm"));
|
|
31
32
|
const XMLParser_1 = __importDefault(require("../parsing/XMLParser"));
|
|
32
33
|
const DriverFactory_1 = __importDefault(require("../../drivers/DriverFactory"));
|
|
33
34
|
const Helper_1 = __importDefault(require("../../helper/Helper"));
|
|
34
35
|
const Algo_1 = __importDefault(require("../../core/Algo"));
|
|
36
|
+
const Environment_1 = __importDefault(require("../Environment"));
|
|
35
37
|
class Dataset {
|
|
36
|
-
constructor(name, file, batchSize
|
|
38
|
+
constructor(name, file, batchSize) {
|
|
39
|
+
var _a;
|
|
37
40
|
this._pipeline = [];
|
|
38
41
|
this.getPath = () => this._path;
|
|
39
42
|
this.setPath = (path) => {
|
|
@@ -44,6 +47,7 @@ class Dataset {
|
|
|
44
47
|
this.getBatchSize = () => this._batchSize;
|
|
45
48
|
this.setBatchSize = (size) => {
|
|
46
49
|
this._batchSize = size;
|
|
50
|
+
this._recordPool.resize(size);
|
|
47
51
|
return this;
|
|
48
52
|
};
|
|
49
53
|
this.getSize = () => this._size;
|
|
@@ -80,6 +84,7 @@ class Dataset {
|
|
|
80
84
|
const firstLine = typeof firstItem === 'object' ? JSON.stringify(firstItem) : String(firstItem);
|
|
81
85
|
const buildRes = yield DatasetManager_1.default.buildDimensionsFromFirstLine(firstLine, this._file, producer, discover);
|
|
82
86
|
this._dimensions = buildRes.dimensions;
|
|
87
|
+
this._updateRecordPoolDimensions();
|
|
83
88
|
// Clear existing file content
|
|
84
89
|
this.clear();
|
|
85
90
|
// Convert objects to DatasetRecord format and write to file
|
|
@@ -121,16 +126,17 @@ class Dataset {
|
|
|
121
126
|
const readStream = (0, fs_2.createReadStream)(inputPath);
|
|
122
127
|
const writeStream = (0, fs_2.createWriteStream)(outputPath);
|
|
123
128
|
const rl = (0, readline_1.createInterface)({ input: readStream, crlfDelay: Infinity });
|
|
129
|
+
const dimensions = Algo_1.default.deepClone(this._dimensions);
|
|
124
130
|
let batch = [];
|
|
125
131
|
let lineCount = 0;
|
|
126
|
-
const dimensions = Algo_1.default.deepClone(this._dimensions);
|
|
127
132
|
try {
|
|
128
133
|
for (var _d = true, rl_1 = __asyncValues(rl), rl_1_1; rl_1_1 = yield rl_1.next(), _a = rl_1_1.done, !_a; _d = true) {
|
|
129
134
|
_c = rl_1_1.value;
|
|
130
135
|
_d = false;
|
|
131
136
|
const line = _c;
|
|
132
137
|
try {
|
|
133
|
-
|
|
138
|
+
// Reuse record from pool and reinitialize it with new line data
|
|
139
|
+
const record = this._recordPool.getNext(line, dimensions, this._delimiter);
|
|
134
140
|
batch.push(record);
|
|
135
141
|
lineCount++;
|
|
136
142
|
if (batch.length >= this._batchSize) {
|
|
@@ -139,6 +145,7 @@ class Dataset {
|
|
|
139
145
|
writeStream.write(transformedRecord.stringify() + '\n');
|
|
140
146
|
}
|
|
141
147
|
batch = [];
|
|
148
|
+
this._recordPool.reset(); // Reset pool index for next batch
|
|
142
149
|
}
|
|
143
150
|
}
|
|
144
151
|
catch (error) {
|
|
@@ -682,6 +689,13 @@ class Dataset {
|
|
|
682
689
|
return this;
|
|
683
690
|
});
|
|
684
691
|
this.getDimensions = () => this._dimensions;
|
|
692
|
+
/**
|
|
693
|
+
* Update the record pool when dimensions change
|
|
694
|
+
*/
|
|
695
|
+
this._updateRecordPoolDimensions = () => {
|
|
696
|
+
// Update all pooled records with current dimensions
|
|
697
|
+
this._recordPool.updateDimensions(this._dimensions, this._delimiter);
|
|
698
|
+
};
|
|
685
699
|
/**
|
|
686
700
|
* - remove dimension
|
|
687
701
|
* - rename a dimension
|
|
@@ -845,13 +859,15 @@ class Dataset {
|
|
|
845
859
|
};
|
|
846
860
|
this._name = name;
|
|
847
861
|
this._file = file;
|
|
848
|
-
this._batchSize = batchSize;
|
|
862
|
+
this._batchSize = (_a = batchSize !== null && batchSize !== void 0 ? batchSize : parseInt(Environment_1.default.get('MAX_ITEMS_IN_MEMORY'))) !== null && _a !== void 0 ? _a : Constants_1.default.defaults.MAX_ITEMS_IN_MEMORY;
|
|
849
863
|
this._dimensions = [];
|
|
850
864
|
this._delimiter = ',';
|
|
851
865
|
this._size = 0;
|
|
852
866
|
this._iterations = 0;
|
|
853
867
|
this._operations = [];
|
|
854
868
|
this._pipeline = [];
|
|
869
|
+
// Initialize record pool for optimization
|
|
870
|
+
this._recordPool = new DatasetRecordPool_1.default(this._batchSize);
|
|
855
871
|
const datasetName = this._name
|
|
856
872
|
.replace(/[^a-zA-Z0-9_-]/g, '_')
|
|
857
873
|
.replace(/_{2,}/g, '_')
|
|
@@ -20,6 +20,17 @@ class DatasetRecord {
|
|
|
20
20
|
this.getRaw = () => this._row;
|
|
21
21
|
this.getValue = (dimension) => this._value[dimension];
|
|
22
22
|
this.setValue = (dimension, value) => this._value[dimension] = value;
|
|
23
|
+
/**
|
|
24
|
+
* Reinitialize the record with new data instead of creating a new instance
|
|
25
|
+
* This is used for object pooling optimization
|
|
26
|
+
*/
|
|
27
|
+
this.reinitialize = (row, dimensions, delimiter) => {
|
|
28
|
+
this._row = row;
|
|
29
|
+
this._dimensions = dimensions;
|
|
30
|
+
this._delimiter = delimiter;
|
|
31
|
+
this._value = {};
|
|
32
|
+
this.parse(row, delimiter, this._dimensions);
|
|
33
|
+
};
|
|
23
34
|
this.wholeUpdateDimension = (update) => {
|
|
24
35
|
var _a;
|
|
25
36
|
if (update.toDelete) {
|
|
@@ -35,7 +46,9 @@ class DatasetRecord {
|
|
|
35
46
|
}
|
|
36
47
|
else {
|
|
37
48
|
// Change: name, hidden, position
|
|
38
|
-
|
|
49
|
+
let index = this._dimensions.findIndex(x => x.key === update.currentDimension.name);
|
|
50
|
+
if (index < 0)
|
|
51
|
+
index = this._dimensions.findIndex(x => x.key === update.currentDimension.key);
|
|
39
52
|
const currentDim = this._dimensions[index];
|
|
40
53
|
const updatedDim = { name: update.newName, key: (_a = currentDim.key) !== null && _a !== void 0 ? _a : update.newName, hidden: update.newHidden, index: update.newPosition };
|
|
41
54
|
this._value[updatedDim.name] = this._value[currentDim.name];
|
|
"use strict";
var __importDefault = (this && this.__importDefault) || function (mod) {
    return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
const DatasetRecord_1 = __importDefault(require("./DatasetRecord"));
/**
 * A fixed-size pool of DatasetRecord objects used to avoid allocating a new
 * record per input line during batch processing.
 *
 * Records are handed out round-robin via getNext() and reused; reset() rewinds
 * the pool index between batches. Because getNext() wraps around once
 * _poolSize records have been handed out, callers must not hold more than
 * _poolSize live records at a time — earlier records are silently overwritten
 * on wrap-around.
 */
class DatasetRecordPool {
    constructor(poolSize) {
        /**
         * Coerce a requested size to a safe positive integer.
         * Guards against NaN (e.g. a failed parseInt in the caller's
         * batch-size fallback chain, where NaN is not nullish and passes
         * through `??`), zero and negative values — any of which would leave
         * the pool empty, make getNext() return undefined, and turn the
         * round-robin modulo into NaN.
         * @param size The requested size (may be invalid)
         * @returns A positive integer pool size (falls back to 1)
         */
        this._toSafeSize = (size) => {
            const n = Math.floor(Number(size));
            return Number.isInteger(n) && n > 0 ? n : 1;
        };
        /**
         * Initialize the pool with empty DatasetRecord objects
         */
        this._initializePool = () => {
            this._pool = [];
            for (let i = 0; i < this._poolSize; i++) {
                this._pool.push(new DatasetRecord_1.default('', [], ','));
            }
            this._poolIndex = 0;
        };
        /**
         * Get the next available record from the pool and reinitialize it with new data
         * @param line The raw line data
         * @param dimensions The dataset dimensions
         * @param delimiter The delimiter to use
         * @returns A reinitialized DatasetRecord from the pool
         */
        this.getNext = (line, dimensions, delimiter) => {
            const record = this._pool[this._poolIndex];
            record.reinitialize(line, dimensions, delimiter);
            // Round-robin: wrap to the start once every record has been handed out.
            this._poolIndex = (this._poolIndex + 1) % this._poolSize;
            return record;
        };
        /**
         * Reset the pool index to start from the beginning.
         * This should be called when starting a new batch.
         */
        this.reset = () => {
            this._poolIndex = 0;
        };
        /**
         * Update the pool size and rebuild the pool if the (sanitized) size
         * actually changed; a no-op when the effective size is unchanged.
         * @param newSize The new pool size (coerced to a positive integer)
         */
        this.resize = (newSize) => {
            const size = this._toSafeSize(newSize);
            if (size !== this._poolSize) {
                this._poolSize = size;
                this._initializePool();
            }
        };
        /**
         * Update all pooled records with new dimensions and delimiter.
         * This should be called when dataset dimensions change.
         * @param dimensions The new dimensions
         * @param delimiter The new delimiter
         */
        this.updateDimensions = (dimensions, delimiter) => {
            for (const record of this._pool) {
                // Empty row: records only need the new parsing context, not data.
                record.reinitialize('', dimensions, delimiter);
            }
        };
        /**
         * Get the current pool size
         */
        this.getSize = () => this._poolSize;
        /**
         * Get the current pool index
         */
        this.getCurrentIndex = () => this._poolIndex;
        // Sanitize up front so the pool is never constructed empty.
        this._poolSize = this._toSafeSize(poolSize);
        this._poolIndex = 0;
        this._pool = [];
        this._initializePool();
    }
}
exports.default = DatasetRecordPool;