@forzalabs/remora 1.0.21 → 1.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/actions/automap.js +26 -42
- package/actions/compile.js +27 -43
- package/actions/create_consumer.js +24 -40
- package/actions/create_producer.js +16 -32
- package/actions/debug.js +18 -34
- package/actions/deploy.js +30 -46
- package/actions/discover.js +13 -29
- package/actions/init.js +29 -45
- package/actions/mock.js +16 -32
- package/actions/run.js +34 -52
- package/actions/sample.js +42 -58
- package/index.js +38 -43
- package/package.json +4 -4
- package/workers/ExecutorWorker.js +18 -32
- package/Constants.js +0 -34
- package/core/Affirm.js +0 -42
- package/core/Algo.js +0 -160
- package/core/dste/DSTE.js +0 -113
- package/core/logger/DebugLogService.js +0 -48
- package/core/logger/DevelopmentLogService.js +0 -70
- package/core/logger/LocalLogService.js +0 -70
- package/core/logger/Logger.js +0 -54
- package/database/DatabaseEngine.js +0 -149
- package/database/DatabaseStructure.js +0 -27
- package/definitions/DatasetDefinitions.js +0 -2
- package/definitions/ExecutorDefinitions.js +0 -2
- package/definitions/ProcessENV.js +0 -2
- package/definitions/agents/DestinationDriver.js +0 -2
- package/definitions/agents/SourceDriver.js +0 -2
- package/definitions/cli.js +0 -2
- package/definitions/database/ApiKeys.js +0 -2
- package/definitions/database/Stored.js +0 -7
- package/definitions/database/UsageStat.js +0 -2
- package/definitions/database/User.js +0 -2
- package/definitions/json_schemas/consumer-schema.json +0 -1226
- package/definitions/json_schemas/producer-schema.json +0 -308
- package/definitions/json_schemas/project-schema.json +0 -100
- package/definitions/json_schemas/source-schema.json +0 -249
- package/definitions/requests/ConsumerRequest.js +0 -2
- package/definitions/requests/Developer.js +0 -2
- package/definitions/requests/Mapping.js +0 -2
- package/definitions/requests/ProducerRequest.js +0 -2
- package/definitions/requests/Request.js +0 -2
- package/definitions/resources/Compiled.js +0 -2
- package/definitions/resources/Consumer.js +0 -2
- package/definitions/resources/Environment.js +0 -2
- package/definitions/resources/Library.js +0 -2
- package/definitions/resources/Producer.js +0 -2
- package/definitions/resources/Project.js +0 -2
- package/definitions/resources/Schema.js +0 -2
- package/definitions/resources/Source.js +0 -2
- package/definitions/temp.js +0 -2
- package/definitions/transform/Transformations.js +0 -2
- package/drivers/DeltaShareDriver.js +0 -186
- package/drivers/DriverFactory.js +0 -72
- package/drivers/DriverHelper.js +0 -248
- package/drivers/HttpApiDriver.js +0 -208
- package/drivers/RedshiftDriver.js +0 -184
- package/drivers/files/LocalDestinationDriver.js +0 -146
- package/drivers/files/LocalSourceDriver.js +0 -405
- package/drivers/s3/S3DestinationDriver.js +0 -197
- package/drivers/s3/S3SourceDriver.js +0 -495
- package/engines/CryptoEngine.js +0 -75
- package/engines/Environment.js +0 -170
- package/engines/ProcessENVManager.js +0 -83
- package/engines/RandomEngine.js +0 -47
- package/engines/SecretManager.js +0 -23
- package/engines/UserManager.js +0 -66
- package/engines/ai/AutoMapperEngine.js +0 -37
- package/engines/ai/DeveloperEngine.js +0 -497
- package/engines/ai/LLM.js +0 -255
- package/engines/consumer/ConsumerManager.js +0 -218
- package/engines/consumer/ConsumerOnFinishManager.js +0 -202
- package/engines/dataset/Dataset.js +0 -824
- package/engines/dataset/DatasetManager.js +0 -211
- package/engines/dataset/DatasetRecord.js +0 -120
- package/engines/dataset/DatasetRecordPool.js +0 -77
- package/engines/execution/RequestExecutor.js +0 -67
- package/engines/parsing/CSVParser.js +0 -60
- package/engines/parsing/LineParser.js +0 -71
- package/engines/parsing/ParseCompression.js +0 -101
- package/engines/parsing/ParseHelper.js +0 -18
- package/engines/parsing/ParseManager.js +0 -54
- package/engines/parsing/XLSParser.js +0 -87
- package/engines/parsing/XMLParser.js +0 -115
- package/engines/producer/ProducerEngine.js +0 -127
- package/engines/producer/ProducerManager.js +0 -43
- package/engines/scheduler/CronScheduler.js +0 -222
- package/engines/scheduler/QueueManager.js +0 -314
- package/engines/schema/SchemaValidator.js +0 -67
- package/engines/transform/JoinEngine.js +0 -232
- package/engines/transform/TransformationEngine.js +0 -277
- package/engines/transform/TypeCaster.js +0 -59
- package/engines/usage/DataframeManager.js +0 -55
- package/engines/usage/UsageDataManager.js +0 -151
- package/engines/usage/UsageManager.js +0 -65
- package/engines/validation/Validator.js +0 -216
- package/executors/ConsumerExecutor.js +0 -280
- package/executors/Executor.js +0 -177
- package/executors/ExecutorOrchestrator.js +0 -331
- package/executors/ExecutorPerformance.js +0 -17
- package/executors/ExecutorProgress.js +0 -54
- package/executors/ExecutorScope.js +0 -52
- package/executors/OutputExecutor.js +0 -118
- package/executors/ProducerExecutor.js +0 -108
- package/helper/Helper.js +0 -149
- package/helper/Logger.js +0 -84
- package/helper/Runtime.js +0 -20
- package/helper/Settings.js +0 -13
- package/licencing/LicenceManager.js +0 -64
- package/settings.js +0 -12
|
@@ -1,824 +0,0 @@
|
|
|
1
|
-
"use strict";
|
|
2
|
-
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
|
|
3
|
-
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
|
|
4
|
-
return new (P || (P = Promise))(function (resolve, reject) {
|
|
5
|
-
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
|
|
6
|
-
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
|
|
7
|
-
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
|
|
8
|
-
step((generator = generator.apply(thisArg, _arguments || [])).next());
|
|
9
|
-
});
|
|
10
|
-
};
|
|
11
|
-
var __asyncValues = (this && this.__asyncValues) || function (o) {
|
|
12
|
-
if (!Symbol.asyncIterator) throw new TypeError("Symbol.asyncIterator is not defined.");
|
|
13
|
-
var m = o[Symbol.asyncIterator], i;
|
|
14
|
-
return m ? m.call(o) : (o = typeof __values === "function" ? __values(o) : o[Symbol.iterator](), i = {}, verb("next"), verb("throw"), verb("return"), i[Symbol.asyncIterator] = function () { return this; }, i);
|
|
15
|
-
function verb(n) { i[n] = o[n] && function (v) { return new Promise(function (resolve, reject) { v = o[n](v), settle(resolve, reject, v.done, v.value); }); }; }
|
|
16
|
-
function settle(resolve, reject, d, v) { Promise.resolve(v).then(function(v) { resolve({ value: v, done: d }); }, reject); }
|
|
17
|
-
};
|
|
18
|
-
var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
19
|
-
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
20
|
-
};
|
|
21
|
-
Object.defineProperty(exports, "__esModule", { value: true });
|
|
22
|
-
const path_1 = __importDefault(require("path"));
|
|
23
|
-
const fs_1 = __importDefault(require("fs"));
|
|
24
|
-
const fs_2 = require("fs");
|
|
25
|
-
const readline_1 = require("readline");
|
|
26
|
-
const Constants_1 = __importDefault(require("../../Constants"));
|
|
27
|
-
const DatasetManager_1 = __importDefault(require("./DatasetManager"));
|
|
28
|
-
const DatasetRecord_1 = __importDefault(require("./DatasetRecord"));
|
|
29
|
-
const DatasetRecordPool_1 = __importDefault(require("./DatasetRecordPool"));
|
|
30
|
-
const Affirm_1 = __importDefault(require("../../core/Affirm"));
|
|
31
|
-
const DriverFactory_1 = __importDefault(require("../../drivers/DriverFactory"));
|
|
32
|
-
const Helper_1 = __importDefault(require("../../helper/Helper"));
|
|
33
|
-
const Algo_1 = __importDefault(require("../../core/Algo"));
|
|
34
|
-
const Environment_1 = __importDefault(require("../Environment"));
|
|
35
|
-
const Logger_1 = __importDefault(require("../../helper/Logger"));
|
|
36
|
-
const ProducerManager_1 = __importDefault(require("../producer/ProducerManager"));
|
|
37
|
-
class Dataset {
|
|
38
|
-
constructor(options) {
|
|
39
|
-
var _a, _b;
|
|
40
|
-
this.getPath = () => this._path;
|
|
41
|
-
this.setPath = (path) => {
|
|
42
|
-
this._path = path;
|
|
43
|
-
return this;
|
|
44
|
-
};
|
|
45
|
-
this.getFile = () => this._file;
|
|
46
|
-
this.getExecutionId = () => this._executionId;
|
|
47
|
-
this.getBatchSize = () => this._batchSize;
|
|
48
|
-
this.setFirstLine = (firstLine) => {
|
|
49
|
-
this._firstLine = firstLine;
|
|
50
|
-
return this;
|
|
51
|
-
};
|
|
52
|
-
this.getFirstLine = () => this._firstLine;
|
|
53
|
-
this.getCount = () => this._count;
|
|
54
|
-
this.setCount = (count) => {
|
|
55
|
-
this._count = count;
|
|
56
|
-
return this;
|
|
57
|
-
};
|
|
58
|
-
this.getCycles = () => this._iterations;
|
|
59
|
-
this.getDelimiter = () => this._delimiter;
|
|
60
|
-
this.setDelimiter = (delimiter) => {
|
|
61
|
-
this._delimiter = delimiter;
|
|
62
|
-
return this;
|
|
63
|
-
};
|
|
64
|
-
this.getOperations = () => this._operations;
|
|
65
|
-
this.load = (source) => __awaiter(this, void 0, void 0, function* () {
|
|
66
|
-
(0, Affirm_1.default)(source, 'Invalid source');
|
|
67
|
-
this._startOperation('load', { source: source.engine });
|
|
68
|
-
try {
|
|
69
|
-
const driver = yield DriverFactory_1.default.instantiateSource(source);
|
|
70
|
-
yield driver.download(this);
|
|
71
|
-
}
|
|
72
|
-
catch (error) {
|
|
73
|
-
if (this._file.isOptional) {
|
|
74
|
-
Logger_1.default.log(`Error loading dataset "${this.name}", creating default configuration and mock data because "isOptional" is true.`);
|
|
75
|
-
if (!this.getDimensions() || this.getDimensions().length === 0)
|
|
76
|
-
this.setDimensions(ProducerManager_1.default.getColumns(this._baseProducer).map((x, i) => { var _a; return ({ index: i, key: x.nameInProducer, name: x.aliasInProducer, hidden: false, type: (_a = x.dimension) === null || _a === void 0 ? void 0 : _a.type }); }));
|
|
77
|
-
if (!this.getFirstLine() || this.getFirstLine().length === 0) {
|
|
78
|
-
if (this._file.hasHeaderRow)
|
|
79
|
-
this.setFirstLine(this.getDimensions().map(x => x.key).join(this.getDelimiter()));
|
|
80
|
-
else
|
|
81
|
-
this.setFirstLine('');
|
|
82
|
-
}
|
|
83
|
-
}
|
|
84
|
-
else
|
|
85
|
-
throw error;
|
|
86
|
-
}
|
|
87
|
-
this._size = this._computeSize();
|
|
88
|
-
this._finishOperation('load');
|
|
89
|
-
return this;
|
|
90
|
-
});
|
|
91
|
-
/**
|
|
92
|
-
* Load data from an in-memory array of objects and create a local dataset file
|
|
93
|
-
* @param data Array of objects to load into the dataset
|
|
94
|
-
* @param dimensions Optional dimensions array. If not provided, will be inferred from the first object
|
|
95
|
-
* @param delimiter Optional delimiter. Defaults to comma
|
|
96
|
-
*/
|
|
97
|
-
this.loadFromMemory = (data_1, producer_1, ...args_1) => __awaiter(this, [data_1, producer_1, ...args_1], void 0, function* (data, producer, discover = false) {
|
|
98
|
-
var _a, _b;
|
|
99
|
-
(0, Affirm_1.default)(data, 'Invalid data array');
|
|
100
|
-
(0, Affirm_1.default)(Array.isArray(data), 'Data must be an array');
|
|
101
|
-
if (data.length === 0) {
|
|
102
|
-
console.warn('Loading empty array into dataset');
|
|
103
|
-
return this;
|
|
104
|
-
}
|
|
105
|
-
this._startOperation('load-from-memory', { recordCount: data.length });
|
|
106
|
-
try {
|
|
107
|
-
this._delimiter = (_b = (_a = producer.settings) === null || _a === void 0 ? void 0 : _a.delimiter) !== null && _b !== void 0 ? _b : this._delimiter;
|
|
108
|
-
// Discover the dimensions on your own
|
|
109
|
-
const firstItem = data[0];
|
|
110
|
-
const firstLine = typeof firstItem === 'object' ? JSON.stringify(firstItem) : String(firstItem);
|
|
111
|
-
const buildRes = yield DatasetManager_1.default.buildDimensionsFromFirstLine(firstLine, this._file, producer, discover);
|
|
112
|
-
this._dimensions = buildRes.dimensions;
|
|
113
|
-
this._updateRecordPoolDimensions();
|
|
114
|
-
// Clear existing file content
|
|
115
|
-
this.clear();
|
|
116
|
-
// Convert objects to DatasetRecord format and write to file
|
|
117
|
-
const records = [];
|
|
118
|
-
for (const item of data) {
|
|
119
|
-
// Create a row string by extracting values in dimension order
|
|
120
|
-
const values = this._dimensions.map(dim => {
|
|
121
|
-
const value = item[dim.key];
|
|
122
|
-
// Handle null/undefined values
|
|
123
|
-
return value !== null && value !== undefined ? String(value) : '';
|
|
124
|
-
});
|
|
125
|
-
const rowString = values.join(this._delimiter);
|
|
126
|
-
const record = new DatasetRecord_1.default(rowString, this._dimensions, this._delimiter);
|
|
127
|
-
records.push(record);
|
|
128
|
-
}
|
|
129
|
-
// Write all records to the dataset file
|
|
130
|
-
yield this.append(records);
|
|
131
|
-
this._size = this._computeSize();
|
|
132
|
-
this._count = data.length;
|
|
133
|
-
this._finishOperation('load-from-memory');
|
|
134
|
-
return this;
|
|
135
|
-
}
|
|
136
|
-
catch (error) {
|
|
137
|
-
this._finishOperation('load-from-memory');
|
|
138
|
-
throw new Error(`Failed to load data from memory: ${error instanceof Error ? error.message : String(error)}`);
|
|
139
|
-
}
|
|
140
|
-
});
|
|
141
|
-
/**
|
|
142
|
-
* Stream through the file in batches and apply a transformation
|
|
143
|
-
*/
|
|
144
|
-
this.transformStream = (transformer_1, ...args_1) => __awaiter(this, [transformer_1, ...args_1], void 0, function* (transformer, options = {}) {
|
|
145
|
-
var _a, e_1, _b, _c;
|
|
146
|
-
var _d, _e, _f, _g;
|
|
147
|
-
const inputPath = options.inputPath || this._path;
|
|
148
|
-
const outputPath = options.outputPath || this._tempPath;
|
|
149
|
-
const fromLine = (_e = (_d = options.range) === null || _d === void 0 ? void 0 : _d.fromLine) !== null && _e !== void 0 ? _e : -1;
|
|
150
|
-
const toLine = (_g = (_f = options.range) === null || _f === void 0 ? void 0 : _f.toLine) !== null && _g !== void 0 ? _g : Infinity;
|
|
151
|
-
this.ensureFile(outputPath);
|
|
152
|
-
if (!fs_1.default.existsSync(inputPath))
|
|
153
|
-
throw new Error(`Input file does not exist: ${inputPath}`);
|
|
154
|
-
this._startOperation('transform-stream');
|
|
155
|
-
const readStream = (0, fs_2.createReadStream)(inputPath);
|
|
156
|
-
const writeStream = (0, fs_2.createWriteStream)(outputPath);
|
|
157
|
-
const rl = (0, readline_1.createInterface)({ input: readStream, crlfDelay: Infinity });
|
|
158
|
-
const dimensions = Algo_1.default.deepClone(this._dimensions);
|
|
159
|
-
let batch = [];
|
|
160
|
-
let lineCount = 0;
|
|
161
|
-
let index = 0;
|
|
162
|
-
try {
|
|
163
|
-
for (var _h = true, rl_1 = __asyncValues(rl), rl_1_1; rl_1_1 = yield rl_1.next(), _a = rl_1_1.done, !_a; _h = true) {
|
|
164
|
-
_c = rl_1_1.value;
|
|
165
|
-
_h = false;
|
|
166
|
-
const line = _c;
|
|
167
|
-
try {
|
|
168
|
-
if (index < fromLine) {
|
|
169
|
-
index++;
|
|
170
|
-
continue;
|
|
171
|
-
}
|
|
172
|
-
else if (index >= toLine) {
|
|
173
|
-
index++;
|
|
174
|
-
break;
|
|
175
|
-
}
|
|
176
|
-
index++;
|
|
177
|
-
// Reuse record from pool and reinitialize it with new line data
|
|
178
|
-
const record = this._recordPool.getNext(line, dimensions, this._delimiter);
|
|
179
|
-
batch.push(record);
|
|
180
|
-
lineCount++;
|
|
181
|
-
if (batch.length >= this._batchSize) {
|
|
182
|
-
const transformedBatch = yield transformer(batch);
|
|
183
|
-
for (const transformedRecord of transformedBatch) {
|
|
184
|
-
writeStream.write(transformedRecord.stringify() + '\n');
|
|
185
|
-
}
|
|
186
|
-
batch = [];
|
|
187
|
-
this._recordPool.reset();
|
|
188
|
-
}
|
|
189
|
-
}
|
|
190
|
-
catch (error) {
|
|
191
|
-
Logger_1.default.log(`Error parsing line ${line}\n${lineCount}: ${error}`);
|
|
192
|
-
lineCount++;
|
|
193
|
-
}
|
|
194
|
-
}
|
|
195
|
-
}
|
|
196
|
-
catch (e_1_1) { e_1 = { error: e_1_1 }; }
|
|
197
|
-
finally {
|
|
198
|
-
try {
|
|
199
|
-
if (!_h && !_a && (_b = rl_1.return)) yield _b.call(rl_1);
|
|
200
|
-
}
|
|
201
|
-
finally { if (e_1) throw e_1.error; }
|
|
202
|
-
}
|
|
203
|
-
// Process remaining items in the last batch
|
|
204
|
-
if (batch.length > 0) {
|
|
205
|
-
const transformedBatch = yield transformer(batch);
|
|
206
|
-
for (const transformedRecord of transformedBatch) {
|
|
207
|
-
writeStream.write(transformedRecord.stringify() + '\n');
|
|
208
|
-
}
|
|
209
|
-
}
|
|
210
|
-
writeStream.end();
|
|
211
|
-
// Wait for write stream to finish
|
|
212
|
-
yield new Promise((resolve, reject) => {
|
|
213
|
-
writeStream.on('finish', resolve);
|
|
214
|
-
writeStream.on('error', reject);
|
|
215
|
-
});
|
|
216
|
-
// Replace original file with transformed file
|
|
217
|
-
if (outputPath === this._tempPath) {
|
|
218
|
-
fs_1.default.renameSync(this._tempPath, this._path);
|
|
219
|
-
}
|
|
220
|
-
this._count = lineCount;
|
|
221
|
-
this._size = this._computeSize();
|
|
222
|
-
this._iterations++;
|
|
223
|
-
this._finishOperation('transform-stream');
|
|
224
|
-
});
|
|
225
|
-
/**
|
|
226
|
-
* Filter items in the file using batched streaming
|
|
227
|
-
*/
|
|
228
|
-
this.filter = (predicate_1, ...args_1) => __awaiter(this, [predicate_1, ...args_1], void 0, function* (predicate, options = {}) {
|
|
229
|
-
this._startOperation('filter');
|
|
230
|
-
let globalIndex = 0;
|
|
231
|
-
yield this.transformStream((batch) => __awaiter(this, void 0, void 0, function* () {
|
|
232
|
-
const filteredBatch = [];
|
|
233
|
-
for (const item of batch) {
|
|
234
|
-
if (predicate(item, globalIndex)) {
|
|
235
|
-
filteredBatch.push(item);
|
|
236
|
-
}
|
|
237
|
-
globalIndex++;
|
|
238
|
-
}
|
|
239
|
-
return filteredBatch;
|
|
240
|
-
}), options);
|
|
241
|
-
this._finishOperation('filter');
|
|
242
|
-
return this;
|
|
243
|
-
});
|
|
244
|
-
/**
|
|
245
|
-
* Map items in the file using batched streaming
|
|
246
|
-
*/
|
|
247
|
-
this.map = (mapper_1, ...args_1) => __awaiter(this, [mapper_1, ...args_1], void 0, function* (mapper, options = {}) {
|
|
248
|
-
this._startOperation('map');
|
|
249
|
-
let globalIndex = 0;
|
|
250
|
-
yield this.transformStream((batch) => __awaiter(this, void 0, void 0, function* () {
|
|
251
|
-
const mappedBatch = [];
|
|
252
|
-
for (const item of batch) {
|
|
253
|
-
const mappedItem = yield mapper(item, globalIndex);
|
|
254
|
-
mappedBatch.push(mappedItem);
|
|
255
|
-
globalIndex++;
|
|
256
|
-
}
|
|
257
|
-
return mappedBatch;
|
|
258
|
-
}), options);
|
|
259
|
-
this._finishOperation('map');
|
|
260
|
-
return this;
|
|
261
|
-
});
|
|
262
|
-
/**
|
|
263
|
-
* FlatMap items in the file using batched streaming - maps each item to an array of items and flattens the result
|
|
264
|
-
*/
|
|
265
|
-
this.flatMap = (mapper_1, ...args_1) => __awaiter(this, [mapper_1, ...args_1], void 0, function* (mapper, options = {}) {
|
|
266
|
-
this._startOperation('flat-map');
|
|
267
|
-
let globalIndex = 0;
|
|
268
|
-
yield this.transformStream((batch) => __awaiter(this, void 0, void 0, function* () {
|
|
269
|
-
const flatMappedBatch = [];
|
|
270
|
-
for (const item of batch) {
|
|
271
|
-
const mappedItems = yield mapper(item, globalIndex);
|
|
272
|
-
flatMappedBatch.push(...mappedItems);
|
|
273
|
-
globalIndex++;
|
|
274
|
-
}
|
|
275
|
-
return flatMappedBatch;
|
|
276
|
-
}), options);
|
|
277
|
-
this._finishOperation('flat-map');
|
|
278
|
-
return this;
|
|
279
|
-
});
|
|
280
|
-
/**
|
|
281
|
-
* Sort the dataset by one or more dimensions using batched streaming with external merge sort
|
|
282
|
-
* @param compareFn Comparison function that takes two DatasetRecord objects and returns a number
|
|
283
|
-
* @param options Optional parameters for sorting
|
|
284
|
-
*/
|
|
285
|
-
this.sort = (compareFn_1, ...args_1) => __awaiter(this, [compareFn_1, ...args_1], void 0, function* (compareFn, options = {}) {
|
|
286
|
-
var _a, e_2, _b, _c;
|
|
287
|
-
this._startOperation('sort');
|
|
288
|
-
const { batchSize = this._batchSize } = options;
|
|
289
|
-
if (!fs_1.default.existsSync(this._path)) {
|
|
290
|
-
throw new Error(`File does not exist: ${this._path}`);
|
|
291
|
-
}
|
|
292
|
-
// Phase 1: Sort individual batches and write them to temporary files
|
|
293
|
-
const tempFiles = [];
|
|
294
|
-
const readStream = (0, fs_2.createReadStream)(this._path);
|
|
295
|
-
const rl = (0, readline_1.createInterface)({ input: readStream, crlfDelay: Infinity });
|
|
296
|
-
let batch = [];
|
|
297
|
-
let batchIndex = 0;
|
|
298
|
-
try {
|
|
299
|
-
for (var _d = true, rl_2 = __asyncValues(rl), rl_2_1; rl_2_1 = yield rl_2.next(), _a = rl_2_1.done, !_a; _d = true) {
|
|
300
|
-
_c = rl_2_1.value;
|
|
301
|
-
_d = false;
|
|
302
|
-
const line = _c;
|
|
303
|
-
try {
|
|
304
|
-
const record = new DatasetRecord_1.default(line, this._dimensions, this._delimiter);
|
|
305
|
-
batch.push(record);
|
|
306
|
-
if (batch.length >= batchSize) {
|
|
307
|
-
// Sort the batch
|
|
308
|
-
batch.sort(compareFn);
|
|
309
|
-
// Write sorted batch to temporary file
|
|
310
|
-
const tempFile = `${this._tempPath}_batch_${batchIndex}`;
|
|
311
|
-
this.ensureFile(tempFile);
|
|
312
|
-
const writeStream = (0, fs_2.createWriteStream)(tempFile);
|
|
313
|
-
for (const record of batch) {
|
|
314
|
-
writeStream.write(record.stringify() + '\n');
|
|
315
|
-
}
|
|
316
|
-
writeStream.end();
|
|
317
|
-
yield new Promise((resolve, reject) => {
|
|
318
|
-
writeStream.on('finish', resolve);
|
|
319
|
-
writeStream.on('error', reject);
|
|
320
|
-
});
|
|
321
|
-
tempFiles.push(tempFile);
|
|
322
|
-
batch = [];
|
|
323
|
-
batchIndex++;
|
|
324
|
-
}
|
|
325
|
-
}
|
|
326
|
-
catch (error) {
|
|
327
|
-
Logger_1.default.log(`Error parsing line during sort: ${error}`);
|
|
328
|
-
}
|
|
329
|
-
}
|
|
330
|
-
}
|
|
331
|
-
catch (e_2_1) { e_2 = { error: e_2_1 }; }
|
|
332
|
-
finally {
|
|
333
|
-
try {
|
|
334
|
-
if (!_d && !_a && (_b = rl_2.return)) yield _b.call(rl_2);
|
|
335
|
-
}
|
|
336
|
-
finally { if (e_2) throw e_2.error; }
|
|
337
|
-
}
|
|
338
|
-
// Handle remaining items in the last batch
|
|
339
|
-
if (batch.length > 0) {
|
|
340
|
-
batch.sort(compareFn);
|
|
341
|
-
const tempFile = `${this._tempPath}_batch_${batchIndex}`;
|
|
342
|
-
this.ensureFile(tempFile);
|
|
343
|
-
const writeStream = (0, fs_2.createWriteStream)(tempFile);
|
|
344
|
-
for (const record of batch) {
|
|
345
|
-
writeStream.write(record.stringify() + '\n');
|
|
346
|
-
}
|
|
347
|
-
writeStream.end();
|
|
348
|
-
yield new Promise((resolve, reject) => {
|
|
349
|
-
writeStream.on('finish', resolve);
|
|
350
|
-
writeStream.on('error', reject);
|
|
351
|
-
});
|
|
352
|
-
tempFiles.push(tempFile);
|
|
353
|
-
}
|
|
354
|
-
rl.close();
|
|
355
|
-
readStream.close();
|
|
356
|
-
// Phase 2: Merge sorted batches using k-way merge
|
|
357
|
-
if (tempFiles.length === 0) {
|
|
358
|
-
this._finishOperation('sort');
|
|
359
|
-
return this;
|
|
360
|
-
}
|
|
361
|
-
if (tempFiles.length === 1) {
|
|
362
|
-
// Only one batch, just rename it
|
|
363
|
-
fs_1.default.renameSync(tempFiles[0], this._path);
|
|
364
|
-
}
|
|
365
|
-
else {
|
|
366
|
-
// Perform k-way merge
|
|
367
|
-
yield this._performKWayMergeSort(tempFiles, this._path, compareFn);
|
|
368
|
-
}
|
|
369
|
-
// Clean up temporary files
|
|
370
|
-
for (const tempFile of tempFiles) {
|
|
371
|
-
if (fs_1.default.existsSync(tempFile)) {
|
|
372
|
-
fs_1.default.unlinkSync(tempFile);
|
|
373
|
-
}
|
|
374
|
-
}
|
|
375
|
-
this._iterations++;
|
|
376
|
-
this._finishOperation('sort');
|
|
377
|
-
return this;
|
|
378
|
-
});
|
|
379
|
-
/**
|
|
380
|
-
* Convenience method to sort by a specific dimension
|
|
381
|
-
* @param dimensionName The name of the dimension to sort by
|
|
382
|
-
* @param ascending Whether to sort in ascending order (default: true)
|
|
383
|
-
*/
|
|
384
|
-
this.sortByDimension = (dimensionName_1, ...args_1) => __awaiter(this, [dimensionName_1, ...args_1], void 0, function* (dimensionName, ascending = true) {
|
|
385
|
-
const dimension = this._dimensions.find(d => d.name === dimensionName);
|
|
386
|
-
if (!dimension) {
|
|
387
|
-
throw new Error(`Dimension "${dimensionName}" not found. Available dimensions: ${this._dimensions.map(d => d.name).join(', ')}`);
|
|
388
|
-
}
|
|
389
|
-
const compareFn = (a, b) => {
|
|
390
|
-
const aValue = a.getValue(dimensionName);
|
|
391
|
-
const bValue = b.getValue(dimensionName);
|
|
392
|
-
// Handle null/undefined values
|
|
393
|
-
if (aValue == null && bValue == null)
|
|
394
|
-
return 0;
|
|
395
|
-
if (aValue == null)
|
|
396
|
-
return ascending ? -1 : 1;
|
|
397
|
-
if (bValue == null)
|
|
398
|
-
return ascending ? 1 : -1;
|
|
399
|
-
// Try to parse as numbers for numeric comparison
|
|
400
|
-
const aNum = Number(aValue);
|
|
401
|
-
const bNum = Number(bValue);
|
|
402
|
-
if (!isNaN(aNum) && !isNaN(bNum)) {
|
|
403
|
-
const result = aNum - bNum;
|
|
404
|
-
return ascending ? result : -result;
|
|
405
|
-
}
|
|
406
|
-
// String comparison
|
|
407
|
-
const aStr = String(aValue);
|
|
408
|
-
const bStr = String(bValue);
|
|
409
|
-
const result = aStr.localeCompare(bStr);
|
|
410
|
-
return ascending ? result : -result;
|
|
411
|
-
};
|
|
412
|
-
return this.sort(compareFn);
|
|
413
|
-
});
|
|
414
|
-
/**
|
|
415
|
-
* Remove duplicate records from the dataset using batched streaming
|
|
416
|
-
* @param keySelector Optional function to generate a key for comparison. If not provided, uses the entire record
|
|
417
|
-
*/
|
|
418
|
-
this.distinct = (keySelector) => __awaiter(this, void 0, void 0, function* () {
|
|
419
|
-
this._startOperation('distinct');
|
|
420
|
-
if (!fs_1.default.existsSync(this._path)) {
|
|
421
|
-
throw new Error(`File does not exist: ${this._path}`);
|
|
422
|
-
}
|
|
423
|
-
const seen = new Set();
|
|
424
|
-
yield this.transformStream((batch) => __awaiter(this, void 0, void 0, function* () {
|
|
425
|
-
const distinctBatch = [];
|
|
426
|
-
for (const record of batch) {
|
|
427
|
-
// Generate a key for uniqueness check
|
|
428
|
-
const recordKey = keySelector ? keySelector(record) : record.stringify();
|
|
429
|
-
if (!seen.has(recordKey)) {
|
|
430
|
-
seen.add(recordKey);
|
|
431
|
-
distinctBatch.push(record);
|
|
432
|
-
}
|
|
433
|
-
}
|
|
434
|
-
return distinctBatch;
|
|
435
|
-
}));
|
|
436
|
-
this._finishOperation('distinct');
|
|
437
|
-
return this;
|
|
438
|
-
});
|
|
439
|
-
/**
|
|
440
|
-
* Remove duplicate records based on specific dimensions
|
|
441
|
-
* @param dimensionNames Array of dimension names to use for uniqueness comparison
|
|
442
|
-
*/
|
|
443
|
-
this.distinctByDimensions = (dimensionNames) => __awaiter(this, void 0, void 0, function* () {
|
|
444
|
-
// Validate that all dimension names exist
|
|
445
|
-
const existingNames = this._dimensions.map(d => d.name);
|
|
446
|
-
const missingDimensions = dimensionNames.filter(name => !existingNames.includes(name));
|
|
447
|
-
(0, Affirm_1.default)(missingDimensions.length === 0, `Cannot create distinct by dimensions. Missing dimensions: ${missingDimensions.join(', ')}`);
|
|
448
|
-
const keySelector = (record) => {
|
|
449
|
-
const values = dimensionNames.map(name => {
|
|
450
|
-
const value = record.getValue(name);
|
|
451
|
-
return value !== null && value !== undefined ? String(value) : '';
|
|
452
|
-
});
|
|
453
|
-
return values.join('|'); // Use pipe as separator to avoid collisions
|
|
454
|
-
};
|
|
455
|
-
return this.distinct(keySelector);
|
|
456
|
-
});
|
|
457
|
-
/**
|
|
458
|
-
* Internal method to perform k-way merge of sorted files
|
|
459
|
-
*/
|
|
460
|
-
this._performKWayMergeSort = (tempFiles, outputPath, compareFn) => __awaiter(this, void 0, void 0, function* () {
|
|
461
|
-
const readers = [];
|
|
462
|
-
// Initialize readers for each temp file
|
|
463
|
-
for (const file of tempFiles) {
|
|
464
|
-
const readStream = (0, fs_2.createReadStream)(file);
|
|
465
|
-
const rl = (0, readline_1.createInterface)({ input: readStream, crlfDelay: Infinity });
|
|
466
|
-
const iterator = rl[Symbol.asyncIterator]();
|
|
467
|
-
readers.push({ file, rl, currentRecord: null, finished: false, iterator });
|
|
468
|
-
}
|
|
469
|
-
// Read first record from each file
|
|
470
|
-
for (const reader of readers) {
|
|
471
|
-
try {
|
|
472
|
-
const { value, done } = yield reader.iterator.next();
|
|
473
|
-
if (!done)
|
|
474
|
-
reader.currentRecord = new DatasetRecord_1.default(value, this._dimensions, this._delimiter);
|
|
475
|
-
else
|
|
476
|
-
reader.finished = true;
|
|
477
|
-
}
|
|
478
|
-
catch (_a) {
|
|
479
|
-
reader.finished = true;
|
|
480
|
-
}
|
|
481
|
-
}
|
|
482
|
-
// Write merged results
|
|
483
|
-
this.ensureFile(outputPath);
|
|
484
|
-
const writeStream = (0, fs_2.createWriteStream)(outputPath);
|
|
485
|
-
while (readers.some(r => !r.finished)) {
|
|
486
|
-
// Find the reader with the smallest current record
|
|
487
|
-
let minReader = null;
|
|
488
|
-
for (const reader of readers) {
|
|
489
|
-
if (!reader.finished && reader.currentRecord) {
|
|
490
|
-
if (!minReader || !minReader.currentRecord || compareFn(reader.currentRecord, minReader.currentRecord) < 0) {
|
|
491
|
-
minReader = reader;
|
|
492
|
-
}
|
|
493
|
-
}
|
|
494
|
-
}
|
|
495
|
-
if (minReader && minReader.currentRecord) {
|
|
496
|
-
// Write the smallest record
|
|
497
|
-
writeStream.write(minReader.currentRecord.stringify() + '\n');
|
|
498
|
-
// Read next record from the same reader
|
|
499
|
-
try {
|
|
500
|
-
const { value, done } = yield minReader.iterator.next();
|
|
501
|
-
if (!done) {
|
|
502
|
-
minReader.currentRecord = new DatasetRecord_1.default(value, this._dimensions, this._delimiter);
|
|
503
|
-
}
|
|
504
|
-
else {
|
|
505
|
-
minReader.finished = true;
|
|
506
|
-
minReader.currentRecord = null;
|
|
507
|
-
}
|
|
508
|
-
}
|
|
509
|
-
catch (_b) {
|
|
510
|
-
minReader.finished = true;
|
|
511
|
-
minReader.currentRecord = null;
|
|
512
|
-
}
|
|
513
|
-
}
|
|
514
|
-
}
|
|
515
|
-
writeStream.end();
|
|
516
|
-
yield new Promise((resolve, reject) => {
|
|
517
|
-
writeStream.on('finish', resolve);
|
|
518
|
-
writeStream.on('error', reject);
|
|
519
|
-
});
|
|
520
|
-
// Close all readers
|
|
521
|
-
for (const reader of readers) {
|
|
522
|
-
reader.rl.close();
|
|
523
|
-
}
|
|
524
|
-
});
|
|
525
|
-
/**
 * Stream through batches without modification (for reading).
 *
 * Reads the dataset file line by line, parses each line into a
 * DatasetRecord and invokes `processor(batch, batchIndex)` every time
 * `this._batchSize` records have accumulated, then once more for any
 * remaining partial batch. Lines that fail to parse are logged and
 * skipped. Throws when the dataset file does not exist.
 */
this.streamBatches = (processor) => __awaiter(this, void 0, void 0, function* () {
    var _a, e_3, _b, _c;
    if (!fs_1.default.existsSync(this._path)) {
        throw new Error(`File does not exist: ${this._path}`);
    }
    this._startOperation('stream-batches');
    const readStream = (0, fs_2.createReadStream)(this._path);
    const rl = (0, readline_1.createInterface)({
        input: readStream,
        crlfDelay: Infinity
    });
    let batch = [];
    let batchIndex = 0;
    let lineCount = 0;
    try {
        // Downlevel-compiled `for await (const line of rl)` (TypeScript
        // __asyncValues helper); the _a/_b/_c/_d temps drive the iterator.
        for (var _d = true, rl_3 = __asyncValues(rl), rl_3_1; rl_3_1 = yield rl_3.next(), _a = rl_3_1.done, !_a; _d = true) {
            _c = rl_3_1.value;
            _d = false;
            const line = _c;
            try {
                const record = new DatasetRecord_1.default(line, this._dimensions, this._delimiter);
                batch.push(record);
                lineCount++;
                // Flush a full batch to the processor, then start a new one.
                if (batch.length >= this._batchSize) {
                    yield processor(batch, batchIndex);
                    batch = [];
                    batchIndex++;
                }
            }
            catch (error) {
                // Unparsable lines are logged and skipped; streaming continues.
                Logger_1.default.log(`Error parsing line ${line}\n${lineCount}: ${error}`);
            }
        }
    }
    catch (e_3_1) { e_3 = { error: e_3_1 }; }
    finally {
        // Compiled iterator cleanup: close the async iterator, then rethrow
        // any error captured from the loop body.
        try {
            if (!_d && !_a && (_b = rl_3.return)) yield _b.call(rl_3);
        }
        finally { if (e_3) throw e_3.error; }
    }
    // Process remaining items in the last batch
    if (batch.length > 0) {
        yield processor(batch, batchIndex);
    }
    this._iterations++;
    this._finishOperation('stream-batches');
});
|
|
576
|
-
/**
 * Whether the dataset file currently exists on disk.
 * @returns {boolean} true when the backing file is present
 */
this.exists = () => {
    return fs_1.default.existsSync(this._path);
};
|
|
580
|
-
/**
 * Create the file if it doesn't exist, creating its parent directory
 * (recursively) first when necessary. Existing files are left untouched.
 */
this.ensureFile = (filePath) => {
    const parentDir = path_1.default.dirname(filePath);
    if (!fs_1.default.existsSync(parentDir))
        fs_1.default.mkdirSync(parentDir, { recursive: true });
    if (!fs_1.default.existsSync(filePath))
        fs_1.default.writeFileSync(filePath, '');
};
|
|
592
|
-
/**
 * Clear the file content (truncate to empty). No-op when the file
 * does not exist.
 * @returns this, for chaining
 */
this.clear = () => {
    const fileIsPresent = fs_1.default.existsSync(this._path);
    if (fileIsPresent)
        fs_1.default.writeFileSync(this._path, '');
    return this;
};
|
|
601
|
-
/**
 * Append data to the file.
 *
 * Serializes each item via `item.stringify()` and writes it as one
 * newline-terminated line in append mode, resolving only after the
 * stream has fully flushed (or rejecting on a stream error).
 * @returns this, for chaining
 */
this.append = (items) => __awaiter(this, void 0, void 0, function* () {
    this._startOperation('append');
    const out = (0, fs_2.createWriteStream)(this._path, { flags: 'a' });
    items.forEach(item => out.write(item.stringify() + '\n'));
    out.end();
    // Wait for the write stream to finish before reporting completion.
    yield new Promise((resolve, reject) => {
        out.on('finish', resolve);
        out.on('error', reject);
    });
    this._finishOperation('append');
    return this;
});
|
|
618
|
-
/**
 * Read a specified number of lines from the file.
 *
 * Parses up to `numberOfLines` lines into DatasetRecord instances and
 * returns them. Returns [] when the file is missing or the requested
 * count is not positive. Lines that fail to parse are logged, skipped,
 * and still counted toward the requested number.
 */
this.readLines = (numberOfLines) => __awaiter(this, void 0, void 0, function* () {
    var _a, e_4, _b, _c;
    if (!fs_1.default.existsSync(this._path))
        return [];
    if (numberOfLines <= 0)
        return [];
    this._startOperation('read-lines', { numberOfLines });
    const readStream = (0, fs_2.createReadStream)(this._path);
    const rl = (0, readline_1.createInterface)({
        input: readStream,
        crlfDelay: Infinity
    });
    const results = [];
    let lineCount = 0;
    try {
        // Downlevel-compiled `for await (const line of rl)` (TypeScript
        // __asyncValues helper); the _a/_b/_c/_d temps drive the iterator.
        for (var _d = true, rl_4 = __asyncValues(rl), rl_4_1; rl_4_1 = yield rl_4.next(), _a = rl_4_1.done, !_a; _d = true) {
            _c = rl_4_1.value;
            _d = false;
            const line = _c;
            try {
                const record = new DatasetRecord_1.default(line, this._dimensions, this._delimiter);
                results.push(record);
                lineCount++;
                // Stop as soon as enough lines have been consumed.
                if (lineCount >= numberOfLines) {
                    break;
                }
            }
            catch (error) {
                Logger_1.default.log(`Error parsing line ${line}\n${lineCount}: ${error}`);
                // A bad line still counts toward the requested total.
                lineCount++;
            }
        }
    }
    catch (e_4_1) { e_4 = { error: e_4_1 }; }
    finally {
        // Compiled iterator cleanup: close the async iterator (also runs on
        // `break`), then rethrow any error captured from the loop body.
        try {
            if (!_d && !_a && (_b = rl_4.return)) yield _b.call(rl_4);
        }
        finally { if (e_4) throw e_4.error; }
    }
    rl.close();
    readStream.close();
    this._finishOperation('read-lines');
    return results;
});
|
|
666
|
-
/** Current dimension descriptors of this dataset. */
this.getDimensions = () => {
    return this._dimensions;
};
/**
 * Replace the dimension descriptors wholesale.
 * @returns this, for chaining
 */
this.setDimensions = (dimensions) => {
    this._dimensions = dimensions;
    return this;
};
|
|
671
|
-
/**
 * Replace a single dimension, matched by the old dimension's `index`.
 * Asserts both arguments are provided and that a dimension with the old
 * index actually exists before splicing in the replacement.
 * @returns this, for chaining
 */
this.setSingleDimension = (newDimension, oldDimension) => {
    (0, Affirm_1.default)(newDimension, `Invalid new dimension`);
    (0, Affirm_1.default)(oldDimension, 'Invalid old dimension');
    const current = this._dimensions.findIndex(x => x.index === oldDimension.index);
    // BUG FIX: findIndex returns -1 (truthy) when nothing matches and 0
    // (falsy) for the first dimension, so asserting on the raw index both
    // missed genuinely-missing dimensions and rejected valid updates at
    // position 0. Assert on the sentinel explicitly instead.
    (0, Affirm_1.default)(current !== -1, `Trying to update a dataset dimension that doesn't exist: ${oldDimension.name} index ${oldDimension.index}`);
    this._dimensions.splice(current, 1, newDimension);
    return this;
};
|
|
679
|
-
/**
 * Update the record pool when dimensions change: pushes the current
 * dimensions and delimiter into every pooled record so recycled records
 * parse with the up-to-date schema.
 */
this._updateRecordPoolDimensions = () => {
    // Update all pooled records with current dimensions
    this._recordPool.updateDimensions(this._dimensions, this._delimiter);
};
|
|
686
|
-
/**
 * Print a quick console preview of the dataset: a header with name,
 * record count and iteration count, the dimension names, then the first
 * `count` records. With `full` set, logs whole record objects instead
 * of their serialized lines.
 */
this.print = (count = 3, full = false) => __awaiter(this, void 0, void 0, function* () {
    console.log(`DS ${this.name} (${this._count} | ${this._iterations})`);
    console.log(this._dimensions.map(x => x.name).join(this._delimiter));
    const preview = yield this.readLines(count);
    let index = 0;
    for (const record of preview) {
        console.log(`[${index}]`, full ? record : record.stringify());
        index++;
    }
    console.log('----------');
});
|
|
693
|
-
/**
 * Log aggregate operation statistics: dataset size, iteration count,
 * total elapsed time across all recorded operations, and the full
 * (nested) operation log as formatted JSON.
 */
this.printStats = () => {
    var _a;
    const elapsed = (_a = this._operations) === null || _a === void 0 ? void 0 : _a.map(x => x.elapsedMs);
    const total = (elapsed !== null && elapsed !== void 0 ? elapsed : []).reduce((sum, ms) => sum + ms, 0);
    console.log(`DS[stats] ${this.name} (size: ${this._count} | cycles: ${this._iterations} | ms: ${Helper_1.default.formatDuration(total)})`);
    console.log(`Operations: ${this._operations.length}`);
    console.log(JSON.stringify(this._operations, null, 4));
};
|
|
700
|
-
/**
 * Destroy the dataset by removing all allocated memory and created files:
 * the main dataset file, the temporary file, any leftover per-batch temp
 * files, and — best effort — the temp directory itself once empty.
 * @throws Error wrapping the underlying filesystem failure
 */
this.destroy = () => {
    this._startOperation('destroy');
    const removeIfPresent = (target) => {
        if (fs_1.default.existsSync(target))
            fs_1.default.unlinkSync(target);
    };
    try {
        removeIfPresent(this._path);
        removeIfPresent(this._tempPath);
        // Sweep any batch temporary files that might still exist.
        const tempDir = path_1.default.dirname(this._tempPath);
        if (fs_1.default.existsSync(tempDir)) {
            const batchPrefix = path_1.default.basename(this._tempPath) + '_batch_';
            fs_1.default.readdirSync(tempDir)
                .filter(file => file.startsWith(batchPrefix))
                .forEach(file => removeIfPresent(path_1.default.join(tempDir, file)));
            // Try to remove the temp directory if it's empty.
            try {
                if (fs_1.default.readdirSync(tempDir).length === 0)
                    fs_1.default.rmdirSync(tempDir);
            }
            catch (_a) {
                // Directory not empty or other error, ignore
            }
        }
        this._finishOperation('destroy');
    }
    catch (error) {
        this._finishOperation('destroy');
        throw new Error(`Failed to destroy dataset: ${error instanceof Error ? error.message : String(error)}`);
    }
};
|
|
742
|
-
/**
 * Begin tracking a named operation. `elapsedMs` temporarily holds the
 * start timestamp; `_finishOperation` later converts it into a duration.
 * The new entry nests under the currently running operation when one
 * exists, otherwise it is appended at the top level.
 */
this._startOperation = (name, metadata) => {
    const operation = {
        name,
        count: -1,
        // Start timestamp for now; replaced by the duration on finish.
        elapsedMs: performance.now(),
        status: 'running',
        subOperations: [],
        metadata
    };
    const parent = this._findRunningOperation();
    const bucket = parent ? parent.subOperations : this._operations;
    bucket.push(operation);
};
|
|
757
|
-
/**
 * Mark the running operation with the given name as completed, recording
 * the current dataset size and converting the stored start timestamp
 * into an elapsed duration. Warns (without throwing) when no matching
 * running operation exists.
 */
this._finishOperation = (name) => {
    const operation = this._findRunningOperation(name);
    if (!operation) {
        const running = this._operations.find(x => x.status === 'running');
        const currentName = running ? running.name : 'none';
        console.warn(`Finished operation "${name}" but no running operation with that name was found (current running: "${currentName}")`);
        return;
    }
    operation.status = 'completed';
    operation.count = this._count;
    operation.elapsedMs = performance.now() - operation.elapsedMs;
};
|
|
770
|
-
/**
 * Locate a running operation in the nested operation tree.
 * With a `name`, returns the first running operation matching it;
 * without one, returns the deepest currently-running operation.
 * Returns null when nothing is running.
 */
this._findRunningOperation = (name) => {
    const search = (ops) => {
        for (const op of ops) {
            const isMatch = op.status === 'running' && (name === undefined || op.name === name);
            if (isMatch) {
                // A specific name wins immediately.
                if (name !== undefined)
                    return op;
                // Otherwise prefer the deepest running descendant, falling
                // back to this operation itself.
                return search(op.subOperations) || op;
            }
            if (op.subOperations && op.subOperations.length > 0) {
                const nested = search(op.subOperations);
                if (nested)
                    return nested;
            }
        }
        return null;
    };
    return search(this._operations);
};
|
|
798
|
-
/** Size of the dataset file on disk, in megabytes. */
this._computeSize = () => {
    const bytes = fs_1.default.statSync(this._path).size;
    return bytes / (1024 * 1024);
};
|
|
799
|
-
const { name, baseProducer, file, batchSize, executionId } = options;
|
|
800
|
-
this.name = name;
|
|
801
|
-
this._file = file;
|
|
802
|
-
this._executionId = executionId;
|
|
803
|
-
this._baseProducer = baseProducer;
|
|
804
|
-
this._batchSize = (_a = batchSize !== null && batchSize !== void 0 ? batchSize : parseInt(Environment_1.default.get('MAX_ITEMS_IN_MEMORY'))) !== null && _a !== void 0 ? _a : Constants_1.default.defaults.MAX_ITEMS_IN_MEMORY;
|
|
805
|
-
this._dimensions = [];
|
|
806
|
-
this._firstLine = '';
|
|
807
|
-
this._delimiter = (_b = file.delimiter) !== null && _b !== void 0 ? _b : ',';
|
|
808
|
-
this._count = 0;
|
|
809
|
-
this._iterations = 0;
|
|
810
|
-
this._operations = [];
|
|
811
|
-
// Initialize record pool for optimization
|
|
812
|
-
this._recordPool = new DatasetRecordPool_1.default(this._batchSize);
|
|
813
|
-
const datasetName = this.name
|
|
814
|
-
.replace(/[^a-zA-Z0-9_-]/g, '_')
|
|
815
|
-
.replace(/_{2,}/g, '_')
|
|
816
|
-
.replace(/^_+|_+$/g, '')
|
|
817
|
-
.toLowerCase();
|
|
818
|
-
const execFolder = executionId ? path_1.default.join(datasetName, executionId) : datasetName;
|
|
819
|
-
this._path = path_1.default.join('./remora', Constants_1.default.defaults.PRODUCER_TEMP_FOLDER, execFolder, '.dataset');
|
|
820
|
-
this._tempPath = path_1.default.join('./remora/', Constants_1.default.defaults.PRODUCER_TEMP_FOLDER, execFolder, '.dataset_tmp');
|
|
821
|
-
this.ensureFile(this._path);
|
|
822
|
-
}
|
|
823
|
-
}
|
|
824
|
-
exports.default = Dataset;
|