@forzalabs/remora 1.0.21 → 1.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (111) hide show
  1. package/actions/automap.js +26 -42
  2. package/actions/compile.js +27 -43
  3. package/actions/create_consumer.js +24 -40
  4. package/actions/create_producer.js +16 -32
  5. package/actions/debug.js +18 -34
  6. package/actions/deploy.js +30 -46
  7. package/actions/discover.js +13 -29
  8. package/actions/init.js +29 -45
  9. package/actions/mock.js +16 -32
  10. package/actions/run.js +34 -52
  11. package/actions/sample.js +42 -58
  12. package/index.js +38 -43
  13. package/package.json +4 -4
  14. package/workers/ExecutorWorker.js +18 -32
  15. package/Constants.js +0 -34
  16. package/core/Affirm.js +0 -42
  17. package/core/Algo.js +0 -160
  18. package/core/dste/DSTE.js +0 -113
  19. package/core/logger/DebugLogService.js +0 -48
  20. package/core/logger/DevelopmentLogService.js +0 -70
  21. package/core/logger/LocalLogService.js +0 -70
  22. package/core/logger/Logger.js +0 -54
  23. package/database/DatabaseEngine.js +0 -149
  24. package/database/DatabaseStructure.js +0 -27
  25. package/definitions/DatasetDefinitions.js +0 -2
  26. package/definitions/ExecutorDefinitions.js +0 -2
  27. package/definitions/ProcessENV.js +0 -2
  28. package/definitions/agents/DestinationDriver.js +0 -2
  29. package/definitions/agents/SourceDriver.js +0 -2
  30. package/definitions/cli.js +0 -2
  31. package/definitions/database/ApiKeys.js +0 -2
  32. package/definitions/database/Stored.js +0 -7
  33. package/definitions/database/UsageStat.js +0 -2
  34. package/definitions/database/User.js +0 -2
  35. package/definitions/json_schemas/consumer-schema.json +0 -1226
  36. package/definitions/json_schemas/producer-schema.json +0 -308
  37. package/definitions/json_schemas/project-schema.json +0 -100
  38. package/definitions/json_schemas/source-schema.json +0 -249
  39. package/definitions/requests/ConsumerRequest.js +0 -2
  40. package/definitions/requests/Developer.js +0 -2
  41. package/definitions/requests/Mapping.js +0 -2
  42. package/definitions/requests/ProducerRequest.js +0 -2
  43. package/definitions/requests/Request.js +0 -2
  44. package/definitions/resources/Compiled.js +0 -2
  45. package/definitions/resources/Consumer.js +0 -2
  46. package/definitions/resources/Environment.js +0 -2
  47. package/definitions/resources/Library.js +0 -2
  48. package/definitions/resources/Producer.js +0 -2
  49. package/definitions/resources/Project.js +0 -2
  50. package/definitions/resources/Schema.js +0 -2
  51. package/definitions/resources/Source.js +0 -2
  52. package/definitions/temp.js +0 -2
  53. package/definitions/transform/Transformations.js +0 -2
  54. package/drivers/DeltaShareDriver.js +0 -186
  55. package/drivers/DriverFactory.js +0 -72
  56. package/drivers/DriverHelper.js +0 -248
  57. package/drivers/HttpApiDriver.js +0 -208
  58. package/drivers/RedshiftDriver.js +0 -184
  59. package/drivers/files/LocalDestinationDriver.js +0 -146
  60. package/drivers/files/LocalSourceDriver.js +0 -405
  61. package/drivers/s3/S3DestinationDriver.js +0 -197
  62. package/drivers/s3/S3SourceDriver.js +0 -495
  63. package/engines/CryptoEngine.js +0 -75
  64. package/engines/Environment.js +0 -170
  65. package/engines/ProcessENVManager.js +0 -83
  66. package/engines/RandomEngine.js +0 -47
  67. package/engines/SecretManager.js +0 -23
  68. package/engines/UserManager.js +0 -66
  69. package/engines/ai/AutoMapperEngine.js +0 -37
  70. package/engines/ai/DeveloperEngine.js +0 -497
  71. package/engines/ai/LLM.js +0 -255
  72. package/engines/consumer/ConsumerManager.js +0 -218
  73. package/engines/consumer/ConsumerOnFinishManager.js +0 -202
  74. package/engines/dataset/Dataset.js +0 -824
  75. package/engines/dataset/DatasetManager.js +0 -211
  76. package/engines/dataset/DatasetRecord.js +0 -120
  77. package/engines/dataset/DatasetRecordPool.js +0 -77
  78. package/engines/execution/RequestExecutor.js +0 -67
  79. package/engines/parsing/CSVParser.js +0 -60
  80. package/engines/parsing/LineParser.js +0 -71
  81. package/engines/parsing/ParseCompression.js +0 -101
  82. package/engines/parsing/ParseHelper.js +0 -18
  83. package/engines/parsing/ParseManager.js +0 -54
  84. package/engines/parsing/XLSParser.js +0 -87
  85. package/engines/parsing/XMLParser.js +0 -115
  86. package/engines/producer/ProducerEngine.js +0 -127
  87. package/engines/producer/ProducerManager.js +0 -43
  88. package/engines/scheduler/CronScheduler.js +0 -222
  89. package/engines/scheduler/QueueManager.js +0 -314
  90. package/engines/schema/SchemaValidator.js +0 -67
  91. package/engines/transform/JoinEngine.js +0 -232
  92. package/engines/transform/TransformationEngine.js +0 -277
  93. package/engines/transform/TypeCaster.js +0 -59
  94. package/engines/usage/DataframeManager.js +0 -55
  95. package/engines/usage/UsageDataManager.js +0 -151
  96. package/engines/usage/UsageManager.js +0 -65
  97. package/engines/validation/Validator.js +0 -216
  98. package/executors/ConsumerExecutor.js +0 -280
  99. package/executors/Executor.js +0 -177
  100. package/executors/ExecutorOrchestrator.js +0 -331
  101. package/executors/ExecutorPerformance.js +0 -17
  102. package/executors/ExecutorProgress.js +0 -54
  103. package/executors/ExecutorScope.js +0 -52
  104. package/executors/OutputExecutor.js +0 -118
  105. package/executors/ProducerExecutor.js +0 -108
  106. package/helper/Helper.js +0 -149
  107. package/helper/Logger.js +0 -84
  108. package/helper/Runtime.js +0 -20
  109. package/helper/Settings.js +0 -13
  110. package/licencing/LicenceManager.js +0 -64
  111. package/settings.js +0 -12
@@ -1,824 +0,0 @@
1
- "use strict";
2
- var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
3
- function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
4
- return new (P || (P = Promise))(function (resolve, reject) {
5
- function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
6
- function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
7
- function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
8
- step((generator = generator.apply(thisArg, _arguments || [])).next());
9
- });
10
- };
11
- var __asyncValues = (this && this.__asyncValues) || function (o) {
12
- if (!Symbol.asyncIterator) throw new TypeError("Symbol.asyncIterator is not defined.");
13
- var m = o[Symbol.asyncIterator], i;
14
- return m ? m.call(o) : (o = typeof __values === "function" ? __values(o) : o[Symbol.iterator](), i = {}, verb("next"), verb("throw"), verb("return"), i[Symbol.asyncIterator] = function () { return this; }, i);
15
- function verb(n) { i[n] = o[n] && function (v) { return new Promise(function (resolve, reject) { v = o[n](v), settle(resolve, reject, v.done, v.value); }); }; }
16
- function settle(resolve, reject, d, v) { Promise.resolve(v).then(function(v) { resolve({ value: v, done: d }); }, reject); }
17
- };
18
- var __importDefault = (this && this.__importDefault) || function (mod) {
19
- return (mod && mod.__esModule) ? mod : { "default": mod };
20
- };
21
- Object.defineProperty(exports, "__esModule", { value: true });
22
- const path_1 = __importDefault(require("path"));
23
- const fs_1 = __importDefault(require("fs"));
24
- const fs_2 = require("fs");
25
- const readline_1 = require("readline");
26
- const Constants_1 = __importDefault(require("../../Constants"));
27
- const DatasetManager_1 = __importDefault(require("./DatasetManager"));
28
- const DatasetRecord_1 = __importDefault(require("./DatasetRecord"));
29
- const DatasetRecordPool_1 = __importDefault(require("./DatasetRecordPool"));
30
- const Affirm_1 = __importDefault(require("../../core/Affirm"));
31
- const DriverFactory_1 = __importDefault(require("../../drivers/DriverFactory"));
32
- const Helper_1 = __importDefault(require("../../helper/Helper"));
33
- const Algo_1 = __importDefault(require("../../core/Algo"));
34
- const Environment_1 = __importDefault(require("../Environment"));
35
- const Logger_1 = __importDefault(require("../../helper/Logger"));
36
- const ProducerManager_1 = __importDefault(require("../producer/ProducerManager"));
37
- class Dataset {
38
- constructor(options) {
39
- var _a, _b;
40
- this.getPath = () => this._path;
41
- this.setPath = (path) => {
42
- this._path = path;
43
- return this;
44
- };
45
- this.getFile = () => this._file;
46
- this.getExecutionId = () => this._executionId;
47
- this.getBatchSize = () => this._batchSize;
48
- this.setFirstLine = (firstLine) => {
49
- this._firstLine = firstLine;
50
- return this;
51
- };
52
- this.getFirstLine = () => this._firstLine;
53
- this.getCount = () => this._count;
54
- this.setCount = (count) => {
55
- this._count = count;
56
- return this;
57
- };
58
- this.getCycles = () => this._iterations;
59
- this.getDelimiter = () => this._delimiter;
60
- this.setDelimiter = (delimiter) => {
61
- this._delimiter = delimiter;
62
- return this;
63
- };
64
- this.getOperations = () => this._operations;
65
- this.load = (source) => __awaiter(this, void 0, void 0, function* () {
66
- (0, Affirm_1.default)(source, 'Invalid source');
67
- this._startOperation('load', { source: source.engine });
68
- try {
69
- const driver = yield DriverFactory_1.default.instantiateSource(source);
70
- yield driver.download(this);
71
- }
72
- catch (error) {
73
- if (this._file.isOptional) {
74
- Logger_1.default.log(`Error loading dataset "${this.name}", creating default configuration and mock data because "isOptional" is true.`);
75
- if (!this.getDimensions() || this.getDimensions().length === 0)
76
- this.setDimensions(ProducerManager_1.default.getColumns(this._baseProducer).map((x, i) => { var _a; return ({ index: i, key: x.nameInProducer, name: x.aliasInProducer, hidden: false, type: (_a = x.dimension) === null || _a === void 0 ? void 0 : _a.type }); }));
77
- if (!this.getFirstLine() || this.getFirstLine().length === 0) {
78
- if (this._file.hasHeaderRow)
79
- this.setFirstLine(this.getDimensions().map(x => x.key).join(this.getDelimiter()));
80
- else
81
- this.setFirstLine('');
82
- }
83
- }
84
- else
85
- throw error;
86
- }
87
- this._size = this._computeSize();
88
- this._finishOperation('load');
89
- return this;
90
- });
91
- /**
92
- * Load data from an in-memory array of objects and create a local dataset file
93
- * @param data Array of objects to load into the dataset
94
- * @param dimensions Optional dimensions array. If not provided, will be inferred from the first object
95
- * @param delimiter Optional delimiter. Defaults to comma
96
- */
97
- this.loadFromMemory = (data_1, producer_1, ...args_1) => __awaiter(this, [data_1, producer_1, ...args_1], void 0, function* (data, producer, discover = false) {
98
- var _a, _b;
99
- (0, Affirm_1.default)(data, 'Invalid data array');
100
- (0, Affirm_1.default)(Array.isArray(data), 'Data must be an array');
101
- if (data.length === 0) {
102
- console.warn('Loading empty array into dataset');
103
- return this;
104
- }
105
- this._startOperation('load-from-memory', { recordCount: data.length });
106
- try {
107
- this._delimiter = (_b = (_a = producer.settings) === null || _a === void 0 ? void 0 : _a.delimiter) !== null && _b !== void 0 ? _b : this._delimiter;
108
- // Discover the dimensions on your own
109
- const firstItem = data[0];
110
- const firstLine = typeof firstItem === 'object' ? JSON.stringify(firstItem) : String(firstItem);
111
- const buildRes = yield DatasetManager_1.default.buildDimensionsFromFirstLine(firstLine, this._file, producer, discover);
112
- this._dimensions = buildRes.dimensions;
113
- this._updateRecordPoolDimensions();
114
- // Clear existing file content
115
- this.clear();
116
- // Convert objects to DatasetRecord format and write to file
117
- const records = [];
118
- for (const item of data) {
119
- // Create a row string by extracting values in dimension order
120
- const values = this._dimensions.map(dim => {
121
- const value = item[dim.key];
122
- // Handle null/undefined values
123
- return value !== null && value !== undefined ? String(value) : '';
124
- });
125
- const rowString = values.join(this._delimiter);
126
- const record = new DatasetRecord_1.default(rowString, this._dimensions, this._delimiter);
127
- records.push(record);
128
- }
129
- // Write all records to the dataset file
130
- yield this.append(records);
131
- this._size = this._computeSize();
132
- this._count = data.length;
133
- this._finishOperation('load-from-memory');
134
- return this;
135
- }
136
- catch (error) {
137
- this._finishOperation('load-from-memory');
138
- throw new Error(`Failed to load data from memory: ${error instanceof Error ? error.message : String(error)}`);
139
- }
140
- });
141
- /**
142
- * Stream through the file in batches and apply a transformation
143
- */
144
- this.transformStream = (transformer_1, ...args_1) => __awaiter(this, [transformer_1, ...args_1], void 0, function* (transformer, options = {}) {
145
- var _a, e_1, _b, _c;
146
- var _d, _e, _f, _g;
147
- const inputPath = options.inputPath || this._path;
148
- const outputPath = options.outputPath || this._tempPath;
149
- const fromLine = (_e = (_d = options.range) === null || _d === void 0 ? void 0 : _d.fromLine) !== null && _e !== void 0 ? _e : -1;
150
- const toLine = (_g = (_f = options.range) === null || _f === void 0 ? void 0 : _f.toLine) !== null && _g !== void 0 ? _g : Infinity;
151
- this.ensureFile(outputPath);
152
- if (!fs_1.default.existsSync(inputPath))
153
- throw new Error(`Input file does not exist: ${inputPath}`);
154
- this._startOperation('transform-stream');
155
- const readStream = (0, fs_2.createReadStream)(inputPath);
156
- const writeStream = (0, fs_2.createWriteStream)(outputPath);
157
- const rl = (0, readline_1.createInterface)({ input: readStream, crlfDelay: Infinity });
158
- const dimensions = Algo_1.default.deepClone(this._dimensions);
159
- let batch = [];
160
- let lineCount = 0;
161
- let index = 0;
162
- try {
163
- for (var _h = true, rl_1 = __asyncValues(rl), rl_1_1; rl_1_1 = yield rl_1.next(), _a = rl_1_1.done, !_a; _h = true) {
164
- _c = rl_1_1.value;
165
- _h = false;
166
- const line = _c;
167
- try {
168
- if (index < fromLine) {
169
- index++;
170
- continue;
171
- }
172
- else if (index >= toLine) {
173
- index++;
174
- break;
175
- }
176
- index++;
177
- // Reuse record from pool and reinitialize it with new line data
178
- const record = this._recordPool.getNext(line, dimensions, this._delimiter);
179
- batch.push(record);
180
- lineCount++;
181
- if (batch.length >= this._batchSize) {
182
- const transformedBatch = yield transformer(batch);
183
- for (const transformedRecord of transformedBatch) {
184
- writeStream.write(transformedRecord.stringify() + '\n');
185
- }
186
- batch = [];
187
- this._recordPool.reset();
188
- }
189
- }
190
- catch (error) {
191
- Logger_1.default.log(`Error parsing line ${line}\n${lineCount}: ${error}`);
192
- lineCount++;
193
- }
194
- }
195
- }
196
- catch (e_1_1) { e_1 = { error: e_1_1 }; }
197
- finally {
198
- try {
199
- if (!_h && !_a && (_b = rl_1.return)) yield _b.call(rl_1);
200
- }
201
- finally { if (e_1) throw e_1.error; }
202
- }
203
- // Process remaining items in the last batch
204
- if (batch.length > 0) {
205
- const transformedBatch = yield transformer(batch);
206
- for (const transformedRecord of transformedBatch) {
207
- writeStream.write(transformedRecord.stringify() + '\n');
208
- }
209
- }
210
- writeStream.end();
211
- // Wait for write stream to finish
212
- yield new Promise((resolve, reject) => {
213
- writeStream.on('finish', resolve);
214
- writeStream.on('error', reject);
215
- });
216
- // Replace original file with transformed file
217
- if (outputPath === this._tempPath) {
218
- fs_1.default.renameSync(this._tempPath, this._path);
219
- }
220
- this._count = lineCount;
221
- this._size = this._computeSize();
222
- this._iterations++;
223
- this._finishOperation('transform-stream');
224
- });
225
- /**
226
- * Filter items in the file using batched streaming
227
- */
228
- this.filter = (predicate_1, ...args_1) => __awaiter(this, [predicate_1, ...args_1], void 0, function* (predicate, options = {}) {
229
- this._startOperation('filter');
230
- let globalIndex = 0;
231
- yield this.transformStream((batch) => __awaiter(this, void 0, void 0, function* () {
232
- const filteredBatch = [];
233
- for (const item of batch) {
234
- if (predicate(item, globalIndex)) {
235
- filteredBatch.push(item);
236
- }
237
- globalIndex++;
238
- }
239
- return filteredBatch;
240
- }), options);
241
- this._finishOperation('filter');
242
- return this;
243
- });
244
- /**
245
- * Map items in the file using batched streaming
246
- */
247
- this.map = (mapper_1, ...args_1) => __awaiter(this, [mapper_1, ...args_1], void 0, function* (mapper, options = {}) {
248
- this._startOperation('map');
249
- let globalIndex = 0;
250
- yield this.transformStream((batch) => __awaiter(this, void 0, void 0, function* () {
251
- const mappedBatch = [];
252
- for (const item of batch) {
253
- const mappedItem = yield mapper(item, globalIndex);
254
- mappedBatch.push(mappedItem);
255
- globalIndex++;
256
- }
257
- return mappedBatch;
258
- }), options);
259
- this._finishOperation('map');
260
- return this;
261
- });
262
- /**
263
- * FlatMap items in the file using batched streaming - maps each item to an array of items and flattens the result
264
- */
265
- this.flatMap = (mapper_1, ...args_1) => __awaiter(this, [mapper_1, ...args_1], void 0, function* (mapper, options = {}) {
266
- this._startOperation('flat-map');
267
- let globalIndex = 0;
268
- yield this.transformStream((batch) => __awaiter(this, void 0, void 0, function* () {
269
- const flatMappedBatch = [];
270
- for (const item of batch) {
271
- const mappedItems = yield mapper(item, globalIndex);
272
- flatMappedBatch.push(...mappedItems);
273
- globalIndex++;
274
- }
275
- return flatMappedBatch;
276
- }), options);
277
- this._finishOperation('flat-map');
278
- return this;
279
- });
280
- /**
281
- * Sort the dataset by one or more dimensions using batched streaming with external merge sort
282
- * @param compareFn Comparison function that takes two DatasetRecord objects and returns a number
283
- * @param options Optional parameters for sorting
284
- */
285
- this.sort = (compareFn_1, ...args_1) => __awaiter(this, [compareFn_1, ...args_1], void 0, function* (compareFn, options = {}) {
286
- var _a, e_2, _b, _c;
287
- this._startOperation('sort');
288
- const { batchSize = this._batchSize } = options;
289
- if (!fs_1.default.existsSync(this._path)) {
290
- throw new Error(`File does not exist: ${this._path}`);
291
- }
292
- // Phase 1: Sort individual batches and write them to temporary files
293
- const tempFiles = [];
294
- const readStream = (0, fs_2.createReadStream)(this._path);
295
- const rl = (0, readline_1.createInterface)({ input: readStream, crlfDelay: Infinity });
296
- let batch = [];
297
- let batchIndex = 0;
298
- try {
299
- for (var _d = true, rl_2 = __asyncValues(rl), rl_2_1; rl_2_1 = yield rl_2.next(), _a = rl_2_1.done, !_a; _d = true) {
300
- _c = rl_2_1.value;
301
- _d = false;
302
- const line = _c;
303
- try {
304
- const record = new DatasetRecord_1.default(line, this._dimensions, this._delimiter);
305
- batch.push(record);
306
- if (batch.length >= batchSize) {
307
- // Sort the batch
308
- batch.sort(compareFn);
309
- // Write sorted batch to temporary file
310
- const tempFile = `${this._tempPath}_batch_${batchIndex}`;
311
- this.ensureFile(tempFile);
312
- const writeStream = (0, fs_2.createWriteStream)(tempFile);
313
- for (const record of batch) {
314
- writeStream.write(record.stringify() + '\n');
315
- }
316
- writeStream.end();
317
- yield new Promise((resolve, reject) => {
318
- writeStream.on('finish', resolve);
319
- writeStream.on('error', reject);
320
- });
321
- tempFiles.push(tempFile);
322
- batch = [];
323
- batchIndex++;
324
- }
325
- }
326
- catch (error) {
327
- Logger_1.default.log(`Error parsing line during sort: ${error}`);
328
- }
329
- }
330
- }
331
- catch (e_2_1) { e_2 = { error: e_2_1 }; }
332
- finally {
333
- try {
334
- if (!_d && !_a && (_b = rl_2.return)) yield _b.call(rl_2);
335
- }
336
- finally { if (e_2) throw e_2.error; }
337
- }
338
- // Handle remaining items in the last batch
339
- if (batch.length > 0) {
340
- batch.sort(compareFn);
341
- const tempFile = `${this._tempPath}_batch_${batchIndex}`;
342
- this.ensureFile(tempFile);
343
- const writeStream = (0, fs_2.createWriteStream)(tempFile);
344
- for (const record of batch) {
345
- writeStream.write(record.stringify() + '\n');
346
- }
347
- writeStream.end();
348
- yield new Promise((resolve, reject) => {
349
- writeStream.on('finish', resolve);
350
- writeStream.on('error', reject);
351
- });
352
- tempFiles.push(tempFile);
353
- }
354
- rl.close();
355
- readStream.close();
356
- // Phase 2: Merge sorted batches using k-way merge
357
- if (tempFiles.length === 0) {
358
- this._finishOperation('sort');
359
- return this;
360
- }
361
- if (tempFiles.length === 1) {
362
- // Only one batch, just rename it
363
- fs_1.default.renameSync(tempFiles[0], this._path);
364
- }
365
- else {
366
- // Perform k-way merge
367
- yield this._performKWayMergeSort(tempFiles, this._path, compareFn);
368
- }
369
- // Clean up temporary files
370
- for (const tempFile of tempFiles) {
371
- if (fs_1.default.existsSync(tempFile)) {
372
- fs_1.default.unlinkSync(tempFile);
373
- }
374
- }
375
- this._iterations++;
376
- this._finishOperation('sort');
377
- return this;
378
- });
379
- /**
380
- * Convenience method to sort by a specific dimension
381
- * @param dimensionName The name of the dimension to sort by
382
- * @param ascending Whether to sort in ascending order (default: true)
383
- */
384
- this.sortByDimension = (dimensionName_1, ...args_1) => __awaiter(this, [dimensionName_1, ...args_1], void 0, function* (dimensionName, ascending = true) {
385
- const dimension = this._dimensions.find(d => d.name === dimensionName);
386
- if (!dimension) {
387
- throw new Error(`Dimension "${dimensionName}" not found. Available dimensions: ${this._dimensions.map(d => d.name).join(', ')}`);
388
- }
389
- const compareFn = (a, b) => {
390
- const aValue = a.getValue(dimensionName);
391
- const bValue = b.getValue(dimensionName);
392
- // Handle null/undefined values
393
- if (aValue == null && bValue == null)
394
- return 0;
395
- if (aValue == null)
396
- return ascending ? -1 : 1;
397
- if (bValue == null)
398
- return ascending ? 1 : -1;
399
- // Try to parse as numbers for numeric comparison
400
- const aNum = Number(aValue);
401
- const bNum = Number(bValue);
402
- if (!isNaN(aNum) && !isNaN(bNum)) {
403
- const result = aNum - bNum;
404
- return ascending ? result : -result;
405
- }
406
- // String comparison
407
- const aStr = String(aValue);
408
- const bStr = String(bValue);
409
- const result = aStr.localeCompare(bStr);
410
- return ascending ? result : -result;
411
- };
412
- return this.sort(compareFn);
413
- });
414
- /**
415
- * Remove duplicate records from the dataset using batched streaming
416
- * @param keySelector Optional function to generate a key for comparison. If not provided, uses the entire record
417
- */
418
- this.distinct = (keySelector) => __awaiter(this, void 0, void 0, function* () {
419
- this._startOperation('distinct');
420
- if (!fs_1.default.existsSync(this._path)) {
421
- throw new Error(`File does not exist: ${this._path}`);
422
- }
423
- const seen = new Set();
424
- yield this.transformStream((batch) => __awaiter(this, void 0, void 0, function* () {
425
- const distinctBatch = [];
426
- for (const record of batch) {
427
- // Generate a key for uniqueness check
428
- const recordKey = keySelector ? keySelector(record) : record.stringify();
429
- if (!seen.has(recordKey)) {
430
- seen.add(recordKey);
431
- distinctBatch.push(record);
432
- }
433
- }
434
- return distinctBatch;
435
- }));
436
- this._finishOperation('distinct');
437
- return this;
438
- });
439
- /**
440
- * Remove duplicate records based on specific dimensions
441
- * @param dimensionNames Array of dimension names to use for uniqueness comparison
442
- */
443
- this.distinctByDimensions = (dimensionNames) => __awaiter(this, void 0, void 0, function* () {
444
- // Validate that all dimension names exist
445
- const existingNames = this._dimensions.map(d => d.name);
446
- const missingDimensions = dimensionNames.filter(name => !existingNames.includes(name));
447
- (0, Affirm_1.default)(missingDimensions.length === 0, `Cannot create distinct by dimensions. Missing dimensions: ${missingDimensions.join(', ')}`);
448
- const keySelector = (record) => {
449
- const values = dimensionNames.map(name => {
450
- const value = record.getValue(name);
451
- return value !== null && value !== undefined ? String(value) : '';
452
- });
453
- return values.join('|'); // Use pipe as separator to avoid collisions
454
- };
455
- return this.distinct(keySelector);
456
- });
457
- /**
458
- * Internal method to perform k-way merge of sorted files
459
- */
460
- this._performKWayMergeSort = (tempFiles, outputPath, compareFn) => __awaiter(this, void 0, void 0, function* () {
461
- const readers = [];
462
- // Initialize readers for each temp file
463
- for (const file of tempFiles) {
464
- const readStream = (0, fs_2.createReadStream)(file);
465
- const rl = (0, readline_1.createInterface)({ input: readStream, crlfDelay: Infinity });
466
- const iterator = rl[Symbol.asyncIterator]();
467
- readers.push({ file, rl, currentRecord: null, finished: false, iterator });
468
- }
469
- // Read first record from each file
470
- for (const reader of readers) {
471
- try {
472
- const { value, done } = yield reader.iterator.next();
473
- if (!done)
474
- reader.currentRecord = new DatasetRecord_1.default(value, this._dimensions, this._delimiter);
475
- else
476
- reader.finished = true;
477
- }
478
- catch (_a) {
479
- reader.finished = true;
480
- }
481
- }
482
- // Write merged results
483
- this.ensureFile(outputPath);
484
- const writeStream = (0, fs_2.createWriteStream)(outputPath);
485
- while (readers.some(r => !r.finished)) {
486
- // Find the reader with the smallest current record
487
- let minReader = null;
488
- for (const reader of readers) {
489
- if (!reader.finished && reader.currentRecord) {
490
- if (!minReader || !minReader.currentRecord || compareFn(reader.currentRecord, minReader.currentRecord) < 0) {
491
- minReader = reader;
492
- }
493
- }
494
- }
495
- if (minReader && minReader.currentRecord) {
496
- // Write the smallest record
497
- writeStream.write(minReader.currentRecord.stringify() + '\n');
498
- // Read next record from the same reader
499
- try {
500
- const { value, done } = yield minReader.iterator.next();
501
- if (!done) {
502
- minReader.currentRecord = new DatasetRecord_1.default(value, this._dimensions, this._delimiter);
503
- }
504
- else {
505
- minReader.finished = true;
506
- minReader.currentRecord = null;
507
- }
508
- }
509
- catch (_b) {
510
- minReader.finished = true;
511
- minReader.currentRecord = null;
512
- }
513
- }
514
- }
515
- writeStream.end();
516
- yield new Promise((resolve, reject) => {
517
- writeStream.on('finish', resolve);
518
- writeStream.on('error', reject);
519
- });
520
- // Close all readers
521
- for (const reader of readers) {
522
- reader.rl.close();
523
- }
524
- });
525
- /**
526
- * Stream through batches without modification (for reading)
527
- */
528
- this.streamBatches = (processor) => __awaiter(this, void 0, void 0, function* () {
529
- var _a, e_3, _b, _c;
530
- if (!fs_1.default.existsSync(this._path)) {
531
- throw new Error(`File does not exist: ${this._path}`);
532
- }
533
- this._startOperation('stream-batches');
534
- const readStream = (0, fs_2.createReadStream)(this._path);
535
- const rl = (0, readline_1.createInterface)({
536
- input: readStream,
537
- crlfDelay: Infinity
538
- });
539
- let batch = [];
540
- let batchIndex = 0;
541
- let lineCount = 0;
542
- try {
543
- for (var _d = true, rl_3 = __asyncValues(rl), rl_3_1; rl_3_1 = yield rl_3.next(), _a = rl_3_1.done, !_a; _d = true) {
544
- _c = rl_3_1.value;
545
- _d = false;
546
- const line = _c;
547
- try {
548
- const record = new DatasetRecord_1.default(line, this._dimensions, this._delimiter);
549
- batch.push(record);
550
- lineCount++;
551
- if (batch.length >= this._batchSize) {
552
- yield processor(batch, batchIndex);
553
- batch = [];
554
- batchIndex++;
555
- }
556
- }
557
- catch (error) {
558
- Logger_1.default.log(`Error parsing line ${line}\n${lineCount}: ${error}`);
559
- }
560
- }
561
- }
562
- catch (e_3_1) { e_3 = { error: e_3_1 }; }
563
- finally {
564
- try {
565
- if (!_d && !_a && (_b = rl_3.return)) yield _b.call(rl_3);
566
- }
567
- finally { if (e_3) throw e_3.error; }
568
- }
569
- // Process remaining items in the last batch
570
- if (batch.length > 0) {
571
- yield processor(batch, batchIndex);
572
- }
573
- this._iterations++;
574
- this._finishOperation('stream-batches');
575
- });
576
- /**
577
- * Check if file exists
578
- */
579
- this.exists = () => fs_1.default.existsSync(this._path);
580
- /**
581
- * Create the file if it doesn't exist
582
- */
583
- this.ensureFile = (filePath) => {
584
- const dir = path_1.default.dirname(filePath);
585
- if (!fs_1.default.existsSync(dir)) {
586
- fs_1.default.mkdirSync(dir, { recursive: true });
587
- }
588
- if (!fs_1.default.existsSync(filePath)) {
589
- fs_1.default.writeFileSync(filePath, '');
590
- }
591
- };
592
- /**
593
- * Clear the file content
594
- */
595
- this.clear = () => {
596
- if (fs_1.default.existsSync(this._path)) {
597
- fs_1.default.writeFileSync(this._path, '');
598
- }
599
- return this;
600
- };
601
- /**
602
- * Append data to the file
603
- */
604
- this.append = (items) => __awaiter(this, void 0, void 0, function* () {
605
- this._startOperation('append');
606
- const writeStream = (0, fs_2.createWriteStream)(this._path, { flags: 'a' });
607
- for (const item of items) {
608
- writeStream.write(item.stringify() + '\n');
609
- }
610
- writeStream.end();
611
- yield new Promise((resolve, reject) => {
612
- writeStream.on('finish', resolve);
613
- writeStream.on('error', reject);
614
- });
615
- this._finishOperation('append');
616
- return this;
617
- });
618
/**
 * Read up to `numberOfLines` records from the start of the dataset file.
 * Returns [] when the file does not exist or the requested count is <= 0.
 * Lines that fail to parse are logged and skipped (the line counter still
 * advances), so the result may contain fewer records than requested.
 * The _a/_b/_c/_d and rl_4 variables below are compiler-generated
 * `for await...of` machinery — do not hand-edit them.
 */
this.readLines = (numberOfLines) => __awaiter(this, void 0, void 0, function* () {
    var _a, e_4, _b, _c;
    if (!fs_1.default.existsSync(this._path))
        return [];
    if (numberOfLines <= 0)
        return [];
    this._startOperation('read-lines', { numberOfLines });
    const readStream = (0, fs_2.createReadStream)(this._path);
    // crlfDelay: Infinity makes readline treat \r\n as a single line break.
    const rl = (0, readline_1.createInterface)({
        input: readStream,
        crlfDelay: Infinity
    });
    const results = [];
    let lineCount = 0;
    try {
        // Generated async-iteration protocol: pulls one line per iteration.
        for (var _d = true, rl_4 = __asyncValues(rl), rl_4_1; rl_4_1 = yield rl_4.next(), _a = rl_4_1.done, !_a; _d = true) {
            _c = rl_4_1.value;
            _d = false;
            const line = _c;
            try {
                const record = new DatasetRecord_1.default(line, this._dimensions, this._delimiter);
                results.push(record);
                lineCount++;
                if (lineCount >= numberOfLines) {
                    break;
                }
            }
            catch (error) {
                // Parse failures are non-fatal: log and keep reading.
                // NOTE(review): this message interpolates the raw line content
                // before the counter ("line <content>\n<count>"); it may have
                // been meant to log the line number first — confirm before
                // changing the string.
                Logger_1.default.log(`Error parsing line ${line}\n${lineCount}: ${error}`);
                lineCount++;
            }
        }
    }
    catch (e_4_1) { e_4 = { error: e_4_1 }; }
    finally {
        // Generated cleanup: close the async iterator if the loop exited
        // early, then rethrow any deferred iteration error.
        try {
            if (!_d && !_a && (_b = rl_4.return)) yield _b.call(rl_4);
        }
        finally { if (e_4) throw e_4.error; }
    }
    rl.close();
    readStream.close();
    this._finishOperation('read-lines');
    return results;
});
666
- this.getDimensions = () => this._dimensions;
667
/**
 * Replace the full dimension list with the one provided.
 * @returns this, for chaining
 */
this.setDimensions = (dimensions) => {
    this._dimensions = dimensions;
    return this;
};
671
/**
 * Replace a single dimension, matching on the old dimension's `index` field.
 * @param newDimension the dimension to insert
 * @param oldDimension the existing dimension to replace (matched by .index)
 * @returns this, for chaining
 * @throws via Affirm when either argument is missing or the old dimension
 *         is not present in the current dimension list
 */
this.setSingleDimension = (newDimension, oldDimension) => {
    (0, Affirm_1.default)(newDimension, `Invalid new dimension`);
    (0, Affirm_1.default)(oldDimension, 'Invalid old dimension');
    const current = this._dimensions.findIndex(x => x.index === oldDimension.index);
    // BUGFIX: findIndex returns -1 (truthy) when not found and 0 (falsy) for
    // the first dimension, so the raw index must not be passed to Affirm —
    // previously a valid match at position 0 failed the assertion while a
    // missing dimension silently spliced the last element (index -1).
    (0, Affirm_1.default)(current !== -1, `Trying to update a dataset dimension that doesn't exist: ${oldDimension.name} index ${oldDimension.index}`);
    this._dimensions.splice(current, 1, newDimension);
    return this;
};
679
/**
 * Propagate the current dimensions and delimiter to the record pool so
 * pooled/reused records parse lines with up-to-date metadata.
 */
this._updateRecordPoolDimensions = () => {
    this._recordPool.updateDimensions(this._dimensions, this._delimiter);
};
686
/**
 * Print a short human-readable preview of the dataset to stdout:
 * header (name, count, iterations), dimension names, then up to `count`
 * records and a separator line.
 * @param count number of records to preview (default 3)
 * @param full  when true logs the record objects; otherwise their string form
 */
this.print = (...args_1) => __awaiter(this, [...args_1], void 0, function* (count = 3, full = false) {
    console.log(`DS ${this.name} (${this._count} | ${this._iterations})`);
    console.log(this._dimensions.map(x => x.name).join(this._delimiter));
    const preview = yield this.readLines(count);
    let index = 0;
    for (const record of preview) {
        console.log(`[${index}]`, full ? record : record.stringify());
        index++;
    }
    console.log('----------');
});
693
/**
 * Log aggregate operation statistics (record count, cycles, total elapsed
 * time) followed by the full operation tree as pretty-printed JSON.
 */
this.printStats = () => {
    const ops = this._operations;
    const durations = (ops === null || ops === void 0) ? [] : ops.map(x => x.elapsedMs);
    const total = durations.reduce((sum, ms) => sum + ms, 0);
    console.log(`DS[stats] ${this.name} (size: ${this._count} | cycles: ${this._iterations} | ms: ${Helper_1.default.formatDuration(total)})`);
    console.log(`Operations: ${this._operations.length}`);
    console.log(JSON.stringify(this._operations, null, 4));
};
700
/**
 * Destroy the dataset: delete the main file, the temp file, any leftover
 * "<temp>_batch_*" files, and (best effort) the temp directory when empty.
 * The 'destroy' operation is finished on both success and failure paths.
 * @throws Error wrapping the underlying filesystem failure
 */
this.destroy = () => {
    this._startOperation('destroy');
    // Local helper: unlink a path only when it exists.
    const removeIfExists = (target) => {
        if (fs_1.default.existsSync(target)) {
            fs_1.default.unlinkSync(target);
        }
    };
    try {
        removeIfExists(this._path);
        removeIfExists(this._tempPath);
        const tempDir = path_1.default.dirname(this._tempPath);
        if (fs_1.default.existsSync(tempDir)) {
            const batchPrefix = path_1.default.basename(this._tempPath) + '_batch_';
            const leftovers = fs_1.default.readdirSync(tempDir).filter(file => file.startsWith(batchPrefix));
            for (const leftover of leftovers) {
                removeIfExists(path_1.default.join(tempDir, leftover));
            }
            // Best-effort removal of the (possibly now empty) temp directory.
            try {
                if (fs_1.default.readdirSync(tempDir).length === 0) {
                    fs_1.default.rmdirSync(tempDir);
                }
            }
            catch (_a) {
                // Directory not empty or not removable: intentionally ignored.
            }
        }
        this._finishOperation('destroy');
    }
    catch (error) {
        this._finishOperation('destroy');
        throw new Error(`Failed to destroy dataset: ${error instanceof Error ? error.message : String(error)}`);
    }
};
742
/**
 * Begin tracking a named operation. `elapsedMs` temporarily holds the start
 * timestamp; _finishOperation later converts it into a duration. When an
 * operation is already running, the new one is nested under the deepest
 * running operation; otherwise it is appended at the top level.
 */
this._startOperation = (name, metadata) => {
    const operation = {
        name,
        count: -1,
        elapsedMs: performance.now(),
        status: 'running',
        subOperations: [],
        metadata: metadata
    };
    const parent = this._findRunningOperation();
    if (parent) {
        parent.subOperations.push(operation);
    }
    else {
        this._operations.push(operation);
    }
};
757
/**
 * Mark the named running operation as completed, capturing the current
 * record count and converting its stored start timestamp into an elapsed
 * duration. Warns (without throwing) when no matching operation is running.
 */
this._finishOperation = (name) => {
    const operation = this._findRunningOperation(name);
    if (!operation) {
        const running = this._operations.find(x => x.status === 'running');
        const currentName = running ? running.name : 'none';
        console.warn(`Finished operation "${name}" but no running operation with that name was found (current running: "${currentName}")`);
        return;
    }
    operation.status = 'completed';
    operation.count = this._count;
    operation.elapsedMs = performance.now() - operation.elapsedMs;
};
770
/**
 * Find a running operation in the (possibly nested) operation tree.
 * Two modes:
 * - With `name`: depth-first search for the first running operation with
 *   that exact name.
 * - Without `name`: return the DEEPEST currently-running operation, so that
 *   _startOperation nests new work under the innermost running parent.
 * Returns null when nothing matches.
 */
this._findRunningOperation = (name) => {
    const searchInOperations = (operations) => {
        for (const operation of operations) {
            if (operation.status === 'running' && (name === undefined || operation.name === name)) {
                // If we're looking for a specific name, return it
                if (name !== undefined) {
                    return operation;
                }
                // If we're looking for the deepest running operation (name is undefined),
                // check if this operation has deeper running sub-operations
                const deeperRunning = searchInOperations(operation.subOperations);
                if (deeperRunning) {
                    return deeperRunning;
                }
                // If no deeper running operations, this is the deepest
                return operation;
            }
            // Not a match itself: recurse into children. This also covers a
            // named lookup nested under a differently-named running parent.
            if (operation.subOperations && operation.subOperations.length > 0) {
                const found = searchInOperations(operation.subOperations);
                if (found) {
                    return found;
                }
            }
        }
        return null;
    };
    return searchInOperations(this._operations);
};
798
- this._computeSize = () => fs_1.default.statSync(this._path).size / (1024 * 1024);
799
- const { name, baseProducer, file, batchSize, executionId } = options;
800
- this.name = name;
801
- this._file = file;
802
- this._executionId = executionId;
803
- this._baseProducer = baseProducer;
804
- this._batchSize = (_a = batchSize !== null && batchSize !== void 0 ? batchSize : parseInt(Environment_1.default.get('MAX_ITEMS_IN_MEMORY'))) !== null && _a !== void 0 ? _a : Constants_1.default.defaults.MAX_ITEMS_IN_MEMORY;
805
- this._dimensions = [];
806
- this._firstLine = '';
807
- this._delimiter = (_b = file.delimiter) !== null && _b !== void 0 ? _b : ',';
808
- this._count = 0;
809
- this._iterations = 0;
810
- this._operations = [];
811
- // Initialize record pool for optimization
812
- this._recordPool = new DatasetRecordPool_1.default(this._batchSize);
813
- const datasetName = this.name
814
- .replace(/[^a-zA-Z0-9_-]/g, '_')
815
- .replace(/_{2,}/g, '_')
816
- .replace(/^_+|_+$/g, '')
817
- .toLowerCase();
818
- const execFolder = executionId ? path_1.default.join(datasetName, executionId) : datasetName;
819
- this._path = path_1.default.join('./remora', Constants_1.default.defaults.PRODUCER_TEMP_FOLDER, execFolder, '.dataset');
820
- this._tempPath = path_1.default.join('./remora/', Constants_1.default.defaults.PRODUCER_TEMP_FOLDER, execFolder, '.dataset_tmp');
821
- this.ensureFile(this._path);
822
- }
823
- }
824
- exports.default = Dataset;