@forzalabs/remora 1.0.21 → 1.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/actions/automap.js +26 -42
- package/actions/compile.js +27 -43
- package/actions/create_consumer.js +24 -40
- package/actions/create_producer.js +16 -32
- package/actions/debug.js +18 -34
- package/actions/deploy.js +30 -46
- package/actions/discover.js +13 -29
- package/actions/init.js +29 -45
- package/actions/mock.js +16 -32
- package/actions/run.js +34 -52
- package/actions/sample.js +42 -58
- package/index.js +38 -43
- package/package.json +4 -4
- package/workers/ExecutorWorker.js +18 -32
- package/Constants.js +0 -34
- package/core/Affirm.js +0 -42
- package/core/Algo.js +0 -160
- package/core/dste/DSTE.js +0 -113
- package/core/logger/DebugLogService.js +0 -48
- package/core/logger/DevelopmentLogService.js +0 -70
- package/core/logger/LocalLogService.js +0 -70
- package/core/logger/Logger.js +0 -54
- package/database/DatabaseEngine.js +0 -149
- package/database/DatabaseStructure.js +0 -27
- package/definitions/DatasetDefinitions.js +0 -2
- package/definitions/ExecutorDefinitions.js +0 -2
- package/definitions/ProcessENV.js +0 -2
- package/definitions/agents/DestinationDriver.js +0 -2
- package/definitions/agents/SourceDriver.js +0 -2
- package/definitions/cli.js +0 -2
- package/definitions/database/ApiKeys.js +0 -2
- package/definitions/database/Stored.js +0 -7
- package/definitions/database/UsageStat.js +0 -2
- package/definitions/database/User.js +0 -2
- package/definitions/json_schemas/consumer-schema.json +0 -1226
- package/definitions/json_schemas/producer-schema.json +0 -308
- package/definitions/json_schemas/project-schema.json +0 -100
- package/definitions/json_schemas/source-schema.json +0 -249
- package/definitions/requests/ConsumerRequest.js +0 -2
- package/definitions/requests/Developer.js +0 -2
- package/definitions/requests/Mapping.js +0 -2
- package/definitions/requests/ProducerRequest.js +0 -2
- package/definitions/requests/Request.js +0 -2
- package/definitions/resources/Compiled.js +0 -2
- package/definitions/resources/Consumer.js +0 -2
- package/definitions/resources/Environment.js +0 -2
- package/definitions/resources/Library.js +0 -2
- package/definitions/resources/Producer.js +0 -2
- package/definitions/resources/Project.js +0 -2
- package/definitions/resources/Schema.js +0 -2
- package/definitions/resources/Source.js +0 -2
- package/definitions/temp.js +0 -2
- package/definitions/transform/Transformations.js +0 -2
- package/drivers/DeltaShareDriver.js +0 -186
- package/drivers/DriverFactory.js +0 -72
- package/drivers/DriverHelper.js +0 -248
- package/drivers/HttpApiDriver.js +0 -208
- package/drivers/RedshiftDriver.js +0 -184
- package/drivers/files/LocalDestinationDriver.js +0 -146
- package/drivers/files/LocalSourceDriver.js +0 -405
- package/drivers/s3/S3DestinationDriver.js +0 -197
- package/drivers/s3/S3SourceDriver.js +0 -495
- package/engines/CryptoEngine.js +0 -75
- package/engines/Environment.js +0 -170
- package/engines/ProcessENVManager.js +0 -83
- package/engines/RandomEngine.js +0 -47
- package/engines/SecretManager.js +0 -23
- package/engines/UserManager.js +0 -66
- package/engines/ai/AutoMapperEngine.js +0 -37
- package/engines/ai/DeveloperEngine.js +0 -497
- package/engines/ai/LLM.js +0 -255
- package/engines/consumer/ConsumerManager.js +0 -218
- package/engines/consumer/ConsumerOnFinishManager.js +0 -202
- package/engines/dataset/Dataset.js +0 -824
- package/engines/dataset/DatasetManager.js +0 -211
- package/engines/dataset/DatasetRecord.js +0 -120
- package/engines/dataset/DatasetRecordPool.js +0 -77
- package/engines/execution/RequestExecutor.js +0 -67
- package/engines/parsing/CSVParser.js +0 -60
- package/engines/parsing/LineParser.js +0 -71
- package/engines/parsing/ParseCompression.js +0 -101
- package/engines/parsing/ParseHelper.js +0 -18
- package/engines/parsing/ParseManager.js +0 -54
- package/engines/parsing/XLSParser.js +0 -87
- package/engines/parsing/XMLParser.js +0 -115
- package/engines/producer/ProducerEngine.js +0 -127
- package/engines/producer/ProducerManager.js +0 -43
- package/engines/scheduler/CronScheduler.js +0 -222
- package/engines/scheduler/QueueManager.js +0 -314
- package/engines/schema/SchemaValidator.js +0 -67
- package/engines/transform/JoinEngine.js +0 -232
- package/engines/transform/TransformationEngine.js +0 -277
- package/engines/transform/TypeCaster.js +0 -59
- package/engines/usage/DataframeManager.js +0 -55
- package/engines/usage/UsageDataManager.js +0 -151
- package/engines/usage/UsageManager.js +0 -65
- package/engines/validation/Validator.js +0 -216
- package/executors/ConsumerExecutor.js +0 -280
- package/executors/Executor.js +0 -177
- package/executors/ExecutorOrchestrator.js +0 -331
- package/executors/ExecutorPerformance.js +0 -17
- package/executors/ExecutorProgress.js +0 -54
- package/executors/ExecutorScope.js +0 -52
- package/executors/OutputExecutor.js +0 -118
- package/executors/ProducerExecutor.js +0 -108
- package/helper/Helper.js +0 -149
- package/helper/Logger.js +0 -84
- package/helper/Runtime.js +0 -20
- package/helper/Settings.js +0 -13
- package/licencing/LicenceManager.js +0 -64
- package/settings.js +0 -12
package/drivers/DeltaShareDriver.js
DELETED
|
@@ -1,186 +0,0 @@
|
|
|
1
|
-
"use strict";
|
|
2
|
-
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
|
|
3
|
-
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
|
|
4
|
-
return new (P || (P = Promise))(function (resolve, reject) {
|
|
5
|
-
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
|
|
6
|
-
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
|
|
7
|
-
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
|
|
8
|
-
step((generator = generator.apply(thisArg, _arguments || [])).next());
|
|
9
|
-
});
|
|
10
|
-
};
|
|
11
|
-
var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
12
|
-
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
13
|
-
};
|
|
14
|
-
Object.defineProperty(exports, "__esModule", { value: true });
|
|
15
|
-
const Affirm_1 = __importDefault(require("../core/Affirm"));
|
|
16
|
-
const SecretManager_1 = __importDefault(require("../engines/SecretManager"));
|
|
17
|
-
const DriverHelper_1 = __importDefault(require("./DriverHelper"));
|
|
18
|
-
/**
|
|
19
|
-
* Delta Share (Databricks Delta Sharing) Source Driver
|
|
20
|
-
*/
|
|
21
|
-
class DeltaShareSourceDriver {
|
|
22
|
-
constructor() {
|
|
23
|
-
this._query = '{prefix}/shares/{share}/schemas/{schema}/tables/{table}/query';
|
|
24
|
-
this._version = '{prefix}/shares/{share}/schemas/{schema}/tables/{table}/version';
|
|
25
|
-
this._tablesInShare = '{prefix}/shares/{share}/all-tables';
|
|
26
|
-
this._tablesInSchema = '{prefix}/shares/{share}/schemas/{schema}/tables';
|
|
27
|
-
this._schemasInShare = '{prefix}/shares/{share}/schemas';
|
|
28
|
-
this._shares = '{prefix}/shares';
|
|
29
|
-
this.init = (source) => __awaiter(this, void 0, void 0, function* () {
|
|
30
|
-
(0, Affirm_1.default)(source, 'Invalid source');
|
|
31
|
-
// Expected authentication shape for delta-share
|
|
32
|
-
const { authentication } = source;
|
|
33
|
-
(0, Affirm_1.default)(authentication, 'Invalid authentication for delta-share source');
|
|
34
|
-
this._shareUrl = authentication.host;
|
|
35
|
-
this._bearerToken = SecretManager_1.default.replaceSecret(authentication.bearerToken || authentication.sessionToken || authentication.password);
|
|
36
|
-
this._share = authentication.share;
|
|
37
|
-
this._schema = authentication.schema;
|
|
38
|
-
this._table = authentication.table;
|
|
39
|
-
(0, Affirm_1.default)(this._shareUrl, 'Missing delta-share host (share server URL) in source.authentication.host');
|
|
40
|
-
(0, Affirm_1.default)(this._bearerToken, 'Missing delta-share bearer token in source.authentication.sessionToken (or password)');
|
|
41
|
-
(0, Affirm_1.default)(this._share, 'Missing delta-share "share" (use authentication.share or bucket)');
|
|
42
|
-
(0, Affirm_1.default)(this._schema, 'Missing delta-share schema in source.authentication.schema');
|
|
43
|
-
(0, Affirm_1.default)(this._table, 'Missing delta-share table in source.authentication.table (or database)');
|
|
44
|
-
this._source = source;
|
|
45
|
-
return this;
|
|
46
|
-
});
|
|
47
|
-
// Delta Sharing is not a SQL engine; expose explicit error
|
|
48
|
-
this.execute = (__sql) => __awaiter(this, void 0, void 0, function* () {
|
|
49
|
-
void __sql;
|
|
50
|
-
throw new Error('DeltaShareSourceDriver.execute is not supported: Delta Sharing is not a SQL engine');
|
|
51
|
-
});
|
|
52
|
-
this.query = (__sql, __values) => __awaiter(this, void 0, void 0, function* () {
|
|
53
|
-
void __sql;
|
|
54
|
-
void __values;
|
|
55
|
-
throw new Error('DeltaShareSourceDriver.query is not supported: Delta Sharing is not a SQL engine');
|
|
56
|
-
});
|
|
57
|
-
this.readAll = (request) => __awaiter(this, void 0, void 0, function* () {
|
|
58
|
-
var _a, _b, _c;
|
|
59
|
-
(0, Affirm_1.default)(request, `Invalid download request`);
|
|
60
|
-
(0, Affirm_1.default)(!request.fileKey.includes('%'), `On a delta-share the file key can not include "%"`);
|
|
61
|
-
const deltaFiles = yield this._getAllFilesInTables(this._table);
|
|
62
|
-
const hyparquet = yield import('hyparquet');
|
|
63
|
-
const { asyncBufferFromUrl, parquetReadObjects } = hyparquet;
|
|
64
|
-
const lines = [];
|
|
65
|
-
for (const deltaFile of deltaFiles) {
|
|
66
|
-
const byteLength = (_b = (_a = deltaFile.file.deltaSingleAction.add) === null || _a === void 0 ? void 0 : _a.size) !== null && _b !== void 0 ? _b : (_c = deltaFile.file.deltaSingleAction.remove) === null || _c === void 0 ? void 0 : _c.size;
|
|
67
|
-
const file = yield asyncBufferFromUrl({ url: deltaFile.file.url, byteLength });
|
|
68
|
-
const parquetRecords = yield parquetReadObjects({ file: file });
|
|
69
|
-
lines.push(...parquetRecords.map(x => JSON.stringify(x)));
|
|
70
|
-
}
|
|
71
|
-
return lines;
|
|
72
|
-
});
|
|
73
|
-
this.readLinesInRange = (request) => __awaiter(this, void 0, void 0, function* () {
|
|
74
|
-
var _a, _b, _c;
|
|
75
|
-
(0, Affirm_1.default)(request, 'Invalid read request');
|
|
76
|
-
(0, Affirm_1.default)(request.options, 'Invalid read options');
|
|
77
|
-
(0, Affirm_1.default)(request.options.lineFrom !== undefined && request.options.lineTo !== undefined, 'Missing read range');
|
|
78
|
-
const deltaFiles = yield this._getAllFilesInTables(this._table);
|
|
79
|
-
const { options: { lineFrom, lineTo } } = request;
|
|
80
|
-
const hyparquet = yield import('hyparquet');
|
|
81
|
-
const { asyncBufferFromUrl, parquetReadObjects } = hyparquet;
|
|
82
|
-
const lines = [];
|
|
83
|
-
let index = 0;
|
|
84
|
-
for (const deltaFile of deltaFiles) {
|
|
85
|
-
const byteLength = (_b = (_a = deltaFile.file.deltaSingleAction.add) === null || _a === void 0 ? void 0 : _a.size) !== null && _b !== void 0 ? _b : (_c = deltaFile.file.deltaSingleAction.remove) === null || _c === void 0 ? void 0 : _c.size;
|
|
86
|
-
const file = yield asyncBufferFromUrl({ url: deltaFile.file.url, byteLength });
|
|
87
|
-
const parquetRecords = yield parquetReadObjects({ file: file });
|
|
88
|
-
for (const record of parquetRecords) {
|
|
89
|
-
if (index >= lineFrom && index < lineTo)
|
|
90
|
-
lines.push(JSON.stringify(record));
|
|
91
|
-
index++;
|
|
92
|
-
if (index >= lineTo)
|
|
93
|
-
break;
|
|
94
|
-
}
|
|
95
|
-
}
|
|
96
|
-
return lines;
|
|
97
|
-
});
|
|
98
|
-
this.download = (dataset) => __awaiter(this, void 0, void 0, function* () {
|
|
99
|
-
var _a, _b, _c;
|
|
100
|
-
(0, Affirm_1.default)(dataset, 'Invalid dataset');
|
|
101
|
-
const deltaFiles = yield this._getAllFilesInTables(this._table);
|
|
102
|
-
const hyparquet = yield import('hyparquet');
|
|
103
|
-
const { asyncBufferFromUrl, parquetReadObjects } = hyparquet;
|
|
104
|
-
// For each file, download it with the hyparquet package, read lines, then save locally to create the dataset
|
|
105
|
-
let index = 0;
|
|
106
|
-
let totalLineCount = 0;
|
|
107
|
-
for (const deltaFile of deltaFiles) {
|
|
108
|
-
const byteLength = (_b = (_a = deltaFile.file.deltaSingleAction.add) === null || _a === void 0 ? void 0 : _a.size) !== null && _b !== void 0 ? _b : (_c = deltaFile.file.deltaSingleAction.remove) === null || _c === void 0 ? void 0 : _c.size;
|
|
109
|
-
const file = yield asyncBufferFromUrl({ url: deltaFile.file.url, byteLength });
|
|
110
|
-
const parquetRecords = yield parquetReadObjects({ file: file });
|
|
111
|
-
if (index === 0 && parquetRecords.length > 0) {
|
|
112
|
-
// I intentionally keep the first record as a JSON, so it can be used to extract the dimensions
|
|
113
|
-
dataset.setFirstLine(JSON.stringify(parquetRecords[0]));
|
|
114
|
-
}
|
|
115
|
-
totalLineCount += yield DriverHelper_1.default.appendObjectsToUnifiedFile({
|
|
116
|
-
append: index > 0,
|
|
117
|
-
delimiter: dataset.getDelimiter(),
|
|
118
|
-
destinationPath: dataset.getPath(),
|
|
119
|
-
objects: parquetRecords
|
|
120
|
-
});
|
|
121
|
-
index++;
|
|
122
|
-
}
|
|
123
|
-
dataset.setCount(totalLineCount);
|
|
124
|
-
return dataset;
|
|
125
|
-
});
|
|
126
|
-
this.exist = (__producer) => __awaiter(this, void 0, void 0, function* () {
|
|
127
|
-
void __producer;
|
|
128
|
-
try {
|
|
129
|
-
yield this._getAllFilesInTables(this._table);
|
|
130
|
-
// If it doesn't exist, then it fails in the above function
|
|
131
|
-
return true;
|
|
132
|
-
}
|
|
133
|
-
catch (_a) {
|
|
134
|
-
return false;
|
|
135
|
-
}
|
|
136
|
-
});
|
|
137
|
-
this._getVersion = (table) => __awaiter(this, void 0, void 0, function* () {
|
|
138
|
-
const url = this._version
|
|
139
|
-
.replace('{prefix}', this._shareUrl)
|
|
140
|
-
.replace('{share}', this._share)
|
|
141
|
-
.replace('{schema}', this._schema)
|
|
142
|
-
.replace('{table}', table);
|
|
143
|
-
const res = yield fetch(url, {
|
|
144
|
-
method: 'GET',
|
|
145
|
-
headers: {
|
|
146
|
-
Authorization: `Bearer ${this._bearerToken}`
|
|
147
|
-
}
|
|
148
|
-
});
|
|
149
|
-
(0, Affirm_1.default)(res.ok, `Error fetching version from the delta share: ${res.status} ${res.statusText} (${yield res.text()})`);
|
|
150
|
-
const version = res.headers['delta-table-version'];
|
|
151
|
-
return version;
|
|
152
|
-
});
|
|
153
|
-
this._getAllFilesInTables = (table) => __awaiter(this, void 0, void 0, function* () {
|
|
154
|
-
const url = this._query
|
|
155
|
-
.replace('{prefix}', this._shareUrl)
|
|
156
|
-
.replace('{share}', this._share)
|
|
157
|
-
.replace('{schema}', this._schema)
|
|
158
|
-
.replace('{table}', table);
|
|
159
|
-
const body = {
|
|
160
|
-
version: yield this._getVersion(table)
|
|
161
|
-
};
|
|
162
|
-
const res = yield fetch(url, {
|
|
163
|
-
method: 'POST',
|
|
164
|
-
headers: {
|
|
165
|
-
'Authorization': `Bearer ${this._bearerToken}`,
|
|
166
|
-
'delta-sharing-capabilities': 'responseformat=delta;readerfeatures=deletionvectors'
|
|
167
|
-
},
|
|
168
|
-
body: JSON.stringify(body)
|
|
169
|
-
});
|
|
170
|
-
const rawText = yield res.text();
|
|
171
|
-
(0, Affirm_1.default)(res.ok, `Error fetching data from the delta share: ${res.status} ${res.statusText}; Message: ${rawText}`);
|
|
172
|
-
// By the protocol: the first is the profile, the second is the metadata, I'm interested from the third onwards
|
|
173
|
-
const deltaLines = rawText
|
|
174
|
-
.split('\n')
|
|
175
|
-
.filter(x => x.length > 0)
|
|
176
|
-
.slice(2)
|
|
177
|
-
.map(x => JSON.parse(x));
|
|
178
|
-
return deltaLines;
|
|
179
|
-
});
|
|
180
|
-
this.ready = (request) => __awaiter(this, void 0, void 0, function* () {
|
|
181
|
-
void request;
|
|
182
|
-
throw new Error('DeltaShareSourceDriver.ready is not supported: Delta Sharing does not support readiness checks');
|
|
183
|
-
});
|
|
184
|
-
}
|
|
185
|
-
}
|
|
186
|
-
exports.default = DeltaShareSourceDriver;
|
package/drivers/DriverFactory.js
DELETED
|
@@ -1,72 +0,0 @@
|
|
|
1
|
-
"use strict";
|
|
2
|
-
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
|
|
3
|
-
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
|
|
4
|
-
return new (P || (P = Promise))(function (resolve, reject) {
|
|
5
|
-
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
|
|
6
|
-
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
|
|
7
|
-
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
|
|
8
|
-
step((generator = generator.apply(thisArg, _arguments || [])).next());
|
|
9
|
-
});
|
|
10
|
-
};
|
|
11
|
-
var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
12
|
-
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
13
|
-
};
|
|
14
|
-
Object.defineProperty(exports, "__esModule", { value: true });
|
|
15
|
-
const RedshiftDriver_1 = __importDefault(require("./RedshiftDriver"));
|
|
16
|
-
const DeltaShareDriver_1 = __importDefault(require("./DeltaShareDriver"));
|
|
17
|
-
const HttpApiDriver_1 = __importDefault(require("./HttpApiDriver"));
|
|
18
|
-
const LocalSourceDriver_1 = __importDefault(require("./files/LocalSourceDriver"));
|
|
19
|
-
const LocalDestinationDriver_1 = __importDefault(require("./files/LocalDestinationDriver"));
|
|
20
|
-
const S3SourceDriver_1 = __importDefault(require("./s3/S3SourceDriver"));
|
|
21
|
-
const S3DestinationDriver_1 = __importDefault(require("./s3/S3DestinationDriver"));
|
|
22
|
-
class DriverFactoryClass {
|
|
23
|
-
constructor() {
|
|
24
|
-
this.instantiateSource = (source) => __awaiter(this, void 0, void 0, function* () {
|
|
25
|
-
switch (source.engine) {
|
|
26
|
-
case 'aws-redshift': {
|
|
27
|
-
const driver = new RedshiftDriver_1.default();
|
|
28
|
-
yield driver.init(source);
|
|
29
|
-
return driver;
|
|
30
|
-
}
|
|
31
|
-
case 'aws-s3': {
|
|
32
|
-
const driver = new S3SourceDriver_1.default();
|
|
33
|
-
yield driver.init(source);
|
|
34
|
-
return driver;
|
|
35
|
-
}
|
|
36
|
-
case 'delta-share': {
|
|
37
|
-
const driver = new DeltaShareDriver_1.default();
|
|
38
|
-
yield driver.init(source);
|
|
39
|
-
return driver;
|
|
40
|
-
}
|
|
41
|
-
case 'local': {
|
|
42
|
-
const driver = new LocalSourceDriver_1.default();
|
|
43
|
-
yield driver.init(source);
|
|
44
|
-
return driver;
|
|
45
|
-
}
|
|
46
|
-
case 'http-api': {
|
|
47
|
-
const driver = new HttpApiDriver_1.default();
|
|
48
|
-
yield driver.init(source);
|
|
49
|
-
return driver;
|
|
50
|
-
}
|
|
51
|
-
default: throw new Error(`Invalid driver type "${source.engine}". This driver is not implemented yet`);
|
|
52
|
-
}
|
|
53
|
-
});
|
|
54
|
-
this.instantiateDestination = (source) => __awaiter(this, void 0, void 0, function* () {
|
|
55
|
-
switch (source.engine) {
|
|
56
|
-
case 'aws-s3': {
|
|
57
|
-
const driver = new S3DestinationDriver_1.default();
|
|
58
|
-
yield driver.init(source);
|
|
59
|
-
return driver;
|
|
60
|
-
}
|
|
61
|
-
case 'local': {
|
|
62
|
-
const driver = new LocalDestinationDriver_1.default();
|
|
63
|
-
yield driver.init(source);
|
|
64
|
-
return driver;
|
|
65
|
-
}
|
|
66
|
-
default: throw new Error(`Invalid driver type "${source.engine}". This driver is not implemented yet`);
|
|
67
|
-
}
|
|
68
|
-
});
|
|
69
|
-
}
|
|
70
|
-
}
|
|
71
|
-
const DriverFactory = new DriverFactoryClass();
|
|
72
|
-
exports.default = DriverFactory;
|
package/drivers/DriverHelper.js
DELETED
|
@@ -1,248 +0,0 @@
|
|
|
1
|
-
"use strict";
|
|
2
|
-
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
|
|
3
|
-
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
|
|
4
|
-
return new (P || (P = Promise))(function (resolve, reject) {
|
|
5
|
-
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
|
|
6
|
-
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
|
|
7
|
-
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
|
|
8
|
-
step((generator = generator.apply(thisArg, _arguments || [])).next());
|
|
9
|
-
});
|
|
10
|
-
};
|
|
11
|
-
var __asyncValues = (this && this.__asyncValues) || function (o) {
|
|
12
|
-
if (!Symbol.asyncIterator) throw new TypeError("Symbol.asyncIterator is not defined.");
|
|
13
|
-
var m = o[Symbol.asyncIterator], i;
|
|
14
|
-
return m ? m.call(o) : (o = typeof __values === "function" ? __values(o) : o[Symbol.iterator](), i = {}, verb("next"), verb("throw"), verb("return"), i[Symbol.asyncIterator] = function () { return this; }, i);
|
|
15
|
-
function verb(n) { i[n] = o[n] && function (v) { return new Promise(function (resolve, reject) { v = o[n](v), settle(resolve, reject, v.done, v.value); }); }; }
|
|
16
|
-
function settle(resolve, reject, d, v) { Promise.resolve(v).then(function(v) { resolve({ value: v, done: d }); }, reject); }
|
|
17
|
-
};
|
|
18
|
-
var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
19
|
-
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
20
|
-
};
|
|
21
|
-
Object.defineProperty(exports, "__esModule", { value: true });
|
|
22
|
-
const stream_1 = require("stream");
|
|
23
|
-
const readline_1 = require("readline");
|
|
24
|
-
const promises_1 = require("stream/promises");
|
|
25
|
-
const fs_1 = require("fs");
|
|
26
|
-
const path_1 = __importDefault(require("path"));
|
|
27
|
-
const Logger_1 = __importDefault(require("../helper/Logger"));
|
|
28
|
-
const Affirm_1 = __importDefault(require("../core/Affirm"));
|
|
29
|
-
const XLSParser_1 = __importDefault(require("../engines/parsing/XLSParser"));
|
|
30
|
-
const XMLParser_1 = __importDefault(require("../engines/parsing/XMLParser"));
|
|
31
|
-
const Constants_1 = __importDefault(require("../Constants"));
|
|
32
|
-
const DriverHelper = {
|
|
33
|
-
appendToUnifiedFile: (options) => __awaiter(void 0, void 0, void 0, function* () {
|
|
34
|
-
(0, Affirm_1.default)(options, 'Invalid options');
|
|
35
|
-
const { append, destinationPath, fileKey, headerLine, stream, fileType, hasHeaderRow, delimiter, sourceFilename } = options;
|
|
36
|
-
(0, Affirm_1.default)(headerLine, `Invalid header line`);
|
|
37
|
-
const keys = (fileType === 'JSON' || fileType === 'JSONL')
|
|
38
|
-
? Object.keys(JSON.parse(headerLine))
|
|
39
|
-
: [];
|
|
40
|
-
const shouldValidateHeader = fileType === 'CSV' || (fileType === 'TXT' && hasHeaderRow === true);
|
|
41
|
-
// When sourceFilename is set, the headerLine includes $source_filename at the end.
|
|
42
|
-
// For validation, we need to compare against the original header without this suffix.
|
|
43
|
-
const originalHeaderLine = sourceFilename
|
|
44
|
-
? headerLine.slice(0, headerLine.lastIndexOf(delimiter))
|
|
45
|
-
: headerLine;
|
|
46
|
-
let isFirstLine = true;
|
|
47
|
-
let hasValidatedHeader = shouldValidateHeader ? false : true;
|
|
48
|
-
let leftoverData = '';
|
|
49
|
-
let globalIndex = 0;
|
|
50
|
-
let lineCount = 0;
|
|
51
|
-
const headerValidationTransform = new stream_1.Transform({
|
|
52
|
-
transform(chunk, encoding, callback) {
|
|
53
|
-
const chunkStr = leftoverData + chunk.toString();
|
|
54
|
-
const lines = chunkStr.split('\n');
|
|
55
|
-
// Keep the last line as leftover if it doesn't end with newline
|
|
56
|
-
leftoverData = lines.pop() || '';
|
|
57
|
-
const filteredLines = [];
|
|
58
|
-
for (let i = 0; i < lines.length; i++) {
|
|
59
|
-
const line = lines[i];
|
|
60
|
-
// Header validation for first line
|
|
61
|
-
if (!hasValidatedHeader && isFirstLine && i === 0) {
|
|
62
|
-
if (shouldValidateHeader && originalHeaderLine && originalHeaderLine.trim() !== '' && line.trim() !== originalHeaderLine.trim()) {
|
|
63
|
-
const msg = `Error creating unified dataset: file "${fileKey}" has a different header line than the other files in this dataset\n\t-${fileKey}: ${line}\n\t-main: ${originalHeaderLine}`;
|
|
64
|
-
Logger_1.default.log(msg);
|
|
65
|
-
return callback(new Error(msg));
|
|
66
|
-
}
|
|
67
|
-
hasValidatedHeader = true;
|
|
68
|
-
isFirstLine = false;
|
|
69
|
-
}
|
|
70
|
-
// Apply your filtering logic here
|
|
71
|
-
if (shouldIncludeLine(line, globalIndex)) {
|
|
72
|
-
filteredLines.push(processLine(line));
|
|
73
|
-
}
|
|
74
|
-
globalIndex++;
|
|
75
|
-
}
|
|
76
|
-
// Output filtered lines
|
|
77
|
-
if (filteredLines.length > 0) {
|
|
78
|
-
const output = filteredLines.join('\n') + '\n';
|
|
79
|
-
callback(null, Buffer.from(output));
|
|
80
|
-
}
|
|
81
|
-
else {
|
|
82
|
-
callback(null, null); // No data to output
|
|
83
|
-
}
|
|
84
|
-
},
|
|
85
|
-
flush(callback) {
|
|
86
|
-
// Process any remaining data
|
|
87
|
-
if (leftoverData.trim()) {
|
|
88
|
-
if (shouldIncludeLine(leftoverData, -1)) {
|
|
89
|
-
callback(null, Buffer.from(processLine(leftoverData) + '\n'));
|
|
90
|
-
}
|
|
91
|
-
else {
|
|
92
|
-
callback(null, null);
|
|
93
|
-
}
|
|
94
|
-
}
|
|
95
|
-
else {
|
|
96
|
-
callback(null, null);
|
|
97
|
-
}
|
|
98
|
-
globalIndex++;
|
|
99
|
-
}
|
|
100
|
-
});
|
|
101
|
-
// Helper function to determine if a line should be included
|
|
102
|
-
const shouldIncludeLine = (line, lineIndex) => {
|
|
103
|
-
// For flat files (csv, txt) ignore the first line of the header (I already saved that line)
|
|
104
|
-
if (lineIndex === 0 && shouldValidateHeader)
|
|
105
|
-
return false;
|
|
106
|
-
// Skip empty lines
|
|
107
|
-
if (line.trim() === '')
|
|
108
|
-
return false;
|
|
109
|
-
return true;
|
|
110
|
-
};
|
|
111
|
-
const processLine = (line) => {
|
|
112
|
-
lineCount++;
|
|
113
|
-
let processedLine;
|
|
114
|
-
switch (fileType) {
|
|
115
|
-
case 'JSON':
|
|
116
|
-
case 'JSONL': {
|
|
117
|
-
try {
|
|
118
|
-
const parsed = JSON.parse(line);
|
|
119
|
-
processedLine = keys.map(k => parsed[k]).join(delimiter);
|
|
120
|
-
}
|
|
121
|
-
catch (error) {
|
|
122
|
-
Logger_1.default.log(`Failed parsing in JSON line -> file: ${fileKey}; index: ${globalIndex}; line: ${line}; err: ${error === null || error === void 0 ? void 0 : error.name}`);
|
|
123
|
-
throw error;
|
|
124
|
-
}
|
|
125
|
-
break;
|
|
126
|
-
}
|
|
127
|
-
default:
|
|
128
|
-
processedLine = line;
|
|
129
|
-
}
|
|
130
|
-
// If sourceFilename is provided, append it to each line
|
|
131
|
-
if (sourceFilename) {
|
|
132
|
-
processedLine = processedLine + delimiter + sourceFilename;
|
|
133
|
-
}
|
|
134
|
-
return processedLine;
|
|
135
|
-
};
|
|
136
|
-
const writeOptions = append ? { flags: 'a' } : {};
|
|
137
|
-
const writeStream = (0, fs_1.createWriteStream)(destinationPath, writeOptions);
|
|
138
|
-
yield (0, promises_1.pipeline)(stream, headerValidationTransform, writeStream);
|
|
139
|
-
return lineCount;
|
|
140
|
-
}),
|
|
141
|
-
appendObjectsToUnifiedFile: (options) => __awaiter(void 0, void 0, void 0, function* () {
|
|
142
|
-
(0, Affirm_1.default)(options, 'Invalid options');
|
|
143
|
-
const { append, destinationPath, objects, delimiter } = options;
|
|
144
|
-
const writeOptions = append ? { flags: 'a' } : {};
|
|
145
|
-
const writeStream = (0, fs_1.createWriteStream)(destinationPath, writeOptions);
|
|
146
|
-
let lineCount = 0;
|
|
147
|
-
const keys = Object.keys(objects[0]);
|
|
148
|
-
for (const obj of objects) {
|
|
149
|
-
const serialized = keys.map(k => obj[k]).join(delimiter) + '\n';
|
|
150
|
-
writeStream.write(serialized);
|
|
151
|
-
lineCount++;
|
|
152
|
-
}
|
|
153
|
-
writeStream.close();
|
|
154
|
-
return lineCount;
|
|
155
|
-
}),
|
|
156
|
-
quickReadFile: (filePath, lineCount) => __awaiter(void 0, void 0, void 0, function* () {
|
|
157
|
-
var _a, e_1, _b, _c;
|
|
158
|
-
const fileStream = (0, fs_1.createReadStream)(filePath);
|
|
159
|
-
const rl = (0, readline_1.createInterface)({ input: fileStream, crlfDelay: Infinity });
|
|
160
|
-
const lines = [];
|
|
161
|
-
let counter = 0;
|
|
162
|
-
try {
|
|
163
|
-
for (var _d = true, rl_1 = __asyncValues(rl), rl_1_1; rl_1_1 = yield rl_1.next(), _a = rl_1_1.done, !_a; _d = true) {
|
|
164
|
-
_c = rl_1_1.value;
|
|
165
|
-
_d = false;
|
|
166
|
-
const line = _c;
|
|
167
|
-
lines.push(line);
|
|
168
|
-
counter++;
|
|
169
|
-
if (counter >= lineCount) {
|
|
170
|
-
break;
|
|
171
|
-
}
|
|
172
|
-
}
|
|
173
|
-
}
|
|
174
|
-
catch (e_1_1) { e_1 = { error: e_1_1 }; }
|
|
175
|
-
finally {
|
|
176
|
-
try {
|
|
177
|
-
if (!_d && !_a && (_b = rl_1.return)) yield _b.call(rl_1);
|
|
178
|
-
}
|
|
179
|
-
finally { if (e_1) throw e_1.error; }
|
|
180
|
-
}
|
|
181
|
-
rl.close();
|
|
182
|
-
fileStream.close();
|
|
183
|
-
return lines;
|
|
184
|
-
}),
|
|
185
|
-
setHeaderFromFile: (fileKey, file, filePath, dataset) => __awaiter(void 0, void 0, void 0, function* () {
|
|
186
|
-
(0, Affirm_1.default)(filePath, 'Invalid path');
|
|
187
|
-
(0, Affirm_1.default)(fileKey, 'Invalid fileKey');
|
|
188
|
-
(0, Affirm_1.default)(file, 'Invalid File');
|
|
189
|
-
let headerLine;
|
|
190
|
-
switch (file.fileType) {
|
|
191
|
-
case 'XLS':
|
|
192
|
-
case 'XLSX':
|
|
193
|
-
headerLine = yield XLSParser_1.default.getHeaderXls(path_1.default.join(filePath, fileKey), file.sheetName);
|
|
194
|
-
if (file.includeSourceFilename) {
|
|
195
|
-
headerLine = headerLine + dataset.getDelimiter() + Constants_1.default.SOURCE_FILENAME_COLUMN;
|
|
196
|
-
}
|
|
197
|
-
dataset.setFirstLine(headerLine);
|
|
198
|
-
break;
|
|
199
|
-
case 'XML':
|
|
200
|
-
// using a differnt logic for encoded type xls and xlsx
|
|
201
|
-
headerLine = (yield XMLParser_1.default.readXmlLines(path_1.default.join(filePath, fileKey)))[0];
|
|
202
|
-
dataset.setFirstLine(headerLine);
|
|
203
|
-
break;
|
|
204
|
-
case 'CSV':
|
|
205
|
-
case 'JSON':
|
|
206
|
-
case 'JSONL':
|
|
207
|
-
case 'TXT':
|
|
208
|
-
// Get header line from the first file
|
|
209
|
-
headerLine = (yield DriverHelper.quickReadFile(path_1.default.join(filePath, fileKey), 1))[0];
|
|
210
|
-
// If including source filename, append a placeholder column name to the header
|
|
211
|
-
if (file.includeSourceFilename) {
|
|
212
|
-
headerLine = headerLine + dataset.getDelimiter() + Constants_1.default.SOURCE_FILENAME_COLUMN;
|
|
213
|
-
}
|
|
214
|
-
dataset.setFirstLine(headerLine);
|
|
215
|
-
break;
|
|
216
|
-
default:
|
|
217
|
-
throw new Error(`the fileType "${file.fileType}" is not implemented yet`);
|
|
218
|
-
}
|
|
219
|
-
}),
|
|
220
|
-
quickReadStream: (stream, lineCount) => __awaiter(void 0, void 0, void 0, function* () {
|
|
221
|
-
var _a, e_2, _b, _c;
|
|
222
|
-
const rl = (0, readline_1.createInterface)({ input: stream, crlfDelay: Infinity });
|
|
223
|
-
const lines = [];
|
|
224
|
-
let counter = 0;
|
|
225
|
-
try {
|
|
226
|
-
for (var _d = true, rl_2 = __asyncValues(rl), rl_2_1; rl_2_1 = yield rl_2.next(), _a = rl_2_1.done, !_a; _d = true) {
|
|
227
|
-
_c = rl_2_1.value;
|
|
228
|
-
_d = false;
|
|
229
|
-
const line = _c;
|
|
230
|
-
lines.push(line);
|
|
231
|
-
counter++;
|
|
232
|
-
if (counter >= lineCount) {
|
|
233
|
-
break;
|
|
234
|
-
}
|
|
235
|
-
}
|
|
236
|
-
}
|
|
237
|
-
catch (e_2_1) { e_2 = { error: e_2_1 }; }
|
|
238
|
-
finally {
|
|
239
|
-
try {
|
|
240
|
-
if (!_d && !_a && (_b = rl_2.return)) yield _b.call(rl_2);
|
|
241
|
-
}
|
|
242
|
-
finally { if (e_2) throw e_2.error; }
|
|
243
|
-
}
|
|
244
|
-
rl.close();
|
|
245
|
-
return lines;
|
|
246
|
-
})
|
|
247
|
-
};
|
|
248
|
-
exports.default = DriverHelper;
|