@forzalabs/remora 0.1.9 → 0.2.1
- package/Constants.js +10 -3
- package/actions/deploy.js +1 -1
- package/database/DatabaseEngine.js +18 -3
- package/definitions/DatasetDefinitions.js +2 -0
- package/definitions/json_schemas/producer-schema.json +39 -1
- package/definitions/json_schemas/source-schema.json +76 -3
- package/drivers/DriverFactory.js +6 -0
- package/drivers/DriverHelper.js +18 -6
- package/drivers/HttpApiDriver.js +204 -0
- package/drivers/LocalDriver.js +21 -7
- package/drivers/S3Driver.js +24 -8
- package/engines/dataset/DatasetManager.js +68 -25
- package/engines/dataset/DatasetRecord.js +2 -1
- package/engines/execution/ExecutionPlanner.js +2 -1
- package/engines/parsing/ParseManager.js +19 -3
- package/engines/producer/ProducerEngine.js +13 -4
- package/engines/validation/Validator.js +17 -0
- package/package.json +2 -1
package/Constants.js
CHANGED

@@ -1,9 +1,16 @@
 "use strict";
 Object.defineProperty(exports, "__esModule", { value: true });
 const CONSTANTS = {
-    cliVersion: '0.
-
-
+    cliVersion: '0.2.0',
+    backendVersion: 1,
+    backendPort: 5088,
+    workerVersion: 2,
+    workerPort: 5069,
+    /**
+     * Column name for the dynamically injected source filename dimension.
+     * Prefixed with $ to indicate it's a system-generated dynamic value.
+     */
+    SOURCE_FILENAME_COLUMN: '$source_filename',
     defaults: {
         PRODUCER_TEMP_FOLDER: '.temp',
         SQL_MAX_QUERY_ROWS: 10000,
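The deploy action below consumes these constants directly; a minimal sketch of the wiring (the endpoint path comes from that diff, the rest is illustrative):

    // Illustrative sketch: how actions/deploy.js uses the new constants.
    const Constants = require('./Constants').default;
    const host = process.env.REMORA_WORKER_HOST;
    // workerVersion = 2, so this resolves to `${host}/cli/v2/uploaddeployment`
    const workerAPI = `${host}/cli/v${Constants.workerVersion}/uploaddeployment`;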
package/actions/deploy.js
CHANGED

@@ -51,7 +51,7 @@ const deploy = (options) => __awaiter(void 0, void 0, void 0, function* () {
     // Read the zip file as a buffer
     const zipBuffer = fs_1.default.readFileSync(tempZipPath);
     const host = process.env.REMORA_WORKER_HOST;
-    const version = Constants_1.default.
+    const version = Constants_1.default.workerVersion;
     const workerAPI = `${host}/cli/v${version}/uploaddeployment`;
     const formData = new FormData();
     const blob = new Blob([zipBuffer], { type: 'application/zip' });
package/database/DatabaseEngine.js
CHANGED

@@ -21,7 +21,8 @@ class DatabaseEngineClass {
     this.db = () => this._db;
     this.connect = () => __awaiter(this, void 0, void 0, function* () {
         var _a;
-        // WARNING: this was changed during the deployment to ECS...
+        // WARNING: this was changed during the deployment to ECS...
+        // I've reverted it, but maybe it needs to be changed or looked into...
         this._uri = ((_a = process.env.MONGO_URI) !== null && _a !== void 0 ? _a : Helper_1.default.isDev())
             ? 'mongodb://mongo:27017/remora'
             : 'mongodb://localhost:27017/remora';

@@ -29,7 +30,7 @@ class DatabaseEngineClass {
         const errors = [];
         for (let i = 0; i < this.MAX_TRY_CONNECTION; i++) {
             try {
-                console.log(`Attempting to connect to mongo: "${this._uri}"`);
+                console.log(`Attempting to connect to mongo: "${this._uri}" (${i})`);
                 yield this._client.connect();
                 this._db = this._client.db(Settings_1.default.db.name);
                 this._connected = true;

@@ -37,7 +38,7 @@ class DatabaseEngineClass {
                 break;
             }
             catch (error) {
-                errors.push((i + 1) + ': connection to MongoDB throws this error:'
+                errors.push((i + 1) + ': connection to MongoDB throws this error: ' + error);
             }
         }
         if (!this._connected)

@@ -54,6 +55,7 @@ class DatabaseEngineClass {
     });
     this.query = (collectionName, filter, options) => __awaiter(this, void 0, void 0, function* () {
         try {
+            yield this._checkConnection();
             const collection = this._db.collection(collectionName);
             const result = yield collection.find(filter, options).toArray();
             return result;

@@ -65,6 +67,7 @@ class DatabaseEngineClass {
     });
     this.aggregate = (collectionName, aggregation) => __awaiter(this, void 0, void 0, function* () {
         try {
+            yield this._checkConnection();
             const collection = this._db.collection(collectionName);
             return yield collection.aggregate(aggregation).toArray();
         }

@@ -75,6 +78,7 @@ class DatabaseEngineClass {
     });
     this.get = (collectionName, id) => __awaiter(this, void 0, void 0, function* () {
         try {
+            yield this._checkConnection();
             const collection = this._db.collection(collectionName);
             return yield collection.findOne({ _id: id });
         }

@@ -85,6 +89,7 @@ class DatabaseEngineClass {
     });
     this.findOne = (collectionName, query) => __awaiter(this, void 0, void 0, function* () {
         try {
+            yield this._checkConnection();
             const collection = this._db.collection(collectionName);
             return yield collection.findOne(query);
         }

@@ -95,6 +100,7 @@ class DatabaseEngineClass {
     });
     this.upsert = (collectionName, id, update) => __awaiter(this, void 0, void 0, function* () {
         try {
+            yield this._checkConnection();
             const collection = this._db.collection(collectionName);
             const result = yield collection.findOneAndUpdate({ _id: id }, { $set: update }, { upsert: true, returnDocument: 'after' });
             return result;

@@ -106,6 +112,7 @@ class DatabaseEngineClass {
     });
     this.addToList = (collectionName, id, arrayField, arrayItem) => __awaiter(this, void 0, void 0, function* () {
         try {
+            yield this._checkConnection();
             const collection = this._db.collection(collectionName);
             const result = yield collection.findOneAndUpdate({ _id: id }, { $push: { [arrayField]: arrayItem } }, { returnDocument: 'after' });
             return result;

@@ -117,6 +124,7 @@ class DatabaseEngineClass {
     });
     this.doUpdate = (collectionName, id, update) => __awaiter(this, void 0, void 0, function* () {
         try {
+            yield this._checkConnection();
             const collection = this._db.collection(collectionName);
             const result = yield collection.findOneAndUpdate({ _id: id }, update, { returnDocument: 'after' });
             return result;

@@ -126,6 +134,13 @@ class DatabaseEngineClass {
             throw error;
         }
     });
+    this._checkConnection = () => __awaiter(this, void 0, void 0, function* () {
+        if (this._connected)
+            return;
+        yield this.connect();
+        if (!this._connected)
+            throw new Error(`Can't to perform db operation: unable to connect to the database (${this._uri})`);
+    });
 }
 }
 const DatabaseEngine = new DatabaseEngineClass();
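Every read/write helper above now opens with the same guard, so a cold engine lazily connects on first use instead of failing. A condensed sketch of the pattern (names from the diff; the retry loop is elided):

    // Sketch of the lazy-connection guard, simplified from the diff above.
    class EngineSketch {
        constructor() { this._connected = false; }
        async connect() {
            // retry loop: up to MAX_TRY_CONNECTION attempts, errors collected
            this._connected = true;
        }
        async _checkConnection() {
            if (this._connected) return;      // fast path once connected
            await this.connect();             // lazily connect on first use
            if (!this._connected) throw new Error('unable to connect to the database');
        }
        async query(collectionName, filter) {
            await this._checkConnection();    // guard now added to every db helper
            // ... this._db.collection(collectionName).find(filter) ...
        }
    }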
package/definitions/json_schemas/producer-schema.json
CHANGED

@@ -76,6 +76,10 @@
           "none",
           "{REMORA_MASK_IN_DEV}"
         ]
+      },
+      "sourceFilename": {
+        "type": "boolean",
+        "description": "When true, this dimension will be populated with the source filename. Only valid for file-based producers (local, aws-s3) and only one dimension per producer can have this set to true. Useful when reading multiple files with wildcard patterns to track which file each row came from."
       }
     },
     "required": [

@@ -130,7 +134,7 @@
     },
     "fileKey": {
       "type": "string",
-      "description": "
+      "description": "For S3/local sources: the file key/path that identifies the file to read. For HTTP API sources: the API endpoint path (e.g., '/api/v1/users')"
     },
     "fileType": {
       "type": "string",

@@ -252,6 +256,40 @@
       "fileType": "CSV"
     },
     "_version": 2
+  },
+  {
+    "name": "APIUsers",
+    "description": "Producer for user data from REST API",
+    "source": "REST API with Bearer Token",
+    "dimensions": [
+      {
+        "name": "user_id",
+        "type": "string",
+        "pk": true
+      },
+      {
+        "name": "username",
+        "type": "string"
+      },
+      {
+        "name": "email",
+        "type": "string",
+        "classification": [
+          "PII",
+          "GDPR"
+        ],
+        "mask": "mask"
+      },
+      {
+        "name": "created_at",
+        "type": "datetime"
+      }
+    ],
+    "settings": {
+      "fileKey": "/api/v1/users",
+      "fileType": "JSON"
+    },
+    "_version": 1
   }
 ]
}
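Putting the new flag together: a producer that reads many files via a wildcard and records each row's origin might look like the sketch below. All names are hypothetical; only the sourceFilename flag and the wildcard fileKey are the features introduced in this version.

    {
      "name": "SalesFiles",
      "description": "Hypothetical producer tracking which CSV each row came from",
      "source": "Local Folder",
      "dimensions": [
        { "name": "order_id", "type": "string", "pk": true },
        { "name": "amount", "type": "number" },
        { "name": "origin_file", "type": "string", "sourceFilename": true }
      ],
      "settings": { "fileKey": "sales_*.csv", "fileType": "CSV" },
      "_version": 1
    }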
package/definitions/json_schemas/source-schema.json
CHANGED

@@ -24,7 +24,8 @@
         "aws-s3",
         "postgres",
         "local",
-        "delta-share"
+        "delta-share",
+        "http-api"
       ],
       "description": "The type of data engine"
     },

@@ -39,7 +40,10 @@
         "username-password",
         "access-secret-key",
         "arn",
-        "implicit"
+        "implicit",
+        "bearer-token",
+        "api-key",
+        "none"
       ],
       "description": "The authentication method to use"
     },

@@ -113,7 +117,47 @@
     },
     "bearerToken": {
       "type": "string",
-      "description": "
+      "description": "Bearer token used for authentication (Delta Sharing or HTTP API)"
+    },
+    "url": {
+      "type": "string",
+      "format": "uri",
+      "description": "Base URL for HTTP API sources"
+    },
+    "headers": {
+      "type": "object",
+      "description": "Custom HTTP headers for API requests",
+      "additionalProperties": {
+        "type": "string"
+      }
+    },
+    "queryParams": {
+      "type": "object",
+      "description": "Default query parameters for API requests",
+      "additionalProperties": {
+        "type": "string"
+      }
+    },
+    "httpMethod": {
+      "type": "string",
+      "enum": ["GET", "POST", "PUT", "PATCH", "DELETE"],
+      "description": "HTTP method to use for API requests",
+      "default": "GET"
+    },
+    "apiKey": {
+      "type": "string",
+      "description": "API key for api-key authentication method"
+    },
+    "apiKeyHeader": {
+      "type": "string",
+      "description": "Header name for API key (defaults to X-API-Key)",
+      "default": "X-API-Key"
+    },
+    "timeout": {
+      "type": "number",
+      "description": "Request timeout in milliseconds",
+      "default": 30000,
+      "minimum": 1000
     }
   },
   "required": ["method"]

@@ -172,6 +216,35 @@
       "clusterId": "analytics-cluster"
     },
     "_version": 1
+  },
+  {
+    "name": "REST API with Bearer Token",
+    "description": "HTTP API source with bearer token authentication",
+    "engine": "http-api",
+    "authentication": {
+      "method": "bearer-token",
+      "url": "https://api.example.com",
+      "bearerToken": "{API_BEARER_TOKEN}",
+      "headers": {
+        "Accept": "application/json"
+      },
+      "timeout": 30000
+    },
+    "_version": 1
+  },
+  {
+    "name": "Public REST API",
+    "description": "Public HTTP API with no authentication",
+    "engine": "http-api",
+    "authentication": {
+      "method": "none",
+      "url": "https://api.publicapis.org",
+      "headers": {
+        "Accept": "application/json"
+      },
+      "httpMethod": "GET"
+    },
+    "_version": 1
   }
 ]
}
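The bundled examples cover bearer-token and none; an api-key source would combine the other new fields roughly as follows (illustrative values, with the secret placeholder in the same {NAME} style as above):

    {
      "name": "Internal API with API Key",
      "description": "Hypothetical HTTP API source using api-key authentication",
      "engine": "http-api",
      "authentication": {
        "method": "api-key",
        "url": "https://internal.example.com",
        "apiKey": "{INTERNAL_API_KEY}",
        "apiKeyHeader": "X-API-Key",
        "timeout": 30000
      },
      "_version": 1
    }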
package/drivers/DriverFactory.js
CHANGED

@@ -16,6 +16,7 @@ const LocalDriver_1 = require("./LocalDriver");
 const RedshiftDriver_1 = __importDefault(require("./RedshiftDriver"));
 const S3Driver_1 = require("./S3Driver");
 const DeltaShareDriver_1 = __importDefault(require("./DeltaShareDriver"));
+const HttpApiDriver_1 = require("./HttpApiDriver");
 class DriverFactoryClass {
     constructor() {
         this.instantiateSource = (source) => __awaiter(this, void 0, void 0, function* () {

@@ -41,6 +42,11 @@ class DriverFactoryClass {
                 yield driver.init(source);
                 return driver;
             }
+            case 'http-api': {
+                const driver = new HttpApiDriver_1.HttpApiSourceDriver();
+                yield driver.init(source);
+                return driver;
+            }
             default: throw new Error(`Invalid driver type "${source.engine}". This driver is not implemented yet`);
         }
     });
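A sketch of resolving the new engine through the factory (the 'http-api' switch arm is from the diff; the singleton default export and the call site are assumptions based on the file's pattern):

    // Illustrative usage of the new 'http-api' factory arm.
    const DriverFactory = require('./drivers/DriverFactory').default; // assumed export
    const run = async () => {
        const driver = await DriverFactory.instantiateSource({
            engine: 'http-api',
            authentication: { method: 'none', url: 'https://api.publicapis.org' }
        });
        // driver is an initialized HttpApiSourceDriver; unknown engines still throw
        // `Invalid driver type "..."`.
        return driver;
    };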
package/drivers/DriverHelper.js
CHANGED

@@ -28,12 +28,17 @@ const Affirm_1 = __importDefault(require("../core/Affirm"));
 const DriverHelper = {
     appendToUnifiedFile: (options) => __awaiter(void 0, void 0, void 0, function* () {
         (0, Affirm_1.default)(options, 'Invalid options');
-        const { append, destinationPath, fileKey, headerLine, stream, fileType, hasHeaderRow, delimiter } = options;
+        const { append, destinationPath, fileKey, headerLine, stream, fileType, hasHeaderRow, delimiter, sourceFilename } = options;
         (0, Affirm_1.default)(headerLine, `Invalid header line`);
         const keys = (fileType === 'JSON' || fileType === 'JSONL')
             ? Object.keys(JSON.parse(headerLine))
             : [];
         const shouldValidateHeader = fileType === 'CSV' || (fileType === 'TXT' && hasHeaderRow === true);
+        // When sourceFilename is set, the headerLine includes $source_filename at the end.
+        // For validation, we need to compare against the original header without this suffix.
+        const originalHeaderLine = sourceFilename
+            ? headerLine.slice(0, headerLine.lastIndexOf(delimiter))
+            : headerLine;
         let isFirstLine = true;
         let hasValidatedHeader = shouldValidateHeader ? false : true;
         let leftoverData = '';

@@ -50,8 +55,8 @@ const DriverHelper = {
             const line = lines[i];
             // Header validation for first line
             if (!hasValidatedHeader && isFirstLine && i === 0) {
-                if (shouldValidateHeader &&
-                    const msg = `Error creating unified dataset: file "${fileKey}" has a different header line than the other files in this dataset\n\t-${fileKey}: ${line}\n\t-main: ${
+                if (shouldValidateHeader && originalHeaderLine && originalHeaderLine.trim() !== '' && line.trim() !== originalHeaderLine.trim()) {
+                    const msg = `Error creating unified dataset: file "${fileKey}" has a different header line than the other files in this dataset\n\t-${fileKey}: ${line}\n\t-main: ${originalHeaderLine}`;
                     Logger_1.default.log(msg);
                     return callback(new Error(msg));
                 }

@@ -77,7 +82,7 @@ const DriverHelper = {
         // Process any remaining data
         if (leftoverData.trim()) {
             if (shouldIncludeLine(leftoverData, -1)) {
-                callback(null, Buffer.from(processLine(leftoverData)));
+                callback(null, Buffer.from(processLine(leftoverData) + '\n'));
             }
             else {
                 callback(null, null);

@@ -101,21 +106,28 @@ const DriverHelper = {
         };
         const processLine = (line) => {
             lineCount++;
+            let processedLine;
             switch (fileType) {
                 case 'JSON':
                 case 'JSONL': {
                     try {
                         const parsed = JSON.parse(line);
-
+                        processedLine = keys.map(k => parsed[k]).join(delimiter);
                     }
                     catch (error) {
                         Logger_1.default.log(`Failed parsing in JSON line -> file: ${fileKey}; index: ${globalIndex}; line: ${line}; err: ${error === null || error === void 0 ? void 0 : error.name}`);
                         throw error;
                     }
+                    break;
                 }
                 default:
-
+                    processedLine = line;
+            }
+            // If sourceFilename is provided, append it to each line
+            if (sourceFilename) {
+                processedLine = processedLine + delimiter + sourceFilename;
             }
+            return processedLine;
         };
         const writeOptions = append ? { flags: 'a' } : {};
         const writeStream = (0, fs_1.createWriteStream)(destinationPath, writeOptions);
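Concretely, with sourceFilename set, processLine now flattens a JSON/JSONL record into a delimited row and appends the originating file name; a worked example with hypothetical data:

    // Worked example of the new processLine behavior (hypothetical record).
    const keys = ['id', 'name'];             // from the headerLine's JSON keys
    const delimiter = ',';
    const sourceFilename = 'users_2024.jsonl';
    const parsed = JSON.parse('{"id":"7","name":"Grace"}');
    let processedLine = keys.map(k => parsed[k]).join(delimiter); // "7,Grace"
    if (sourceFilename)
        processedLine = processedLine + delimiter + sourceFilename;
    console.log(processedLine); // "7,Grace,users_2024.jsonl"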
package/drivers/HttpApiDriver.js
ADDED

@@ -0,0 +1,204 @@
+"use strict";
+var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
+    function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
+    return new (P || (P = Promise))(function (resolve, reject) {
+        function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
+        function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
+        function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
+        step((generator = generator.apply(thisArg, _arguments || [])).next());
+    });
+};
+var __importDefault = (this && this.__importDefault) || function (mod) {
+    return (mod && mod.__esModule) ? mod : { "default": mod };
+};
+Object.defineProperty(exports, "__esModule", { value: true });
+exports.HttpApiSourceDriver = void 0;
+const Affirm_1 = __importDefault(require("../core/Affirm"));
+const SecretManager_1 = __importDefault(require("../engines/SecretManager"));
+const Algo_1 = __importDefault(require("../core/Algo"));
+const Logger_1 = __importDefault(require("../helper/Logger"));
+const DriverHelper_1 = __importDefault(require("./DriverHelper"));
+class HttpApiSourceDriver {
+    constructor() {
+        this.init = (source) => __awaiter(this, void 0, void 0, function* () {
+            (0, Affirm_1.default)(source, 'Invalid source');
+            (0, Affirm_1.default)(source.authentication, 'Invalid authentication');
+            (0, Affirm_1.default)(source.authentication.url, 'HTTP API source requires a URL in authentication.url');
+            this._source = source;
+            this._baseUrl = SecretManager_1.default.replaceSecret(source.authentication.url);
+            this._httpMethod = source.authentication.httpMethod || 'GET';
+            this._timeout = source.authentication.timeout || 30000; // 30 seconds default
+            this._headers = source.authentication.headers ? Object.assign({}, source.authentication.headers) : {};
+            this._queryParams = source.authentication.queryParams ? Object.assign({}, source.authentication.queryParams) : {};
+            // Handle different authentication methods
+            switch (source.authentication.method) {
+                case 'bearer-token': {
+                    (0, Affirm_1.default)(source.authentication.bearerToken, 'Bearer token authentication requires bearerToken');
+                    this._headers['Authorization'] = `Bearer ${SecretManager_1.default.replaceSecret(source.authentication.bearerToken)}`;
+                    break;
+                }
+                case 'api-key': {
+                    (0, Affirm_1.default)(source.authentication.apiKey, 'API key authentication requires apiKey');
+                    const apiKeyHeader = source.authentication.apiKeyHeader || 'X-API-Key';
+                    this._headers[apiKeyHeader] = SecretManager_1.default.replaceSecret(source.authentication.apiKey);
+                    break;
+                }
+                case 'username-password': {
+                    (0, Affirm_1.default)(source.authentication.user && source.authentication.password, 'Username-password authentication requires user and password');
+                    const credentials = Buffer.from(`${SecretManager_1.default.replaceSecret(source.authentication.user)}:${SecretManager_1.default.replaceSecret(source.authentication.password)}`).toString('base64');
+                    this._headers['Authorization'] = `Basic ${credentials}`;
+                    break;
+                }
+                case 'none':
+                    // No authentication required
+                    break;
+                default:
+                    throw new Error(`Authentication method "${source.authentication.method}" is not supported for HTTP API sources`);
+            }
+            // Test connection
+            try {
+                yield this._makeRequest(this._baseUrl);
+                Logger_1.default.log(`HTTP API connection to ${this._baseUrl} successful`);
+            }
+            catch (error) {
+                throw new Error(`Failed to connect to HTTP API at ${this._baseUrl}: ${error.message}`);
+            }
+            return this;
+        });
+        this._makeRequest = (url, options) => __awaiter(this, void 0, void 0, function* () {
+            const method = (options === null || options === void 0 ? void 0 : options.method) || this._httpMethod;
+            const headers = Object.assign(Object.assign({}, this._headers), options === null || options === void 0 ? void 0 : options.additionalHeaders);
+            const queryParams = Object.assign(Object.assign({}, this._queryParams), options === null || options === void 0 ? void 0 : options.additionalQueryParams);
+            // Build URL with query parameters
+            const urlWithParams = new URL(url);
+            Object.entries(queryParams).forEach(([key, value]) => {
+                urlWithParams.searchParams.append(key, value);
+            });
+            const fetchOptions = {
+                method,
+                headers,
+                signal: AbortSignal.timeout(this._timeout)
+            };
+            if ((options === null || options === void 0 ? void 0 : options.body) && (method === 'POST' || method === 'PUT' || method === 'PATCH')) {
+                fetchOptions.body = typeof options.body === 'string'
+                    ? options.body
+                    : JSON.stringify(options.body);
+                if (!headers['Content-Type']) {
+                    headers['Content-Type'] = 'application/json';
+                }
+            }
+            const response = yield fetch(urlWithParams.toString(), fetchOptions);
+            if (!response.ok) {
+                throw new Error(`HTTP ${response.status}: ${response.statusText}`);
+            }
+            const contentType = response.headers.get('content-type');
+            if (contentType === null || contentType === void 0 ? void 0 : contentType.includes('application/json')) {
+                return yield response.json();
+            }
+            else {
+                return yield response.text();
+            }
+        });
+        this.execute = (_sql) => __awaiter(this, void 0, void 0, function* () {
+            void _sql;
+            throw new Error('SQL execution is not supported for HTTP API sources. Use query() or readAll() instead.');
+        });
+        this.query = (_sql, _values) => __awaiter(this, void 0, void 0, function* () {
+            void _sql;
+            void _values;
+            throw new Error('SQL queries are not supported for HTTP API sources. Use readAll() to fetch data from the API.');
+        });
+        this.exist = (producer) => __awaiter(this, void 0, void 0, function* () {
+            try {
+                const endpoint = producer.settings.fileKey || '';
+                const url = endpoint.startsWith('http') ? endpoint : `${this._baseUrl}${endpoint}`;
+                yield this._makeRequest(url, { method: 'HEAD' });
+                return true;
+            }
+            catch (error) {
+                if (error.message.includes('404')) {
+                    return false;
+                }
+                throw error;
+            }
+        });
+        this.readAll = (request, values) => __awaiter(this, void 0, void 0, function* () {
+            (0, Affirm_1.default)(request, 'Invalid read request');
+            (0, Affirm_1.default)(request.fileKey, 'Invalid file key (endpoint path)');
+            const endpoint = request.fileKey;
+            const url = endpoint.startsWith('http') ? endpoint : `${this._baseUrl}${endpoint}`;
+            // Convert IQueryParameter[] to query params if provided
+            const additionalQueryParams = {};
+            if (values && values.length > 0) {
+                values.forEach(param => {
+                    additionalQueryParams[param.name] = param.value;
+                });
+            }
+            const data = yield this._makeRequest(url, { additionalQueryParams });
+            // Convert response to string array (lines)
+            return this._extractObjectsFromResponse(data, request.httpApi).map(x => JSON.stringify(x));
+        });
+        this.readLinesInRange = (request) => __awaiter(this, void 0, void 0, function* () {
+            (0, Affirm_1.default)(request, 'Invalid read request');
+            (0, Affirm_1.default)(request.options, 'Invalid read request options');
+            const allLines = yield this.readAll(request);
+            const { lineFrom, lineTo } = request.options;
+            if (Algo_1.default.hasVal(lineFrom) && Algo_1.default.hasVal(lineTo)) {
+                return allLines.slice(lineFrom, lineTo);
+            }
+            return allLines;
+        });
+        this.download = (dataset) => __awaiter(this, void 0, void 0, function* () {
+            (0, Affirm_1.default)(dataset, 'Invalid dataset');
+            const file = dataset.getFile();
+            (0, Affirm_1.default)(file, 'Invalid dataset file');
+            (0, Affirm_1.default)(file.fileKey, 'Invalid file key (endpoint path)');
+            const endpoint = file.fileKey;
+            const url = endpoint.startsWith('http') ? endpoint : `${this._baseUrl}${endpoint}`;
+            const data = yield this._makeRequest(url);
+            const apiObjects = this._extractObjectsFromResponse(data, file.httpApi);
+            dataset.setFirstLine(JSON.stringify(apiObjects[0]));
+            const totalLineCount = yield DriverHelper_1.default.appendObjectsToUnifiedFile({
+                append: true,
+                delimiter: dataset.getDelimiter(),
+                destinationPath: dataset.getPath(),
+                objects: apiObjects
+            });
+            dataset.setCount(totalLineCount);
+            return dataset;
+        });
+        this._extractObjectsFromResponse = (data, httpApi) => {
+            let itemsData = [];
+            if (httpApi && httpApi.dataProperty && httpApi.dataProperty.length > 0) {
+                itemsData = data[httpApi.dataProperty];
+            }
+            else {
+                if (typeof data === 'string') {
+                    itemsData = data.split('\n').filter(line => line.trim().length > 0);
+                }
+                else if (Array.isArray(data)) {
+                    itemsData = data;
+                }
+                else if (typeof data === 'object' && data !== null) {
+                    const dataObj = data;
+                    if (dataObj.data && Array.isArray(dataObj.data)) {
+                        itemsData = dataObj.data;
+                    }
+                    else if (dataObj.results && Array.isArray(dataObj.results)) {
+                        itemsData = dataObj.results;
+                    }
+                    else if (dataObj.items && Array.isArray(dataObj.items)) {
+                        itemsData = dataObj.items;
+                    }
+                    else {
+                        // Single object, return as single line
+                        itemsData = [data];
+                    }
+                }
+            }
+            return itemsData;
+        };
+    }
+}
+exports.HttpApiSourceDriver = HttpApiSourceDriver;
+exports.default = HttpApiSourceDriver;
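_extractObjectsFromResponse normalizes whatever the endpoint returns into an array of records; summarizing the branches above with hypothetical payloads:

    // Shape resolution in _extractObjectsFromResponse (hypothetical payloads):
    // explicit dataProperty:  ({ users: [a, b] }, { dataProperty: 'users' })  -> [a, b]
    // bare array:             [a, b]                                          -> [a, b]
    // common envelopes:       { data: [...] } / { results: [...] } / { items: [...] } -> the array
    // plain text:             "a\nb\n"                                        -> ['a', 'b']
    // single object fallback: { id: 1 }                                       -> [{ id: 1 }]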
package/drivers/LocalDriver.js
CHANGED

@@ -65,6 +65,7 @@ const ParseHelper_1 = __importDefault(require("../engines/parsing/ParseHelper"))
 const FileExporter_1 = __importDefault(require("../engines/file/FileExporter"));
 const Logger_1 = __importDefault(require("../helper/Logger"));
 const DriverHelper_1 = __importDefault(require("./DriverHelper"));
+const Constants_1 = __importDefault(require("../Constants"));
 class LocalSourceDriver {
     constructor() {
         this.init = (source) => __awaiter(this, void 0, void 0, function* () {

@@ -123,7 +124,8 @@ class LocalSourceDriver {
         (0, Affirm_1.default)(file, 'Invalid dataset file');
         (0, Affirm_1.default)(file.fileKey, 'Invalid file key');
         (0, Affirm_1.default)(file.fileType, `Invalid file type`);
-        const
+        const includeSourceFilename = file.includeSourceFilename === true;
+        const copyLocally = (fileKey_1, headerLine_1, ...args_1) => __awaiter(this, [fileKey_1, headerLine_1, ...args_1], void 0, function* (fileKey, headerLine, appendMode = false, sourceFilename) {
             const sourceFilePath = path_1.default.join(this._path, fileKey);
             (0, Affirm_1.default)(fs.existsSync(sourceFilePath), `Source file does not exist: ${sourceFilePath}`);
             // Copy and validate header in a single stream pass

@@ -136,7 +138,8 @@ class LocalSourceDriver {
                 headerLine,
                 fileType: file.fileType,
                 hasHeaderRow: file.hasHeaderRow,
-                delimiter: dataset.getDelimiter()
+                delimiter: dataset.getDelimiter(),
+                sourceFilename
             });
         });
         const { fileKey } = file;

@@ -145,21 +148,32 @@ class LocalSourceDriver {
             Logger_1.default.log(`Matched ${allFileKeys.length} files, copying locally and creating unified dataset.`);
             Affirm_1.default.hasItems(allFileKeys, `The file key "${fileKey}" doesn't have any matches in path "${this._path}".`);
             // Get header line from the first file
-
+            let headerLine = (yield DriverHelper_1.default.quickReadFile(path_1.default.join(this._path, allFileKeys[0]), 1))[0];
+            // If including source filename, append a placeholder column name to the header
+            if (includeSourceFilename) {
+                headerLine = headerLine + dataset.getDelimiter() + Constants_1.default.SOURCE_FILENAME_COLUMN;
+            }
             dataset.setFirstLine(headerLine);
             let totalLineCount = 0;
             // Copy files sequentially to avoid file conflicts
             for (let i = 0; i < allFileKeys.length; i++) {
-
+                const currentFileKey = allFileKeys[i];
+                // Pass the filename (just the basename) if includeSourceFilename is enabled
+                const sourceFilename = includeSourceFilename ? path_1.default.basename(currentFileKey) : undefined;
+                totalLineCount += yield copyLocally(currentFileKey, headerLine, i > 0, sourceFilename); // Append mode for subsequent files
             }
             dataset.setCount(totalLineCount);
             return dataset;
         }
         else {
-            // For single file,
-
+            // For single file, include the filename if configured
+            let headerLine = (yield DriverHelper_1.default.quickReadFile(path_1.default.join(this._path, fileKey), 1))[0];
+            if (includeSourceFilename) {
+                headerLine = headerLine + dataset.getDelimiter() + Constants_1.default.SOURCE_FILENAME_COLUMN;
+            }
             dataset.setFirstLine(headerLine);
-            const
+            const sourceFilename = includeSourceFilename ? path_1.default.basename(fileKey) : undefined;
+            const totalLineCount = yield copyLocally(fileKey, headerLine, false, sourceFilename);
             dataset.setCount(totalLineCount);
             return dataset;
         }
package/drivers/S3Driver.js
CHANGED

@@ -24,6 +24,7 @@ const client_s3_1 = require("@aws-sdk/client-s3");
 const Affirm_1 = __importDefault(require("../core/Affirm"));
 const SecretManager_1 = __importDefault(require("../engines/SecretManager"));
 const readline_1 = __importDefault(require("readline"));
+const path_1 = __importDefault(require("path"));
 const Algo_1 = __importDefault(require("../core/Algo"));
 const xlsx_1 = __importDefault(require("xlsx"));
 const XMLParser_1 = __importDefault(require("../engines/parsing/XMLParser")); // Added XMLParser import

@@ -32,6 +33,7 @@ const ParseHelper_1 = __importDefault(require("../engines/parsing/ParseHelper"))
 const FileExporter_1 = __importDefault(require("../engines/file/FileExporter"));
 const DriverHelper_1 = __importDefault(require("./DriverHelper"));
 const Logger_1 = __importDefault(require("../helper/Logger"));
+const Constants_1 = __importDefault(require("../Constants"));
 class S3DestinationDriver {
     constructor() {
         this.init = (source) => __awaiter(this, void 0, void 0, function* () {

@@ -213,7 +215,8 @@ class S3SourceDriver {
         (0, Affirm_1.default)(file, 'Invalid dataset file');
         (0, Affirm_1.default)(file.fileKey, 'Invalid file key');
         (0, Affirm_1.default)(file.fileType, `Invalid file type`);
-        const
+        const includeSourceFilename = file.includeSourceFilename === true;
+        const downloadLocally = (fileUrl_1, headerLine_1, ...args_1) => __awaiter(this, [fileUrl_1, headerLine_1, ...args_1], void 0, function* (fileUrl, headerLine, appendMode = false, sourceFilename) {
             // Download and validate header in a single stream pass
             const command = new client_s3_1.GetObjectCommand({
                 Bucket: this._bucketName,

@@ -230,7 +233,8 @@ class S3SourceDriver {
                 headerLine,
                 fileType: file.fileType,
                 hasHeaderRow: file.hasHeaderRow,
-                delimiter: dataset.getDelimiter()
+                delimiter: dataset.getDelimiter(),
+                sourceFilename
             });
         });
         const { fileKey } = file;

@@ -246,18 +250,25 @@ class S3SourceDriver {
             const firstFileResponse = yield this._client.send(firstFileCommand);
             (0, Affirm_1.default)(firstFileResponse.Body, 'Failed to fetch first file from S3');
             const firstFileStream = firstFileResponse.Body;
-
+            let headerLine = yield this.getFirstLineFromStream(firstFileStream);
+            // If including source filename, append a placeholder column name to the header
+            if (includeSourceFilename) {
+                headerLine = headerLine + dataset.getDelimiter() + Constants_1.default.SOURCE_FILENAME_COLUMN;
+            }
             dataset.setFirstLine(headerLine);
             let totalLineCount = 0;
             // Download files sequentially to avoid file conflicts
             for (let i = 0; i < allFileKeys.length; i++) {
-
+                const currentFileKey = allFileKeys[i];
+                // Pass the filename (just the basename) if includeSourceFilename is enabled
+                const sourceFilename = includeSourceFilename ? path_1.default.basename(currentFileKey) : undefined;
+                totalLineCount += yield downloadLocally(currentFileKey, headerLine, i > 0, sourceFilename); // Append mode for subsequent files
             }
             dataset.setCount(totalLineCount);
             return dataset;
         }
         else {
-            // Get header line from the
+            // Get header line from the single file
             const firstFileCommand = new client_s3_1.GetObjectCommand({
                 Bucket: this._bucketName,
                 Key: fileKey

@@ -265,10 +276,15 @@ class S3SourceDriver {
             const firstFileResponse = yield this._client.send(firstFileCommand);
             (0, Affirm_1.default)(firstFileResponse.Body, 'Failed to fetch first file from S3');
             const firstFileStream = firstFileResponse.Body;
-
+            let headerLine = yield this.getFirstLineFromStream(firstFileStream);
+            // If including source filename, append a placeholder column name to the header
+            if (includeSourceFilename) {
+                headerLine = headerLine + dataset.getDelimiter() + Constants_1.default.SOURCE_FILENAME_COLUMN;
+            }
             dataset.setFirstLine(headerLine);
-            //
-            const
+            // Pass the filename if includeSourceFilename is enabled
+            const sourceFilename = includeSourceFilename ? path_1.default.basename(fileKey) : undefined;
+            const totalLineCount = yield downloadLocally(fileKey, headerLine, false, sourceFilename);
             dataset.setCount(totalLineCount);
             return dataset;
         }
package/engines/dataset/DatasetManager.js
CHANGED

@@ -22,6 +22,7 @@ const Dataset_1 = __importDefault(require("./Dataset"));
 const promises_1 = require("stream/promises");
 const fs_1 = require("fs");
 const DeveloperEngine_1 = __importDefault(require("../ai/DeveloperEngine"));
+const Constants_1 = __importDefault(require("../../Constants"));
 class DatasetManagerClass {
     constructor() {
         /**

@@ -30,14 +31,19 @@ class DatasetManagerClass {
          * when the same producer / consumer is executed multiple times in parallel.
          */
         this.create = (producer, executionId) => {
+            var _a, _b;
             (0, Affirm_1.default)(producer, 'Invalid producer');
-            const { name, settings: { delimiter, fileKey, fileType, hasHeaderRow, sheetName } } = producer;
+            const { name, settings: { delimiter, fileKey, fileType, hasHeaderRow, sheetName, httpApi } } = producer;
+            // Check if any dimension has sourceFilename flag set to true
+            const hasSourceFilenameDimension = (_b = (_a = producer.dimensions) === null || _a === void 0 ? void 0 : _a.some(d => d.sourceFilename === true)) !== null && _b !== void 0 ? _b : false;
             const dataset = new Dataset_1.default(name, {
                 fileKey,
                 fileType,
                 hasHeaderRow,
                 sheetName,
-                delimiter
+                delimiter,
+                httpApi,
+                includeSourceFilename: hasSourceFilenameDimension
             }, undefined, executionId);
             return dataset;
         };

@@ -49,7 +55,7 @@ class DatasetManagerClass {
             return this.buildDimensionsFromFirstLine(firstLine, dataset.getFile(), producer, discover);
         });
         this.buildDimensionsFromFirstLine = (firstLine_1, dsFile_1, producer_1, ...args_1) => __awaiter(this, [firstLine_1, dsFile_1, producer_1, ...args_1], void 0, function* (firstLine, dsFile, producer, discover = false) {
-            var _a, _b, _c, _d, _e, _f, _g, _h;
+            var _a, _b, _c, _d, _e, _f, _g, _h, _j, _k, _l, _m, _o;
             (0, Affirm_1.default)(firstLine, `Invalid first line`);
             (0, Affirm_1.default)(dsFile, `Invalid dataset file`);
             (0, Affirm_1.default)(producer, `Invalid producer`);

@@ -77,6 +83,12 @@ class DatasetManagerClass {
                 const columns = FileCompiler_1.default.compileProducer(producer, source);
                 const firstObject = JSON.parse(firstLine);
                 const keys = Object.keys(firstObject);
+                // If includeSourceFilename is enabled, the driver has added $source_filename column
+                // We need to add it to the keys list so dimensions can reference it
+                const includeSourceFilename = file.includeSourceFilename === true;
+                if (includeSourceFilename) {
+                    keys.push(Constants_1.default.SOURCE_FILENAME_COLUMN);
+                }
                 if (discover) {
                     return {
                         delimiter: (_b = file.delimiter) !== null && _b !== void 0 ? _b : ',',

@@ -91,7 +103,21 @@ class DatasetManagerClass {
                 }
                 const dimensions = [];
                 for (const pColumn of columns) {
-
+                    // Handle sourceFilename dimension specially - it maps to the $source_filename column added by the driver
+                    if (((_c = pColumn.dimension) === null || _c === void 0 ? void 0 : _c.sourceFilename) === true) {
+                        if (includeSourceFilename) {
+                            const sourceFilenameIndex = keys.findIndex(x => x === Constants_1.default.SOURCE_FILENAME_COLUMN);
+                            dimensions.push({
+                                index: sourceFilenameIndex,
+                                key: Constants_1.default.SOURCE_FILENAME_COLUMN,
+                                name: pColumn.nameInProducer,
+                                hidden: null,
+                                type: (_e = (_d = pColumn.dimension) === null || _d === void 0 ? void 0 : _d.type) !== null && _e !== void 0 ? _e : 'string'
+                            });
+                        }
+                        continue;
+                    }
+                    const columnKey = (_f = pColumn.aliasInProducer) !== null && _f !== void 0 ? _f : pColumn.nameInProducer;
                     const csvColumnIndex = keys.findIndex(x => x === columnKey);
                     (0, Affirm_1.default)(csvColumnIndex > -1, `The column "${pColumn.nameInProducer}" (with key "${columnKey}") of producer "${producer.name}" doesn't exist in the underlying dataset.`);
                     dimensions.push({

@@ -99,47 +125,64 @@ class DatasetManagerClass {
                         key: columnKey,
                         name: pColumn.nameInProducer,
                         hidden: null,
-                        type: (
+                        type: (_h = (_g = pColumn.dimension) === null || _g === void 0 ? void 0 : _g.type) !== null && _h !== void 0 ? _h : 'string'
                     });
                 }
-                const delimiterChar = (
+                const delimiterChar = (_j = file.delimiter) !== null && _j !== void 0 ? _j : ',';
                 return { dimensions, delimiter: delimiterChar };
             }
             case 'TXT': {
                 if (!file.hasHeaderRow) {
                     // If the file is a TXT and there isn't an header row, then I add a fake one that maps directly to the producer
-                    const delimiterChar = (
+                    const delimiterChar = (_k = file.delimiter) !== null && _k !== void 0 ? _k : ',';
                     const source = Environment_1.default.getSource(producer.source);
                     const columns = FileCompiler_1.default.compileProducer(producer, source);
+                    const includeSourceFilename = file.includeSourceFilename === true;
                     if (discover) {
                         // Since I don't have an header, and I'm discovering, I just create placeholder dimensions based on the same number of columns of the txt
+                        const colValues = firstLine.split(delimiterChar);
+                        const dimensions = colValues.map((x, i) => ({
+                            hidden: false,
+                            index: i,
+                            key: `Col ${i + 1}`,
+                            name: `Col ${i + 1}`,
+                            type: 'string'
+                        }));
                         return {
                             delimiter: delimiterChar,
-                            dimensions
-                                hidden: false,
-                                index: i,
-                                key: `Col ${i + 1}`,
-                                name: `Col ${i + 1}`,
-                                type: 'string'
-                            }))
+                            dimensions
                         };
                     }
+                    // Filter out sourceFilename columns for index-based mapping, but track them for later
+                    const regularColumns = columns.filter(x => { var _a; return ((_a = x.dimension) === null || _a === void 0 ? void 0 : _a.sourceFilename) !== true; });
+                    const sourceFilenameColumn = columns.find(x => { var _a; return ((_a = x.dimension) === null || _a === void 0 ? void 0 : _a.sourceFilename) === true; });
+                    const dimensions = regularColumns.map((x, i) => {
+                        var _a, _b, _c;
+                        return ({
+                            key: (_a = x.aliasInProducer) !== null && _a !== void 0 ? _a : x.nameInProducer,
+                            name: x.nameInProducer,
+                            index: i,
+                            hidden: null,
+                            type: (_c = (_b = x.dimension) === null || _b === void 0 ? void 0 : _b.type) !== null && _c !== void 0 ? _c : 'string'
+                        });
+                    });
+                    // Add sourceFilename dimension at the end if enabled
+                    if (sourceFilenameColumn && includeSourceFilename) {
+                        dimensions.push({
+                            key: Constants_1.default.SOURCE_FILENAME_COLUMN,
+                            name: sourceFilenameColumn.nameInProducer,
+                            index: regularColumns.length, // Index after all regular columns
+                            hidden: null,
+                            type: (_m = (_l = sourceFilenameColumn.dimension) === null || _l === void 0 ? void 0 : _l.type) !== null && _m !== void 0 ? _m : 'string'
+                        });
+                    }
                     return {
-                        dimensions
-                            var _a, _b, _c;
-                            return ({
-                                key: (_a = x.aliasInProducer) !== null && _a !== void 0 ? _a : x.nameInProducer,
-                                name: x.nameInProducer,
-                                index: i,
-                                hidden: null,
-                                type: (_c = (_b = x.dimension) === null || _b === void 0 ? void 0 : _b.type) !== null && _c !== void 0 ? _c : 'string'
-                            });
-                        }),
+                        dimensions,
                         delimiter: delimiterChar
                     };
                 }
                 else {
-                    const delimiterChar = (
+                    const delimiterChar = (_o = producer.settings.delimiter) !== null && _o !== void 0 ? _o : ',';
                    const rawDimensions = ParseManager_1.default._extractHeader(firstLine, delimiterChar, producer, discover);
                     return {
                         dimensions: rawDimensions.map(x => ({
package/engines/dataset/DatasetRecord.js
CHANGED

@@ -13,7 +13,8 @@ class DatasetRecord {
     const parts = CSVParser_1.default.parseRow(row, delimiter);
     for (let i = 0; i < dimensions.length; i++) {
         const dim = dimensions[i];
-
+        // Use dim.index to get the correct column from the file, not the loop index
+        this._value[dim.name] = TypeCaster_1.default.cast(parts[dim.index], dim.type, dim.format);
     }
 }
};
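This fix matters whenever the mapped dimensions are a subset or reordering of the file's columns, e.g. when $source_filename sits at the end; a runnable illustration with hypothetical data:

    // Why parts[dim.index] (not parts[i]) is correct - hypothetical row/dimensions.
    const parts = ['7', 'Grace', 'grace@example.com', 'users_2024.csv'];
    const dimensions = [
        { name: 'email', index: 2, type: 'string' },
        { name: 'origin_file', index: 3, type: 'string' }
    ];
    const value = {};
    for (let i = 0; i < dimensions.length; i++) {
        const dim = dimensions[i];
        value[dim.name] = parts[dim.index]; // parts[i] would wrongly yield '7' and 'Grace'
    }
    console.log(value); // { email: 'grace@example.com', origin_file: 'users_2024.csv' }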
package/engines/execution/ExecutionPlanner.js
CHANGED

@@ -112,7 +112,8 @@ class ExecutionPlannerClas {
             }
             case 'local':
             case 'aws-s3':
-            case 'delta-share':
+            case 'delta-share':
+            case 'http-api': {
                 plan.push({ type: 'load-dataset', producer });
                 plan.push({ type: 'prepare-dataset', producer });
                 if (producer.dimensions.some(x => { var _a, _b; return ((_a = x.alias) === null || _a === void 0 ? void 0 : _a.includes('{')) || ((_b = x.alias) === null || _b === void 0 ? void 0 : _b.includes('[')); }))
package/engines/parsing/ParseManager.js
CHANGED

@@ -7,10 +7,11 @@ const Affirm_1 = __importDefault(require("../../core/Affirm"));
 const Environment_1 = __importDefault(require("../Environment"));
 const FileCompiler_1 = __importDefault(require("../file/FileCompiler"));
 const CSVParser_1 = __importDefault(require("./CSVParser"));
+const Constants_1 = __importDefault(require("../../Constants"));
 class ParseManagerClass {
     constructor() {
         this._extractHeader = (headerLine, delimiter, producer, discover) => {
-            var _a, _b, _c;
+            var _a, _b, _c, _d, _e, _f;
             (0, Affirm_1.default)(headerLine, `Invalid CSV header line for producer "${producer.name}"`);
             (0, Affirm_1.default)(delimiter, 'Invalid CSV delimiter');
             (0, Affirm_1.default)(producer, 'Invalid producer');

@@ -22,14 +23,29 @@ class ParseManagerClass {
                 columns = headerColumns.map(x => ({ nameInProducer: x }));
             const csvColumns = [];
             for (const pColumn of columns) {
-
+                // Skip sourceFilename dimensions - they don't exist in the source file
+                // They are added dynamically by the driver when reading the file
+                if (((_a = pColumn.dimension) === null || _a === void 0 ? void 0 : _a.sourceFilename) === true) {
+                    // Find the index of $source_filename in the header (it was added by the driver)
+                    const sourceFilenameIndex = headerColumns.findIndex(x => x === Constants_1.default.SOURCE_FILENAME_COLUMN);
+                    if (sourceFilenameIndex > -1) {
+                        csvColumns.push({
+                            index: sourceFilenameIndex,
+                            name: Constants_1.default.SOURCE_FILENAME_COLUMN,
+                            saveAs: pColumn.nameInProducer,
+                            type: (_c = (_b = pColumn.dimension) === null || _b === void 0 ? void 0 : _b.type) !== null && _c !== void 0 ? _c : 'string'
+                        });
+                    }
+                    continue;
+                }
+                const columnKey = (_d = pColumn.aliasInProducer) !== null && _d !== void 0 ? _d : pColumn.nameInProducer;
                 const csvColumnIndex = headerColumns.findIndex(x => x === columnKey);
                 (0, Affirm_1.default)(csvColumnIndex > -1, `The column "${pColumn.nameInProducer}" (with key "${columnKey}") of producer "${producer.name}" doesn't exist in the underlying dataset.`);
                 csvColumns.push({
                     index: csvColumnIndex,
                     name: columnKey,
                     saveAs: pColumn.nameInProducer,
-                    type: (
+                    type: (_f = (_e = pColumn.dimension) === null || _e === void 0 ? void 0 : _e.type) !== null && _f !== void 0 ? _f : 'string'
                 });
             }
             return csvColumns;
package/engines/producer/ProducerEngine.js
CHANGED

@@ -65,7 +65,7 @@ class ProducerEngineClass {
         }
     });
     this.readFile = (producer, options) => __awaiter(this, void 0, void 0, function* () {
-        var _a;
+        var _a, _b, _c;
         (0, Affirm_1.default)(producer, 'Invalid producer');
         (0, Affirm_1.default)(options, 'Invalid options');
         if (options.readmode === 'lines')

@@ -79,16 +79,25 @@ class ProducerEngineClass {
         let lines = [];
         switch (options.readmode) {
             case 'lines':
-                lines = yield driver.readLinesInRange({
+                lines = yield driver.readLinesInRange({
+                    fileKey,
+                    fileType,
+                    options: { lineFrom: options.lines.from, lineTo: options.lines.to, sheetName, hasHeaderRow },
+                    httpApi: (_a = producer.settings) === null || _a === void 0 ? void 0 : _a.httpApi
+                });
                 break;
             case 'all':
-                lines = yield driver.readAll({
+                lines = yield driver.readAll({
+                    fileKey, fileType,
+                    options: { sheetName, hasHeaderRow },
+                    httpApi: (_b = producer.settings) === null || _b === void 0 ? void 0 : _b.httpApi
+                });
                 break;
             case 'download':
                 dataset = yield driver.download(dataset);
                 break;
         }
-        switch ((
+        switch ((_c = producer.settings.fileType) === null || _c === void 0 ? void 0 : _c.toUpperCase()) {
             case 'CSV':
             case 'TXT':
                 return { data: lines, dataset, dataType: 'lines-of-text' };
package/engines/validation/Validator.js
CHANGED

@@ -51,6 +51,23 @@ class ValidatorClass {
         errors.push(`Missing parameter "source" in producer`);
     if (producer.dimensions.some(x => x.name.includes('{') || x.name.includes('[')))
         errors.push(`Invalid dimension name found in producer "${producer.name}": can't use characters "{" or "[" in dimension names`);
+    // Validate sourceFilename dimension usage
+    const sourceFilenameDimensions = producer.dimensions.filter(x => x.sourceFilename === true);
+    if (sourceFilenameDimensions.length > 1) {
+        errors.push(`Producer "${producer.name}" has multiple dimensions with sourceFilename=true. Only one dimension can have this flag.`);
+    }
+    if (sourceFilenameDimensions.length > 0) {
+        const source = Environment_1.default.getSource(producer.source);
+        if (source) {
+            const validEngines = ['local', 'aws-s3'];
+            if (!validEngines.includes(source.engine)) {
+                errors.push(`Producer "${producer.name}" has a dimension with sourceFilename=true but the source engine "${source.engine}" doesn't support this feature. Only "local" and "aws-s3" sources support sourceFilename.`);
+            }
+            if (!producer.settings.fileKey && !producer.settings.fileType) {
+                errors.push(`Producer "${producer.name}" has a dimension with sourceFilename=true but is not a file-based producer. sourceFilename requires fileKey and fileType to be set.`);
+            }
+        }
+    }
 }
 catch (e) {
     if (errors.length === 0)
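For instance, a producer declaring two sourceFilename dimensions against a non-file source would now collect both new errors ("Only one dimension can have this flag." and the unsupported-engine message). A hypothetical definition that trips the checks:

    {
      "name": "BadProducer",
      "source": "Postgres DB",
      "dimensions": [
        { "name": "file_a", "type": "string", "sourceFilename": true },
        { "name": "file_b", "type": "string", "sourceFilename": true }
      ],
      "settings": {},
      "_version": 1
    }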
package/package.json
CHANGED

@@ -1,6 +1,6 @@
 {
   "name": "@forzalabs/remora",
-  "version": "0.1.9",
+  "version": "0.2.1",
   "description": "A powerful CLI tool for seamless data translation.",
   "main": "index.js",
   "private": false,

@@ -61,6 +61,7 @@
   "react": "^18.2.0",
   "react-dom": "^18.2.0",
   "seedrandom": "^3.0.5",
+  "uuid": "^13.0.0",
   "workerpool": "^9.3.3",
   "xlsx": "^0.18.5",
   "zod": "^3.24.2"