@forzalabs/remora 0.1.3-nasco.3 → 0.1.5-nasco.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/Constants.js +1 -1
- package/definitions/json_schemas/consumer-schema.json +9 -1
- package/definitions/json_schemas/producer-schema.json +2 -1
- package/definitions/json_schemas/source-schema.json +14 -1
- package/documentation/README.md +1 -0
- package/documentation/default_resources/consumer.json +7 -7
- package/drivers/DeltaShareDriver.js +178 -0
- package/drivers/DriverFactory.js +6 -0
- package/drivers/DriverHelper.js +16 -1
- package/drivers/LocalDriver.js +1 -0
- package/drivers/S3Driver.js +1 -0
- package/engines/ai/DeveloperEngine.js +90 -1
- package/engines/consumer/ConsumerEngine.js +1 -1
- package/engines/consumer/PostProcessor.js +27 -18
- package/engines/dataset/Dataset.js +18 -7
- package/engines/dataset/DatasetManager.js +58 -12
- package/engines/dataset/DatasetRecord.js +17 -4
- package/engines/dataset/ParallelDataset.js +29 -7
- package/engines/execution/ExecutionEnvironment.js +13 -4
- package/engines/execution/ExecutionPlanner.js +2 -1
- package/engines/file/FileCompiler.js +2 -1
- package/engines/file/FileExporter.js +12 -3
- package/engines/parsing/ParseManager.js +7 -2
- package/engines/producer/ProducerEngine.js +4 -2
- package/engines/transform/JoinEngine.js +10 -6
- package/engines/transform/TransformationEngine.js +35 -3
- package/engines/transform/TypeCaster.js +20 -9
- package/engines/usage/UsageDataManager.js +110 -0
- package/engines/validation/Validator.js +0 -3
- package/package.json +3 -1
- package/workers/FilterWorker.js +3 -3
- package/workers/ProjectionWorker.js +3 -3
- package/workers/TransformWorker.js +3 -3
package/Constants.js
CHANGED

package/definitions/json_schemas/consumer-schema.json
CHANGED

@@ -223,6 +223,10 @@
       "type": "string",
       "description": "The name of the source where the consumer will export its data when deployed/run"
     },
+    "exportName": {
+      "type": "string",
+      "description": "If the format is a file, forces the same name in the export file (extension is auto-added)"
+    },
     "trigger": {
       "type": "object",
       "description": "Triggers to perform the export (not just the usual 'Deploy')",

@@ -463,7 +467,11 @@
     "cast": {
       "type": "string",
       "description": "Cast the value to a specific type",
-      "enum": ["string", "number", "
+      "enum": ["string", "number", "datetime", "boolean"]
+    },
+    "format": {
+      "type": "string",
+      "description": "Optional format for date parsing or string formatting (e.g. YYYY-MM-DD, DD/MM/YY)"
     }
   },
   "required": ["cast"],
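Taken together, the two consumer-schema additions let an output pin its exported file name and let a field carry a parse/format hint next to its cast. A hedged sketch of fragments that would satisfy the updated schema (all values are hypothetical; the exact nesting of the cast block inside the consumer definition is an assumption based on the fragment above):

```js
// Hedged example of the new consumer-schema fields (values are hypothetical).
const output = {
    format: 'CSV',
    exportDestination: 'reports-source',   // must match a source name
    exportName: 'daily_orders'             // new: file is written as daily_orders.csv (extension auto-added)
};

// New optional "format" next to "cast"; where this object sits in the consumer
// definition is assumed, only the two properties come from the schema diff.
const castedField = {
    cast: 'datetime',                      // enum is now: string | number | datetime | boolean
    format: 'DD/MM/YYYY'                   // parse/format hint for dates or strings
};
```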
package/definitions/json_schemas/source-schema.json
CHANGED

@@ -23,7 +23,8 @@
         "aws-dynamodb",
         "aws-s3",
         "postgres",
-        "local"
+        "local",
+        "delta-share"
       ],
       "description": "The type of data engine"
     },

@@ -66,6 +67,10 @@
       "type": "string",
       "description": "Database schema name"
     },
+    "table": {
+      "type": "string",
+      "description": "Table name (used by some engines like delta-share)"
+    },
     "port": {
       "type": "string",
       "description": "Port number for the connection"

@@ -101,6 +106,14 @@
     "path": {
       "type": "string",
       "description": "The folder path"
+    },
+    "share": {
+      "type": "string",
+      "description": "Delta Sharing share name"
+    },
+    "bearerToken": {
+      "type": "string",
+      "description": "Delta Sharing bearer token used for authentication"
     }
   },
   "required": ["method"]
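A hedged sketch of a delta-share source definition that lines up with these schema additions. The authentication keys (host, share, schema, table, bearerToken) are exactly what `DeltaShareSourceDriver.init()` reads further down in this diff; the top-level layout and the name of the engine-type property are assumptions:

```js
// Hedged sketch; only the authentication keys are grounded in the driver code.
const source = {
    name: 'orders-delta',                                       // hypothetical
    engine: 'delta-share',                                      // property name assumed
    authentication: {
        host: 'https://sharing.example.com/delta-sharing',      // Delta Sharing server URL
        share: 'analytics_share',
        schema: 'sales',
        table: 'orders',
        bearerToken: process.env.DELTA_SHARE_TOKEN              // sessionToken or password also accepted
    }
};
```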
package/documentation/README.md
CHANGED

@@ -106,6 +106,7 @@ Consumers transform and combine data from producers for specific use cases.
 | `outputs[].accellerated` | Whether to materialize for performance | `true`, `false` |
 | `outputs[].direct` | Whether to query directly without creating views | `true`, `false` |
 | `outputs[].exportDestination` | Where to export data | Must match a source `name` |
+| `outputs[].exportName` | Fixed file name (without extension) for file exports | String |
 | `outputs[].trigger.type` | How to trigger exports | `CRON`, `API` |
 | `outputs[].trigger.value` | Trigger expression | CRON expression (e.g., `0 0 * * *`) or endpoint path |
 | `metadata` | Custom tags | Object with string keys and values |
package/documentation/default_resources/consumer.json
CHANGED

@@ -11,8 +11,8 @@
     "joins": [
       {
         "otherName": "<primary producer name>",
-        "relationship": "
-        "sql": "<
+        "relationship": "one-to-many",
+        "sql": "${P.id} = ${<primary producer name>.fk_id}"
       }
     ]
   }

@@ -29,17 +29,17 @@
     }
   ],
   "outputs": [
-    { "format": "
+    { "format": "API" },
     {
-      "format": "
+      "format": "JSON",
       "exportDestination": "<export destination>"
     },
     {
-      "format": "
+      "format": "CSV",
       "exportDestination": "<export destination>",
       "trigger": {
-        "type": "
-        "value": "
+        "type": "CRON",
+        "value": "0 0 * * *"
      }
    }
  ],
package/drivers/DeltaShareDriver.js
ADDED

@@ -0,0 +1,178 @@
+"use strict";
+var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
+    function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
+    return new (P || (P = Promise))(function (resolve, reject) {
+        function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
+        function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
+        function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
+        step((generator = generator.apply(thisArg, _arguments || [])).next());
+    });
+};
+var __importDefault = (this && this.__importDefault) || function (mod) {
+    return (mod && mod.__esModule) ? mod : { "default": mod };
+};
+Object.defineProperty(exports, "__esModule", { value: true });
+const Affirm_1 = __importDefault(require("../core/Affirm"));
+const DriverHelper_1 = __importDefault(require("./DriverHelper"));
+/**
+ * Delta Share (Databricks Delta Sharing) Source Driver
+ */
+class DeltaShareSourceDriver {
+    constructor() {
+        this._query = '{prefix}/shares/{share}/schemas/{schema}/tables/{table}/query';
+        this._version = '{prefix}/shares/{share}/schemas/{schema}/tables/{table}/version';
+        this._tablesInShare = '{prefix}/shares/{share}/all-tables';
+        this._tablesInSchema = '{prefix}/shares/{share}/schemas/{schema}/tables';
+        this._schemasInShare = '{prefix}/shares/{share}/schemas';
+        this._shares = '{prefix}/shares';
+        this.init = (source) => __awaiter(this, void 0, void 0, function* () {
+            (0, Affirm_1.default)(source, 'Invalid source');
+            // Expected authentication shape for delta-share
+            const { authentication } = source;
+            (0, Affirm_1.default)(authentication, 'Invalid authentication for delta-share source');
+            this._shareUrl = authentication.host;
+            this._bearerToken = authentication.bearerToken || authentication.sessionToken || authentication.password;
+            this._share = authentication.share;
+            this._schema = authentication.schema;
+            this._table = authentication.table;
+            (0, Affirm_1.default)(this._shareUrl, 'Missing delta-share host (share server URL) in source.authentication.host');
+            (0, Affirm_1.default)(this._bearerToken, 'Missing delta-share bearer token in source.authentication.sessionToken (or password)');
+            (0, Affirm_1.default)(this._share, 'Missing delta-share "share" (use authentication.share or bucket)');
+            (0, Affirm_1.default)(this._schema, 'Missing delta-share schema in source.authentication.schema');
+            (0, Affirm_1.default)(this._table, 'Missing delta-share table in source.authentication.table (or database)');
+            this._source = source;
+            return this;
+        });
+        // Delta Sharing is not a SQL engine; expose explicit error
+        this.execute = (__sql) => __awaiter(this, void 0, void 0, function* () {
+            void __sql;
+            throw new Error('DeltaShareSourceDriver.execute is not supported: Delta Sharing is not a SQL engine');
+        });
+        this.query = (__sql, __values) => __awaiter(this, void 0, void 0, function* () {
+            void __sql;
+            void __values;
+            throw new Error('DeltaShareSourceDriver.query is not supported: Delta Sharing is not a SQL engine');
+        });
+        this.readAll = (request) => __awaiter(this, void 0, void 0, function* () {
+            var _a, _b, _c;
+            (0, Affirm_1.default)(request, `Invalid download request`);
+            (0, Affirm_1.default)(!request.fileKey.includes('%'), `On a delta-share the file key can not include "%"`);
+            const deltaFiles = yield this._getAllFilesInTables(this._table);
+            const { asyncBufferFromUrl, parquetReadObjects } = yield import('hyparquet');
+            const lines = [];
+            for (const deltaFile of deltaFiles) {
+                const byteLength = (_b = (_a = deltaFile.file.deltaSingleAction.add) === null || _a === void 0 ? void 0 : _a.size) !== null && _b !== void 0 ? _b : (_c = deltaFile.file.deltaSingleAction.remove) === null || _c === void 0 ? void 0 : _c.size;
+                const file = yield asyncBufferFromUrl({ url: deltaFile.file.url, byteLength });
+                const parquetRecords = yield parquetReadObjects({ file: file });
+                lines.push(...parquetRecords.map(x => JSON.stringify(x)));
+            }
+            return lines;
+        });
+        this.readLinesInRange = (request) => __awaiter(this, void 0, void 0, function* () {
+            var _a, _b, _c;
+            (0, Affirm_1.default)(request, 'Invalid read request');
+            (0, Affirm_1.default)(request.options, 'Invalid read options');
+            (0, Affirm_1.default)(request.options.lineFrom !== undefined && request.options.lineTo !== undefined, 'Missing read range');
+            const deltaFiles = yield this._getAllFilesInTables(this._table);
+            const { options: { lineFrom, lineTo } } = request;
+            const { asyncBufferFromUrl, parquetReadObjects } = yield import('hyparquet');
+            const lines = [];
+            let index = 0;
+            for (const deltaFile of deltaFiles) {
+                const byteLength = (_b = (_a = deltaFile.file.deltaSingleAction.add) === null || _a === void 0 ? void 0 : _a.size) !== null && _b !== void 0 ? _b : (_c = deltaFile.file.deltaSingleAction.remove) === null || _c === void 0 ? void 0 : _c.size;
+                const file = yield asyncBufferFromUrl({ url: deltaFile.file.url, byteLength });
+                const parquetRecords = yield parquetReadObjects({ file: file });
+                for (const record of parquetRecords) {
+                    if (index >= lineFrom && index < lineTo)
+                        lines.push(JSON.stringify(record));
+                    index++;
+                    if (index >= lineTo)
+                        break;
+                }
+            }
+            return lines;
+        });
+        this.download = (dataset) => __awaiter(this, void 0, void 0, function* () {
+            var _a, _b, _c;
+            (0, Affirm_1.default)(dataset, 'Invalid dataset');
+            const deltaFiles = yield this._getAllFilesInTables(this._table);
+            const { asyncBufferFromUrl, parquetReadObjects } = yield import('hyparquet');
+            // For each file, download it with the hyparquet package, read lines, then save locally to create the dataset
+            let index = 0;
+            let totalLineCount = 0;
+            for (const deltaFile of deltaFiles) {
+                const byteLength = (_b = (_a = deltaFile.file.deltaSingleAction.add) === null || _a === void 0 ? void 0 : _a.size) !== null && _b !== void 0 ? _b : (_c = deltaFile.file.deltaSingleAction.remove) === null || _c === void 0 ? void 0 : _c.size;
+                const file = yield asyncBufferFromUrl({ url: deltaFile.file.url, byteLength });
+                const parquetRecords = yield parquetReadObjects({ file: file });
+                if (index === 0 && parquetRecords.length > 0) {
+                    // I intentionally keep the first record as a JSON, so it can be used to extract the dimensions
+                    dataset.setFirstLine(JSON.stringify(parquetRecords[0]));
+                }
+                totalLineCount += yield DriverHelper_1.default.appendObjectsToUnifiedFile({
+                    append: index > 0,
+                    delimiter: dataset.getDelimiter(),
+                    destinationPath: dataset.getPath(),
+                    objects: parquetRecords
+                });
+                index++;
+            }
+            dataset.setCount(totalLineCount);
+            return dataset;
+        });
+        this.exist = (__producer) => __awaiter(this, void 0, void 0, function* () {
+            void __producer;
+            try {
+                yield this._getAllFilesInTables(this._table);
+                // If it doesn't exist, then it fails in the above function
+                return true;
+            }
+            catch (_a) {
+                return false;
+            }
+        });
+        this._getVersion = (table) => __awaiter(this, void 0, void 0, function* () {
+            const url = this._version
+                .replace('{prefix}', this._shareUrl)
+                .replace('{share}', this._share)
+                .replace('{schema}', this._schema)
+                .replace('{table}', table);
+            const res = yield fetch(url, {
+                method: 'GET',
+                headers: {
+                    Authorization: `Bearer ${this._bearerToken}`
+                }
+            });
+            (0, Affirm_1.default)(res.ok, `Error fetching version from the delta share: ${res.status} ${res.statusText}`);
+            const version = res.headers['delta-table-version'];
+            return version;
+        });
+        this._getAllFilesInTables = (table) => __awaiter(this, void 0, void 0, function* () {
+            const url = this._query
+                .replace('{prefix}', this._shareUrl)
+                .replace('{share}', this._share)
+                .replace('{schema}', this._schema)
+                .replace('{table}', table);
+            const body = {
+                version: yield this._getVersion(table)
+            };
+            const res = yield fetch(url, {
+                method: 'POST',
+                headers: {
+                    'Authorization': `Bearer ${this._bearerToken}`,
+                    'delta-sharing-capabilities': 'responseformat=delta;readerfeatures=deletionvectors'
+                },
+                body: JSON.stringify(body)
+            });
+            const rawText = yield res.text();
+            (0, Affirm_1.default)(res.ok, `Error fetching data from the delta share: ${res.status} ${res.statusText}; Message: ${rawText}`);
+            // By the protocol: the first is the profile, the second is the metadata, I'm interested from the third onwards
+            const deltaLines = rawText
+                .split('\n')
+                .filter(x => x.length > 0)
+                .slice(2)
+                .map(x => JSON.parse(x));
+            return deltaLines;
+        });
+    }
+}
+exports.default = DeltaShareSourceDriver;
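A rough usage sketch of the new driver, assuming direct access to the class (in normal operation DriverFactory instantiates it from the source definition, see the next diff). `readAll` returns one JSON string per parquet record; `execute` and `query` deliberately throw, since Delta Sharing exposes parquet files rather than a SQL endpoint. The require path and all field values here are illustrative; the authentication keys match what `init()` validates:

```js
// Hedged sketch, not package documentation.
const DeltaShareDriver = require('./drivers/DeltaShareDriver').default;

async function main() {
    const driver = new DeltaShareDriver();
    await driver.init({
        authentication: {
            host: 'https://sharing.example.com/delta-sharing',   // hypothetical share server
            share: 'analytics_share',
            schema: 'sales',
            table: 'orders',
            bearerToken: process.env.DELTA_SHARE_TOKEN
        }
    });
    // Internally: POST .../shares/{share}/schemas/{schema}/tables/{table}/query,
    // skip the protocol's profile + metadata lines, then read each data file with hyparquet.
    const lines = await driver.readAll({ fileKey: 'orders' });   // fileKey must not contain '%'
    console.log(`${lines.length} records, first:`, lines[0]);
}

main().catch(console.error);
```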
package/drivers/DriverFactory.js
CHANGED

@@ -15,6 +15,7 @@ Object.defineProperty(exports, "__esModule", { value: true });
 const LocalDriver_1 = require("./LocalDriver");
 const RedshiftDriver_1 = __importDefault(require("./RedshiftDriver"));
 const S3Driver_1 = require("./S3Driver");
+const DeltaShareDriver_1 = __importDefault(require("./DeltaShareDriver"));
 class DriverFactoryClass {
     constructor() {
         this.instantiateSource = (source) => __awaiter(this, void 0, void 0, function* () {

@@ -30,6 +31,11 @@ class DriverFactoryClass {
                 yield driver.init(source);
                 return driver;
             }
+            case 'delta-share': {
+                const driver = new DeltaShareDriver_1.default();
+                yield driver.init(source);
+                return driver;
+            }
             case 'local': {
                 const driver = new LocalDriver_1.LocalSourceDriver();
                 yield driver.init(source);
package/drivers/DriverHelper.js
CHANGED

@@ -109,7 +109,7 @@ const DriverHelper = {
             return keys.map(k => parsed[k]).join(delimiter);
         }
         catch (error) {
-            Logger_1.default.log(`Failed parsing
+            Logger_1.default.log(`Failed parsing in JSON line -> file: ${fileKey}; index: ${globalIndex}; line: ${line}; err: ${error === null || error === void 0 ? void 0 : error.name}`);
             throw error;
         }
     }

@@ -122,6 +122,21 @@ const DriverHelper = {
         yield (0, promises_1.pipeline)(stream, headerValidationTransform, writeStream);
         return lineCount;
     }),
+    appendObjectsToUnifiedFile: (options) => __awaiter(void 0, void 0, void 0, function* () {
+        (0, Affirm_1.default)(options, 'Invalid options');
+        const { append, destinationPath, objects, delimiter } = options;
+        const writeOptions = append ? { flags: 'a' } : {};
+        const writeStream = (0, fs_1.createWriteStream)(destinationPath, writeOptions);
+        let lineCount = 0;
+        const keys = Object.keys(objects[0]);
+        for (const obj of objects) {
+            const serialized = keys.map(k => obj[k]).join(delimiter) + '\n';
+            writeStream.write(serialized);
+            lineCount++;
+        }
+        writeStream.close();
+        return lineCount;
+    }),
     quickReadFile: (filePath, lineCount) => __awaiter(void 0, void 0, void 0, function* () {
         var _a, e_1, _b, _c;
         const fileStream = (0, fs_1.createReadStream)(filePath);
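`appendObjectsToUnifiedFile` is what the Delta Share driver uses to flush each parquet batch into the dataset's single delimited file. A small hedged sketch of calling it directly (the require path and destination path are hypothetical); note that column order comes from `Object.keys()` of the first object in each batch, and no header row is written:

```js
// Hedged sketch of DriverHelper.appendObjectsToUnifiedFile usage.
const DriverHelper = require('./drivers/DriverHelper').default;

async function writeBatches() {
    const first = [{ id: 1, name: 'ada' }, { id: 2, name: 'bob' }];
    const second = [{ id: 3, name: 'cid' }];

    // First batch creates/overwrites the file...
    await DriverHelper.appendObjectsToUnifiedFile({
        append: false,
        destinationPath: '/tmp/unified.csv',   // hypothetical path
        delimiter: ',',
        objects: first
    });
    // ...subsequent batches append to it.
    await DriverHelper.appendObjectsToUnifiedFile({
        append: true,
        destinationPath: '/tmp/unified.csv',
        delimiter: ',',
        objects: second
    });
    // /tmp/unified.csv now holds three lines: "1,ada", "2,bob", "3,cid"
}
```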
package/drivers/LocalDriver.js
CHANGED

@@ -143,6 +143,7 @@ class LocalSourceDriver {
         if (fileKey.includes('%')) {
             const allFileKeys = this.listFiles(fileKey);
             Logger_1.default.log(`Matched ${allFileKeys.length} files, copying locally and creating unified dataset.`);
+            Affirm_1.default.hasItems(allFileKeys, `The file key "${fileKey}" doesn't have any matches in path "${this._path}".`);
             // Get header line from the first file
             const headerLine = (yield DriverHelper_1.default.quickReadFile(path_1.default.join(this._path, allFileKeys[0]), 1))[0];
             dataset.setFirstLine(headerLine);
package/drivers/S3Driver.js
CHANGED

@@ -216,6 +216,7 @@ class S3SourceDriver {
         if (fileKey.includes('%')) {
             const allFileKeys = yield this.listFiles(fileKey);
             Logger_1.default.log(`Matched ${allFileKeys.length} files, copying locally and creating unified dataset.`);
+            Affirm_1.default.hasItems(allFileKeys, `The file key "${fileKey}" doesn't have any matches in bucket "${this._bucketName}".`);
             // Get header line from the first file
             const firstFileCommand = new client_s3_1.GetObjectCommand({
                 Bucket: this._bucketName,
package/engines/ai/DeveloperEngine.js
CHANGED

@@ -16,6 +16,9 @@ const Affirm_1 = __importDefault(require("../../core/Affirm"));
 const ProducerEngine_1 = __importDefault(require("../producer/ProducerEngine"));
 const path_1 = __importDefault(require("path"));
 const promises_1 = __importDefault(require("fs/promises"));
+const dayjs_1 = __importDefault(require("dayjs"));
+const customParseFormat_1 = __importDefault(require("dayjs/plugin/customParseFormat"));
+dayjs_1.default.extend(customParseFormat_1.default);
 class DeveloperEngineClass {
     constructor() {
         this.discover = (producer) => __awaiter(this, void 0, void 0, function* () {

@@ -60,13 +63,99 @@
                 return 'string';
             }
         };
+        // Infer the most likely type from a single JS value
+        // Returns one of: 'number' | 'boolean' | 'date' | 'datetime' | 'string' | 'array' | 'object' | 'null'
+        this.inferType = (value) => {
+            if (value === null || value === undefined)
+                return 'string';
+            // Arrays
+            if (Array.isArray(value))
+                return 'array';
+            // Booleans (including common string representations)
+            if (typeof value === 'boolean')
+                return 'boolean';
+            if (typeof value === 'string') {
+                const trimmed = value.trim();
+                const lower = trimmed.toLowerCase();
+                if (lower === 'true' || lower === 'false')
+                    return 'boolean';
+                // Numbers (numeric strings)
+                const numericRegex = /^-?\d+(?:\.\d+)?$/;
+                if (numericRegex.test(trimmed))
+                    return 'number';
+                // Timestamps (10 or 13 digits)
+                const tsRegex = /^-?\d{10}(?:\d{3})?$/;
+                if (tsRegex.test(trimmed)) {
+                    const n = Number(trimmed.length === 10 ? `${trimmed}000` : trimmed);
+                    const d = new Date(n);
+                    if (!isNaN(d.getTime()))
+                        return 'datetime';
+                }
+                // Dates with common formats
+                const dateFormats = [
+                    'YYYY-MM-DD',
+                    'YYYY/MM/DD',
+                    'DD/MM/YYYY',
+                    'MM/DD/YYYY',
+                    'YYYYMMDD',
+                    'DD-MMM-YYYY',
+                    'YYYY-MM-DD HH:mm',
+                    'YYYY-MM-DD HH:mm:ss',
+                    'YYYY-MM-DDTHH:mm',
+                    'YYYY-MM-DDTHH:mmZ',
+                    'YYYY-MM-DDTHH:mm:ss',
+                    'YYYY-MM-DDTHH:mm:ssZ',
+                    'YYYY-MM-DDTHH:mm:ss.SSSZ'
+                ];
+                for (const fmt of dateFormats) {
+                    const d = (0, dayjs_1.default)(trimmed, fmt, true);
+                    if (d.isValid()) {
+                        // If time components likely present, classify as datetime
+                        if (/T|\d+:\d+/.test(trimmed))
+                            return 'datetime';
+                        return 'date';
+                    }
+                }
+                // ISO 8601 without specifying format
+                const iso = (0, dayjs_1.default)(trimmed);
+                if (iso.isValid() && /\d{4}-\d{2}-\d{2}/.test(trimmed)) {
+                    if (/T|\d+:\d+/.test(trimmed))
+                        return 'datetime';
+                    return 'date';
+                }
+                return 'string';
+            }
+            if (typeof value === 'number')
+                return 'number';
+            if (typeof value === 'object') {
+                // Date instance
+                if (value instanceof Date && !isNaN(value.getTime()))
+                    return 'datetime';
+                return 'object';
+            }
+            // Fallback for bigint, symbol, function -> string
+            return 'string';
+        };
+        this.inferDimensionType = (value) => {
+            const type = this.inferType(value);
+            switch (type) {
+                case 'array':
+                case 'object': return 'string';
+                case 'boolean': return 'boolean';
+                case 'date':
+                case 'datetime': return 'datetime';
+                case 'number': return 'number';
+                case 'string': return 'string';
+                default: return 'string';
+            }
+        };
         this.extractFieldTypes = (records) => {
             if (!records || records.length === 0)
                 return [];
             const sample = records[0];
             return Object.entries(sample._value).map(([key, value]) => ({
                 name: key,
-                type:
+                type: this.inferType(value)
             }));
         };
         this.extractFieldClassification = (field) => {
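The inference order above is: boolean-ish strings, plain numeric strings, 10/13-digit epoch timestamps, a list of strict dayjs formats, then a loose ISO check. A few hedged examples of what that logic should return (`engine` stands in for an instance of `DeveloperEngineClass`; how the module exposes it is not shown in this diff):

```js
// Illustrative expectations derived from the inferType logic above.
engine.inferType('42.5');                 // 'number'   (numeric string)
engine.inferType('false');                // 'boolean'
engine.inferType('1700000000');           // 'datetime' (10-digit epoch seconds)
engine.inferType('2024-03-01');           // 'date'     (strict YYYY-MM-DD, no time part)
engine.inferType('2024-03-01T10:30:00Z'); // 'datetime'
engine.inferType(['a', 'b']);             // 'array'
engine.inferType({ a: 1 });               // 'object'
engine.inferType(null);                   // 'string'   (despite 'null' in the doc comment)

engine.inferDimensionType('2024-03-01');  // 'datetime' (dates collapse to datetime for dimensions)
engine.inferDimensionType({ a: 1 });      // 'string'
```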
package/engines/consumer/ConsumerEngine.js
CHANGED

@@ -132,7 +132,7 @@ class ConsumerEngineClass {
         (0, Affirm_1.default)(options, `Invalid execute consume options`);
         const { usageId } = UsageManager_1.default.startUsage(consumer, user);
         try {
-            const execution = new ExecutionEnvironment_1.default(consumer);
+            const execution = new ExecutionEnvironment_1.default(consumer, usageId);
             const result = yield execution.run(options);
             UsageManager_1.default.endUsage(usageId, result._stats.size);
             return result;
package/engines/consumer/PostProcessor.js
CHANGED

@@ -62,7 +62,7 @@ class PostProcessorClass {
             }
             return record;
         }, options);
-        newDataset.
+        newDataset.setDimensions(updatedDimensions);
         return newDataset;
     });
     /**

@@ -174,12 +174,16 @@
             normalizedRecord[fieldName] = (_a = splitRecord[fieldName]) !== null && _a !== void 0 ? _a : '';
         }
         // Create dimensions based on the expected field names
-        const newDimensions = expectedFieldNames.map((key, index) =>
-
-
-
-
-
+        const newDimensions = expectedFieldNames.map((key, index) => {
+            var _a, _b, _c;
+            return ({
+                name: key,
+                key: key,
+                index: index,
+                hidden: null,
+                type: (_c = (_b = (_a = columns[index]) === null || _a === void 0 ? void 0 : _a.dimension) === null || _b === void 0 ? void 0 : _b.type) !== null && _c !== void 0 ? _c : 'string'
+            });
+        });
         // Create the row string
         const values = newDimensions.map(dim => {
             const value = normalizedRecord[dim.name];

@@ -196,14 +200,17 @@
         // Update the dataset dimensions to match the unpacked structure
         // TODO: 99% certain this will cause a bug
         if (columns.length > 0) {
-            const newDimensions = columns.map((col, index) =>
-
-
-
-
-
-
+            const newDimensions = columns.map((col, index) => {
+                var _a;
+                return ({
+                    name: col.nameInProducer,
+                    key: col.nameInProducer,
+                    index: index,
+                    hidden: null,
+                    type: (_a = col.dimension) === null || _a === void 0 ? void 0 : _a.type
+                });
+            });
+            resDataset.setDimensions(newDimensions);
         }
         return resDataset;
     });

@@ -214,17 +221,19 @@
             return defaultValue;
         const fieldKey = alias !== null && alias !== void 0 ? alias : key;
         const fieldValue = record.getValue(fieldKey);
+        const fieldType = (_b = (_a = field.dimension) === null || _a === void 0 ? void 0 : _a.type) !== null && _b !== void 0 ? _b : 'string';
         if (Algo_1.default.hasVal(fieldValue) && !isNaN(fieldValue)) {
-            const fieldType = (_b = (_a = field.dimension) === null || _a === void 0 ? void 0 : _a.type) !== null && _b !== void 0 ? _b : 'string';
             if (fieldType === 'number' && typeof fieldValue === 'string' && fieldValue.length === 0)
                 return (_c = field.cField.default) !== null && _c !== void 0 ? _c : fieldValue;
             else
                 return fieldValue;
         }
-        else if ((!Algo_1.default.hasVal(fieldValue) || isNaN(fieldValue)) && Algo_1.default.hasVal(field.cField.default))
+        else if ((!Algo_1.default.hasVal(fieldValue) || (isNaN(fieldValue) && fieldType === 'number')) && Algo_1.default.hasVal(field.cField.default)) {
             return field.cField.default;
-
+        }
+        else {
             return fieldValue;
+        }
     };
     this.distinct = (dataset) => __awaiter(this, void 0, void 0, function* () {
         (0, Affirm_1.default)(dataset, 'Invalid dataset');