@forzalabs/remora 0.1.4-nasco.3 → 0.1.6-nasco.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/Constants.js +1 -1
- package/definitions/json_schemas/consumer-schema.json +6 -2
- package/definitions/json_schemas/producer-schema.json +2 -1
- package/definitions/json_schemas/source-schema.json +14 -1
- package/documentation/README.md +1 -0
- package/documentation/default_resources/consumer.json +7 -7
- package/drivers/DeltaShareDriver.js +178 -0
- package/drivers/DriverFactory.js +6 -0
- package/drivers/DriverHelper.js +15 -0
- package/engines/ai/DeveloperEngine.js +90 -1
- package/engines/consumer/ConsumerEngine.js +1 -1
- package/engines/consumer/PostProcessor.js +22 -15
- package/engines/dataset/Dataset.js +22 -8
- package/engines/dataset/DatasetManager.js +58 -12
- package/engines/dataset/DatasetRecord.js +17 -4
- package/engines/dataset/ParallelDataset.js +16 -6
- package/engines/execution/ExecutionEnvironment.js +13 -4
- package/engines/execution/ExecutionPlanner.js +2 -1
- package/engines/file/FileCompiler.js +2 -1
- package/engines/file/FileExporter.js +12 -3
- package/engines/parsing/ParseManager.js +7 -2
- package/engines/producer/ProducerEngine.js +4 -2
- package/engines/transform/JoinEngine.js +10 -6
- package/engines/transform/TransformationEngine.js +31 -2
- package/engines/transform/TypeCaster.js +12 -4
- package/engines/usage/UsageDataManager.js +110 -0
- package/package.json +2 -1
- package/workers/FilterWorker.js +3 -3
- package/workers/ProjectionWorker.js +3 -3
- package/workers/TransformWorker.js +3 -3
package/Constants.js
CHANGED

package/definitions/json_schemas/consumer-schema.json
CHANGED

@@ -223,6 +223,10 @@
       "type": "string",
       "description": "The name of the source where the consumer will export its data when deployed/run"
     },
+    "exportName": {
+      "type": "string",
+      "description": "If the format is a file, forces the same name in the export file (extension is auto-added)"
+    },
     "trigger": {
       "type": "object",
       "description": "Triggers to perform the export (not just the usual 'Deploy')",

@@ -463,11 +467,11 @@
     "cast": {
       "type": "string",
       "description": "Cast the value to a specific type",
-      "enum": ["string", "number", "
+      "enum": ["string", "number", "datetime", "boolean"]
     },
     "format": {
       "type": "string",
-      "description": "Optional format for date parsing or string formatting (
+      "description": "Optional format for date parsing or string formatting (e.g. YYYY-MM-DD, DD/MM/YY)"
     }
   },
   "required": ["cast"],
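For orientation, the new consumer-schema fields combine with the existing output fields roughly as in the sketch below. This is not taken from the package: the values are placeholders, and the exact nesting of the cast/format object inside a consumer definition is not shown in this diff.

    {
      "format": "CSV",
      "exportDestination": "<export destination>",
      "exportName": "daily_orders",
      "trigger": { "type": "CRON", "value": "0 0 * * *" }
    }

and, for a single column transformation object as described by the schema fragment above:

    { "cast": "datetime", "format": "DD/MM/YY" }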
package/definitions/json_schemas/source-schema.json
CHANGED

@@ -23,7 +23,8 @@
       "aws-dynamodb",
       "aws-s3",
       "postgres",
-      "local"
+      "local",
+      "delta-share"
     ],
     "description": "The type of data engine"
   },

@@ -66,6 +67,10 @@
     "type": "string",
     "description": "Database schema name"
   },
+  "table": {
+    "type": "string",
+    "description": "Table name (used by some engines like delta-share)"
+  },
   "port": {
     "type": "string",
     "description": "Port number for the connection"

@@ -101,6 +106,14 @@
   "path": {
     "type": "string",
     "description": "The folder path"
+  },
+  "share": {
+    "type": "string",
+    "description": "Delta Sharing share name"
+  },
+  "bearerToken": {
+    "type": "string",
+    "description": "Delta Sharing bearer token used for authentication"
   }
 },
 "required": ["method"]
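Putting the schema additions together, a delta-share source definition would plausibly look like the sketch below. The authentication keys (host, share, schema, table, bearerToken) are the ones the new DeltaShareDriver reads; the top-level key names and all values are placeholders, since the full source layout and the property that holds the engine enum are not shown in this diff.

    {
      "name": "sales_delta_share",
      "engine": "delta-share",
      "authentication": {
        "host": "https://sharing.example.com/delta-sharing",
        "share": "sales_share",
        "schema": "public",
        "table": "orders",
        "bearerToken": "<token>"
      }
    }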
package/documentation/README.md
CHANGED

@@ -106,6 +106,7 @@ Consumers transform and combine data from producers for specific use cases.
 | `outputs[].accellerated` | Whether to materialize for performance | `true`, `false` |
 | `outputs[].direct` | Whether to query directly without creating views | `true`, `false` |
 | `outputs[].exportDestination` | Where to export data | Must match a source `name` |
+| `outputs[].exportName` | Fixed file name (without extension) for file exports | String |
 | `outputs[].trigger.type` | How to trigger exports | `CRON`, `API` |
 | `outputs[].trigger.value` | Trigger expression | CRON expression (e.g., `0 0 * * *`) or endpoint path |
 | `metadata` | Custom tags | Object with string keys and values |
package/documentation/default_resources/consumer.json
CHANGED

@@ -11,8 +11,8 @@
   "joins": [
     {
       "otherName": "<primary producer name>",
-      "relationship": "
-      "sql": "<
+      "relationship": "one-to-many",
+      "sql": "${P.id} = ${<primary producer name>.fk_id}"
     }
   ]
 }

@@ -29,17 +29,17 @@
     }
   ],
   "outputs": [
-    { "format": "
+    { "format": "API" },
     {
-      "format": "
+      "format": "JSON",
       "exportDestination": "<export destination>"
     },
     {
-      "format": "
+      "format": "CSV",
       "exportDestination": "<export destination>",
       "trigger": {
-        "type": "
-        "value": "
+        "type": "CRON",
+        "value": "0 0 * * *"
       }
     }
   ],
package/drivers/DeltaShareDriver.js
ADDED

@@ -0,0 +1,178 @@
+"use strict";
+var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
+    function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
+    return new (P || (P = Promise))(function (resolve, reject) {
+        function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
+        function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
+        function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
+        step((generator = generator.apply(thisArg, _arguments || [])).next());
+    });
+};
+var __importDefault = (this && this.__importDefault) || function (mod) {
+    return (mod && mod.__esModule) ? mod : { "default": mod };
+};
+Object.defineProperty(exports, "__esModule", { value: true });
+const Affirm_1 = __importDefault(require("../core/Affirm"));
+const DriverHelper_1 = __importDefault(require("./DriverHelper"));
+/**
+ * Delta Share (Databricks Delta Sharing) Source Driver
+ */
+class DeltaShareSourceDriver {
+    constructor() {
+        this._query = '{prefix}/shares/{share}/schemas/{schema}/tables/{table}/query';
+        this._version = '{prefix}/shares/{share}/schemas/{schema}/tables/{table}/version';
+        this._tablesInShare = '{prefix}/shares/{share}/all-tables';
+        this._tablesInSchema = '{prefix}/shares/{share}/schemas/{schema}/tables';
+        this._schemasInShare = '{prefix}/shares/{share}/schemas';
+        this._shares = '{prefix}/shares';
+        this.init = (source) => __awaiter(this, void 0, void 0, function* () {
+            (0, Affirm_1.default)(source, 'Invalid source');
+            // Expected authentication shape for delta-share
+            const { authentication } = source;
+            (0, Affirm_1.default)(authentication, 'Invalid authentication for delta-share source');
+            this._shareUrl = authentication.host;
+            this._bearerToken = authentication.bearerToken || authentication.sessionToken || authentication.password;
+            this._share = authentication.share;
+            this._schema = authentication.schema;
+            this._table = authentication.table;
+            (0, Affirm_1.default)(this._shareUrl, 'Missing delta-share host (share server URL) in source.authentication.host');
+            (0, Affirm_1.default)(this._bearerToken, 'Missing delta-share bearer token in source.authentication.sessionToken (or password)');
+            (0, Affirm_1.default)(this._share, 'Missing delta-share "share" (use authentication.share or bucket)');
+            (0, Affirm_1.default)(this._schema, 'Missing delta-share schema in source.authentication.schema');
+            (0, Affirm_1.default)(this._table, 'Missing delta-share table in source.authentication.table (or database)');
+            this._source = source;
+            return this;
+        });
+        // Delta Sharing is not a SQL engine; expose explicit error
+        this.execute = (__sql) => __awaiter(this, void 0, void 0, function* () {
+            void __sql;
+            throw new Error('DeltaShareSourceDriver.execute is not supported: Delta Sharing is not a SQL engine');
+        });
+        this.query = (__sql, __values) => __awaiter(this, void 0, void 0, function* () {
+            void __sql;
+            void __values;
+            throw new Error('DeltaShareSourceDriver.query is not supported: Delta Sharing is not a SQL engine');
+        });
+        this.readAll = (request) => __awaiter(this, void 0, void 0, function* () {
+            var _a, _b, _c;
+            (0, Affirm_1.default)(request, `Invalid download request`);
+            (0, Affirm_1.default)(!request.fileKey.includes('%'), `On a delta-share the file key can not include "%"`);
+            const deltaFiles = yield this._getAllFilesInTables(this._table);
+            const { asyncBufferFromUrl, parquetReadObjects } = yield import('hyparquet');
+            const lines = [];
+            for (const deltaFile of deltaFiles) {
+                const byteLength = (_b = (_a = deltaFile.file.deltaSingleAction.add) === null || _a === void 0 ? void 0 : _a.size) !== null && _b !== void 0 ? _b : (_c = deltaFile.file.deltaSingleAction.remove) === null || _c === void 0 ? void 0 : _c.size;
+                const file = yield asyncBufferFromUrl({ url: deltaFile.file.url, byteLength });
+                const parquetRecords = yield parquetReadObjects({ file: file });
+                lines.push(...parquetRecords.map(x => JSON.stringify(x)));
+            }
+            return lines;
+        });
+        this.readLinesInRange = (request) => __awaiter(this, void 0, void 0, function* () {
+            var _a, _b, _c;
+            (0, Affirm_1.default)(request, 'Invalid read request');
+            (0, Affirm_1.default)(request.options, 'Invalid read options');
+            (0, Affirm_1.default)(request.options.lineFrom !== undefined && request.options.lineTo !== undefined, 'Missing read range');
+            const deltaFiles = yield this._getAllFilesInTables(this._table);
+            const { options: { lineFrom, lineTo } } = request;
+            const { asyncBufferFromUrl, parquetReadObjects } = yield import('hyparquet');
+            const lines = [];
+            let index = 0;
+            for (const deltaFile of deltaFiles) {
+                const byteLength = (_b = (_a = deltaFile.file.deltaSingleAction.add) === null || _a === void 0 ? void 0 : _a.size) !== null && _b !== void 0 ? _b : (_c = deltaFile.file.deltaSingleAction.remove) === null || _c === void 0 ? void 0 : _c.size;
+                const file = yield asyncBufferFromUrl({ url: deltaFile.file.url, byteLength });
+                const parquetRecords = yield parquetReadObjects({ file: file });
+                for (const record of parquetRecords) {
+                    if (index >= lineFrom && index < lineTo)
+                        lines.push(JSON.stringify(record));
+                    index++;
+                    if (index >= lineTo)
+                        break;
+                }
+            }
+            return lines;
+        });
+        this.download = (dataset) => __awaiter(this, void 0, void 0, function* () {
+            var _a, _b, _c;
+            (0, Affirm_1.default)(dataset, 'Invalid dataset');
+            const deltaFiles = yield this._getAllFilesInTables(this._table);
+            const { asyncBufferFromUrl, parquetReadObjects } = yield import('hyparquet');
+            // For each file, download it with the hyparquet package, read lines, then save locally to create the dataset
+            let index = 0;
+            let totalLineCount = 0;
+            for (const deltaFile of deltaFiles) {
+                const byteLength = (_b = (_a = deltaFile.file.deltaSingleAction.add) === null || _a === void 0 ? void 0 : _a.size) !== null && _b !== void 0 ? _b : (_c = deltaFile.file.deltaSingleAction.remove) === null || _c === void 0 ? void 0 : _c.size;
+                const file = yield asyncBufferFromUrl({ url: deltaFile.file.url, byteLength });
+                const parquetRecords = yield parquetReadObjects({ file: file });
+                if (index === 0 && parquetRecords.length > 0) {
+                    // I intentionally keep the first record as a JSON, so it can be used to extract the dimensions
+                    dataset.setFirstLine(JSON.stringify(parquetRecords[0]));
+                }
+                totalLineCount += yield DriverHelper_1.default.appendObjectsToUnifiedFile({
+                    append: index > 0,
+                    delimiter: dataset.getDelimiter(),
+                    destinationPath: dataset.getPath(),
+                    objects: parquetRecords
+                });
+                index++;
+            }
+            dataset.setCount(totalLineCount);
+            return dataset;
+        });
+        this.exist = (__producer) => __awaiter(this, void 0, void 0, function* () {
+            void __producer;
+            try {
+                yield this._getAllFilesInTables(this._table);
+                // If it doesn't exist, then it fails in the above function
+                return true;
+            }
+            catch (_a) {
+                return false;
+            }
+        });
+        this._getVersion = (table) => __awaiter(this, void 0, void 0, function* () {
+            const url = this._version
+                .replace('{prefix}', this._shareUrl)
+                .replace('{share}', this._share)
+                .replace('{schema}', this._schema)
+                .replace('{table}', table);
+            const res = yield fetch(url, {
+                method: 'GET',
+                headers: {
+                    Authorization: `Bearer ${this._bearerToken}`
+                }
+            });
+            (0, Affirm_1.default)(res.ok, `Error fetching version from the delta share: ${res.status} ${res.statusText}`);
+            const version = res.headers['delta-table-version'];
+            return version;
+        });
+        this._getAllFilesInTables = (table) => __awaiter(this, void 0, void 0, function* () {
+            const url = this._query
+                .replace('{prefix}', this._shareUrl)
+                .replace('{share}', this._share)
+                .replace('{schema}', this._schema)
+                .replace('{table}', table);
+            const body = {
+                version: yield this._getVersion(table)
+            };
+            const res = yield fetch(url, {
+                method: 'POST',
+                headers: {
+                    'Authorization': `Bearer ${this._bearerToken}`,
+                    'delta-sharing-capabilities': 'responseformat=delta;readerfeatures=deletionvectors'
+                },
+                body: JSON.stringify(body)
+            });
+            const rawText = yield res.text();
+            (0, Affirm_1.default)(res.ok, `Error fetching data from the delta share: ${res.status} ${res.statusText}; Message: ${rawText}`);
+            // By the protocol: the first is the profile, the second is the metadata, I'm interested from the third onwards
+            const deltaLines = rawText
+                .split('\n')
+                .filter(x => x.length > 0)
+                .slice(2)
+                .map(x => JSON.parse(x));
+            return deltaLines;
+        });
+    }
+}
+exports.default = DeltaShareSourceDriver;
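For context on how _getAllFilesInTables is consumed: the driver POSTs to the table query endpoint with the bearer token and the delta-sharing-capabilities header, then treats the response body as newline-delimited JSON and skips the first two lines (protocol and metadata). The only fields the rest of the driver reads from each remaining line are sketched below; the values are placeholders and all other Delta Sharing protocol fields are elided.

    {
      "file": {
        "url": "https://storage.example.com/part-00000.snappy.parquet?sig=...",
        "deltaSingleAction": {
          "add": { "size": 1048576 }
        }
      }
    }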
package/drivers/DriverFactory.js
CHANGED

@@ -15,6 +15,7 @@ Object.defineProperty(exports, "__esModule", { value: true });
 const LocalDriver_1 = require("./LocalDriver");
 const RedshiftDriver_1 = __importDefault(require("./RedshiftDriver"));
 const S3Driver_1 = require("./S3Driver");
+const DeltaShareDriver_1 = __importDefault(require("./DeltaShareDriver"));
 class DriverFactoryClass {
     constructor() {
         this.instantiateSource = (source) => __awaiter(this, void 0, void 0, function* () {

@@ -30,6 +31,11 @@ class DriverFactoryClass {
                 yield driver.init(source);
                 return driver;
             }
+            case 'delta-share': {
+                const driver = new DeltaShareDriver_1.default();
+                yield driver.init(source);
+                return driver;
+            }
             case 'local': {
                 const driver = new LocalDriver_1.LocalSourceDriver();
                 yield driver.init(source);
package/drivers/DriverHelper.js
CHANGED

@@ -122,6 +122,21 @@ const DriverHelper = {
        yield (0, promises_1.pipeline)(stream, headerValidationTransform, writeStream);
        return lineCount;
    }),
+   appendObjectsToUnifiedFile: (options) => __awaiter(void 0, void 0, void 0, function* () {
+       (0, Affirm_1.default)(options, 'Invalid options');
+       const { append, destinationPath, objects, delimiter } = options;
+       const writeOptions = append ? { flags: 'a' } : {};
+       const writeStream = (0, fs_1.createWriteStream)(destinationPath, writeOptions);
+       let lineCount = 0;
+       const keys = Object.keys(objects[0]);
+       for (const obj of objects) {
+           const serialized = keys.map(k => obj[k]).join(delimiter) + '\n';
+           writeStream.write(serialized);
+           lineCount++;
+       }
+       writeStream.close();
+       return lineCount;
+   }),
    quickReadFile: (filePath, lineCount) => __awaiter(void 0, void 0, void 0, function* () {
        var _a, e_1, _b, _c;
        const fileStream = (0, fs_1.createReadStream)(filePath);
package/engines/ai/DeveloperEngine.js
CHANGED

@@ -16,6 +16,9 @@ const Affirm_1 = __importDefault(require("../../core/Affirm"));
 const ProducerEngine_1 = __importDefault(require("../producer/ProducerEngine"));
 const path_1 = __importDefault(require("path"));
 const promises_1 = __importDefault(require("fs/promises"));
+const dayjs_1 = __importDefault(require("dayjs"));
+const customParseFormat_1 = __importDefault(require("dayjs/plugin/customParseFormat"));
+dayjs_1.default.extend(customParseFormat_1.default);
 class DeveloperEngineClass {
     constructor() {
         this.discover = (producer) => __awaiter(this, void 0, void 0, function* () {

@@ -60,13 +63,99 @@
                 return 'string';
             }
         };
+        // Infer the most likely type from a single JS value
+        // Returns one of: 'number' | 'boolean' | 'date' | 'datetime' | 'string' | 'array' | 'object' | 'null'
+        this.inferType = (value) => {
+            if (value === null || value === undefined)
+                return 'string';
+            // Arrays
+            if (Array.isArray(value))
+                return 'array';
+            // Booleans (including common string representations)
+            if (typeof value === 'boolean')
+                return 'boolean';
+            if (typeof value === 'string') {
+                const trimmed = value.trim();
+                const lower = trimmed.toLowerCase();
+                if (lower === 'true' || lower === 'false')
+                    return 'boolean';
+                // Numbers (numeric strings)
+                const numericRegex = /^-?\d+(?:\.\d+)?$/;
+                if (numericRegex.test(trimmed))
+                    return 'number';
+                // Timestamps (10 or 13 digits)
+                const tsRegex = /^-?\d{10}(?:\d{3})?$/;
+                if (tsRegex.test(trimmed)) {
+                    const n = Number(trimmed.length === 10 ? `${trimmed}000` : trimmed);
+                    const d = new Date(n);
+                    if (!isNaN(d.getTime()))
+                        return 'datetime';
+                }
+                // Dates with common formats
+                const dateFormats = [
+                    'YYYY-MM-DD',
+                    'YYYY/MM/DD',
+                    'DD/MM/YYYY',
+                    'MM/DD/YYYY',
+                    'YYYYMMDD',
+                    'DD-MMM-YYYY',
+                    'YYYY-MM-DD HH:mm',
+                    'YYYY-MM-DD HH:mm:ss',
+                    'YYYY-MM-DDTHH:mm',
+                    'YYYY-MM-DDTHH:mmZ',
+                    'YYYY-MM-DDTHH:mm:ss',
+                    'YYYY-MM-DDTHH:mm:ssZ',
+                    'YYYY-MM-DDTHH:mm:ss.SSSZ'
+                ];
+                for (const fmt of dateFormats) {
+                    const d = (0, dayjs_1.default)(trimmed, fmt, true);
+                    if (d.isValid()) {
+                        // If time components likely present, classify as datetime
+                        if (/T|\d+:\d+/.test(trimmed))
+                            return 'datetime';
+                        return 'date';
+                    }
+                }
+                // ISO 8601 without specifying format
+                const iso = (0, dayjs_1.default)(trimmed);
+                if (iso.isValid() && /\d{4}-\d{2}-\d{2}/.test(trimmed)) {
+                    if (/T|\d+:\d+/.test(trimmed))
+                        return 'datetime';
+                    return 'date';
+                }
+                return 'string';
+            }
+            if (typeof value === 'number')
+                return 'number';
+            if (typeof value === 'object') {
+                // Date instance
+                if (value instanceof Date && !isNaN(value.getTime()))
+                    return 'datetime';
+                return 'object';
+            }
+            // Fallback for bigint, symbol, function -> string
+            return 'string';
+        };
+        this.inferDimensionType = (value) => {
+            const type = this.inferType(value);
+            switch (type) {
+                case 'array':
+                case 'object': return 'string';
+                case 'boolean': return 'boolean';
+                case 'date':
+                case 'datetime': return 'datetime';
+                case 'number': return 'number';
+                case 'string': return 'string';
+                default: return 'string';
+            }
+        };
         this.extractFieldTypes = (records) => {
             if (!records || records.length === 0)
                 return [];
             const sample = records[0];
             return Object.entries(sample._value).map(([key, value]) => ({
                 name: key,
-                type:
+                type: this.inferType(value)
             }));
         };
         this.extractFieldClassification = (field) => {
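As a quick illustration of the new inference rules, the sketch below maps sample string inputs to the type inferType would be expected to return, based on the regexes and date formats above (an informal reading of the code, not a test fixture from the package):

    {
      "42": "number",
      "-3.14": "number",
      "true": "boolean",
      "2024-01-15": "date",
      "15/01/2024": "date",
      "2024-01-15 08:30": "datetime",
      "2024-01-15T10:30:00Z": "datetime",
      "hello world": "string"
    }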
package/engines/consumer/ConsumerEngine.js
CHANGED

@@ -132,7 +132,7 @@ class ConsumerEngineClass {
         (0, Affirm_1.default)(options, `Invalid execute consume options`);
         const { usageId } = UsageManager_1.default.startUsage(consumer, user);
         try {
-            const execution = new ExecutionEnvironment_1.default(consumer);
+            const execution = new ExecutionEnvironment_1.default(consumer, usageId);
             const result = yield execution.run(options);
             UsageManager_1.default.endUsage(usageId, result._stats.size);
             return result;
package/engines/consumer/PostProcessor.js
CHANGED

@@ -62,7 +62,7 @@ class PostProcessorClass {
             }
             return record;
         }, options);
-        newDataset.
+        newDataset.setDimensions(updatedDimensions);
         return newDataset;
     });
     /**

@@ -174,12 +174,16 @@
             normalizedRecord[fieldName] = (_a = splitRecord[fieldName]) !== null && _a !== void 0 ? _a : '';
         }
         // Create dimensions based on the expected field names
-        const newDimensions = expectedFieldNames.map((key, index) =>
-
-
-
-
-
+        const newDimensions = expectedFieldNames.map((key, index) => {
+            var _a, _b, _c;
+            return ({
+                name: key,
+                key: key,
+                index: index,
+                hidden: null,
+                type: (_c = (_b = (_a = columns[index]) === null || _a === void 0 ? void 0 : _a.dimension) === null || _b === void 0 ? void 0 : _b.type) !== null && _c !== void 0 ? _c : 'string'
+            });
+        });
         // Create the row string
         const values = newDimensions.map(dim => {
             const value = normalizedRecord[dim.name];

@@ -196,14 +200,17 @@
         // Update the dataset dimensions to match the unpacked structure
         // TODO: 99% certain this will cause a bug
         if (columns.length > 0) {
-            const newDimensions = columns.map((col, index) =>
-
-
-
-
-
-
-
+            const newDimensions = columns.map((col, index) => {
+                var _a;
+                return ({
+                    name: col.nameInProducer,
+                    key: col.nameInProducer,
+                    index: index,
+                    hidden: null,
+                    type: (_a = col.dimension) === null || _a === void 0 ? void 0 : _a.type
+                });
+            });
+            resDataset.setDimensions(newDimensions);
         }
         return resDataset;
     });
package/engines/dataset/Dataset.js
CHANGED

@@ -34,8 +34,9 @@ const DriverFactory_1 = __importDefault(require("../../drivers/DriverFactory"));
 const Helper_1 = __importDefault(require("../../helper/Helper"));
 const Algo_1 = __importDefault(require("../../core/Algo"));
 const Environment_1 = __importDefault(require("../Environment"));
+const Logger_1 = __importDefault(require("../../helper/Logger"));
 class Dataset {
-    constructor(name, file, batchSize) {
+    constructor(name, file, batchSize, executionId) {
         var _a;
         this.getPath = () => this._path;
         this.setPath = (path) => {

@@ -43,6 +44,7 @@ class Dataset {
             return this;
         };
         this.getFile = () => this._file;
+        this.getExecutionId = () => this._executionId;
         this.getBatchSize = () => this._batchSize;
         this.setFirstLine = (firstLine) => {
             this._firstLine = firstLine;

@@ -170,7 +172,8 @@ class Dataset {
                 }
             }
             catch (error) {
-
+                Logger_1.default.log(`Error parsing line ${line}\n${lineCount}: ${error}`);
+                lineCount++;
             }
         }
     }

@@ -305,7 +308,7 @@ class Dataset {
                 }
             }
             catch (error) {
-
+                Logger_1.default.log(`Error parsing line during sort: ${error}`);
             }
         }
     }

@@ -536,7 +539,7 @@ class Dataset {
                 }
             }
             catch (error) {
-
+                Logger_1.default.log(`Error parsing line ${line}\n${lineCount}: ${error}`);
             }
         }
     }

@@ -627,7 +630,8 @@ class Dataset {
                 }
             }
             catch (error) {
-
+                Logger_1.default.log(`Error parsing line ${line}\n${lineCount}: ${error}`);
+                lineCount++;
             }
         }
     }

@@ -703,10 +707,18 @@ class Dataset {
             return this;
         });
         this.getDimensions = () => this._dimensions;
-        this.
+        this.setDimensions = (dimensions) => {
             this._dimensions = dimensions;
             return this;
         };
+        this.setSingleDimension = (newDimension, oldDimension) => {
+            (0, Affirm_1.default)(newDimension, `Invalid new dimension`);
+            (0, Affirm_1.default)(oldDimension, 'Invalid old dimension');
+            const current = this._dimensions.findIndex(x => x.index === oldDimension.index);
+            (0, Affirm_1.default)(current, `Trying to updata a dataset dimension that doesn't exist: ${oldDimension.name} index ${oldDimension.index}`);
+            this._dimensions.splice(current, 1, newDimension);
+            return this;
+        };
         /**
          * Update the record pool when dimensions change
          */

@@ -829,6 +841,7 @@ class Dataset {
         this._computeSize = () => fs_1.default.statSync(this._path).size / (1024 * 1024);
         this.name = name;
         this._file = file;
+        this._executionId = executionId;
         this._batchSize = (_a = batchSize !== null && batchSize !== void 0 ? batchSize : parseInt(Environment_1.default.get('MAX_ITEMS_IN_MEMORY'))) !== null && _a !== void 0 ? _a : Constants_1.default.defaults.MAX_ITEMS_IN_MEMORY;
         this._dimensions = [];
         this._firstLine = '';

@@ -843,8 +856,9 @@ class Dataset {
             .replace(/_{2,}/g, '_')
             .replace(/^_+|_+$/g, '')
             .toLowerCase();
-
-        this.
+        const execFolder = executionId ? path_1.default.join(datasetName, executionId) : datasetName;
+        this._path = path_1.default.join('./remora', Constants_1.default.defaults.PRODUCER_TEMP_FOLDER, execFolder, '.dataset');
+        this._tempPath = path_1.default.join('./remora/', Constants_1.default.defaults.PRODUCER_TEMP_FOLDER, execFolder, '.dataset_tmp');
         this.ensureFile(this._path);
     }
 }