@forzalabs/remora 0.1.4-nasco.3 → 0.1.5-nasco.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/Constants.js +1 -1
- package/definitions/json_schemas/consumer-schema.json +6 -2
- package/definitions/json_schemas/producer-schema.json +2 -1
- package/definitions/json_schemas/source-schema.json +14 -1
- package/documentation/README.md +1 -0
- package/documentation/default_resources/consumer.json +7 -7
- package/drivers/DeltaShareDriver.js +178 -0
- package/drivers/DriverFactory.js +6 -0
- package/drivers/DriverHelper.js +15 -0
- package/engines/ai/DeveloperEngine.js +90 -1
- package/engines/consumer/ConsumerEngine.js +1 -1
- package/engines/consumer/PostProcessor.js +22 -15
- package/engines/dataset/Dataset.js +18 -7
- package/engines/dataset/DatasetManager.js +58 -12
- package/engines/dataset/DatasetRecord.js +17 -4
- package/engines/dataset/ParallelDataset.js +16 -6
- package/engines/execution/ExecutionEnvironment.js +13 -4
- package/engines/execution/ExecutionPlanner.js +2 -1
- package/engines/file/FileCompiler.js +2 -1
- package/engines/file/FileExporter.js +12 -3
- package/engines/parsing/ParseManager.js +7 -2
- package/engines/producer/ProducerEngine.js +4 -2
- package/engines/transform/JoinEngine.js +10 -6
- package/engines/transform/TransformationEngine.js +31 -2
- package/engines/usage/UsageDataManager.js +110 -0
- package/package.json +2 -1
- package/workers/FilterWorker.js +3 -3
- package/workers/ProjectionWorker.js +3 -3
- package/workers/TransformWorker.js +3 -3
package/Constants.js
CHANGED
package/definitions/json_schemas/consumer-schema.json
CHANGED
@@ -223,6 +223,10 @@
         "type": "string",
         "description": "The name of the source where the consumer will export its data when deployed/run"
       },
+      "exportName": {
+        "type": "string",
+        "description": "If the format is a file, forces the same name in the export file (extension is auto-added)"
+      },
       "trigger": {
         "type": "object",
         "description": "Triggers to perform the export (not just the usual 'Deploy')",
@@ -463,11 +467,11 @@
         "cast": {
           "type": "string",
           "description": "Cast the value to a specific type",
-          "enum": ["string", "number", "
+          "enum": ["string", "number", "datetime", "boolean"]
         },
         "format": {
           "type": "string",
-          "description": "Optional format for date parsing or string formatting (
+          "description": "Optional format for date parsing or string formatting (e.g. YYYY-MM-DD, DD/MM/YY)"
         }
       },
       "required": ["cast"],
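Read together, the two hunks above add an optional exportName to consumer outputs and tighten the cast/format pair on column transforms. A minimal, hypothetical sketch of how these fields might appear in a consumer definition, written here as a JavaScript object literal; every name outside the changed fields is an assumption, not something this diff confirms:

    // Hypothetical consumer output and column transform using the new/updated fields.
    const output = {
        format: 'CSV',                    // file format, as in the default resources below
        exportDestination: 'reports-s3',  // assumed source name
        exportName: 'daily_report'        // new: fixed file name; the extension is auto-added
    };
    const columnTransform = {
        cast: 'datetime',                 // must be one of: string, number, datetime, boolean
        format: 'DD/MM/YYYY'              // optional parse/format pattern, e.g. YYYY-MM-DD
    };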
package/definitions/json_schemas/source-schema.json
CHANGED
@@ -23,7 +23,8 @@
         "aws-dynamodb",
         "aws-s3",
         "postgres",
-        "local"
+        "local",
+        "delta-share"
       ],
       "description": "The type of data engine"
     },
@@ -66,6 +67,10 @@
       "type": "string",
       "description": "Database schema name"
     },
+    "table": {
+      "type": "string",
+      "description": "Table name (used by some engines like delta-share)"
+    },
     "port": {
       "type": "string",
       "description": "Port number for the connection"
@@ -101,6 +106,14 @@
     "path": {
       "type": "string",
       "description": "The folder path"
+    },
+    "share": {
+      "type": "string",
+      "description": "Delta Sharing share name"
+    },
+    "bearerToken": {
+      "type": "string",
+      "description": "Delta Sharing bearer token used for authentication"
     }
   },
   "required": ["method"]
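The schema additions above (the delta-share engine value plus table, share, and bearerToken) line up with what the new driver reads in init (see DeltaShareDriver.js further down). A hypothetical source definition sketch; the placement of the fields inside "authentication" is inferred from DeltaShareDriver.init, and all concrete values are placeholders:

    // Hypothetical delta-share source definition (field names from the schema diff).
    const deltaShareSource = {
        name: 'sales-delta-share',
        engine: 'delta-share',
        authentication: {
            host: 'https://sharing.example.com/delta-sharing',  // Delta Sharing server URL
            bearerToken: '<token>',                              // bearer token for the share
            share: 'sales_share',
            schema: 'gold',
            table: 'orders'
        }
    };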
package/documentation/README.md
CHANGED
@@ -106,6 +106,7 @@ Consumers transform and combine data from producers for specific use cases.
 | `outputs[].accellerated` | Whether to materialize for performance | `true`, `false` |
 | `outputs[].direct` | Whether to query directly without creating views | `true`, `false` |
 | `outputs[].exportDestination` | Where to export data | Must match a source `name` |
+| `outputs[].exportName` | Fixed file name (without extension) for file exports | String |
 | `outputs[].trigger.type` | How to trigger exports | `CRON`, `API` |
 | `outputs[].trigger.value` | Trigger expression | CRON expression (e.g., `0 0 * * *`) or endpoint path |
 | `metadata` | Custom tags | Object with string keys and values |
package/documentation/default_resources/consumer.json
CHANGED
@@ -11,8 +11,8 @@
   "joins": [
     {
       "otherName": "<primary producer name>",
-      "relationship": "
-      "sql": "<
+      "relationship": "one-to-many",
+      "sql": "${P.id} = ${<primary producer name>.fk_id}"
     }
   ]
 }
@@ -29,17 +29,17 @@
     }
   ],
   "outputs": [
-    { "format": "
+    { "format": "API" },
     {
-      "format": "
+      "format": "JSON",
       "exportDestination": "<export destination>"
     },
     {
-      "format": "
+      "format": "CSV",
       "exportDestination": "<export destination>",
       "trigger": {
-        "type": "
-        "value": "
+        "type": "CRON",
+        "value": "0 0 * * *"
       }
     }
   ],
package/drivers/DeltaShareDriver.js
ADDED
@@ -0,0 +1,178 @@
+"use strict";
+var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
+    function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
+    return new (P || (P = Promise))(function (resolve, reject) {
+        function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
+        function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
+        function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
+        step((generator = generator.apply(thisArg, _arguments || [])).next());
+    });
+};
+var __importDefault = (this && this.__importDefault) || function (mod) {
+    return (mod && mod.__esModule) ? mod : { "default": mod };
+};
+Object.defineProperty(exports, "__esModule", { value: true });
+const Affirm_1 = __importDefault(require("../core/Affirm"));
+const DriverHelper_1 = __importDefault(require("./DriverHelper"));
+/**
+ * Delta Share (Databricks Delta Sharing) Source Driver
+ */
+class DeltaShareSourceDriver {
+    constructor() {
+        this._query = '{prefix}/shares/{share}/schemas/{schema}/tables/{table}/query';
+        this._version = '{prefix}/shares/{share}/schemas/{schema}/tables/{table}/version';
+        this._tablesInShare = '{prefix}/shares/{share}/all-tables';
+        this._tablesInSchema = '{prefix}/shares/{share}/schemas/{schema}/tables';
+        this._schemasInShare = '{prefix}/shares/{share}/schemas';
+        this._shares = '{prefix}/shares';
+        this.init = (source) => __awaiter(this, void 0, void 0, function* () {
+            (0, Affirm_1.default)(source, 'Invalid source');
+            // Expected authentication shape for delta-share
+            const { authentication } = source;
+            (0, Affirm_1.default)(authentication, 'Invalid authentication for delta-share source');
+            this._shareUrl = authentication.host;
+            this._bearerToken = authentication.bearerToken || authentication.sessionToken || authentication.password;
+            this._share = authentication.share;
+            this._schema = authentication.schema;
+            this._table = authentication.table;
+            (0, Affirm_1.default)(this._shareUrl, 'Missing delta-share host (share server URL) in source.authentication.host');
+            (0, Affirm_1.default)(this._bearerToken, 'Missing delta-share bearer token in source.authentication.sessionToken (or password)');
+            (0, Affirm_1.default)(this._share, 'Missing delta-share "share" (use authentication.share or bucket)');
+            (0, Affirm_1.default)(this._schema, 'Missing delta-share schema in source.authentication.schema');
+            (0, Affirm_1.default)(this._table, 'Missing delta-share table in source.authentication.table (or database)');
+            this._source = source;
+            return this;
+        });
+        // Delta Sharing is not a SQL engine; expose explicit error
+        this.execute = (__sql) => __awaiter(this, void 0, void 0, function* () {
+            void __sql;
+            throw new Error('DeltaShareSourceDriver.execute is not supported: Delta Sharing is not a SQL engine');
+        });
+        this.query = (__sql, __values) => __awaiter(this, void 0, void 0, function* () {
+            void __sql;
+            void __values;
+            throw new Error('DeltaShareSourceDriver.query is not supported: Delta Sharing is not a SQL engine');
+        });
+        this.readAll = (request) => __awaiter(this, void 0, void 0, function* () {
+            var _a, _b, _c;
+            (0, Affirm_1.default)(request, `Invalid download request`);
+            (0, Affirm_1.default)(!request.fileKey.includes('%'), `On a delta-share the file key can not include "%"`);
+            const deltaFiles = yield this._getAllFilesInTables(this._table);
+            const { asyncBufferFromUrl, parquetReadObjects } = yield import('hyparquet');
+            const lines = [];
+            for (const deltaFile of deltaFiles) {
+                const byteLength = (_b = (_a = deltaFile.file.deltaSingleAction.add) === null || _a === void 0 ? void 0 : _a.size) !== null && _b !== void 0 ? _b : (_c = deltaFile.file.deltaSingleAction.remove) === null || _c === void 0 ? void 0 : _c.size;
+                const file = yield asyncBufferFromUrl({ url: deltaFile.file.url, byteLength });
+                const parquetRecords = yield parquetReadObjects({ file: file });
+                lines.push(...parquetRecords.map(x => JSON.stringify(x)));
+            }
+            return lines;
+        });
+        this.readLinesInRange = (request) => __awaiter(this, void 0, void 0, function* () {
+            var _a, _b, _c;
+            (0, Affirm_1.default)(request, 'Invalid read request');
+            (0, Affirm_1.default)(request.options, 'Invalid read options');
+            (0, Affirm_1.default)(request.options.lineFrom !== undefined && request.options.lineTo !== undefined, 'Missing read range');
+            const deltaFiles = yield this._getAllFilesInTables(this._table);
+            const { options: { lineFrom, lineTo } } = request;
+            const { asyncBufferFromUrl, parquetReadObjects } = yield import('hyparquet');
+            const lines = [];
+            let index = 0;
+            for (const deltaFile of deltaFiles) {
+                const byteLength = (_b = (_a = deltaFile.file.deltaSingleAction.add) === null || _a === void 0 ? void 0 : _a.size) !== null && _b !== void 0 ? _b : (_c = deltaFile.file.deltaSingleAction.remove) === null || _c === void 0 ? void 0 : _c.size;
+                const file = yield asyncBufferFromUrl({ url: deltaFile.file.url, byteLength });
+                const parquetRecords = yield parquetReadObjects({ file: file });
+                for (const record of parquetRecords) {
+                    if (index >= lineFrom && index < lineTo)
+                        lines.push(JSON.stringify(record));
+                    index++;
+                    if (index >= lineTo)
+                        break;
+                }
+            }
+            return lines;
+        });
+        this.download = (dataset) => __awaiter(this, void 0, void 0, function* () {
+            var _a, _b, _c;
+            (0, Affirm_1.default)(dataset, 'Invalid dataset');
+            const deltaFiles = yield this._getAllFilesInTables(this._table);
+            const { asyncBufferFromUrl, parquetReadObjects } = yield import('hyparquet');
+            // For each file, download it with the hyparquet package, read lines, then save locally to create the dataset
+            let index = 0;
+            let totalLineCount = 0;
+            for (const deltaFile of deltaFiles) {
+                const byteLength = (_b = (_a = deltaFile.file.deltaSingleAction.add) === null || _a === void 0 ? void 0 : _a.size) !== null && _b !== void 0 ? _b : (_c = deltaFile.file.deltaSingleAction.remove) === null || _c === void 0 ? void 0 : _c.size;
+                const file = yield asyncBufferFromUrl({ url: deltaFile.file.url, byteLength });
+                const parquetRecords = yield parquetReadObjects({ file: file });
+                if (index === 0 && parquetRecords.length > 0) {
+                    // I intentionally keep the first record as a JSON, so it can be used to extract the dimensions
+                    dataset.setFirstLine(JSON.stringify(parquetRecords[0]));
+                }
+                totalLineCount += yield DriverHelper_1.default.appendObjectsToUnifiedFile({
+                    append: index > 0,
+                    delimiter: dataset.getDelimiter(),
+                    destinationPath: dataset.getPath(),
+                    objects: parquetRecords
+                });
+                index++;
+            }
+            dataset.setCount(totalLineCount);
+            return dataset;
+        });
+        this.exist = (__producer) => __awaiter(this, void 0, void 0, function* () {
+            void __producer;
+            try {
+                yield this._getAllFilesInTables(this._table);
+                // If it doesn't exist, then it fails in the above function
+                return true;
+            }
+            catch (_a) {
+                return false;
+            }
+        });
+        this._getVersion = (table) => __awaiter(this, void 0, void 0, function* () {
+            const url = this._version
+                .replace('{prefix}', this._shareUrl)
+                .replace('{share}', this._share)
+                .replace('{schema}', this._schema)
+                .replace('{table}', table);
+            const res = yield fetch(url, {
+                method: 'GET',
+                headers: {
+                    Authorization: `Bearer ${this._bearerToken}`
+                }
+            });
+            (0, Affirm_1.default)(res.ok, `Error fetching version from the delta share: ${res.status} ${res.statusText}`);
+            const version = res.headers['delta-table-version'];
+            return version;
+        });
+        this._getAllFilesInTables = (table) => __awaiter(this, void 0, void 0, function* () {
+            const url = this._query
+                .replace('{prefix}', this._shareUrl)
+                .replace('{share}', this._share)
+                .replace('{schema}', this._schema)
+                .replace('{table}', table);
+            const body = {
+                version: yield this._getVersion(table)
+            };
+            const res = yield fetch(url, {
+                method: 'POST',
+                headers: {
+                    'Authorization': `Bearer ${this._bearerToken}`,
+                    'delta-sharing-capabilities': 'responseformat=delta;readerfeatures=deletionvectors'
+                },
+                body: JSON.stringify(body)
+            });
+            const rawText = yield res.text();
+            (0, Affirm_1.default)(res.ok, `Error fetching data from the delta share: ${res.status} ${res.statusText}; Message: ${rawText}`);
+            // By the protocol: the first is the profile, the second is the metadata, I'm interested from the third onwards
+            const deltaLines = rawText
+                .split('\n')
+                .filter(x => x.length > 0)
+                .slice(2)
+                .map(x => JSON.parse(x));
+            return deltaLines;
+        });
+    }
+}
+exports.default = DeltaShareSourceDriver;
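For orientation, a short usage sketch of the new driver as wired by the rest of this diff: DriverFactory instantiates it for sources with engine 'delta-share', init validates the authentication block, and download pulls every parquet file referenced by the table into the local dataset file via DriverHelper.appendObjectsToUnifiedFile. The require path, the source object, and the dataset variable below are assumptions for illustration, not part of the package:

    // Usage sketch (not part of the package): driving DeltaShareSourceDriver by hand.
    const DeltaShareDriver = require('@forzalabs/remora/drivers/DeltaShareDriver').default;

    async function pullDeltaTable(source, dataset) {
        const driver = new DeltaShareDriver();
        await driver.init(source);        // validates host, bearer token, share, schema, table
        if (!(await driver.exist())) {    // false if the table query endpoint cannot be reached
            throw new Error('delta-share table not reachable');
        }
        return driver.download(dataset);  // writes parquet rows into the dataset file, sets count
    }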
package/drivers/DriverFactory.js
CHANGED
@@ -15,6 +15,7 @@ Object.defineProperty(exports, "__esModule", { value: true });
 const LocalDriver_1 = require("./LocalDriver");
 const RedshiftDriver_1 = __importDefault(require("./RedshiftDriver"));
 const S3Driver_1 = require("./S3Driver");
+const DeltaShareDriver_1 = __importDefault(require("./DeltaShareDriver"));
 class DriverFactoryClass {
     constructor() {
         this.instantiateSource = (source) => __awaiter(this, void 0, void 0, function* () {
@@ -30,6 +31,11 @@ class DriverFactoryClass {
                 yield driver.init(source);
                 return driver;
             }
+            case 'delta-share': {
+                const driver = new DeltaShareDriver_1.default();
+                yield driver.init(source);
+                return driver;
+            }
             case 'local': {
                 const driver = new LocalDriver_1.LocalSourceDriver();
                 yield driver.init(source);
package/drivers/DriverHelper.js
CHANGED
@@ -122,6 +122,21 @@ const DriverHelper = {
         yield (0, promises_1.pipeline)(stream, headerValidationTransform, writeStream);
         return lineCount;
     }),
+    appendObjectsToUnifiedFile: (options) => __awaiter(void 0, void 0, void 0, function* () {
+        (0, Affirm_1.default)(options, 'Invalid options');
+        const { append, destinationPath, objects, delimiter } = options;
+        const writeOptions = append ? { flags: 'a' } : {};
+        const writeStream = (0, fs_1.createWriteStream)(destinationPath, writeOptions);
+        let lineCount = 0;
+        const keys = Object.keys(objects[0]);
+        for (const obj of objects) {
+            const serialized = keys.map(k => obj[k]).join(delimiter) + '\n';
+            writeStream.write(serialized);
+            lineCount++;
+        }
+        writeStream.close();
+        return lineCount;
+    }),
     quickReadFile: (filePath, lineCount) => __awaiter(void 0, void 0, void 0, function* () {
         var _a, e_1, _b, _c;
         const fileStream = (0, fs_1.createReadStream)(filePath);
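The new appendObjectsToUnifiedFile helper takes the keys of the first object as the column order and writes one delimited line per object, optionally appending to an existing file (this is how the delta-share driver accumulates parquet batches into a single dataset file). A small usage sketch; the file path and delimiter are hypothetical:

    // Usage sketch for the new helper: two batches written to the same file.
    const DriverHelper = require('@forzalabs/remora/drivers/DriverHelper').default;

    async function writeBatches() {
        const batch1 = [{ id: 1, name: 'a' }, { id: 2, name: 'b' }];
        const batch2 = [{ id: 3, name: 'c' }];
        await DriverHelper.appendObjectsToUnifiedFile({
            append: false, destinationPath: '/tmp/out.dataset', delimiter: '|', objects: batch1
        });
        await DriverHelper.appendObjectsToUnifiedFile({
            append: true, destinationPath: '/tmp/out.dataset', delimiter: '|', objects: batch2
        });
        // Once the writes flush, /tmp/out.dataset holds one line per object: 1|a, 2|b, 3|c
    }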
package/engines/ai/DeveloperEngine.js
CHANGED
@@ -16,6 +16,9 @@ const Affirm_1 = __importDefault(require("../../core/Affirm"));
 const ProducerEngine_1 = __importDefault(require("../producer/ProducerEngine"));
 const path_1 = __importDefault(require("path"));
 const promises_1 = __importDefault(require("fs/promises"));
+const dayjs_1 = __importDefault(require("dayjs"));
+const customParseFormat_1 = __importDefault(require("dayjs/plugin/customParseFormat"));
+dayjs_1.default.extend(customParseFormat_1.default);
 class DeveloperEngineClass {
     constructor() {
         this.discover = (producer) => __awaiter(this, void 0, void 0, function* () {
@@ -60,13 +63,99 @@
                return 'string';
            }
        };
+        // Infer the most likely type from a single JS value
+        // Returns one of: 'number' | 'boolean' | 'date' | 'datetime' | 'string' | 'array' | 'object' | 'null'
+        this.inferType = (value) => {
+            if (value === null || value === undefined)
+                return 'string';
+            // Arrays
+            if (Array.isArray(value))
+                return 'array';
+            // Booleans (including common string representations)
+            if (typeof value === 'boolean')
+                return 'boolean';
+            if (typeof value === 'string') {
+                const trimmed = value.trim();
+                const lower = trimmed.toLowerCase();
+                if (lower === 'true' || lower === 'false')
+                    return 'boolean';
+                // Numbers (numeric strings)
+                const numericRegex = /^-?\d+(?:\.\d+)?$/;
+                if (numericRegex.test(trimmed))
+                    return 'number';
+                // Timestamps (10 or 13 digits)
+                const tsRegex = /^-?\d{10}(?:\d{3})?$/;
+                if (tsRegex.test(trimmed)) {
+                    const n = Number(trimmed.length === 10 ? `${trimmed}000` : trimmed);
+                    const d = new Date(n);
+                    if (!isNaN(d.getTime()))
+                        return 'datetime';
+                }
+                // Dates with common formats
+                const dateFormats = [
+                    'YYYY-MM-DD',
+                    'YYYY/MM/DD',
+                    'DD/MM/YYYY',
+                    'MM/DD/YYYY',
+                    'YYYYMMDD',
+                    'DD-MMM-YYYY',
+                    'YYYY-MM-DD HH:mm',
+                    'YYYY-MM-DD HH:mm:ss',
+                    'YYYY-MM-DDTHH:mm',
+                    'YYYY-MM-DDTHH:mmZ',
+                    'YYYY-MM-DDTHH:mm:ss',
+                    'YYYY-MM-DDTHH:mm:ssZ',
+                    'YYYY-MM-DDTHH:mm:ss.SSSZ'
+                ];
+                for (const fmt of dateFormats) {
+                    const d = (0, dayjs_1.default)(trimmed, fmt, true);
+                    if (d.isValid()) {
+                        // If time components likely present, classify as datetime
+                        if (/T|\d+:\d+/.test(trimmed))
+                            return 'datetime';
+                        return 'date';
+                    }
+                }
+                // ISO 8601 without specifying format
+                const iso = (0, dayjs_1.default)(trimmed);
+                if (iso.isValid() && /\d{4}-\d{2}-\d{2}/.test(trimmed)) {
+                    if (/T|\d+:\d+/.test(trimmed))
+                        return 'datetime';
+                    return 'date';
+                }
+                return 'string';
+            }
+            if (typeof value === 'number')
+                return 'number';
+            if (typeof value === 'object') {
+                // Date instance
+                if (value instanceof Date && !isNaN(value.getTime()))
+                    return 'datetime';
+                return 'object';
+            }
+            // Fallback for bigint, symbol, function -> string
+            return 'string';
+        };
+        this.inferDimensionType = (value) => {
+            const type = this.inferType(value);
+            switch (type) {
+                case 'array':
+                case 'object': return 'string';
+                case 'boolean': return 'boolean';
+                case 'date':
+                case 'datetime': return 'datetime';
+                case 'number': return 'number';
+                case 'string': return 'string';
+                default: return 'string';
+            }
+        };
        this.extractFieldTypes = (records) => {
            if (!records || records.length === 0)
                return [];
            const sample = records[0];
            return Object.entries(sample._value).map(([key, value]) => ({
                name: key,
-                type:
+                type: this.inferType(value)
            }));
        };
        this.extractFieldClassification = (field) => {
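To make the branching above concrete, here are a few inputs and the types they would resolve to under this logic (illustrative only; exact results for date strings depend on dayjs strict parsing, and the engine instance name is assumed):

    // Expected classifications from inferType, per the branches above.
    developerEngine.inferType(true);                     // 'boolean'
    developerEngine.inferType('false');                  // 'boolean'  (string form)
    developerEngine.inferType('42.5');                   // 'number'   (numeric string)
    developerEngine.inferType('1700000000');             // 'number'   (the numeric check runs before the epoch-timestamp check)
    developerEngine.inferType('2024-01-15');             // 'date'     (matches YYYY-MM-DD, no time part)
    developerEngine.inferType('2024-01-15T08:30:00Z');   // 'datetime' (time component present)
    developerEngine.inferType([1, 2, 3]);                // 'array'
    developerEngine.inferType({ a: 1 });                 // 'object'
    developerEngine.inferType(null);                     // 'string'   (null/undefined fall back to string)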
package/engines/consumer/ConsumerEngine.js
CHANGED
@@ -132,7 +132,7 @@ class ConsumerEngineClass {
         (0, Affirm_1.default)(options, `Invalid execute consume options`);
         const { usageId } = UsageManager_1.default.startUsage(consumer, user);
         try {
-            const execution = new ExecutionEnvironment_1.default(consumer);
+            const execution = new ExecutionEnvironment_1.default(consumer, usageId);
             const result = yield execution.run(options);
             UsageManager_1.default.endUsage(usageId, result._stats.size);
             return result;
package/engines/consumer/PostProcessor.js
CHANGED
@@ -62,7 +62,7 @@ class PostProcessorClass {
             }
             return record;
         }, options);
-        newDataset.
+        newDataset.setDimensions(updatedDimensions);
         return newDataset;
     });
     /**
@@ -174,12 +174,16 @@
             normalizedRecord[fieldName] = (_a = splitRecord[fieldName]) !== null && _a !== void 0 ? _a : '';
         }
         // Create dimensions based on the expected field names
-        const newDimensions = expectedFieldNames.map((key, index) =>
-
-
-
-
-
+        const newDimensions = expectedFieldNames.map((key, index) => {
+            var _a, _b, _c;
+            return ({
+                name: key,
+                key: key,
+                index: index,
+                hidden: null,
+                type: (_c = (_b = (_a = columns[index]) === null || _a === void 0 ? void 0 : _a.dimension) === null || _b === void 0 ? void 0 : _b.type) !== null && _c !== void 0 ? _c : 'string'
+            });
+        });
         // Create the row string
         const values = newDimensions.map(dim => {
             const value = normalizedRecord[dim.name];
@@ -196,14 +200,17 @@
         // Update the dataset dimensions to match the unpacked structure
         // TODO: 99% certain this will cause a bug
         if (columns.length > 0) {
-            const newDimensions = columns.map((col, index) =>
-
-
-
-
-
-
-
+            const newDimensions = columns.map((col, index) => {
+                var _a;
+                return ({
+                    name: col.nameInProducer,
+                    key: col.nameInProducer,
+                    index: index,
+                    hidden: null,
+                    type: (_a = col.dimension) === null || _a === void 0 ? void 0 : _a.type
+                });
+            });
+            resDataset.setDimensions(newDimensions);
         }
         return resDataset;
     });
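Both rebuilt mappers above now emit full dimension objects rather than partial ones. As far as this diff shows, the shape they produce, and that the Dataset setters below (setDimensions, setSingleDimension) consume, looks like this; the field values are illustrative:

    // Dimension shape as constructed in the two map() calls above (illustrative).
    const dimension = {
        name: 'customer_id',   // display name (here equal to the key)
        key: 'customer_id',    // field key in the record
        index: 0,              // column position
        hidden: null,
        type: 'number'         // falls back to 'string' when the column has no dimension type
    };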
package/engines/dataset/Dataset.js
CHANGED
@@ -35,7 +35,7 @@ const Helper_1 = __importDefault(require("../../helper/Helper"));
 const Algo_1 = __importDefault(require("../../core/Algo"));
 const Environment_1 = __importDefault(require("../Environment"));
 class Dataset {
-    constructor(name, file, batchSize) {
+    constructor(name, file, batchSize, executionId) {
         var _a;
         this.getPath = () => this._path;
         this.setPath = (path) => {
@@ -43,6 +43,7 @@ class Dataset {
             return this;
         };
         this.getFile = () => this._file;
+        this.getExecutionId = () => this._executionId;
         this.getBatchSize = () => this._batchSize;
         this.setFirstLine = (firstLine) => {
             this._firstLine = firstLine;
@@ -170,7 +171,7 @@
                 }
             }
             catch (error) {
-                console.warn(`Error parsing line ${lineCount}: ${error}`);
+                console.warn(`Error parsing line ${line}\n${lineCount}: ${error}`);
             }
         }
     }
@@ -536,7 +537,7 @@
                 }
             }
             catch (error) {
-                console.warn(`Error parsing line ${lineCount}: ${error}`);
+                console.warn(`Error parsing line ${line}\n${lineCount}: ${error}`);
             }
         }
     }
@@ -627,7 +628,7 @@
                 }
             }
             catch (error) {
-                console.warn(`Error parsing line ${lineCount}: ${error}`);
+                console.warn(`Error parsing line ${line}\n${lineCount}: ${error}`);
             }
         }
     }
@@ -703,10 +704,18 @@
             return this;
         });
         this.getDimensions = () => this._dimensions;
-        this.
+        this.setDimensions = (dimensions) => {
             this._dimensions = dimensions;
             return this;
         };
+        this.setSingleDimension = (newDimension, oldDimension) => {
+            (0, Affirm_1.default)(newDimension, `Invalid new dimension`);
+            (0, Affirm_1.default)(oldDimension, 'Invalid old dimension');
+            const current = this._dimensions.findIndex(x => x.index === oldDimension.index);
+            (0, Affirm_1.default)(current, `Trying to updata a dataset dimension that doesn't exist: ${oldDimension.name} index ${oldDimension.index}`);
+            this._dimensions.splice(current, 1, newDimension);
+            return this;
+        };
         /**
          * Update the record pool when dimensions change
          */
@@ -829,6 +838,7 @@
         this._computeSize = () => fs_1.default.statSync(this._path).size / (1024 * 1024);
         this.name = name;
         this._file = file;
+        this._executionId = executionId;
         this._batchSize = (_a = batchSize !== null && batchSize !== void 0 ? batchSize : parseInt(Environment_1.default.get('MAX_ITEMS_IN_MEMORY'))) !== null && _a !== void 0 ? _a : Constants_1.default.defaults.MAX_ITEMS_IN_MEMORY;
         this._dimensions = [];
         this._firstLine = '';
@@ -843,8 +853,9 @@
             .replace(/_{2,}/g, '_')
             .replace(/^_+|_+$/g, '')
             .toLowerCase();
-
-        this.
+        const execFolder = executionId ? path_1.default.join(datasetName, executionId) : datasetName;
+        this._path = path_1.default.join('./remora', Constants_1.default.defaults.PRODUCER_TEMP_FOLDER, execFolder, '.dataset');
+        this._tempPath = path_1.default.join('./remora/', Constants_1.default.defaults.PRODUCER_TEMP_FOLDER, execFolder, '.dataset_tmp');
         this.ensureFile(this._path);
     }
 }
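With the new executionId parameter, each execution gets its own dataset folder instead of every run sharing the dataset-name folder. A sketch of the resulting layout and accessors; the constant PRODUCER_TEMP_FOLDER value ('temp' below), the file argument, and the id are assumptions for illustration:

    // Path layout produced by the new constructor logic (PRODUCER_TEMP_FOLDER assumed to be 'temp'):
    //   with an executionId:    ./remora/temp/<dataset_name>/<execution_id>/.dataset   (+ .dataset_tmp)
    //   without an executionId: ./remora/temp/<dataset_name>/.dataset                  (+ .dataset_tmp)
    const Dataset = require('@forzalabs/remora/engines/dataset/Dataset').default;
    const ds = new Dataset('orders', file, undefined, 'exec-42');  // 'file' and 'exec-42' are hypothetical
    ds.getExecutionId();  // 'exec-42'
    ds.getPath();         // points under remora/temp/orders/exec-42/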