@forzalabs/remora 0.1.8-nasco.3 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/Constants.js +10 -3
- package/actions/automap.js +77 -0
- package/actions/deploy.js +1 -1
- package/actions/run.js +9 -4
- package/actions/sample.js +176 -0
- package/database/DatabaseEngine.js +18 -3
- package/definitions/DatasetDefinitions.js +2 -0
- package/definitions/json_schemas/producer-schema.json +39 -1
- package/definitions/json_schemas/source-schema.json +76 -3
- package/drivers/DriverFactory.js +6 -0
- package/drivers/DriverHelper.js +18 -6
- package/drivers/HttpApiDriver.js +204 -0
- package/drivers/LocalDriver.js +21 -7
- package/drivers/S3Driver.js +24 -8
- package/engines/UserManager.js +12 -0
- package/engines/ai/LLM.js +4 -24
- package/engines/consumer/ConsumerEngine.js +2 -2
- package/engines/dataset/Dataset.js +1 -1
- package/engines/dataset/DatasetManager.js +68 -25
- package/engines/dataset/DatasetRecord.js +5 -3
- package/engines/execution/ExecutionPlanner.js +2 -1
- package/engines/parsing/CSVParser.js +59 -0
- package/engines/parsing/ParseManager.js +21 -4
- package/engines/producer/ProducerEngine.js +13 -4
- package/engines/scheduler/CronScheduler.js +2 -3
- package/engines/scheduler/QueueManager.js +2 -3
- package/engines/transform/TransformationEngine.js +18 -0
- package/engines/usage/UsageManager.js +4 -2
- package/engines/validation/Validator.js +17 -0
- package/index.js +20 -0
- package/package.json +3 -2
package/drivers/DriverHelper.js
CHANGED
@@ -28,12 +28,17 @@ const Affirm_1 = __importDefault(require("../core/Affirm"));
  const DriverHelper = {
  appendToUnifiedFile: (options) => __awaiter(void 0, void 0, void 0, function* () {
  (0, Affirm_1.default)(options, 'Invalid options');
- const { append, destinationPath, fileKey, headerLine, stream, fileType, hasHeaderRow, delimiter } = options;
+ const { append, destinationPath, fileKey, headerLine, stream, fileType, hasHeaderRow, delimiter, sourceFilename } = options;
  (0, Affirm_1.default)(headerLine, `Invalid header line`);
  const keys = (fileType === 'JSON' || fileType === 'JSONL')
  ? Object.keys(JSON.parse(headerLine))
  : [];
  const shouldValidateHeader = fileType === 'CSV' || (fileType === 'TXT' && hasHeaderRow === true);
+ // When sourceFilename is set, the headerLine includes $source_filename at the end.
+ // For validation, we need to compare against the original header without this suffix.
+ const originalHeaderLine = sourceFilename
+ ? headerLine.slice(0, headerLine.lastIndexOf(delimiter))
+ : headerLine;
  let isFirstLine = true;
  let hasValidatedHeader = shouldValidateHeader ? false : true;
  let leftoverData = '';
@@ -50,8 +55,8 @@ const DriverHelper = {
  const line = lines[i];
  // Header validation for first line
  if (!hasValidatedHeader && isFirstLine && i === 0) {
- if (shouldValidateHeader &&
- const msg = `Error creating unified dataset: file "${fileKey}" has a different header line than the other files in this dataset\n\t-${fileKey}: ${line}\n\t-main: ${
+ if (shouldValidateHeader && originalHeaderLine && originalHeaderLine.trim() !== '' && line.trim() !== originalHeaderLine.trim()) {
+ const msg = `Error creating unified dataset: file "${fileKey}" has a different header line than the other files in this dataset\n\t-${fileKey}: ${line}\n\t-main: ${originalHeaderLine}`;
  Logger_1.default.log(msg);
  return callback(new Error(msg));
  }
@@ -77,7 +82,7 @@ const DriverHelper = {
  // Process any remaining data
  if (leftoverData.trim()) {
  if (shouldIncludeLine(leftoverData, -1)) {
- callback(null, Buffer.from(processLine(leftoverData)));
+ callback(null, Buffer.from(processLine(leftoverData) + '\n'));
  }
  else {
  callback(null, null);
@@ -101,21 +106,28 @@ const DriverHelper = {
  };
  const processLine = (line) => {
  lineCount++;
+ let processedLine;
  switch (fileType) {
  case 'JSON':
  case 'JSONL': {
  try {
  const parsed = JSON.parse(line);
-
+ processedLine = keys.map(k => parsed[k]).join(delimiter);
  }
  catch (error) {
  Logger_1.default.log(`Failed parsing in JSON line -> file: ${fileKey}; index: ${globalIndex}; line: ${line}; err: ${error === null || error === void 0 ? void 0 : error.name}`);
  throw error;
  }
+ break;
  }
  default:
-
+ processedLine = line;
+ }
+ // If sourceFilename is provided, append it to each line
+ if (sourceFilename) {
+ processedLine = processedLine + delimiter + sourceFilename;
  }
+ return processedLine;
  };
  const writeOptions = append ? { flags: 'a' } : {};
  const writeStream = (0, fs_1.createWriteStream)(destinationPath, writeOptions);
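The new `sourceFilename` option is threaded from the drivers down to `processLine`, which appends the originating filename as one extra delimited column on every data row and strips the matching suffix from the header before validating it. Below is a minimal standalone sketch of that per-line behaviour; the helper names are illustrative, not part of the package, and the `$source_filename` column name is the value implied by the comment in the diff (the drivers actually use `Constants.SOURCE_FILENAME_COLUMN`).

    // Illustrative sketch of the line handling added above (not the package's API).
    const appendSourceColumn = (line, delimiter, sourceFilename) =>
        sourceFilename ? line + delimiter + sourceFilename : line;

    const stripSourceColumn = (headerLine, delimiter, sourceFilename) =>
        sourceFilename ? headerLine.slice(0, headerLine.lastIndexOf(delimiter)) : headerLine;

    const delimiter = ',';
    const header = 'id,name,$source_filename'; // header as built by the drivers
    const row = '42,Ada';                      // row as read from orders_2024.csv

    console.log(stripSourceColumn(header, delimiter, 'orders_2024.csv')); // "id,name" -> compared against each file's header
    console.log(appendSourceColumn(row, delimiter, 'orders_2024.csv'));   // "42,Ada,orders_2024.csv"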
package/drivers/HttpApiDriver.js
ADDED
@@ -0,0 +1,204 @@
+ "use strict";
+ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
+ function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
+ return new (P || (P = Promise))(function (resolve, reject) {
+ function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
+ function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
+ function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
+ step((generator = generator.apply(thisArg, _arguments || [])).next());
+ });
+ };
+ var __importDefault = (this && this.__importDefault) || function (mod) {
+ return (mod && mod.__esModule) ? mod : { "default": mod };
+ };
+ Object.defineProperty(exports, "__esModule", { value: true });
+ exports.HttpApiSourceDriver = void 0;
+ const Affirm_1 = __importDefault(require("../core/Affirm"));
+ const SecretManager_1 = __importDefault(require("../engines/SecretManager"));
+ const Algo_1 = __importDefault(require("../core/Algo"));
+ const Logger_1 = __importDefault(require("../helper/Logger"));
+ const DriverHelper_1 = __importDefault(require("./DriverHelper"));
+ class HttpApiSourceDriver {
+ constructor() {
+ this.init = (source) => __awaiter(this, void 0, void 0, function* () {
+ (0, Affirm_1.default)(source, 'Invalid source');
+ (0, Affirm_1.default)(source.authentication, 'Invalid authentication');
+ (0, Affirm_1.default)(source.authentication.url, 'HTTP API source requires a URL in authentication.url');
+ this._source = source;
+ this._baseUrl = SecretManager_1.default.replaceSecret(source.authentication.url);
+ this._httpMethod = source.authentication.httpMethod || 'GET';
+ this._timeout = source.authentication.timeout || 30000; // 30 seconds default
+ this._headers = source.authentication.headers ? Object.assign({}, source.authentication.headers) : {};
+ this._queryParams = source.authentication.queryParams ? Object.assign({}, source.authentication.queryParams) : {};
+ // Handle different authentication methods
+ switch (source.authentication.method) {
+ case 'bearer-token': {
+ (0, Affirm_1.default)(source.authentication.bearerToken, 'Bearer token authentication requires bearerToken');
+ this._headers['Authorization'] = `Bearer ${SecretManager_1.default.replaceSecret(source.authentication.bearerToken)}`;
+ break;
+ }
+ case 'api-key': {
+ (0, Affirm_1.default)(source.authentication.apiKey, 'API key authentication requires apiKey');
+ const apiKeyHeader = source.authentication.apiKeyHeader || 'X-API-Key';
+ this._headers[apiKeyHeader] = SecretManager_1.default.replaceSecret(source.authentication.apiKey);
+ break;
+ }
+ case 'username-password': {
+ (0, Affirm_1.default)(source.authentication.user && source.authentication.password, 'Username-password authentication requires user and password');
+ const credentials = Buffer.from(`${SecretManager_1.default.replaceSecret(source.authentication.user)}:${SecretManager_1.default.replaceSecret(source.authentication.password)}`).toString('base64');
+ this._headers['Authorization'] = `Basic ${credentials}`;
+ break;
+ }
+ case 'none':
+ // No authentication required
+ break;
+ default:
+ throw new Error(`Authentication method "${source.authentication.method}" is not supported for HTTP API sources`);
+ }
+ // Test connection
+ try {
+ yield this._makeRequest(this._baseUrl);
+ Logger_1.default.log(`HTTP API connection to ${this._baseUrl} successful`);
+ }
+ catch (error) {
+ throw new Error(`Failed to connect to HTTP API at ${this._baseUrl}: ${error.message}`);
+ }
+ return this;
+ });
+ this._makeRequest = (url, options) => __awaiter(this, void 0, void 0, function* () {
+ const method = (options === null || options === void 0 ? void 0 : options.method) || this._httpMethod;
+ const headers = Object.assign(Object.assign({}, this._headers), options === null || options === void 0 ? void 0 : options.additionalHeaders);
+ const queryParams = Object.assign(Object.assign({}, this._queryParams), options === null || options === void 0 ? void 0 : options.additionalQueryParams);
+ // Build URL with query parameters
+ const urlWithParams = new URL(url);
+ Object.entries(queryParams).forEach(([key, value]) => {
+ urlWithParams.searchParams.append(key, value);
+ });
+ const fetchOptions = {
+ method,
+ headers,
+ signal: AbortSignal.timeout(this._timeout)
+ };
+ if ((options === null || options === void 0 ? void 0 : options.body) && (method === 'POST' || method === 'PUT' || method === 'PATCH')) {
+ fetchOptions.body = typeof options.body === 'string'
+ ? options.body
+ : JSON.stringify(options.body);
+ if (!headers['Content-Type']) {
+ headers['Content-Type'] = 'application/json';
+ }
+ }
+ const response = yield fetch(urlWithParams.toString(), fetchOptions);
+ if (!response.ok) {
+ throw new Error(`HTTP ${response.status}: ${response.statusText}`);
+ }
+ const contentType = response.headers.get('content-type');
+ if (contentType === null || contentType === void 0 ? void 0 : contentType.includes('application/json')) {
+ return yield response.json();
+ }
+ else {
+ return yield response.text();
+ }
+ });
+ this.execute = (_sql) => __awaiter(this, void 0, void 0, function* () {
+ void _sql;
+ throw new Error('SQL execution is not supported for HTTP API sources. Use query() or readAll() instead.');
+ });
+ this.query = (_sql, _values) => __awaiter(this, void 0, void 0, function* () {
+ void _sql;
+ void _values;
+ throw new Error('SQL queries are not supported for HTTP API sources. Use readAll() to fetch data from the API.');
+ });
+ this.exist = (producer) => __awaiter(this, void 0, void 0, function* () {
+ try {
+ const endpoint = producer.settings.fileKey || '';
+ const url = endpoint.startsWith('http') ? endpoint : `${this._baseUrl}${endpoint}`;
+ yield this._makeRequest(url, { method: 'HEAD' });
+ return true;
+ }
+ catch (error) {
+ if (error.message.includes('404')) {
+ return false;
+ }
+ throw error;
+ }
+ });
+ this.readAll = (request, values) => __awaiter(this, void 0, void 0, function* () {
+ (0, Affirm_1.default)(request, 'Invalid read request');
+ (0, Affirm_1.default)(request.fileKey, 'Invalid file key (endpoint path)');
+ const endpoint = request.fileKey;
+ const url = endpoint.startsWith('http') ? endpoint : `${this._baseUrl}${endpoint}`;
+ // Convert IQueryParameter[] to query params if provided
+ const additionalQueryParams = {};
+ if (values && values.length > 0) {
+ values.forEach(param => {
+ additionalQueryParams[param.name] = param.value;
+ });
+ }
+ const data = yield this._makeRequest(url, { additionalQueryParams });
+ // Convert response to string array (lines)
+ return this._extractObjectsFromResponse(data, request.httpApi).map(x => JSON.stringify(x));
+ });
+ this.readLinesInRange = (request) => __awaiter(this, void 0, void 0, function* () {
+ (0, Affirm_1.default)(request, 'Invalid read request');
+ (0, Affirm_1.default)(request.options, 'Invalid read request options');
+ const allLines = yield this.readAll(request);
+ const { lineFrom, lineTo } = request.options;
+ if (Algo_1.default.hasVal(lineFrom) && Algo_1.default.hasVal(lineTo)) {
+ return allLines.slice(lineFrom, lineTo);
+ }
+ return allLines;
+ });
+ this.download = (dataset) => __awaiter(this, void 0, void 0, function* () {
+ (0, Affirm_1.default)(dataset, 'Invalid dataset');
+ const file = dataset.getFile();
+ (0, Affirm_1.default)(file, 'Invalid dataset file');
+ (0, Affirm_1.default)(file.fileKey, 'Invalid file key (endpoint path)');
+ const endpoint = file.fileKey;
+ const url = endpoint.startsWith('http') ? endpoint : `${this._baseUrl}${endpoint}`;
+ const data = yield this._makeRequest(url);
+ const apiObjects = this._extractObjectsFromResponse(data, file.httpApi);
+ dataset.setFirstLine(JSON.stringify(apiObjects[0]));
+ const totalLineCount = yield DriverHelper_1.default.appendObjectsToUnifiedFile({
+ append: true,
+ delimiter: dataset.getDelimiter(),
+ destinationPath: dataset.getPath(),
+ objects: apiObjects
+ });
+ dataset.setCount(totalLineCount);
+ return dataset;
+ });
+ this._extractObjectsFromResponse = (data, httpApi) => {
+ let itemsData = [];
+ if (httpApi && httpApi.dataProperty && httpApi.dataProperty.length > 0) {
+ itemsData = data[httpApi.dataProperty];
+ }
+ else {
+ if (typeof data === 'string') {
+ itemsData = data.split('\n').filter(line => line.trim().length > 0);
+ }
+ else if (Array.isArray(data)) {
+ itemsData = data;
+ }
+ else if (typeof data === 'object' && data !== null) {
+ const dataObj = data;
+ if (dataObj.data && Array.isArray(dataObj.data)) {
+ itemsData = dataObj.data;
+ }
+ else if (dataObj.results && Array.isArray(dataObj.results)) {
+ itemsData = dataObj.results;
+ }
+ else if (dataObj.items && Array.isArray(dataObj.items)) {
+ itemsData = dataObj.items;
+ }
+ else {
+ // Single object, return as single line
+ itemsData = [data];
+ }
+ }
+ }
+ return itemsData;
+ };
+ }
+ }
+ exports.HttpApiSourceDriver = HttpApiSourceDriver;
+ exports.default = HttpApiSourceDriver;
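The new driver authenticates from `source.authentication` and fetches with the global `fetch` plus `AbortSignal.timeout` (Node 18+). Here is a rough usage sketch built from the fields the driver reads; the exact `source-schema.json` additions are not shown in this diff, so the require path, the source shape, and the secret placeholder syntax are all inferred, not documented API.

    // Inferred usage sketch; paths and field names are assumptions based on the diff above.
    const HttpApiSourceDriver = require('@forzalabs/remora/drivers/HttpApiDriver').default;

    const source = {
        authentication: {
            url: 'https://api.example.com',   // placeholder base URL
            method: 'bearer-token',           // or 'api-key', 'username-password', 'none'
            bearerToken: 'MY_API_TOKEN',      // resolved through SecretManager.replaceSecret
            httpMethod: 'GET',
            timeout: 15000,
            headers: { Accept: 'application/json' },
            queryParams: { limit: '100' }
        }
    };

    (async () => {
        try {
            const driver = await new HttpApiSourceDriver().init(source); // init() also test-pings the base URL
            // fileKey doubles as the endpoint path; httpApi.dataProperty picks the array out of the response
            const lines = await driver.readAll({ fileKey: '/v1/orders', httpApi: { dataProperty: 'results' } });
            console.log(`${lines.length} JSON lines fetched`);
        } catch (err) {
            console.error('HTTP API source failed:', err.message);
        }
    })();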
package/drivers/LocalDriver.js
CHANGED
@@ -65,6 +65,7 @@ const ParseHelper_1 = __importDefault(require("../engines/parsing/ParseHelper"))
  const FileExporter_1 = __importDefault(require("../engines/file/FileExporter"));
  const Logger_1 = __importDefault(require("../helper/Logger"));
  const DriverHelper_1 = __importDefault(require("./DriverHelper"));
+ const Constants_1 = __importDefault(require("../Constants"));
  class LocalSourceDriver {
  constructor() {
  this.init = (source) => __awaiter(this, void 0, void 0, function* () {
@@ -123,7 +124,8 @@ class LocalSourceDriver {
  (0, Affirm_1.default)(file, 'Invalid dataset file');
  (0, Affirm_1.default)(file.fileKey, 'Invalid file key');
  (0, Affirm_1.default)(file.fileType, `Invalid file type`);
- const
+ const includeSourceFilename = file.includeSourceFilename === true;
+ const copyLocally = (fileKey_1, headerLine_1, ...args_1) => __awaiter(this, [fileKey_1, headerLine_1, ...args_1], void 0, function* (fileKey, headerLine, appendMode = false, sourceFilename) {
  const sourceFilePath = path_1.default.join(this._path, fileKey);
  (0, Affirm_1.default)(fs.existsSync(sourceFilePath), `Source file does not exist: ${sourceFilePath}`);
  // Copy and validate header in a single stream pass
@@ -136,7 +138,8 @@ class LocalSourceDriver {
  headerLine,
  fileType: file.fileType,
  hasHeaderRow: file.hasHeaderRow,
- delimiter: dataset.getDelimiter()
+ delimiter: dataset.getDelimiter(),
+ sourceFilename
  });
  });
  const { fileKey } = file;
@@ -145,21 +148,32 @@ class LocalSourceDriver {
  Logger_1.default.log(`Matched ${allFileKeys.length} files, copying locally and creating unified dataset.`);
  Affirm_1.default.hasItems(allFileKeys, `The file key "${fileKey}" doesn't have any matches in path "${this._path}".`);
  // Get header line from the first file
-
+ let headerLine = (yield DriverHelper_1.default.quickReadFile(path_1.default.join(this._path, allFileKeys[0]), 1))[0];
+ // If including source filename, append a placeholder column name to the header
+ if (includeSourceFilename) {
+ headerLine = headerLine + dataset.getDelimiter() + Constants_1.default.SOURCE_FILENAME_COLUMN;
+ }
  dataset.setFirstLine(headerLine);
  let totalLineCount = 0;
  // Copy files sequentially to avoid file conflicts
  for (let i = 0; i < allFileKeys.length; i++) {
-
+ const currentFileKey = allFileKeys[i];
+ // Pass the filename (just the basename) if includeSourceFilename is enabled
+ const sourceFilename = includeSourceFilename ? path_1.default.basename(currentFileKey) : undefined;
+ totalLineCount += yield copyLocally(currentFileKey, headerLine, i > 0, sourceFilename); // Append mode for subsequent files
  }
  dataset.setCount(totalLineCount);
  return dataset;
  }
  else {
- // For single file,
-
+ // For single file, include the filename if configured
+ let headerLine = (yield DriverHelper_1.default.quickReadFile(path_1.default.join(this._path, fileKey), 1))[0];
+ if (includeSourceFilename) {
+ headerLine = headerLine + dataset.getDelimiter() + Constants_1.default.SOURCE_FILENAME_COLUMN;
+ }
  dataset.setFirstLine(headerLine);
- const
+ const sourceFilename = includeSourceFilename ? path_1.default.basename(fileKey) : undefined;
+ const totalLineCount = yield copyLocally(fileKey, headerLine, false, sourceFilename);
  dataset.setCount(totalLineCount);
  return dataset;
  }
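Both file drivers now honour a per-file `includeSourceFilename` flag: when it is true, the unified header gets an extra `Constants.SOURCE_FILENAME_COLUMN` column and every copied row is tagged with the basename of the file it came from. A small sketch of the resulting unified CSV for two matched files, assuming the `$source_filename` column name implied by the DriverHelper comment:

    // Sketch of the unified output; the column name is a stand-in for Constants.SOURCE_FILENAME_COLUMN.
    const path = require('path');
    const delimiter = ',';
    const SOURCE_FILENAME_COLUMN = '$source_filename';

    const files = {
        'exports/jan.csv': ['id,amount', '1,10', '2,20'],
        'exports/feb.csv': ['id,amount', '3,30']
    };

    const header = 'id,amount' + delimiter + SOURCE_FILENAME_COLUMN;
    const rows = Object.entries(files).flatMap(([fileKey, lines]) =>
        lines.slice(1).map(row => row + delimiter + path.basename(fileKey)));

    console.log([header, ...rows].join('\n'));
    // id,amount,$source_filename
    // 1,10,jan.csv
    // 2,20,jan.csv
    // 3,30,feb.csv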
package/drivers/S3Driver.js
CHANGED
@@ -24,6 +24,7 @@ const client_s3_1 = require("@aws-sdk/client-s3");
  const Affirm_1 = __importDefault(require("../core/Affirm"));
  const SecretManager_1 = __importDefault(require("../engines/SecretManager"));
  const readline_1 = __importDefault(require("readline"));
+ const path_1 = __importDefault(require("path"));
  const Algo_1 = __importDefault(require("../core/Algo"));
  const xlsx_1 = __importDefault(require("xlsx"));
  const XMLParser_1 = __importDefault(require("../engines/parsing/XMLParser")); // Added XMLParser import
@@ -32,6 +33,7 @@ const ParseHelper_1 = __importDefault(require("../engines/parsing/ParseHelper"))
  const FileExporter_1 = __importDefault(require("../engines/file/FileExporter"));
  const DriverHelper_1 = __importDefault(require("./DriverHelper"));
  const Logger_1 = __importDefault(require("../helper/Logger"));
+ const Constants_1 = __importDefault(require("../Constants"));
  class S3DestinationDriver {
  constructor() {
  this.init = (source) => __awaiter(this, void 0, void 0, function* () {
@@ -213,7 +215,8 @@ class S3SourceDriver {
  (0, Affirm_1.default)(file, 'Invalid dataset file');
  (0, Affirm_1.default)(file.fileKey, 'Invalid file key');
  (0, Affirm_1.default)(file.fileType, `Invalid file type`);
- const
+ const includeSourceFilename = file.includeSourceFilename === true;
+ const downloadLocally = (fileUrl_1, headerLine_1, ...args_1) => __awaiter(this, [fileUrl_1, headerLine_1, ...args_1], void 0, function* (fileUrl, headerLine, appendMode = false, sourceFilename) {
  // Download and validate header in a single stream pass
  const command = new client_s3_1.GetObjectCommand({
  Bucket: this._bucketName,
@@ -230,7 +233,8 @@ class S3SourceDriver {
  headerLine,
  fileType: file.fileType,
  hasHeaderRow: file.hasHeaderRow,
- delimiter: dataset.getDelimiter()
+ delimiter: dataset.getDelimiter(),
+ sourceFilename
  });
  });
  const { fileKey } = file;
@@ -246,18 +250,25 @@ class S3SourceDriver {
  const firstFileResponse = yield this._client.send(firstFileCommand);
  (0, Affirm_1.default)(firstFileResponse.Body, 'Failed to fetch first file from S3');
  const firstFileStream = firstFileResponse.Body;
-
+ let headerLine = yield this.getFirstLineFromStream(firstFileStream);
+ // If including source filename, append a placeholder column name to the header
+ if (includeSourceFilename) {
+ headerLine = headerLine + dataset.getDelimiter() + Constants_1.default.SOURCE_FILENAME_COLUMN;
+ }
  dataset.setFirstLine(headerLine);
  let totalLineCount = 0;
  // Download files sequentially to avoid file conflicts
  for (let i = 0; i < allFileKeys.length; i++) {
-
+ const currentFileKey = allFileKeys[i];
+ // Pass the filename (just the basename) if includeSourceFilename is enabled
+ const sourceFilename = includeSourceFilename ? path_1.default.basename(currentFileKey) : undefined;
+ totalLineCount += yield downloadLocally(currentFileKey, headerLine, i > 0, sourceFilename); // Append mode for subsequent files
  }
  dataset.setCount(totalLineCount);
  return dataset;
  }
  else {
- // Get header line from the
+ // Get header line from the single file
  const firstFileCommand = new client_s3_1.GetObjectCommand({
  Bucket: this._bucketName,
  Key: fileKey
@@ -265,10 +276,15 @@ class S3SourceDriver {
  const firstFileResponse = yield this._client.send(firstFileCommand);
  (0, Affirm_1.default)(firstFileResponse.Body, 'Failed to fetch first file from S3');
  const firstFileStream = firstFileResponse.Body;
-
+ let headerLine = yield this.getFirstLineFromStream(firstFileStream);
+ // If including source filename, append a placeholder column name to the header
+ if (includeSourceFilename) {
+ headerLine = headerLine + dataset.getDelimiter() + Constants_1.default.SOURCE_FILENAME_COLUMN;
+ }
  dataset.setFirstLine(headerLine);
- //
- const
+ // Pass the filename if includeSourceFilename is enabled
+ const sourceFilename = includeSourceFilename ? path_1.default.basename(fileKey) : undefined;
+ const totalLineCount = yield downloadLocally(fileKey, headerLine, false, sourceFilename);
  dataset.setCount(totalLineCount);
  return dataset;
  }
package/engines/UserManager.js
CHANGED
@@ -24,6 +24,18 @@ class UserManagerClass {
  return MOCK_USER;
  // TODO: figure out how to handle users
  };
+ this.getRemoraWorkerUser = () => {
+ const remora = {
+ _id: '__remora_worker__',
+ auth: { oid: '', provider: 'internal' },
+ email: '',
+ name: 'Remora Worker',
+ roles: ['root'],
+ _signature: '',
+ lastLogin: new Date().toJSON()
+ };
+ return remora;
+ };
  this.findOIDC = (oid) => __awaiter(this, void 0, void 0, function* () {
  return yield DatabaseEngine_1.default.findOne(Settings_1.default.db.collections.users, { 'auth.oid': oid });
  });
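`getRemoraWorkerUser()` gives scheduled or internal jobs a synthetic root identity to run under instead of a real account. A brief, hedged sketch of reading it (assuming the module's default export is the singleton UserManager instance; the require path is inferred):

    // Inferred sketch; export shape and path are assumptions.
    const UserManager = require('@forzalabs/remora/engines/UserManager').default;

    const worker = UserManager.getRemoraWorkerUser();
    console.log(worker._id, worker.roles); // "__remora_worker__" [ 'root' ]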
package/engines/ai/LLM.js
CHANGED
@@ -164,26 +164,6 @@ resulting consumer: """
  }
  """
  `;
- const baseQASystemPrompt = `
- # TASK
- You are an agent tasked with ensuring that the CONSUMER(S) created follow the guidelines given.
- You are going to receive a list of CONSUMERS and you need to return in the correct JSON format the same CONSUMERS with the needed updates to ensure that they follow all the rules.
-
- # CONSUMER DEFINITION
- A consumer takes the data from one or more producers and changes it's shape to transform it into the required output schema.
- ## FIELDS
- - fields.from: used to distinct between the producers imported by the consumer. The value is the name of the producer.
-
- # RULES
- - If a field is not needed, do not add it e.g.
- - Only import a producer once
- - Awlays include this exact property as the first -> "https://raw.githubusercontent.com/ForzaLabs/remora-public/refs/heads/main/json_schemas/consumer-schema.json",
- - Use "API" as the only valid output format.
- - The "from" must contain only the name of the producer
-
- # CONSUMERS
- {{consumers}}
- `;
  class LLM {
  constructor() {
  this.inferProducers = (input, outputs, fileName, sources) => __awaiter(this, void 0, void 0, function* () {
@@ -192,7 +172,7 @@ class LLM {
  systemPrompt = systemPrompt.replace('{{output data spec}}', outputs.map(x => `- ${JSON.stringify(x)}`).join('\n'));
  systemPrompt = systemPrompt.replace('{{file name}}', fileName);
  systemPrompt = systemPrompt.replace('{{sources}}', sources.map(x => `- ${JSON.stringify(x)}`).join('\n'));
- const res = yield this._client.
+ const res = yield this._client.chat.completions.create({
  model: 'gpt-4o',
  messages: [
  { role: 'system', content: systemPrompt }
@@ -219,7 +199,7 @@ class LLM {
  }), 'environment')
  });
  const msg = res.choices[0].message;
- return msg.
+ return JSON.parse(msg.content);
  });
  this.inferConsumers = (producers, outputs) => __awaiter(this, void 0, void 0, function* () {
  let systemPrompt = baseConsumersSystemPrompt;
@@ -261,9 +241,9 @@ class LLM {
  }))
  }), 'environment')
  };
- const res = yield this._client.
+ const res = yield this._client.chat.completions.create(item);
  const msg = res.choices[0].message;
- const finalDraft = msg.
+ const finalDraft = JSON.parse(msg.content);
  // Do some manual adjustments cause some things still don't work...
  if (finalDraft && finalDraft.consumers) {
  for (const cons of finalDraft.consumers) {
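The previously truncated client calls now read as standard OpenAI `chat.completions.create` requests whose message content is parsed as JSON. The pattern in isolation (model name and prompt handling come from the diff; it assumes the model returns a raw JSON string, which `JSON.parse(msg.content)` requires):

    // Minimal sketch of the call/parse pattern used above (OpenAI Node SDK v4+).
    const OpenAI = require('openai');

    async function inferAsJson(systemPrompt) {
        const client = new OpenAI(); // reads OPENAI_API_KEY from the environment
        const res = await client.chat.completions.create({
            model: 'gpt-4o',
            messages: [{ role: 'system', content: systemPrompt }]
        });
        const msg = res.choices[0].message;
        return JSON.parse(msg.content); // throws if the model wraps the JSON in prose or code fences
    }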
package/engines/consumer/ConsumerEngine.js
CHANGED
@@ -127,10 +127,10 @@ class ConsumerEngineClass {
  }
  }
  });
- this.execute = (consumer, options, user) => __awaiter(this, void 0, void 0, function* () {
+ this.execute = (consumer, options, user, details) => __awaiter(this, void 0, void 0, function* () {
  (0, Affirm_1.default)(consumer, `Invalid consumer`);
  (0, Affirm_1.default)(options, `Invalid execute consume options`);
- const { usageId } = UsageManager_1.default.startUsage(consumer, user);
+ const { usageId } = UsageManager_1.default.startUsage(consumer, user, details);
  try {
  const execution = new ExecutionEnvironment_1.default(consumer, usageId);
  const result = yield execution.run(options);
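`ConsumerEngine.execute` now threads an optional `details` argument through to `UsageManager.startUsage`, so a caller can attach context about what triggered a run. A hedged sketch of a call site (the shape of `details`, the export shapes, and the require paths are not defined in this diff and are illustrative only):

    // Illustrative call site; `details` shape and paths are assumptions, not documented API.
    const ConsumerEngine = require('@forzalabs/remora/engines/consumer/ConsumerEngine').default;
    const UserManager = require('@forzalabs/remora/engines/UserManager').default;

    async function runScheduledConsumer(consumer, options) {
        const user = UserManager.getRemoraWorkerUser(); // synthetic root identity for internal runs
        const details = { trigger: 'cron' };            // hypothetical usage metadata
        return ConsumerEngine.execute(consumer, options, user, details);
    }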
package/engines/dataset/Dataset.js
CHANGED
@@ -730,7 +730,7 @@ class Dataset {
  console.log(`DS ${this.name} (${this._count} | ${this._iterations})`);
  console.log(this._dimensions.map(x => x.name).join(this._delimiter));
  const records = yield this.readLines(count);
- records.forEach(x => console.log(full ? x : x.stringify()));
+ records.forEach((x, i) => console.log(`[${i}]`, full ? x : x.stringify()));
  console.log('----------');
  });
  this.printStats = () => {