@forzalabs/remora 0.0.26 → 0.0.28
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/Constants.js +1 -1
- package/definitions/json_schemas/producer-schema.json +13 -2
- package/definitions/json_schemas/source-schema.json +4 -0
- package/drivers/DriverFactory.js +6 -40
- package/drivers/LocalDriver.js +80 -49
- package/drivers/S3Driver.js +160 -2
- package/engines/ParseManager.js +20 -3
- package/engines/ProducerEngine.js +33 -17
- package/engines/ai/LLM.js +5 -5
- package/package.json +3 -1
package/Constants.js
CHANGED
package/definitions/json_schemas/producer-schema.json
CHANGED
@@ -132,13 +132,24 @@
         "enum": [
           "JSON",
           "JSONL",
-          "CSV"
+          "CSV",
+          "TXT",
+          "XLS",
+          "XLSX"
         ],
         "description": "The type of file to read"
       },
       "delimiter": {
         "type": "string",
-        "description": "The column delimiter for
+        "description": "The column delimiter for CSV or TXT files if different from the default (,)."
+      },
+      "hasHeaderRow": {
+        "type": "boolean",
+        "description": "For TXT files, specifies whether the file has a header row containing column names. Defaults to true."
+      },
+      "sheetName": {
+        "type": "string",
+        "description": "For Excel files (.xls/.xlsx), specifies the name of the sheet to read data from. If not specified, the first sheet will be used."
       }
     },
     "additionalProperties": false
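The schema additions above mean a producer can now target TXT and Excel files in addition to CSV/JSON/JSONL. A rough sketch of a settings object using the new fields (field names come from the schema above; the file name, sheet name and delimiter are invented for illustration):

const settings = {
    fileKey: 'reports/q1.xlsx',
    fileType: 'XLSX',        // 'TXT', 'XLS' and 'XLSX' are newly accepted here
    sheetName: 'Revenue'     // Excel only: which sheet to read (defaults to the first sheet)
};

// For a delimited text file with no header row, the equivalent would be roughly:
// { fileKey: 'dump.txt', fileType: 'TXT', delimiter: '|', hasHeaderRow: false }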
package/drivers/DriverFactory.js
CHANGED
@@ -1,37 +1,4 @@
 "use strict";
-var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
-    if (k2 === undefined) k2 = k;
-    var desc = Object.getOwnPropertyDescriptor(m, k);
-    if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
-      desc = { enumerable: true, get: function() { return m[k]; } };
-    }
-    Object.defineProperty(o, k2, desc);
-}) : (function(o, m, k, k2) {
-    if (k2 === undefined) k2 = k;
-    o[k2] = m[k];
-}));
-var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
-    Object.defineProperty(o, "default", { enumerable: true, value: v });
-}) : function(o, v) {
-    o["default"] = v;
-});
-var __importStar = (this && this.__importStar) || (function () {
-    var ownKeys = function(o) {
-        ownKeys = Object.getOwnPropertyNames || function (o) {
-            var ar = [];
-            for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
-            return ar;
-        };
-        return ownKeys(o);
-    };
-    return function (mod) {
-        if (mod && mod.__esModule) return mod;
-        var result = {};
-        if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
-        __setModuleDefault(result, mod);
-        return result;
-    };
-})();
 var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
     function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
     return new (P || (P = Promise))(function (resolve, reject) {
@@ -45,10 +12,9 @@ var __importDefault = (this && this.__importDefault) || function (mod) {
     return (mod && mod.__esModule) ? mod : { "default": mod };
 };
 Object.defineProperty(exports, "__esModule", { value: true });
-const LocalDriver_1 =
+const LocalDriver_1 = require("./LocalDriver");
 const RedshiftDriver_1 = __importDefault(require("./RedshiftDriver"));
-const S3Driver_1 =
-const S3SourceDriver_1 = __importDefault(require("./S3SourceDriver"));
+const S3Driver_1 = require("./S3Driver");
 class DriverFactoryClass {
     constructor() {
         this.instantiateSource = (source) => __awaiter(this, void 0, void 0, function* () {
@@ -60,12 +26,12 @@ class DriverFactoryClass {
                     return driver;
                 }
                 case 'aws-s3': {
-                    const driver = new
+                    const driver = new S3Driver_1.S3SourceDriver();
                     yield driver.init(source);
                     return driver;
                 }
                 case 'local': {
-                    const driver = new LocalDriver_1.
+                    const driver = new LocalDriver_1.LocalSourceDriver();
                     yield driver.init(source);
                     return driver;
                 }
@@ -76,12 +42,12 @@ class DriverFactoryClass {
             switch (source.engine) {
                 // TODO: implement all the other engines
                 case 'aws-s3': {
-                    const driver = new S3Driver_1.
+                    const driver = new S3Driver_1.S3DestinationDriver();
                     yield driver.init(source);
                     return driver;
                 }
                 case 'local': {
-                    const driver = new LocalDriver_1.
+                    const driver = new LocalDriver_1.LocalDestinationDriver();
                     yield driver.init(source);
                     return driver;
                 }
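With the factory now returning dedicated source and destination drivers, a minimal usage sketch looks roughly like this (the source object and file names are hypothetical; instantiateSource and the download request shape are taken from this diff):

const DriverFactory = require('./drivers/DriverFactory').default;

async function readLocalCsv() {
    // a 'local' source resolves files relative to authentication.path (see LocalDriver.js below)
    const source = { engine: 'local', authentication: { path: './data' } };
    const driver = await DriverFactory.instantiateSource(source);
    // returns the raw lines of the file; options.sheetName only matters for XLS/XLSX
    return driver.download({ fileKey: 'users.csv', fileType: 'CSV', options: {} });
}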
package/drivers/LocalDriver.js
CHANGED
@@ -52,12 +52,14 @@ var __importDefault = (this && this.__importDefault) || function (mod) {
     return (mod && mod.__esModule) ? mod : { "default": mod };
 };
 Object.defineProperty(exports, "__esModule", { value: true });
-exports.
+exports.LocalDestinationDriver = exports.LocalSourceDriver = void 0;
 const fs = __importStar(require("fs"));
 const path_1 = __importDefault(require("path"));
 const readline_1 = __importDefault(require("readline"));
 const Affirm_1 = __importDefault(require("../core/Affirm"));
-
+const Algo_1 = __importDefault(require("../core/Algo"));
+const xlsx_1 = __importDefault(require("xlsx"));
+class LocalSourceDriver {
     constructor() {
         this.init = (source) => __awaiter(this, void 0, void 0, function* () {
             const fileURL = source.authentication['path'];
@@ -68,79 +70,109 @@ class LocalDriver {
             return this;
         });
         this.download = (request) => __awaiter(this, void 0, void 0, function* () {
-            var _a, e_1, _b, _c;
             (0, Affirm_1.default)(this._path, `Invalid path`);
             (0, Affirm_1.default)(request, `Invalid download request`);
             (0, Affirm_1.default)(request.fileKey, `Invalid file key for download request`);
-
-            const
-            const
-
-
-
-
-
-
-
-
+            (0, Affirm_1.default)(request.fileType, `Invalid file type for download request`);
+            const { fileKey, options } = request;
+            const fileUrl = path_1.default.join(this._path, fileKey);
+            switch (request.fileType) {
+                case 'CSV':
+                case 'JSON':
+                case 'JSONL':
+                case 'TXT':
+                    return yield this._readLines(fileUrl);
+                case 'XLS':
+                case 'XLSX':
+                    return yield this._readExcelLines(fileUrl, options === null || options === void 0 ? void 0 : options.sheetName);
             }
-
-
-
-
-
-
+        });
+        this.readLinesInRange = (request) => __awaiter(this, void 0, void 0, function* () {
+            (0, Affirm_1.default)(this._path, `Invalid path`);
+            (0, Affirm_1.default)(request, 'Invalid read options');
+            (0, Affirm_1.default)(request.fileKey, 'Invalid file key');
+            (0, Affirm_1.default)(request.fileType, `Invalid file type`);
+            (0, Affirm_1.default)(request.options, `Invalid request options`);
+            Affirm_1.default.hasValue(request.options.lineFrom, `Invalid request options line from`);
+            Affirm_1.default.hasValue(request.options.lineTo, `Invalid request options line to`);
+            const { fileKey, fileType, options: { lineFrom, lineTo, sheetName } } = request;
+            const fileUrl = path_1.default.join(this._path, fileKey);
+            switch (fileType) {
+                case 'CSV':
+                case 'JSON':
+                case 'JSONL':
+                case 'TXT':
+                    return yield this._readLines(fileUrl, lineFrom, lineTo);
+                case 'XLS':
+                case 'XLSX':
+                    return yield this._readExcelLines(fileUrl, sheetName, lineFrom, lineTo);
             }
-            reader.close();
-            stream.close();
-            return lines;
         });
-        this.
-            var _a, e_2, _b, _c;
+        this.exist = (producer) => __awaiter(this, void 0, void 0, function* () {
             (0, Affirm_1.default)(this._path, `Invalid path`);
-            (0, Affirm_1.default)(
-
-
-            const
-
+            (0, Affirm_1.default)(producer, `Invalid producer`);
+            const fileKey = producer.settings.fileKey;
+            (0, Affirm_1.default)(fileKey, `Invalid file key for download request`);
+            const fileUrl = path_1.default.join(this._path, fileKey);
+            return fs.existsSync(fileUrl);
+        });
+        this._readLines = (fileUri, lineFrom, lineTo) => __awaiter(this, void 0, void 0, function* () {
+            var _a, e_1, _b, _c;
+            const stream = fs.createReadStream(fileUri);
             const reader = readline_1.default.createInterface({ input: stream, crlfDelay: Infinity });
             const lines = [];
             let lineCounter = 0;
             try {
-                for (var _d = true,
-                    _c =
+                for (var _d = true, reader_1 = __asyncValues(reader), reader_1_1; reader_1_1 = yield reader_1.next(), _a = reader_1_1.done, !_a; _d = true) {
+                    _c = reader_1_1.value;
                     _d = false;
                     const line = _c;
-                    if (
+                    if (Algo_1.default.hasVal(lineFrom) && Algo_1.default.hasVal(lineTo)) {
+                        if (lineCounter >= lineFrom && lineCounter < lineTo) {
+                            lines.push(line);
+                        }
+                        lineCounter++;
+                        if (lineCounter >= lineTo)
+                            break;
+                    }
+                    else {
                         lines.push(line);
                     }
-                    lineCounter++;
-                    if (lineCounter >= lineTo)
-                        break;
                 }
             }
-            catch (
+            catch (e_1_1) { e_1 = { error: e_1_1 }; }
            finally {
                 try {
-                    if (!_d && !_a && (_b =
+                    if (!_d && !_a && (_b = reader_1.return)) yield _b.call(reader_1);
                 }
-                finally { if (
+                finally { if (e_1) throw e_1.error; }
             }
             reader.close();
             stream.close();
             return lines;
         });
-        this.
-
-
-
-
-
-
+        this._readExcelLines = (fileUri, sheetName, lineFrom, lineTo) => __awaiter(this, void 0, void 0, function* () {
+            const excel = xlsx_1.default.readFile(fileUri);
+            let targetSheetName = sheetName;
+            if (!targetSheetName) {
+                (0, Affirm_1.default)(excel.SheetNames.length > 0, 'The Excel file has no sheets.');
+                targetSheetName = excel.SheetNames[0];
+            }
+            else {
+                (0, Affirm_1.default)(excel.SheetNames.includes(targetSheetName), `The sheet "${targetSheetName}" doesn't exist in the excel (available: ${excel.SheetNames.join(', ')})`);
+            }
+            const sheet = excel.Sheets[targetSheetName];
+            const csv = xlsx_1.default.utils.sheet_to_csv(sheet);
+            const lines = csv.split('\n');
+            if (Algo_1.default.hasVal(lineFrom) && Algo_1.default.hasVal(lineTo))
+                return lines.slice(lineFrom, lineTo + 1);
+            else
+                return lines;
         });
     }
 }
-
+exports.LocalSourceDriver = LocalSourceDriver;
+class LocalDestinationDriver {
     constructor() {
         this.init = (source) => __awaiter(this, void 0, void 0, function* () {
             (0, Affirm_1.default)(source, `Invalid source`);
@@ -206,5 +238,4 @@ class LocalDriverDestination {
         });
     }
 }
-exports.
-exports.default = LocalDriver;
+exports.LocalDestinationDriver = LocalDestinationDriver;
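The Excel path in LocalSourceDriver above boils down to: read the workbook, pick the requested (or first) sheet, convert it to CSV lines, and optionally slice a line range. A standalone sketch of that approach, assuming the xlsx package added in this release:

const XLSX = require('xlsx');

function readSheetAsLines(filePath, sheetName, lineFrom, lineTo) {
    const workbook = XLSX.readFile(filePath);
    // fall back to the first sheet when no sheet name is given, as the driver does
    const target = sheetName || workbook.SheetNames[0];
    const csv = XLSX.utils.sheet_to_csv(workbook.Sheets[target]);
    const lines = csv.split('\n');
    return lineFrom != null && lineTo != null ? lines.slice(lineFrom, lineTo + 1) : lines;
}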
package/drivers/S3Driver.js
CHANGED
@@ -8,14 +8,25 @@ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
         step((generator = generator.apply(thisArg, _arguments || [])).next());
     });
 };
+var __asyncValues = (this && this.__asyncValues) || function (o) {
+    if (!Symbol.asyncIterator) throw new TypeError("Symbol.asyncIterator is not defined.");
+    var m = o[Symbol.asyncIterator], i;
+    return m ? m.call(o) : (o = typeof __values === "function" ? __values(o) : o[Symbol.iterator](), i = {}, verb("next"), verb("throw"), verb("return"), i[Symbol.asyncIterator] = function () { return this; }, i);
+    function verb(n) { i[n] = o[n] && function (v) { return new Promise(function (resolve, reject) { v = o[n](v), settle(resolve, reject, v.done, v.value); }); }; }
+    function settle(resolve, reject, d, v) { Promise.resolve(v).then(function(v) { resolve({ value: v, done: d }); }, reject); }
+};
 var __importDefault = (this && this.__importDefault) || function (mod) {
     return (mod && mod.__esModule) ? mod : { "default": mod };
 };
 Object.defineProperty(exports, "__esModule", { value: true });
+exports.S3SourceDriver = exports.S3DestinationDriver = void 0;
 const client_s3_1 = require("@aws-sdk/client-s3");
 const Affirm_1 = __importDefault(require("../core/Affirm"));
 const SecretManager_1 = __importDefault(require("../engines/SecretManager"));
-
+const readline_1 = __importDefault(require("readline"));
+const Algo_1 = __importDefault(require("../core/Algo"));
+const xlsx_1 = __importDefault(require("xlsx"));
+class S3DestinationDriver {
     constructor() {
         this.init = (source) => __awaiter(this, void 0, void 0, function* () {
             this._bucketName = source.authentication['bucket'];
@@ -99,4 +110,151 @@ class S3Driver {
         });
     }
 }
-exports.
+exports.S3DestinationDriver = S3DestinationDriver;
+class S3SourceDriver {
+    constructor() {
+        this.init = (source) => __awaiter(this, void 0, void 0, function* () {
+            this._bucketName = source.authentication['bucket'];
+            const sessionToken = SecretManager_1.default.replaceSecret(source.authentication['sessionToken']);
+            const config = {
+                region: source.authentication['region'],
+                credentials: {
+                    accessKeyId: SecretManager_1.default.replaceSecret(source.authentication['accessKey']),
+                    secretAccessKey: SecretManager_1.default.replaceSecret(source.authentication['secretKey']),
+                    sessionToken: sessionToken ? sessionToken : undefined
+                }
+            };
+            this._client = new client_s3_1.S3Client(config);
+            // TODO: is there a way to test if the connection was successful? like a query or scan that I can do?
+            return this;
+        });
+        this.download = (request) => __awaiter(this, void 0, void 0, function* () {
+            (0, Affirm_1.default)(this._client, 'S3 client not yet initialized, call "connect()" first');
+            (0, Affirm_1.default)(request, `Invalid download request`);
+            (0, Affirm_1.default)(request.fileKey, `Invalid file key for download request`);
+            const { fileKey, fileType, options } = request;
+            const bucket = this._bucketName;
+            const response = yield this._client.send(new client_s3_1.GetObjectCommand({
+                Bucket: bucket,
+                Key: fileKey
+            }));
+            (0, Affirm_1.default)(response.Body, 'Failed to fetch object from S3');
+            const stream = response.Body;
+            switch (fileType) {
+                case 'CSV':
+                case 'JSON':
+                case 'JSONL':
+                case 'TXT':
+                    return yield this._readLines(stream);
+                case 'XLS':
+                case 'XLSX':
+                    return yield this._readExcelLines(stream, options === null || options === void 0 ? void 0 : options.sheetName);
+            }
+        });
+        this.readLinesInRange = (request) => __awaiter(this, void 0, void 0, function* () {
+            (0, Affirm_1.default)(this._client, 'S3 client not yet initialized, call "connect()" first');
+            (0, Affirm_1.default)(request, 'Invalid read request');
+            (0, Affirm_1.default)(request.options, 'Invalid read request options');
+            const { fileKey, fileType, options: { sheetName, lineFrom, lineTo } } = request;
+            const bucket = this._bucketName;
+            const response = yield this._client.send(new client_s3_1.GetObjectCommand({
+                Bucket: bucket,
+                Key: fileKey
+            }));
+            (0, Affirm_1.default)(response.Body, 'Failed to fetch object from S3');
+            const stream = response.Body;
+            switch (fileType) {
+                case 'CSV':
+                case 'JSON':
+                case 'JSONL':
+                case 'TXT':
+                    return yield this._readLines(stream, lineFrom, lineTo);
+                case 'XLS':
+                case 'XLSX':
+                    return yield this._readExcelLines(stream, sheetName, lineFrom, lineTo);
+            }
+        });
+        this.exist = (producer) => __awaiter(this, void 0, void 0, function* () {
+            var _a;
+            (0, Affirm_1.default)(this._client, 'S3 client not yet initialized, call "connect()" first');
+            (0, Affirm_1.default)(producer, 'Invalid read producer');
+            const bucket = this._bucketName;
+            const fileKey = producer.settings.fileKey;
+            (0, Affirm_1.default)(fileKey, `Invalid file key for download request`);
+            try {
+                yield this._client.send(new client_s3_1.HeadObjectCommand({ Bucket: bucket, Key: fileKey }));
+                return true;
+            }
+            catch (error) {
+                if (((_a = error.$metadata) === null || _a === void 0 ? void 0 : _a.httpStatusCode) === 404 || error.name === 'NotFound')
+                    return false;
+                throw error;
+            }
+        });
+        this._readLines = (stream, lineFrom, lineTo) => __awaiter(this, void 0, void 0, function* () {
+            var _a, e_1, _b, _c;
+            const reader = readline_1.default.createInterface({ input: stream, crlfDelay: Infinity });
+            const lines = [];
+            let lineCounter = 0;
+            try {
+                for (var _d = true, reader_1 = __asyncValues(reader), reader_1_1; reader_1_1 = yield reader_1.next(), _a = reader_1_1.done, !_a; _d = true) {
+                    _c = reader_1_1.value;
+                    _d = false;
+                    const line = _c;
+                    if (Algo_1.default.hasVal(lineFrom) && Algo_1.default.hasVal(lineTo)) {
+                        if (lineCounter >= lineFrom && lineCounter < lineTo) {
+                            lines.push(line);
+                        }
+                        lineCounter++;
+                        if (lineCounter >= lineTo)
+                            break;
+                    }
+                    else {
+                        lines.push(line);
+                    }
+                }
+            }
+            catch (e_1_1) { e_1 = { error: e_1_1 }; }
+            finally {
+                try {
+                    if (!_d && !_a && (_b = reader_1.return)) yield _b.call(reader_1);
+                }
+                finally { if (e_1) throw e_1.error; }
+            }
+            reader.close();
+            return lines;
+        });
+        this._readExcelLines = (stream, sheetName, lineFrom, lineTo) => __awaiter(this, void 0, void 0, function* () {
+            var _a, stream_1, stream_1_1;
+            var _b, e_2, _c, _d;
+            (0, Affirm_1.default)(sheetName, `Invalid sheetname`);
+            const chunks = [];
+            try {
+                for (_a = true, stream_1 = __asyncValues(stream); stream_1_1 = yield stream_1.next(), _b = stream_1_1.done, !_b; _a = true) {
+                    _d = stream_1_1.value;
+                    _a = false;
+                    const chunk = _d;
+                    chunks.push(chunk);
+                }
+            }
+            catch (e_2_1) { e_2 = { error: e_2_1 }; }
+            finally {
+                try {
+                    if (!_a && !_b && (_c = stream_1.return)) yield _c.call(stream_1);
+                }
+                finally { if (e_2) throw e_2.error; }
+            }
+            const buffer = Buffer.concat(chunks);
+            const excel = xlsx_1.default.read(buffer, { type: 'buffer' });
+            (0, Affirm_1.default)(excel.SheetNames.includes(sheetName), `The sheet "${sheetName}" doesn't exist in the excel (available: ${excel.SheetNames.join(', ')})`);
+            const sheet = excel.Sheets[sheetName];
+            const csv = xlsx_1.default.utils.sheet_to_csv(sheet);
+            const lines = csv.split('\n');
+            if (Algo_1.default.hasVal(lineFrom) && Algo_1.default.hasVal(lineTo))
+                return lines.slice(lineFrom, lineTo + 1);
+            else
+                return lines;
+        });
+    }
+}
+exports.S3SourceDriver = S3SourceDriver;
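S3SourceDriver above streams GetObject bodies through readline for line-based formats, buffers the whole body before handing it to xlsx for Excel files, and implements exist() with HeadObjectCommand. A minimal sketch of that existence-check pattern, using only @aws-sdk/client-s3 calls that appear in the diff (bucket and key values are hypothetical):

const { S3Client, HeadObjectCommand } = require('@aws-sdk/client-s3');

async function objectExists(client, bucket, key) {
    try {
        await client.send(new HeadObjectCommand({ Bucket: bucket, Key: key }));
        return true;
    } catch (error) {
        // a missing object surfaces as a 404 / NotFound error rather than an empty response
        if (error.$metadata?.httpStatusCode === 404 || error.name === 'NotFound') return false;
        throw error;
    }
}

// usage (hypothetical): await objectExists(new S3Client({ region: 'us-east-1' }), 'my-bucket', 'data/users.csv')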
package/engines/ParseManager.js
CHANGED
@@ -18,9 +18,9 @@ class ParseManagerClass {
             (0, Affirm_1.default)(lines, 'Invalid csv lines');
             Affirm_1.default.hasValue(lines.length, 'Invalid csv lines length');
             const delimiterChar = (_a = producer.settings.delimiter) !== null && _a !== void 0 ? _a : ',';
-            const
-            const
-            const
+            const { header, records } = this._getClassifiedRows(lines, delimiterChar, producer);
+            const headerColumns = this._extractHeader(header, delimiterChar, producer, discover);
+            const rows = records.map(x => x.split(delimiterChar).map(k => k.trim()));
             const result = [];
             for (const row of rows) {
                 const rowObject = {};
@@ -32,6 +32,23 @@ class ParseManagerClass {
             }
             return result;
         };
+        this._getClassifiedRows = (lines, delimiterChar, producer) => {
+            if (producer.settings.fileType === 'TXT' && !producer.settings.hasHeaderRow) {
+                // If the file is a TXT and there isn't an header row, then I add a fake one that maps directly to the producer
+                const source = Environment_1.default.getSource(producer.source);
+                const columns = FileCompiler_1.default.compileProducer(producer, source);
+                return {
+                    header: columns.map(x => x.nameInProducer).join(delimiterChar),
+                    records: lines
+                };
+            }
+            else {
+                return {
+                    header: lines[0],
+                    records: lines.slice(1)
+                };
+            }
+        };
         this._extractHeader = (headerLine, delimiter, producer, discover) => {
             var _a;
             (0, Affirm_1.default)(headerLine, `Invalid CSV header line for producer "${producer.name}"`);
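The new _getClassifiedRows step above lets headerless TXT files reuse the existing CSV pipeline: when hasHeaderRow is false, a synthetic header is built from the producer's column names and all raw lines are kept as records. A simplified standalone sketch of that split (Environment and FileCompiler are project internals, so the column names are passed in directly here):

function classifyRows(lines, delimiter, hasHeaderRow, columnNames) {
    if (!hasHeaderRow) {
        // no header in the file: synthesize one from the producer's column names
        return { header: columnNames.join(delimiter), records: lines };
    }
    return { header: lines[0], records: lines.slice(1) };
}

// classifyRows(['1|ada', '2|bob'], '|', false, ['id', 'name'])
// -> { header: 'id|name', records: ['1|ada', '2|bob'] }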
package/engines/ProducerEngine.js
CHANGED
@@ -91,19 +91,24 @@ class ProducerEngineClass {
             (0, Affirm_1.default)(source, `No source found for producer "${producer.name}" with name "${producer.source}"`);
             const driver = yield DriverFactory_1.default.instantiateSource(source);
             (0, Affirm_1.default)(driver, `No driver found for producer "${producer.name}" with driver type "${source.engine}"`);
+            const { settings: { fileKey, fileType, sheetName } } = producer;
             let lines = [];
             if (options.readmode === 'lines')
-                lines = yield driver.readLinesInRange({ fileKey
+                lines = yield driver.readLinesInRange({ fileKey, fileType, options: { lineFrom: options.lines.from, lineTo: options.lines.to, sheetName } });
             else
-                lines = yield driver.download({ fileKey:
+                lines = yield driver.download({ fileKey, fileType, options: { sheetName } });
             switch ((_a = producer.settings.fileType) === null || _a === void 0 ? void 0 : _a.toUpperCase()) {
-                case 'CSV':
+                case 'CSV':
+                case 'TXT':
+                    return { data: lines, dataType: 'lines-of-text' };
+                case 'XLS':
+                case 'XLSX':
                     return { data: lines, dataType: 'lines-of-text' };
-                }
                 case 'JSONL':
                 case 'JSON': {
-                    if (lines.length === 1)
+                    if (lines.length === 1) {
                         lines = lines[0].split('\n');
+                    }
                     const json = lines.map(x => JSON.parse(x));
                     return { data: json, dataType: 'array-of-json' };
                 }
@@ -112,7 +117,7 @@ class ProducerEngineClass {
             }
         });
         this.readSampleData = (producer_1, ...args_1) => __awaiter(this, [producer_1, ...args_1], void 0, function* (producer, sampleSize = 10, discover = false) {
-            var _a
+            var _a;
             (0, Affirm_1.default)(producer, 'Invalid producer');
             (0, Affirm_1.default)(sampleSize > 0, 'Sample size must be greater than 0');
             const source = Environment_1.default.getSource(producer.source);
@@ -131,18 +136,29 @@
             case 'local':
             case 'aws-s3': {
                 const fileData = yield this.readFile(producer, { readmode: 'lines', lines: { from: 0, to: sampleSize } });
-
-
-
-
-
-
+                const fileType = (_a = producer.settings.fileType) === null || _a === void 0 ? void 0 : _a.toUpperCase();
+                switch (fileType) {
+                    case 'CSV':
+                    case 'TXT':
+                    case 'XLS':
+                    case 'XLSX': {
+                        sampleData = ParseManager_1.default.csvLinesToJson(fileData.data, producer, discover);
+                        break;
+                    }
+                    case 'JSON':
+                    case 'JSONL': {
+                        // With JSON or JSONL the readFile function already parses the strings
+                        if (typeof fileData.data[0] === 'object')
+                            sampleData = fileData.data;
+                        else
+                            sampleData = fileData.data.map(line => JSON.parse(line));
+                        sampleData = sampleData.slice(0, sampleSize);
+                        break;
+                    }
+                    default: {
                         sampleData = fileData.data;
-
-
-                }
-                else {
-                    sampleData = fileData.data;
+                        break;
+                    }
                 }
                 break;
             }
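readFile above now forwards the producer's file settings to the drivers and treats CSV/TXT/XLS/XLSX uniformly as lines of text, while JSON/JSONL lines are parsed. A sketch of the request shapes it builds (field names come from the destructuring in the hunk; the producer values are invented):

const producer = {
    name: 'revenue',
    source: 'local-files',
    settings: { fileKey: 'revenue.xlsx', fileType: 'XLSX', sheetName: 'Q1' }
};
const { settings: { fileKey, fileType, sheetName } } = producer;

// readmode === 'lines' -> driver.readLinesInRange(...)
const rangeRequest = { fileKey, fileType, options: { lineFrom: 0, lineTo: 10, sheetName } };
// any other readmode -> driver.download(...)
const downloadRequest = { fileKey, fileType, options: { sheetName } };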
package/engines/ai/LLM.js
CHANGED
@@ -202,14 +202,14 @@ class LLM {
             $schema: zod_2.z.string().describe('The schema of the producer. This should always be the same.'),
             name: zod_2.z.string(),
             description: zod_2.z.string(),
+            source: zod_2.z.string().describe('The name of the source linked to this producer.'),
+            settings: zod_2.z.object({
+                fileKey: zod_2.z.string().describe('The name of the file'),
+                fileType: zod_2.z.string().describe('The file extension (CSV | JSONL | JSON)')
+            }),
             dimensions: zod_2.z.array(zod_2.z.object({
                 name: zod_2.z.string(),
                 // alias: z.string().optional(),
-                source: zod_2.z.string().describe('The name of the source linked to this producer.'),
-                settings: zod_2.z.object({
-                    fileKey: zod_2.z.string().describe('The name of the file'),
-                    fileType: zod_2.z.string().describe('The file extension (CSV | JSONL | JSON)')
-                }),
                 description: zod_2.z.string().optional(),
                 type: zod_2.z.enum(['string', 'number', 'datetime']),
                 pk: zod_2.z.boolean().optional(),
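The LLM change above moves source and settings from the per-dimension object up to the top level of the producer schema. Roughly, the corrected shape is the following (abridged sketch using plain zod; only fields visible in the hunk are included):

const { z } = require('zod');

const producerSchema = z.object({
    $schema: z.string(),
    name: z.string(),
    description: z.string(),
    source: z.string(),
    settings: z.object({ fileKey: z.string(), fileType: z.string() }),
    dimensions: z.array(z.object({
        name: z.string(),
        description: z.string().optional(),
        type: z.enum(['string', 'number', 'datetime']),
        pk: z.boolean().optional()
    }))
});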
package/package.json
CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "@forzalabs/remora",
-  "version": "0.0.26",
+  "version": "0.0.28",
   "description": "A powerful CLI tool for seamless data translation.",
   "main": "index.js",
   "private": false,
@@ -9,6 +9,7 @@
   },
   "scripts": {
     "sync": "cd ../dev_ops && npm run sync",
+    "tsc-check": "npx tsc --noemit",
     "init": "npx tsx ./src/index.ts init",
     "version": "npx tsx ./src/index.ts -v",
     "run": "npx tsx ./src/index.ts run",
@@ -53,6 +54,7 @@
     "ora": "^5.4.1",
     "react": "^18.2.0",
     "react-dom": "^18.2.0",
+    "xlsx": "^0.18.5",
     "zod": "^3.24.2"
   },
   "devDependencies": {