@forzalabs/remora 1.0.21 → 1.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (111) hide show
  1. package/actions/automap.js +26 -42
  2. package/actions/compile.js +27 -43
  3. package/actions/create_consumer.js +24 -40
  4. package/actions/create_producer.js +16 -32
  5. package/actions/debug.js +18 -34
  6. package/actions/deploy.js +30 -46
  7. package/actions/discover.js +13 -29
  8. package/actions/init.js +29 -45
  9. package/actions/mock.js +16 -32
  10. package/actions/run.js +34 -52
  11. package/actions/sample.js +42 -58
  12. package/index.js +38 -43
  13. package/package.json +4 -4
  14. package/workers/ExecutorWorker.js +18 -32
  15. package/Constants.js +0 -34
  16. package/core/Affirm.js +0 -42
  17. package/core/Algo.js +0 -160
  18. package/core/dste/DSTE.js +0 -113
  19. package/core/logger/DebugLogService.js +0 -48
  20. package/core/logger/DevelopmentLogService.js +0 -70
  21. package/core/logger/LocalLogService.js +0 -70
  22. package/core/logger/Logger.js +0 -54
  23. package/database/DatabaseEngine.js +0 -149
  24. package/database/DatabaseStructure.js +0 -27
  25. package/definitions/DatasetDefinitions.js +0 -2
  26. package/definitions/ExecutorDefinitions.js +0 -2
  27. package/definitions/ProcessENV.js +0 -2
  28. package/definitions/agents/DestinationDriver.js +0 -2
  29. package/definitions/agents/SourceDriver.js +0 -2
  30. package/definitions/cli.js +0 -2
  31. package/definitions/database/ApiKeys.js +0 -2
  32. package/definitions/database/Stored.js +0 -7
  33. package/definitions/database/UsageStat.js +0 -2
  34. package/definitions/database/User.js +0 -2
  35. package/definitions/json_schemas/consumer-schema.json +0 -1226
  36. package/definitions/json_schemas/producer-schema.json +0 -308
  37. package/definitions/json_schemas/project-schema.json +0 -100
  38. package/definitions/json_schemas/source-schema.json +0 -249
  39. package/definitions/requests/ConsumerRequest.js +0 -2
  40. package/definitions/requests/Developer.js +0 -2
  41. package/definitions/requests/Mapping.js +0 -2
  42. package/definitions/requests/ProducerRequest.js +0 -2
  43. package/definitions/requests/Request.js +0 -2
  44. package/definitions/resources/Compiled.js +0 -2
  45. package/definitions/resources/Consumer.js +0 -2
  46. package/definitions/resources/Environment.js +0 -2
  47. package/definitions/resources/Library.js +0 -2
  48. package/definitions/resources/Producer.js +0 -2
  49. package/definitions/resources/Project.js +0 -2
  50. package/definitions/resources/Schema.js +0 -2
  51. package/definitions/resources/Source.js +0 -2
  52. package/definitions/temp.js +0 -2
  53. package/definitions/transform/Transformations.js +0 -2
  54. package/drivers/DeltaShareDriver.js +0 -186
  55. package/drivers/DriverFactory.js +0 -72
  56. package/drivers/DriverHelper.js +0 -248
  57. package/drivers/HttpApiDriver.js +0 -208
  58. package/drivers/RedshiftDriver.js +0 -184
  59. package/drivers/files/LocalDestinationDriver.js +0 -146
  60. package/drivers/files/LocalSourceDriver.js +0 -405
  61. package/drivers/s3/S3DestinationDriver.js +0 -197
  62. package/drivers/s3/S3SourceDriver.js +0 -495
  63. package/engines/CryptoEngine.js +0 -75
  64. package/engines/Environment.js +0 -170
  65. package/engines/ProcessENVManager.js +0 -83
  66. package/engines/RandomEngine.js +0 -47
  67. package/engines/SecretManager.js +0 -23
  68. package/engines/UserManager.js +0 -66
  69. package/engines/ai/AutoMapperEngine.js +0 -37
  70. package/engines/ai/DeveloperEngine.js +0 -497
  71. package/engines/ai/LLM.js +0 -255
  72. package/engines/consumer/ConsumerManager.js +0 -218
  73. package/engines/consumer/ConsumerOnFinishManager.js +0 -202
  74. package/engines/dataset/Dataset.js +0 -824
  75. package/engines/dataset/DatasetManager.js +0 -211
  76. package/engines/dataset/DatasetRecord.js +0 -120
  77. package/engines/dataset/DatasetRecordPool.js +0 -77
  78. package/engines/execution/RequestExecutor.js +0 -67
  79. package/engines/parsing/CSVParser.js +0 -60
  80. package/engines/parsing/LineParser.js +0 -71
  81. package/engines/parsing/ParseCompression.js +0 -101
  82. package/engines/parsing/ParseHelper.js +0 -18
  83. package/engines/parsing/ParseManager.js +0 -54
  84. package/engines/parsing/XLSParser.js +0 -87
  85. package/engines/parsing/XMLParser.js +0 -115
  86. package/engines/producer/ProducerEngine.js +0 -127
  87. package/engines/producer/ProducerManager.js +0 -43
  88. package/engines/scheduler/CronScheduler.js +0 -222
  89. package/engines/scheduler/QueueManager.js +0 -314
  90. package/engines/schema/SchemaValidator.js +0 -67
  91. package/engines/transform/JoinEngine.js +0 -232
  92. package/engines/transform/TransformationEngine.js +0 -277
  93. package/engines/transform/TypeCaster.js +0 -59
  94. package/engines/usage/DataframeManager.js +0 -55
  95. package/engines/usage/UsageDataManager.js +0 -151
  96. package/engines/usage/UsageManager.js +0 -65
  97. package/engines/validation/Validator.js +0 -216
  98. package/executors/ConsumerExecutor.js +0 -280
  99. package/executors/Executor.js +0 -177
  100. package/executors/ExecutorOrchestrator.js +0 -331
  101. package/executors/ExecutorPerformance.js +0 -17
  102. package/executors/ExecutorProgress.js +0 -54
  103. package/executors/ExecutorScope.js +0 -52
  104. package/executors/OutputExecutor.js +0 -118
  105. package/executors/ProducerExecutor.js +0 -108
  106. package/helper/Helper.js +0 -149
  107. package/helper/Logger.js +0 -84
  108. package/helper/Runtime.js +0 -20
  109. package/helper/Settings.js +0 -13
  110. package/licencing/LicenceManager.js +0 -64
  111. package/settings.js +0 -12
@@ -1,2 +0,0 @@
1
- "use strict";
2
- Object.defineProperty(exports, "__esModule", { value: true });
@@ -1,2 +0,0 @@
1
- "use strict";
2
- Object.defineProperty(exports, "__esModule", { value: true });
@@ -1,2 +0,0 @@
1
- "use strict";
2
- Object.defineProperty(exports, "__esModule", { value: true });
@@ -1,2 +0,0 @@
1
- "use strict";
2
- Object.defineProperty(exports, "__esModule", { value: true });
@@ -1,2 +0,0 @@
1
- "use strict";
2
- Object.defineProperty(exports, "__esModule", { value: true });
@@ -1,2 +0,0 @@
1
- "use strict";
2
- Object.defineProperty(exports, "__esModule", { value: true });
@@ -1,2 +0,0 @@
1
- "use strict";
2
- Object.defineProperty(exports, "__esModule", { value: true });
@@ -1,2 +0,0 @@
1
- "use strict";
2
- Object.defineProperty(exports, "__esModule", { value: true });
@@ -1,2 +0,0 @@
1
- "use strict";
2
- Object.defineProperty(exports, "__esModule", { value: true });
@@ -1,186 +0,0 @@
1
- "use strict";
2
// TypeScript-emitted async/await shim. Reuses `this.__awaiter` when one is already defined;
// otherwise runs `generator` (applied with `thisArg` and `_arguments`) as a coroutine and
// returns a promise of type P (falling back to the global Promise) for its final value.
- var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
3
// Wraps a yielded value in a P unless it already is one, so `.then` can always be called on it.
- function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
4
- return new (P || (P = Promise))(function (resolve, reject) {
5
// Resume the generator with the fulfilled value; anything thrown while resuming rejects the outer promise.
- function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
6
// Throw the rejection reason into the generator so its own try/catch can observe it.
- function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
7
// Either finish with the generator's return value or wait on the next yielded value.
- function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
8
- step((generator = generator.apply(thisArg, _arguments || [])).next());
9
- });
10
- };
11
// TypeScript-emitted import interop shim. Reuses `this.__importDefault` when already defined;
// otherwise returns `mod` untouched when it carries a truthy `__esModule` flag and wraps it
// as `{ default: mod }` when it does not, so callers can always read `.default`.
- var __importDefault = (this && this.__importDefault) || function (mod) {
12
- return (mod && mod.__esModule) ? mod : { "default": mod };
13
- };
14
- Object.defineProperty(exports, "__esModule", { value: true });
15
- const Affirm_1 = __importDefault(require("../core/Affirm"));
16
- const SecretManager_1 = __importDefault(require("../engines/SecretManager"));
17
- const DriverHelper_1 = __importDefault(require("./DriverHelper"));
18
/**
 * Delta Share (Databricks Delta Sharing) Source Driver.
 *
 * Queries a shared table over HTTP, downloads the parquet files listed in the response
 * with the `hyparquet` package and exposes their records either as JSON lines or as a
 * delimited local dataset file. `init` must be awaited first: it stores the share URL,
 * bearer token and share/schema/table coordinates every other method relies on.
 *
 * All public members are instance-bound arrow functions, so they can be passed around
 * detached from the instance.
 */
class DeltaShareSourceDriver {
    constructor() {
        // Endpoint templates; placeholders are filled in by `_buildUrl`.
        this._query = '{prefix}/shares/{share}/schemas/{schema}/tables/{table}/query';
        this._version = '{prefix}/shares/{share}/schemas/{schema}/tables/{table}/version';
        this._tablesInShare = '{prefix}/shares/{share}/all-tables';
        this._tablesInSchema = '{prefix}/shares/{share}/schemas/{schema}/tables';
        this._schemasInShare = '{prefix}/shares/{share}/schemas';
        this._shares = '{prefix}/shares';

        /**
         * Expands an endpoint template for the configured share/schema and the given table.
         * Path segments are URI-encoded and trailing slashes on the host are dropped so the
         * result never contains `//shares`. A replacer function is used so `$` sequences in
         * the configured values are not interpreted as replacement patterns.
         * @param {string} template - one of the `{prefix}/...` templates above
         * @param {string} table - table name substituted for `{table}`
         * @returns {string} the expanded URL
         */
        this._buildUrl = (template, table) => {
            const values = {
                prefix: String(this._shareUrl).replace(/\/+$/, ''),
                share: encodeURIComponent(this._share),
                schema: encodeURIComponent(this._schema),
                table: encodeURIComponent(table)
            };
            return template.replace(/\{(prefix|share|schema|table)\}/g, (_token, name) => values[name]);
        };

        /**
         * Size hint for a file entry of the query response: the `add` action's size,
         * falling back to the `remove` action's size (may be undefined).
         */
        this._byteLengthOf = (deltaFile) => {
            const { add, remove } = deltaFile.file.deltaSingleAction;
            return add?.size ?? remove?.size;
        };

        /** Lazily loads the two hyparquet functions used to read remote parquet files. */
        this._loadParquetReader = async () => {
            const { asyncBufferFromUrl, parquetReadObjects } = await import('hyparquet');
            return { asyncBufferFromUrl, parquetReadObjects };
        };

        /** Downloads one file entry and returns its records as plain objects. */
        this._readRecords = async (deltaFile, reader) => {
            const file = await reader.asyncBufferFromUrl({
                url: deltaFile.file.url,
                byteLength: this._byteLengthOf(deltaFile)
            });
            return reader.parquetReadObjects({ file });
        };

        /**
         * Validates and stores the connection settings found in `source.authentication`.
         * The token is taken from `bearerToken`, `sessionToken` or `password` (first truthy one)
         * and passed through `SecretManager.replaceSecret`.
         * @param source - source definition; must carry an `authentication` object
         * @returns {Promise<DeltaShareSourceDriver>} this driver
         */
        this.init = async (source) => {
            (0, Affirm_1.default)(source, 'Invalid source');
            // Expected authentication shape for delta-share
            const { authentication } = source;
            (0, Affirm_1.default)(authentication, 'Invalid authentication for delta-share source');
            this._shareUrl = authentication.host;
            this._bearerToken = SecretManager_1.default.replaceSecret(authentication.bearerToken || authentication.sessionToken || authentication.password);
            this._share = authentication.share;
            this._schema = authentication.schema;
            this._table = authentication.table;
            (0, Affirm_1.default)(this._shareUrl, 'Missing delta-share host (share server URL) in source.authentication.host');
            (0, Affirm_1.default)(this._bearerToken, 'Missing delta-share bearer token in source.authentication.sessionToken (or password)');
            (0, Affirm_1.default)(this._share, 'Missing delta-share "share" (use authentication.share or bucket)');
            (0, Affirm_1.default)(this._schema, 'Missing delta-share schema in source.authentication.schema');
            (0, Affirm_1.default)(this._table, 'Missing delta-share table in source.authentication.table (or database)');
            this._source = source;
            return this;
        };

        // Delta Sharing is not a SQL engine; expose explicit error
        this.execute = async (__sql) => {
            void __sql;
            throw new Error('DeltaShareSourceDriver.execute is not supported: Delta Sharing is not a SQL engine');
        };
        this.query = async (__sql, __values) => {
            void __sql;
            void __values;
            throw new Error('DeltaShareSourceDriver.query is not supported: Delta Sharing is not a SQL engine');
        };

        /**
         * Reads every record of the configured table.
         * @param request - read request; its `fileKey` must not contain "%"
         * @returns {Promise<string[]>} one JSON string per record, in file order
         */
        this.readAll = async (request) => {
            (0, Affirm_1.default)(request, `Invalid download request`);
            (0, Affirm_1.default)(!request.fileKey.includes('%'), `On a delta-share the file key can not include "%"`);
            const deltaFiles = await this._getAllFilesInTables(this._table);
            const reader = await this._loadParquetReader();
            const lines = [];
            for (const deltaFile of deltaFiles) {
                const parquetRecords = await this._readRecords(deltaFile, reader);
                // Pushed one by one: spreading a very large array into push() can exceed
                // the engine's argument limit and throw a RangeError.
                for (const record of parquetRecords) {
                    lines.push(JSON.stringify(record));
                }
            }
            return lines;
        };

        /**
         * Reads the records whose zero-based position (counted across all files, in order)
         * falls in `[lineFrom, lineTo)`.
         * @param request - read request with `options.lineFrom` and `options.lineTo`
         * @returns {Promise<string[]>} one JSON string per selected record
         */
        this.readLinesInRange = async (request) => {
            (0, Affirm_1.default)(request, 'Invalid read request');
            (0, Affirm_1.default)(request.options, 'Invalid read options');
            (0, Affirm_1.default)(request.options.lineFrom !== undefined && request.options.lineTo !== undefined, 'Missing read range');
            const deltaFiles = await this._getAllFilesInTables(this._table);
            const { options: { lineFrom, lineTo } } = request;
            const reader = await this._loadParquetReader();
            const lines = [];
            let index = 0;
            for (const deltaFile of deltaFiles) {
                // Once the range is exhausted no later file can contribute, so stop
                // instead of downloading the rest of the table.
                if (index >= lineTo)
                    break;
                const parquetRecords = await this._readRecords(deltaFile, reader);
                for (const record of parquetRecords) {
                    if (index >= lineFrom && index < lineTo)
                        lines.push(JSON.stringify(record));
                    index++;
                    if (index >= lineTo)
                        break;
                }
            }
            return lines;
        };

        /**
         * Downloads the whole table into the dataset's local file.
         * The first record encountered is stored on the dataset as JSON (via `setFirstLine`),
         * every file's records are appended through `DriverHelper.appendObjectsToUnifiedFile`,
         * and the dataset count is set to the total number of written lines.
         * Files without records are skipped; if no file has records nothing is written and
         * the count is 0.
         * @param dataset - target dataset (uses setFirstLine/getDelimiter/getPath/setCount)
         * @returns the same dataset
         */
        this.download = async (dataset) => {
            (0, Affirm_1.default)(dataset, 'Invalid dataset');
            const deltaFiles = await this._getAllFilesInTables(this._table);
            const reader = await this._loadParquetReader();
            // For each file, download it with the hyparquet package, read lines, then save locally to create the dataset
            let hasWritten = false;
            let totalLineCount = 0;
            for (const deltaFile of deltaFiles) {
                const parquetRecords = await this._readRecords(deltaFile, reader);
                if (parquetRecords.length === 0)
                    continue;
                if (!hasWritten) {
                    // I intentionally keep the first record as a JSON, so it can be used to extract the dimensions
                    dataset.setFirstLine(JSON.stringify(parquetRecords[0]));
                }
                totalLineCount += await DriverHelper_1.default.appendObjectsToUnifiedFile({
                    // The first write must truncate, even when earlier files were empty.
                    append: hasWritten,
                    delimiter: dataset.getDelimiter(),
                    destinationPath: dataset.getPath(),
                    objects: parquetRecords
                });
                hasWritten = true;
            }
            dataset.setCount(totalLineCount);
            return dataset;
        };

        /**
         * Reports whether the configured table can be queried.
         * @returns {Promise<boolean>} false when the query fails for any reason
         */
        this.exist = async (__producer) => {
            void __producer;
            try {
                await this._getAllFilesInTables(this._table);
                // If it doesn't exist, then it fails in the above function
                return true;
            }
            catch (_a) {
                return false;
            }
        };

        /**
         * Fetches the table's current version from the `version` endpoint.
         * @param {string} table - table name
         * @returns {Promise<string|undefined>} value of the `delta-table-version` response
         *          header, or undefined when the server did not send it
         */
        this._getVersion = async (table) => {
            const url = this._buildUrl(this._version, table);
            const res = await fetch(url, {
                method: 'GET',
                headers: {
                    Authorization: `Bearer ${this._bearerToken}`
                }
            });
            (0, Affirm_1.default)(res.ok, `Error fetching version from the delta share: ${res.status} ${res.statusText} (${await res.text()})`);
            // `res.headers` is a Headers object: bracket access always yields undefined,
            // the value has to be read with get() (which returns null when absent).
            const version = res.headers.get('delta-table-version');
            return version ?? undefined;
        };

        /**
         * Runs the table query and returns the file entries of the response.
         * The response is newline-delimited JSON; the first two non-empty lines are skipped.
         * @param {string} table - table name
         * @returns {Promise<object[]>} parsed lines from the third one onwards
         */
        this._getAllFilesInTables = async (table) => {
            const url = this._buildUrl(this._query, table);
            // The version request is kept because it has always been part of this call and
            // fails fast when the table cannot be reached.
            // NOTE(review): until now the version never reached the request body (the header
            // was read incorrectly, so `version` was undefined and dropped by JSON.stringify).
            // The body is deliberately left as `{}`: the Delta Sharing protocol appears to
            // restrict explicit `version` queries to tables with history sharing enabled,
            // so pinning the version should be an explicit, tested decision - confirm.
            await this._getVersion(table);
            const body = {};
            const res = await fetch(url, {
                method: 'POST',
                headers: {
                    'Authorization': `Bearer ${this._bearerToken}`,
                    // Without this fetch labels a string body as text/plain.
                    'Content-Type': 'application/json; charset=utf-8',
                    'delta-sharing-capabilities': 'responseformat=delta;readerfeatures=deletionvectors'
                },
                body: JSON.stringify(body)
            });
            const rawText = await res.text();
            (0, Affirm_1.default)(res.ok, `Error fetching data from the delta share: ${res.status} ${res.statusText}; Message: ${rawText}`);
            // By the protocol: the first is the profile, the second is the metadata, I'm interested from the third onwards
            const deltaLines = rawText
                .split('\n')
                .filter(x => x.length > 0)
                .slice(2)
                .map(x => JSON.parse(x));
            return deltaLines;
        };

        this.ready = async (request) => {
            void request;
            throw new Error('DeltaShareSourceDriver.ready is not supported: Delta Sharing does not support readiness checks');
        };
    }
}
186
- exports.default = DeltaShareSourceDriver;
@@ -1,72 +0,0 @@
1
- "use strict";
2
// TypeScript-emitted async/await shim. Reuses `this.__awaiter` when one is already defined;
// otherwise runs `generator` (applied with `thisArg` and `_arguments`) as a coroutine and
// returns a promise of type P (falling back to the global Promise) for its final value.
- var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
3
// Wraps a yielded value in a P unless it already is one, so `.then` can always be called on it.
- function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
4
- return new (P || (P = Promise))(function (resolve, reject) {
5
// Resume the generator with the fulfilled value; anything thrown while resuming rejects the outer promise.
- function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
6
// Throw the rejection reason into the generator so its own try/catch can observe it.
- function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
7
// Either finish with the generator's return value or wait on the next yielded value.
- function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
8
- step((generator = generator.apply(thisArg, _arguments || [])).next());
9
- });
10
- };
11
// TypeScript-emitted import interop shim. Reuses `this.__importDefault` when already defined;
// otherwise returns `mod` untouched when it carries a truthy `__esModule` flag and wraps it
// as `{ default: mod }` when it does not, so callers can always read `.default`.
- var __importDefault = (this && this.__importDefault) || function (mod) {
12
- return (mod && mod.__esModule) ? mod : { "default": mod };
13
- };
14
- Object.defineProperty(exports, "__esModule", { value: true });
15
- const RedshiftDriver_1 = __importDefault(require("./RedshiftDriver"));
16
- const DeltaShareDriver_1 = __importDefault(require("./DeltaShareDriver"));
17
- const HttpApiDriver_1 = __importDefault(require("./HttpApiDriver"));
18
- const LocalSourceDriver_1 = __importDefault(require("./files/LocalSourceDriver"));
19
- const LocalDestinationDriver_1 = __importDefault(require("./files/LocalDestinationDriver"));
20
- const S3SourceDriver_1 = __importDefault(require("./s3/S3SourceDriver"));
21
- const S3DestinationDriver_1 = __importDefault(require("./s3/S3DestinationDriver"));
22
/**
 * Creates and initialises the driver that matches a source's `engine`.
 *
 * Supported source engines: aws-redshift, aws-s3, delta-share, local, http-api.
 * Supported destination engines: aws-s3, local.
 *
 * Both public members are instance-bound arrow functions that return a promise for a
 * driver whose `init(source)` has already been awaited.
 */
class DriverFactoryClass {
    constructor() {
        // Engine name -> lazy constructor. Maps are used instead of plain objects so that
        // engine values such as "constructor" or "toString" cannot match inherited keys.
        const sourceDrivers = new Map([
            ['aws-redshift', () => new RedshiftDriver_1.default()],
            ['aws-s3', () => new S3SourceDriver_1.default()],
            ['delta-share', () => new DeltaShareDriver_1.default()],
            ['local', () => new LocalSourceDriver_1.default()],
            ['http-api', () => new HttpApiDriver_1.default()]
        ]);
        const destinationDrivers = new Map([
            ['aws-s3', () => new S3DestinationDriver_1.default()],
            ['local', () => new LocalDestinationDriver_1.default()]
        ]);

        /**
         * Shared lookup + construct + init sequence behind both public methods.
         * @param {Map<string, Function>} registry - engine name to driver constructor thunk
         * @param source - source definition; `source.engine` selects the driver
         * @returns the initialised driver
         * @throws {Error} when `source` is missing or its engine is not in the registry
         */
        const instantiate = async (registry, source) => {
            // Fail with a readable message instead of "cannot read properties of undefined".
            if (!source)
                throw new Error('Invalid source');
            const createDriver = registry.get(source.engine);
            if (!createDriver)
                throw new Error(`Invalid driver type "${source.engine}". This driver is not implemented yet`);
            const driver = createDriver();
            await driver.init(source);
            return driver;
        };

        /** Builds and initialises the source (read side) driver for `source.engine`. */
        this.instantiateSource = (source) => instantiate(sourceDrivers, source);

        /** Builds and initialises the destination (write side) driver for `source.engine`. */
        this.instantiateDestination = (source) => instantiate(destinationDrivers, source);
    }
}
71
- const DriverFactory = new DriverFactoryClass();
72
- exports.default = DriverFactory;
@@ -1,248 +0,0 @@
1
- "use strict";
2
// TypeScript-emitted async/await shim. Reuses `this.__awaiter` when one is already defined;
// otherwise runs `generator` (applied with `thisArg` and `_arguments`) as a coroutine and
// returns a promise of type P (falling back to the global Promise) for its final value.
- var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
3
// Wraps a yielded value in a P unless it already is one, so `.then` can always be called on it.
- function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
4
- return new (P || (P = Promise))(function (resolve, reject) {
5
// Resume the generator with the fulfilled value; anything thrown while resuming rejects the outer promise.
- function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
6
// Throw the rejection reason into the generator so its own try/catch can observe it.
- function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
7
// Either finish with the generator's return value or wait on the next yielded value.
- function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
8
- step((generator = generator.apply(thisArg, _arguments || [])).next());
9
- });
10
- };
11
// TypeScript-emitted shim behind `for await`. Reuses `this.__asyncValues` when already defined.
// Returns `o[Symbol.asyncIterator]()` when `o` has one; otherwise takes `o`'s synchronous
// iterator and wraps it in an object whose next/throw/return return promises of
// `{ value, done }`, awaiting each value first.
- var __asyncValues = (this && this.__asyncValues) || function (o) {
12
- if (!Symbol.asyncIterator) throw new TypeError("Symbol.asyncIterator is not defined.");
13
- var m = o[Symbol.asyncIterator], i;
14
// Sync fallback: prefer a `__values` helper when one is in scope, else use o[Symbol.iterator]().
- return m ? m.call(o) : (o = typeof __values === "function" ? __values(o) : o[Symbol.iterator](), i = {}, verb("next"), verb("throw"), verb("return"), i[Symbol.asyncIterator] = function () { return this; }, i);
15
// Defines i[n] only when the underlying iterator implements method n.
- function verb(n) { i[n] = o[n] && function (v) { return new Promise(function (resolve, reject) { v = o[n](v), settle(resolve, reject, v.done, v.value); }); }; }
16
// Resolves with { value, done } once the (possibly promise) value has settled.
- function settle(resolve, reject, d, v) { Promise.resolve(v).then(function(v) { resolve({ value: v, done: d }); }, reject); }
17
- };
18
// TypeScript-emitted import interop shim. Reuses `this.__importDefault` when already defined;
// otherwise returns `mod` untouched when it carries a truthy `__esModule` flag and wraps it
// as `{ default: mod }` when it does not, so callers can always read `.default`.
- var __importDefault = (this && this.__importDefault) || function (mod) {
19
- return (mod && mod.__esModule) ? mod : { "default": mod };
20
- };
21
- Object.defineProperty(exports, "__esModule", { value: true });
22
- const stream_1 = require("stream");
23
- const readline_1 = require("readline");
24
- const promises_1 = require("stream/promises");
25
- const fs_1 = require("fs");
26
- const path_1 = __importDefault(require("path"));
27
- const Logger_1 = __importDefault(require("../helper/Logger"));
28
- const Affirm_1 = __importDefault(require("../core/Affirm"));
29
- const XLSParser_1 = __importDefault(require("../engines/parsing/XLSParser"));
30
- const XMLParser_1 = __importDefault(require("../engines/parsing/XMLParser"));
31
- const Constants_1 = __importDefault(require("../Constants"));
32
/**
 * File helpers shared by the source drivers: building the unified (delimited) dataset
 * file from streams or objects, peeking at the first lines of a file/stream and storing
 * a file's header line on a dataset.
 */
const DriverHelper = {
    /**
     * Copies the lines of `options.stream` into the unified file at `options.destinationPath`.
     *
     * - CSV files, and TXT files with `hasHeaderRow === true`, have their first line compared
     *   (trimmed) with `headerLine` and then dropped; a mismatch rejects with an Error.
     * - JSON/JSONL lines are parsed and re-emitted as the values of the header line's keys,
     *   joined with `delimiter`.
     * - Blank lines are skipped; when `sourceFilename` is set it is appended to every line.
     *
     * @param options - { append, destinationPath, fileKey, headerLine, stream, fileType,
     *                    hasHeaderRow, delimiter, sourceFilename }
     * @returns {Promise<number>} number of lines written
     */
    appendToUnifiedFile: async (options) => {
        (0, Affirm_1.default)(options, 'Invalid options');
        const { append, destinationPath, fileKey, headerLine, stream, fileType, hasHeaderRow, delimiter, sourceFilename } = options;
        (0, Affirm_1.default)(headerLine, `Invalid header line`);
        const isJson = fileType === 'JSON' || fileType === 'JSONL';
        const keys = isJson ? Object.keys(JSON.parse(headerLine)) : [];
        const shouldValidateHeader = fileType === 'CSV' || (fileType === 'TXT' && hasHeaderRow === true);
        // When sourceFilename is set, the headerLine includes $source_filename at the end.
        // For validation, we need to compare against the original header without this suffix.
        // If the delimiter cannot be found the header is used unchanged (slicing at -1
        // would silently drop its last character).
        const suffixStart = sourceFilename ? headerLine.lastIndexOf(delimiter) : -1;
        const originalHeaderLine = suffixStart >= 0 ? headerLine.slice(0, suffixStart) : headerLine;
        const expectedHeader = originalHeaderLine.trim();
        // Streaming decoder: keeps the bytes of a multi-byte character that is split across
        // two chunks until the rest arrives. ignoreBOM keeps a leading BOM in the text,
        // as Buffer#toString did.
        const decoder = new TextDecoder('utf-8', { ignoreBOM: true });
        let leftoverData = '';
        let globalIndex = 0;
        let lineCount = 0;

        // Converts one accepted input line into its output form and counts it.
        const processLine = (line, lineIndex) => {
            lineCount++;
            let processedLine;
            if (isJson) {
                try {
                    const parsed = JSON.parse(line);
                    processedLine = keys.map(k => parsed[k]).join(delimiter);
                }
                catch (error) {
                    Logger_1.default.log(`Failed parsing in JSON line -> file: ${fileKey}; index: ${lineIndex}; line: ${line}; err: ${error?.name}`);
                    throw error;
                }
            }
            else {
                processedLine = line;
            }
            // If sourceFilename is provided, append it to each line
            return sourceFilename ? processedLine + delimiter + sourceFilename : processedLine;
        };

        // Handles one complete input line: header check for line 0, blank-line filter,
        // conversion. Used for both newline-terminated lines and the unterminated tail so
        // a header without trailing newline is validated and dropped like any other header.
        const collectLine = (line, output) => {
            const lineIndex = globalIndex;
            globalIndex++;
            if (lineIndex === 0 && shouldValidateHeader) {
                if (expectedHeader !== '' && line.trim() !== expectedHeader) {
                    const msg = `Error creating unified dataset: file "${fileKey}" has a different header line than the other files in this dataset\n\t-${fileKey}: ${line}\n\t-main: ${originalHeaderLine}`;
                    Logger_1.default.log(msg);
                    throw new Error(msg);
                }
                // For flat files (csv, txt) the header is not copied (it is already saved)
                return;
            }
            // Skip empty lines
            if (line.trim() === '')
                return;
            output.push(processLine(line, lineIndex));
        };

        const lineTransform = new stream_1.Transform({
            transform(chunk, _encoding, callback) {
                let payload = null;
                try {
                    const text = typeof chunk === 'string' ? chunk : decoder.decode(chunk, { stream: true });
                    const lines = (leftoverData + text).split('\n');
                    // Keep the last line as leftover if it doesn't end with newline
                    leftoverData = lines.pop() || '';
                    const output = [];
                    for (const line of lines) {
                        collectLine(line, output);
                    }
                    if (output.length > 0) {
                        payload = Buffer.from(output.join('\n') + '\n');
                    }
                }
                catch (error) {
                    // Routed through the callback so the pipeline rejects instead of the
                    // exception escaping the stream machinery.
                    return callback(error);
                }
                callback(null, payload);
            },
            flush(callback) {
                let payload = null;
                try {
                    // Process any remaining data (including bytes still held by the decoder)
                    const tail = leftoverData + decoder.decode();
                    leftoverData = '';
                    const output = [];
                    if (tail.trim()) {
                        collectLine(tail, output);
                    }
                    if (output.length > 0) {
                        payload = Buffer.from(output[0] + '\n');
                    }
                }
                catch (error) {
                    return callback(error);
                }
                callback(null, payload);
            }
        });
        const writeStream = (0, fs_1.createWriteStream)(destinationPath, append ? { flags: 'a' } : {});
        await (0, promises_1.pipeline)(stream, lineTransform, writeStream);
        return lineCount;
    },
    /**
     * Writes `options.objects` to `options.destinationPath`, one line per object, using the
     * keys of the first object as column order and `options.delimiter` as separator.
     * Resolves only after the data has been flushed and the file closed, so a following
     * append or read sees every line; stream errors reject the returned promise.
     * An empty `objects` array writes nothing (the file is still created/truncated when
     * `append` is falsy) and resolves to 0.
     *
     * @param options - { append, destinationPath, objects, delimiter }
     * @returns {Promise<number>} number of lines written
     */
    appendObjectsToUnifiedFile: async (options) => {
        (0, Affirm_1.default)(options, 'Invalid options');
        const { append, destinationPath, objects, delimiter } = options;
        const keys = objects.length > 0 ? Object.keys(objects[0]) : [];
        let lineCount = 0;
        // Generator consumed by the pipeline, which honours the write stream's backpressure.
        function* serializeRows() {
            for (const obj of objects) {
                lineCount++;
                yield keys.map(k => obj[k]).join(delimiter) + '\n';
            }
        }
        const writeStream = (0, fs_1.createWriteStream)(destinationPath, append ? { flags: 'a' } : {});
        await (0, promises_1.pipeline)(stream_1.Readable.from(serializeRows()), writeStream);
        return lineCount;
    },
    /**
     * Reads lines from the start of a file and stops once `lineCount` lines were collected
     * (the check runs after each line, so at least one line is returned when the file has any).
     * The file stream is closed even when reading fails.
     *
     * @param {string} filePath - file to read
     * @param {number} lineCount - number of lines wanted
     * @returns {Promise<string[]>} the collected lines, without line terminators
     */
    quickReadFile: async (filePath, lineCount) => {
        const fileStream = (0, fs_1.createReadStream)(filePath);
        try {
            return await DriverHelper.quickReadStream(fileStream, lineCount);
        }
        finally {
            fileStream.close();
        }
    },
    /**
     * Reads the header line of `fileKey` (inside `filePath`) and stores it on the dataset
     * through `dataset.setFirstLine`.
     * XLS/XLSX use `XLSParser.getHeaderXls`, XML uses the first line returned by
     * `XMLParser.readXmlLines`, CSV/JSON/JSONL/TXT use the first line of the file.
     * For every type except XML, `file.includeSourceFilename` appends the dataset delimiter
     * and `Constants.SOURCE_FILENAME_COLUMN` to the header.
     *
     * @throws {Error} when `file.fileType` is none of the types above
     */
    setHeaderFromFile: async (fileKey, file, filePath, dataset) => {
        (0, Affirm_1.default)(filePath, 'Invalid path');
        (0, Affirm_1.default)(fileKey, 'Invalid fileKey');
        (0, Affirm_1.default)(file, 'Invalid File');
        let headerLine;
        let canCarrySourceFilename = true;
        switch (file.fileType) {
            case 'XLS':
            case 'XLSX':
                headerLine = await XLSParser_1.default.getHeaderXls(path_1.default.join(filePath, fileKey), file.sheetName);
                break;
            case 'XML':
                // XML headers are stored as read; the source-filename column is not added here
                headerLine = (await XMLParser_1.default.readXmlLines(path_1.default.join(filePath, fileKey)))[0];
                canCarrySourceFilename = false;
                break;
            case 'CSV':
            case 'JSON':
            case 'JSONL':
            case 'TXT':
                // Get header line from the first file
                headerLine = (await DriverHelper.quickReadFile(path_1.default.join(filePath, fileKey), 1))[0];
                break;
            default:
                throw new Error(`the fileType "${file.fileType}" is not implemented yet`);
        }
        // If including source filename, append a placeholder column name to the header
        if (canCarrySourceFilename && file.includeSourceFilename) {
            headerLine = headerLine + dataset.getDelimiter() + Constants_1.default.SOURCE_FILENAME_COLUMN;
        }
        dataset.setFirstLine(headerLine);
    },
    /**
     * Reads lines from a readable stream and stops once `lineCount` lines were collected
     * (the check runs after each line). The readline interface is closed even when
     * reading fails; the stream itself is left to the caller.
     *
     * @param stream - readable stream to consume
     * @param {number} lineCount - number of lines wanted
     * @returns {Promise<string[]>} the collected lines, without line terminators
     */
    quickReadStream: async (stream, lineCount) => {
        const rl = (0, readline_1.createInterface)({ input: stream, crlfDelay: Infinity });
        const lines = [];
        try {
            for await (const line of rl) {
                lines.push(line);
                if (lines.length >= lineCount) {
                    break;
                }
            }
        }
        finally {
            rl.close();
        }
        return lines;
    }
};
248
- exports.default = DriverHelper;