@forzalabs/remora 1.2.5 → 1.2.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/index.js +181 -28
- package/json_schemas/project-schema.json +16 -1
- package/json_schemas/source-schema.json +23 -0
- package/package.json +1 -1
- package/workers/ExecutorWorker.js +192 -31
package/index.js
CHANGED
@@ -13518,12 +13518,10 @@ var CONSTANTS = {
   REMORA_PATH: "./remora",
   PRODUCER_TEMP_FOLDER: ".temp",
   SQL_MAX_QUERY_ROWS: 1e4,
-  STRING_MAX_CHARACTERS_LENGTH: 1e7,
-  MAX_ITEMS_IN_MEMORY: 2e5,
   MIN_RUNTIME_HEAP_MB: 4e3,
   RECOMMENDED_RUNTIME_HEAP_MB: 8e3,
   INDICATIVE_THREAD_LINE_COUNT: 75e4,
-  MAX_THREAD_COUNT:
+  MAX_THREAD_COUNT: 99,
   /**
    * Minimum file size to consider parallel processing (10 MB)
    */
@@ -13605,6 +13603,8 @@ var ExecutorScope_default = ExecutorScope;

 // ../../packages/common/src/Environment.ts
 var import_fs4 = __toESM(require("fs"), 1);
+var import_crypto = __toESM(require("crypto"), 1);
+var import_adm_zip = __toESM(require("adm-zip"), 1);
 var import_path5 = __toESM(require("path"), 1);

 // ../../packages/common/src/schema/SchemaValidator.ts
@@ -13726,6 +13726,12 @@ var ValidatorClass = class {
       const source = sources[i];
       if (source.engine === "local" && !source.authentication.path)
         errors.push(`For source ${source.name}, the path has not been configured`);
+      if (source.engine === "aws-sqs") {
+        if (!source.authentication.queue)
+          errors.push(`For source ${source.name}, the queue has not been configured`);
+        if (!source.authentication.region && !source.authentication.queue?.startsWith("https://"))
+          errors.push(`For source ${source.name}, the region has not been configured`);
+      }
     }
   } catch (e) {
     if (errors.length === 0)
@@ -14016,8 +14022,11 @@ var ValidatorClass = class {
       if (!auth) continue;
       const ctx = (field) => `source "${source.name}" authentication.${field}`;
       checkValue(auth.accessKey, ctx("accessKey"));
+      checkValue(auth.accountId, ctx("accountId"));
       checkValue(auth.secretKey, ctx("secretKey"));
       checkValue(auth.sessionToken, ctx("sessionToken"));
+      checkValue(auth.queue, ctx("queue"));
+      checkValue(auth.region, ctx("region"));
       checkValue(auth.bearerToken, ctx("bearerToken"));
       checkValue(auth.url, ctx("url"));
       checkValue(auth.apiKey, ctx("apiKey"));
@@ -14036,6 +14045,8 @@ var ValidatorClass = class {
     switch (engine) {
       case "aws-dynamodb":
         return "no-sql";
+      case "aws-sqs":
+        return "no-sql";
       case "aws-redshift":
       case "postgres":
         return "sql";
@@ -14055,10 +14066,72 @@ var Validator_default = Validator;
 var EnvironmentClass = class {
   constructor() {
     this._env = null;
+    this._configUri = null;
+    this._localPath = null;
+    this._configHash = null;
+    this._lastEtag = null;
+    this._projectSettings = null;
     this.init = (env) => {
       this._env = env;
     };
+    /**
+     * Load environment from a local path or remote URL (e.g. S3 presigned URL).
+     * Remote configs are downloaded as a zip and extracted to the local cache path.
+     * Falls back to `REMORA_CONFIG_URI` env variable if no URI is provided.
+     */
+    this.loadFromUri = async (uri) => {
+      const configUri = uri || process.env.REMORA_CONFIG_URI;
+      Affirm_default(configUri, "No configuration URI provided. Set REMORA_CONFIG_URI environment variable or pass a URI.");
+      this._configUri = configUri;
+      if (this._isRemoteUri(configUri)) {
+        this._localPath = this._resolveLocalCachePath();
+        await this._downloadRemoteConfig(configUri);
+      } else {
+        this._localPath = configUri;
+      }
+      this.load(this._localPath);
+    };
+    /**
+     * Check if the configuration has changed and reload if so.
+     * For remote URIs, re-downloads using ETag caching (HTTP 304 avoids redundant downloads).
+     * For local paths, compares a SHA-256 hash of all config JSON files against the last loaded hash.
+     */
+    this.refreshIfNeeded = async () => {
+      if (!this._localPath && !this._configUri) return;
+      if (this._configUri && this._isRemoteUri(this._configUri)) {
+        const changed = await this._downloadRemoteConfig(this._configUri);
+        if (!changed) return;
+      }
+      const newHash = this._computeConfigHash();
+      if (newHash !== this._configHash) {
+        Logger_default.log("Configuration change detected, reloading environment");
+        this.load(this._localPath);
+      }
+    };
+    /**
+     * Synchronous load for worker threads. Reads `REMORA_CONFIG_URI` to determine
+     * the config source — for remote URIs it loads from the local cache (already downloaded
+     * by the main thread), for local paths it loads directly.
+     */
+    this.loadFromResolvedUri = () => {
+      const configUri = process.env.REMORA_CONFIG_URI;
+      if (!configUri) {
+        this.load("./");
+        return;
+      }
+      if (this._isRemoteUri(configUri)) {
+        this.load(this._resolveLocalCachePath());
+      } else {
+        this.load(configUri);
+      }
+    };
+    /**
+     * Load environment configuration from a local directory.
+     * Reads `{remoraPath}/remora/project.json` and all referenced sources, producers,
+     * consumers and schemas, validates them against JSON schemas, and initializes the singleton.
+     */
     this.load = (remoraPath) => {
+      this._localPath = remoraPath;
       const envPath = import_path5.default.join(remoraPath, "remora");
       const projectPath = import_path5.default.join(envPath, "project.json");
       if (!import_fs4.default.existsSync(projectPath))
@@ -14109,6 +14182,14 @@ var EnvironmentClass = class {
         if (!SchemaValidator_default.validate("source-schema", source))
           throw new Error(`Invalid source configuration: ${source.name}`);
       });
+      const projectApiQueueSource = projectConfig.settings.API_QUEUE?.source;
+      if (projectApiQueueSource) {
+        const source = sources.find((item) => item.name === projectApiQueueSource);
+        if (!source)
+          throw new Error(`Invalid project configuration: API_QUEUE source "${projectApiQueueSource}" was not found`);
+        if (source.engine !== "aws-sqs")
+          throw new Error(`Invalid project configuration: API_QUEUE source "${projectApiQueueSource}" must use engine "aws-sqs"`);
+      }
       const producers = loadConfigurations(envPath, projectConfig.producers);
       producers.forEach((producer) => {
         if (!SchemaValidator_default.validate("producer-schema", producer))
@@ -14122,13 +14203,10 @@ var EnvironmentClass = class {
       const envSettings = new Map(Object.entries({ ...projectConfig.settings }).map(([key, value]) => [key, String(value)]));
       if (!envSettings.has("SQL_MAX_QUERY_ROWS"))
         envSettings.set("SQL_MAX_QUERY_ROWS", Constants_default.defaults.SQL_MAX_QUERY_ROWS.toString());
-      if (!envSettings.has("STRING_MAX_CHARACTERS_LENGTH"))
-        envSettings.set("STRING_MAX_CHARACTERS_LENGTH", Constants_default.defaults.STRING_MAX_CHARACTERS_LENGTH.toString());
-      if (!envSettings.has("MAX_ITEMS_IN_MEMORY"))
-        envSettings.set("MAX_ITEMS_IN_MEMORY", Constants_default.defaults.MAX_ITEMS_IN_MEMORY.toString());
       const debugMode = process.env.REMORA_DEBUG_MODE;
       if (debugMode && debugMode.toLowerCase() === "true")
         Logger_default.setLevel("debug");
+      this._projectSettings = projectConfig.settings;
       this.init({
         settings: envSettings,
         sources,
@@ -14139,10 +14217,14 @@ var EnvironmentClass = class {
         // TODO: Add SQL library loading if needed
       });
       Logger_default.log(`Environment loaded: ${sources.length} source(s), ${producers.length} producer(s), ${consumers.length} consumer(s)`);
+      this._configHash = this._computeConfigHash();
     };
     this.get = (setting) => {
       return this._env.settings.get(setting);
     };
+    this.getProjectSettings = () => {
+      return this._projectSettings;
+    };
     this.getSource = (sourceName) => {
       Affirm_default(sourceName, "Invalid source name");
       return this._env.sources.find((x) => x.name === sourceName);
@@ -14221,6 +14303,58 @@ ${ce.map((x) => ` -${x}
     }
     return errors;
   };
+    this._isRemoteUri = (uri) => {
+      return uri.startsWith("http://") || uri.startsWith("https://");
+    };
+    this._resolveLocalCachePath = () => {
+      return process.env.REMORA_LOCAL_CACHE_PATH || "/app";
+    };
+    this._downloadRemoteConfig = async (url) => {
+      const headers = {};
+      if (this._lastEtag)
+        headers["If-None-Match"] = this._lastEtag;
+      const response = await fetch(url, { headers });
+      if (response.status === 304) return false;
+      if (!response.ok)
+        throw new Error(`Failed to download configuration from ${url}: HTTP ${response.status}`);
+      const etag = response.headers.get("etag");
+      if (etag) this._lastEtag = etag;
+      const buffer = Buffer.from(await response.arrayBuffer());
+      const zip = new import_adm_zip.default(buffer);
+      const extractPath = import_path5.default.join(this._localPath, "remora");
+      if (import_fs4.default.existsSync(extractPath)) {
+        for (const file of import_fs4.default.readdirSync(extractPath)) {
+          if (file === "temp") continue;
+          const filePath = import_path5.default.join(extractPath, file);
+          if (import_fs4.default.statSync(filePath).isDirectory())
+            import_fs4.default.rmSync(filePath, { recursive: true, force: true });
+          else
+            import_fs4.default.unlinkSync(filePath);
+        }
+      }
+      zip.extractAllTo(extractPath, true);
+      Logger_default.log(`Remote configuration downloaded and extracted from ${url}`);
+      return true;
+    };
+    this._computeConfigHash = () => {
+      if (!this._localPath) return null;
+      const envPath = import_path5.default.join(this._localPath, "remora");
+      if (!import_fs4.default.existsSync(envPath)) return null;
+      const hash = import_crypto.default.createHash("sha256");
+      const hashDir = (dirPath) => {
+        if (!import_fs4.default.existsSync(dirPath)) return;
+        for (const entry of import_fs4.default.readdirSync(dirPath, { withFileTypes: true })) {
+          if (entry.name === "temp" || entry.name === ".temp") continue;
+          const fullEntry = import_path5.default.join(dirPath, entry.name);
+          if (entry.isDirectory())
+            hashDir(fullEntry);
+          else if (entry.name.endsWith(".json"))
+            hash.update(import_fs4.default.readFileSync(fullEntry, "utf-8"));
+        }
+      };
+      hashDir(envPath);
+      return hash.digest("hex");
+    };
   }
 };
 var Environment = new EnvironmentClass();
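The new Environment methods form a load-then-poll lifecycle. A minimal usage sketch, assuming the `Environment` singleton above is reachable from the host process; the polling interval is an illustrative choice, not a package default:

```js
// Sketch only: `Environment` stands for the EnvironmentClass singleton shown
// above; the config URI and 60-second interval are hypothetical.
async function bootstrap(Environment) {
  // Initial load: a remote URI is downloaded as a zip and extracted locally.
  await Environment.loadFromUri(process.env.REMORA_CONFIG_URI);

  // Poll for changes; an ETag match (HTTP 304) or an identical SHA-256
  // config hash makes each call a cheap no-op.
  setInterval(() => {
    Environment.refreshIfNeeded().catch((err) => {
      console.error("Config refresh failed:", err);
    });
  }, 60_000);
}
```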
@@ -15213,11 +15347,13 @@ var DriverHelper = {
     const { append, destinationPath, objects, delimiter } = options;
     const writeOptions = append ? { flags: "a" } : {};
     const writeStream = (0, import_fs5.createWriteStream)(destinationPath, writeOptions);
+    const waitForDrain = () => new Promise((resolve) => writeStream.once("drain", resolve));
     let lineCount = 0;
     const keys = Object.keys(objects[0]);
     for (const obj of objects) {
       const serialized = keys.map((k) => obj[k]).join(delimiter) + "\n";
-      writeStream.write(serialized)
+      if (!writeStream.write(serialized))
+        await waitForDrain();
       lineCount++;
     }
     writeStream.end();
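The same backpressure pattern recurs throughout this release: `Writable.write()` returns `false` once the stream's internal buffer exceeds its `highWaterMark` (around 16 KB by default for fs streams), and the writer should pause until `'drain'` fires instead of queueing unbounded memory. A self-contained sketch of the pattern; the file name and line count are arbitrary:

```js
const fs = require("fs");

async function writeLines(path, lines) {
  const out = fs.createWriteStream(path);
  const waitForDrain = () => new Promise((resolve) => out.once("drain", resolve));
  for (const line of lines) {
    // write() returns false when the buffer is full; wait for 'drain'
    // before continuing so memory stays bounded on large outputs.
    if (!out.write(line + "\n")) await waitForDrain();
  }
  out.end();
  // Resolve once all buffered data has been flushed to the file.
  await new Promise((resolve, reject) => {
    out.on("finish", resolve);
    out.on("error", reject);
  });
}

writeLines("demo.txt", Array.from({ length: 1e6 }, (_, i) => `row ${i}`));
```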
@@ -15564,8 +15700,10 @@ var LocalDestinationDriver = class {
     const reader = fs9.createReadStream(fromPath);
     const lineReader = import_readline3.default.createInterface({ input: reader, crlfDelay: Infinity });
     const writer = fs9.createWriteStream(toFilePath);
+    const waitForDrain = () => new Promise((resolve) => writer.once("drain", resolve));
     for await (const line of lineReader) {
-      writer.write(transform(line) + "\n")
+      if (!writer.write(transform(line) + "\n"))
+        await waitForDrain();
     }
     writer.end();
     await new Promise((resolve, reject) => {
@@ -16102,7 +16240,7 @@ var debug = async (options) => {
 // src/actions/deploy.ts
 var import_chalk4 = __toESM(require("chalk"));
 var import_fs8 = __toESM(require("fs"));
-var
+var import_adm_zip2 = __toESM(require("adm-zip"));
 var import_path11 = __toESM(require("path"));
 var deploy = async (options) => {
   console.log(import_chalk4.default.blue.bold(`\u{1F680} Deploying to ${options.env}...`));
@@ -16110,7 +16248,7 @@ var deploy = async (options) => {
   const rootDir = "./remora";
   if (!import_fs8.default.existsSync("./remora"))
     throw new Error(import_chalk4.default.red("Missing directory: ") + import_chalk4.default.yellow("./remora"));
-  const zip = new
+  const zip = new import_adm_zip2.default();
   const addDirectoryToZip = (directoryPath, zipPath = "") => {
     const files = import_fs8.default.readdirSync(directoryPath);
     files.forEach((file) => {
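adm-zip is now used in both directions: packing the deploy artifact and unpacking downloaded remote configs. A minimal sketch of the two directions with illustrative paths; the package's own code walks the directory manually via `addDirectoryToZip` rather than using `addLocalFolder`:

```js
const AdmZip = require("adm-zip");

// Pack: zip a local ./remora directory into an in-memory buffer.
const zip = new AdmZip();
zip.addLocalFolder("./remora", "remora"); // second arg = path inside the archive
const artifact = zip.toBuffer();

// Unpack: load a buffer (e.g. a downloaded config) and extract, overwriting
// any existing files at the target path.
const downloaded = new AdmZip(artifact);
downloaded.extractAllTo("./cache/remora", true);
```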
@@ -16616,7 +16754,7 @@ var ApiKeysManagerClass = class {
 var ApiKeysManager = new ApiKeysManagerClass();

 // ../../packages/auth/src/LicenceManager.ts
-var
+var import_crypto2 = __toESM(require("crypto"), 1);
 var PUBLICK_KEY = `-----BEGIN PUBLIC KEY-----
 MIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEA7BWugM83YKGzTyZ6kJyy
 M01JoGYBQYn/9H9utQQyC/uugV4g9d7vv87I2yUfqiHtx7BQj0mOGctqnK7vuRcg
@@ -16638,7 +16776,7 @@ var LicenceManagerClass = class {
       expires: expirationDate.toISOString()
     };
     const licenceString = JSON.stringify(licenceData);
-    const sign =
+    const sign = import_crypto2.default.createSign("SHA256");
     sign.update(licenceString);
     sign.end();
     const signature = sign.sign(privateKey, "base64");
@@ -16654,7 +16792,7 @@ var LicenceManagerClass = class {
     const expirationDate = new Date(data.expires);
     if (now > expirationDate)
       return { valid: false, reason: "License expired", expiryDate: expirationDate };
-    const verify =
+    const verify = import_crypto2.default.createVerify("SHA256");
     verify.update(JSON.stringify(data));
     const isSignatureValid = verify.verify(PUBLICK_KEY, signature, "base64");
     if (!isSignatureValid)
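The licence check is a standard RSA sign/verify roundtrip over the serialized payload. A self-contained sketch with a throwaway keypair; the real package instead pins a fixed `PUBLICK_KEY` at build time and keeps the private key on the issuer side:

```js
const crypto = require("crypto");

// Throwaway keypair for illustration only.
const { publicKey, privateKey } = crypto.generateKeyPairSync("rsa", { modulusLength: 2048 });

const payload = JSON.stringify({ customer: "acme", expires: "2027-01-01T00:00:00.000Z" });

// Sign with the private key (issuer side).
const sign = crypto.createSign("SHA256");
sign.update(payload);
sign.end();
const signature = sign.sign(privateKey, "base64");

// Verify with the public key (package side).
const verify = crypto.createVerify("SHA256");
verify.update(payload);
verify.end();
console.log(verify.verify(publicKey, signature, "base64")); // true
```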
@@ -16680,9 +16818,10 @@ var import_path18 = __toESM(require("path"));
 var import_fs11 = __toESM(require("fs"));
 var import_readline6 = __toESM(require("readline"));
 var import_promises8 = __toESM(require("fs/promises"));
+var import_crypto5 = __toESM(require("crypto"));

 // ../../packages/engines/src/CryptoEngine.ts
-var
+var import_crypto3 = __toESM(require("crypto"), 1);

 // ../../packages/engines/src/RandomEngine.ts
 var import_seedrandom = __toESM(require("seedrandom"), 1);
@@ -16748,7 +16887,7 @@ var CryptoEngineClass = class {
     };
     this.valueToHash = (value) => {
       const textValue = JSON.stringify(value);
-      return
+      return import_crypto3.default.createHash("sha256").update(textValue).digest("hex");
     };
     this.hashValue = (maskType, value, valueType) => {
       if (!Algo_default.hasVal(value)) return value;
@@ -17064,7 +17203,7 @@ var AutoMapperEngine_default = AutoMapperEngine;

 // ../../packages/engines/src/producer/ProducerEngine.ts
 var import_path14 = __toESM(require("path"), 1);
-var
+var import_crypto4 = require("crypto");

 // ../../packages/engines/src/transform/TypeCaster.ts
 var import_dayjs = __toESM(require("dayjs"), 1);
@@ -17270,7 +17409,7 @@ var ProducerEngineClass = class {
     let effectiveProducer = producer;
     let effectiveSource = source;
     if (compressionType) {
-      scope = { id: (0,
+      scope = { id: (0, import_crypto4.randomUUID)(), folder: `sample-${(0, import_crypto4.randomUUID)()}`, workersId: [] };
       const driver = await DriverFactory_default.instantiateSource(source);
       const readyResult = await driver.ready({ producer, scope });
       const firstFile = readyResult.files[0].fullUri;
@@ -18419,8 +18558,9 @@ var TransformationEngineClass = class {
         throw new Error(`Cannot apply combine_fields transformation without record context in field '${field.key}'`);
       }
       const { fields, separator = "", template } = transformations.combine_fields;
+      const currentFieldKey = field.alias ?? field.key;
       const fieldValues = fields.map((fieldName) => {
-        const fieldValue = record[fieldName];
+        const fieldValue = fieldName === currentFieldKey ? value : record[fieldName];
         return fieldValue !== null && fieldValue !== void 0 ? String(fieldValue) : "";
       });
       if (template) {
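This fix makes `combine_fields` read the already-transformed value of the field it is attached to, rather than the raw record value, so earlier transformations on that field are no longer silently discarded. A simplified sketch of the behaviour; the field and record shapes are illustrative, not the package's exact types:

```js
// Before the fix, the combiner always read record[fieldName], so a prior
// transformation on the current field (masking, casing, ...) was lost.
function combineFields(record, field, value, { fields, separator = "" }) {
  const currentFieldKey = field.alias ?? field.key;
  const parts = fields.map((name) =>
    name === currentFieldKey ? value : record[name] ?? ""
  );
  return parts.join(separator);
}

const record = { first: "ada", last: "LOVELACE" };
// `value` is the current field's value after prior transformations (upper-cased here).
console.log(combineFields(record, { key: "first" }, "ADA", { fields: ["first", "last"], separator: " " }));
// "ADA LOVELACE"
```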
@@ -19180,12 +19320,15 @@ var ConsumerExecutorClass = class {
     const lineReader = import_readline6.default.createInterface({ input: reader, crlfDelay: Infinity });
     const tempWorkPath = datasetPath + "_tmp";
     const writer = import_fs11.default.createWriteStream(tempWorkPath);
+    const waitForDrain = () => new Promise((resolve) => writer.once("drain", resolve));
     let newLineCount = 0;
     const seen = /* @__PURE__ */ new Set();
     for await (const line of lineReader) {
-
-
-
+      const hash = import_crypto5.default.createHash("sha1").update(line).digest("base64");
+      if (!seen.has(hash)) {
+        seen.add(hash);
+        if (!writer.write(line + "\n"))
+          await waitForDrain();
         newLineCount++;
       }
     }
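Deduplication now keeps a SHA-1 digest per distinct line instead of the line itself, which bounds the Set's memory on wide records: a 20-byte digest is 28 characters in base64 regardless of line length. A sketch of the tradeoff; the numbers are illustrative:

```js
const crypto = require("crypto");

const seen = new Set();
function isDuplicate(line) {
  // ~28-character base64 digest whatever the line length; a 2 KB record
  // would otherwise cost ~2 KB of Set memory per distinct line.
  const hash = crypto.createHash("sha1").update(line).digest("base64");
  if (seen.has(hash)) return true;
  seen.add(hash);
  return false;
}

console.log(isDuplicate("a,b,c")); // false
console.log(isDuplicate("a,b,c")); // true
```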
@@ -19230,8 +19373,10 @@ var ConsumerExecutorClass = class {
     lineReader.close();
     const tempWorkPath = datasetPath + "_tmp";
     const writer = import_fs11.default.createWriteStream(tempWorkPath);
+    const waitForDrain = () => new Promise((resolve) => writer.once("drain", resolve));
     for (const { line } of winners.values()) {
-      writer.write(line + "\n")
+      if (!writer.write(line + "\n"))
+        await waitForDrain();
     }
     await new Promise((resolve, reject) => {
       writer.on("close", resolve);
@@ -19305,6 +19450,7 @@ var ConsumerExecutorClass = class {
     ];
     const tempWorkPath = datasetPath + "_tmp";
     const writer = import_fs11.default.createWriteStream(tempWorkPath);
+    const waitForDrain = () => new Promise((resolve) => writer.once("drain", resolve));
     let outputCount = 0;
     for (const { rowRecord, cells } of groups.values()) {
       const outputRecord = { ...rowRecord };
@@ -19334,7 +19480,8 @@ var ConsumerExecutorClass = class {
         }
       }
       const line = OutputExecutor_default.outputRecord(outputRecord, consumer, pivotedFields);
-      writer.write(line + "\n")
+      if (!writer.write(line + "\n"))
+        await waitForDrain();
       outputCount++;
     }
     await new Promise((resolve, reject) => {
@@ -19581,6 +19728,7 @@ var ExecutorWriter = class {
     let writerIndex = 0;
     let destPath = this.getCompletedPath(sourcePath, writerIndex);
     let writeStream = fs19.createWriteStream(destPath, { flags: "a" });
+    const waitForDrain = () => new Promise((resolve) => writeStream.once("drain", resolve));
     for await (const line of reader) {
       if (readStream.bytesRead > maxOutputFileSize * (writerIndex + 1)) {
         writeStream.end();
@@ -19592,7 +19740,8 @@ var ExecutorWriter = class {
         destPath = this.getCompletedPath(sourcePath, writerIndex);
         writeStream = fs19.createWriteStream(destPath, { flags: "a" });
       }
-      writeStream.write(line + "\n")
+      if (!writeStream.write(line + "\n"))
+        await waitForDrain();
     }
     writeStream.end();
     await new Promise((resolve, reject) => {
@@ -19673,7 +19822,9 @@ var ExecutorOrchestratorClass = class {
   };
   this.launch = async (request) => {
     Affirm_default(request, "Invalid options");
-
+    await Environment_default.refreshIfNeeded();
+    const { details, logProgress, options } = request;
+    const consumer = Environment_default.getConsumer(request.consumer.name) ?? request.consumer;
     Affirm_default(consumer, "Invalid consumer");
     Affirm_default(details, "Invalid execution details");
     const tracker = new ExecutorPerformance_default();
@@ -19843,7 +19994,8 @@ var ExecutorOrchestratorClass = class {
     if (fileSize < Constants_default.defaults.MIN_FILE_SIZE_FOR_PARALLEL) {
       return [{ start: 0, end: fileSize, isFirstChunk: true, fileUri }];
     }
-    const
+    const availableCores = Math.max(1, Math.floor(import_os.default.cpus().length * 0.75));
+    const cpus = numChunks ?? Math.min(availableCores, Constants_default.defaults.MAX_THREAD_COUNT);
     const maxChunksBySize = Math.floor(fileSize / Constants_default.defaults.MIN_CHUNK_SIZE);
     const effectiveChunks = Math.min(cpus, maxChunksBySize);
     if (effectiveChunks <= 1) return [{ start: 0, end: fileSize, isFirstChunk: true, fileUri }];
@@ -19852,7 +20004,7 @@ var ExecutorOrchestratorClass = class {
   try {
     const offsets = [];
     let currentStart = 0;
-    for (let i = 0; i <
+    for (let i = 0; i < effectiveChunks - 1; i++) {
       const targetEnd = currentStart + targetChunkSize;
       if (targetEnd >= fileSize) {
         break;
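Chunk count is now capped at 75% of available cores and the new `MAX_THREAD_COUNT` of 99, then further limited by a minimum chunk size. A sketch of the sizing arithmetic; `MIN_CHUNK_SIZE` is an assumed stand-in since the diff does not show its value:

```js
const os = require("os");

// Illustrative stand-ins for the package constants.
const MAX_THREAD_COUNT = 99;
const MIN_CHUNK_SIZE = 10 * 1024 * 1024; // assumed 10 MB floor per chunk

function planChunks(fileSize, numChunks) {
  const availableCores = Math.max(1, Math.floor(os.cpus().length * 0.75));
  const cpus = numChunks ?? Math.min(availableCores, MAX_THREAD_COUNT);
  const maxChunksBySize = Math.floor(fileSize / MIN_CHUNK_SIZE);
  return Math.min(cpus, maxChunksBySize);
}

// e.g. a 100 MB file on a 16-core machine: min(12, 10) = 10 chunks.
console.log(planChunks(100 * 1024 * 1024));
```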
@@ -19926,6 +20078,7 @@ var ExecutorOrchestratorClass = class {
       import_fs12.default.createReadStream(workerResult.resultUri),
       import_fs12.default.createWriteStream(mainPath, { flags: "a" })
     );
+    await import_promises9.default.unlink(workerResult.resultUri);
   }
   tracker.measure("merge-workers", performance.now() - perf);
   Logger_default.log(`[${scope.id}] Merge complete in ${Math.round(performance.now() - perf)}ms`);
package/json_schemas/project-schema.json
CHANGED

@@ -63,6 +63,18 @@
       "minimum": 1,
       "description": "Maximum number of rows for SQL queries"
     },
+    "API_QUEUE": {
+      "type": "object",
+      "required": ["source"],
+      "additionalProperties": false,
+      "properties": {
+        "source": {
+          "type": "string",
+          "minLength": 1,
+          "description": "Name of an aws-sqs source used as an alternative execute-consumer API"
+        }
+      }
+    },
     "STRING_MAX_CHARACTERS_LENGTH": {
       "type": "integer",
       "minimum": 1,
@@ -88,7 +100,10 @@
     "sources": ["/sources"],
     "schemas": ["/schemas"],
     "settings": {
-      "SQL_MAX_QUERY_ROWS": 10000
+      "SQL_MAX_QUERY_ROWS": 10000,
+      "API_QUEUE": {
+        "source": "remora-api-queue"
+      }
     }
   }
 ]
package/json_schemas/source-schema.json
CHANGED

@@ -22,6 +22,7 @@
       "aws-redshift",
       "aws-dynamodb",
       "aws-s3",
+      "aws-sqs",
       "postgres",
       "local",
       "delta-share",
@@ -51,6 +52,10 @@
       "type": "string",
       "description": "Hostname or endpoint of the data source"
     },
+    "accountId": {
+      "type": "string",
+      "description": "AWS account ID used to construct resource URLs when only a resource name is provided"
+    },
     "user": {
       "type": "string",
       "description": "Username for authentication"
@@ -95,6 +100,10 @@
       "type": "string",
       "description": "AWS region"
     },
+    "queue": {
+      "type": "string",
+      "description": "SQS queue URL or queue name"
+    },
     "bucket": {
       "type": "string",
       "description": "S3 bucket name"
@@ -202,6 +211,20 @@
     },
     "_version": 2
   },
+  {
+    "name": "Consumer Execute Queue",
+    "description": "SQS queue used as an alternative execute-consumer API",
+    "engine": "aws-sqs",
+    "authentication": {
+      "method": "access-secret-key",
+      "queue": "remora-execute-consumer",
+      "accountId": "{AWS_ACCOUNT_ID}",
+      "region": "us-east-1",
+      "accessKey": "{AWS_ACCESS_KEY_ID}",
+      "secretKey": "{AWS_SECRET_ACCESS_KEY}"
+    },
+    "_version": 1
+  },
   {
     "name": "Redshift Data Warehouse",
     "engine": "aws-redshift",
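When `queue` is a bare name rather than an `https://` URL, the schema's `accountId` and `region` fields supply the missing pieces of a full SQS URL. A hedged sketch of how such a URL is conventionally assembled; the package's actual resolver is not shown in this diff, so the helper below is hypothetical:

```js
// Hypothetical helper: the diff only shows the schema fields, not the resolver.
function resolveQueueUrl({ queue, region, accountId }) {
  if (queue.startsWith("https://")) return queue; // already a full SQS URL
  return `https://sqs.${region}.amazonaws.com/${accountId}/${queue}`;
}

console.log(resolveQueueUrl({
  queue: "remora-execute-consumer",
  region: "us-east-1",
  accountId: "123456789012",
}));
// https://sqs.us-east-1.amazonaws.com/123456789012/remora-execute-consumer
```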
package/package.json
CHANGED

-  "version": "1.2.5",
+  "version": "1.2.6",

package/workers/ExecutorWorker.js
CHANGED
@@ -13512,12 +13512,10 @@ var CONSTANTS = {
   REMORA_PATH: "./remora",
   PRODUCER_TEMP_FOLDER: ".temp",
   SQL_MAX_QUERY_ROWS: 1e4,
-  STRING_MAX_CHARACTERS_LENGTH: 1e7,
-  MAX_ITEMS_IN_MEMORY: 2e5,
   MIN_RUNTIME_HEAP_MB: 4e3,
   RECOMMENDED_RUNTIME_HEAP_MB: 8e3,
   INDICATIVE_THREAD_LINE_COUNT: 75e4,
-  MAX_THREAD_COUNT:
+  MAX_THREAD_COUNT: 99,
   /**
    * Minimum file size to consider parallel processing (10 MB)
    */
@@ -13599,6 +13597,8 @@ var ExecutorScope_default = ExecutorScope;

 // ../../packages/common/src/Environment.ts
 var import_fs4 = __toESM(require("fs"), 1);
+var import_crypto = __toESM(require("crypto"), 1);
+var import_adm_zip = __toESM(require("adm-zip"), 1);
 var import_path5 = __toESM(require("path"), 1);

 // ../../packages/common/src/schema/SchemaValidator.ts
@@ -13720,6 +13720,12 @@ var ValidatorClass = class {
       const source = sources[i];
       if (source.engine === "local" && !source.authentication.path)
         errors.push(`For source ${source.name}, the path has not been configured`);
+      if (source.engine === "aws-sqs") {
+        if (!source.authentication.queue)
+          errors.push(`For source ${source.name}, the queue has not been configured`);
+        if (!source.authentication.region && !source.authentication.queue?.startsWith("https://"))
+          errors.push(`For source ${source.name}, the region has not been configured`);
+      }
     }
   } catch (e) {
     if (errors.length === 0)
@@ -14010,8 +14016,11 @@ var ValidatorClass = class {
       if (!auth) continue;
       const ctx = (field) => `source "${source.name}" authentication.${field}`;
       checkValue(auth.accessKey, ctx("accessKey"));
+      checkValue(auth.accountId, ctx("accountId"));
       checkValue(auth.secretKey, ctx("secretKey"));
       checkValue(auth.sessionToken, ctx("sessionToken"));
+      checkValue(auth.queue, ctx("queue"));
+      checkValue(auth.region, ctx("region"));
       checkValue(auth.bearerToken, ctx("bearerToken"));
       checkValue(auth.url, ctx("url"));
       checkValue(auth.apiKey, ctx("apiKey"));
@@ -14030,6 +14039,8 @@ var ValidatorClass = class {
     switch (engine) {
       case "aws-dynamodb":
         return "no-sql";
+      case "aws-sqs":
+        return "no-sql";
      case "aws-redshift":
      case "postgres":
        return "sql";
@@ -14049,10 +14060,72 @@ var Validator_default = Validator;
 var EnvironmentClass = class {
   constructor() {
     this._env = null;
+    this._configUri = null;
+    this._localPath = null;
+    this._configHash = null;
+    this._lastEtag = null;
+    this._projectSettings = null;
     this.init = (env) => {
       this._env = env;
     };
+    /**
+     * Load environment from a local path or remote URL (e.g. S3 presigned URL).
+     * Remote configs are downloaded as a zip and extracted to the local cache path.
+     * Falls back to `REMORA_CONFIG_URI` env variable if no URI is provided.
+     */
+    this.loadFromUri = async (uri) => {
+      const configUri = uri || process.env.REMORA_CONFIG_URI;
+      Affirm_default(configUri, "No configuration URI provided. Set REMORA_CONFIG_URI environment variable or pass a URI.");
+      this._configUri = configUri;
+      if (this._isRemoteUri(configUri)) {
+        this._localPath = this._resolveLocalCachePath();
+        await this._downloadRemoteConfig(configUri);
+      } else {
+        this._localPath = configUri;
+      }
+      this.load(this._localPath);
+    };
+    /**
+     * Check if the configuration has changed and reload if so.
+     * For remote URIs, re-downloads using ETag caching (HTTP 304 avoids redundant downloads).
+     * For local paths, compares a SHA-256 hash of all config JSON files against the last loaded hash.
+     */
+    this.refreshIfNeeded = async () => {
+      if (!this._localPath && !this._configUri) return;
+      if (this._configUri && this._isRemoteUri(this._configUri)) {
+        const changed = await this._downloadRemoteConfig(this._configUri);
+        if (!changed) return;
+      }
+      const newHash = this._computeConfigHash();
+      if (newHash !== this._configHash) {
+        Logger_default.log("Configuration change detected, reloading environment");
+        this.load(this._localPath);
+      }
+    };
+    /**
+     * Synchronous load for worker threads. Reads `REMORA_CONFIG_URI` to determine
+     * the config source — for remote URIs it loads from the local cache (already downloaded
+     * by the main thread), for local paths it loads directly.
+     */
+    this.loadFromResolvedUri = () => {
+      const configUri = process.env.REMORA_CONFIG_URI;
+      if (!configUri) {
+        this.load("./");
+        return;
+      }
+      if (this._isRemoteUri(configUri)) {
+        this.load(this._resolveLocalCachePath());
+      } else {
+        this.load(configUri);
+      }
+    };
+    /**
+     * Load environment configuration from a local directory.
+     * Reads `{remoraPath}/remora/project.json` and all referenced sources, producers,
+     * consumers and schemas, validates them against JSON schemas, and initializes the singleton.
+     */
     this.load = (remoraPath) => {
+      this._localPath = remoraPath;
       const envPath = import_path5.default.join(remoraPath, "remora");
       const projectPath = import_path5.default.join(envPath, "project.json");
       if (!import_fs4.default.existsSync(projectPath))
@@ -14103,6 +14176,14 @@ var EnvironmentClass = class {
         if (!SchemaValidator_default.validate("source-schema", source))
           throw new Error(`Invalid source configuration: ${source.name}`);
       });
+      const projectApiQueueSource = projectConfig.settings.API_QUEUE?.source;
+      if (projectApiQueueSource) {
+        const source = sources.find((item) => item.name === projectApiQueueSource);
+        if (!source)
+          throw new Error(`Invalid project configuration: API_QUEUE source "${projectApiQueueSource}" was not found`);
+        if (source.engine !== "aws-sqs")
+          throw new Error(`Invalid project configuration: API_QUEUE source "${projectApiQueueSource}" must use engine "aws-sqs"`);
+      }
       const producers = loadConfigurations(envPath, projectConfig.producers);
       producers.forEach((producer) => {
         if (!SchemaValidator_default.validate("producer-schema", producer))
@@ -14116,13 +14197,10 @@ var EnvironmentClass = class {
       const envSettings = new Map(Object.entries({ ...projectConfig.settings }).map(([key, value]) => [key, String(value)]));
       if (!envSettings.has("SQL_MAX_QUERY_ROWS"))
         envSettings.set("SQL_MAX_QUERY_ROWS", Constants_default.defaults.SQL_MAX_QUERY_ROWS.toString());
-      if (!envSettings.has("STRING_MAX_CHARACTERS_LENGTH"))
-        envSettings.set("STRING_MAX_CHARACTERS_LENGTH", Constants_default.defaults.STRING_MAX_CHARACTERS_LENGTH.toString());
-      if (!envSettings.has("MAX_ITEMS_IN_MEMORY"))
-        envSettings.set("MAX_ITEMS_IN_MEMORY", Constants_default.defaults.MAX_ITEMS_IN_MEMORY.toString());
       const debugMode = process.env.REMORA_DEBUG_MODE;
       if (debugMode && debugMode.toLowerCase() === "true")
         Logger_default.setLevel("debug");
+      this._projectSettings = projectConfig.settings;
       this.init({
         settings: envSettings,
         sources,
@@ -14133,10 +14211,14 @@ var EnvironmentClass = class {
         // TODO: Add SQL library loading if needed
       });
       Logger_default.log(`Environment loaded: ${sources.length} source(s), ${producers.length} producer(s), ${consumers.length} consumer(s)`);
+      this._configHash = this._computeConfigHash();
     };
     this.get = (setting) => {
       return this._env.settings.get(setting);
     };
+    this.getProjectSettings = () => {
+      return this._projectSettings;
+    };
     this.getSource = (sourceName) => {
       Affirm_default(sourceName, "Invalid source name");
       return this._env.sources.find((x) => x.name === sourceName);
@@ -14215,6 +14297,58 @@ ${ce.map((x) => ` -${x}
     }
     return errors;
   };
+    this._isRemoteUri = (uri) => {
+      return uri.startsWith("http://") || uri.startsWith("https://");
+    };
+    this._resolveLocalCachePath = () => {
+      return process.env.REMORA_LOCAL_CACHE_PATH || "/app";
+    };
+    this._downloadRemoteConfig = async (url) => {
+      const headers = {};
+      if (this._lastEtag)
+        headers["If-None-Match"] = this._lastEtag;
+      const response = await fetch(url, { headers });
+      if (response.status === 304) return false;
+      if (!response.ok)
+        throw new Error(`Failed to download configuration from ${url}: HTTP ${response.status}`);
+      const etag = response.headers.get("etag");
+      if (etag) this._lastEtag = etag;
+      const buffer = Buffer.from(await response.arrayBuffer());
+      const zip = new import_adm_zip.default(buffer);
+      const extractPath = import_path5.default.join(this._localPath, "remora");
+      if (import_fs4.default.existsSync(extractPath)) {
+        for (const file of import_fs4.default.readdirSync(extractPath)) {
+          if (file === "temp") continue;
+          const filePath = import_path5.default.join(extractPath, file);
+          if (import_fs4.default.statSync(filePath).isDirectory())
+            import_fs4.default.rmSync(filePath, { recursive: true, force: true });
+          else
+            import_fs4.default.unlinkSync(filePath);
+        }
+      }
+      zip.extractAllTo(extractPath, true);
+      Logger_default.log(`Remote configuration downloaded and extracted from ${url}`);
+      return true;
+    };
+    this._computeConfigHash = () => {
+      if (!this._localPath) return null;
+      const envPath = import_path5.default.join(this._localPath, "remora");
+      if (!import_fs4.default.existsSync(envPath)) return null;
+      const hash = import_crypto.default.createHash("sha256");
+      const hashDir = (dirPath) => {
+        if (!import_fs4.default.existsSync(dirPath)) return;
+        for (const entry of import_fs4.default.readdirSync(dirPath, { withFileTypes: true })) {
+          if (entry.name === "temp" || entry.name === ".temp") continue;
+          const fullEntry = import_path5.default.join(dirPath, entry.name);
+          if (entry.isDirectory())
+            hashDir(fullEntry);
+          else if (entry.name.endsWith(".json"))
+            hash.update(import_fs4.default.readFileSync(fullEntry, "utf-8"));
+        }
+      };
+      hashDir(envPath);
+      return hash.digest("hex");
+    };
   }
 };
 var Environment = new EnvironmentClass();
@@ -14225,9 +14359,10 @@ var import_path15 = __toESM(require("path"));
 var import_fs9 = __toESM(require("fs"));
 var import_readline6 = __toESM(require("readline"));
 var import_promises8 = __toESM(require("fs/promises"));
+var import_crypto4 = __toESM(require("crypto"));

 // ../../packages/engines/src/CryptoEngine.ts
-var
+var import_crypto2 = __toESM(require("crypto"), 1);

 // ../../packages/engines/src/RandomEngine.ts
 var import_seedrandom = __toESM(require("seedrandom"), 1);
@@ -14293,7 +14428,7 @@ var CryptoEngineClass = class {
     };
     this.valueToHash = (value) => {
       const textValue = JSON.stringify(value);
-      return
+      return import_crypto2.default.createHash("sha256").update(textValue).digest("hex");
     };
     this.hashValue = (maskType, value, valueType) => {
       if (!Algo_default.hasVal(value)) return value;
@@ -14608,7 +14743,7 @@ var AutoMapperEngine = new AutoMapperEngineClass();

 // ../../packages/engines/src/producer/ProducerEngine.ts
 var import_path11 = __toESM(require("path"), 1);
-var
+var import_crypto3 = require("crypto");

 // ../../packages/drivers/src/DeltaShareDriver.ts
 var DeltaShareSourceDriver = class {
@@ -15555,11 +15690,13 @@ var DriverHelper = {
     const { append, destinationPath, objects, delimiter } = options;
     const writeOptions = append ? { flags: "a" } : {};
     const writeStream = (0, import_fs5.createWriteStream)(destinationPath, writeOptions);
+    const waitForDrain = () => new Promise((resolve) => writeStream.once("drain", resolve));
     let lineCount = 0;
     const keys = Object.keys(objects[0]);
     for (const obj of objects) {
       const serialized = keys.map((k) => obj[k]).join(delimiter) + "\n";
-      writeStream.write(serialized)
+      if (!writeStream.write(serialized))
+        await waitForDrain();
       lineCount++;
     }
     writeStream.end();
@@ -15906,8 +16043,10 @@ var LocalDestinationDriver = class {
     const reader = fs8.createReadStream(fromPath);
     const lineReader = import_readline3.default.createInterface({ input: reader, crlfDelay: Infinity });
     const writer = fs8.createWriteStream(toFilePath);
+    const waitForDrain = () => new Promise((resolve) => writer.once("drain", resolve));
     for await (const line of lineReader) {
-      writer.write(transform(line) + "\n")
+      if (!writer.write(transform(line) + "\n"))
+        await waitForDrain();
     }
     writer.end();
     await new Promise((resolve, reject) => {
@@ -16600,7 +16739,7 @@ var ProducerEngineClass = class {
     let effectiveProducer = producer;
     let effectiveSource = source;
     if (compressionType) {
-      scope = { id: (0,
+      scope = { id: (0, import_crypto3.randomUUID)(), folder: `sample-${(0, import_crypto3.randomUUID)()}`, workersId: [] };
       const driver = await DriverFactory_default.instantiateSource(source);
       const readyResult = await driver.ready({ producer, scope });
       const firstFile = readyResult.files[0].fullUri;
@@ -17749,8 +17888,9 @@ var TransformationEngineClass = class {
         throw new Error(`Cannot apply combine_fields transformation without record context in field '${field.key}'`);
       }
       const { fields, separator = "", template } = transformations.combine_fields;
+      const currentFieldKey = field.alias ?? field.key;
       const fieldValues = fields.map((fieldName) => {
-        const fieldValue = record[fieldName];
+        const fieldValue = fieldName === currentFieldKey ? value : record[fieldName];
         return fieldValue !== null && fieldValue !== void 0 ? String(fieldValue) : "";
       });
       if (template) {
@@ -18779,12 +18919,15 @@ var ConsumerExecutorClass = class {
     const lineReader = import_readline6.default.createInterface({ input: reader, crlfDelay: Infinity });
     const tempWorkPath = datasetPath + "_tmp";
     const writer = import_fs9.default.createWriteStream(tempWorkPath);
+    const waitForDrain = () => new Promise((resolve) => writer.once("drain", resolve));
     let newLineCount = 0;
     const seen = /* @__PURE__ */ new Set();
     for await (const line of lineReader) {
-
-
-
+      const hash = import_crypto4.default.createHash("sha1").update(line).digest("base64");
+      if (!seen.has(hash)) {
+        seen.add(hash);
+        if (!writer.write(line + "\n"))
+          await waitForDrain();
         newLineCount++;
       }
     }
@@ -18829,8 +18972,10 @@ var ConsumerExecutorClass = class {
     lineReader.close();
     const tempWorkPath = datasetPath + "_tmp";
     const writer = import_fs9.default.createWriteStream(tempWorkPath);
+    const waitForDrain = () => new Promise((resolve) => writer.once("drain", resolve));
     for (const { line } of winners.values()) {
-      writer.write(line + "\n")
+      if (!writer.write(line + "\n"))
+        await waitForDrain();
     }
     await new Promise((resolve, reject) => {
       writer.on("close", resolve);
@@ -18904,6 +19049,7 @@ var ConsumerExecutorClass = class {
     ];
     const tempWorkPath = datasetPath + "_tmp";
     const writer = import_fs9.default.createWriteStream(tempWorkPath);
+    const waitForDrain = () => new Promise((resolve) => writer.once("drain", resolve));
     let outputCount = 0;
     for (const { rowRecord, cells } of groups.values()) {
       const outputRecord = { ...rowRecord };
@@ -18933,7 +19079,8 @@ var ConsumerExecutorClass = class {
         }
       }
       const line = OutputExecutor_default.outputRecord(outputRecord, consumer, pivotedFields);
-      writer.write(line + "\n")
+      if (!writer.write(line + "\n"))
+        await waitForDrain();
       outputCount++;
     }
     await new Promise((resolve, reject) => {
@@ -19136,6 +19283,12 @@ var Executor = class {
     Logger_default.log(`[${workerId}] Opened streams \u2014 chunk ${start}-${end} (${Math.round(totalBytes / 1024)}KB), ${fields.length} field(s)`);
     Logger_default.log(`[${workerId}] Starting line-by-line processing`);
     const lineStream = import_readline7.default.createInterface({ input: readStream, crlfDelay: Infinity });
+    let drainCount = 0;
+    const waitForDrain = () => {
+      drainCount++;
+      return new Promise((resolve) => writeStream.once("drain", resolve));
+    };
+    const isDebug = Logger_default._level === "debug";
     for await (const line of lineStream) {
       if (lineIndex === 0 && isFirstChunk) {
         if (!this.shouldProcessFirstLine(producer)) {
@@ -19154,11 +19307,11 @@ var Executor = class {
       });
       this._performance.measure("process-line", performance.now() - perf);
       if (!record) {
-        Logger_default.log(`[${workerId}] Line ${lineIndex}: skipped by producer (no record produced)`);
+        if (isDebug) Logger_default.log(`[${workerId}] Line ${lineIndex}: skipped by producer (no record produced)`);
         lineIndex++;
         continue;
       }
-      Logger_default.log(`[${workerId}] Line ${lineIndex}: parsed by producer`);
+      if (isDebug) Logger_default.log(`[${workerId}] Line ${lineIndex}: parsed by producer`);
       perf = performance.now();
       record = ConsumerExecutor_default.processRecord({
         record,
@@ -19171,16 +19324,17 @@ var Executor = class {
       });
      this._performance.measure("process-record", performance.now() - perf);
      if (!record) {
-        Logger_default.log(`[${workerId}] Line ${lineIndex}: filtered out by consumer`);
+        if (isDebug) Logger_default.log(`[${workerId}] Line ${lineIndex}: filtered out by consumer`);
         lineIndex++;
         continue;
       }
-      Logger_default.log(`[${workerId}] Line ${lineIndex}: processed by consumer`);
+      if (isDebug) Logger_default.log(`[${workerId}] Line ${lineIndex}: processed by consumer`);
       perf = performance.now();
       const outputLine = OutputExecutor_default.outputRecord(record, consumer, fields);
       this._performance.measure("output-record", performance.now() - perf);
-      Logger_default.log(`[${workerId}] Line ${lineIndex}: output written (record #${totalOutputCount + 1})`);
-      writeStream.write(outputLine + "\n")
+      if (isDebug) Logger_default.log(`[${workerId}] Line ${lineIndex}: output written (record #${totalOutputCount + 1})`);
+      if (!writeStream.write(outputLine + "\n"))
+        await waitForDrain();
       totalOutputCount++;
       lineIndex++;
       if (reportWork && lineIndex % this._REPORT_WORK_AFTER_LINES === 0) {
@@ -19192,7 +19346,8 @@ var Executor = class {
         break;
       }
     }
-    Logger_default.log(`[${workerId}] Line processing complete \u2014 ${lineIndex} lines read, ${totalOutputCount} records written`);
+    Logger_default.log(`[${workerId}] Line processing complete \u2014 ${lineIndex} lines read, ${totalOutputCount} records written, ${drainCount} drain(s)`);
+    this._performance.measure("drain-count", drainCount);
     writeStream.end();
     await new Promise((resolve, reject) => {
       writeStream.on("finish", resolve);
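The per-line log calls were a hidden hot-path cost: even when the logger discards a message, the template literal is still evaluated for every line. Guarding with a precomputed `isDebug` flag skips both the string build and the call. A minimal sketch; the logger shape mirrors the `Logger_default._level` field used above but is otherwise illustrative:

```js
// Illustrative logger; the package's Logger_default exposes a _level field.
const logger = { _level: process.env.DEBUG ? "debug" : "info", log: console.log };

// Evaluate the level check once, outside the loop.
const isDebug = logger._level === "debug";

for (let i = 0; i < 1e6; i++) {
  // Without the guard, `line ${i}: processed` is built a million times
  // even when the level is info and the message would be dropped.
  if (isDebug) logger.log(`line ${i}: processed`);
}
```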
@@ -19340,6 +19495,7 @@ var ExecutorWriter = class {
     let writerIndex = 0;
     let destPath = this.getCompletedPath(sourcePath, writerIndex);
     let writeStream = fs16.createWriteStream(destPath, { flags: "a" });
+    const waitForDrain = () => new Promise((resolve) => writeStream.once("drain", resolve));
     for await (const line of reader) {
       if (readStream.bytesRead > maxOutputFileSize * (writerIndex + 1)) {
         writeStream.end();
@@ -19351,7 +19507,8 @@ var ExecutorWriter = class {
         destPath = this.getCompletedPath(sourcePath, writerIndex);
         writeStream = fs16.createWriteStream(destPath, { flags: "a" });
       }
-      writeStream.write(line + "\n")
+      if (!writeStream.write(line + "\n"))
+        await waitForDrain();
     }
     writeStream.end();
     await new Promise((resolve, reject) => {
@@ -19432,7 +19589,9 @@ var ExecutorOrchestratorClass = class {
   };
   this.launch = async (request) => {
     Affirm_default(request, "Invalid options");
-
+    await Environment_default.refreshIfNeeded();
+    const { details, logProgress, options } = request;
+    const consumer = Environment_default.getConsumer(request.consumer.name) ?? request.consumer;
     Affirm_default(consumer, "Invalid consumer");
     Affirm_default(details, "Invalid execution details");
     const tracker = new ExecutorPerformance_default();
@@ -19602,7 +19761,8 @@ var ExecutorOrchestratorClass = class {
     if (fileSize < Constants_default.defaults.MIN_FILE_SIZE_FOR_PARALLEL) {
       return [{ start: 0, end: fileSize, isFirstChunk: true, fileUri }];
     }
-    const
+    const availableCores = Math.max(1, Math.floor(import_os.default.cpus().length * 0.75));
+    const cpus = numChunks ?? Math.min(availableCores, Constants_default.defaults.MAX_THREAD_COUNT);
     const maxChunksBySize = Math.floor(fileSize / Constants_default.defaults.MIN_CHUNK_SIZE);
     const effectiveChunks = Math.min(cpus, maxChunksBySize);
     if (effectiveChunks <= 1) return [{ start: 0, end: fileSize, isFirstChunk: true, fileUri }];
@@ -19611,7 +19771,7 @@ var ExecutorOrchestratorClass = class {
   try {
     const offsets = [];
     let currentStart = 0;
-    for (let i = 0; i <
+    for (let i = 0; i < effectiveChunks - 1; i++) {
       const targetEnd = currentStart + targetChunkSize;
       if (targetEnd >= fileSize) {
         break;
@@ -19685,6 +19845,7 @@ var ExecutorOrchestratorClass = class {
       import_fs11.default.createReadStream(workerResult.resultUri),
       import_fs11.default.createWriteStream(mainPath, { flags: "a" })
     );
+    await import_promises9.default.unlink(workerResult.resultUri);
   }
   tracker.measure("merge-workers", performance.now() - perf);
   Logger_default.log(`[${scope.id}] Merge complete in ${Math.round(performance.now() - perf)}ms`);
@@ -19751,7 +19912,7 @@ var ExecutorOrchestrator = new ExecutorOrchestratorClass();

 // src/workers/ExecutorWorker.ts
 import_dotenv.default.configDotenv();
 var run = async (workerData) => {
-  Environment_default.
+  Environment_default.loadFromResolvedUri();
   if (workerData.loggerConfig)
     Logger_default.initFromConfig(workerData.loggerConfig);
   try {
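Worker threads cannot await the asynchronous remote download, so they call the synchronous `loadFromResolvedUri()` and rely on the main thread having already populated the local cache. A sketch of that division of labour; the file names and worker payload are illustrative:

```js
// main.js — sketch: the main thread downloads the config, then spawns workers.
const { Worker } = require("worker_threads");

async function start(Environment) {
  // Async path: may fetch the zip and extract it to the cache directory.
  await Environment.loadFromUri(process.env.REMORA_CONFIG_URI);
  new Worker("./workers/ExecutorWorker.js", { workerData: { /* chunk details */ } });
}

// ExecutorWorker.js — sketch: the worker reads the already-extracted cache.
function workerBoot(Environment) {
  Environment.loadFromResolvedUri(); // sync: remote URI resolves to the local cache path
}
```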
|