@forzalabs/remora 1.1.14 → 1.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +107 -0
- package/index.js +210 -5
- package/json_schemas/consumer-schema.json +107 -21
- package/package.json +1 -1
- package/workers/ExecutorWorker.js +210 -5
package/CHANGELOG.md
ADDED

@@ -0,0 +1,107 @@
+# Changelog
+
+All notable changes to this project will be documented in this file.
+
+The format is based on Keep a Changelog, and this project adheres to Semantic Versioning.
+
+## Unreleased
+
+### Added
+- Added field-level consumer validations with support for multiple rules per field and per-rule failure actions: `fail`, `skip`, `warn`, and `set_default`
+- Added dataset-level consumer validations for `unique_fields`, `min_rows`, `max_rows`, `no_duplicates`, and `not_empty`
+- Added `DataValidationEngine` to centralize field and dataset validation logic
+- Added validation result type definitions to the definitions package for shared use across engines and executors
+- Added `warn()` logging support for non-fatal validation outcomes
+- Added canary consumer coverage for field-level and dataset-level validations with passing, warning, skipped, defaulted, and failing scenarios
+- Added `verify:local` to the canary package to build the local CLI and run the canary suite against it instead of the published package
+
+### Changed
+- Updated consumer field validation configuration from a single flat validation object to an ordered array of validation rules with explicit `onFail` behavior
+
+### Fixed
+- Fixed the consumer JSON schema to support the new field-level and dataset-level validation configuration
+- Fixed AJV strict-mode compatibility for validation `in` and `not_in` rule arrays by replacing union `type` declarations with `oneOf`
+
+## V 1.1.15 - 2026-03-26
+
+### Added
+- Added validation that consumer field keys exist in the referenced producer's dimensions/measures
+- Added validation that every consumer field defines the required `key` property
+- Added validation that `copyFrom` references a field that appears earlier in the consumer's field list
+- Added validation that `distinctOn` keys and `orderBy` reference fields present in the consumer
+- Added validation that join SQL `${P.field}` and `${producer.field}` references point to valid fields
+
+### Fixed
+- Fixed an environment variable not being exposed to the front-end
+- Fixed the database endpoint selector
+- Fixed worker image volume usage in a cloud environment
+- Fixed worker-thread execution errors being logged only to the terminal by propagating them back to the orchestrator file logger
+- Fixed `ConsumerExecutor.processRecord` error reporting to log step-specific failures for field resolution, aliasing, transformations, and filter evaluation
+
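Several of the 1.1.15 checks concern cross-references inside a consumer definition. The following hypothetical fragment would satisfy them; the `copyFrom`/`distinctOn` layout is an assumption for illustration, and only the constraints themselves come from the changelog.

```js
// `copyFrom` must name a field that appears EARLIER in the field list,
// and `distinctOn` keys / `orderBy` must name fields present in the consumer.
const consumer = {
  fields: [
    { key: "order_id" },                       // defined first...
    { key: "order_ref", copyFrom: "order_id" } // ...so this reference is valid
  ],
  distinctOn: { keys: ["order_id"], orderBy: "order_id" } // both resolve to fields above
};
```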
+## V 1.1.11 - 2026-02-05
+
+### Added
+- Added `startRow` and `startColumn` settings for Excel producers (.xls/.xlsx), allowing users to specify the 1-indexed row and column from which to begin reading data
+
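A sketch of the Excel producer settings above; only `startRow`/`startColumn` and their 1-indexed semantics are documented in the entry, while the surrounding property nesting is an assumption.

```js
// Hypothetical .xlsx producer fragment: skip two header rows and one
// leading column before reading data.
const producer = {
  settings: {
    startRow: 3,   // first data row (1-indexed)
    startColumn: 2 // first data column (1-indexed, i.e. column B)
  }
};
```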
+### Fixed
+- Fixed `MaxListenersExceededWarning` on `WriteStream` during consumer execution by replacing the shared stream merge with per-file append pipelines
+- Fixed the CLI `run` command always exiting with code 1 even on successful runs
+- Fixed incomplete file logging caused by `process.exit()` terminating before winston could flush buffered writes; added `logger.flush()` to worker threads, the orchestrator, and CLI exit paths
+- Added `logger.flush()` before `process.exit()` in all data-processing CLI actions (sample, mock, automap, discover, debug) and worker startup
+- Fixed the CLI `discover` command exiting with code 1 on success instead of code 0
+- Fixed the per-worker `WriteStream` in `Executor.ts` never being closed, risking data loss before distinct/distinctOn post-processing passes
+- Fixed a `Dataset.ts` stream await pattern where `resolve` was never called in the `finish` handler (5 sites: transformStream, sort batches, k-way merge, append), causing promises to hang indefinitely
+- Fixed `ExecutorWriter.ts` not awaiting the intermediate stream flush during file-size-based rotation
+- Fixed `DriverHelper.appendObjectsToUnifiedFile` and `LocalDestinationDriver.transformAndMove` not awaiting stream flush before returning
+
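The `Dataset.ts` entry above describes promises that never settled because `resolve` was not wired to the stream's `finish` event. A minimal sketch of the corrected pattern, in plain Node:

```js
const fs = require("fs");

// Await a write stream by resolving from the "finish" handler; if resolve
// is never called there (the original bug), the returned promise hangs.
function writeAll(path, lines) {
  return new Promise((resolve, reject) => {
    const stream = fs.createWriteStream(path);
    stream.on("finish", resolve);
    stream.on("error", reject);
    for (const line of lines) stream.write(line + "\n");
    stream.end(); // emits "finish" once buffered writes are flushed
  });
}
```

The same discipline (resolving only after `finish`, or flushing before `process.exit()`) is what the surrounding `ExecutorWriter.ts`, `DriverHelper`, and `logger.flush()` fixes apply.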
+## V 1.1.9 - 2026-02-04
+
+### Added
+- Added `switch_case` transformation for mapping specific values to other values (similar to a switch/case statement)
+- Added validation to detect multiple consumer fields reading from the same producer dimension (suggests using `copyFrom` instead)
+- Added detailed logging to the executor orchestrator with usage ID tracing throughout the execution lifecycle
+
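A hedged sketch of how the `switch_case` transformation might be declared: only the transformation name and its map-specific-values behavior are stated above, while the option names (`cases`, `default`) are assumptions.

```js
// Hypothetical field fragment: map raw codes to labels, falling back
// to a default for unmapped values (option names are illustrative).
const field = {
  key: "status_code",
  transformations: [
    { type: "switch_case", cases: { "1": "open", "2": "closed" }, default: "unknown" }
  ]
};
```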
+### Changed
+- Cleaned up CLI execution error output to show concise messages in the console while preserving full stack traces in internal logs
+
+## V 1.1.8 - 2026-02-03
+
+### Added
+- Added `pivot` option to consumers, enabling row-to-column transformation with aggregation (sum, count, avg, min, max)
+- Added `copyFrom` property to consumer fields, allowing a field to be a value copy of another field in the dataset
+
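For `pivot`, a sketch of the row-to-column idea: the available aggregations (sum, count, avg, min, max) are from the changelog, while the property names are assumptions.

```js
// Hypothetical consumer fragment: one output row per region, with
// quarterly revenue spread across columns and summed per cell.
const consumer = {
  pivot: {
    groupBy: "region",    // values that stay as rows
    columns: "quarter",   // values that become new columns
    value: "revenue",     // field being aggregated
    aggregation: "sum"    // one of: sum, count, avg, min, max
  }
};
```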
+## V 1.1.7 - 2026-02-02
+
+### Changed
+- Improved the mock engine
+- Improved logging
+
+## V 1.1.6 - 2026-02-02
+
+### Added
+- Added `--limit` option to the `remora run` command to process only the first N records
+- Added descriptive error messages for failed field transformations with full stack trace preservation
+- Added file logging with rotation (enabled via `REMORA_DEBUG_MODE=true` in production)
+- Added structured logging across key application areas
+
+### Changed
+- Moved `DEBUG_MODE` from project.json settings to the `REMORA_DEBUG_MODE` environment variable
+
+## V 1.1.5 - 2026-02-01
+
+### Added
+- Refactored for a monorepo structure
+- Added a maximum output file size definable from the consumer
+- Added support for nested subfolders inside remora configuration directories (sources, producers, consumers, schemas)
+
+### Fixed
+- Fixed a bug in parsing GZ files
+- Fixed issues with concurrent requests
+
+### Changed
+- Updated the Dockerfile for apps in a monorepo build
+- Updated package.json for workspaces compliance
+- Refactored the internal module structure
+- Removed the `_file` annotations for environment variables
+
+## V 1.0.18
package/index.js
CHANGED

@@ -13357,6 +13357,10 @@ var Logger = class {
       console.info(message);
       FileLogService_default.write("INFO", String(message));
     };
+    this.warn = (message) => {
+      console.warn(message);
+      FileLogService_default.write("WARN", String(message));
+    };
     this.flush = () => FileLogService_default.flush();
     this.close = () => FileLogService_default.close();
     this.error = (error) => {

@@ -13500,7 +13504,7 @@ var import_promises = __toESM(require("fs/promises"), 1);

 // ../../packages/constants/src/Constants.ts
 var CONSTANTS = {
-  cliVersion: "1.1
+  cliVersion: "1.2.1",
   backendVersion: 1,
   backendPort: 5088,
   workerVersion: 2,

@@ -13791,6 +13795,10 @@ var ValidatorClass = class {
     try {
       const recursionErrors = this.detectConsumerRecursion(consumer);
       errors.push(...recursionErrors);
+      for (const [i, field] of consumer.fields.entries()) {
+        if (!field.key)
+          errors.push(`Field at index ${i} in consumer "${consumer.name}" is missing the required "key" property`);
+      }
       const allFieldsWithNoFrom = consumer.fields.filter((x) => x.key === "*" && !x.from);
       if (allFieldsWithNoFrom.length > 0 && consumer.producers.length > 1)
         errors.push(`Field with key "*" was used without specifying the "from" producer and multiple producers were found.`);

@@ -18478,6 +18486,112 @@ var TransformationEngineClass = class {
 var TransformationEngine = new TransformationEngineClass();
 var TransformationEngine_default = TransformationEngine;

+// ../../packages/engines/src/transform/DataValidationEngine.ts
+var DataValidationEngineClass = class {
+  constructor() {
+    this.applyValidations = (value, validations, fieldKey) => {
+      for (const validation of validations) {
+        const passed = this.evaluateRule(value, validation.rule);
+        if (!passed) {
+          return {
+            valid: false,
+            message: this.buildMessage(value, validation.rule, fieldKey),
+            onFail: validation.onFail
+          };
+        }
+      }
+      return { valid: true };
+    };
+    this.evaluateRule = (value, rule) => {
+      if ("required" in rule)
+        return Algo_default.hasVal(value);
+      if ("min" in rule) {
+        if (!Algo_default.hasVal(value)) return true;
+        const num = Number(value);
+        return !isNaN(num) && num >= rule.min;
+      }
+      if ("max" in rule) {
+        if (!Algo_default.hasVal(value)) return true;
+        const num = Number(value);
+        return !isNaN(num) && num <= rule.max;
+      }
+      if ("regex" in rule) {
+        if (!Algo_default.hasVal(value)) return true;
+        return new RegExp(rule.regex).test(String(value));
+      }
+      if ("min_length" in rule) {
+        if (!Algo_default.hasVal(value)) return true;
+        return String(value).length >= rule.min_length;
+      }
+      if ("max_length" in rule) {
+        if (!Algo_default.hasVal(value)) return true;
+        return String(value).length <= rule.max_length;
+      }
+      if ("in" in rule) {
+        return rule.in.includes(value);
+      }
+      if ("not_in" in rule) {
+        return !rule.not_in.includes(value);
+      }
+      return true;
+    };
+    this.buildMessage = (value, rule, fieldKey) => {
+      const preview = Algo_default.hasVal(value) ? JSON.stringify(value) : "null/undefined";
+      if ("required" in rule) return `Field "${fieldKey}" is required but got ${preview}`;
+      if ("min" in rule) return `Field "${fieldKey}" value ${preview} is below minimum ${rule.min}`;
+      if ("max" in rule) return `Field "${fieldKey}" value ${preview} exceeds maximum ${rule.max}`;
+      if ("regex" in rule) return `Field "${fieldKey}" value ${preview} does not match pattern "${rule.regex}"`;
+      if ("min_length" in rule) return `Field "${fieldKey}" value ${preview} is shorter than minimum length ${rule.min_length}`;
+      if ("max_length" in rule) return `Field "${fieldKey}" value ${preview} exceeds maximum length ${rule.max_length}`;
+      if ("in" in rule) return `Field "${fieldKey}" value ${preview} is not in the allowed values`;
+      if ("not_in" in rule) return `Field "${fieldKey}" value ${preview} is in the disallowed values`;
+      return `Field "${fieldKey}" failed validation`;
+    };
+    this.evaluateDatasetValidations = (validations, context) => {
+      const results = [];
+      for (const validation of validations) {
+        const result = this.evaluateDatasetRule(validation, context);
+        if (result) results.push(result);
+      }
+      return results;
+    };
+    this.extractUniqueFieldKeys = (validations) => {
+      return validations.filter((v) => "unique_fields" in v.rule).flatMap((v) => v.rule.unique_fields);
+    };
+    this.hasRule = (validations, ruleKey) => {
+      return validations.some((v) => ruleKey in v.rule);
+    };
+    this.evaluateDatasetRule = (validation, context) => {
+      const { rule, onFail } = validation;
+      const { rowCount, hasDuplicateRows, duplicateFields } = context;
+      if ("not_empty" in rule) {
+        if (rowCount === 0)
+          return { message: "Dataset is empty", onFail };
+      }
+      if ("min_rows" in rule) {
+        if (rowCount < rule.min_rows)
+          return { message: `Dataset has ${rowCount} rows, expected at least ${rule.min_rows}`, onFail };
+      }
+      if ("max_rows" in rule) {
+        if (rowCount > rule.max_rows)
+          return { message: `Dataset has ${rowCount} rows, expected at most ${rule.max_rows}`, onFail };
+      }
+      if ("no_duplicates" in rule) {
+        if (hasDuplicateRows)
+          return { message: "Dataset contains duplicate rows", onFail };
+      }
+      if ("unique_fields" in rule) {
+        const failedFields = rule.unique_fields.filter((f) => duplicateFields.includes(f));
+        if (failedFields.length > 0)
+          return { message: `Duplicate values found in field(s): ${failedFields.join(", ")}`, onFail };
+      }
+      return null;
+    };
+  }
+};
+var DataValidationEngine = new DataValidationEngineClass();
+var DataValidationEngine_default = DataValidationEngine;
+
 // ../../packages/engines/src/usage/DataframeManager.ts
 var DataframeManagerClass = class {
   fill(points, from, to, onlyLastValue, maintainLastValue) {

@@ -18911,8 +19025,9 @@ var ConsumerExecutorClass = class {
       for (const field of fields) {
         const { cField } = field;
         const fieldKey = cField.alias ?? cField.key;
+        let dimension;
         try {
-
+          dimension = dimensions.find((x) => x.name === cField.key);
           if (!dimension) {
             if (cField.fixed && Algo_default.hasVal(cField.default))
               record[fieldKey] = cField.default;

@@ -18921,12 +19036,18 @@ var ConsumerExecutorClass = class {
             else
               throw new Error(`The requested field "${cField.key}" from the consumer is not present in the underlying producer "${producer.name}" (${dimensions.map((x) => x.name).join(", ")})`);
           }
-
+        } catch (error) {
+          const err = new Error(`Resolving dimension for field "${fieldKey}" of producer "${producer.name}" failed (index: ${recordIndex}): ${error.message}`, { cause: error });
+          Logger_default.error(err);
+          throw err;
+        }
+        try {
+          if (cField.alias && dimension && cField.alias !== dimension.name) {
             record[cField.alias] = record[dimension.name];
             delete record[dimension.name];
           }
         } catch (error) {
-          const err = new Error(`
+          const err = new Error(`Aliasing field "${cField.key}" to "${cField.alias}" of producer "${producer.name}" failed (index: ${recordIndex}): ${error.message}`, { cause: error });
           Logger_default.error(err);
           throw err;
         }

@@ -18960,6 +19081,32 @@ var ConsumerExecutorClass = class {
           }
         }
       }
+      for (const field of fields) {
+        const { cField } = field;
+        const fieldKey = cField.alias ?? cField.key;
+        if (cField.validate && cField.validate.length > 0) {
+          const result = DataValidationEngine_default.applyValidations(record[fieldKey], cField.validate, fieldKey);
+          if (!result.valid) {
+            const errorMessage = `Validation failed for field "${fieldKey}" (index: ${recordIndex}): ${result.message}`;
+            switch (result.onFail) {
+              case "set_default":
+                record[fieldKey] = cField.default;
+                break;
+              case "skip":
+                return null;
+              case "warn":
+                Logger_default.warn(errorMessage);
+                break;
+              case "fail":
+              default: {
+                const err = new Error(errorMessage);
+                Logger_default.error(err);
+                throw err;
+              }
+            }
+          }
+        }
+      }
       try {
         for (const dimension of dimensions) {
           const field = fields.find((x) => x.cField.key === dimension.name);

@@ -19200,6 +19347,48 @@ var ConsumerExecutorClass = class {
       return false;
     }
   };
+  this.processDatasetValidation = async (consumer, datasetPath) => {
+    const validations = consumer.validate;
+    if (!validations || validations.length === 0) return [];
+    const internalRecordFormat = OutputExecutor_default._getInternalRecordFormat(consumer);
+    const internalFields = ConsumerManager_default.getExpandedFields(consumer);
+    let rowCount = 0;
+    const seenRows = /* @__PURE__ */ new Set();
+    const fieldValueSets = /* @__PURE__ */ new Map();
+    let hasDuplicateRows = false;
+    const duplicateFields = [];
+    const uniqueFieldKeys = DataValidationEngine_default.extractUniqueFieldKeys(validations);
+    const checkDuplicateRows = DataValidationEngine_default.hasRule(validations, "no_duplicates");
+    for (const fieldKey of uniqueFieldKeys) {
+      fieldValueSets.set(fieldKey, /* @__PURE__ */ new Set());
+    }
+    const reader = import_fs11.default.createReadStream(datasetPath);
+    const lineReader = import_readline6.default.createInterface({ input: reader, crlfDelay: Infinity });
+    for await (const line of lineReader) {
+      rowCount++;
+      if (checkDuplicateRows) {
+        if (seenRows.has(line))
+          hasDuplicateRows = true;
+        else
+          seenRows.add(line);
+      }
+      if (uniqueFieldKeys.length > 0) {
+        const record = internalRecordFormat === "CSV" || internalRecordFormat === "TXT" ? LineParser_default._internalParseCSV(line, internalFields) : LineParser_default._internalParseJSON(line);
+        for (const fieldKey of uniqueFieldKeys) {
+          const valueSet = fieldValueSets.get(fieldKey);
+          const val = String(record[fieldKey] ?? "");
+          if (valueSet.has(val)) {
+            if (!duplicateFields.includes(fieldKey))
+              duplicateFields.push(fieldKey);
+          } else {
+            valueSet.add(val);
+          }
+        }
+      }
+    }
+    lineReader.close();
+    return DataValidationEngine_default.evaluateDatasetValidations(validations, { rowCount, hasDuplicateRows, duplicateFields });
+  };
   /**
    * Compares two values, handling numbers, strings, and dates
    * Returns: negative if a < b, positive if a > b, 0 if equal

@@ -19457,7 +19646,7 @@ var ExecutorOrchestratorClass = class {
     const tracker = new ExecutorPerformance_default();
     const _progress = new ExecutorProgress_default(logProgress);
     const { usageId } = UsageManager_default.startUsage(consumer, details);
-    const scope = { id: usageId, folder: `${consumer.name}_${usageId}`, workersId: [], limitFileSize: consumer.
+    const scope = { id: usageId, folder: `${consumer.name}_${usageId}`, workersId: [], limitFileSize: consumer.maximumFileSize };
     const pool = this.createPool();
     try {
       const start = performance.now();

@@ -19552,6 +19741,22 @@ var ExecutorOrchestratorClass = class {
       postOperation.totalOutputCount = unifiedOutputCount;
       Logger_default.log(`[${usageId}] Pivot complete: ${unifiedOutputCount} rows in ${Math.round(performance.now() - counter)}ms`);
     }
+    if (consumer.validate && consumer.validate.length > 0) {
+      Logger_default.log(`[${usageId}] Running dataset-level validations`);
+      counter = performance.now();
+      const validationResults = await ConsumerExecutor_default.processDatasetValidation(consumer, ExecutorScope_default2.getMainPath(scope));
+      tracker.measure("dataset-validation", performance.now() - counter);
+      for (const result of validationResults) {
+        if (result.onFail === "fail") {
+          const err = new Error(`Dataset validation failed for consumer "${consumer.name}": ${result.message}`);
+          Logger_default.error(err);
+          throw err;
+        } else if (result.onFail === "warn") {
+          Logger_default.warn(`Dataset validation warning for consumer "${consumer.name}": ${result.message}`);
+        }
+      }
+      Logger_default.log(`[${usageId}] Dataset validations complete in ${Math.round(performance.now() - counter)}ms`);
+    }
     counter = performance.now();
     Logger_default.log(`[${usageId}] Exporting results to ${consumer.outputs.length} output(s)`);
     const exportRes = await OutputExecutor_default.exportResult(consumer, ConsumerManager_default.getExpandedFields(consumer), scope);
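The `DataValidationEngine` bundled above is a pure function of a value, its rule list, and a field key. An illustrative call (inputs invented) shows the result shape that the record-processing code switches on:

```js
// First failing rule wins: `required` passes for -5, `min` fails, so the
// result carries that rule's message and onFail action.
const result = DataValidationEngine_default.applyValidations(
  -5,
  [
    { rule: { required: true }, onFail: "fail" },
    { rule: { min: 0 }, onFail: "set_default" }
  ],
  "amount"
);
// -> { valid: false,
//      message: 'Field "amount" value -5 is below minimum 0',
//      onFail: "set_default" }
```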
package/json_schemas/consumer-schema.json
CHANGED

@@ -129,31 +129,70 @@
       ]
     },
     "validate": {
-      "type": "
-      "description": "Rules to check field value compliance and data quality",
-      "
-      "
-
-      "
-
-
-
-
-
-
-
-
+      "type": "array",
+      "description": "Rules to check field value compliance and data quality. Each validation has its own rule and action to take on failure.",
+      "items": {
+        "type": "object",
+        "properties": {
+          "rule": {
+            "type": "object",
+            "description": "The validation rule to check",
+            "oneOf": [
+              {
+                "properties": { "min": { "type": "number", "description": "Minimum value for numeric fields" } },
+                "required": ["min"],
+                "additionalProperties": false
+              },
+              {
+                "properties": { "max": { "type": "number", "description": "Maximum value for numeric fields" } },
+                "required": ["max"],
+                "additionalProperties": false
+              },
+              {
+                "properties": { "regex": { "type": "string", "description": "Regular expression pattern to validate string fields" } },
+                "required": ["regex"],
+                "additionalProperties": false
+              },
+              {
+                "properties": { "required": { "type": "boolean", "const": true, "description": "Whether the field value must be present" } },
+                "required": ["required"],
+                "additionalProperties": false
+              },
+              {
+                "properties": { "min_length": { "type": "number", "description": "Minimum string length" } },
+                "required": ["min_length"],
+                "additionalProperties": false
+              },
+              {
+                "properties": { "max_length": { "type": "number", "description": "Maximum string length" } },
+                "required": ["max_length"],
+                "additionalProperties": false
+              },
+              {
+                "properties": { "in": { "type": "array", "items": { "oneOf": [{ "type": "string" }, { "type": "number" }, { "type": "boolean" }] }, "description": "Allowed values" } },
+                "required": ["in"],
+                "additionalProperties": false
+              },
+              {
+                "properties": { "not_in": { "type": "array", "items": { "oneOf": [{ "type": "string" }, { "type": "number" }, { "type": "boolean" }] }, "description": "Disallowed values" } },
+                "required": ["not_in"],
+                "additionalProperties": false
+              }
+            ]
+          },
+          "onFail": {
+            "type": "string",
+            "description": "Action to take when validation fails",
+            "enum": ["fail", "skip", "warn", "set_default"]
+          }
         },
-      "required":
-
-
-      }
-    },
-    "additionalProperties": false
+        "required": ["rule", "onFail"],
+        "additionalProperties": false
+      }
     },
     "onError": {
       "type": "string",
-      "description": "Action to take if an error occurs during transformations
+      "description": "Action to take if an error occurs during transformations",
       "enum": ["set_default", "skip", "fail"]
     },
     "default": {

@@ -463,6 +502,53 @@
     "_version": {
       "type": "number",
       "description": "Version number of the consumer configuration"
+    },
+    "validate": {
+      "type": "array",
+      "description": "Dataset-level validations applied to the final result set before export",
+      "items": {
+        "type": "object",
+        "properties": {
+          "rule": {
+            "type": "object",
+            "description": "The dataset validation rule to check",
+            "oneOf": [
+              {
+                "properties": { "unique_fields": { "type": "array", "items": { "type": "string" }, "minItems": 1, "description": "Field(s) that must have unique values across the dataset" } },
+                "required": ["unique_fields"],
+                "additionalProperties": false
+              },
+              {
+                "properties": { "min_rows": { "type": "number", "description": "Minimum number of rows expected in the dataset" } },
+                "required": ["min_rows"],
+                "additionalProperties": false
+              },
+              {
+                "properties": { "max_rows": { "type": "number", "description": "Maximum number of rows allowed in the dataset" } },
+                "required": ["max_rows"],
+                "additionalProperties": false
+              },
+              {
+                "properties": { "no_duplicates": { "type": "boolean", "const": true, "description": "No fully duplicate rows allowed" } },
+                "required": ["no_duplicates"],
+                "additionalProperties": false
+              },
+              {
+                "properties": { "not_empty": { "type": "boolean", "const": true, "description": "Dataset must contain at least one row" } },
+                "required": ["not_empty"],
+                "additionalProperties": false
+              }
+            ]
+          },
+          "onFail": {
+            "type": "string",
+            "description": "Action to take when dataset validation fails",
+            "enum": ["fail", "warn"]
+          }
+        },
+        "required": ["rule", "onFail"],
+        "additionalProperties": false
+      }
     }
   },
   "required": [
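The `oneOf` item declarations above are the AJV strict-mode fix mentioned in the Unreleased changelog: per that entry, the union `type` spelling was replaced with `oneOf` for the `in`/`not_in` arrays. A side-by-side sketch:

```js
// Union-type spelling the schema moved away from:
const before = { items: { type: ["string", "number", "boolean"] } };

// Equivalent oneOf spelling now used, which strict AJV configurations accept:
const after = {
  items: { oneOf: [{ type: "string" }, { type: "number" }, { type: "boolean" }] }
};
```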
package/workers/ExecutorWorker.js
CHANGED

@@ -13351,6 +13351,10 @@ var Logger = class {
       console.info(message);
       FileLogService_default.write("INFO", String(message));
     };
+    this.warn = (message) => {
+      console.warn(message);
+      FileLogService_default.write("WARN", String(message));
+    };
     this.flush = () => FileLogService_default.flush();
     this.close = () => FileLogService_default.close();
     this.error = (error) => {

@@ -13494,7 +13498,7 @@ var import_promises = __toESM(require("fs/promises"), 1);

 // ../../packages/constants/src/Constants.ts
 var CONSTANTS = {
-  cliVersion: "1.1
+  cliVersion: "1.2.1",
   backendVersion: 1,
   backendPort: 5088,
   workerVersion: 2,

@@ -13785,6 +13789,10 @@ var ValidatorClass = class {
     try {
       const recursionErrors = this.detectConsumerRecursion(consumer);
       errors.push(...recursionErrors);
+      for (const [i, field] of consumer.fields.entries()) {
+        if (!field.key)
+          errors.push(`Field at index ${i} in consumer "${consumer.name}" is missing the required "key" property`);
+      }
       const allFieldsWithNoFrom = consumer.fields.filter((x) => x.key === "*" && !x.from);
       if (allFieldsWithNoFrom.length > 0 && consumer.producers.length > 1)
         errors.push(`Field with key "*" was used without specifying the "from" producer and multiple producers were found.`);

@@ -17808,6 +17816,112 @@ var TransformationEngineClass = class {
 var TransformationEngine = new TransformationEngineClass();
 var TransformationEngine_default = TransformationEngine;

+// ../../packages/engines/src/transform/DataValidationEngine.ts
+var DataValidationEngineClass = class {
+  constructor() {
+    this.applyValidations = (value, validations, fieldKey) => {
+      for (const validation of validations) {
+        const passed = this.evaluateRule(value, validation.rule);
+        if (!passed) {
+          return {
+            valid: false,
+            message: this.buildMessage(value, validation.rule, fieldKey),
+            onFail: validation.onFail
+          };
+        }
+      }
+      return { valid: true };
+    };
+    this.evaluateRule = (value, rule) => {
+      if ("required" in rule)
+        return Algo_default.hasVal(value);
+      if ("min" in rule) {
+        if (!Algo_default.hasVal(value)) return true;
+        const num = Number(value);
+        return !isNaN(num) && num >= rule.min;
+      }
+      if ("max" in rule) {
+        if (!Algo_default.hasVal(value)) return true;
+        const num = Number(value);
+        return !isNaN(num) && num <= rule.max;
+      }
+      if ("regex" in rule) {
+        if (!Algo_default.hasVal(value)) return true;
+        return new RegExp(rule.regex).test(String(value));
+      }
+      if ("min_length" in rule) {
+        if (!Algo_default.hasVal(value)) return true;
+        return String(value).length >= rule.min_length;
+      }
+      if ("max_length" in rule) {
+        if (!Algo_default.hasVal(value)) return true;
+        return String(value).length <= rule.max_length;
+      }
+      if ("in" in rule) {
+        return rule.in.includes(value);
+      }
+      if ("not_in" in rule) {
+        return !rule.not_in.includes(value);
+      }
+      return true;
+    };
+    this.buildMessage = (value, rule, fieldKey) => {
+      const preview = Algo_default.hasVal(value) ? JSON.stringify(value) : "null/undefined";
+      if ("required" in rule) return `Field "${fieldKey}" is required but got ${preview}`;
+      if ("min" in rule) return `Field "${fieldKey}" value ${preview} is below minimum ${rule.min}`;
+      if ("max" in rule) return `Field "${fieldKey}" value ${preview} exceeds maximum ${rule.max}`;
+      if ("regex" in rule) return `Field "${fieldKey}" value ${preview} does not match pattern "${rule.regex}"`;
+      if ("min_length" in rule) return `Field "${fieldKey}" value ${preview} is shorter than minimum length ${rule.min_length}`;
+      if ("max_length" in rule) return `Field "${fieldKey}" value ${preview} exceeds maximum length ${rule.max_length}`;
+      if ("in" in rule) return `Field "${fieldKey}" value ${preview} is not in the allowed values`;
+      if ("not_in" in rule) return `Field "${fieldKey}" value ${preview} is in the disallowed values`;
+      return `Field "${fieldKey}" failed validation`;
+    };
+    this.evaluateDatasetValidations = (validations, context) => {
+      const results = [];
+      for (const validation of validations) {
+        const result = this.evaluateDatasetRule(validation, context);
+        if (result) results.push(result);
+      }
+      return results;
+    };
+    this.extractUniqueFieldKeys = (validations) => {
+      return validations.filter((v) => "unique_fields" in v.rule).flatMap((v) => v.rule.unique_fields);
+    };
+    this.hasRule = (validations, ruleKey) => {
+      return validations.some((v) => ruleKey in v.rule);
+    };
+    this.evaluateDatasetRule = (validation, context) => {
+      const { rule, onFail } = validation;
+      const { rowCount, hasDuplicateRows, duplicateFields } = context;
+      if ("not_empty" in rule) {
+        if (rowCount === 0)
+          return { message: "Dataset is empty", onFail };
+      }
+      if ("min_rows" in rule) {
+        if (rowCount < rule.min_rows)
+          return { message: `Dataset has ${rowCount} rows, expected at least ${rule.min_rows}`, onFail };
+      }
+      if ("max_rows" in rule) {
+        if (rowCount > rule.max_rows)
+          return { message: `Dataset has ${rowCount} rows, expected at most ${rule.max_rows}`, onFail };
+      }
+      if ("no_duplicates" in rule) {
+        if (hasDuplicateRows)
+          return { message: "Dataset contains duplicate rows", onFail };
+      }
+      if ("unique_fields" in rule) {
+        const failedFields = rule.unique_fields.filter((f) => duplicateFields.includes(f));
+        if (failedFields.length > 0)
+          return { message: `Duplicate values found in field(s): ${failedFields.join(", ")}`, onFail };
+      }
+      return null;
+    };
+  }
+};
+var DataValidationEngine = new DataValidationEngineClass();
+var DataValidationEngine_default = DataValidationEngine;
+
 // ../../packages/engines/src/usage/DataframeManager.ts
 var DataframeManagerClass = class {
   fill(points, from, to, onlyLastValue, maintainLastValue) {

@@ -18510,8 +18624,9 @@ var ConsumerExecutorClass = class {
       for (const field of fields) {
         const { cField } = field;
         const fieldKey = cField.alias ?? cField.key;
+        let dimension;
         try {
-
+          dimension = dimensions.find((x) => x.name === cField.key);
           if (!dimension) {
             if (cField.fixed && Algo_default.hasVal(cField.default))
               record[fieldKey] = cField.default;

@@ -18520,12 +18635,18 @@ var ConsumerExecutorClass = class {
             else
               throw new Error(`The requested field "${cField.key}" from the consumer is not present in the underlying producer "${producer.name}" (${dimensions.map((x) => x.name).join(", ")})`);
           }
-
+        } catch (error) {
+          const err = new Error(`Resolving dimension for field "${fieldKey}" of producer "${producer.name}" failed (index: ${recordIndex}): ${error.message}`, { cause: error });
+          Logger_default.error(err);
+          throw err;
+        }
+        try {
+          if (cField.alias && dimension && cField.alias !== dimension.name) {
             record[cField.alias] = record[dimension.name];
             delete record[dimension.name];
           }
         } catch (error) {
-          const err = new Error(`
+          const err = new Error(`Aliasing field "${cField.key}" to "${cField.alias}" of producer "${producer.name}" failed (index: ${recordIndex}): ${error.message}`, { cause: error });
           Logger_default.error(err);
           throw err;
         }

@@ -18559,6 +18680,32 @@ var ConsumerExecutorClass = class {
           }
         }
       }
+      for (const field of fields) {
+        const { cField } = field;
+        const fieldKey = cField.alias ?? cField.key;
+        if (cField.validate && cField.validate.length > 0) {
+          const result = DataValidationEngine_default.applyValidations(record[fieldKey], cField.validate, fieldKey);
+          if (!result.valid) {
+            const errorMessage = `Validation failed for field "${fieldKey}" (index: ${recordIndex}): ${result.message}`;
+            switch (result.onFail) {
+              case "set_default":
+                record[fieldKey] = cField.default;
+                break;
+              case "skip":
+                return null;
+              case "warn":
+                Logger_default.warn(errorMessage);
+                break;
+              case "fail":
+              default: {
+                const err = new Error(errorMessage);
+                Logger_default.error(err);
+                throw err;
+              }
+            }
+          }
+        }
+      }
       try {
         for (const dimension of dimensions) {
           const field = fields.find((x) => x.cField.key === dimension.name);

@@ -18799,6 +18946,48 @@ var ConsumerExecutorClass = class {
       return false;
     }
   };
+  this.processDatasetValidation = async (consumer, datasetPath) => {
+    const validations = consumer.validate;
+    if (!validations || validations.length === 0) return [];
+    const internalRecordFormat = OutputExecutor_default._getInternalRecordFormat(consumer);
+    const internalFields = ConsumerManager_default.getExpandedFields(consumer);
+    let rowCount = 0;
+    const seenRows = /* @__PURE__ */ new Set();
+    const fieldValueSets = /* @__PURE__ */ new Map();
+    let hasDuplicateRows = false;
+    const duplicateFields = [];
+    const uniqueFieldKeys = DataValidationEngine_default.extractUniqueFieldKeys(validations);
+    const checkDuplicateRows = DataValidationEngine_default.hasRule(validations, "no_duplicates");
+    for (const fieldKey of uniqueFieldKeys) {
+      fieldValueSets.set(fieldKey, /* @__PURE__ */ new Set());
+    }
+    const reader = import_fs9.default.createReadStream(datasetPath);
+    const lineReader = import_readline6.default.createInterface({ input: reader, crlfDelay: Infinity });
+    for await (const line of lineReader) {
+      rowCount++;
+      if (checkDuplicateRows) {
+        if (seenRows.has(line))
+          hasDuplicateRows = true;
+        else
+          seenRows.add(line);
+      }
+      if (uniqueFieldKeys.length > 0) {
+        const record = internalRecordFormat === "CSV" || internalRecordFormat === "TXT" ? LineParser_default._internalParseCSV(line, internalFields) : LineParser_default._internalParseJSON(line);
+        for (const fieldKey of uniqueFieldKeys) {
+          const valueSet = fieldValueSets.get(fieldKey);
+          const val = String(record[fieldKey] ?? "");
+          if (valueSet.has(val)) {
+            if (!duplicateFields.includes(fieldKey))
+              duplicateFields.push(fieldKey);
+          } else {
+            valueSet.add(val);
+          }
+        }
+      }
+    }
+    lineReader.close();
+    return DataValidationEngine_default.evaluateDatasetValidations(validations, { rowCount, hasDuplicateRows, duplicateFields });
+  };
   /**
    * Compares two values, handling numbers, strings, and dates
    * Returns: negative if a < b, positive if a > b, 0 if equal

@@ -19216,7 +19405,7 @@ var ExecutorOrchestratorClass = class {
     const tracker = new ExecutorPerformance_default();
     const _progress = new ExecutorProgress_default(logProgress);
     const { usageId } = UsageManager_default.startUsage(consumer, details);
-    const scope = { id: usageId, folder: `${consumer.name}_${usageId}`, workersId: [], limitFileSize: consumer.
+    const scope = { id: usageId, folder: `${consumer.name}_${usageId}`, workersId: [], limitFileSize: consumer.maximumFileSize };
     const pool = this.createPool();
     try {
       const start = performance.now();

@@ -19311,6 +19500,22 @@ var ExecutorOrchestratorClass = class {
       postOperation.totalOutputCount = unifiedOutputCount;
       Logger_default.log(`[${usageId}] Pivot complete: ${unifiedOutputCount} rows in ${Math.round(performance.now() - counter)}ms`);
     }
+    if (consumer.validate && consumer.validate.length > 0) {
+      Logger_default.log(`[${usageId}] Running dataset-level validations`);
+      counter = performance.now();
+      const validationResults = await ConsumerExecutor_default.processDatasetValidation(consumer, ExecutorScope_default2.getMainPath(scope));
+      tracker.measure("dataset-validation", performance.now() - counter);
+      for (const result of validationResults) {
+        if (result.onFail === "fail") {
+          const err = new Error(`Dataset validation failed for consumer "${consumer.name}": ${result.message}`);
+          Logger_default.error(err);
+          throw err;
+        } else if (result.onFail === "warn") {
+          Logger_default.warn(`Dataset validation warning for consumer "${consumer.name}": ${result.message}`);
+        }
+      }
+      Logger_default.log(`[${usageId}] Dataset validations complete in ${Math.round(performance.now() - counter)}ms`);
+    }
     counter = performance.now();
     Logger_default.log(`[${usageId}] Exporting results to ${consumer.outputs.length} output(s)`);
     const exportRes = await OutputExecutor_default.exportResult(consumer, ConsumerManager_default.getExpandedFields(consumer), scope);