@forzalabs/remora 1.1.15 → 1.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +107 -0
- package/index.js +196 -2
- package/json_schemas/consumer-schema.json +107 -21
- package/package.json +1 -1
- package/workers/ExecutorWorker.js +196 -2
package/CHANGELOG.md
ADDED
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to this project will be documented in this file.
|
|
4
|
+
|
|
5
|
+
The format is based on Keep a Changelog, and this project adheres to Semantic Versioning.
|
|
6
|
+
|
|
7
|
+
## Unreleased
|
|
8
|
+
|
|
9
|
+
### Added
|
|
10
|
+
- Added field-level consumer validations with support for multiple rules per field and per-rule failure actions: `fail`, `skip`, `warn`, and `set_default`
|
|
11
|
+
- Added dataset-level consumer validations for `unique_fields`, `min_rows`, `max_rows`, `no_duplicates`, and `not_empty`
|
|
12
|
+
- Added `DataValidationEngine` to centralize field and dataset validation logic
|
|
13
|
+
- Added validation result type definitions to the definitions package for shared use across engines and executors
|
|
14
|
+
- Added `warn()` logging support for non-fatal validation outcomes
|
|
15
|
+
- Added canary consumer coverage for field-level and dataset-level validations with passing, warning, skipped, defaulted, and failing scenarios
|
|
16
|
+
- Added `verify:local` to the canary package to build the local CLI and run the canary suite against it instead of the published package
|
|
17
|
+
|
|
18
|
+
### Changed
|
|
19
|
+
- Updated consumer field validation configuration from a single flat validation object to an ordered array of validation rules with explicit `onFail` behavior
|
|
20
|
+
|
|
21
|
+
### Fixed
|
|
22
|
+
- Fixed the consumer JSON schema to support the new field-level and dataset-level validation configuration
|
|
23
|
+
- Fixed AJV strict-mode compatibility for validation `in` and `not_in` rule arrays by replacing union `type` declarations with `oneOf`
|
|
24
|
+
|
|
25
|
+
## V 1.1.15 - 2026-03-26
|
|
26
|
+
|
|
27
|
+
### Added
|
|
28
|
+
- Added validation that consumer field keys exist in the referenced producer's dimensions/measures
|
|
29
|
+
- Added validation that every consumer field defines the required `key` property
|
|
30
|
+
- Added validation that `copyFrom` references a field that appears earlier in the consumer's field list
|
|
31
|
+
- Added validation that `distinctOn` keys and `orderBy` reference fields present in the consumer
|
|
32
|
+
- Added validation that join SQL `${P.field}` and `${producer.field}` references point to valid fields
|
|
33
|
+
|
|
34
|
+
### Fixed
|
|
35
|
+
- Fixed environment variable not exposed to front-end
|
|
36
|
+
- Fixed database endpoint selector
|
|
37
|
+
- Fixed worker image volume usage in a cloud environment
|
|
38
|
+
- Fixed worker-thread execution errors being logged only to the terminal by propagating them back to the orchestrator file logger
|
|
39
|
+
- Fixed `ConsumerExecutor.processRecord` error reporting to log step-specific failures for field resolution, aliasing, transformations, and filter evaluation
|
|
40
|
+
|
|
41
|
+
## V 1.1.11 - 2026-02-05
|
|
42
|
+
|
|
43
|
+
### Added
|
|
44
|
+
- Added `startRow` and `startColumn` settings for Excel producers (.xls/.xlsx), allowing users to specify the 1-indexed row and column from which to begin reading data
|
|
45
|
+
|
|
46
|
+
### Fixed
|
|
47
|
+
- Fixed `MaxListenersExceededWarning` on `WriteStream` during consumer execution by replacing shared stream merge with per-file append pipelines
|
|
48
|
+
- Fixed CLI `run` command always exiting with code 1 even on successful runs
|
|
49
|
+
- Fixed incomplete file logging caused by `process.exit()` terminating before winston could flush buffered writes; added `logger.flush()` to worker threads, orchestrator, and CLI exit paths
|
|
50
|
+
- Added `logger.flush()` before `process.exit()` in all data-processing CLI actions (sample, mock, automap, discover, debug) and worker startup
|
|
51
|
+
- Fixed CLI `discover` command exiting with code 1 on success instead of code 0
|
|
52
|
+
- Fixed per-worker `WriteStream` in `Executor.ts` never being closed, risking data loss before distinct/distinctOn post-processing passes
|
|
53
|
+
- Fixed `Dataset.ts` stream await pattern where `resolve` was never called in the `finish` handler (5 sites: transformStream, sort batches, k-way merge, append), causing promises to hang indefinitely
|
|
54
|
+
- Fixed `ExecutorWriter.ts` not awaiting intermediate stream flush during file-size-based rotation
|
|
55
|
+
- Fixed `DriverHelper.appendObjectsToUnifiedFile` and `LocalDestinationDriver.transformAndMove` not awaiting stream flush before returning
|
|
56
|
+
|
|
57
|
+
## V 1.1.9 - 2026-02-04
|
|
58
|
+
|
|
59
|
+
### Added
|
|
60
|
+
- Added `switch_case` transformation for mapping specific values to other values (similar to a switch/case statement)
|
|
61
|
+
- Added validation to detect multiple consumer fields reading from the same producer dimension (suggests using `copyFrom` instead)
|
|
62
|
+
- Added detailed logging to the executor orchestrator with usage ID tracing throughout the execution lifecycle
|
|
63
|
+
|
|
64
|
+
### Changed
|
|
65
|
+
- Cleaned up CLI execution error output to show concise messages in console while preserving full stack traces in internal logs
|
|
66
|
+
|
|
67
|
+
## V 1.1.8 - 2026-02-03
|
|
68
|
+
|
|
69
|
+
### Added
|
|
70
|
+
- Added `pivot` option to consumers, enabling row-to-column transformation with aggregation (sum, count, avg, min, max)
|
|
71
|
+
- Added `copyFrom` property to consumer fields, allowing a field to be a value copy of another field in the dataset
|
|
72
|
+
|
|
73
|
+
## V 1.1.7 - 2026-02-02
|
|
74
|
+
|
|
75
|
+
### Changed
|
|
76
|
+
- Improved the mock engine
|
|
77
|
+
- Improved logging
|
|
78
|
+
|
|
79
|
+
## V 1.1.6 - 2026-02-02
|
|
80
|
+
|
|
81
|
+
### Added
|
|
82
|
+
- Added `--limit` option to `remora run` command to process only the first N records
|
|
83
|
+
- Added descriptive error messages for failed field transformations with full stack trace preservation
|
|
84
|
+
- Added file logging with rotation (enabled via `REMORA_DEBUG_MODE=true` in production)
|
|
85
|
+
- Added structured logging across key application areas
|
|
86
|
+
|
|
87
|
+
### Changed
|
|
88
|
+
- Moved `DEBUG_MODE` from project.json settings to `REMORA_DEBUG_MODE` environment variable
|
|
89
|
+
|
|
90
|
+
## V 1.1.5 - 2026-02-01
|
|
91
|
+
|
|
92
|
+
### Added
|
|
93
|
+
- Refactored for monorepository
|
|
94
|
+
- Added output maximum file size definable from consumer
|
|
95
|
+
- Added support for nested subfolders inside remora configuration directories (sources, producers, consumers, schemas)
|
|
96
|
+
|
|
97
|
+
### Fixed
|
|
98
|
+
- Bug in parsing via GZ file
|
|
99
|
+
- Issues with concurrent requests
|
|
100
|
+
|
|
101
|
+
### Changed
|
|
102
|
+
- Dockerfile for apps in a monorepo build
|
|
103
|
+
- Package.json to workspaces compliance
|
|
104
|
+
- Refactored internal module structure
|
|
105
|
+
- Removed the _file annotations for environment variables
|
|
106
|
+
|
|
107
|
+
## V 1.0.18
|
package/index.js
CHANGED
|
@@ -13357,6 +13357,10 @@ var Logger = class {
|
|
|
13357
13357
|
console.info(message);
|
|
13358
13358
|
FileLogService_default.write("INFO", String(message));
|
|
13359
13359
|
};
|
|
13360
|
+
this.warn = (message) => {
|
|
13361
|
+
console.warn(message);
|
|
13362
|
+
FileLogService_default.write("WARN", String(message));
|
|
13363
|
+
};
|
|
13360
13364
|
this.flush = () => FileLogService_default.flush();
|
|
13361
13365
|
this.close = () => FileLogService_default.close();
|
|
13362
13366
|
this.error = (error) => {
|
|
@@ -13500,7 +13504,7 @@ var import_promises = __toESM(require("fs/promises"), 1);
|
|
|
13500
13504
|
|
|
13501
13505
|
// ../../packages/constants/src/Constants.ts
|
|
13502
13506
|
var CONSTANTS = {
|
|
13503
|
-
cliVersion: "1.1
|
|
13507
|
+
cliVersion: "1.2.1",
|
|
13504
13508
|
backendVersion: 1,
|
|
13505
13509
|
backendPort: 5088,
|
|
13506
13510
|
workerVersion: 2,
|
|
@@ -18482,6 +18486,112 @@ var TransformationEngineClass = class {
|
|
|
18482
18486
|
var TransformationEngine = new TransformationEngineClass();
|
|
18483
18487
|
var TransformationEngine_default = TransformationEngine;
|
|
18484
18488
|
|
|
18489
|
+
// ../../packages/engines/src/transform/DataValidationEngine.ts
|
|
18490
|
+
var DataValidationEngineClass = class {
|
|
18491
|
+
constructor() {
|
|
18492
|
+
this.applyValidations = (value, validations, fieldKey) => {
|
|
18493
|
+
for (const validation of validations) {
|
|
18494
|
+
const passed = this.evaluateRule(value, validation.rule);
|
|
18495
|
+
if (!passed) {
|
|
18496
|
+
return {
|
|
18497
|
+
valid: false,
|
|
18498
|
+
message: this.buildMessage(value, validation.rule, fieldKey),
|
|
18499
|
+
onFail: validation.onFail
|
|
18500
|
+
};
|
|
18501
|
+
}
|
|
18502
|
+
}
|
|
18503
|
+
return { valid: true };
|
|
18504
|
+
};
|
|
18505
|
+
this.evaluateRule = (value, rule) => {
|
|
18506
|
+
if ("required" in rule)
|
|
18507
|
+
return Algo_default.hasVal(value);
|
|
18508
|
+
if ("min" in rule) {
|
|
18509
|
+
if (!Algo_default.hasVal(value)) return true;
|
|
18510
|
+
const num = Number(value);
|
|
18511
|
+
return !isNaN(num) && num >= rule.min;
|
|
18512
|
+
}
|
|
18513
|
+
if ("max" in rule) {
|
|
18514
|
+
if (!Algo_default.hasVal(value)) return true;
|
|
18515
|
+
const num = Number(value);
|
|
18516
|
+
return !isNaN(num) && num <= rule.max;
|
|
18517
|
+
}
|
|
18518
|
+
if ("regex" in rule) {
|
|
18519
|
+
if (!Algo_default.hasVal(value)) return true;
|
|
18520
|
+
return new RegExp(rule.regex).test(String(value));
|
|
18521
|
+
}
|
|
18522
|
+
if ("min_length" in rule) {
|
|
18523
|
+
if (!Algo_default.hasVal(value)) return true;
|
|
18524
|
+
return String(value).length >= rule.min_length;
|
|
18525
|
+
}
|
|
18526
|
+
if ("max_length" in rule) {
|
|
18527
|
+
if (!Algo_default.hasVal(value)) return true;
|
|
18528
|
+
return String(value).length <= rule.max_length;
|
|
18529
|
+
}
|
|
18530
|
+
if ("in" in rule) {
|
|
18531
|
+
return rule.in.includes(value);
|
|
18532
|
+
}
|
|
18533
|
+
if ("not_in" in rule) {
|
|
18534
|
+
return !rule.not_in.includes(value);
|
|
18535
|
+
}
|
|
18536
|
+
return true;
|
|
18537
|
+
};
|
|
18538
|
+
this.buildMessage = (value, rule, fieldKey) => {
|
|
18539
|
+
const preview = Algo_default.hasVal(value) ? JSON.stringify(value) : "null/undefined";
|
|
18540
|
+
if ("required" in rule) return `Field "${fieldKey}" is required but got ${preview}`;
|
|
18541
|
+
if ("min" in rule) return `Field "${fieldKey}" value ${preview} is below minimum ${rule.min}`;
|
|
18542
|
+
if ("max" in rule) return `Field "${fieldKey}" value ${preview} exceeds maximum ${rule.max}`;
|
|
18543
|
+
if ("regex" in rule) return `Field "${fieldKey}" value ${preview} does not match pattern "${rule.regex}"`;
|
|
18544
|
+
if ("min_length" in rule) return `Field "${fieldKey}" value ${preview} is shorter than minimum length ${rule.min_length}`;
|
|
18545
|
+
if ("max_length" in rule) return `Field "${fieldKey}" value ${preview} exceeds maximum length ${rule.max_length}`;
|
|
18546
|
+
if ("in" in rule) return `Field "${fieldKey}" value ${preview} is not in the allowed values`;
|
|
18547
|
+
if ("not_in" in rule) return `Field "${fieldKey}" value ${preview} is in the disallowed values`;
|
|
18548
|
+
return `Field "${fieldKey}" failed validation`;
|
|
18549
|
+
};
|
|
18550
|
+
this.evaluateDatasetValidations = (validations, context) => {
|
|
18551
|
+
const results = [];
|
|
18552
|
+
for (const validation of validations) {
|
|
18553
|
+
const result = this.evaluateDatasetRule(validation, context);
|
|
18554
|
+
if (result) results.push(result);
|
|
18555
|
+
}
|
|
18556
|
+
return results;
|
|
18557
|
+
};
|
|
18558
|
+
this.extractUniqueFieldKeys = (validations) => {
|
|
18559
|
+
return validations.filter((v) => "unique_fields" in v.rule).flatMap((v) => v.rule.unique_fields);
|
|
18560
|
+
};
|
|
18561
|
+
this.hasRule = (validations, ruleKey) => {
|
|
18562
|
+
return validations.some((v) => ruleKey in v.rule);
|
|
18563
|
+
};
|
|
18564
|
+
this.evaluateDatasetRule = (validation, context) => {
|
|
18565
|
+
const { rule, onFail } = validation;
|
|
18566
|
+
const { rowCount, hasDuplicateRows, duplicateFields } = context;
|
|
18567
|
+
if ("not_empty" in rule) {
|
|
18568
|
+
if (rowCount === 0)
|
|
18569
|
+
return { message: "Dataset is empty", onFail };
|
|
18570
|
+
}
|
|
18571
|
+
if ("min_rows" in rule) {
|
|
18572
|
+
if (rowCount < rule.min_rows)
|
|
18573
|
+
return { message: `Dataset has ${rowCount} rows, expected at least ${rule.min_rows}`, onFail };
|
|
18574
|
+
}
|
|
18575
|
+
if ("max_rows" in rule) {
|
|
18576
|
+
if (rowCount > rule.max_rows)
|
|
18577
|
+
return { message: `Dataset has ${rowCount} rows, expected at most ${rule.max_rows}`, onFail };
|
|
18578
|
+
}
|
|
18579
|
+
if ("no_duplicates" in rule) {
|
|
18580
|
+
if (hasDuplicateRows)
|
|
18581
|
+
return { message: "Dataset contains duplicate rows", onFail };
|
|
18582
|
+
}
|
|
18583
|
+
if ("unique_fields" in rule) {
|
|
18584
|
+
const failedFields = rule.unique_fields.filter((f) => duplicateFields.includes(f));
|
|
18585
|
+
if (failedFields.length > 0)
|
|
18586
|
+
return { message: `Duplicate values found in field(s): ${failedFields.join(", ")}`, onFail };
|
|
18587
|
+
}
|
|
18588
|
+
return null;
|
|
18589
|
+
};
|
|
18590
|
+
}
|
|
18591
|
+
};
|
|
18592
|
+
var DataValidationEngine = new DataValidationEngineClass();
|
|
18593
|
+
var DataValidationEngine_default = DataValidationEngine;
|
|
18594
|
+
|
|
18485
18595
|
// ../../packages/engines/src/usage/DataframeManager.ts
|
|
18486
18596
|
var DataframeManagerClass = class {
|
|
18487
18597
|
fill(points, from, to, onlyLastValue, maintainLastValue) {
|
|
@@ -18971,6 +19081,32 @@ var ConsumerExecutorClass = class {
|
|
|
18971
19081
|
}
|
|
18972
19082
|
}
|
|
18973
19083
|
}
|
|
19084
|
+
for (const field of fields) {
|
|
19085
|
+
const { cField } = field;
|
|
19086
|
+
const fieldKey = cField.alias ?? cField.key;
|
|
19087
|
+
if (cField.validate && cField.validate.length > 0) {
|
|
19088
|
+
const result = DataValidationEngine_default.applyValidations(record[fieldKey], cField.validate, fieldKey);
|
|
19089
|
+
if (!result.valid) {
|
|
19090
|
+
const errorMessage = `Validation failed for field "${fieldKey}" (index: ${recordIndex}): ${result.message}`;
|
|
19091
|
+
switch (result.onFail) {
|
|
19092
|
+
case "set_default":
|
|
19093
|
+
record[fieldKey] = cField.default;
|
|
19094
|
+
break;
|
|
19095
|
+
case "skip":
|
|
19096
|
+
return null;
|
|
19097
|
+
case "warn":
|
|
19098
|
+
Logger_default.warn(errorMessage);
|
|
19099
|
+
break;
|
|
19100
|
+
case "fail":
|
|
19101
|
+
default: {
|
|
19102
|
+
const err = new Error(errorMessage);
|
|
19103
|
+
Logger_default.error(err);
|
|
19104
|
+
throw err;
|
|
19105
|
+
}
|
|
19106
|
+
}
|
|
19107
|
+
}
|
|
19108
|
+
}
|
|
19109
|
+
}
|
|
18974
19110
|
try {
|
|
18975
19111
|
for (const dimension of dimensions) {
|
|
18976
19112
|
const field = fields.find((x) => x.cField.key === dimension.name);
|
|
@@ -19211,6 +19347,48 @@ var ConsumerExecutorClass = class {
|
|
|
19211
19347
|
return false;
|
|
19212
19348
|
}
|
|
19213
19349
|
};
|
|
19350
|
+
this.processDatasetValidation = async (consumer, datasetPath) => {
|
|
19351
|
+
const validations = consumer.validate;
|
|
19352
|
+
if (!validations || validations.length === 0) return [];
|
|
19353
|
+
const internalRecordFormat = OutputExecutor_default._getInternalRecordFormat(consumer);
|
|
19354
|
+
const internalFields = ConsumerManager_default.getExpandedFields(consumer);
|
|
19355
|
+
let rowCount = 0;
|
|
19356
|
+
const seenRows = /* @__PURE__ */ new Set();
|
|
19357
|
+
const fieldValueSets = /* @__PURE__ */ new Map();
|
|
19358
|
+
let hasDuplicateRows = false;
|
|
19359
|
+
const duplicateFields = [];
|
|
19360
|
+
const uniqueFieldKeys = DataValidationEngine_default.extractUniqueFieldKeys(validations);
|
|
19361
|
+
const checkDuplicateRows = DataValidationEngine_default.hasRule(validations, "no_duplicates");
|
|
19362
|
+
for (const fieldKey of uniqueFieldKeys) {
|
|
19363
|
+
fieldValueSets.set(fieldKey, /* @__PURE__ */ new Set());
|
|
19364
|
+
}
|
|
19365
|
+
const reader = import_fs11.default.createReadStream(datasetPath);
|
|
19366
|
+
const lineReader = import_readline6.default.createInterface({ input: reader, crlfDelay: Infinity });
|
|
19367
|
+
for await (const line of lineReader) {
|
|
19368
|
+
rowCount++;
|
|
19369
|
+
if (checkDuplicateRows) {
|
|
19370
|
+
if (seenRows.has(line))
|
|
19371
|
+
hasDuplicateRows = true;
|
|
19372
|
+
else
|
|
19373
|
+
seenRows.add(line);
|
|
19374
|
+
}
|
|
19375
|
+
if (uniqueFieldKeys.length > 0) {
|
|
19376
|
+
const record = internalRecordFormat === "CSV" || internalRecordFormat === "TXT" ? LineParser_default._internalParseCSV(line, internalFields) : LineParser_default._internalParseJSON(line);
|
|
19377
|
+
for (const fieldKey of uniqueFieldKeys) {
|
|
19378
|
+
const valueSet = fieldValueSets.get(fieldKey);
|
|
19379
|
+
const val = String(record[fieldKey] ?? "");
|
|
19380
|
+
if (valueSet.has(val)) {
|
|
19381
|
+
if (!duplicateFields.includes(fieldKey))
|
|
19382
|
+
duplicateFields.push(fieldKey);
|
|
19383
|
+
} else {
|
|
19384
|
+
valueSet.add(val);
|
|
19385
|
+
}
|
|
19386
|
+
}
|
|
19387
|
+
}
|
|
19388
|
+
}
|
|
19389
|
+
lineReader.close();
|
|
19390
|
+
return DataValidationEngine_default.evaluateDatasetValidations(validations, { rowCount, hasDuplicateRows, duplicateFields });
|
|
19391
|
+
};
|
|
19214
19392
|
/**
|
|
19215
19393
|
* Compares two values, handling numbers, strings, and dates
|
|
19216
19394
|
* Returns: negative if a < b, positive if a > b, 0 if equal
|
|
@@ -19468,7 +19646,7 @@ var ExecutorOrchestratorClass = class {
|
|
|
19468
19646
|
const tracker = new ExecutorPerformance_default();
|
|
19469
19647
|
const _progress = new ExecutorProgress_default(logProgress);
|
|
19470
19648
|
const { usageId } = UsageManager_default.startUsage(consumer, details);
|
|
19471
|
-
const scope = { id: usageId, folder: `${consumer.name}_${usageId}`, workersId: [], limitFileSize: consumer.
|
|
19649
|
+
const scope = { id: usageId, folder: `${consumer.name}_${usageId}`, workersId: [], limitFileSize: consumer.maximumFileSize };
|
|
19472
19650
|
const pool = this.createPool();
|
|
19473
19651
|
try {
|
|
19474
19652
|
const start = performance.now();
|
|
@@ -19563,6 +19741,22 @@ var ExecutorOrchestratorClass = class {
|
|
|
19563
19741
|
postOperation.totalOutputCount = unifiedOutputCount;
|
|
19564
19742
|
Logger_default.log(`[${usageId}] Pivot complete: ${unifiedOutputCount} rows in ${Math.round(performance.now() - counter)}ms`);
|
|
19565
19743
|
}
|
|
19744
|
+
if (consumer.validate && consumer.validate.length > 0) {
|
|
19745
|
+
Logger_default.log(`[${usageId}] Running dataset-level validations`);
|
|
19746
|
+
counter = performance.now();
|
|
19747
|
+
const validationResults = await ConsumerExecutor_default.processDatasetValidation(consumer, ExecutorScope_default2.getMainPath(scope));
|
|
19748
|
+
tracker.measure("dataset-validation", performance.now() - counter);
|
|
19749
|
+
for (const result of validationResults) {
|
|
19750
|
+
if (result.onFail === "fail") {
|
|
19751
|
+
const err = new Error(`Dataset validation failed for consumer "${consumer.name}": ${result.message}`);
|
|
19752
|
+
Logger_default.error(err);
|
|
19753
|
+
throw err;
|
|
19754
|
+
} else if (result.onFail === "warn") {
|
|
19755
|
+
Logger_default.warn(`Dataset validation warning for consumer "${consumer.name}": ${result.message}`);
|
|
19756
|
+
}
|
|
19757
|
+
}
|
|
19758
|
+
Logger_default.log(`[${usageId}] Dataset validations complete in ${Math.round(performance.now() - counter)}ms`);
|
|
19759
|
+
}
|
|
19566
19760
|
counter = performance.now();
|
|
19567
19761
|
Logger_default.log(`[${usageId}] Exporting results to ${consumer.outputs.length} output(s)`);
|
|
19568
19762
|
const exportRes = await OutputExecutor_default.exportResult(consumer, ConsumerManager_default.getExpandedFields(consumer), scope);
|
|
@@ -129,31 +129,70 @@
|
|
|
129
129
|
]
|
|
130
130
|
},
|
|
131
131
|
"validate": {
|
|
132
|
-
"type": "
|
|
133
|
-
"description": "Rules to check field value compliance and data quality",
|
|
134
|
-
"
|
|
135
|
-
"
|
|
136
|
-
|
|
137
|
-
"
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
132
|
+
"type": "array",
|
|
133
|
+
"description": "Rules to check field value compliance and data quality. Each validation has its own rule and action to take on failure.",
|
|
134
|
+
"items": {
|
|
135
|
+
"type": "object",
|
|
136
|
+
"properties": {
|
|
137
|
+
"rule": {
|
|
138
|
+
"type": "object",
|
|
139
|
+
"description": "The validation rule to check",
|
|
140
|
+
"oneOf": [
|
|
141
|
+
{
|
|
142
|
+
"properties": { "min": { "type": "number", "description": "Minimum value for numeric fields" } },
|
|
143
|
+
"required": ["min"],
|
|
144
|
+
"additionalProperties": false
|
|
145
|
+
},
|
|
146
|
+
{
|
|
147
|
+
"properties": { "max": { "type": "number", "description": "Maximum value for numeric fields" } },
|
|
148
|
+
"required": ["max"],
|
|
149
|
+
"additionalProperties": false
|
|
150
|
+
},
|
|
151
|
+
{
|
|
152
|
+
"properties": { "regex": { "type": "string", "description": "Regular expression pattern to validate string fields" } },
|
|
153
|
+
"required": ["regex"],
|
|
154
|
+
"additionalProperties": false
|
|
155
|
+
},
|
|
156
|
+
{
|
|
157
|
+
"properties": { "required": { "type": "boolean", "const": true, "description": "Whether the field value must be present" } },
|
|
158
|
+
"required": ["required"],
|
|
159
|
+
"additionalProperties": false
|
|
160
|
+
},
|
|
161
|
+
{
|
|
162
|
+
"properties": { "min_length": { "type": "number", "description": "Minimum string length" } },
|
|
163
|
+
"required": ["min_length"],
|
|
164
|
+
"additionalProperties": false
|
|
165
|
+
},
|
|
166
|
+
{
|
|
167
|
+
"properties": { "max_length": { "type": "number", "description": "Maximum string length" } },
|
|
168
|
+
"required": ["max_length"],
|
|
169
|
+
"additionalProperties": false
|
|
170
|
+
},
|
|
171
|
+
{
|
|
172
|
+
"properties": { "in": { "type": "array", "items": { "oneOf": [{ "type": "string" }, { "type": "number" }, { "type": "boolean" }] }, "description": "Allowed values" } },
|
|
173
|
+
"required": ["in"],
|
|
174
|
+
"additionalProperties": false
|
|
175
|
+
},
|
|
176
|
+
{
|
|
177
|
+
"properties": { "not_in": { "type": "array", "items": { "oneOf": [{ "type": "string" }, { "type": "number" }, { "type": "boolean" }] }, "description": "Disallowed values" } },
|
|
178
|
+
"required": ["not_in"],
|
|
179
|
+
"additionalProperties": false
|
|
180
|
+
}
|
|
181
|
+
]
|
|
182
|
+
},
|
|
183
|
+
"onFail": {
|
|
184
|
+
"type": "string",
|
|
185
|
+
"description": "Action to take when validation fails",
|
|
186
|
+
"enum": ["fail", "skip", "warn", "set_default"]
|
|
187
|
+
}
|
|
146
188
|
},
|
|
147
|
-
"required":
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
}
|
|
151
|
-
},
|
|
152
|
-
"additionalProperties": false
|
|
189
|
+
"required": ["rule", "onFail"],
|
|
190
|
+
"additionalProperties": false
|
|
191
|
+
}
|
|
153
192
|
},
|
|
154
193
|
"onError": {
|
|
155
194
|
"type": "string",
|
|
156
|
-
"description": "Action to take if an error occurs during transformations
|
|
195
|
+
"description": "Action to take if an error occurs during transformations",
|
|
157
196
|
"enum": ["set_default", "skip", "fail"]
|
|
158
197
|
},
|
|
159
198
|
"default": {
|
|
@@ -463,6 +502,53 @@
|
|
|
463
502
|
"_version": {
|
|
464
503
|
"type": "number",
|
|
465
504
|
"description": "Version number of the consumer configuration"
|
|
505
|
+
},
|
|
506
|
+
"validate": {
|
|
507
|
+
"type": "array",
|
|
508
|
+
"description": "Dataset-level validations applied to the final result set before export",
|
|
509
|
+
"items": {
|
|
510
|
+
"type": "object",
|
|
511
|
+
"properties": {
|
|
512
|
+
"rule": {
|
|
513
|
+
"type": "object",
|
|
514
|
+
"description": "The dataset validation rule to check",
|
|
515
|
+
"oneOf": [
|
|
516
|
+
{
|
|
517
|
+
"properties": { "unique_fields": { "type": "array", "items": { "type": "string" }, "minItems": 1, "description": "Field(s) that must have unique values across the dataset" } },
|
|
518
|
+
"required": ["unique_fields"],
|
|
519
|
+
"additionalProperties": false
|
|
520
|
+
},
|
|
521
|
+
{
|
|
522
|
+
"properties": { "min_rows": { "type": "number", "description": "Minimum number of rows expected in the dataset" } },
|
|
523
|
+
"required": ["min_rows"],
|
|
524
|
+
"additionalProperties": false
|
|
525
|
+
},
|
|
526
|
+
{
|
|
527
|
+
"properties": { "max_rows": { "type": "number", "description": "Maximum number of rows allowed in the dataset" } },
|
|
528
|
+
"required": ["max_rows"],
|
|
529
|
+
"additionalProperties": false
|
|
530
|
+
},
|
|
531
|
+
{
|
|
532
|
+
"properties": { "no_duplicates": { "type": "boolean", "const": true, "description": "No fully duplicate rows allowed" } },
|
|
533
|
+
"required": ["no_duplicates"],
|
|
534
|
+
"additionalProperties": false
|
|
535
|
+
},
|
|
536
|
+
{
|
|
537
|
+
"properties": { "not_empty": { "type": "boolean", "const": true, "description": "Dataset must contain at least one row" } },
|
|
538
|
+
"required": ["not_empty"],
|
|
539
|
+
"additionalProperties": false
|
|
540
|
+
}
|
|
541
|
+
]
|
|
542
|
+
},
|
|
543
|
+
"onFail": {
|
|
544
|
+
"type": "string",
|
|
545
|
+
"description": "Action to take when dataset validation fails",
|
|
546
|
+
"enum": ["fail", "warn"]
|
|
547
|
+
}
|
|
548
|
+
},
|
|
549
|
+
"required": ["rule", "onFail"],
|
|
550
|
+
"additionalProperties": false
|
|
551
|
+
}
|
|
466
552
|
}
|
|
467
553
|
},
|
|
468
554
|
"required": [
|
package/package.json
CHANGED
|
@@ -13351,6 +13351,10 @@ var Logger = class {
|
|
|
13351
13351
|
console.info(message);
|
|
13352
13352
|
FileLogService_default.write("INFO", String(message));
|
|
13353
13353
|
};
|
|
13354
|
+
this.warn = (message) => {
|
|
13355
|
+
console.warn(message);
|
|
13356
|
+
FileLogService_default.write("WARN", String(message));
|
|
13357
|
+
};
|
|
13354
13358
|
this.flush = () => FileLogService_default.flush();
|
|
13355
13359
|
this.close = () => FileLogService_default.close();
|
|
13356
13360
|
this.error = (error) => {
|
|
@@ -13494,7 +13498,7 @@ var import_promises = __toESM(require("fs/promises"), 1);
|
|
|
13494
13498
|
|
|
13495
13499
|
// ../../packages/constants/src/Constants.ts
|
|
13496
13500
|
var CONSTANTS = {
|
|
13497
|
-
cliVersion: "1.1
|
|
13501
|
+
cliVersion: "1.2.1",
|
|
13498
13502
|
backendVersion: 1,
|
|
13499
13503
|
backendPort: 5088,
|
|
13500
13504
|
workerVersion: 2,
|
|
@@ -17812,6 +17816,112 @@ var TransformationEngineClass = class {
|
|
|
17812
17816
|
var TransformationEngine = new TransformationEngineClass();
|
|
17813
17817
|
var TransformationEngine_default = TransformationEngine;
|
|
17814
17818
|
|
|
17819
|
+
// ../../packages/engines/src/transform/DataValidationEngine.ts
|
|
17820
|
+
var DataValidationEngineClass = class {
|
|
17821
|
+
constructor() {
|
|
17822
|
+
this.applyValidations = (value, validations, fieldKey) => {
|
|
17823
|
+
for (const validation of validations) {
|
|
17824
|
+
const passed = this.evaluateRule(value, validation.rule);
|
|
17825
|
+
if (!passed) {
|
|
17826
|
+
return {
|
|
17827
|
+
valid: false,
|
|
17828
|
+
message: this.buildMessage(value, validation.rule, fieldKey),
|
|
17829
|
+
onFail: validation.onFail
|
|
17830
|
+
};
|
|
17831
|
+
}
|
|
17832
|
+
}
|
|
17833
|
+
return { valid: true };
|
|
17834
|
+
};
|
|
17835
|
+
this.evaluateRule = (value, rule) => {
|
|
17836
|
+
if ("required" in rule)
|
|
17837
|
+
return Algo_default.hasVal(value);
|
|
17838
|
+
if ("min" in rule) {
|
|
17839
|
+
if (!Algo_default.hasVal(value)) return true;
|
|
17840
|
+
const num = Number(value);
|
|
17841
|
+
return !isNaN(num) && num >= rule.min;
|
|
17842
|
+
}
|
|
17843
|
+
if ("max" in rule) {
|
|
17844
|
+
if (!Algo_default.hasVal(value)) return true;
|
|
17845
|
+
const num = Number(value);
|
|
17846
|
+
return !isNaN(num) && num <= rule.max;
|
|
17847
|
+
}
|
|
17848
|
+
if ("regex" in rule) {
|
|
17849
|
+
if (!Algo_default.hasVal(value)) return true;
|
|
17850
|
+
return new RegExp(rule.regex).test(String(value));
|
|
17851
|
+
}
|
|
17852
|
+
if ("min_length" in rule) {
|
|
17853
|
+
if (!Algo_default.hasVal(value)) return true;
|
|
17854
|
+
return String(value).length >= rule.min_length;
|
|
17855
|
+
}
|
|
17856
|
+
if ("max_length" in rule) {
|
|
17857
|
+
if (!Algo_default.hasVal(value)) return true;
|
|
17858
|
+
return String(value).length <= rule.max_length;
|
|
17859
|
+
}
|
|
17860
|
+
if ("in" in rule) {
|
|
17861
|
+
return rule.in.includes(value);
|
|
17862
|
+
}
|
|
17863
|
+
if ("not_in" in rule) {
|
|
17864
|
+
return !rule.not_in.includes(value);
|
|
17865
|
+
}
|
|
17866
|
+
return true;
|
|
17867
|
+
};
|
|
17868
|
+
this.buildMessage = (value, rule, fieldKey) => {
|
|
17869
|
+
const preview = Algo_default.hasVal(value) ? JSON.stringify(value) : "null/undefined";
|
|
17870
|
+
if ("required" in rule) return `Field "${fieldKey}" is required but got ${preview}`;
|
|
17871
|
+
if ("min" in rule) return `Field "${fieldKey}" value ${preview} is below minimum ${rule.min}`;
|
|
17872
|
+
if ("max" in rule) return `Field "${fieldKey}" value ${preview} exceeds maximum ${rule.max}`;
|
|
17873
|
+
if ("regex" in rule) return `Field "${fieldKey}" value ${preview} does not match pattern "${rule.regex}"`;
|
|
17874
|
+
if ("min_length" in rule) return `Field "${fieldKey}" value ${preview} is shorter than minimum length ${rule.min_length}`;
|
|
17875
|
+
if ("max_length" in rule) return `Field "${fieldKey}" value ${preview} exceeds maximum length ${rule.max_length}`;
|
|
17876
|
+
if ("in" in rule) return `Field "${fieldKey}" value ${preview} is not in the allowed values`;
|
|
17877
|
+
if ("not_in" in rule) return `Field "${fieldKey}" value ${preview} is in the disallowed values`;
|
|
17878
|
+
return `Field "${fieldKey}" failed validation`;
|
|
17879
|
+
};
|
|
17880
|
+
this.evaluateDatasetValidations = (validations, context) => {
|
|
17881
|
+
const results = [];
|
|
17882
|
+
for (const validation of validations) {
|
|
17883
|
+
const result = this.evaluateDatasetRule(validation, context);
|
|
17884
|
+
if (result) results.push(result);
|
|
17885
|
+
}
|
|
17886
|
+
return results;
|
|
17887
|
+
};
|
|
17888
|
+
this.extractUniqueFieldKeys = (validations) => {
|
|
17889
|
+
return validations.filter((v) => "unique_fields" in v.rule).flatMap((v) => v.rule.unique_fields);
|
|
17890
|
+
};
|
|
17891
|
+
this.hasRule = (validations, ruleKey) => {
|
|
17892
|
+
return validations.some((v) => ruleKey in v.rule);
|
|
17893
|
+
};
|
|
17894
|
+
// Evaluates one dataset-level rule against the precomputed dataset context.
// Returns a failure descriptor { message, onFail } for the first condition
// the rule violates, or null when the dataset satisfies the rule.
this.evaluateDatasetRule = (validation, context) => {
  const { rule, onFail } = validation;
  const { rowCount, hasDuplicateRows, duplicateFields } = context;

  // Empty-dataset guard.
  if ("not_empty" in rule && rowCount === 0) {
    return { message: "Dataset is empty", onFail };
  }

  // Row-count bounds.
  if ("min_rows" in rule && rowCount < rule.min_rows) {
    return { message: `Dataset has ${rowCount} rows, expected at least ${rule.min_rows}`, onFail };
  }
  if ("max_rows" in rule && rowCount > rule.max_rows) {
    return { message: `Dataset has ${rowCount} rows, expected at most ${rule.max_rows}`, onFail };
  }

  // Whole-row duplicate detection (flag computed by the streaming pass).
  if ("no_duplicates" in rule && hasDuplicateRows) {
    return { message: "Dataset contains duplicate rows", onFail };
  }

  // Per-field uniqueness: report only the fields this rule asked about
  // that the streaming pass found duplicated.
  if ("unique_fields" in rule) {
    const failedFields = rule.unique_fields.filter((f) => duplicateFields.includes(f));
    if (failedFields.length > 0) {
      return { message: `Duplicate values found in field(s): ${failedFields.join(", ")}`, onFail };
    }
  }

  return null;
};
|
|
17920
|
+
}
|
|
17921
|
+
};
|
|
17922
|
+
var DataValidationEngine = new DataValidationEngineClass();
|
|
17923
|
+
var DataValidationEngine_default = DataValidationEngine;
|
|
17924
|
+
|
|
17815
17925
|
// ../../packages/engines/src/usage/DataframeManager.ts
|
|
17816
17926
|
var DataframeManagerClass = class {
|
|
17817
17927
|
fill(points, from, to, onlyLastValue, maintainLastValue) {
|
|
@@ -18570,6 +18680,32 @@ var ConsumerExecutorClass = class {
|
|
|
18570
18680
|
}
|
|
18571
18681
|
}
|
|
18572
18682
|
}
|
|
18683
|
+
for (const field of fields) {
|
|
18684
|
+
const { cField } = field;
|
|
18685
|
+
const fieldKey = cField.alias ?? cField.key;
|
|
18686
|
+
if (cField.validate && cField.validate.length > 0) {
|
|
18687
|
+
const result = DataValidationEngine_default.applyValidations(record[fieldKey], cField.validate, fieldKey);
|
|
18688
|
+
if (!result.valid) {
|
|
18689
|
+
const errorMessage = `Validation failed for field "${fieldKey}" (index: ${recordIndex}): ${result.message}`;
|
|
18690
|
+
switch (result.onFail) {
|
|
18691
|
+
case "set_default":
|
|
18692
|
+
record[fieldKey] = cField.default;
|
|
18693
|
+
break;
|
|
18694
|
+
case "skip":
|
|
18695
|
+
return null;
|
|
18696
|
+
case "warn":
|
|
18697
|
+
Logger_default.warn(errorMessage);
|
|
18698
|
+
break;
|
|
18699
|
+
case "fail":
|
|
18700
|
+
default: {
|
|
18701
|
+
const err = new Error(errorMessage);
|
|
18702
|
+
Logger_default.error(err);
|
|
18703
|
+
throw err;
|
|
18704
|
+
}
|
|
18705
|
+
}
|
|
18706
|
+
}
|
|
18707
|
+
}
|
|
18708
|
+
}
|
|
18573
18709
|
try {
|
|
18574
18710
|
for (const dimension of dimensions) {
|
|
18575
18711
|
const field = fields.find((x) => x.cField.key === dimension.name);
|
|
@@ -18810,6 +18946,48 @@ var ConsumerExecutorClass = class {
|
|
|
18810
18946
|
return false;
|
|
18811
18947
|
}
|
|
18812
18948
|
};
|
|
18949
|
+
// Streams the consumer's produced dataset once and evaluates all
// dataset-level validations against it.
//
// Params:  consumer    - consumer definition (reads `validate`, field config)
//          datasetPath - path to the line-delimited result file
// Returns: array of failure descriptors ({ message, onFail }); empty when
//          no validations are configured or all pass.
//
// A single pass collects everything the rules need: total row count,
// whole-line duplicate detection (only when a no_duplicates rule exists),
// and per-field duplicate detection for fields named by unique_fields rules
// (records are parsed only when such fields exist).
this.processDatasetValidation = async (consumer, datasetPath) => {
  const validations = consumer.validate;
  if (!validations || validations.length === 0) return [];
  const internalRecordFormat = OutputExecutor_default._getInternalRecordFormat(consumer);
  const internalFields = ConsumerManager_default.getExpandedFields(consumer);
  let rowCount = 0;
  const seenRows = /* @__PURE__ */ new Set();
  const fieldValueSets = /* @__PURE__ */ new Map();
  let hasDuplicateRows = false;
  // Set instead of array + includes(): O(1) membership per row.
  const duplicateFieldSet = /* @__PURE__ */ new Set();
  const uniqueFieldKeys = DataValidationEngine_default.extractUniqueFieldKeys(validations);
  const checkDuplicateRows = DataValidationEngine_default.hasRule(validations, "no_duplicates");
  for (const fieldKey of uniqueFieldKeys) {
    fieldValueSets.set(fieldKey, /* @__PURE__ */ new Set());
  }
  const reader = import_fs9.default.createReadStream(datasetPath);
  const lineReader = import_readline6.default.createInterface({ input: reader, crlfDelay: Infinity });
  try {
    for await (const line of lineReader) {
      rowCount++;
      if (checkDuplicateRows) {
        if (seenRows.has(line))
          hasDuplicateRows = true;
        else
          seenRows.add(line);
      }
      if (uniqueFieldKeys.length > 0) {
        // Parsing is format-dependent; only done when a unique_fields rule
        // actually needs per-field values.
        const record = internalRecordFormat === "CSV" || internalRecordFormat === "TXT" ? LineParser_default._internalParseCSV(line, internalFields) : LineParser_default._internalParseJSON(line);
        for (const fieldKey of uniqueFieldKeys) {
          const valueSet = fieldValueSets.get(fieldKey);
          const val = String(record[fieldKey] ?? "");
          if (valueSet.has(val)) {
            duplicateFieldSet.add(fieldKey);
          } else {
            valueSet.add(val);
          }
        }
      }
    }
  } finally {
    // Always release the readline interface and the underlying stream,
    // even if parsing throws mid-file (the original leaked them on error).
    lineReader.close();
    reader.destroy();
  }
  return DataValidationEngine_default.evaluateDatasetValidations(validations, { rowCount, hasDuplicateRows, duplicateFields: [...duplicateFieldSet] });
};
|
|
18813
18991
|
/**
|
|
18814
18992
|
* Compares two values, handling numbers, strings, and dates
|
|
18815
18993
|
* Returns: negative if a < b, positive if a > b, 0 if equal
|
|
@@ -19227,7 +19405,7 @@ var ExecutorOrchestratorClass = class {
|
|
|
19227
19405
|
const tracker = new ExecutorPerformance_default();
|
|
19228
19406
|
const _progress = new ExecutorProgress_default(logProgress);
|
|
19229
19407
|
const { usageId } = UsageManager_default.startUsage(consumer, details);
|
|
19230
|
-
const scope = { id: usageId, folder: `${consumer.name}_${usageId}`, workersId: [], limitFileSize: consumer.
|
|
19408
|
+
const scope = { id: usageId, folder: `${consumer.name}_${usageId}`, workersId: [], limitFileSize: consumer.maximumFileSize };
|
|
19231
19409
|
const pool = this.createPool();
|
|
19232
19410
|
try {
|
|
19233
19411
|
const start = performance.now();
|
|
@@ -19322,6 +19500,22 @@ var ExecutorOrchestratorClass = class {
|
|
|
19322
19500
|
postOperation.totalOutputCount = unifiedOutputCount;
|
|
19323
19501
|
Logger_default.log(`[${usageId}] Pivot complete: ${unifiedOutputCount} rows in ${Math.round(performance.now() - counter)}ms`);
|
|
19324
19502
|
}
|
|
19503
|
+
// When the consumer declares dataset-level validations, run them against
// the unified output file and react per each result's onFail action:
// "fail" aborts the execution, "warn" only logs; other actions are ignored
// here (they are handled at the field level).
if (consumer.validate && consumer.validate.length > 0) {
  Logger_default.log(`[${usageId}] Running dataset-level validations`);
  counter = performance.now();
  const validationResults = await ConsumerExecutor_default.processDatasetValidation(consumer, ExecutorScope_default2.getMainPath(scope));
  tracker.measure("dataset-validation", performance.now() - counter);
  for (const result of validationResults) {
    switch (result.onFail) {
      case "fail": {
        const err = new Error(`Dataset validation failed for consumer "${consumer.name}": ${result.message}`);
        Logger_default.error(err);
        throw err;
      }
      case "warn":
        Logger_default.warn(`Dataset validation warning for consumer "${consumer.name}": ${result.message}`);
        break;
    }
  }
  Logger_default.log(`[${usageId}] Dataset validations complete in ${Math.round(performance.now() - counter)}ms`);
}
|
|
19325
19519
|
counter = performance.now();
|
|
19326
19520
|
Logger_default.log(`[${usageId}] Exporting results to ${consumer.outputs.length} output(s)`);
|
|
19327
19521
|
const exportRes = await OutputExecutor_default.exportResult(consumer, ConsumerManager_default.getExpandedFields(consumer), scope);
|