@forzalabs/remora 1.1.14 → 1.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md ADDED
@@ -0,0 +1,107 @@
+ # Changelog
+
+ All notable changes to this project will be documented in this file.
+
+ The format is based on Keep a Changelog, and this project adheres to Semantic Versioning.
+
+ ## Unreleased
+
+ ### Added
+ - Added field-level consumer validations with support for multiple rules per field and per-rule failure actions: `fail`, `skip`, `warn`, and `set_default`
+ - Added dataset-level consumer validations for `unique_fields`, `min_rows`, `max_rows`, `no_duplicates`, and `not_empty`
+ - Added `DataValidationEngine` to centralize field and dataset validation logic
+ - Added validation result type definitions to the definitions package for shared use across engines and executors
+ - Added `warn()` logging support for non-fatal validation outcomes
+ - Added canary consumer coverage for field-level and dataset-level validations with passing, warning, skipped, defaulted, and failing scenarios
+ - Added `verify:local` to the canary package to build the local CLI and run the canary suite against it instead of the published package
+
+ ### Changed
+ - Updated consumer field validation configuration from a single flat validation object to an ordered array of validation rules with explicit `onFail` behavior
+
+ ### Fixed
+ - Fixed the consumer JSON schema to support the new field-level and dataset-level validation configuration
+ - Fixed AJV strict-mode compatibility for validation `in` and `not_in` rule arrays by replacing union `type` declarations with `oneOf`
+
+ ## V 1.1.15 - 2026-03-26
+
+ ### Added
+ - Added validation that consumer field keys exist in the referenced producer's dimensions/measures
+ - Added validation that every consumer field defines the required `key` property
+ - Added validation that `copyFrom` references a field that appears earlier in the consumer's field list
+ - Added validation that `distinctOn` keys and `orderBy` reference fields present in the consumer
+ - Added validation that join SQL `${P.field}` and `${producer.field}` references point to valid fields
+
+ ### Fixed
+ - Fixed an environment variable not being exposed to the front-end
+ - Fixed the database endpoint selector
+ - Fixed worker image volume usage in a cloud environment
+ - Fixed worker-thread execution errors being logged only to the terminal by propagating them back to the orchestrator file logger
+ - Fixed `ConsumerExecutor.processRecord` error reporting to log step-specific failures for field resolution, aliasing, transformations, and filter evaluation
+
+ ## V 1.1.11 - 2026-02-05
+
+ ### Added
+ - Added `startRow` and `startColumn` settings for Excel producers (.xls/.xlsx), allowing users to specify the 1-indexed row and column from which to begin reading data
+
+ ### Fixed
+ - Fixed `MaxListenersExceededWarning` on `WriteStream` during consumer execution by replacing shared stream merge with per-file append pipelines
+ - Fixed CLI `run` command always exiting with code 1 even on successful runs
+ - Fixed incomplete file logging caused by `process.exit()` terminating before winston could flush buffered writes; added `logger.flush()` to worker threads, orchestrator, and CLI exit paths
+ - Added `logger.flush()` before `process.exit()` in all data-processing CLI actions (sample, mock, automap, discover, debug) and worker startup
+ - Fixed CLI `discover` command exiting with code 1 on success instead of code 0
+ - Fixed per-worker `WriteStream` in `Executor.ts` never being closed, risking data loss before distinct/distinctOn post-processing passes
+ - Fixed `Dataset.ts` stream await pattern where `resolve` was never called in the `finish` handler (5 sites: transformStream, sort batches, k-way merge, append), causing promises to hang indefinitely
+ - Fixed `ExecutorWriter.ts` not awaiting intermediate stream flush during file-size-based rotation
+ - Fixed `DriverHelper.appendObjectsToUnifiedFile` and `LocalDestinationDriver.transformAndMove` not awaiting stream flush before returning
+
+ ## V 1.1.9 - 2026-02-04
+
+ ### Added
+ - Added `switch_case` transformation for mapping specific values to other values (similar to a switch/case statement)
+ - Added validation to detect multiple consumer fields reading from the same producer dimension (suggests using `copyFrom` instead)
+ - Added detailed logging to the executor orchestrator with usage ID tracing throughout the execution lifecycle
+
+ ### Changed
+ - Cleaned up CLI execution error output to show concise messages in the console while preserving full stack traces in internal logs
+
+ ## V 1.1.8 - 2026-02-03
+
+ ### Added
+ - Added `pivot` option to consumers, enabling row-to-column transformation with aggregation (sum, count, avg, min, max)
+ - Added `copyFrom` property to consumer fields, allowing a field to be a value copy of another field in the dataset
+
+ ## V 1.1.7 - 2026-02-02
+
+ ### Changed
+ - Improved the mock engine
+ - Improved logging
+
+ ## V 1.1.6 - 2026-02-02
+
+ ### Added
+ - Added `--limit` option to the `remora run` command to process only the first N records
+ - Added descriptive error messages for failed field transformations with full stack trace preservation
+ - Added file logging with rotation (enabled via `REMORA_DEBUG_MODE=true` in production)
+ - Added structured logging across key application areas
+
+ ### Changed
+ - Moved `DEBUG_MODE` from project.json settings to the `REMORA_DEBUG_MODE` environment variable
+
+ ## V 1.1.5 - 2026-02-01
+
+ ### Added
+ - Refactored the project into a monorepo
+ - Added a maximum output file size, definable per consumer
+ - Added support for nested subfolders inside remora configuration directories (sources, producers, consumers, schemas)
+
+ ### Fixed
+ - Fixed a bug in parsing GZ files
+ - Fixed issues with concurrent requests
+
+ ### Changed
+ - Updated the Dockerfile for apps in a monorepo build
+ - Updated package.json for workspaces compliance
+ - Refactored the internal module structure
+ - Removed the `_file` annotations for environment variables
+
+ ## V 1.0.18
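For orientation before the code diff below: a minimal sketch of the field-level validation change described in the Unreleased section, contrasting the old flat `validate` object with the new ordered rule array. The field name, rules, and default value are hypothetical; the shape follows the consumer JSON schema updated later in this diff.

```js
// Hypothetical consumer field, written as a JS object literal so it can carry comments.
// Before (1.1.x): a single flat validation object with no per-rule failure action.
const fieldBefore = {
  key: "email",
  validate: { required: true, regex: "^[^@]+@[^@]+$" }
};

// After (1.2.x): an ordered array of rules, each with its own onFail action.
// Rules are evaluated in order and the first failing rule decides the outcome.
const fieldAfter = {
  key: "email",
  default: "unknown@example.com", // substituted when onFail is "set_default"
  validate: [
    { rule: { required: true }, onFail: "fail" },         // abort the run
    { rule: { regex: "^[^@]+@[^@]+$" }, onFail: "warn" }, // log and keep the record
    { rule: { max_length: 120 }, onFail: "set_default" }  // replace with the default
  ]
};
```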
package/index.js CHANGED
@@ -13357,6 +13357,10 @@ var Logger = class {
  console.info(message);
  FileLogService_default.write("INFO", String(message));
  };
+ this.warn = (message) => {
+ console.warn(message);
+ FileLogService_default.write("WARN", String(message));
+ };
  this.flush = () => FileLogService_default.flush();
  this.close = () => FileLogService_default.close();
  this.error = (error) => {
@@ -13500,7 +13504,7 @@ var import_promises = __toESM(require("fs/promises"), 1);

  // ../../packages/constants/src/Constants.ts
  var CONSTANTS = {
- cliVersion: "1.1.14",
+ cliVersion: "1.2.1",
  backendVersion: 1,
  backendPort: 5088,
  workerVersion: 2,
@@ -13791,6 +13795,10 @@ var ValidatorClass = class {
  try {
  const recursionErrors = this.detectConsumerRecursion(consumer);
  errors.push(...recursionErrors);
+ for (const [i, field] of consumer.fields.entries()) {
+ if (!field.key)
+ errors.push(`Field at index ${i} in consumer "${consumer.name}" is missing the required "key" property`);
+ }
  const allFieldsWithNoFrom = consumer.fields.filter((x) => x.key === "*" && !x.from);
  if (allFieldsWithNoFrom.length > 0 && consumer.producers.length > 1)
  errors.push(`Field with key "*" was used without specifying the "from" producer and multiple producers were found.`);
@@ -18478,6 +18486,112 @@ var TransformationEngineClass = class {
  var TransformationEngine = new TransformationEngineClass();
  var TransformationEngine_default = TransformationEngine;

+ // ../../packages/engines/src/transform/DataValidationEngine.ts
+ var DataValidationEngineClass = class {
+ constructor() {
+ this.applyValidations = (value, validations, fieldKey) => {
+ for (const validation of validations) {
+ const passed = this.evaluateRule(value, validation.rule);
+ if (!passed) {
+ return {
+ valid: false,
+ message: this.buildMessage(value, validation.rule, fieldKey),
+ onFail: validation.onFail
+ };
+ }
+ }
+ return { valid: true };
+ };
+ this.evaluateRule = (value, rule) => {
+ if ("required" in rule)
+ return Algo_default.hasVal(value);
+ if ("min" in rule) {
+ if (!Algo_default.hasVal(value)) return true;
+ const num = Number(value);
+ return !isNaN(num) && num >= rule.min;
+ }
+ if ("max" in rule) {
+ if (!Algo_default.hasVal(value)) return true;
+ const num = Number(value);
+ return !isNaN(num) && num <= rule.max;
+ }
+ if ("regex" in rule) {
+ if (!Algo_default.hasVal(value)) return true;
+ return new RegExp(rule.regex).test(String(value));
+ }
+ if ("min_length" in rule) {
+ if (!Algo_default.hasVal(value)) return true;
+ return String(value).length >= rule.min_length;
+ }
+ if ("max_length" in rule) {
+ if (!Algo_default.hasVal(value)) return true;
+ return String(value).length <= rule.max_length;
+ }
+ if ("in" in rule) {
+ return rule.in.includes(value);
+ }
+ if ("not_in" in rule) {
+ return !rule.not_in.includes(value);
+ }
+ return true;
+ };
+ this.buildMessage = (value, rule, fieldKey) => {
+ const preview = Algo_default.hasVal(value) ? JSON.stringify(value) : "null/undefined";
+ if ("required" in rule) return `Field "${fieldKey}" is required but got ${preview}`;
+ if ("min" in rule) return `Field "${fieldKey}" value ${preview} is below minimum ${rule.min}`;
+ if ("max" in rule) return `Field "${fieldKey}" value ${preview} exceeds maximum ${rule.max}`;
+ if ("regex" in rule) return `Field "${fieldKey}" value ${preview} does not match pattern "${rule.regex}"`;
+ if ("min_length" in rule) return `Field "${fieldKey}" value ${preview} is shorter than minimum length ${rule.min_length}`;
+ if ("max_length" in rule) return `Field "${fieldKey}" value ${preview} exceeds maximum length ${rule.max_length}`;
+ if ("in" in rule) return `Field "${fieldKey}" value ${preview} is not in the allowed values`;
+ if ("not_in" in rule) return `Field "${fieldKey}" value ${preview} is in the disallowed values`;
+ return `Field "${fieldKey}" failed validation`;
+ };
+ this.evaluateDatasetValidations = (validations, context) => {
+ const results = [];
+ for (const validation of validations) {
+ const result = this.evaluateDatasetRule(validation, context);
+ if (result) results.push(result);
+ }
+ return results;
+ };
+ this.extractUniqueFieldKeys = (validations) => {
+ return validations.filter((v) => "unique_fields" in v.rule).flatMap((v) => v.rule.unique_fields);
+ };
+ this.hasRule = (validations, ruleKey) => {
+ return validations.some((v) => ruleKey in v.rule);
+ };
+ this.evaluateDatasetRule = (validation, context) => {
+ const { rule, onFail } = validation;
+ const { rowCount, hasDuplicateRows, duplicateFields } = context;
+ if ("not_empty" in rule) {
+ if (rowCount === 0)
+ return { message: "Dataset is empty", onFail };
+ }
+ if ("min_rows" in rule) {
+ if (rowCount < rule.min_rows)
+ return { message: `Dataset has ${rowCount} rows, expected at least ${rule.min_rows}`, onFail };
+ }
+ if ("max_rows" in rule) {
+ if (rowCount > rule.max_rows)
+ return { message: `Dataset has ${rowCount} rows, expected at most ${rule.max_rows}`, onFail };
+ }
+ if ("no_duplicates" in rule) {
+ if (hasDuplicateRows)
+ return { message: "Dataset contains duplicate rows", onFail };
+ }
+ if ("unique_fields" in rule) {
+ const failedFields = rule.unique_fields.filter((f) => duplicateFields.includes(f));
+ if (failedFields.length > 0)
+ return { message: `Duplicate values found in field(s): ${failedFields.join(", ")}`, onFail };
+ }
+ return null;
+ };
+ }
+ };
+ var DataValidationEngine = new DataValidationEngineClass();
+ var DataValidationEngine_default = DataValidationEngine;
+
  // ../../packages/engines/src/usage/DataframeManager.ts
  var DataframeManagerClass = class {
  fill(points, from, to, onlyLastValue, maintainLastValue) {
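The bundled engine above is hard to read in its flattened form; the standalone sketch below mirrors the semantics of `evaluateRule`: absent values (the `Algo_default.hasVal` guard) pass the numeric, regex, and length rules, while `required` rejects them and `in`/`not_in` compare them directly. The `hasVal` helper here is an assumption about `Algo_default.hasVal`; this is an illustrative reimplementation, not the packaged code.

```js
// Assumed stand-in for Algo_default.hasVal: present means not null/undefined/empty.
const hasVal = (v) => v !== null && v !== undefined && v !== "";

function evaluateRule(value, rule) {
  if ("required" in rule) return hasVal(value);
  // Numeric, regex, and length rules skip absent values; in/not_in do not.
  const skipsAbsent = ["min", "max", "regex", "min_length", "max_length"].some((k) => k in rule);
  if (skipsAbsent && !hasVal(value)) return true;
  if ("min" in rule) { const n = Number(value); return !isNaN(n) && n >= rule.min; }
  if ("max" in rule) { const n = Number(value); return !isNaN(n) && n <= rule.max; }
  if ("regex" in rule) return new RegExp(rule.regex).test(String(value));
  if ("min_length" in rule) return String(value).length >= rule.min_length;
  if ("max_length" in rule) return String(value).length <= rule.max_length;
  if ("in" in rule) return rule.in.includes(value);
  if ("not_in" in rule) return !rule.not_in.includes(value);
  return true; // unknown rules pass, matching the engine's fallthrough
}

console.log(evaluateRule(null, { min: 5 }));         // true: absent and not required
console.log(evaluateRule(null, { required: true })); // false
console.log(evaluateRule("3", { min: 5 }));          // false: coerced to 3, below minimum
```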
@@ -18911,8 +19025,9 @@ var ConsumerExecutorClass = class {
  for (const field of fields) {
  const { cField } = field;
  const fieldKey = cField.alias ?? cField.key;
+ let dimension;
  try {
- const dimension = dimensions.find((x) => x.name === cField.key);
+ dimension = dimensions.find((x) => x.name === cField.key);
  if (!dimension) {
  if (cField.fixed && Algo_default.hasVal(cField.default))
  record[fieldKey] = cField.default;
@@ -18921,12 +19036,18 @@ var ConsumerExecutorClass = class {
  else
  throw new Error(`The requested field "${cField.key}" from the consumer is not present in the underlying producer "${producer.name}" (${dimensions.map((x) => x.name).join(", ")})`);
  }
- if (cField.alias && cField.alias !== dimension.name) {
+ } catch (error) {
+ const err = new Error(`Resolving dimension for field "${fieldKey}" of producer "${producer.name}" failed (index: ${recordIndex}): ${error.message}`, { cause: error });
+ Logger_default.error(err);
+ throw err;
+ }
+ try {
+ if (cField.alias && dimension && cField.alias !== dimension.name) {
  record[cField.alias] = record[dimension.name];
  delete record[dimension.name];
  }
  } catch (error) {
- const err = new Error(`Field mapping failed for field "${fieldKey}" of producer "${producer.name}" (index: ${recordIndex}): ${error.message}`, { cause: error });
+ const err = new Error(`Aliasing field "${cField.key}" to "${cField.alias}" of producer "${producer.name}" failed (index: ${recordIndex}): ${error.message}`, { cause: error });
  Logger_default.error(err);
  throw err;
  }
@@ -18960,6 +19081,32 @@ var ConsumerExecutorClass = class {
  }
  }
  }
+ for (const field of fields) {
+ const { cField } = field;
+ const fieldKey = cField.alias ?? cField.key;
+ if (cField.validate && cField.validate.length > 0) {
+ const result = DataValidationEngine_default.applyValidations(record[fieldKey], cField.validate, fieldKey);
+ if (!result.valid) {
+ const errorMessage = `Validation failed for field "${fieldKey}" (index: ${recordIndex}): ${result.message}`;
+ switch (result.onFail) {
+ case "set_default":
+ record[fieldKey] = cField.default;
+ break;
+ case "skip":
+ return null;
+ case "warn":
+ Logger_default.warn(errorMessage);
+ break;
+ case "fail":
+ default: {
+ const err = new Error(errorMessage);
+ Logger_default.error(err);
+ throw err;
+ }
+ }
+ }
+ }
+ }
  try {
  for (const dimension of dimensions) {
  const field = fields.find((x) => x.cField.key === dimension.name);
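To make the four `onFail` branches in the hunk above concrete, here is a small illustrative driver, not package API: the action attached to the first failing rule decides whether a record is kept, repaired, dropped, or fatal. The field name and values are hypothetical.

```js
// Outcome of each onFail action for a record failing a rule on "age".
function applyOnFail(record, fieldKey, defaultValue, onFail, message) {
  switch (onFail) {
    case "set_default":
      record[fieldKey] = defaultValue; // repair the value and keep the record
      return record;
    case "skip":
      return null; // drop the record from the output dataset
    case "warn":
      console.warn(message); // keep the record, log a warning
      return record;
    case "fail":
    default:
      throw new Error(message); // abort consumer execution
  }
}

console.log(applyOnFail({ age: 12 }, "age", 18, "set_default", "age below minimum")); // { age: 18 }
console.log(applyOnFail({ age: 12 }, "age", 18, "skip", "age below minimum"));        // null
```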
@@ -19200,6 +19347,48 @@ var ConsumerExecutorClass = class {
  return false;
  }
  };
+ this.processDatasetValidation = async (consumer, datasetPath) => {
+ const validations = consumer.validate;
+ if (!validations || validations.length === 0) return [];
+ const internalRecordFormat = OutputExecutor_default._getInternalRecordFormat(consumer);
+ const internalFields = ConsumerManager_default.getExpandedFields(consumer);
+ let rowCount = 0;
+ const seenRows = /* @__PURE__ */ new Set();
+ const fieldValueSets = /* @__PURE__ */ new Map();
+ let hasDuplicateRows = false;
+ const duplicateFields = [];
+ const uniqueFieldKeys = DataValidationEngine_default.extractUniqueFieldKeys(validations);
+ const checkDuplicateRows = DataValidationEngine_default.hasRule(validations, "no_duplicates");
+ for (const fieldKey of uniqueFieldKeys) {
+ fieldValueSets.set(fieldKey, /* @__PURE__ */ new Set());
+ }
+ const reader = import_fs11.default.createReadStream(datasetPath);
+ const lineReader = import_readline6.default.createInterface({ input: reader, crlfDelay: Infinity });
+ for await (const line of lineReader) {
+ rowCount++;
+ if (checkDuplicateRows) {
+ if (seenRows.has(line))
+ hasDuplicateRows = true;
+ else
+ seenRows.add(line);
+ }
+ if (uniqueFieldKeys.length > 0) {
+ const record = internalRecordFormat === "CSV" || internalRecordFormat === "TXT" ? LineParser_default._internalParseCSV(line, internalFields) : LineParser_default._internalParseJSON(line);
+ for (const fieldKey of uniqueFieldKeys) {
+ const valueSet = fieldValueSets.get(fieldKey);
+ const val = String(record[fieldKey] ?? "");
+ if (valueSet.has(val)) {
+ if (!duplicateFields.includes(fieldKey))
+ duplicateFields.push(fieldKey);
+ } else {
+ valueSet.add(val);
+ }
+ }
+ }
+ }
+ lineReader.close();
+ return DataValidationEngine_default.evaluateDatasetValidations(validations, { rowCount, hasDuplicateRows, duplicateFields });
+ };
  /**
  * Compares two values, handling numbers, strings, and dates
  * Returns: negative if a < b, positive if a > b, 0 if equal
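A readable sketch of the single-pass scan that `processDatasetValidation` performs above: the dataset file is streamed line by line, fully duplicate rows are detected with one Set over raw lines, and each `unique_fields` key gets its own Set of seen values. JSON-lines parsing is assumed here for simplicity (the engine also handles CSV/TXT via `LineParser`); this is illustrative, not the package API.

```js
import fs from "fs";
import readline from "readline";

// Single pass over the dataset: row count, duplicate-row flag, per-field duplicates.
async function scanDataset(datasetPath, uniqueFieldKeys, checkDuplicateRows) {
  const seenRows = new Set();
  const fieldValueSets = new Map(uniqueFieldKeys.map((k) => [k, new Set()]));
  const duplicateFields = [];
  let rowCount = 0;
  let hasDuplicateRows = false;
  const lineReader = readline.createInterface({
    input: fs.createReadStream(datasetPath),
    crlfDelay: Infinity
  });
  for await (const line of lineReader) {
    rowCount++;
    if (checkDuplicateRows) {
      if (seenRows.has(line)) hasDuplicateRows = true;
      else seenRows.add(line);
    }
    if (uniqueFieldKeys.length > 0) {
      const record = JSON.parse(line); // assumption: JSON-lines internal format
      for (const key of uniqueFieldKeys) {
        const values = fieldValueSets.get(key);
        const val = String(record[key] ?? "");
        if (values.has(val)) {
          if (!duplicateFields.includes(key)) duplicateFields.push(key);
        } else {
          values.add(val);
        }
      }
    }
  }
  lineReader.close();
  return { rowCount, hasDuplicateRows, duplicateFields };
}
```

Note the memory trade-off this mirrors: uniqueness checks hold one Set entry per distinct row or value, so very large datasets pay for `no_duplicates` and `unique_fields` in memory proportional to the data.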
@@ -19457,7 +19646,7 @@ var ExecutorOrchestratorClass = class {
  const tracker = new ExecutorPerformance_default();
  const _progress = new ExecutorProgress_default(logProgress);
  const { usageId } = UsageManager_default.startUsage(consumer, details);
- const scope = { id: usageId, folder: `${consumer.name}_${usageId}`, workersId: [], limitFileSize: consumer.MaximumFileSize };
+ const scope = { id: usageId, folder: `${consumer.name}_${usageId}`, workersId: [], limitFileSize: consumer.maximumFileSize };
  const pool = this.createPool();
  try {
  const start = performance.now();
@@ -19552,6 +19741,22 @@ var ExecutorOrchestratorClass = class {
  postOperation.totalOutputCount = unifiedOutputCount;
  Logger_default.log(`[${usageId}] Pivot complete: ${unifiedOutputCount} rows in ${Math.round(performance.now() - counter)}ms`);
  }
+ if (consumer.validate && consumer.validate.length > 0) {
+ Logger_default.log(`[${usageId}] Running dataset-level validations`);
+ counter = performance.now();
+ const validationResults = await ConsumerExecutor_default.processDatasetValidation(consumer, ExecutorScope_default2.getMainPath(scope));
+ tracker.measure("dataset-validation", performance.now() - counter);
+ for (const result of validationResults) {
+ if (result.onFail === "fail") {
+ const err = new Error(`Dataset validation failed for consumer "${consumer.name}": ${result.message}`);
+ Logger_default.error(err);
+ throw err;
+ } else if (result.onFail === "warn") {
+ Logger_default.warn(`Dataset validation warning for consumer "${consumer.name}": ${result.message}`);
+ }
+ }
+ Logger_default.log(`[${usageId}] Dataset validations complete in ${Math.round(performance.now() - counter)}ms`);
+ }
  counter = performance.now();
  Logger_default.log(`[${usageId}] Exporting results to ${consumer.outputs.length} output(s)`);
  const exportRes = await OutputExecutor_default.exportResult(consumer, ConsumerManager_default.getExpandedFields(consumer), scope);
@@ -129,31 +129,70 @@
  ]
  },
  "validate": {
- "type": "object",
- "description": "Rules to check field value compliance and data quality",
- "properties": {
- "min": {
- "type": "number",
- "description": "Minimum value for numeric fields"
- },
- "max": {
- "type": "number",
- "description": "Maximum value for numeric fields"
- },
- "regex": {
- "type": "string",
- "description": "Regular expression pattern to validate string fields"
+ "type": "array",
+ "description": "Rules to check field value compliance and data quality. Each validation has its own rule and action to take on failure.",
+ "items": {
+ "type": "object",
+ "properties": {
+ "rule": {
+ "type": "object",
+ "description": "The validation rule to check",
+ "oneOf": [
+ {
+ "properties": { "min": { "type": "number", "description": "Minimum value for numeric fields" } },
+ "required": ["min"],
+ "additionalProperties": false
+ },
+ {
+ "properties": { "max": { "type": "number", "description": "Maximum value for numeric fields" } },
+ "required": ["max"],
+ "additionalProperties": false
+ },
+ {
+ "properties": { "regex": { "type": "string", "description": "Regular expression pattern to validate string fields" } },
+ "required": ["regex"],
+ "additionalProperties": false
+ },
+ {
+ "properties": { "required": { "type": "boolean", "const": true, "description": "Whether the field value must be present" } },
+ "required": ["required"],
+ "additionalProperties": false
+ },
+ {
+ "properties": { "min_length": { "type": "number", "description": "Minimum string length" } },
+ "required": ["min_length"],
+ "additionalProperties": false
+ },
+ {
+ "properties": { "max_length": { "type": "number", "description": "Maximum string length" } },
+ "required": ["max_length"],
+ "additionalProperties": false
+ },
+ {
+ "properties": { "in": { "type": "array", "items": { "oneOf": [{ "type": "string" }, { "type": "number" }, { "type": "boolean" }] }, "description": "Allowed values" } },
+ "required": ["in"],
+ "additionalProperties": false
+ },
+ {
+ "properties": { "not_in": { "type": "array", "items": { "oneOf": [{ "type": "string" }, { "type": "number" }, { "type": "boolean" }] }, "description": "Disallowed values" } },
+ "required": ["not_in"],
+ "additionalProperties": false
+ }
+ ]
+ },
+ "onFail": {
+ "type": "string",
+ "description": "Action to take when validation fails",
+ "enum": ["fail", "skip", "warn", "set_default"]
+ }
  },
- "required": {
- "type": "boolean",
- "description": "Whether the field is required"
- }
- },
- "additionalProperties": false
+ "required": ["rule", "onFail"],
+ "additionalProperties": false
+ }
  },
  "onError": {
  "type": "string",
- "description": "Action to take if an error occurs during transformations or validation",
+ "description": "Action to take if an error occurs during transformations",
  "enum": ["set_default", "skip", "fail"]
  },
  "default": {
@@ -463,6 +502,53 @@
  "_version": {
  "type": "number",
  "description": "Version number of the consumer configuration"
+ },
+ "validate": {
+ "type": "array",
+ "description": "Dataset-level validations applied to the final result set before export",
+ "items": {
+ "type": "object",
+ "properties": {
+ "rule": {
+ "type": "object",
+ "description": "The dataset validation rule to check",
+ "oneOf": [
+ {
+ "properties": { "unique_fields": { "type": "array", "items": { "type": "string" }, "minItems": 1, "description": "Field(s) that must have unique values across the dataset" } },
+ "required": ["unique_fields"],
+ "additionalProperties": false
+ },
+ {
+ "properties": { "min_rows": { "type": "number", "description": "Minimum number of rows expected in the dataset" } },
+ "required": ["min_rows"],
+ "additionalProperties": false
+ },
+ {
+ "properties": { "max_rows": { "type": "number", "description": "Maximum number of rows allowed in the dataset" } },
+ "required": ["max_rows"],
+ "additionalProperties": false
+ },
+ {
+ "properties": { "no_duplicates": { "type": "boolean", "const": true, "description": "No fully duplicate rows allowed" } },
+ "required": ["no_duplicates"],
+ "additionalProperties": false
+ },
+ {
+ "properties": { "not_empty": { "type": "boolean", "const": true, "description": "Dataset must contain at least one row" } },
+ "required": ["not_empty"],
+ "additionalProperties": false
+ }
+ ]
+ },
+ "onFail": {
+ "type": "string",
+ "description": "Action to take when dataset validation fails",
+ "enum": ["fail", "warn"]
+ }
+ },
+ "required": ["rule", "onFail"],
+ "additionalProperties": false
+ }
  }
  },
  "required": [
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
  "name": "@forzalabs/remora",
- "version": "1.1.14",
+ "version": "1.2.1",
  "description": "A powerful CLI tool for seamless data translation.",
  "main": "index.js",
  "private": false,
@@ -13351,6 +13351,10 @@ var Logger = class {
  console.info(message);
  FileLogService_default.write("INFO", String(message));
  };
+ this.warn = (message) => {
+ console.warn(message);
+ FileLogService_default.write("WARN", String(message));
+ };
  this.flush = () => FileLogService_default.flush();
  this.close = () => FileLogService_default.close();
  this.error = (error) => {
@@ -13494,7 +13498,7 @@ var import_promises = __toESM(require("fs/promises"), 1);

  // ../../packages/constants/src/Constants.ts
  var CONSTANTS = {
- cliVersion: "1.1.14",
+ cliVersion: "1.2.1",
  backendVersion: 1,
  backendPort: 5088,
  workerVersion: 2,
@@ -13785,6 +13789,10 @@ var ValidatorClass = class {
  try {
  const recursionErrors = this.detectConsumerRecursion(consumer);
  errors.push(...recursionErrors);
+ for (const [i, field] of consumer.fields.entries()) {
+ if (!field.key)
+ errors.push(`Field at index ${i} in consumer "${consumer.name}" is missing the required "key" property`);
+ }
  const allFieldsWithNoFrom = consumer.fields.filter((x) => x.key === "*" && !x.from);
  if (allFieldsWithNoFrom.length > 0 && consumer.producers.length > 1)
  errors.push(`Field with key "*" was used without specifying the "from" producer and multiple producers were found.`);
@@ -17808,6 +17816,112 @@ var TransformationEngineClass = class {
  var TransformationEngine = new TransformationEngineClass();
  var TransformationEngine_default = TransformationEngine;

+ // ../../packages/engines/src/transform/DataValidationEngine.ts
+ var DataValidationEngineClass = class {
+ constructor() {
+ this.applyValidations = (value, validations, fieldKey) => {
+ for (const validation of validations) {
+ const passed = this.evaluateRule(value, validation.rule);
+ if (!passed) {
+ return {
+ valid: false,
+ message: this.buildMessage(value, validation.rule, fieldKey),
+ onFail: validation.onFail
+ };
+ }
+ }
+ return { valid: true };
+ };
+ this.evaluateRule = (value, rule) => {
+ if ("required" in rule)
+ return Algo_default.hasVal(value);
+ if ("min" in rule) {
+ if (!Algo_default.hasVal(value)) return true;
+ const num = Number(value);
+ return !isNaN(num) && num >= rule.min;
+ }
+ if ("max" in rule) {
+ if (!Algo_default.hasVal(value)) return true;
+ const num = Number(value);
+ return !isNaN(num) && num <= rule.max;
+ }
+ if ("regex" in rule) {
+ if (!Algo_default.hasVal(value)) return true;
+ return new RegExp(rule.regex).test(String(value));
+ }
+ if ("min_length" in rule) {
+ if (!Algo_default.hasVal(value)) return true;
+ return String(value).length >= rule.min_length;
+ }
+ if ("max_length" in rule) {
+ if (!Algo_default.hasVal(value)) return true;
+ return String(value).length <= rule.max_length;
+ }
+ if ("in" in rule) {
+ return rule.in.includes(value);
+ }
+ if ("not_in" in rule) {
+ return !rule.not_in.includes(value);
+ }
+ return true;
+ };
+ this.buildMessage = (value, rule, fieldKey) => {
+ const preview = Algo_default.hasVal(value) ? JSON.stringify(value) : "null/undefined";
+ if ("required" in rule) return `Field "${fieldKey}" is required but got ${preview}`;
+ if ("min" in rule) return `Field "${fieldKey}" value ${preview} is below minimum ${rule.min}`;
+ if ("max" in rule) return `Field "${fieldKey}" value ${preview} exceeds maximum ${rule.max}`;
+ if ("regex" in rule) return `Field "${fieldKey}" value ${preview} does not match pattern "${rule.regex}"`;
+ if ("min_length" in rule) return `Field "${fieldKey}" value ${preview} is shorter than minimum length ${rule.min_length}`;
+ if ("max_length" in rule) return `Field "${fieldKey}" value ${preview} exceeds maximum length ${rule.max_length}`;
+ if ("in" in rule) return `Field "${fieldKey}" value ${preview} is not in the allowed values`;
+ if ("not_in" in rule) return `Field "${fieldKey}" value ${preview} is in the disallowed values`;
+ return `Field "${fieldKey}" failed validation`;
+ };
+ this.evaluateDatasetValidations = (validations, context) => {
+ const results = [];
+ for (const validation of validations) {
+ const result = this.evaluateDatasetRule(validation, context);
+ if (result) results.push(result);
+ }
+ return results;
+ };
+ this.extractUniqueFieldKeys = (validations) => {
+ return validations.filter((v) => "unique_fields" in v.rule).flatMap((v) => v.rule.unique_fields);
+ };
+ this.hasRule = (validations, ruleKey) => {
+ return validations.some((v) => ruleKey in v.rule);
+ };
+ this.evaluateDatasetRule = (validation, context) => {
+ const { rule, onFail } = validation;
+ const { rowCount, hasDuplicateRows, duplicateFields } = context;
+ if ("not_empty" in rule) {
+ if (rowCount === 0)
+ return { message: "Dataset is empty", onFail };
+ }
+ if ("min_rows" in rule) {
+ if (rowCount < rule.min_rows)
+ return { message: `Dataset has ${rowCount} rows, expected at least ${rule.min_rows}`, onFail };
+ }
+ if ("max_rows" in rule) {
+ if (rowCount > rule.max_rows)
+ return { message: `Dataset has ${rowCount} rows, expected at most ${rule.max_rows}`, onFail };
+ }
+ if ("no_duplicates" in rule) {
+ if (hasDuplicateRows)
+ return { message: "Dataset contains duplicate rows", onFail };
+ }
+ if ("unique_fields" in rule) {
+ const failedFields = rule.unique_fields.filter((f) => duplicateFields.includes(f));
+ if (failedFields.length > 0)
+ return { message: `Duplicate values found in field(s): ${failedFields.join(", ")}`, onFail };
+ }
+ return null;
+ };
+ }
+ };
+ var DataValidationEngine = new DataValidationEngineClass();
+ var DataValidationEngine_default = DataValidationEngine;
+
  // ../../packages/engines/src/usage/DataframeManager.ts
  var DataframeManagerClass = class {
  fill(points, from, to, onlyLastValue, maintainLastValue) {
@@ -18510,8 +18624,9 @@ var ConsumerExecutorClass = class {
  for (const field of fields) {
  const { cField } = field;
  const fieldKey = cField.alias ?? cField.key;
+ let dimension;
  try {
- const dimension = dimensions.find((x) => x.name === cField.key);
+ dimension = dimensions.find((x) => x.name === cField.key);
  if (!dimension) {
  if (cField.fixed && Algo_default.hasVal(cField.default))
  record[fieldKey] = cField.default;
@@ -18520,12 +18635,18 @@ var ConsumerExecutorClass = class {
  else
  throw new Error(`The requested field "${cField.key}" from the consumer is not present in the underlying producer "${producer.name}" (${dimensions.map((x) => x.name).join(", ")})`);
  }
- if (cField.alias && cField.alias !== dimension.name) {
+ } catch (error) {
+ const err = new Error(`Resolving dimension for field "${fieldKey}" of producer "${producer.name}" failed (index: ${recordIndex}): ${error.message}`, { cause: error });
+ Logger_default.error(err);
+ throw err;
+ }
+ try {
+ if (cField.alias && dimension && cField.alias !== dimension.name) {
  record[cField.alias] = record[dimension.name];
  delete record[dimension.name];
  }
  } catch (error) {
- const err = new Error(`Field mapping failed for field "${fieldKey}" of producer "${producer.name}" (index: ${recordIndex}): ${error.message}`, { cause: error });
+ const err = new Error(`Aliasing field "${cField.key}" to "${cField.alias}" of producer "${producer.name}" failed (index: ${recordIndex}): ${error.message}`, { cause: error });
  Logger_default.error(err);
  throw err;
  }
@@ -18559,6 +18680,32 @@ var ConsumerExecutorClass = class {
  }
  }
  }
+ for (const field of fields) {
+ const { cField } = field;
+ const fieldKey = cField.alias ?? cField.key;
+ if (cField.validate && cField.validate.length > 0) {
+ const result = DataValidationEngine_default.applyValidations(record[fieldKey], cField.validate, fieldKey);
+ if (!result.valid) {
+ const errorMessage = `Validation failed for field "${fieldKey}" (index: ${recordIndex}): ${result.message}`;
+ switch (result.onFail) {
+ case "set_default":
+ record[fieldKey] = cField.default;
+ break;
+ case "skip":
+ return null;
+ case "warn":
+ Logger_default.warn(errorMessage);
+ break;
+ case "fail":
+ default: {
+ const err = new Error(errorMessage);
+ Logger_default.error(err);
+ throw err;
+ }
+ }
+ }
+ }
+ }
  try {
  for (const dimension of dimensions) {
  const field = fields.find((x) => x.cField.key === dimension.name);
@@ -18799,6 +18946,48 @@ var ConsumerExecutorClass = class {
  return false;
  }
  };
+ this.processDatasetValidation = async (consumer, datasetPath) => {
+ const validations = consumer.validate;
+ if (!validations || validations.length === 0) return [];
+ const internalRecordFormat = OutputExecutor_default._getInternalRecordFormat(consumer);
+ const internalFields = ConsumerManager_default.getExpandedFields(consumer);
+ let rowCount = 0;
+ const seenRows = /* @__PURE__ */ new Set();
+ const fieldValueSets = /* @__PURE__ */ new Map();
+ let hasDuplicateRows = false;
+ const duplicateFields = [];
+ const uniqueFieldKeys = DataValidationEngine_default.extractUniqueFieldKeys(validations);
+ const checkDuplicateRows = DataValidationEngine_default.hasRule(validations, "no_duplicates");
+ for (const fieldKey of uniqueFieldKeys) {
+ fieldValueSets.set(fieldKey, /* @__PURE__ */ new Set());
+ }
+ const reader = import_fs9.default.createReadStream(datasetPath);
+ const lineReader = import_readline6.default.createInterface({ input: reader, crlfDelay: Infinity });
+ for await (const line of lineReader) {
+ rowCount++;
+ if (checkDuplicateRows) {
+ if (seenRows.has(line))
+ hasDuplicateRows = true;
+ else
+ seenRows.add(line);
+ }
+ if (uniqueFieldKeys.length > 0) {
+ const record = internalRecordFormat === "CSV" || internalRecordFormat === "TXT" ? LineParser_default._internalParseCSV(line, internalFields) : LineParser_default._internalParseJSON(line);
+ for (const fieldKey of uniqueFieldKeys) {
+ const valueSet = fieldValueSets.get(fieldKey);
+ const val = String(record[fieldKey] ?? "");
+ if (valueSet.has(val)) {
+ if (!duplicateFields.includes(fieldKey))
+ duplicateFields.push(fieldKey);
+ } else {
+ valueSet.add(val);
+ }
+ }
+ }
+ }
+ lineReader.close();
+ return DataValidationEngine_default.evaluateDatasetValidations(validations, { rowCount, hasDuplicateRows, duplicateFields });
+ };
  /**
  * Compares two values, handling numbers, strings, and dates
  * Returns: negative if a < b, positive if a > b, 0 if equal
@@ -19216,7 +19405,7 @@ var ExecutorOrchestratorClass = class {
  const tracker = new ExecutorPerformance_default();
  const _progress = new ExecutorProgress_default(logProgress);
  const { usageId } = UsageManager_default.startUsage(consumer, details);
- const scope = { id: usageId, folder: `${consumer.name}_${usageId}`, workersId: [], limitFileSize: consumer.MaximumFileSize };
+ const scope = { id: usageId, folder: `${consumer.name}_${usageId}`, workersId: [], limitFileSize: consumer.maximumFileSize };
  const pool = this.createPool();
  try {
  const start = performance.now();
@@ -19311,6 +19500,22 @@ var ExecutorOrchestratorClass = class {
  postOperation.totalOutputCount = unifiedOutputCount;
  Logger_default.log(`[${usageId}] Pivot complete: ${unifiedOutputCount} rows in ${Math.round(performance.now() - counter)}ms`);
  }
+ if (consumer.validate && consumer.validate.length > 0) {
+ Logger_default.log(`[${usageId}] Running dataset-level validations`);
+ counter = performance.now();
+ const validationResults = await ConsumerExecutor_default.processDatasetValidation(consumer, ExecutorScope_default2.getMainPath(scope));
+ tracker.measure("dataset-validation", performance.now() - counter);
+ for (const result of validationResults) {
+ if (result.onFail === "fail") {
+ const err = new Error(`Dataset validation failed for consumer "${consumer.name}": ${result.message}`);
+ Logger_default.error(err);
+ throw err;
+ } else if (result.onFail === "warn") {
+ Logger_default.warn(`Dataset validation warning for consumer "${consumer.name}": ${result.message}`);
+ }
+ }
+ Logger_default.log(`[${usageId}] Dataset validations complete in ${Math.round(performance.now() - counter)}ms`);
+ }
  counter = performance.now();
  Logger_default.log(`[${usageId}] Exporting results to ${consumer.outputs.length} output(s)`);
  const exportRes = await OutputExecutor_default.exportResult(consumer, ConsumerManager_default.getExpandedFields(consumer), scope);