@forzalabs/remora 0.0.50-nasco.3 → 0.0.52-nasco.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/Constants.js CHANGED
@@ -1,7 +1,7 @@
  "use strict";
  Object.defineProperty(exports, "__esModule", { value: true });
  const CONSTANTS = {
- cliVersion: '0.0.50-nasco',
+ cliVersion: '0.0.52-nasco',
  lambdaVersion: 1,
  port: 5069,
  defaults: {
@@ -79,6 +79,10 @@
  "code"
  ],
  "additionalProperties": false
+ },
+ "union": {
+ "type": "boolean",
+ "description": "Merges the data from the various producers in a single dataset. They must have the same output dimensions. If true, then you can't set any joins on any producer, since all producers are merged in a single dataset."
  }
  },
  "required": [
@@ -207,7 +211,7 @@
  ],
  "description": "The output format of the consumer"
  },
- "accellerated": {
+ "accelerated": {
  "type": "boolean",
  "description": "If true and supported, the consumer will be materialized to improve query performance"
  },
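
As a rough sketch, an SQL output using the renamed flag could look like the fragment below; "format", "accelerated", and "direct" come from this diff, everything else is hypothetical. Note that the validator change further down rejects outputs that set both "accelerated" and "direct".

// Hypothetical consumer output: an accelerated (materialized) SQL view.
{
  "format": "SQL",
  "accelerated": true,
  "direct": false
}
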
@@ -28,7 +28,7 @@
  "properties": {
  "name": {
  "type": "string",
- "description": "The name of the dimension"
+ "description": "The name of the dimension. This is the output name of this dimension."
  },
  "description": {
  "type": "string",
@@ -45,7 +45,7 @@
  },
  "alias": {
  "type": "string",
- "description": "The SQL column or field key that corresponds to this dimension. If left empty, the column name is assumed to be the same as the dimension name"
+ "description": "The SQL column or field key that corresponds to this dimension. If left empty, the column name is assumed to be the same as the dimension name."
  },
  "pk": {
  "type": "boolean",
@@ -1,8 +1,14 @@
  {
  "$schema": "http://json-schema.org/draft-07/schema#",
+ "title": "Project Schema",
+ "description": "Schema for defining remora project configuration",
  "type": "object",
  "required": ["name", "version", "consumers", "producers", "sources", "schemas", "settings"],
  "properties": {
+ "$schema": {
+ "type": "string",
+ "format": "uri"
+ },
  "name": {
  "type": "string",
  "description": "Name of the remora project"
@@ -12,6 +18,10 @@
  "pattern": "^\\d+\\.\\d+\\.\\d+$",
  "description": "Version of the project in semver format"
  },
+ "description": {
+ "type": "string",
+ "description": "Optional description of the project"
+ },
  "consumers": {
  "type": "array",
  "items": {
@@ -53,11 +63,38 @@
  "minimum": 1,
  "description": "Maximum number of rows for SQL queries"
  },
+ "STRING_MAX_CHARACTERS_LENGTH": {
+ "type": "integer",
+ "minimum": 1,
+ "description": "Maximum length for string fields"
+ },
+ "MAX_ITEMS_IN_MEMORY": {
+ "type": "integer",
+ "minimum": 1,
+ "description": "Maximum number of items to keep in memory"
+ },
  "DEBUG_MODE": {
  "type": "boolean",
- "description": "Enable logging of internal steps."
+ "description": "Enable logging of internal steps"
  }
  }
  }
- }
+ },
+ "additionalProperties": false,
+ "examples": [
+ {
+ "$schema": "https://raw.githubusercontent.com/ForzaLabs/remora-public/refs/heads/main/json_schemas/project-schema.json",
+ "name": "analytics-project",
+ "version": "1.0.0",
+ "description": "Analytics data processing project",
+ "consumers": ["/consumers"],
+ "producers": ["/producers"],
+ "sources": ["/sources"],
+ "schemas": ["/schemas"],
+ "settings": {
+ "SQL_MAX_QUERY_ROWS": 10000,
+ "DEBUG_MODE": true
+ }
+ }
+ ]
  }
@@ -22,7 +22,7 @@ const ExecutionEnvironment_1 = __importDefault(require("../execution/ExecutionEn
  const ProducerManager_1 = __importDefault(require("../producer/ProducerManager"));
  const SQLCompiler_1 = __importDefault(require("../sql/SQLCompiler"));
  const SQLUtils_1 = __importDefault(require("../sql/SQLUtils"));
- const UsageManager_1 = __importDefault(require("../UsageManager"));
+ const UsageManager_1 = __importDefault(require("../usage/UsageManager"));
  const ConsumerManager_1 = __importDefault(require("./ConsumerManager"));
  class ConsumerEngineClass {
  constructor() {
@@ -132,7 +132,12 @@ class ConsumerManagerClass {
  else {
  const matches = columns.filter(x => x.nameInProducer === field.key);
  (0, Affirm_1.default)(matches.length > 0, `Consumer "${consumer.name}" misconfiguration: the field "${field.key}" is not found in any of the included producers (${consumer.producers.map(x => x.name).join(', ')})`);
- (0, Affirm_1.default)(matches.length === 1, `Consumer "${consumer.name}" misconfiguration: the field "${field.key}" is ambiguos between the fields with same name from the producers: ${matches.map(x => x.owner).join(', ')}`);
+ if (matches.length === 1) {
+ // Need to check if the producers have "union" if they do, I don't care about this check
+ const cProd = consumer.producers.find(x => x.name === matches[0].owner);
+ if (!cProd.union)
+ (0, Affirm_1.default)(matches.length === 1, `Consumer "${consumer.name}" misconfiguration: the field "${field.key}" is ambiguos between the fields with same name from the producers: ${matches.map(x => x.owner).join(', ')}`);
+ }
  column = matches[0];
  }
  if (!column) {
@@ -15,7 +15,7 @@ class DeploymentPlannerClass {
  switch (output.format) {
  // csv, json, parquet outputs do not need to generate anything at deploy
  case 'SQL': {
- if (output.accellerated && !output.direct)
+ if (output.accelerated && !output.direct)
  plan.push({ type: 'create-materialized-view', output: output });
  else if (!output.direct)
  plan.push({ type: 'create-view', output: output });
@@ -30,7 +30,7 @@ const Logger_1 = __importDefault(require("../../helper/Logger"));
  class ExecutionEnvironment {
  constructor(consumer) {
  this.run = (options) => __awaiter(this, void 0, void 0, function* () {
- var _a;
+ var _a, _b;
  (0, Affirm_1.default)(this._consumer, 'Invalid consumer');
  const plan = ExecutionPlanner_1.default.plan(this._consumer, options);
  (0, Affirm_1.default)(plan, `Invalid execution plan`);
@@ -153,7 +153,7 @@ class ExecutionEnvironment {
  }
  }
  catch (error) {
- const ds = (_a = this._resultingDataset) !== null && _a !== void 0 ? _a : this._getIntermidiate(currentStep);
+ const ds = (_a = this._resultingDataset) !== null && _a !== void 0 ? _a : (_b = this._producedData.at(-1)) === null || _b === void 0 ? void 0 : _b.dataset;
  if (ds)
  Logger_1.default.log(`Failed execution of consumer at step ${currentStep.type}:\n\tSize: ${ds.getSize()}\n\tCycles: ${ds.getCycles()}\n\tOperations: ${Logger_1.default.formatList(ds.getOperations())}`);
  Logger_1.default.log(`\tFailed step: ${currentStep.type}->\n\t${error}`);
@@ -78,7 +78,7 @@ class SQLCompilerClass {
  };
  this.getConsumerReference = (consumer) => {
  (0, Affirm_1.default)(consumer, 'Invalid consumer');
- if (consumer.outputs.some(x => x.format === 'SQL' && x.accellerated))
+ if (consumer.outputs.some(x => x.format === 'SQL' && x.accelerated))
  return `SELECT * FROM "av_remora_${SQLUtils_1.default.sanitizeName(consumer.name)}"`;
  if (consumer.outputs.some(x => x.format === 'SQL' && !x.direct))
  return `SELECT * FROM "v_remora_${SQLUtils_1.default.sanitizeName(consumer.name)}"`;
@@ -92,6 +92,8 @@ class JoinEngineClass {
  (0, Affirm_1.default)(producedData, 'Invalid produced data');
  if (consumer.producers.length <= 1)
  return this.findProducerData(consumer.producers[0].name, producedData);
+ if (consumer.producers.some(x => x.union))
+ return yield this.union(consumer, producedData);
  const consumerShape = ConsumerEngine_1.default.getOutputShape(consumer);
  const consumerColumns = ConsumerEngine_1.default.compile(consumer);
  // Create a new dataset for the joined result
@@ -132,6 +134,21 @@ class JoinEngineClass {
  }
  return resultDataset;
  });
+ this.union = (consumer, producedData) => __awaiter(this, void 0, void 0, function* () {
+ const getDimensionsKey = (ds) => ds.getDimensions().map(x => x.name.trim()).join(';').trim();
+ const mainDataset = producedData[0].dataset;
+ const mainDimKey = getDimensionsKey(mainDataset);
+ const otherProducedData = producedData.slice(1);
+ for (const prodData of otherProducedData) {
+ const prodDimKey = getDimensionsKey(prodData.dataset);
+ if (mainDimKey !== prodDimKey)
+ throw new Error(`On consumer "${consumer.name}", can't union the dataset "${prodData.dataset['_name']}" (producer: ${prodData.producerKey}) because the dimensions are different from the main dataset "${mainDataset['_name']}" (producer: ${producedData[0].producerKey}). "${mainDimKey}" != "${prodDimKey}"`);
+ yield prodData.dataset.streamBatches((batch) => __awaiter(this, void 0, void 0, function* () {
+ yield mainDataset.append(batch);
+ }));
+ }
+ return mainDataset;
+ });
  this.performStreamingJoin = (leftDataset, rightLookup, condition, relationship, consumerColumns, resultDataset) => __awaiter(this, void 0, void 0, function* () {
  const joinedRecords = [];
  const batchSize = leftDataset.getBatchSize();
@@ -0,0 +1,55 @@
+ "use strict";
+ var __importDefault = (this && this.__importDefault) || function (mod) {
+ return (mod && mod.__esModule) ? mod : { "default": mod };
+ };
+ Object.defineProperty(exports, "__esModule", { value: true });
+ const Algo_1 = __importDefault(require("../../core/Algo"));
+ const Helper_1 = __importDefault(require("../../helper/Helper"));
+ class DataframeManagerClass {
+ fill(points, from, to, onlyLastValue, maintainLastValue) {
+ const min = from !== null && from !== void 0 ? from : this.getMinDate(points);
+ const max = to !== null && to !== void 0 ? to : this.getMaxDate(points);
+ const orderPoints = points.length > 0 ? Algo_1.default.orderBy(points, 'x') : [];
+ const filledPoints = [];
+ const currentDate = new Date(min);
+ while (currentDate <= max) {
+ const monthKey = Helper_1.default.formatDateToYYYYMM(currentDate);
+ filledPoints.push({ x: monthKey, y: 0 });
+ currentDate.setMonth(currentDate.getMonth() + 1);
+ }
+ for (let i = 0; i < orderPoints.length; i++) {
+ const point = orderPoints[i];
+ const date = new Date(point.x);
+ const filledPoint = filledPoints.find(x => x.x === Helper_1.default.formatDateToYYYYMM(date));
+ if (filledPoint) {
+ if (!onlyLastValue)
+ filledPoint.y += point.y;
+ else
+ filledPoint.y = point.y;
+ if (maintainLastValue) {
+ const index = filledPoints.findIndex(x => x.x === Helper_1.default.formatDateToYYYYMM(date));
+ for (let k = index; k < filledPoints.length; k++) {
+ const nextFilledPoint = filledPoints[k];
+ nextFilledPoint.y = filledPoint.y;
+ }
+ }
+ }
+ }
+ return filledPoints;
+ }
+ getMinDate(points) {
+ if (!points || points.length === 0) {
+ const currentDate = new Date();
+ return new Date(currentDate.getFullYear() - 1, currentDate.getMonth(), currentDate.getDate());
+ }
+ return points.reduce((min, point) => (new Date(point.x) < min ? new Date(point === null || point === void 0 ? void 0 : point.x) : min), new Date(points[0].x));
+ }
+ getMaxDate(points) {
+ if (!points || points.length === 0) {
+ return new Date();
+ }
+ return points.reduce((max, point) => (new Date(point.x) > max ? new Date(point.x) : max), new Date(points[0].x));
+ }
+ }
+ const DataframeManager = new DataframeManagerClass();
+ exports.default = DataframeManager;
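
The new DataframeManager.fill builds one entry per calendar month between the range bounds, either summing or overwriting values per month. A minimal usage sketch, assuming points of the shape { x: dateString, y: number } as the code above implies (the require path is a placeholder):

// Hypothetical call: months in the range with no input point stay at y = 0;
// month keys are whatever Helper.formatDateToYYYYMM produces.
const DataframeManager = require('./DataframeManager').default;
const filled = DataframeManager.fill(
    [{ x: '2024-01-01', y: 5 }, { x: '2024-03-01', y: 3 }],
    null, null, false, false
);
// filled covers each month from the earliest to the latest point date.
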
@@ -0,0 +1,61 @@
+ "use strict";
+ var __importDefault = (this && this.__importDefault) || function (mod) {
+ return (mod && mod.__esModule) ? mod : { "default": mod };
+ };
+ Object.defineProperty(exports, "__esModule", { value: true });
+ const Affirm_1 = __importDefault(require("../../core/Affirm"));
+ const DSTE_1 = __importDefault(require("../../core/dste/DSTE"));
+ const DatabaseEngine_1 = __importDefault(require("../../database/DatabaseEngine"));
+ const Helper_1 = __importDefault(require("../../helper/Helper"));
+ const Settings_1 = __importDefault(require("../../helper/Settings"));
+ class UsageManagerClass {
+ constructor() {
+ /**
+ * TODO: I need to group the usage stats into a bucket daily. When and how I do it is still a question...
+ */
+ this.getTodayBucketId = (consumer) => {
+ (0, Affirm_1.default)(consumer, `Invalid consumer`);
+ const now = DSTE_1.default.now();
+ return `${consumer.name}_${now.getUTCFullYear()}_${now.getUTCMonth()}_${now.getUTCDate()}`.toLowerCase();
+ };
+ this.startUsage = (consumer, user) => {
+ const newUsage = {
+ _id: Helper_1.default.uuid(),
+ consumer: consumer.name,
+ startedAt: DSTE_1.default.now(),
+ executedBy: { name: user.name, _id: user._id },
+ itemsCount: -1,
+ status: 'started',
+ _signature: ''
+ };
+ if (Helper_1.default.isDev())
+ return { usageId: newUsage._id, usage: Promise.resolve(newUsage) };
+ const updateRes = DatabaseEngine_1.default.upsert(Settings_1.default.db.collections.usage, newUsage._id, newUsage);
+ return { usageId: newUsage._id, usage: updateRes };
+ };
+ this.endUsage = (usageId, itemsCount) => {
+ const update = {
+ itemsCount: itemsCount,
+ status: 'success',
+ finishedAt: DSTE_1.default.now()
+ };
+ if (Helper_1.default.isDev())
+ return { usageId: null, usage: Promise.resolve(update) };
+ const updateRes = DatabaseEngine_1.default.upsert(Settings_1.default.db.collections.usage, usageId, update);
+ return { usageId: usageId, usage: updateRes };
+ };
+ this.failUsage = (usageId, error) => {
+ const update = {
+ status: 'failed',
+ error: error,
+ finishedAt: DSTE_1.default.now()
+ };
+ if (Helper_1.default.isDev())
+ return { usageId: null, usage: Promise.resolve(update) };
+ const updateRes = DatabaseEngine_1.default.upsert(Settings_1.default.db.collections.usage, usageId, update);
+ return { usageId: usageId, usage: updateRes };
+ };
+ }
+ }
+ const UsageManager = new UsageManagerClass();
+ exports.default = UsageManager;
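
The relocated UsageManager records a usage document per consumer run and updates it on completion. A rough lifecycle sketch, assuming it is driven around execution roughly like this (runConsumer, consumer, and user are placeholders; only startUsage, endUsage, and failUsage come from the code above):

// Hypothetical wiring: open a usage record, then close it as success or failure.
const UsageManager = require('./UsageManager').default;
const { usageId } = UsageManager.startUsage(consumer, user);
runConsumer(consumer)
    .then(items => UsageManager.endUsage(usageId, items.length))
    .catch(err => UsageManager.failUsage(usageId, String(err)));
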
@@ -90,6 +90,12 @@ class ValidatorClass {
  errors.push('No producers found');
  if (producers.some(x => !x))
  errors.push(`Invalid producer found in consumer "${consumer.name}"`);
+ if (consumer.producers.length > 0) {
+ const withJoins = consumer.producers.filter(x => (Algo_1.default.hasVal(x.joins) && x.joins.length > 0) || !x.union);
+ const withUnions = consumer.producers.filter(x => x.union === true);
+ if (withJoins.length > 0 && withUnions.length)
+ errors.push(`Multiple producers in consumer have mixed "joins" and "union": you can either have multiple producers with "joins" or multiple producers with "union", but not both (joins: ${withJoins.map(x => x.name).join(', ')}; unions: ${withUnions.map(x => x.name).join(', ')})`);
+ }
  // Validation on sources
  const sources = producers.map(x => Environment_1.default.getSource(x.source));
  if (sources.length === 0)
@@ -175,7 +181,7 @@ class ValidatorClass {
  }
  for (const output of consumer.outputs) {
  const format = output.format.toUpperCase();
- if (format === 'SQL' && output.accellerated && output.direct)
+ if (format === 'SQL' && output.accelerated && output.direct)
  errors.push(`An output SQL cannot be both direct and accelerated (output: ${format})`);
  if ((format === 'CSV' || format === 'JSON' || format === 'PARQUET')) {
  if (!output.exportDestination)
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
  "name": "@forzalabs/remora",
- "version": "0.0.50-nasco.3",
+ "version": "0.0.52-nasco.3",
  "description": "A powerful CLI tool for seamless data translation.",
  "main": "index.js",
  "private": false,