vtlengine-1.4.0rc2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66)
  1. vtlengine/API/_InternalApi.py +791 -0
  2. vtlengine/API/__init__.py +612 -0
  3. vtlengine/API/data/schema/external_routines_schema.json +34 -0
  4. vtlengine/API/data/schema/json_schema_2.1.json +116 -0
  5. vtlengine/API/data/schema/value_domain_schema.json +97 -0
  6. vtlengine/AST/ASTComment.py +57 -0
  7. vtlengine/AST/ASTConstructor.py +598 -0
  8. vtlengine/AST/ASTConstructorModules/Expr.py +1928 -0
  9. vtlengine/AST/ASTConstructorModules/ExprComponents.py +995 -0
  10. vtlengine/AST/ASTConstructorModules/Terminals.py +790 -0
  11. vtlengine/AST/ASTConstructorModules/__init__.py +50 -0
  12. vtlengine/AST/ASTDataExchange.py +10 -0
  13. vtlengine/AST/ASTEncoders.py +32 -0
  14. vtlengine/AST/ASTString.py +675 -0
  15. vtlengine/AST/ASTTemplate.py +558 -0
  16. vtlengine/AST/ASTVisitor.py +25 -0
  17. vtlengine/AST/DAG/__init__.py +479 -0
  18. vtlengine/AST/DAG/_words.py +10 -0
  19. vtlengine/AST/Grammar/Vtl.g4 +705 -0
  20. vtlengine/AST/Grammar/VtlTokens.g4 +409 -0
  21. vtlengine/AST/Grammar/__init__.py +0 -0
  22. vtlengine/AST/Grammar/lexer.py +2139 -0
  23. vtlengine/AST/Grammar/parser.py +16597 -0
  24. vtlengine/AST/Grammar/tokens.py +169 -0
  25. vtlengine/AST/VtlVisitor.py +824 -0
  26. vtlengine/AST/__init__.py +674 -0
  27. vtlengine/DataTypes/TimeHandling.py +562 -0
  28. vtlengine/DataTypes/__init__.py +863 -0
  29. vtlengine/DataTypes/_time_checking.py +135 -0
  30. vtlengine/Exceptions/__exception_file_generator.py +96 -0
  31. vtlengine/Exceptions/__init__.py +159 -0
  32. vtlengine/Exceptions/messages.py +1004 -0
  33. vtlengine/Interpreter/__init__.py +2048 -0
  34. vtlengine/Model/__init__.py +501 -0
  35. vtlengine/Operators/Aggregation.py +357 -0
  36. vtlengine/Operators/Analytic.py +455 -0
  37. vtlengine/Operators/Assignment.py +23 -0
  38. vtlengine/Operators/Boolean.py +106 -0
  39. vtlengine/Operators/CastOperator.py +451 -0
  40. vtlengine/Operators/Clause.py +366 -0
  41. vtlengine/Operators/Comparison.py +488 -0
  42. vtlengine/Operators/Conditional.py +495 -0
  43. vtlengine/Operators/General.py +191 -0
  44. vtlengine/Operators/HROperators.py +254 -0
  45. vtlengine/Operators/Join.py +447 -0
  46. vtlengine/Operators/Numeric.py +422 -0
  47. vtlengine/Operators/RoleSetter.py +77 -0
  48. vtlengine/Operators/Set.py +176 -0
  49. vtlengine/Operators/String.py +578 -0
  50. vtlengine/Operators/Time.py +1144 -0
  51. vtlengine/Operators/Validation.py +275 -0
  52. vtlengine/Operators/__init__.py +900 -0
  53. vtlengine/Utils/__Virtual_Assets.py +34 -0
  54. vtlengine/Utils/__init__.py +479 -0
  55. vtlengine/__extras_check.py +17 -0
  56. vtlengine/__init__.py +27 -0
  57. vtlengine/files/__init__.py +0 -0
  58. vtlengine/files/output/__init__.py +35 -0
  59. vtlengine/files/output/_time_period_representation.py +55 -0
  60. vtlengine/files/parser/__init__.py +240 -0
  61. vtlengine/files/parser/_rfc_dialect.py +22 -0
  62. vtlengine/py.typed +0 -0
  63. vtlengine-1.4.0rc2.dist-info/METADATA +89 -0
  64. vtlengine-1.4.0rc2.dist-info/RECORD +66 -0
  65. vtlengine-1.4.0rc2.dist-info/WHEEL +4 -0
  66. vtlengine-1.4.0rc2.dist-info/licenses/LICENSE.md +661 -0
vtlengine/Model/__init__.py
@@ -0,0 +1,501 @@
+ import inspect
+ import json
+ from collections import Counter
+ from dataclasses import dataclass
+ from enum import Enum
+ from typing import Any, Dict, List, Optional, Type, Union
+
+ import pandas as pd
+ import sqlglot
+ import sqlglot.expressions as exp
+ from pandas import DataFrame as PandasDataFrame
+ from pandas._testing import assert_frame_equal
+
+ import vtlengine.DataTypes as DataTypes
+ from vtlengine.DataTypes import SCALAR_TYPES, ScalarType
+ from vtlengine.DataTypes.TimeHandling import TimePeriodHandler
+ from vtlengine.Exceptions import InputValidationException, SemanticError
+
+ # from pyspark.pandas import DataFrame as SparkDataFrame, Series as SparkSeries
+
+
+ @dataclass
+ class Scalar:
+     """
+     Class representing a scalar value
+     """
+
+     name: str
+     data_type: Type[ScalarType]
+     _value: Any
+     persistent: bool = False
+
+     def __init__(
+         self, name: str, data_type: Type[ScalarType], value: Any, persistent: bool = False
+     ) -> None:
+         self.name = name
+         self.data_type = data_type
+         self.value = value
+         self.persistent = persistent
+
+     @property
+     def value(self) -> Any:
+         return self._value
+
+     @value.setter
+     def value(self, new_value: Any) -> None:
+         if self.data_type and not self.data_type.check(new_value):
+             raise InputValidationException(
+                 code="0-1-2-7",
+                 value=new_value,
+                 type_=self.data_type.__name__,
+                 op_type=self.__class__.__name__,
+                 name=self.name,
+             )
+         self._value = new_value
+
+     @classmethod
+     def from_json(cls, json_str: str) -> "Scalar":
+         data = json.loads(json_str)
+         return cls(data["name"], SCALAR_TYPES[data["data_type"]], data["value"])
+
+     def __eq__(self, other: Any) -> bool:
+         same_name = self.name == other.name
+         same_type = self.data_type == other.data_type
+         # Normalize missing values to None so that two null scalars compare equal
+         x = None if pd.isnull(self.value) else self.value
+         y = None if pd.isnull(other.value) else other.value
+         same_value = x == y
+         return same_name and same_type and same_value
+
+
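A minimal usage sketch of Scalar (editorial illustration, not part of the package; assumes the Integer scalar type exported by vtlengine.DataTypes):

    from vtlengine.DataTypes import Integer  # assumed export
    from vtlengine.Model import Scalar

    s = Scalar(name="obs_value", data_type=Integer, value=42)
    s.value = 10       # accepted: passes Integer.check
    # s.value = "abc"  # would raise InputValidationException (code 0-1-2-7)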
+ Role_keys = [
+     "Identifier",
+     "Attribute",
+     "Measure",
+ ]
+
+
+ class Role(Enum):
+     """
+     Enum class for the role of a component (Identifier, Attribute, Measure)
+     """
+
+     IDENTIFIER = "Identifier"
+     ATTRIBUTE = "Attribute"
+     MEASURE = "Measure"
+
+
+ @dataclass
+ class DataComponent:
+     """A component of a dataset with data"""
+
+     name: str
+     # data: Optional[Union[PandasSeries, SparkSeries]]
+     data: Optional[Any]
+     data_type: Type[ScalarType]
+     role: Role = Role.MEASURE
+     nullable: bool = True
+
+     def __eq__(self, other: Any) -> bool:
+         if not isinstance(other, DataComponent):
+             return False
+         return self.to_dict() == other.to_dict()
+
+     @classmethod
+     def from_json(cls, json_str: Any) -> "DataComponent":
+         return cls(
+             json_str["name"],
+             None,
+             SCALAR_TYPES[json_str["data_type"]],
+             Role(json_str["role"]),
+             json_str["nullable"],
+         )
+
+     def to_dict(self) -> Dict[str, Any]:
+         return {
+             "name": self.name,
+             "data": self.data,
+             "data_type": self.data_type,
+             "role": self.role,
+         }
+
+     def to_json(self) -> str:
+         return json.dumps(self.to_dict(), indent=4)
+
+
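Note that, despite its name, DataComponent.from_json (like Component.from_json and Dataset.from_json below) takes an already-parsed dictionary rather than a JSON string. A short sketch with hypothetical values:

    from vtlengine.Model import DataComponent

    dc = DataComponent.from_json(
        {"name": "me_1", "data_type": "Number", "role": "Measure", "nullable": True}
    )
    # from_json leaves the data slot as None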
+ @dataclass
+ class Component:
+     """
+     Class representing a component of a dataset
+     """
+
+     name: str
+     data_type: Type[ScalarType]
+     role: Role
+     nullable: bool
+
+     def __post_init__(self) -> None:
+         if self.role == Role.IDENTIFIER and self.nullable:
+             raise ValueError(f"Identifier {self.name} cannot be nullable")
+
+     def __eq__(self, other: Any) -> bool:
+         return self.to_dict() == other.to_dict()
+
+     def copy(self) -> "Component":
+         return Component(self.name, self.data_type, self.role, self.nullable)
+
+     @classmethod
+     def from_json(cls, json_str: Any) -> "Component":
+         return cls(
+             json_str["name"],
+             SCALAR_TYPES[json_str["data_type"]],
+             Role(json_str["role"]),
+             json_str["nullable"],
+         )
+
+     def to_dict(self) -> Dict[str, Any]:
+         data_type = self.data_type
+         if not inspect.isclass(self.data_type):
+             data_type = self.data_type.__class__  # type: ignore[assignment]
+         return {
+             "name": self.name,
+             "data_type": DataTypes.SCALAR_TYPES_CLASS_REVERSE[data_type],
+             # Need to check here for NoneType as UDO argument has it
+             "role": self.role.value if self.role is not None else None,  # type: ignore[redundant-expr]
+             "nullable": self.nullable,
+         }
+
+     def to_json(self) -> str:
+         return json.dumps(self.to_dict())
+
+     def rename(self, new_name: str) -> None:
+         self.name = new_name
+
+     def __str__(self) -> str:
+         return self.to_json()
+
+     __repr__ = __str__
+
+
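The __post_init__ hook enforces the VTL invariant that identifiers are never nullable. A quick sketch (hypothetical names; assumes the String scalar type exported by vtlengine.DataTypes):

    from vtlengine.DataTypes import String  # assumed export
    from vtlengine.Model import Component, Role

    ok = Component("id_1", String, Role.IDENTIFIER, nullable=False)
    # Component("id_2", String, Role.IDENTIFIER, nullable=True)  # raises ValueError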
+ @dataclass
+ class Dataset:
+     name: str
+     components: Dict[str, Component]
+     data: Optional[PandasDataFrame] = None
+     persistent: bool = False
+
+     def __post_init__(self) -> None:
+         if self.data is not None:
+             if len(self.components) != len(self.data.columns):
+                 raise ValueError(
+                     "The number of components must match the number of columns in the data"
+                 )
+             for name, _ in self.components.items():
+                 if name not in self.data.columns:
+                     raise ValueError(f"Component {name} not found in the data")
+
+     def __eq__(self, other: Any) -> bool:
+         if not isinstance(other, Dataset):
+             return False
+
+         same_name = self.name == other.name
+         if not same_name:
+             print("\nName mismatch")
+             print("result:", self.name)
+             print("reference:", other.name)
+         same_components = self.components == other.components
+         if not same_components:
+             print("\nComponents mismatch")
+             result_comps = self.to_dict()["components"]
+             reference_comps = other.to_dict()["components"]
+             if len(result_comps) != len(reference_comps):
+                 print(
+                     f"Shape mismatch: result:{len(result_comps)} "
+                     f"!= reference:{len(reference_comps)}"
+                 )
+                 if len(result_comps) < len(reference_comps):
+                     print(
+                         "Missing components in result:",
+                         set(reference_comps.keys()) - set(result_comps.keys()),
+                     )
+                 else:
+                     print(
+                         "Additional components in result:",
+                         set(result_comps.keys()) - set(reference_comps.keys()),
+                     )
+                 return False
+
+             diff_comps = {
+                 k: v
+                 for k, v in result_comps.items()
+                 if (k in reference_comps and v != reference_comps[k]) or k not in reference_comps
+             }
+             ref_diff_comps = {k: v for k, v in reference_comps.items() if k in diff_comps}
+             print(f"Differences in components {self.name}: ")
+             print("result:", json.dumps(diff_comps, indent=4))
+             print("reference:", json.dumps(ref_diff_comps, indent=4))
+             return False
+
+         if self.data is None and other.data is None:
+             return True
+         elif self.data is None or other.data is None:
+             return False
+         if len(self.data) == len(other.data) == 0 and self.data.shape != other.data.shape:
+             raise SemanticError("0-1-1-14", dataset1=self.name, dataset2=other.name)
+
+         self.data.fillna("", inplace=True)
+         other.data.fillna("", inplace=True)
+         sorted_identifiers = sorted(self.get_identifiers_names())
+         self.data = self.data.sort_values(by=sorted_identifiers).reset_index(drop=True)
+         other.data = other.data.sort_values(by=sorted_identifiers).reset_index(drop=True)
+         self.data = self.data.reindex(sorted(self.data.columns), axis=1)
+         other.data = other.data.reindex(sorted(other.data.columns), axis=1)
+         for comp in self.components.values():
+             type_name: str = comp.data_type.__name__.__str__()
+             if type_name in ["String", "Date"]:
+                 self.data[comp.name] = self.data[comp.name].astype(str)
+                 other.data[comp.name] = other.data[comp.name].astype(str)
+             elif type_name == "TimePeriod":
+                 self.data[comp.name] = self.data[comp.name].astype(str)
+                 other.data[comp.name] = other.data[comp.name].astype(str)
+                 self.data[comp.name] = self.data[comp.name].map(
+                     lambda x: str(TimePeriodHandler(str(x))) if x != "" else "",
+                     na_action="ignore",
+                 )
+                 other.data[comp.name] = other.data[comp.name].map(
+                     lambda x: str(TimePeriodHandler(str(x))) if x != "" else "",
+                     na_action="ignore",
+                 )
+             elif type_name in ["Integer", "Number"]:
+                 type_ = "int64" if type_name == "Integer" else "float32"
+                 # We use here a number to avoid errors on equality on empty strings
+                 self.data[comp.name] = (
+                     self.data[comp.name].replace("", -1234997).astype(type_)  # type: ignore[call-overload]
+                 )
+                 other.data[comp.name] = (
+                     other.data[comp.name].replace("", -1234997).astype(type_)  # type: ignore[call-overload]
+                 )
+         try:
+             assert_frame_equal(
+                 self.data,
+                 other.data,
+                 check_dtype=False,
+                 check_index_type=False,
+                 check_datetimelike_compat=True,
+                 check_exact=False,
+                 rtol=0.01,
+                 atol=0.01,
+             )
+         except AssertionError as e:
+             if "DataFrame shape" in str(e):
+                 print(f"\nDataFrame shape mismatch {self.name}:")
+                 print("result:", self.data.shape)
+                 print("reference:", other.data.shape)
+             # Differences between the dataframes
+             diff = pd.concat([self.data, other.data]).drop_duplicates(keep=False)
+             if len(diff) == 0:
+                 return True
+             # To display actual null values instead of -1234997
+             for comp in self.components.values():
+                 if comp.data_type.__name__.__str__() in ["Integer", "Number"]:
+                     diff[comp.name] = diff[comp.name].replace(-1234997, "")
+             print("\n Differences between the dataframes in", self.name)
+             print(diff)
+             raise e
+         return True
+
+     def get_component(self, component_name: str) -> Component:
+         return self.components[component_name]
+
+     def add_component(self, component: Component) -> None:
+         if component.name in self.components:
+             raise ValueError(f"Component with name {component.name} already exists")
+         self.components[component.name] = component
+
+     def delete_component(self, component_name: str) -> None:
+         self.components.pop(component_name, None)
+         if self.data is not None:
+             self.data.drop(columns=[component_name], inplace=True)
+
+     def get_components(self) -> List[Component]:
+         return list(self.components.values())
+
+     def get_identifiers(self) -> List[Component]:
+         return [
+             component for component in self.components.values() if component.role == Role.IDENTIFIER
+         ]
+
+     def get_attributes(self) -> List[Component]:
+         return [
+             component for component in self.components.values() if component.role == Role.ATTRIBUTE
+         ]
+
+     def get_measures(self) -> List[Component]:
+         return [
+             component for component in self.components.values() if component.role == Role.MEASURE
+         ]
+
+     def get_identifiers_names(self) -> List[str]:
+         return [
+             name for name, component in self.components.items() if component.role == Role.IDENTIFIER
+         ]
+
+     def get_attributes_names(self) -> List[str]:
+         return [
+             name for name, component in self.components.items() if component.role == Role.ATTRIBUTE
+         ]
+
+     def get_measures_names(self) -> List[str]:
+         return [
+             name for name, component in self.components.items() if component.role == Role.MEASURE
+         ]
+
+     def get_components_names(self) -> List[str]:
+         return list(self.components.keys())
+
+     @classmethod
+     def from_json(cls, json_str: Any) -> "Dataset":
+         components = {k: Component.from_json(v) for k, v in json_str["components"].items()}
+         return cls(json_str["name"], components, pd.DataFrame(json_str["data"]))
+
+     def to_dict(self) -> Dict[str, Any]:
+         return {
+             "name": self.name,
+             "components": {k: v.to_dict() for k, v in self.components.items()},
+             "data": (self.data.to_dict(orient="records") if self.data is not None else None),
+         }
+
+     def to_json(self) -> str:
+         return json.dumps(self.to_dict(), indent=4)
+
+     def to_json_datastructure(self) -> str:
+         dict_dataset = self.to_dict()["components"]
+         order_keys = ["name", "role", "type", "nullable"]
+         # Rename data_type to type
+         for k in dict_dataset:
+             dict_dataset[k] = {
+                 ik if ik != "data_type" else "type": v for ik, v in dict_dataset[k].items()
+             }
+
+         # Order keys
+         for k in dict_dataset:
+             dict_dataset[k] = {ik: dict_dataset[k][ik] for ik in order_keys}
+         comp_values = list(dict_dataset.values())
+         ds_info = {"name": self.name, "DataStructure": comp_values}
+         result = {"datasets": [ds_info]}
+         return json.dumps(result, indent=2)
+
+
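Two things worth noting: Dataset.__eq__ is test-oriented, printing diagnostics and normalizing (mutating) both operands' data in place before delegating to pandas' assert_frame_equal, and __post_init__ validates that components and DataFrame columns match one-to-one. A construction sketch with hypothetical names (assumes the Integer and Number scalar types are exported by vtlengine.DataTypes):

    import pandas as pd

    from vtlengine.DataTypes import Integer, Number  # assumed exports
    from vtlengine.Model import Component, Dataset, Role

    ds = Dataset(
        name="DS_1",
        components={
            "id_1": Component("id_1", Integer, Role.IDENTIFIER, nullable=False),
            "me_1": Component("me_1", Number, Role.MEASURE, nullable=True),
        },
        data=pd.DataFrame({"id_1": [1, 2], "me_1": [10.0, 20.0]}),
    )
    print(ds.get_identifiers_names())  # ['id_1']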
+ @dataclass
+ class ScalarSet:
+     """
+     Class representing a set of scalar values
+     """
+
+     data_type: Type[ScalarType]
+     _values: List[Union[int, float, str, bool]]
+
+     def __init__(
+         self, data_type: Type[ScalarType], values: List[Union[int, float, str, bool]]
+     ) -> None:
+         self.data_type = data_type
+         self.values = values
+
+     @property
+     def values(self) -> List[Union[int, float, str, bool]]:
+         return self._values
+
+     @values.setter
+     def values(self, new_values: List[Union[int, float, str, bool]]) -> None:
+         for value in new_values:
+             if self.data_type and not self.data_type.check(value):
+                 raise InputValidationException(
+                     code="0-1-2-7",
+                     value=value,
+                     type_=self.data_type.__name__,
+                     op_type=self.__class__.__name__,
+                     name="",
+                 )
+         self._values = new_values
+
+     def __contains__(self, item: Any) -> Optional[bool]:
+         if isinstance(item, float) and item.is_integer():
+             item = int(item)
+         if self.data_type == DataTypes.Null:
+             return None
+         value = self.data_type.cast(item)
+         return value in self.values
+
+
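A membership sketch (hypothetical values; assumes the Number scalar type). Because __contains__ returns None for the Null data type and Python's in operator coerces the result to bool, a Null-typed set reports every item as absent:

    from vtlengine.DataTypes import Number  # assumed export
    from vtlengine.Model import ScalarSet

    s = ScalarSet(Number, [1, 2.5, 3])
    print(2.5 in s)  # True
    print(4.0 in s)  # False (integral floats are converted to int before the cast)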
+ @dataclass
+ class ValueDomain:
+     """
+     Class representing a value domain
+     """
+
+     name: str
+     type: Type[ScalarType]
+     setlist: List[Union[int, float, str, bool]]
+
+     def __post_init__(self) -> None:
+         if len(set(self.setlist)) != len(self.setlist):
+             duplicated = [item for item, count in Counter(self.setlist).items() if count > 1]
+             raise ValueError(
+                 f"The setlist must have unique values. Duplicated values: {duplicated}"
+             )
+
+         # Cast values to the correct type
+         self.setlist = [self.type.cast(value) for value in self.setlist]
+
+     @classmethod
+     def from_json(cls, json_str: str) -> "ValueDomain":
+         if len(json_str) == 0:
+             raise ValueError("Empty JSON string for ValueDomain")
+
+         json_info = json.loads(json_str)
+         return cls.from_dict(json_info)
+
+     @classmethod
+     def from_dict(cls, value: Dict[str, Any]) -> "ValueDomain":
+         for x in ("name", "type", "setlist"):
+             if x not in value:
+                 raise Exception("Invalid format for ValueDomain. Requires name, type and setlist.")
+         if value["type"] not in SCALAR_TYPES:
+             raise ValueError(f"Invalid data type {value['type']} for ValueDomain {value['name']}")
+
+         return cls(value["name"], SCALAR_TYPES[value["type"]], value["setlist"])
+
+     def to_dict(self) -> Dict[str, Any]:
+         return {"name": self.name, "type": self.type.__name__, "setlist": self.setlist}
+
+     def to_json(self) -> str:
+         return json.dumps(self.to_dict(), indent=4)
+
+     def __eq__(self, other: Any) -> bool:
+         return self.to_dict() == other.to_dict()
+
+
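A round-trip sketch for ValueDomain (hypothetical domain; assumes "String" is registered in SCALAR_TYPES):

    from vtlengine.Model import ValueDomain

    vd = ValueDomain.from_dict(
        {"name": "countries", "type": "String", "setlist": ["ES", "FR", "IT"]}
    )
    print(vd.to_json())  # pretty-printed JSON with name, type and setlist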
+ @dataclass
+ class ExternalRoutine:
+     """
+     Class representing an external routine, used in Eval operator
+     """
+
+     dataset_names: List[str]
+     query: str
+     name: str
+
+     @classmethod
+     def from_sql_query(cls, name: str, query: str) -> "ExternalRoutine":
+         try:
+             dataset_names = cls._extract_dataset_names(query)
+             return cls(dataset_names, query, name)
+         except sqlglot.errors.ParseError as e:
+             raise Exception(f"Invalid SQL query in external routine '{name}': {e}") from e
+
+     @classmethod
+     def _extract_dataset_names(cls, query: str) -> List[str]:
+         expression = sqlglot.parse_one(query, dialect="duckdb")
+         tables_info = list(expression.find_all(exp.Table))
+         dataset_names = [t.name for t in tables_info]
+         return dataset_names
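Finally, a sketch of the table-name extraction used by the Eval operator (hypothetical query; sqlglot parses it with the duckdb dialect, as in the code above):

    from vtlengine.Model import ExternalRoutine

    er = ExternalRoutine.from_sql_query("filter_ds", "SELECT * FROM DS_1 WHERE me_1 > 0")
    print(er.dataset_names)  # ['DS_1']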