vtlengine 1.4.0rc2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66)
  1. vtlengine/API/_InternalApi.py +791 -0
  2. vtlengine/API/__init__.py +612 -0
  3. vtlengine/API/data/schema/external_routines_schema.json +34 -0
  4. vtlengine/API/data/schema/json_schema_2.1.json +116 -0
  5. vtlengine/API/data/schema/value_domain_schema.json +97 -0
  6. vtlengine/AST/ASTComment.py +57 -0
  7. vtlengine/AST/ASTConstructor.py +598 -0
  8. vtlengine/AST/ASTConstructorModules/Expr.py +1928 -0
  9. vtlengine/AST/ASTConstructorModules/ExprComponents.py +995 -0
  10. vtlengine/AST/ASTConstructorModules/Terminals.py +790 -0
  11. vtlengine/AST/ASTConstructorModules/__init__.py +50 -0
  12. vtlengine/AST/ASTDataExchange.py +10 -0
  13. vtlengine/AST/ASTEncoders.py +32 -0
  14. vtlengine/AST/ASTString.py +675 -0
  15. vtlengine/AST/ASTTemplate.py +558 -0
  16. vtlengine/AST/ASTVisitor.py +25 -0
  17. vtlengine/AST/DAG/__init__.py +479 -0
  18. vtlengine/AST/DAG/_words.py +10 -0
  19. vtlengine/AST/Grammar/Vtl.g4 +705 -0
  20. vtlengine/AST/Grammar/VtlTokens.g4 +409 -0
  21. vtlengine/AST/Grammar/__init__.py +0 -0
  22. vtlengine/AST/Grammar/lexer.py +2139 -0
  23. vtlengine/AST/Grammar/parser.py +16597 -0
  24. vtlengine/AST/Grammar/tokens.py +169 -0
  25. vtlengine/AST/VtlVisitor.py +824 -0
  26. vtlengine/AST/__init__.py +674 -0
  27. vtlengine/DataTypes/TimeHandling.py +562 -0
  28. vtlengine/DataTypes/__init__.py +863 -0
  29. vtlengine/DataTypes/_time_checking.py +135 -0
  30. vtlengine/Exceptions/__exception_file_generator.py +96 -0
  31. vtlengine/Exceptions/__init__.py +159 -0
  32. vtlengine/Exceptions/messages.py +1004 -0
  33. vtlengine/Interpreter/__init__.py +2048 -0
  34. vtlengine/Model/__init__.py +501 -0
  35. vtlengine/Operators/Aggregation.py +357 -0
  36. vtlengine/Operators/Analytic.py +455 -0
  37. vtlengine/Operators/Assignment.py +23 -0
  38. vtlengine/Operators/Boolean.py +106 -0
  39. vtlengine/Operators/CastOperator.py +451 -0
  40. vtlengine/Operators/Clause.py +366 -0
  41. vtlengine/Operators/Comparison.py +488 -0
  42. vtlengine/Operators/Conditional.py +495 -0
  43. vtlengine/Operators/General.py +191 -0
  44. vtlengine/Operators/HROperators.py +254 -0
  45. vtlengine/Operators/Join.py +447 -0
  46. vtlengine/Operators/Numeric.py +422 -0
  47. vtlengine/Operators/RoleSetter.py +77 -0
  48. vtlengine/Operators/Set.py +176 -0
  49. vtlengine/Operators/String.py +578 -0
  50. vtlengine/Operators/Time.py +1144 -0
  51. vtlengine/Operators/Validation.py +275 -0
  52. vtlengine/Operators/__init__.py +900 -0
  53. vtlengine/Utils/__Virtual_Assets.py +34 -0
  54. vtlengine/Utils/__init__.py +479 -0
  55. vtlengine/__extras_check.py +17 -0
  56. vtlengine/__init__.py +27 -0
  57. vtlengine/files/__init__.py +0 -0
  58. vtlengine/files/output/__init__.py +35 -0
  59. vtlengine/files/output/_time_period_representation.py +55 -0
  60. vtlengine/files/parser/__init__.py +240 -0
  61. vtlengine/files/parser/_rfc_dialect.py +22 -0
  62. vtlengine/py.typed +0 -0
  63. vtlengine-1.4.0rc2.dist-info/METADATA +89 -0
  64. vtlengine-1.4.0rc2.dist-info/RECORD +66 -0
  65. vtlengine-1.4.0rc2.dist-info/WHEEL +4 -0
  66. vtlengine-1.4.0rc2.dist-info/licenses/LICENSE.md +661 -0
vtlengine/Operators/Analytic.py
@@ -0,0 +1,455 @@
+from copy import copy
+from typing import List, Optional
+
+import duckdb
+
+# if os.environ.get("SPARK"):
+#     import pyspark.pandas as pd
+# else:
+#     import pandas as pd
+import pandas as pd
+
+import vtlengine.Operators as Operator
+from vtlengine.AST import OrderBy, Windowing
+from vtlengine.AST.Grammar.tokens import (
+    AVG,
+    COUNT,
+    FIRST_VALUE,
+    LAG,
+    LAST_VALUE,
+    LEAD,
+    MAX,
+    MEDIAN,
+    MIN,
+    RANK,
+    RATIO_TO_REPORT,
+    STDDEV_POP,
+    STDDEV_SAMP,
+    SUM,
+    VAR_POP,
+    VAR_SAMP,
+)
+from vtlengine.DataTypes import (
+    COMP_NAME_MAPPING,
+    Integer,
+    Number,
+    unary_implicit_promotion,
+)
+from vtlengine.Exceptions import SemanticError
+from vtlengine.Model import Component, Dataset, Role
+from vtlengine.Utils.__Virtual_Assets import VirtualCounter
+
+return_integer_operators = [MAX, MIN, SUM]
+
+
+# noinspection PyMethodOverriding
+class Analytic(Operator.Unary):
+    """
+    Analytic class
+
+    Class that inherits from Unary.
+
+    Class methods:
+        validate: Validates the Dataset.
+        analyticfunc: Class method that builds the analytic SQL query and
+            returns a dataframe using the duckdb library.
+        evaluate: Ensures the type of data is the correct one to perform the
+            analytic operators.
+    """
+
+    return_integer = None
+    sql_op: Optional[str] = None
+
+    @classmethod
+    def validate(  # type: ignore[override]  # noqa: C901
+        cls,
+        operand: Dataset,
+        partitioning: List[str],
+        ordering: Optional[List[OrderBy]],
+        window: Optional[Windowing],
+        params: Optional[List[int]],
+        component_name: Optional[str] = None,
+    ) -> Dataset:
+        order_components = [] if ordering is None else [o.component for o in ordering]
+        identifier_names = operand.get_identifiers_names()
+        result_components = operand.components.copy()
+
+        for comp_name in partitioning:
+            if comp_name not in operand.components:
+                raise SemanticError(
+                    "1-1-1-10",
+                    op=cls.op,
+                    comp_name=comp_name,
+                    dataset_name=operand.name,
+                )
+            if comp_name not in identifier_names:
+                raise SemanticError(
+                    "1-1-3-2",
+                    op=cls.op,
+                    id_name=comp_name,
+                    id_type=operand.components[comp_name].role,
+                )
+        for comp_name in order_components:
+            if comp_name not in operand.components:
+                raise SemanticError(
+                    "1-1-1-10",
+                    op=cls.op,
+                    comp_name=comp_name,
+                    dataset_name=operand.name,
+                )
+        if component_name is not None:
+            if cls.type_to_check is not None:
+                unary_implicit_promotion(
+                    operand.components[component_name].data_type, cls.type_to_check
+                )
+
+            if cls.op in return_integer_operators:
+                cls.return_integer = isinstance(cls.return_type, Integer)
+
+            elif cls.return_type is not None:
+                result_components[component_name] = Component(
+                    name=component_name,
+                    data_type=cls.return_type,
+                    role=operand.components[component_name].role,
+                    nullable=operand.components[component_name].nullable,
+                )
+            if cls.op == COUNT:
+                measure_name = COMP_NAME_MAPPING[cls.return_type]
+                result_components[measure_name] = Component(
+                    name=measure_name,
+                    data_type=cls.return_type,
+                    role=Role.MEASURE,
+                    nullable=operand.components[component_name].nullable,
+                )
+                if component_name in result_components:
+                    del result_components[component_name]
+        else:
+            measures = operand.get_measures()
+            if len(measures) == 0:
+                raise SemanticError("1-1-1-8", op=cls.op, name=operand.name)
+
+            if cls.op in return_integer_operators:
+                is_number = False
+                for measure in measures:
+                    is_number |= isinstance(measure.data_type, Number)
+                cls.return_integer = not is_number
+
+            if cls.type_to_check is not None:
+                for measure in measures:
+                    unary_implicit_promotion(measure.data_type, cls.type_to_check)
+
+            if cls.op in return_integer_operators:
+                for measure in measures:
+                    new_measure = copy(measure)
+                    new_measure.data_type = Integer if cls.return_integer else Number
+                    result_components[measure.name] = new_measure
+            elif cls.return_type is not None:
+                for measure in measures:
+                    new_measure = copy(measure)
+                    new_measure.data_type = cls.return_type
+                    result_components[measure.name] = new_measure
+
+            if cls.op == COUNT and len(measures) <= 1:
+                measure_name = COMP_NAME_MAPPING[cls.return_type]
+                nullable = False if len(measures) == 0 else measures[0].nullable
+                if len(measures) == 1:
+                    del result_components[measures[0].name]
+                result_components[measure_name] = Component(
+                    name=measure_name,
+                    data_type=cls.return_type,
+                    role=Role.MEASURE,
+                    nullable=nullable,
+                )
+        dataset_name = VirtualCounter._new_ds_name()
+        return Dataset(name=dataset_name, components=result_components, data=None)
+
+    @classmethod
+    def analyticfunc(
+        cls,
+        df: pd.DataFrame,
+        partitioning: List[str],
+        identifier_names: List[str],
+        measure_names: List[str],
+        ordering: List[OrderBy],
+        window: Optional[Windowing],
+        params: Optional[List[int]] = None,
+    ) -> pd.DataFrame:
+        """Builds and runs the analytic SQL query.
+
+        It assembles the OVER clause (partitioning, ordering and windowing)
+        from the attributes specified below, ensuring that the type of data
+        is the correct one to perform the operation.
+
+        Attributes:
+            identifier_names: List with the identifier names.
+            measure_names: List with the measure names.
+            ordering: List with the ordering modes.
+            window: Windowing clause with its mode and start/stop bounds.
+            params: Extra operator parameters (used by lag and lead).
+        """
+        # Windowing
+        window_str = ""
+        if window is not None:
+            mode = "ROWS" if window.type_ == "data" else "RANGE"
+            start_mode = (
+                window.start_mode.upper()
+                if (isinstance(window.start, int) and window.start != 0)
+                or (isinstance(window.start, str) and window.start == "unbounded")
+                else ""
+            )
+            stop_mode = (
+                window.stop_mode.upper()
+                if (isinstance(window.stop, int) and window.stop != 0)
+                or (isinstance(window.stop, str) and window.stop == "unbounded")
+                else ""
+            )
+            start = (
+                "UNBOUNDED"
+                if window.start == "unbounded" or window.start == -1
+                else str(window.start)
+            )
+            stop = (
+                "CURRENT ROW" if window.stop == "current" or window.stop == 0 else str(window.stop)
+            )
+            window_str = f"{mode} BETWEEN {start} {start_mode} AND {stop} {stop_mode}"
+
+        # Partitioning
+        partition = "PARTITION BY " + ", ".join(partitioning) if len(partitioning) > 0 else ""
+
+        # Ordering
+        order_str = ""
+        if len(ordering) > 0:
+            for x in ordering:
+                order_str += f"{x.component} {x.order}, "
+            if len(order_str) > 0:
+                order_str = "ORDER BY " + order_str[:-2]
+
+        # Generating the complete analytic string
+        analytic_str = f"OVER ( {partition} {order_str} {window_str})"
+
+        measure_queries = []
+        for measure in measure_names:
+            if cls.op == RANK:
+                measure_query = f"{cls.sql_op}()"
+            elif cls.op == RATIO_TO_REPORT:
+                measure_query = f"CAST({measure} AS DOUBLE) / SUM(CAST({measure} AS DOUBLE))"
+            elif cls.op in [LAG, LEAD]:
+                measure_query = f"{cls.sql_op}({measure}, {','.join(map(str, params or []))})"
+            else:
+                measure_query = f"{cls.sql_op}({measure})"
+            if cls.op == COUNT and len(measure_names) == 1:
+                measure_query += f" {analytic_str} as {COMP_NAME_MAPPING[cls.return_type]}"
+            elif cls.op in return_integer_operators and cls.return_integer:
+                measure_query = f"CAST({measure_query} {analytic_str} AS INTEGER) as {measure}"
+            else:
+                measure_query += f" {analytic_str} as {measure}"
+            measure_queries.append(measure_query)
+        if cls.op == COUNT and len(measure_names) == 0:
+            measure_queries.append(
+                f"COUNT(*) {analytic_str} as {COMP_NAME_MAPPING[cls.return_type]}"
+            )
+
+        measures_sql = ", ".join(measure_queries)
+        identifiers_sql = ", ".join(identifier_names)
+        query = f"SELECT {identifiers_sql} , {measures_sql} FROM df"
+
+        if cls.op == COUNT:
+            df[measure_names] = df[measure_names].fillna(-1)
+        # if os.getenv("SPARK", False):
+        #     df = df.to_pandas()
+        return duckdb.query(query).to_df().astype(object)
+
+    @classmethod
+    def evaluate(  # type: ignore[override]
+        cls,
+        operand: Dataset,
+        partitioning: List[str],
+        ordering: Optional[List[OrderBy]],
+        window: Optional[Windowing],
+        params: Optional[List[int]],
+        component_name: Optional[str] = None,
+    ) -> Dataset:
+        result = cls.validate(operand, partitioning, ordering, window, params, component_name)
+        df = operand.data.copy() if operand.data is not None else pd.DataFrame()
+        identifier_names = operand.get_identifiers_names()
+
+        if component_name is not None:
+            measure_names = [component_name]
+        else:
+            measure_names = operand.get_measures_names()
+
+        result.data = cls.analyticfunc(
+            df=df,
+            partitioning=partitioning,
+            identifier_names=identifier_names,
+            measure_names=measure_names,
+            ordering=ordering or [],
+            window=window,
+            params=params,
+        )
+
+        # if cls.return_type == Integer:
+        #     result.data[measure_names] = result.data[measure_names].astype('Int64')
+
+        return result
+
+
+class Max(Analytic):
+    """
+    Max operator
+    """
+
+    op = MAX
+    sql_op = "MAX"
+    return_integer = False
+
+
+class Min(Analytic):
+    """
+    Min operator
+    """
+
+    op = MIN
+    sql_op = "MIN"
+    return_integer = False
+
+
+class Sum(Analytic):
+    """
+    Sum operator
+    """
+
+    op = SUM
+    sql_op = "SUM"
+    return_integer = False
+
+
+class Count(Analytic):
+    """
+    Count operator
+    """
+
+    op = COUNT
+    type_to_check = None
+    return_type = Integer
+    sql_op = "COUNT"
+
+
+class Avg(Analytic):
+    """
+    Average operator
+    """
+
+    op = AVG
+    type_to_check = Number
+    return_type = Number
+    sql_op = "AVG"
+
+
+class Median(Analytic):
+    """
+    Median operator
+    """
+
+    op = MEDIAN
+    type_to_check = Number
+    return_type = Number
+    sql_op = "MEDIAN"
+
+
+class PopulationStandardDeviation(Analytic):
+    """
+    Population standard deviation operator
+    """
+
+    op = STDDEV_POP
+    type_to_check = Number
+    return_type = Number
+    sql_op = "STDDEV_POP"
+
+
+class SampleStandardDeviation(Analytic):
+    """
+    Sample standard deviation operator.
+    """
+
+    op = STDDEV_SAMP
+    type_to_check = Number
+    return_type = Number
+    sql_op = "STDDEV_SAMP"
+
+
+class PopulationVariance(Analytic):
+    """
+    Population variance operator
+    """
+
+    op = VAR_POP
+    type_to_check = Number
+    return_type = Number
+    sql_op = "VAR_POP"
+
+
+class SampleVariance(Analytic):
+    """
+    Sample variance operator
+    """
+
+    op = VAR_SAMP
+    type_to_check = Number
+    return_type = Number
+    sql_op = "VAR_SAMP"
+
+
+class FirstValue(Analytic):
+    """
+    First value operator
+    """
+
+    op = FIRST_VALUE
+    sql_op = "FIRST"
+
+
+class LastValue(Analytic):
+    """
+    Last value operator
+    """
+
+    op = LAST_VALUE
+    sql_op = "LAST"
+
+
+class Lag(Analytic):
+    """
+    Lag operator
+    """
+
+    op = LAG
+    sql_op = "LAG"
+
+
+class Lead(Analytic):
+    """
+    Lead operator
+    """
+
+    op = LEAD
+    sql_op = "LEAD"
+
+
+class Rank(Analytic):
+    """
+    Rank operator
+    """
+
+    op = RANK
+    sql_op = "RANK"
+    return_type = Integer
+
+
+class RatioToReport(Analytic):
+    """
+    Ratio to report operator
+    """
+
+    op = RATIO_TO_REPORT
+    type_to_check = Number
+    return_type = Number
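
Every analytic operator above is ultimately reduced by analyticfunc to a single duckdb window query over the operand's pandas dataframe. A minimal standalone sketch of that query shape (the Id_1/Me_1 data below is hypothetical, not from the package):

import duckdb
import pandas as pd

df = pd.DataFrame(
    {
        "Id_1": ["A", "A", "B", "B"],  # identifier used for partitioning
        "Me_1": [1, 2, 3, 4],          # measure the operator is applied to
    }
)

# Roughly the SQL Sum.analyticfunc emits for
# "sum ( Me_1 over ( partition by Id_1 ) )".
query = "SELECT Id_1 , SUM(Me_1) OVER ( PARTITION BY Id_1 ) as Me_1 FROM df"

# duckdb resolves the table name "df" against the local dataframe;
# astype(object) mirrors what the method returns.
print(duckdb.query(query).to_df().astype(object))
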
vtlengine/Operators/Assignment.py
@@ -0,0 +1,23 @@
+from typing import Any, Union
+
+from vtlengine.Exceptions import SemanticError
+from vtlengine.Model import DataComponent, Dataset
+from vtlengine.Operators import Binary
+
+ALL_MODEL_TYPES = Union[DataComponent, Dataset]
+
+
+class Assignment(Binary):
+    @classmethod
+    def validate(cls, left_operand: Any, right_operand: Any) -> ALL_MODEL_TYPES:
+        if (
+            isinstance(right_operand, DataComponent)
+            and str(right_operand.role) == "IDENTIFIER"
+        ):
+            raise SemanticError("1-1-6-13", op=cls.op, comp_name=right_operand.name)
+        right_operand.name = left_operand
+        return right_operand
+
+    @classmethod
+    def evaluate(cls, left_operand: Any, right_operand: Any) -> ALL_MODEL_TYPES:
+        return cls.validate(left_operand, right_operand)
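
Assignment does no computation of its own: it rejects identifier components on the right-hand side and rebinds the already-evaluated result to the left-hand name. A small sketch of that behaviour, assuming the wheel is installed (the dataset here is hypothetical):

from vtlengine.DataTypes import Integer
from vtlengine.Model import Component, Dataset, Role
from vtlengine.Operators.Assignment import Assignment

ds = Dataset(
    name="tmp",
    components={
        "Id_1": Component(name="Id_1", data_type=Integer, role=Role.IDENTIFIER, nullable=False),
    },
    data=None,
)

# Equivalent of "DS_r := <expr>" once <expr> has been evaluated to ds.
result = Assignment.evaluate("DS_r", ds)
assert result is ds and result.name == "DS_r"
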
vtlengine/Operators/Boolean.py
@@ -0,0 +1,106 @@
+# if os.environ.get("SPARK", False):
+#     import pyspark.pandas as pd
+# else:
+#     import pandas as pd
+from typing import Any, Optional
+
+import pandas as pd
+
+import vtlengine.Operators as Operator
+from vtlengine.AST.Grammar.tokens import AND, NOT, OR, XOR
+from vtlengine.DataTypes import Boolean
+
+
+class Unary(Operator.Unary):
+    type_to_check = Boolean
+    return_type = Boolean
+
+
+class Binary(Operator.Binary):
+    type_to_check = Boolean
+    return_type = Boolean
+    comp_op: Any = None
+
+    @classmethod
+    def apply_operation_series_scalar(cls, series: Any, scalar: Any, series_left: bool) -> Any:
+        if series_left:
+            return series.map(lambda x: cls.py_op(x, scalar))
+        else:
+            return series.map(lambda x: cls.py_op(scalar, x))
+
+    @classmethod
+    def apply_operation_two_series(cls, left_series: Any, right_series: Any) -> Any:
+        result = cls.comp_op(left_series.astype("boolean"), right_series.astype("boolean"))
+        return result.replace({pd.NA: None}).astype(object)
+
+    @classmethod
+    def op_func(cls, x: Optional[bool], y: Optional[bool]) -> Optional[bool]:
+        return cls.py_op(x, y)
+
+
+class And(Binary):
+    op = AND
+    comp_op = pd.Series.__and__
+
+    @staticmethod
+    # @numba.njit
+    def py_op(x: Optional[bool], y: Optional[bool]) -> Optional[bool]:
+        if (x is None and y == False) or (x == False and y is None):
+            return False
+        elif x is None or y is None:
+            return None
+        return x and y
+
+    # @classmethod
+    # def spark_op(cls, x: pd.Series, y: pd.Series) -> pd.Series:
+    #     return x & y
+
+
+class Or(Binary):
+    op = OR
+    comp_op = pd.Series.__or__
+
+    @staticmethod
+    # @numba.njit
+    def py_op(x: Optional[bool], y: Optional[bool]) -> Optional[bool]:
+        if (x is None and y == True) or (x == True and y is None):
+            return True
+        elif x is None or y is None:
+            return None
+        return x or y
+
+    # @classmethod
+    # def spark_op(cls, x: pd.Series, y: pd.Series) -> pd.Series:
+    #     return x | y
+
+
+class Xor(Binary):
+    op = XOR
+    comp_op = pd.Series.__xor__
+
+    @classmethod
+    def py_op(cls, x: Optional[bool], y: Optional[bool]) -> Optional[bool]:
+        if pd.isnull(x) or pd.isnull(y):
+            return None
+        return (x and not y) or (not x and y)
+
+    # @classmethod
+    # def spark_op(cls, x: pd.Series, y: pd.Series) -> pd.Series:
+    #     return x ^ y
+
+
+class Not(Unary):
+    op = NOT
+
+    @staticmethod
+    # @numba.njit
+    def py_op(x: Optional[bool]) -> Optional[bool]:
+        return None if x is None else not x
+
+    # @classmethod
+    # def spark_op(cls, series: pd.Series) -> pd.Series:
+    #     return ~series
+
+    @classmethod
+    def apply_operation_component(cls, series: Any) -> Any:
+        return series.map(lambda x: not x, na_action="ignore")
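
These py_op implementations follow VTL's three-valued logic: null propagates unless the known operand already decides the result. A quick check of that behaviour, assuming the wheel is installed:

from vtlengine.Operators.Boolean import And, Not, Or, Xor

assert And.py_op(None, False) is False  # false absorbs null
assert And.py_op(None, True) is None    # result depends on the null
assert Or.py_op(None, True) is True     # true absorbs null
assert Or.py_op(None, False) is None    # result depends on the null
assert Xor.py_op(True, None) is None    # xor can never absorb null
assert Not.py_op(None) is None          # negation of null is null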