vtlengine 1.4.0rc2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66) hide show
  1. vtlengine/API/_InternalApi.py +791 -0
  2. vtlengine/API/__init__.py +612 -0
  3. vtlengine/API/data/schema/external_routines_schema.json +34 -0
  4. vtlengine/API/data/schema/json_schema_2.1.json +116 -0
  5. vtlengine/API/data/schema/value_domain_schema.json +97 -0
  6. vtlengine/AST/ASTComment.py +57 -0
  7. vtlengine/AST/ASTConstructor.py +598 -0
  8. vtlengine/AST/ASTConstructorModules/Expr.py +1928 -0
  9. vtlengine/AST/ASTConstructorModules/ExprComponents.py +995 -0
  10. vtlengine/AST/ASTConstructorModules/Terminals.py +790 -0
  11. vtlengine/AST/ASTConstructorModules/__init__.py +50 -0
  12. vtlengine/AST/ASTDataExchange.py +10 -0
  13. vtlengine/AST/ASTEncoders.py +32 -0
  14. vtlengine/AST/ASTString.py +675 -0
  15. vtlengine/AST/ASTTemplate.py +558 -0
  16. vtlengine/AST/ASTVisitor.py +25 -0
  17. vtlengine/AST/DAG/__init__.py +479 -0
  18. vtlengine/AST/DAG/_words.py +10 -0
  19. vtlengine/AST/Grammar/Vtl.g4 +705 -0
  20. vtlengine/AST/Grammar/VtlTokens.g4 +409 -0
  21. vtlengine/AST/Grammar/__init__.py +0 -0
  22. vtlengine/AST/Grammar/lexer.py +2139 -0
  23. vtlengine/AST/Grammar/parser.py +16597 -0
  24. vtlengine/AST/Grammar/tokens.py +169 -0
  25. vtlengine/AST/VtlVisitor.py +824 -0
  26. vtlengine/AST/__init__.py +674 -0
  27. vtlengine/DataTypes/TimeHandling.py +562 -0
  28. vtlengine/DataTypes/__init__.py +863 -0
  29. vtlengine/DataTypes/_time_checking.py +135 -0
  30. vtlengine/Exceptions/__exception_file_generator.py +96 -0
  31. vtlengine/Exceptions/__init__.py +159 -0
  32. vtlengine/Exceptions/messages.py +1004 -0
  33. vtlengine/Interpreter/__init__.py +2048 -0
  34. vtlengine/Model/__init__.py +501 -0
  35. vtlengine/Operators/Aggregation.py +357 -0
  36. vtlengine/Operators/Analytic.py +455 -0
  37. vtlengine/Operators/Assignment.py +23 -0
  38. vtlengine/Operators/Boolean.py +106 -0
  39. vtlengine/Operators/CastOperator.py +451 -0
  40. vtlengine/Operators/Clause.py +366 -0
  41. vtlengine/Operators/Comparison.py +488 -0
  42. vtlengine/Operators/Conditional.py +495 -0
  43. vtlengine/Operators/General.py +191 -0
  44. vtlengine/Operators/HROperators.py +254 -0
  45. vtlengine/Operators/Join.py +447 -0
  46. vtlengine/Operators/Numeric.py +422 -0
  47. vtlengine/Operators/RoleSetter.py +77 -0
  48. vtlengine/Operators/Set.py +176 -0
  49. vtlengine/Operators/String.py +578 -0
  50. vtlengine/Operators/Time.py +1144 -0
  51. vtlengine/Operators/Validation.py +275 -0
  52. vtlengine/Operators/__init__.py +900 -0
  53. vtlengine/Utils/__Virtual_Assets.py +34 -0
  54. vtlengine/Utils/__init__.py +479 -0
  55. vtlengine/__extras_check.py +17 -0
  56. vtlengine/__init__.py +27 -0
  57. vtlengine/files/__init__.py +0 -0
  58. vtlengine/files/output/__init__.py +35 -0
  59. vtlengine/files/output/_time_period_representation.py +55 -0
  60. vtlengine/files/parser/__init__.py +240 -0
  61. vtlengine/files/parser/_rfc_dialect.py +22 -0
  62. vtlengine/py.typed +0 -0
  63. vtlengine-1.4.0rc2.dist-info/METADATA +89 -0
  64. vtlengine-1.4.0rc2.dist-info/RECORD +66 -0
  65. vtlengine-1.4.0rc2.dist-info/WHEEL +4 -0
  66. vtlengine-1.4.0rc2.dist-info/licenses/LICENSE.md +661 -0
@@ -0,0 +1,357 @@
1
+ from copy import copy
2
+ from typing import Any, List, Optional
3
+
4
+ import duckdb
5
+ import pandas as pd
6
+
7
+ import vtlengine.Operators as Operator
8
+ from vtlengine.AST.Grammar.tokens import (
9
+ AVG,
10
+ COUNT,
11
+ MAX,
12
+ MEDIAN,
13
+ MIN,
14
+ STDDEV_POP,
15
+ STDDEV_SAMP,
16
+ SUM,
17
+ VAR_POP,
18
+ VAR_SAMP,
19
+ )
20
+ from vtlengine.DataTypes import (
21
+ Boolean,
22
+ Date,
23
+ Duration,
24
+ Integer,
25
+ Number,
26
+ String,
27
+ TimeInterval,
28
+ TimePeriod,
29
+ unary_implicit_promotion,
30
+ )
31
+ from vtlengine.DataTypes.TimeHandling import (
32
+ PERIOD_IND_MAPPING,
33
+ PERIOD_IND_MAPPING_REVERSE,
34
+ TimeIntervalHandler,
35
+ TimePeriodHandler,
36
+ )
37
+ from vtlengine.Exceptions import RunTimeError, SemanticError
38
+ from vtlengine.Model import Component, Dataset, Role
39
+
40
+
41
def extract_grouping_identifiers(
    identifier_names: List[str], group_op: Optional[str], grouping_components: Any
) -> List[str]:
    """Resolve which identifier components survive the aggregation.

    ``group by`` keeps exactly the listed components, ``group except`` keeps
    every identifier that is *not* listed, and no grouping clause keeps all
    identifiers unchanged.
    """
    if group_op == "group except":
        excluded = set(grouping_components)
        return [name for name in identifier_names if name not in excluded]
    if group_op == "group by":
        return grouping_components
    return identifier_names
50
+
51
+
52
# noinspection PyMethodOverriding
class Aggregation(Operator.Unary):
    """Base class for VTL aggregate operators (min, max, sum, avg, ...).

    Subclasses configure class attributes consumed here:
      * ``op`` — the VTL operator token (e.g. ``MAX``, ``COUNT``).
      * ``py_op`` — the SQL aggregate function name interpolated into the
        DuckDB query built by :meth:`_agg_func`.
      * ``type_to_check`` — required measure data type, checked via
        ``unary_implicit_promotion`` in :meth:`validate` (``None`` skips it).
      * ``return_type`` — if set, the measures' result data type.

    The aggregation itself is delegated to DuckDB over the operand's pandas
    DataFrame; :meth:`_handle_data_types` converts measure columns to
    DuckDB-friendly representations before the query ("input" mode) and back
    afterwards ("result" mode).
    """

    @classmethod
    def _handle_data_types(cls, data: pd.DataFrame, measures: List[Component], mode: str) -> None:
        """Convert measure columns of ``data`` in place, per data type.

        ``mode`` is ``"input"`` (prepare for DuckDB) or ``"result"`` (restore
        after DuckDB). COUNT needs no conversion at all. For String (and Date)
        columns, nulls are swapped with sentinel values on input and swapped
        back on result, so that SQL aggregates do not silently skip them.
        Raises RunTimeError("2-1-19-20") when MAX/MIN is applied to a time
        period column mixing several period indicators.
        """
        to_replace: List[Optional[str]]
        new_value: List[Optional[str]]
        if cls.op == COUNT:
            return
        if mode == "input":
            to_replace = [None]
            new_value = [""]
        else:
            to_replace = [""]
            new_value = [None]

        for measure in measures:
            if measure.data_type == Date:
                if cls.op == MIN:
                    # Invalid date only for null values
                    # (an empty string would compare as the minimum, so nulls
                    # get a sentinel that sorts after every real date instead)
                    if mode == "input":
                        new_value = ["9999-99-99"]
                    else:
                        to_replace = ["9999-99-99"]
                data[measure.name] = data[measure.name].replace(to_replace, new_value)  # type: ignore[arg-type, unused-ignore]
            elif measure.data_type == TimePeriod:
                if mode == "input":
                    # Wrap values in TimePeriodHandler so DuckDB compares them
                    # through the handler's ordering; nulls are left untouched.
                    data[measure.name] = (
                        data[measure.name]
                        .astype(object)
                        .map(lambda x: TimePeriodHandler(str(x)), na_action="ignore")
                    )
                    if cls.op in [MAX, MIN]:
                        # MAX/MIN across mixed period indicators (e.g. "A" vs
                        # "M") is undefined in VTL.
                        indicators = {v.period_indicator for v in data[measure.name].dropna()}
                        if len(indicators) > 1:
                            raise RunTimeError("2-1-19-20", op=cls.op)
                else:
                    data[measure.name] = data[measure.name].map(
                        lambda x: str(x), na_action="ignore"
                    )
            elif measure.data_type == TimeInterval:
                if mode == "input":
                    data[measure.name] = (
                        data[measure.name]
                        .astype(object)
                        .map(
                            lambda x: TimeIntervalHandler.from_iso_format(str(x)),
                            na_action="ignore",
                        )
                    )
                else:
                    data[measure.name] = data[measure.name].map(
                        lambda x: str(x), na_action="ignore"
                    )
            elif measure.data_type == String:
                # Null <-> "" swap (direction depends on mode, see above).
                data[measure.name] = data[measure.name].replace(to_replace, new_value)  # type: ignore[arg-type, unused-ignore]
            elif measure.data_type == Duration:
                # Durations are mapped to comparable integer codes on input
                # and back to their period-indicator form on result.
                if mode == "input":
                    data[measure.name] = data[measure.name].map(
                        lambda x: PERIOD_IND_MAPPING[x],
                        na_action="ignore",
                    )
                else:
                    data[measure.name] = data[measure.name].map(
                        lambda x: PERIOD_IND_MAPPING_REVERSE[x],
                        na_action="ignore",
                    )
            elif measure.data_type == Boolean:
                if mode == "result":
                    data[measure.name] = data[measure.name].map(
                        lambda x: Boolean().cast(x), na_action="ignore"
                    )
                # Keep booleans as object dtype so nulls survive.
                data[measure.name] = data[measure.name].astype(object)

    @classmethod
    def validate(  # type: ignore[override]
        cls,
        operand: Dataset,
        group_op: Optional[str],
        grouping_columns: Any,
        having_data: Any,
    ) -> Dataset:
        """Build and return the result Dataset structure (no data attached).

        Checks that the operand has measures (except for COUNT/MIN/MAX), that
        every grouping column exists and is an identifier, then drops the
        non-kept identifiers and all attributes, type-checks measures against
        ``cls.type_to_check`` and retypes them to ``cls.return_type``. COUNT
        replaces all measures with a single Integer measure ``int_var``.

        Raises SemanticError 1-1-1-8 / 1-1-1-10 / 1-1-2-2 accordingly.
        NOTE(review): ``having_data`` is accepted but unused here — presumably
        validated elsewhere; confirm against the interpreter.
        """
        result_components = {k: copy(v) for k, v in operand.components.items()}
        if cls.op not in [COUNT, MIN, MAX] and len(operand.get_measures_names()) == 0:
            raise SemanticError("1-1-1-8", op=cls.op, name=operand.name)
        if group_op is not None:
            for comp_name in grouping_columns:
                if comp_name not in operand.components:
                    raise SemanticError(
                        "1-1-1-10",
                        op=cls.op,
                        comp_name=comp_name,
                        dataset_name=operand.name,
                    )
                if operand.components[comp_name].role != Role.IDENTIFIER:
                    raise SemanticError(
                        "1-1-2-2",
                        op=cls.op,
                        id_name=comp_name,
                        id_type=operand.components[comp_name].role,
                    )

            identifiers_to_keep = extract_grouping_identifiers(
                operand.get_identifiers_names(), group_op, grouping_columns
            )
            for comp_name, comp in operand.components.items():
                if comp.role == Role.IDENTIFIER and comp_name not in identifiers_to_keep:
                    del result_components[comp_name]
        else:
            # No grouping clause: the result is scalar-like, all identifiers go.
            for comp_name, comp in operand.components.items():
                if comp.role == Role.IDENTIFIER:
                    del result_components[comp_name]
        # Remove Attributes
        for comp_name, comp in operand.components.items():
            if comp.role == Role.ATTRIBUTE:
                del result_components[comp_name]
        # Change Measure data type
        for _, comp in result_components.items():
            if comp.role == Role.MEASURE:
                unary_implicit_promotion(comp.data_type, cls.type_to_check)
                if cls.return_type is not None:
                    comp.data_type = cls.return_type
        if cls.op == COUNT:
            # COUNT collapses every measure into one Integer measure.
            for measure_name in operand.get_measures_names():
                result_components.pop(measure_name)
            new_comp = Component(
                name="int_var",
                role=Role.MEASURE,
                data_type=Integer,
                nullable=True,
            )
            result_components["int_var"] = new_comp

        # VDS is handled in visit_Aggregation
        return Dataset(name="result", components=result_components, data=None)

    @classmethod
    def _agg_func(
        cls,
        df: pd.DataFrame,
        grouping_keys: Optional[List[str]],
        measure_names: Optional[List[str]],
        having_expression: Optional[str],
    ) -> pd.DataFrame:
        """Run the aggregation in DuckDB over ``df`` and return the result.

        Builds a ``SELECT <group cols>, <py_op>(measure) ... GROUP BY ...
        [HAVING ...]`` query. Numeric operators (``type_to_check`` set) CAST
        measures to DOUBLE first; COUNT emits a single ``int_var`` column.
        DuckDB conversion failures are re-raised as RunTimeError 2-3-8, any
        other RuntimeError as 2-1-1-1.

        NOTE(review): the query is assembled by f-string interpolation,
        including ``having_expression`` — safe only if column names and the
        HAVING clause are already validated upstream; confirm no untrusted
        text can reach this point.
        """
        grouping_names = (
            [f'"{name}"' for name in grouping_keys] if grouping_keys is not None else None
        )
        if grouping_names is not None and len(grouping_names) > 0:
            grouping = "GROUP BY " + ", ".join(grouping_names)
        else:
            grouping = ""

        if having_expression is None:
            having_expression = ""

        # COUNT with no measures: plain COUNT() of the (grouped) rows.
        if measure_names is not None and len(measure_names) == 0 and cls.op == COUNT:
            if grouping_names is not None:
                query = (
                    f"SELECT {', '.join(grouping_names)}, COUNT() AS "
                    f"int_var from df {grouping} {having_expression}"
                )
            else:
                query = f"SELECT COUNT() AS int_var from df {grouping}"
            return duckdb.query(query).to_df()

        if measure_names is not None and len(measure_names) > 0:
            functions = ""
            for e in measure_names:
                e = f'"{e}"'
                if cls.type_to_check is not None and cls.op != COUNT:
                    functions += (
                        f"{cls.py_op}(CAST({e} AS DOUBLE)) AS {e}, "  # Count can only be one here
                    )
                elif cls.op == COUNT:
                    # Only one COUNT column is produced, regardless of how
                    # many measures exist.
                    functions += f"{cls.py_op}({e}) AS int_var, "
                    break
                else:
                    functions += f"{cls.py_op}({e}) AS {e}, "
            # functions[:-2] strips the trailing ", " from the loop above.
            if grouping_names is not None and len(grouping_names) > 0:
                query = (
                    f"SELECT {', '.join(grouping_names) + ', '}{functions[:-2]} "
                    f"from df {grouping} {having_expression}"
                )
            else:
                query = f"SELECT {functions[:-2]} from df"

        else:
            # No measures at all: just the distinct grouping combinations.
            query = (
                f"SELECT {', '.join(grouping_names or [])} from df {grouping} {having_expression}"
            )

        try:
            return duckdb.query(query).to_df().astype(object)
        except RuntimeError as e:
            if "Conversion" in e.args[0]:
                raise RunTimeError("2-3-8", op=cls.op, msg=e.args[0].split(":")[-1])
            else:
                raise RunTimeError("2-1-1-1", op=cls.op)

    @classmethod
    def evaluate(  # type: ignore[override]
        cls,
        operand: Dataset,
        group_op: Optional[str],
        grouping_columns: Optional[List[str]],
        having_expr: Optional[str],
    ) -> Dataset:
        """Validate, aggregate via DuckDB and return the populated Dataset.

        Pipeline: :meth:`validate` builds the result structure; measure
        columns are converted with ``_handle_data_types("input")``; DuckDB
        aggregates in :meth:`_agg_func`; ``_handle_data_types("result")``
        converts back; finally the result is re-aligned to the original order
        of the grouping keys. MAX/MIN over TimeInterval measures raises
        RunTimeError 2-1-19-18; COUNT drops rows with any null measure first.
        """
        result = cls.validate(operand, group_op, grouping_columns, having_expr)

        grouping_keys = result.get_identifiers_names()
        result_df = operand.data.copy() if operand.data is not None else pd.DataFrame()
        measure_names = operand.get_measures_names()
        result_df = result_df[grouping_keys + measure_names]
        if cls.op == COUNT:
            # COUNT only counts rows where every measure is present.
            result_df = result_df.dropna(subset=measure_names, how="any")
        if cls.op in [MAX, MIN]:
            for measure in operand.get_measures():
                if measure.data_type == TimeInterval:
                    raise RunTimeError("2-1-19-18", op=cls.op)
        cls._handle_data_types(result_df, operand.get_measures(), "input")
        result_df = cls._agg_func(result_df, grouping_keys, measure_names, having_expr)

        cls._handle_data_types(result_df, operand.get_measures(), "result")
        # Handle correct order on result
        # (DuckDB does not guarantee group order, so re-join against the
        # original key order.)
        aux_df = (
            operand.data[grouping_keys].drop_duplicates()
            if operand.data is not None
            else pd.DataFrame()
        )
        if len(grouping_keys) == 0:
            aux_df = result_df
            aux_df.dropna(subset=result.get_measures_names(), how="all", inplace=True)
            if cls.op == COUNT and len(result_df) == 0:
                # Counting an empty dataset still yields a single 0 row.
                aux_df["int_var"] = 0
        elif len(aux_df) == 0:
            aux_df = pd.DataFrame(columns=result.get_components_names())
        else:
            aux_df = pd.merge(aux_df, result_df, how="left", on=grouping_keys)
            if having_expr is not None:
                # Groups filtered out by HAVING came back as all-null rows
                # from the left merge; drop them.
                aux_df.dropna(subset=result.get_measures_names(), how="any", inplace=True)
        result.data = aux_df
        return result
293
+
294
+
295
class Max(Aggregation):
    """VTL ``max`` aggregation, executed as DuckDB's ``max`` function."""

    op = MAX
    py_op = "max"
298
+
299
+
300
class Min(Aggregation):
    """VTL ``min`` aggregation, executed as DuckDB's ``min`` function."""

    op = MIN
    py_op = "min"
303
+
304
+
305
class Sum(Aggregation):
    """VTL ``sum`` aggregation; measures must be Number-compatible."""

    op = SUM
    type_to_check = Number
    py_op = "sum"
309
+
310
+
311
class Count(Aggregation):
    """VTL ``count`` aggregation: yields a single Integer ``int_var`` measure."""

    op = COUNT
    type_to_check = None  # count accepts measures of any type
    return_type = Integer
    py_op = "count"
316
+
317
+
318
class Avg(Aggregation):
    """VTL ``avg`` aggregation; Number measures in, Number measures out."""

    op = AVG
    type_to_check = Number
    return_type = Number
    py_op = "avg"
323
+
324
+
325
class Median(Aggregation):
    """VTL ``median`` aggregation; Number measures in, Number measures out."""

    op = MEDIAN
    type_to_check = Number
    return_type = Number
    py_op = "median"
330
+
331
+
332
class PopulationStandardDeviation(Aggregation):
    """VTL ``stddev_pop`` aggregation (population standard deviation)."""

    op = STDDEV_POP
    type_to_check = Number
    return_type = Number
    py_op = "stddev_pop"
337
+
338
+
339
class SampleStandardDeviation(Aggregation):
    """VTL ``stddev_samp`` aggregation (sample standard deviation)."""

    op = STDDEV_SAMP
    type_to_check = Number
    return_type = Number
    py_op = "stddev_samp"
344
+
345
+
346
class PopulationVariance(Aggregation):
    """VTL ``var_pop`` aggregation (population variance)."""

    op = VAR_POP
    type_to_check = Number
    return_type = Number
    py_op = "var_pop"
351
+
352
+
353
class SampleVariance(Aggregation):
    """VTL ``var_samp`` aggregation (sample variance)."""

    op = VAR_SAMP
    type_to_check = Number
    return_type = Number
    py_op = "var_samp"