vtlengine-1.0-py3-none-any.whl → vtlengine-1.0.2-py3-none-any.whl
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of vtlengine might be problematic. See the linked release details for more information.
- vtlengine/API/_InternalApi.py +159 -102
- vtlengine/API/__init__.py +110 -68
- vtlengine/AST/ASTConstructor.py +188 -98
- vtlengine/AST/ASTConstructorModules/Expr.py +402 -205
- vtlengine/AST/ASTConstructorModules/ExprComponents.py +248 -104
- vtlengine/AST/ASTConstructorModules/Terminals.py +158 -95
- vtlengine/AST/ASTEncoders.py +1 -1
- vtlengine/AST/ASTTemplate.py +24 -9
- vtlengine/AST/ASTVisitor.py +8 -12
- vtlengine/AST/DAG/__init__.py +43 -35
- vtlengine/AST/DAG/_words.py +4 -4
- vtlengine/AST/Grammar/Vtl.g4 +49 -20
- vtlengine/AST/Grammar/VtlTokens.g4 +13 -1
- vtlengine/AST/Grammar/lexer.py +2012 -1312
- vtlengine/AST/Grammar/parser.py +7524 -4343
- vtlengine/AST/Grammar/tokens.py +140 -128
- vtlengine/AST/VtlVisitor.py +16 -5
- vtlengine/AST/__init__.py +41 -11
- vtlengine/DataTypes/NumericTypesHandling.py +5 -4
- vtlengine/DataTypes/TimeHandling.py +196 -301
- vtlengine/DataTypes/__init__.py +304 -218
- vtlengine/Exceptions/__init__.py +96 -27
- vtlengine/Exceptions/messages.py +149 -69
- vtlengine/Interpreter/__init__.py +817 -497
- vtlengine/Model/__init__.py +172 -121
- vtlengine/Operators/Aggregation.py +156 -95
- vtlengine/Operators/Analytic.py +167 -79
- vtlengine/Operators/Assignment.py +7 -4
- vtlengine/Operators/Boolean.py +27 -32
- vtlengine/Operators/CastOperator.py +177 -131
- vtlengine/Operators/Clause.py +137 -99
- vtlengine/Operators/Comparison.py +148 -117
- vtlengine/Operators/Conditional.py +290 -98
- vtlengine/Operators/General.py +68 -47
- vtlengine/Operators/HROperators.py +91 -72
- vtlengine/Operators/Join.py +217 -118
- vtlengine/Operators/Numeric.py +129 -46
- vtlengine/Operators/RoleSetter.py +16 -15
- vtlengine/Operators/Set.py +61 -36
- vtlengine/Operators/String.py +213 -139
- vtlengine/Operators/Time.py +467 -215
- vtlengine/Operators/Validation.py +117 -76
- vtlengine/Operators/__init__.py +340 -213
- vtlengine/Utils/__init__.py +232 -41
- vtlengine/__init__.py +1 -1
- vtlengine/files/output/__init__.py +15 -6
- vtlengine/files/output/_time_period_representation.py +10 -9
- vtlengine/files/parser/__init__.py +79 -52
- vtlengine/files/parser/_rfc_dialect.py +6 -5
- vtlengine/files/parser/_time_checking.py +48 -37
- vtlengine-1.0.2.dist-info/METADATA +245 -0
- vtlengine-1.0.2.dist-info/RECORD +58 -0
- {vtlengine-1.0.dist-info → vtlengine-1.0.2.dist-info}/WHEEL +1 -1
- vtlengine-1.0.dist-info/METADATA +0 -104
- vtlengine-1.0.dist-info/RECORD +0 -58
- {vtlengine-1.0.dist-info → vtlengine-1.0.2.dist-info}/LICENSE.md +0 -0
|
@@ -1,27 +1,49 @@
|
|
|
1
1
|
from copy import copy
|
|
2
|
-
from typing import List, Optional
|
|
2
|
+
from typing import List, Optional, Any
|
|
3
3
|
|
|
4
4
|
import duckdb
|
|
5
5
|
import pandas as pd
|
|
6
|
-
from vtlengine.DataTypes import
|
|
6
|
+
from vtlengine.DataTypes import (
|
|
7
|
+
Integer,
|
|
8
|
+
Number,
|
|
9
|
+
unary_implicit_promotion,
|
|
10
|
+
Boolean,
|
|
11
|
+
String,
|
|
12
|
+
Duration,
|
|
13
|
+
TimeInterval,
|
|
14
|
+
TimePeriod,
|
|
15
|
+
Date,
|
|
16
|
+
)
|
|
7
17
|
|
|
8
18
|
import vtlengine.Operators as Operator
|
|
9
|
-
from vtlengine.AST.Grammar.tokens import (
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
19
|
+
from vtlengine.AST.Grammar.tokens import (
|
|
20
|
+
AVG,
|
|
21
|
+
COUNT,
|
|
22
|
+
MAX,
|
|
23
|
+
MEDIAN,
|
|
24
|
+
MIN,
|
|
25
|
+
STDDEV_POP,
|
|
26
|
+
STDDEV_SAMP,
|
|
27
|
+
SUM,
|
|
28
|
+
VAR_POP,
|
|
29
|
+
VAR_SAMP,
|
|
30
|
+
)
|
|
31
|
+
from vtlengine.DataTypes.TimeHandling import (
|
|
32
|
+
DURATION_MAPPING,
|
|
33
|
+
DURATION_MAPPING_REVERSED,
|
|
34
|
+
TimePeriodHandler,
|
|
35
|
+
TimeIntervalHandler,
|
|
36
|
+
)
|
|
15
37
|
from vtlengine.Exceptions import SemanticError
|
|
16
|
-
from vtlengine.Model import Component,
|
|
38
|
+
from vtlengine.Model import Component, Dataset, Role
|
|
17
39
|
|
|
18
40
|
|
|
19
|
-
def extract_grouping_identifiers(
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
if group_op ==
|
|
41
|
+
def extract_grouping_identifiers(
|
|
42
|
+
identifier_names: List[str], group_op: Optional[str], grouping_components: Any
|
|
43
|
+
) -> List[str]:
|
|
44
|
+
if group_op == "group by":
|
|
23
45
|
return grouping_components
|
|
24
|
-
elif group_op ==
|
|
46
|
+
elif group_op == "group except":
|
|
25
47
|
return [comp for comp in identifier_names if comp not in grouping_components]
|
|
26
48
|
else:
|
|
27
49
|
return identifier_names
|
|
@@ -30,77 +52,95 @@ def extract_grouping_identifiers(identifier_names: List[str],
|
|
|
30
52
|
# noinspection PyMethodOverriding
|
|
31
53
|
class Aggregation(Operator.Unary):
|
|
32
54
|
@classmethod
|
|
33
|
-
def _handle_data_types(cls, data: pd.DataFrame, measures: List[Component], mode: str):
|
|
55
|
+
def _handle_data_types(cls, data: pd.DataFrame, measures: List[Component], mode: str) -> None:
|
|
56
|
+
to_replace: List[Optional[str]]
|
|
57
|
+
new_value: List[Optional[str]]
|
|
34
58
|
if cls.op == COUNT:
|
|
35
59
|
return
|
|
36
|
-
if mode ==
|
|
60
|
+
if mode == "input":
|
|
37
61
|
to_replace = [None]
|
|
38
|
-
new_value = [
|
|
62
|
+
new_value = [""]
|
|
39
63
|
else:
|
|
40
|
-
to_replace = [
|
|
64
|
+
to_replace = [""]
|
|
41
65
|
new_value = [None]
|
|
42
66
|
|
|
43
67
|
for measure in measures:
|
|
44
|
-
if measure.data_type
|
|
68
|
+
if measure.data_type == Date:
|
|
45
69
|
if cls.op == MIN:
|
|
46
|
-
if mode ==
|
|
70
|
+
if mode == "input":
|
|
47
71
|
# Invalid date only for null values
|
|
48
|
-
new_value = [
|
|
72
|
+
new_value = ["9999-99-99"]
|
|
49
73
|
else:
|
|
50
|
-
to_replace = [
|
|
74
|
+
to_replace = ["9999-99-99"]
|
|
51
75
|
data[measure.name] = data[measure.name].replace(to_replace, new_value)
|
|
52
|
-
elif measure.data_type
|
|
53
|
-
if mode ==
|
|
54
|
-
data[measure.name] =
|
|
55
|
-
|
|
56
|
-
|
|
76
|
+
elif measure.data_type == TimePeriod:
|
|
77
|
+
if mode == "input":
|
|
78
|
+
data[measure.name] = (
|
|
79
|
+
data[measure.name]
|
|
80
|
+
.astype(object)
|
|
81
|
+
.map(lambda x: TimePeriodHandler(x), na_action="ignore")
|
|
82
|
+
)
|
|
57
83
|
else:
|
|
58
84
|
data[measure.name] = data[measure.name].map(
|
|
59
|
-
lambda x: str(x), na_action=
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
85
|
+
lambda x: str(x), na_action="ignore"
|
|
86
|
+
)
|
|
87
|
+
elif measure.data_type == TimeInterval:
|
|
88
|
+
if mode == "input":
|
|
89
|
+
data[measure.name] = (
|
|
90
|
+
data[measure.name]
|
|
91
|
+
.astype(object)
|
|
92
|
+
.map(lambda x: TimeIntervalHandler.from_iso_format(x), na_action="ignore")
|
|
93
|
+
)
|
|
65
94
|
else:
|
|
66
95
|
data[measure.name] = data[measure.name].map(
|
|
67
|
-
lambda x: str(x), na_action=
|
|
68
|
-
|
|
96
|
+
lambda x: str(x), na_action="ignore"
|
|
97
|
+
)
|
|
98
|
+
elif measure.data_type == String:
|
|
69
99
|
data[measure.name] = data[measure.name].replace(to_replace, new_value)
|
|
70
|
-
elif measure.data_type
|
|
71
|
-
if mode ==
|
|
72
|
-
data[measure.name] = data[measure.name].map(
|
|
73
|
-
|
|
100
|
+
elif measure.data_type == Duration:
|
|
101
|
+
if mode == "input":
|
|
102
|
+
data[measure.name] = data[measure.name].map(
|
|
103
|
+
lambda x: DURATION_MAPPING[x], na_action="ignore"
|
|
104
|
+
)
|
|
74
105
|
else:
|
|
75
106
|
data[measure.name] = data[measure.name].map(
|
|
76
|
-
lambda x: DURATION_MAPPING_REVERSED[x], na_action=
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
107
|
+
lambda x: DURATION_MAPPING_REVERSED[x], na_action="ignore"
|
|
108
|
+
)
|
|
109
|
+
elif measure.data_type == Boolean:
|
|
110
|
+
if mode == "result":
|
|
111
|
+
data[measure.name] = data[measure.name].map(
|
|
112
|
+
lambda x: Boolean().cast(x), na_action="ignore"
|
|
113
|
+
)
|
|
81
114
|
data[measure.name] = data[measure.name].astype(object)
|
|
82
115
|
|
|
83
116
|
@classmethod
|
|
84
|
-
def validate(
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
117
|
+
def validate( # type: ignore[override]
|
|
118
|
+
cls,
|
|
119
|
+
operand: Dataset,
|
|
120
|
+
group_op: Optional[str],
|
|
121
|
+
grouping_columns: Any,
|
|
122
|
+
having_data: Any,
|
|
123
|
+
) -> Dataset:
|
|
88
124
|
result_components = {k: copy(v) for k, v in operand.components.items()}
|
|
89
125
|
if cls.op not in [COUNT, MIN, MAX] and len(operand.get_measures_names()) == 0:
|
|
90
126
|
raise SemanticError("1-1-1-8", op=cls.op, name=operand.name)
|
|
91
127
|
if group_op is not None:
|
|
92
128
|
for comp_name in grouping_columns:
|
|
93
129
|
if comp_name not in operand.components:
|
|
94
|
-
raise SemanticError(
|
|
95
|
-
|
|
130
|
+
raise SemanticError(
|
|
131
|
+
"1-1-1-10", op=cls.op, comp_name=comp_name, dataset_name=operand.name
|
|
132
|
+
)
|
|
96
133
|
if operand.components[comp_name].role != Role.IDENTIFIER:
|
|
97
|
-
raise SemanticError(
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
134
|
+
raise SemanticError(
|
|
135
|
+
"1-1-2-2",
|
|
136
|
+
op=cls.op,
|
|
137
|
+
id_name=comp_name,
|
|
138
|
+
id_type=operand.components[comp_name].role,
|
|
139
|
+
)
|
|
140
|
+
|
|
141
|
+
identifiers_to_keep = extract_grouping_identifiers(
|
|
142
|
+
operand.get_identifiers_names(), group_op, grouping_columns
|
|
143
|
+
)
|
|
104
144
|
for comp_name, comp in operand.components.items():
|
|
105
145
|
if comp.role == Role.IDENTIFIER and comp_name not in identifiers_to_keep:
|
|
106
146
|
del result_components[comp_name]
|
|
@@ -121,89 +161,110 @@ class Aggregation(Operator.Unary):
|
|
|
121
161
|
if cls.op == COUNT:
|
|
122
162
|
for measure_name in operand.get_measures_names():
|
|
123
163
|
result_components.pop(measure_name)
|
|
124
|
-
new_comp = Component(
|
|
125
|
-
|
|
164
|
+
new_comp = Component(
|
|
165
|
+
name="int_var", role=Role.MEASURE, data_type=Integer, nullable=True
|
|
166
|
+
)
|
|
126
167
|
result_components["int_var"] = new_comp
|
|
127
168
|
return Dataset(name="result", components=result_components, data=None)
|
|
128
169
|
|
|
129
170
|
@classmethod
|
|
130
|
-
def _agg_func(
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
171
|
+
def _agg_func(
|
|
172
|
+
cls,
|
|
173
|
+
df: pd.DataFrame,
|
|
174
|
+
grouping_keys: Optional[List[str]],
|
|
175
|
+
measure_names: Optional[List[str]],
|
|
176
|
+
having_expression: Optional[str],
|
|
177
|
+
) -> pd.DataFrame:
|
|
178
|
+
grouping_names = (
|
|
179
|
+
[f'"{name}"' for name in grouping_keys] if grouping_keys is not None else None
|
|
180
|
+
)
|
|
135
181
|
if grouping_names is not None and len(grouping_names) > 0:
|
|
136
|
-
grouping = "GROUP BY " +
|
|
182
|
+
grouping = "GROUP BY " + ", ".join(grouping_names)
|
|
137
183
|
else:
|
|
138
184
|
grouping = ""
|
|
139
185
|
|
|
140
186
|
if having_expression is None:
|
|
141
187
|
having_expression = ""
|
|
142
188
|
|
|
143
|
-
if len(measure_names) == 0 and cls.op == COUNT:
|
|
189
|
+
if measure_names is not None and len(measure_names) == 0 and cls.op == COUNT:
|
|
144
190
|
if grouping_names is not None:
|
|
145
|
-
query =
|
|
191
|
+
query = (
|
|
192
|
+
f"SELECT {', '.join(grouping_names)}, COUNT() AS "
|
|
193
|
+
f"int_var from df {grouping} {having_expression}"
|
|
194
|
+
)
|
|
146
195
|
else:
|
|
147
196
|
query = f"SELECT COUNT() AS int_var from df {grouping}"
|
|
148
197
|
return duckdb.query(query).to_df()
|
|
149
198
|
|
|
150
|
-
if len(measure_names) > 0:
|
|
199
|
+
if measure_names is not None and len(measure_names) > 0:
|
|
151
200
|
functions = ""
|
|
152
201
|
for e in measure_names:
|
|
153
202
|
e = f'"{e}"'
|
|
154
203
|
if cls.type_to_check is not None and cls.op != COUNT:
|
|
155
|
-
functions +=
|
|
204
|
+
functions += (
|
|
205
|
+
f"{cls.py_op}(CAST({e} AS REAL)) AS {e}, " # Count can only be one here
|
|
206
|
+
)
|
|
156
207
|
elif cls.op == COUNT:
|
|
157
208
|
functions += f"{cls.py_op}({e}) AS int_var, "
|
|
158
209
|
break
|
|
159
210
|
else:
|
|
160
211
|
functions += f"{cls.py_op}({e}) AS {e}, "
|
|
161
212
|
if grouping_names is not None and len(grouping_names) > 0:
|
|
162
|
-
query =
|
|
213
|
+
query = (
|
|
214
|
+
f"SELECT {', '.join(grouping_names) + ', '}{functions[:-2]} "
|
|
215
|
+
f"from df {grouping} {having_expression}"
|
|
216
|
+
)
|
|
163
217
|
else:
|
|
164
218
|
query = f"SELECT {functions[:-2]} from df"
|
|
165
219
|
|
|
166
220
|
else:
|
|
167
|
-
query =
|
|
221
|
+
query = (
|
|
222
|
+
f"SELECT {', '.join(grouping_names or [])} from df {grouping} {having_expression}"
|
|
223
|
+
)
|
|
168
224
|
|
|
169
225
|
try:
|
|
170
226
|
return duckdb.query(query).to_df()
|
|
171
227
|
except RuntimeError as e:
|
|
172
|
-
if
|
|
228
|
+
if "Conversion" in e.args[0]:
|
|
173
229
|
raise SemanticError("2-3-8", op=cls.op, msg=e.args[0].split(":")[-1])
|
|
174
230
|
else:
|
|
175
231
|
raise SemanticError("2-1-1-1", op=cls.op)
|
|
176
232
|
|
|
177
233
|
@classmethod
|
|
178
|
-
def evaluate(
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
234
|
+
def evaluate( # type: ignore[override]
|
|
235
|
+
cls,
|
|
236
|
+
operand: Dataset,
|
|
237
|
+
group_op: Optional[str],
|
|
238
|
+
grouping_columns: Optional[List[str]],
|
|
239
|
+
having_expr: Optional[str],
|
|
240
|
+
) -> Dataset:
|
|
183
241
|
result = cls.validate(operand, group_op, grouping_columns, having_expr)
|
|
184
242
|
|
|
185
243
|
grouping_keys = result.get_identifiers_names()
|
|
186
|
-
result_df = operand.data.copy()
|
|
244
|
+
result_df = operand.data.copy() if operand.data is not None else pd.DataFrame()
|
|
187
245
|
measure_names = operand.get_measures_names()
|
|
188
246
|
result_df = result_df[grouping_keys + measure_names]
|
|
189
247
|
if cls.op == COUNT:
|
|
190
248
|
result_df = result_df.dropna(subset=measure_names, how="any")
|
|
191
|
-
cls._handle_data_types(result_df, operand.get_measures(),
|
|
192
|
-
result_df = cls._agg_func(result_df, grouping_keys, measure_names,
|
|
193
|
-
having_expr)
|
|
249
|
+
cls._handle_data_types(result_df, operand.get_measures(), "input")
|
|
250
|
+
result_df = cls._agg_func(result_df, grouping_keys, measure_names, having_expr)
|
|
194
251
|
|
|
195
|
-
cls._handle_data_types(result_df, operand.get_measures(),
|
|
252
|
+
cls._handle_data_types(result_df, operand.get_measures(), "result")
|
|
196
253
|
# Handle correct order on result
|
|
197
|
-
aux_df =
|
|
254
|
+
aux_df = (
|
|
255
|
+
operand.data[grouping_keys].drop_duplicates()
|
|
256
|
+
if operand.data is not None
|
|
257
|
+
else pd.DataFrame()
|
|
258
|
+
)
|
|
198
259
|
if len(grouping_keys) == 0:
|
|
199
260
|
aux_df = result_df
|
|
200
261
|
aux_df.dropna(subset=result.get_measures_names(), how="all", inplace=True)
|
|
201
262
|
if cls.op == COUNT and len(result_df) == 0:
|
|
202
|
-
aux_df[
|
|
263
|
+
aux_df["int_var"] = 0
|
|
203
264
|
elif len(aux_df) == 0:
|
|
204
265
|
aux_df = pd.DataFrame(columns=result.get_components_names())
|
|
205
266
|
else:
|
|
206
|
-
aux_df = pd.merge(aux_df, result_df, how=
|
|
267
|
+
aux_df = pd.merge(aux_df, result_df, how="left", on=grouping_keys)
|
|
207
268
|
if having_expr is not None:
|
|
208
269
|
aux_df.dropna(subset=result.get_measures_names(), how="any", inplace=True)
|
|
209
270
|
result.data = aux_df
|
|
@@ -212,64 +273,64 @@ class Aggregation(Operator.Unary):
|
|
|
212
273
|
|
|
213
274
|
class Max(Aggregation):
|
|
214
275
|
op = MAX
|
|
215
|
-
py_op =
|
|
276
|
+
py_op = "max"
|
|
216
277
|
|
|
217
278
|
|
|
218
279
|
class Min(Aggregation):
|
|
219
280
|
op = MIN
|
|
220
|
-
py_op =
|
|
281
|
+
py_op = "min"
|
|
221
282
|
|
|
222
283
|
|
|
223
284
|
class Sum(Aggregation):
|
|
224
285
|
op = SUM
|
|
225
286
|
type_to_check = Number
|
|
226
|
-
py_op =
|
|
287
|
+
py_op = "sum"
|
|
227
288
|
|
|
228
289
|
|
|
229
290
|
class Count(Aggregation):
|
|
230
291
|
op = COUNT
|
|
231
292
|
type_to_check = None
|
|
232
293
|
return_type = Integer
|
|
233
|
-
py_op =
|
|
294
|
+
py_op = "count"
|
|
234
295
|
|
|
235
296
|
|
|
236
297
|
class Avg(Aggregation):
|
|
237
298
|
op = AVG
|
|
238
299
|
type_to_check = Number
|
|
239
300
|
return_type = Number
|
|
240
|
-
py_op =
|
|
301
|
+
py_op = "avg"
|
|
241
302
|
|
|
242
303
|
|
|
243
304
|
class Median(Aggregation):
|
|
244
305
|
op = MEDIAN
|
|
245
306
|
type_to_check = Number
|
|
246
307
|
return_type = Number
|
|
247
|
-
py_op =
|
|
308
|
+
py_op = "median"
|
|
248
309
|
|
|
249
310
|
|
|
250
311
|
class PopulationStandardDeviation(Aggregation):
|
|
251
312
|
op = STDDEV_POP
|
|
252
313
|
type_to_check = Number
|
|
253
314
|
return_type = Number
|
|
254
|
-
py_op =
|
|
315
|
+
py_op = "stddev_pop"
|
|
255
316
|
|
|
256
317
|
|
|
257
318
|
class SampleStandardDeviation(Aggregation):
|
|
258
319
|
op = STDDEV_SAMP
|
|
259
320
|
type_to_check = Number
|
|
260
321
|
return_type = Number
|
|
261
|
-
py_op =
|
|
322
|
+
py_op = "stddev_samp"
|
|
262
323
|
|
|
263
324
|
|
|
264
325
|
class PopulationVariance(Aggregation):
|
|
265
326
|
op = VAR_POP
|
|
266
327
|
type_to_check = Number
|
|
267
328
|
return_type = Number
|
|
268
|
-
py_op =
|
|
329
|
+
py_op = "var_pop"
|
|
269
330
|
|
|
270
331
|
|
|
271
332
|
class SampleVariance(Aggregation):
|
|
272
333
|
op = VAR_SAMP
|
|
273
334
|
type_to_check = Number
|
|
274
335
|
return_type = Number
|
|
275
|
-
py_op =
|
|
336
|
+
py_op = "var_samp"
|