vtlengine 1.0.0-py3-none-any.whl → 1.0.1-py3-none-any.whl

This diff compares publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in the public registry.

Potentially problematic release: this version of vtlengine might be problematic.

Files changed (54)
  1. vtlengine/API/_InternalApi.py +153 -100
  2. vtlengine/API/__init__.py +109 -67
  3. vtlengine/AST/ASTConstructor.py +188 -98
  4. vtlengine/AST/ASTConstructorModules/Expr.py +306 -200
  5. vtlengine/AST/ASTConstructorModules/ExprComponents.py +172 -102
  6. vtlengine/AST/ASTConstructorModules/Terminals.py +158 -95
  7. vtlengine/AST/ASTEncoders.py +1 -1
  8. vtlengine/AST/ASTTemplate.py +8 -9
  9. vtlengine/AST/ASTVisitor.py +8 -12
  10. vtlengine/AST/DAG/__init__.py +43 -35
  11. vtlengine/AST/DAG/_words.py +4 -4
  12. vtlengine/AST/Grammar/lexer.py +732 -142
  13. vtlengine/AST/Grammar/parser.py +2188 -826
  14. vtlengine/AST/Grammar/tokens.py +128 -128
  15. vtlengine/AST/VtlVisitor.py +7 -4
  16. vtlengine/AST/__init__.py +22 -11
  17. vtlengine/DataTypes/NumericTypesHandling.py +5 -4
  18. vtlengine/DataTypes/TimeHandling.py +194 -301
  19. vtlengine/DataTypes/__init__.py +304 -218
  20. vtlengine/Exceptions/__init__.py +52 -27
  21. vtlengine/Exceptions/messages.py +134 -62
  22. vtlengine/Interpreter/__init__.py +781 -487
  23. vtlengine/Model/__init__.py +165 -121
  24. vtlengine/Operators/Aggregation.py +156 -95
  25. vtlengine/Operators/Analytic.py +115 -59
  26. vtlengine/Operators/Assignment.py +7 -4
  27. vtlengine/Operators/Boolean.py +27 -32
  28. vtlengine/Operators/CastOperator.py +177 -131
  29. vtlengine/Operators/Clause.py +137 -99
  30. vtlengine/Operators/Comparison.py +148 -117
  31. vtlengine/Operators/Conditional.py +149 -98
  32. vtlengine/Operators/General.py +68 -47
  33. vtlengine/Operators/HROperators.py +91 -72
  34. vtlengine/Operators/Join.py +217 -118
  35. vtlengine/Operators/Numeric.py +89 -44
  36. vtlengine/Operators/RoleSetter.py +16 -15
  37. vtlengine/Operators/Set.py +61 -36
  38. vtlengine/Operators/String.py +213 -139
  39. vtlengine/Operators/Time.py +334 -216
  40. vtlengine/Operators/Validation.py +117 -76
  41. vtlengine/Operators/__init__.py +340 -213
  42. vtlengine/Utils/__init__.py +195 -40
  43. vtlengine/__init__.py +1 -1
  44. vtlengine/files/output/__init__.py +15 -6
  45. vtlengine/files/output/_time_period_representation.py +10 -9
  46. vtlengine/files/parser/__init__.py +77 -52
  47. vtlengine/files/parser/_rfc_dialect.py +6 -5
  48. vtlengine/files/parser/_time_checking.py +46 -37
  49. vtlengine-1.0.1.dist-info/METADATA +236 -0
  50. vtlengine-1.0.1.dist-info/RECORD +58 -0
  51. {vtlengine-1.0.dist-info → vtlengine-1.0.1.dist-info}/WHEEL +1 -1
  52. vtlengine-1.0.dist-info/METADATA +0 -104
  53. vtlengine-1.0.dist-info/RECORD +0 -58
  54. {vtlengine-1.0.dist-info → vtlengine-1.0.1.dist-info}/LICENSE.md +0 -0
vtlengine/Operators/Aggregation.py
@@ -1,27 +1,49 @@
 from copy import copy
-from typing import List, Optional
+from typing import List, Optional, Any
 
 import duckdb
 import pandas as pd
-from vtlengine.DataTypes import Integer, Number, unary_implicit_promotion, Boolean
+from vtlengine.DataTypes import (
+    Integer,
+    Number,
+    unary_implicit_promotion,
+    Boolean,
+    String,
+    Duration,
+    TimeInterval,
+    TimePeriod,
+    Date,
+)
 
 import vtlengine.Operators as Operator
-from vtlengine.AST.Grammar.tokens import (AVG, COUNT, MAX, MEDIAN, MIN, STDDEV_POP, STDDEV_SAMP,
-                                          SUM, VAR_POP,
-                                          VAR_SAMP)
-from vtlengine.DataTypes.TimeHandling import DURATION_MAPPING, DURATION_MAPPING_REVERSED, \
-    TimePeriodHandler, \
-    TimeIntervalHandler
+from vtlengine.AST.Grammar.tokens import (
+    AVG,
+    COUNT,
+    MAX,
+    MEDIAN,
+    MIN,
+    STDDEV_POP,
+    STDDEV_SAMP,
+    SUM,
+    VAR_POP,
+    VAR_SAMP,
+)
+from vtlengine.DataTypes.TimeHandling import (
+    DURATION_MAPPING,
+    DURATION_MAPPING_REVERSED,
+    TimePeriodHandler,
+    TimeIntervalHandler,
+)
 from vtlengine.Exceptions import SemanticError
-from vtlengine.Model import Component, DataComponent, Dataset, Role
+from vtlengine.Model import Component, Dataset, Role
 
 
-def extract_grouping_identifiers(identifier_names: List[str],
-                                 group_op: str,
-                                 grouping_components: List[str]) -> List[str]:
-    if group_op == 'group by':
+def extract_grouping_identifiers(
+    identifier_names: List[str], group_op: Optional[str], grouping_components: Any
+) -> List[str]:
+    if group_op == "group by":
         return grouping_components
-    elif group_op == 'group except':
+    elif group_op == "group except":
         return [comp for comp in identifier_names if comp not in grouping_components]
     else:
         return identifier_names
@@ -30,77 +52,95 @@ def extract_grouping_identifiers(identifier_names: List[str],
 # noinspection PyMethodOverriding
 class Aggregation(Operator.Unary):
     @classmethod
-    def _handle_data_types(cls, data: pd.DataFrame, measures: List[Component], mode: str):
+    def _handle_data_types(cls, data: pd.DataFrame, measures: List[Component], mode: str) -> None:
+        to_replace: List[Optional[str]]
+        new_value: List[Optional[str]]
         if cls.op == COUNT:
             return
-        if mode == 'input':
+        if mode == "input":
             to_replace = [None]
-            new_value = ['']
+            new_value = [""]
         else:
-            to_replace = ['']
+            to_replace = [""]
             new_value = [None]
 
         for measure in measures:
-            if measure.data_type.__name__ == 'Date':
+            if measure.data_type == Date:
                 if cls.op == MIN:
-                    if mode == 'input':
+                    if mode == "input":
                         # Invalid date only for null values
-                        new_value = ['9999-99-99']
+                        new_value = ["9999-99-99"]
                     else:
-                        to_replace = ['9999-99-99']
+                        to_replace = ["9999-99-99"]
                 data[measure.name] = data[measure.name].replace(to_replace, new_value)
-            elif measure.data_type.__name__ == 'TimePeriod':
-                if mode == 'input':
-                    data[measure.name] = data[measure.name].astype(object).map(
-                        lambda x: TimePeriodHandler(x),
-                        na_action='ignore')
+            elif measure.data_type == TimePeriod:
+                if mode == "input":
+                    data[measure.name] = (
+                        data[measure.name]
+                        .astype(object)
+                        .map(lambda x: TimePeriodHandler(x), na_action="ignore")
+                    )
                 else:
                     data[measure.name] = data[measure.name].map(
-                        lambda x: str(x), na_action='ignore')
-            elif measure.data_type.__name__ == 'TimeInterval':
-                if mode == 'input':
-                    data[measure.name] = data[measure.name].astype(object).map(
-                        lambda x: TimeIntervalHandler.from_iso_format(x),
-                        na_action='ignore')
+                        lambda x: str(x), na_action="ignore"
+                    )
+            elif measure.data_type == TimeInterval:
+                if mode == "input":
+                    data[measure.name] = (
+                        data[measure.name]
+                        .astype(object)
+                        .map(lambda x: TimeIntervalHandler.from_iso_format(x), na_action="ignore")
+                    )
                 else:
                     data[measure.name] = data[measure.name].map(
-                        lambda x: str(x), na_action='ignore')
-            elif measure.data_type.__name__ == 'String':
+                        lambda x: str(x), na_action="ignore"
+                    )
+            elif measure.data_type == String:
                 data[measure.name] = data[measure.name].replace(to_replace, new_value)
-            elif measure.data_type.__name__ == 'Duration':
-                if mode == 'input':
-                    data[measure.name] = data[measure.name].map(lambda x: DURATION_MAPPING[x],
-                                                                na_action='ignore')
+            elif measure.data_type == Duration:
+                if mode == "input":
+                    data[measure.name] = data[measure.name].map(
+                        lambda x: DURATION_MAPPING[x], na_action="ignore"
+                    )
                 else:
                     data[measure.name] = data[measure.name].map(
-                        lambda x: DURATION_MAPPING_REVERSED[x], na_action='ignore')
-            elif measure.data_type.__name__ == 'Boolean':
-                if mode == 'result':
-                    data[measure.name] = data[measure.name].map(lambda x: Boolean().cast(x),
-                                                                na_action='ignore')
+                        lambda x: DURATION_MAPPING_REVERSED[x], na_action="ignore"
+                    )
+            elif measure.data_type == Boolean:
+                if mode == "result":
+                    data[measure.name] = data[measure.name].map(
+                        lambda x: Boolean().cast(x), na_action="ignore"
+                    )
             data[measure.name] = data[measure.name].astype(object)
 
     @classmethod
-    def validate(cls, operand: Dataset,
-                 group_op: Optional[str],
-                 grouping_columns: Optional[List[str]],
-                 having_data: Optional[List[DataComponent]]) -> Dataset:
+    def validate(  # type: ignore[override]
+        cls,
+        operand: Dataset,
+        group_op: Optional[str],
+        grouping_columns: Any,
+        having_data: Any,
+    ) -> Dataset:
         result_components = {k: copy(v) for k, v in operand.components.items()}
         if cls.op not in [COUNT, MIN, MAX] and len(operand.get_measures_names()) == 0:
             raise SemanticError("1-1-1-8", op=cls.op, name=operand.name)
         if group_op is not None:
             for comp_name in grouping_columns:
                 if comp_name not in operand.components:
-                    raise SemanticError("1-1-1-10", op=cls.op, comp_name=comp_name,
-                                        dataset_name=operand.name)
+                    raise SemanticError(
+                        "1-1-1-10", op=cls.op, comp_name=comp_name, dataset_name=operand.name
+                    )
                 if operand.components[comp_name].role != Role.IDENTIFIER:
-                    raise SemanticError("1-1-2-2", op=cls.op,
-                                        id_name=comp_name,
-                                        id_type=operand.components[comp_name].role)
-
-        identifiers_to_keep = extract_grouping_identifiers(operand.get_identifiers_names(),
-                                                           group_op,
-                                                           grouping_columns)
+                    raise SemanticError(
+                        "1-1-2-2",
+                        op=cls.op,
+                        id_name=comp_name,
+                        id_type=operand.components[comp_name].role,
+                    )
+
+        identifiers_to_keep = extract_grouping_identifiers(
+            operand.get_identifiers_names(), group_op, grouping_columns
+        )
         for comp_name, comp in operand.components.items():
             if comp.role == Role.IDENTIFIER and comp_name not in identifiers_to_keep:
                 del result_components[comp_name]
@@ -121,89 +161,110 @@ class Aggregation(Operator.Unary):
         if cls.op == COUNT:
             for measure_name in operand.get_measures_names():
                 result_components.pop(measure_name)
-            new_comp = Component(name="int_var", role=Role.MEASURE, data_type=Integer,
-                                 nullable=True)
+            new_comp = Component(
+                name="int_var", role=Role.MEASURE, data_type=Integer, nullable=True
+            )
             result_components["int_var"] = new_comp
         return Dataset(name="result", components=result_components, data=None)
 
     @classmethod
-    def _agg_func(cls, df: pd.DataFrame, grouping_keys: Optional[List[str]],
-                  measure_names: Optional[List[str]],
-                  having_expression: Optional[str]) -> pd.DataFrame:
-        grouping_names = [f'"{name}"' for name in
-                          grouping_keys] if grouping_keys is not None else None
+    def _agg_func(
+        cls,
+        df: pd.DataFrame,
+        grouping_keys: Optional[List[str]],
+        measure_names: Optional[List[str]],
+        having_expression: Optional[str],
+    ) -> pd.DataFrame:
+        grouping_names = (
+            [f'"{name}"' for name in grouping_keys] if grouping_keys is not None else None
+        )
         if grouping_names is not None and len(grouping_names) > 0:
-            grouping = "GROUP BY " + ', '.join(grouping_names)
+            grouping = "GROUP BY " + ", ".join(grouping_names)
         else:
             grouping = ""
 
         if having_expression is None:
             having_expression = ""
 
-        if len(measure_names) == 0 and cls.op == COUNT:
+        if measure_names is not None and len(measure_names) == 0 and cls.op == COUNT:
             if grouping_names is not None:
-                query = f"SELECT {', '.join(grouping_names)}, COUNT() AS int_var from df {grouping} {having_expression}"
+                query = (
+                    f"SELECT {', '.join(grouping_names)}, COUNT() AS "
+                    f"int_var from df {grouping} {having_expression}"
+                )
             else:
                 query = f"SELECT COUNT() AS int_var from df {grouping}"
             return duckdb.query(query).to_df()
 
-        if len(measure_names) > 0:
+        if measure_names is not None and len(measure_names) > 0:
             functions = ""
             for e in measure_names:
                 e = f'"{e}"'
                 if cls.type_to_check is not None and cls.op != COUNT:
-                    functions += f"{cls.py_op}(CAST({e} AS REAL)) AS {e}, "  # Count can only be one here
+                    functions += (
+                        f"{cls.py_op}(CAST({e} AS REAL)) AS {e}, "  # Count can only be one here
+                    )
                 elif cls.op == COUNT:
                     functions += f"{cls.py_op}({e}) AS int_var, "
                     break
                 else:
                     functions += f"{cls.py_op}({e}) AS {e}, "
             if grouping_names is not None and len(grouping_names) > 0:
-                query = f"SELECT {', '.join(grouping_names) + ', '}{functions[:-2]} from df {grouping} {having_expression}"
+                query = (
+                    f"SELECT {', '.join(grouping_names) + ', '}{functions[:-2]} "
+                    f"from df {grouping} {having_expression}"
+                )
             else:
                 query = f"SELECT {functions[:-2]} from df"
 
         else:
-            query = f"SELECT {', '.join(grouping_names)} from df {grouping} {having_expression}"
+            query = (
+                f"SELECT {', '.join(grouping_names or [])} from df {grouping} {having_expression}"
+            )
 
         try:
             return duckdb.query(query).to_df()
         except RuntimeError as e:
-            if 'Conversion' in e.args[0]:
+            if "Conversion" in e.args[0]:
                 raise SemanticError("2-3-8", op=cls.op, msg=e.args[0].split(":")[-1])
             else:
                 raise SemanticError("2-1-1-1", op=cls.op)
 
     @classmethod
-    def evaluate(cls,
-                 operand: Dataset,
-                 group_op: Optional[str],
-                 grouping_columns: Optional[List[str]],
-                 having_expr: Optional[str]) -> Dataset:
+    def evaluate(  # type: ignore[override]
+        cls,
+        operand: Dataset,
+        group_op: Optional[str],
+        grouping_columns: Optional[List[str]],
+        having_expr: Optional[str],
+    ) -> Dataset:
         result = cls.validate(operand, group_op, grouping_columns, having_expr)
 
         grouping_keys = result.get_identifiers_names()
-        result_df = operand.data.copy()
+        result_df = operand.data.copy() if operand.data is not None else pd.DataFrame()
         measure_names = operand.get_measures_names()
         result_df = result_df[grouping_keys + measure_names]
         if cls.op == COUNT:
             result_df = result_df.dropna(subset=measure_names, how="any")
-        cls._handle_data_types(result_df, operand.get_measures(), 'input')
-        result_df = cls._agg_func(result_df, grouping_keys, measure_names,
-                                  having_expr)
+        cls._handle_data_types(result_df, operand.get_measures(), "input")
+        result_df = cls._agg_func(result_df, grouping_keys, measure_names, having_expr)
 
-        cls._handle_data_types(result_df, operand.get_measures(), 'result')
+        cls._handle_data_types(result_df, operand.get_measures(), "result")
         # Handle correct order on result
-        aux_df = operand.data[grouping_keys].drop_duplicates()
+        aux_df = (
+            operand.data[grouping_keys].drop_duplicates()
+            if operand.data is not None
+            else pd.DataFrame()
+        )
         if len(grouping_keys) == 0:
             aux_df = result_df
         aux_df.dropna(subset=result.get_measures_names(), how="all", inplace=True)
         if cls.op == COUNT and len(result_df) == 0:
-            aux_df['int_var'] = 0
+            aux_df["int_var"] = 0
         elif len(aux_df) == 0:
             aux_df = pd.DataFrame(columns=result.get_components_names())
         else:
-            aux_df = pd.merge(aux_df, result_df, how='left', on=grouping_keys)
+            aux_df = pd.merge(aux_df, result_df, how="left", on=grouping_keys)
         if having_expr is not None:
             aux_df.dropna(subset=result.get_measures_names(), how="any", inplace=True)
         result.data = aux_df
@@ -212,64 +273,64 @@ class Aggregation(Operator.Unary):
 
 class Max(Aggregation):
     op = MAX
-    py_op = 'max'
+    py_op = "max"
 
 
 class Min(Aggregation):
     op = MIN
-    py_op = 'min'
+    py_op = "min"
 
 
 class Sum(Aggregation):
     op = SUM
     type_to_check = Number
-    py_op = 'sum'
+    py_op = "sum"
 
 
 class Count(Aggregation):
     op = COUNT
     type_to_check = None
     return_type = Integer
-    py_op = 'count'
+    py_op = "count"
 
 
 class Avg(Aggregation):
     op = AVG
     type_to_check = Number
     return_type = Number
-    py_op = 'avg'
+    py_op = "avg"
 
 
 class Median(Aggregation):
     op = MEDIAN
     type_to_check = Number
     return_type = Number
-    py_op = 'median'
+    py_op = "median"
 
 
 class PopulationStandardDeviation(Aggregation):
     op = STDDEV_POP
     type_to_check = Number
     return_type = Number
-    py_op = 'stddev_pop'
+    py_op = "stddev_pop"
 
 
 class SampleStandardDeviation(Aggregation):
     op = STDDEV_SAMP
     type_to_check = Number
     return_type = Number
-    py_op = 'stddev_samp'
+    py_op = "stddev_samp"
 
 
 class PopulationVariance(Aggregation):
     op = VAR_POP
     type_to_check = Number
     return_type = Number
-    py_op = 'var_pop'
+    py_op = "var_pop"
 
 
 class SampleVariance(Aggregation):
     op = VAR_SAMP
     type_to_check = Number
     return_type = Number
-    py_op = 'var_samp'
+    py_op = "var_samp"
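For orientation, the grouping-mode dispatch reworked in the first hunk is small enough to exercise in isolation. The sketch below copies the new extract_grouping_identifiers from this diff and runs it against invented identifier names (Id_1, Id_2, Id_3 are hypothetical and not taken from the package):

from typing import Any, List, Optional


def extract_grouping_identifiers(
    identifier_names: List[str], group_op: Optional[str], grouping_components: Any
) -> List[str]:
    # "group by" keeps exactly the listed components as identifiers.
    if group_op == "group by":
        return grouping_components
    # "group except" keeps every identifier that was not listed.
    elif group_op == "group except":
        return [comp for comp in identifier_names if comp not in grouping_components]
    # No grouping clause: keep all identifiers.
    else:
        return identifier_names


ids = ["Id_1", "Id_2", "Id_3"]
print(extract_grouping_identifiers(ids, "group by", ["Id_1"]))      # ['Id_1']
print(extract_grouping_identifiers(ids, "group except", ["Id_1"]))  # ['Id_2', 'Id_3']
print(extract_grouping_identifiers(ids, None, []))                  # ['Id_1', 'Id_2', 'Id_3']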