vtlengine 1.4.0rc2__py3-none-any.whl
This diff shows the contents of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
- vtlengine/API/_InternalApi.py +791 -0
- vtlengine/API/__init__.py +612 -0
- vtlengine/API/data/schema/external_routines_schema.json +34 -0
- vtlengine/API/data/schema/json_schema_2.1.json +116 -0
- vtlengine/API/data/schema/value_domain_schema.json +97 -0
- vtlengine/AST/ASTComment.py +57 -0
- vtlengine/AST/ASTConstructor.py +598 -0
- vtlengine/AST/ASTConstructorModules/Expr.py +1928 -0
- vtlengine/AST/ASTConstructorModules/ExprComponents.py +995 -0
- vtlengine/AST/ASTConstructorModules/Terminals.py +790 -0
- vtlengine/AST/ASTConstructorModules/__init__.py +50 -0
- vtlengine/AST/ASTDataExchange.py +10 -0
- vtlengine/AST/ASTEncoders.py +32 -0
- vtlengine/AST/ASTString.py +675 -0
- vtlengine/AST/ASTTemplate.py +558 -0
- vtlengine/AST/ASTVisitor.py +25 -0
- vtlengine/AST/DAG/__init__.py +479 -0
- vtlengine/AST/DAG/_words.py +10 -0
- vtlengine/AST/Grammar/Vtl.g4 +705 -0
- vtlengine/AST/Grammar/VtlTokens.g4 +409 -0
- vtlengine/AST/Grammar/__init__.py +0 -0
- vtlengine/AST/Grammar/lexer.py +2139 -0
- vtlengine/AST/Grammar/parser.py +16597 -0
- vtlengine/AST/Grammar/tokens.py +169 -0
- vtlengine/AST/VtlVisitor.py +824 -0
- vtlengine/AST/__init__.py +674 -0
- vtlengine/DataTypes/TimeHandling.py +562 -0
- vtlengine/DataTypes/__init__.py +863 -0
- vtlengine/DataTypes/_time_checking.py +135 -0
- vtlengine/Exceptions/__exception_file_generator.py +96 -0
- vtlengine/Exceptions/__init__.py +159 -0
- vtlengine/Exceptions/messages.py +1004 -0
- vtlengine/Interpreter/__init__.py +2048 -0
- vtlengine/Model/__init__.py +501 -0
- vtlengine/Operators/Aggregation.py +357 -0
- vtlengine/Operators/Analytic.py +455 -0
- vtlengine/Operators/Assignment.py +23 -0
- vtlengine/Operators/Boolean.py +106 -0
- vtlengine/Operators/CastOperator.py +451 -0
- vtlengine/Operators/Clause.py +366 -0
- vtlengine/Operators/Comparison.py +488 -0
- vtlengine/Operators/Conditional.py +495 -0
- vtlengine/Operators/General.py +191 -0
- vtlengine/Operators/HROperators.py +254 -0
- vtlengine/Operators/Join.py +447 -0
- vtlengine/Operators/Numeric.py +422 -0
- vtlengine/Operators/RoleSetter.py +77 -0
- vtlengine/Operators/Set.py +176 -0
- vtlengine/Operators/String.py +578 -0
- vtlengine/Operators/Time.py +1144 -0
- vtlengine/Operators/Validation.py +275 -0
- vtlengine/Operators/__init__.py +900 -0
- vtlengine/Utils/__Virtual_Assets.py +34 -0
- vtlengine/Utils/__init__.py +479 -0
- vtlengine/__extras_check.py +17 -0
- vtlengine/__init__.py +27 -0
- vtlengine/files/__init__.py +0 -0
- vtlengine/files/output/__init__.py +35 -0
- vtlengine/files/output/_time_period_representation.py +55 -0
- vtlengine/files/parser/__init__.py +240 -0
- vtlengine/files/parser/_rfc_dialect.py +22 -0
- vtlengine/py.typed +0 -0
- vtlengine-1.4.0rc2.dist-info/METADATA +89 -0
- vtlengine-1.4.0rc2.dist-info/RECORD +66 -0
- vtlengine-1.4.0rc2.dist-info/WHEEL +4 -0
- vtlengine-1.4.0rc2.dist-info/licenses/LICENSE.md +661 -0

vtlengine/Operators/HROperators.py
@@ -0,0 +1,254 @@
import operator
from copy import copy
from typing import Any, Dict

import pandas as pd
from pandas import DataFrame

import vtlengine.Operators as Operators
from vtlengine.AST.Grammar.tokens import HIERARCHY
from vtlengine.DataTypes import Boolean, Number
from vtlengine.Model import Component, DataComponent, Dataset, Role
from vtlengine.Utils.__Virtual_Assets import VirtualCounter


def get_measure_from_dataset(dataset: Dataset, code_item: str) -> DataComponent:
    measure_name = dataset.get_measures_names()[0]
    data = None if dataset.data is None else dataset.data[measure_name]
    return DataComponent(
        name=code_item,
        data=data,
        data_type=dataset.components[measure_name].data_type,
        role=dataset.components[measure_name].role,
        nullable=dataset.components[measure_name].nullable,
    )


class HRComparison(Operators.Binary):
    @classmethod
    def imbalance_func(cls, x: Any, y: Any) -> Any:
        return None if pd.isnull(x) or pd.isnull(y) else x - y

    @staticmethod
    def hr_func(left_series: Any, right_series: Any, hr_mode: str) -> Any:
        result = pd.Series(True, index=left_series.index)
        mask_remove = None
        mask_null = None

        if hr_mode in ("partial_null", "partial_zero"):
            mask_remove = (right_series == "REMOVE_VALUE") & (right_series.notnull())
            if hr_mode == "partial_null":
                mask_null = mask_remove & left_series.notnull()
            else:
                mask_null = mask_remove & (left_series != 0)
        elif hr_mode == "non_null":
            mask_remove = left_series.isnull() | right_series.isnull()
        elif hr_mode == "non_zero":
            mask_remove = (left_series == 0) & (right_series == 0)

        if mask_remove is not None:
            # Adding ignore here because mypy cannot infer typing of setting values in a Series
            result[mask_remove] = "REMOVE_VALUE"  # type: ignore[call-overload, unused-ignore]
        if mask_null is not None:
            result[mask_null] = None
        return result

    @classmethod
    def apply_hr_func(cls, left_series: Any, right_series: Any, hr_mode: str, func: Any) -> Any:
        # In order not to apply the function to the whole series, we align the series
        # and apply the function only to the valid values based on a validation mask.
        # The function is applied to the aligned series and the result is combined with the
        # original series.
        left_series, right_series = left_series.align(right_series)
        remove_result = cls.hr_func(left_series, right_series, hr_mode)
        mask_valid = remove_result == True
        result = pd.Series(remove_result, index=left_series.index)
        result.loc[mask_valid] = left_series[mask_valid].combine(right_series[mask_valid], func)
        return result
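
# Illustrative sketch, not from the package source: how the hr_mode masks in
# hr_func behave. Assumes the vtlengine package is installed; values invented.
import pandas as pd
from vtlengine.Operators.HROperators import HRComparison

left = pd.Series([10.0, None, 3.0])
right = pd.Series([10.0, 5.0, None])
# "non_null": a null on either side marks the datapoint for removal.
print(HRComparison.hr_func(left, right, "non_null").tolist())
# -> [True, 'REMOVE_VALUE', 'REMOVE_VALUE']
# "non_zero": only datapoints where both sides are 0 are removed.
print(HRComparison.hr_func(pd.Series([0.0, 1.0]), pd.Series([0.0, 0.0]), "non_zero").tolist())
# -> ['REMOVE_VALUE', True]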

    @classmethod
    def validate(cls, left_operand: Dataset, right_operand: DataComponent, hr_mode: str) -> Dataset:
        result_components = {
            comp_name: copy(comp)
            for comp_name, comp in left_operand.components.items()
            if comp.role == Role.IDENTIFIER
        }
        result_components["bool_var"] = Component(
            name="bool_var", data_type=Boolean, role=Role.MEASURE, nullable=True
        )
        result_components["imbalance"] = Component(
            name="imbalance", data_type=Number, role=Role.MEASURE, nullable=True
        )
        return Dataset(
            name=f"{left_operand.name}{cls.op}{right_operand.name}",
            components=result_components,
            data=None,
        )

    @classmethod
    def evaluate(cls, left: Dataset, right: DataComponent, hr_mode: str) -> Dataset:  # type: ignore[override]
        result = cls.validate(left, right, hr_mode)
        result.data = left.data.copy() if left.data is not None else pd.DataFrame()
        measure_name = left.get_measures_names()[0]

        if left.data is not None and right.data is not None:
            result.data["bool_var"] = cls.apply_hr_func(
                left.data[measure_name], right.data, hr_mode, cls.op_func
            )
            result.data["imbalance"] = cls.apply_hr_func(
                left.data[measure_name], right.data, hr_mode, cls.imbalance_func
            )

        # Removing datapoints that should not be returned
        # (we do it below imbalance calculation
        # to avoid errors on different shape)
        result.data = result.data[result.data["bool_var"] != "REMOVE_VALUE"]
        result.data.drop(measure_name, axis=1, inplace=True)
        return result
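
# Illustrative sketch, not from the package source: the shape of evaluate's
# output in plain pandas. Identifier columns are kept, bool_var holds the
# comparison verdict and imbalance the left-right difference; names invented.
import pandas as pd

df = pd.DataFrame({"Id_1": ["A", "B"], "left": [100.0, 80.0], "right": [100.0, 75.0]})
df["bool_var"] = df["left"] == df["right"]   # what HREqual's op_func computes
df["imbalance"] = df["left"] - df["right"]   # imbalance_func: x - y, None if any null
print(df[["Id_1", "bool_var", "imbalance"]])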


class HREqual(HRComparison):
    op = "="
    py_op = operator.eq


class HRGreater(HRComparison):
    op = ">"
    py_op = operator.gt


class HRGreaterEqual(HRComparison):
    op = ">="
    py_op = operator.ge


class HRLess(HRComparison):
    op = "<"
    py_op = operator.lt


class HRLessEqual(HRComparison):
    op = "<="
    py_op = operator.le


class HRBinNumeric(Operators.Binary):
    @classmethod
    def op_func(cls, x: Any, y: Any) -> Any:
        if not pd.isnull(x) and x == "REMOVE_VALUE":
            return "REMOVE_VALUE"
        return super().op_func(x, y)

    @classmethod
    def evaluate(cls, left: DataComponent, right: DataComponent) -> DataComponent:
        result_data = cls.apply_operation_two_series(left.data, right.data)
        return DataComponent(
            name=f"{left.name}{cls.op}{right.name}",
            data=result_data,
            data_type=left.data_type,
            role=left.role,
            nullable=left.nullable,
        )


class HRBinPlus(HRBinNumeric):
    op = "+"
    py_op = operator.add


class HRBinMinus(HRBinNumeric):
    op = "-"
    py_op = operator.sub


class HRUnNumeric(Operators.Unary):
    @classmethod
    def evaluate(cls, operand: DataComponent) -> DataComponent:  # type: ignore[override]
        result_data = cls.apply_operation_component(operand.data)
        return DataComponent(
            name=f"{cls.op}({operand.name})",
            data=result_data,
            data_type=operand.data_type,
            role=operand.role,
            nullable=operand.nullable,
        )


class HRUnPlus(HRUnNumeric):
    op = "+"
    py_op = operator.pos


class HRUnMinus(HRUnNumeric):
    op = "-"
    py_op = operator.neg


class HAAssignment(Operators.Binary):
    @classmethod
    def validate(cls, left: Dataset, right: DataComponent, hr_mode: str) -> Dataset:
        result_components = {comp_name: copy(comp) for comp_name, comp in left.components.items()}
        return Dataset(name=f"{left.name}", components=result_components, data=None)

    @classmethod
    def evaluate(  # type: ignore[override]
        cls, left: Dataset, right: DataComponent, hr_mode: str
    ) -> Dataset:
        result = cls.validate(left, right, hr_mode)
        measure_name = left.get_measures_names()[0]
        result.data = left.data.copy() if left.data is not None else pd.DataFrame()
        if right.data is not None:
            result.data[measure_name] = right.data.map(lambda x: cls.handle_mode(x, hr_mode))
        result.data = result.data[result.data[measure_name] != "REMOVE_VALUE"]
        return result

    @classmethod
    def handle_mode(cls, x: Any, hr_mode: str) -> Any:
        if not pd.isnull(x) and x == "REMOVE_VALUE":
            return "REMOVE_VALUE"
        if hr_mode == "non_null" and pd.isnull(x) or hr_mode == "non_zero" and x == 0:
            return "REMOVE_VALUE"
        return x
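
# Illustrative sketch, not from the package source: handle_mode's second test
# reads as (hr_mode == "non_null" and pd.isnull(x)) or (hr_mode == "non_zero"
# and x == 0), since `and` binds tighter than `or`. Assumes vtlengine is
# installed; inputs invented.
from vtlengine.Operators.HROperators import HAAssignment

print(HAAssignment.handle_mode(None, "non_null"))  # -> 'REMOVE_VALUE'
print(HAAssignment.handle_mode(0, "non_zero"))     # -> 'REMOVE_VALUE'
print(HAAssignment.handle_mode(0, "non_null"))     # -> 0 (kept)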


class Hierarchy(Operators.Operator):
    op = HIERARCHY

    @staticmethod
    def generate_computed_data(computed_dict: Dict[str, DataFrame]) -> DataFrame:
        list_data = list(computed_dict.values())
        df = pd.concat(list_data, axis=0)
        df.reset_index(drop=True, inplace=True)
        return df

    @classmethod
    def validate(
        cls, dataset: Dataset, computed_dict: Dict[str, DataFrame], output: str
    ) -> Dataset:
        dataset_name = VirtualCounter._new_ds_name()
        result_components = {
            comp_name: copy(comp) for comp_name, comp in dataset.components.items()
        }
        return Dataset(name=dataset_name, components=result_components, data=None)

    @classmethod
    def evaluate(
        cls, dataset: Dataset, computed_dict: Dict[str, DataFrame], output: str
    ) -> Dataset:
        result = cls.validate(dataset, computed_dict, output)
        if len(computed_dict) == 0:
            computed_data = pd.DataFrame(columns=dataset.get_components_names())
        else:
            computed_data = cls.generate_computed_data(computed_dict)
        if output == "computed":
            result.data = computed_data
            return result

        # union(setdiff(op, R), R) where R is the computed data.
        # It is the same as union(op, R) and drop duplicates, selecting the last one available
        result.data = pd.concat([dataset.data, computed_data], axis=0, ignore_index=True)
        result.data.drop_duplicates(
            subset=dataset.get_identifiers_names(), keep="last", inplace=True
        )
        result.data.reset_index(drop=True, inplace=True)
        return result
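
# Illustrative sketch, not from the package source: the union-with-override
# behaviour of Hierarchy.evaluate in plain pandas, with invented data. Rows
# from the computed data replace original rows sharing the same identifiers.
import pandas as pd

original = pd.DataFrame({"Id_1": ["A", "B"], "Me_1": [1, 2]})
computed = pd.DataFrame({"Id_1": ["B", "C"], "Me_1": [20, 30]})
out = pd.concat([original, computed], axis=0, ignore_index=True)
out = out.drop_duplicates(subset=["Id_1"], keep="last").reset_index(drop=True)
print(out)  # A keeps 1, B is overridden to 20, C is added with 30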

vtlengine/Operators/Join.py
@@ -0,0 +1,447 @@
from copy import copy
from functools import reduce
from typing import Any, Dict, List, Optional

# if os.environ.get("SPARK"):
#     import pyspark.pandas as pd
# else:
#     import pandas as pd
import pandas as pd

from vtlengine.AST import BinOp
from vtlengine.AST.Grammar.tokens import CROSS_JOIN, FULL_JOIN, INNER_JOIN, LEFT_JOIN
from vtlengine.DataTypes import binary_implicit_promotion
from vtlengine.Exceptions import SemanticError
from vtlengine.Model import Component, Dataset, Role
from vtlengine.Operators import Operator, _id_type_promotion_join_keys
from vtlengine.Utils.__Virtual_Assets import VirtualCounter


class Join(Operator):
    how: str
    reference_dataset: Dataset

    @classmethod
    def get_components_union(cls, datasets: List[Dataset]) -> List[Component]:
        common: List[Any] = []
        common.extend(
            copy(comp)
            for dataset in datasets
            for comp in dataset.components.values()
            if comp not in common
        )
        return common

    @classmethod
    def get_components_intersection(cls, operands: List[Any]) -> Any:
        element_count: Dict[str, Any] = {}
        for operand in operands:
            operand_set = set(operand)
            for element in operand_set:
                element_count[element] = element_count.get(element, 0) + 1
        result = []
        for element, count in element_count.items():
            if count >= 2:
                result.append(element)
        return result
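
# Illustrative sketch, not from the package source: despite its name, this is
# a "shared by at least two operands" test, not a strict intersection of all.
# Assumes vtlengine is installed; the component names are invented.
from vtlengine.Operators.Join import Join

print(Join.get_components_intersection([["Id_1", "Me_1"], ["Id_1", "Me_2"], ["Me_2"]]))
# -> ['Id_1', 'Me_2'] (each appears in two operands; ordering may vary)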

    @classmethod
    def merge_components(
        cls, operands: Any, using: Optional[List[str]] = None
    ) -> Dict[str, Component]:
        nullability = {}
        merged_components = {}
        using = using or []
        common = cls.get_components_intersection([op.get_components_names() for op in operands])
        totally_common = list(
            reduce(
                lambda x, y: x & set(y.get_components_names()),  # type: ignore[operator]
                operands[1:],
                set(operands[0].get_components_names()),
            )
        )

        for op in operands:
            for comp in op.components.values():
                if comp.name in using:
                    is_identifier = all(
                        operand.components[comp.name].role == Role.IDENTIFIER
                        for operand in operands
                        if comp.name in operand.get_components_names()
                    )
                    comp.role = (
                        Role.IDENTIFIER
                        if is_identifier
                        else Role.MEASURE
                        if comp.role == Role.IDENTIFIER
                        else comp.role
                    )
                if comp.name not in nullability:
                    nullability[comp.name] = copy(comp.nullable)
                if comp.role == Role.IDENTIFIER:
                    nullability[comp.name] = False
                elif comp.name in totally_common:
                    nullability[comp.name] |= copy(comp.nullable)
                elif cls.how == "outer" or (
                    cls.how == "left"
                    and comp.name not in cls.reference_dataset.get_components_names()
                ):
                    nullability[comp.name] = True
                else:
                    nullability[comp.name] = copy(comp.nullable)

        for operand in operands:
            operand_name = operand.name
            components = {comp.name: copy(comp) for comp in operand.components.values()}

            for component_name, component in components.items():
                component.nullable = nullability[component_name]

                if component_name in common and component_name not in using:
                    if component.role != Role.IDENTIFIER or cls.how == "cross":
                        new_name = f"{operand_name}#{component_name}"
                        if new_name in merged_components:
                            raise SemanticError("1-1-13-9", comp_name=new_name)
                        while new_name in common:
                            new_name += "_dup"
                        merged_components[new_name] = component
                        merged_components[new_name].name = new_name
                    else:
                        merged_components[component_name] = component
                else:
                    if component_name in using and component_name in merged_components:
                        data_type = binary_implicit_promotion(
                            merged_components[component_name].data_type,
                            component.data_type,
                        )
                        component.data_type = data_type
                    merged_components[component_name] = component

        return merged_components

    @classmethod
    def generate_result_components(
        cls, operands: List[Dataset], using: Optional[List[str]] = None
    ) -> Dict[str, Component]:
        components = {}
        inter_identifiers = cls.get_components_intersection(
            [op.get_identifiers_names() for op in operands]
        )

        for op in operands:
            ids = op.get_identifiers_names()
            for id in inter_identifiers:
                components.update({id: copy(op.components[id])} if id in ids else {})
        return components

    @classmethod
    def evaluate(cls, operands: List[Dataset], using: List[str]) -> Dataset:
        result = cls.execute([copy(operand) for operand in operands], using)
        if result.data is not None and sorted(result.get_components_names()) != sorted(
            result.data.columns.tolist()
        ):
            missing = list(set(result.get_components_names()) - set(result.data.columns.tolist()))
            if len(missing) == 0:
                missing.append("None")
            raise SemanticError("1-1-1-10", comp_name=missing[0], dataset_name=result.name)
        return result

    @classmethod
    def execute(cls, operands: List[Dataset], using: List[str]) -> Dataset:
        result = cls.validate(operands, using)
        using = using if using else []
        if len(operands) == 1:
            result.data = operands[0].data
            return result

        common_measures = cls.get_components_intersection(
            [op.get_measures_names() + op.get_attributes_names() for op in operands]
        )
        for op in operands:
            if op.data is not None:
                for column in op.data.columns.tolist():
                    if column in common_measures and column not in using:
                        op.data = op.data.rename(columns={column: op.name + "#" + column})
        result.data = copy(cls.reference_dataset.data)

        join_keys = using if using else result.get_identifiers_names()

        for op in operands:
            if op is not cls.reference_dataset:
                merge_join_keys = (
                    [key for key in join_keys if key in op.data.columns.tolist()]
                    if (op.data is not None)
                    else []
                )
                if len(merge_join_keys) == 0:
                    raise SemanticError("1-1-13-14", name=op.name)
                for join_key in merge_join_keys:
                    _id_type_promotion_join_keys(
                        result.get_component(join_key),
                        op.get_component(join_key),
                        join_key,
                        result.data,
                        op.data,
                    )
                if op.data is not None and result.data is not None:
                    result.data = pd.merge(
                        result.data,
                        op.data,
                        how=cls.how,  # type: ignore[arg-type]
                        on=merge_join_keys,
                    )
                else:
                    result.data = pd.DataFrame()
        if result.data is not None:
            result.data.reset_index(drop=True, inplace=True)
        return result
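
# Illustrative sketch, not from the package source: the rename-then-merge that
# execute performs, in plain pandas with invented names. Measures shared by
# several operands get a "<dataset>#<column>" name before merging on the keys.
import pandas as pd

d1 = pd.DataFrame({"Id_1": ["A", "B"], "Me_1": [1, 2]}).rename(columns={"Me_1": "d1#Me_1"})
d2 = pd.DataFrame({"Id_1": ["A", "B"], "Me_1": [10, 20]}).rename(columns={"Me_1": "d2#Me_1"})
print(pd.merge(d1, d2, how="inner", on=["Id_1"]))  # Id_1, d1#Me_1, d2#Me_1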

    @classmethod
    def validate(cls, operands: List[Dataset], using: Optional[List[str]]) -> Dataset:
        dataset_name = VirtualCounter._new_ds_name()
        if len(operands) < 1 or sum([isinstance(op, Dataset) for op in operands]) < 1:
            raise Exception("Join operator requires at least 1 dataset")
        if not all(isinstance(op, Dataset) for op in operands):
            raise SemanticError("1-1-13-10")
        if len(operands) == 1 and isinstance(operands[0], Dataset):
            return Dataset(name=dataset_name, components=operands[0].components, data=None)
        for op in operands:
            if len(op.get_identifiers()) == 0:
                raise SemanticError("1-2-10", op=cls.op)
        cls.reference_dataset = (
            max(operands, key=lambda x: len(x.get_identifiers_names()))
            if cls.how not in ["cross", "left"]
            else operands[0]
        )
        cls.identifiers_validation(operands, using)
        components = cls.merge_components(operands, using)
        if len(set(components.keys())) != len(components):
            raise SemanticError("1-1-13-9", comp_name="")

        return Dataset(name=dataset_name, components=components, data=None)

    @classmethod
    def identifiers_validation(cls, operands: List[Dataset], using: Optional[List[str]]) -> None:
        # (Case A)
        info = {op.name: op.get_identifiers_names() for op in operands}
        for op_name, identifiers in info.items():
            if len(identifiers) == 0:
                raise SemanticError("1-1-13-14", op=cls.op, name=op_name)

        for op_name, identifiers in info.items():
            if (
                using is None
                and op_name != cls.reference_dataset.name
                and not set(identifiers).issubset(set(info[cls.reference_dataset.name]))
            ):
                missing_components = list(set(identifiers) - set(info[cls.reference_dataset.name]))
                raise SemanticError(
                    "1-1-13-11",
                    op=cls.op,
                    dataset_reference=cls.reference_dataset.name,
                    component=missing_components[0],
                )
        if using is None:
            return

        # (Case B1)
        if cls.reference_dataset is not None:
            for op_name, identifiers in info.items():
                if op_name != cls.reference_dataset.name and not set(identifiers).issubset(using):
                    raise SemanticError("1-1-13-4", op=cls.op, using_names=using, dataset=op_name)
        reference_components = cls.reference_dataset.get_components_names()
        if not set(using).issubset(reference_components):
            raise SemanticError(
                "1-1-13-6",
                op=cls.op,
                using_components=using,
                reference=cls.reference_dataset.name,
            )

        for _, identifiers in info.items():
            if not set(using).issubset(identifiers):
                # (Case B2)
                if not set(using).issubset(reference_components):
                    raise SemanticError("1-1-13-5", op=cls.op, using_names=using)
                else:
                    for op in operands:
                        if op is not cls.reference_dataset:
                            for component in using:
                                if component not in op.get_components_names():
                                    raise SemanticError(
                                        "1-1-1-10",
                                        op=cls.op,
                                        comp_name=component,
                                        dataset_name=op.name,
                                    )


class InnerJoin(Join):
    op = INNER_JOIN
    how = "inner"

    @classmethod
    def generate_result_components(
        cls, operands: List[Dataset], using: Optional[List[str]] = None
    ) -> Dict[str, Component]:
        if using is None:
            return super().generate_result_components(operands, using)

        components = {}
        for op in operands:
            components.update(
                {id: op.components[id] for id in using if id in op.get_measures_names()}
            )
        for op in operands:
            components.update({id: op.components[id] for id in op.get_identifiers_names()})
        return components


class LeftJoin(Join):
    op = LEFT_JOIN
    how = "left"


class FullJoin(Join):
    op = FULL_JOIN
    how = "outer"

    @classmethod
    def identifiers_validation(
        cls, operands: List[Dataset], using: Optional[List[str]] = None
    ) -> None:
        if using is not None:
            raise SemanticError("1-1-13-8", op=cls.op)
        for op in operands:
            if op is cls.reference_dataset:
                continue
            if len(op.get_identifiers_names()) != len(
                cls.reference_dataset.get_identifiers_names()
            ):
                raise SemanticError("1-1-13-13", op=cls.op)
            if op.get_identifiers_names() != cls.reference_dataset.get_identifiers_names():
                raise SemanticError("1-1-13-12", op=cls.op)


class CrossJoin(Join):
    op = CROSS_JOIN
    how = "cross"

    @classmethod
    def execute(cls, operands: List[Dataset], using: Optional[List[str]] = None) -> Dataset:
        result = cls.validate(operands, using)
        if len(operands) == 1:
            result.data = operands[0].data
            return result
        common = cls.get_components_intersection([op.get_components_names() for op in operands])

        for op in operands:
            if op.data is None:
                op.data = pd.DataFrame(columns=op.get_components_names())
            if op is operands[0]:
                result.data = op.data
            else:
                if result.data is not None:
                    result.data = pd.merge(
                        result.data,
                        op.data,
                        how=cls.how,  # type: ignore[arg-type]
                    )
            if result.data is not None:
                result.data = result.data.rename(
                    columns={
                        column: op.name + "#" + column
                        for column in result.data.columns.tolist()
                        if column in common
                    }
                )
        if result.data is not None:
            result.data.reset_index(drop=True, inplace=True)
        return result
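
# Illustrative sketch, not from the package source: the cartesian product that
# CrossJoin delegates to pandas, with invented frames.
import pandas as pd

left = pd.DataFrame({"Id_1": ["A", "B"]})
right = pd.DataFrame({"Id_2": [1, 2]})
print(pd.merge(left, right, how="cross"))  # 4 rows: (A,1), (A,2), (B,1), (B,2)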

    @classmethod
    def identifiers_validation(
        cls, operands: List[Dataset], using: Optional[List[str]] = None
    ) -> None:
        if using is not None:
            raise SemanticError("1-1-13-8", op=cls.op)


class Apply(Operator):
    @classmethod
    def evaluate(cls, dataset: Dataset, expression: Any, op_map: Dict[str, Any]) -> Dataset:
        for child in expression:
            dataset = cls.execute(dataset, op_map[child.op], child.left.value, child.right.value)
        return dataset

    @classmethod
    def execute(cls, dataset: Dataset, op: Any, left: str, right: str) -> Dataset:
        left_dataset = cls.create_dataset("left", left, dataset)
        right_dataset = cls.create_dataset("right", right, dataset)
        left_dataset, right_dataset = cls.get_common_components(left_dataset, right_dataset)
        return op.evaluate(left_dataset, right_dataset)

    @classmethod
    def validate(cls, dataset: Dataset, child: Any, op_map: Dict[str, Any]) -> None:
        if not isinstance(child, BinOp):
            raise Exception(
                f"Invalid expression {child} on apply operator. Only BinOp are accepted"
            )
        if child.op not in op_map:
            raise Exception(f"Operator {child.op} not implemented")
        if hasattr(child.left, "value") and hasattr(child.right, "value"):
            left_components = [
                comp.name[len(child.left.value) + 1]
                for comp in dataset.components.values()
                if comp.name.startswith(child.left.value)
            ]
            right_components = [
                comp.name[len(child.right.value) + 1]
                for comp in dataset.components.values()
                if comp.name.startswith(child.right.value)
            ]
            if len(set(left_components) & set(right_components)) == 0:
                raise Exception(
                    f"{child.left.value} and {child.right.value} "
                    f"has not any match on dataset components"
                )

    @classmethod
    def create_dataset(cls, name: str, prefix: str, dataset: Dataset) -> Dataset:
        prefix += "#"
        components = {
            component.name: component
            for component in dataset.components.values()
            if component.name.startswith(prefix) or component.role is Role.IDENTIFIER
        }
        data = dataset.data[list(components.keys())] if dataset.data is not None else pd.DataFrame()

        for component in components.values():
            component.name = (
                component.name[len(prefix) :]
                if (component.name.startswith(prefix) and component.role is not Role.IDENTIFIER)
                else component.name
            )
        components = {component.name: component for component in components.values()}
        data.rename(
            columns={
                column: column[len(prefix) :]
                for column in data.columns
                if column.startswith(prefix)
            },
            inplace=True,
        )
        return Dataset(name=name, components=components, data=data)
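
# Illustrative sketch, not from the package source: the prefix filtering and
# stripping that create_dataset applies, in plain pandas with invented names.
import pandas as pd

df = pd.DataFrame({"Id_1": ["A"], "left#Me_1": [1], "right#Me_1": [2]})
prefix = "left#"
df = df[[c for c in df.columns if c.startswith(prefix) or c == "Id_1"]]
df = df.rename(columns={c: c[len(prefix):] for c in df.columns if c.startswith(prefix)})
print(df)  # columns: Id_1, Me_1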

    @classmethod
    def get_common_components(cls, left: Dataset, right: Dataset) -> (Dataset, Dataset):  # type: ignore[syntax]
        common = set(left.get_components_names()) & set(right.get_components_names())
        left.components = {
            comp.name: comp for comp in left.components.values() if comp.name in common
        }
        right.components = {
            comp.name: comp for comp in right.components.values() if comp.name in common
        }
        left.data = left.data[list(common)] if left.data is not None else pd.DataFrame()
        right.data = right.data[list(common)] if right.data is not None else pd.DataFrame()
        return left, right