vtlengine-1.4.0rc2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66)
  1. vtlengine/API/_InternalApi.py +791 -0
  2. vtlengine/API/__init__.py +612 -0
  3. vtlengine/API/data/schema/external_routines_schema.json +34 -0
  4. vtlengine/API/data/schema/json_schema_2.1.json +116 -0
  5. vtlengine/API/data/schema/value_domain_schema.json +97 -0
  6. vtlengine/AST/ASTComment.py +57 -0
  7. vtlengine/AST/ASTConstructor.py +598 -0
  8. vtlengine/AST/ASTConstructorModules/Expr.py +1928 -0
  9. vtlengine/AST/ASTConstructorModules/ExprComponents.py +995 -0
  10. vtlengine/AST/ASTConstructorModules/Terminals.py +790 -0
  11. vtlengine/AST/ASTConstructorModules/__init__.py +50 -0
  12. vtlengine/AST/ASTDataExchange.py +10 -0
  13. vtlengine/AST/ASTEncoders.py +32 -0
  14. vtlengine/AST/ASTString.py +675 -0
  15. vtlengine/AST/ASTTemplate.py +558 -0
  16. vtlengine/AST/ASTVisitor.py +25 -0
  17. vtlengine/AST/DAG/__init__.py +479 -0
  18. vtlengine/AST/DAG/_words.py +10 -0
  19. vtlengine/AST/Grammar/Vtl.g4 +705 -0
  20. vtlengine/AST/Grammar/VtlTokens.g4 +409 -0
  21. vtlengine/AST/Grammar/__init__.py +0 -0
  22. vtlengine/AST/Grammar/lexer.py +2139 -0
  23. vtlengine/AST/Grammar/parser.py +16597 -0
  24. vtlengine/AST/Grammar/tokens.py +169 -0
  25. vtlengine/AST/VtlVisitor.py +824 -0
  26. vtlengine/AST/__init__.py +674 -0
  27. vtlengine/DataTypes/TimeHandling.py +562 -0
  28. vtlengine/DataTypes/__init__.py +863 -0
  29. vtlengine/DataTypes/_time_checking.py +135 -0
  30. vtlengine/Exceptions/__exception_file_generator.py +96 -0
  31. vtlengine/Exceptions/__init__.py +159 -0
  32. vtlengine/Exceptions/messages.py +1004 -0
  33. vtlengine/Interpreter/__init__.py +2048 -0
  34. vtlengine/Model/__init__.py +501 -0
  35. vtlengine/Operators/Aggregation.py +357 -0
  36. vtlengine/Operators/Analytic.py +455 -0
  37. vtlengine/Operators/Assignment.py +23 -0
  38. vtlengine/Operators/Boolean.py +106 -0
  39. vtlengine/Operators/CastOperator.py +451 -0
  40. vtlengine/Operators/Clause.py +366 -0
  41. vtlengine/Operators/Comparison.py +488 -0
  42. vtlengine/Operators/Conditional.py +495 -0
  43. vtlengine/Operators/General.py +191 -0
  44. vtlengine/Operators/HROperators.py +254 -0
  45. vtlengine/Operators/Join.py +447 -0
  46. vtlengine/Operators/Numeric.py +422 -0
  47. vtlengine/Operators/RoleSetter.py +77 -0
  48. vtlengine/Operators/Set.py +176 -0
  49. vtlengine/Operators/String.py +578 -0
  50. vtlengine/Operators/Time.py +1144 -0
  51. vtlengine/Operators/Validation.py +275 -0
  52. vtlengine/Operators/__init__.py +900 -0
  53. vtlengine/Utils/__Virtual_Assets.py +34 -0
  54. vtlengine/Utils/__init__.py +479 -0
  55. vtlengine/__extras_check.py +17 -0
  56. vtlengine/__init__.py +27 -0
  57. vtlengine/files/__init__.py +0 -0
  58. vtlengine/files/output/__init__.py +35 -0
  59. vtlengine/files/output/_time_period_representation.py +55 -0
  60. vtlengine/files/parser/__init__.py +240 -0
  61. vtlengine/files/parser/_rfc_dialect.py +22 -0
  62. vtlengine/py.typed +0 -0
  63. vtlengine-1.4.0rc2.dist-info/METADATA +89 -0
  64. vtlengine-1.4.0rc2.dist-info/RECORD +66 -0
  65. vtlengine-1.4.0rc2.dist-info/WHEEL +4 -0
  66. vtlengine-1.4.0rc2.dist-info/licenses/LICENSE.md +661 -0
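
vtlengine/Operators/HROperators.py +254 -0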
@@ -0,0 +1,254 @@
+ import operator
+ from copy import copy
+ from typing import Any, Dict
+
+ import pandas as pd
+ from pandas import DataFrame
+
+ import vtlengine.Operators as Operators
+ from vtlengine.AST.Grammar.tokens import HIERARCHY
+ from vtlengine.DataTypes import Boolean, Number
+ from vtlengine.Model import Component, DataComponent, Dataset, Role
+ from vtlengine.Utils.__Virtual_Assets import VirtualCounter
+
+
+ def get_measure_from_dataset(dataset: Dataset, code_item: str) -> DataComponent:
+     measure_name = dataset.get_measures_names()[0]
+     data = None if dataset.data is None else dataset.data[measure_name]
+     return DataComponent(
+         name=code_item,
+         data=data,
+         data_type=dataset.components[measure_name].data_type,
+         role=dataset.components[measure_name].role,
+         nullable=dataset.components[measure_name].nullable,
+     )
+
+
+ class HRComparison(Operators.Binary):
+     @classmethod
+     def imbalance_func(cls, x: Any, y: Any) -> Any:
+         return None if pd.isnull(x) or pd.isnull(y) else x - y
+
+     @staticmethod
+     def hr_func(left_series: Any, right_series: Any, hr_mode: str) -> Any:
+         result = pd.Series(True, index=left_series.index)
+         mask_remove = None
+         mask_null = None
+
+         if hr_mode in ("partial_null", "partial_zero"):
+             mask_remove = (right_series == "REMOVE_VALUE") & (right_series.notnull())
+             if hr_mode == "partial_null":
+                 mask_null = mask_remove & left_series.notnull()
+             else:
+                 mask_null = mask_remove & (left_series != 0)
+         elif hr_mode == "non_null":
+             mask_remove = left_series.isnull() | right_series.isnull()
+         elif hr_mode == "non_zero":
+             mask_remove = (left_series == 0) & (right_series == 0)
+
+         if mask_remove is not None:
+             # Adding ignore here because mypy cannot infer the typing of setting values in a Series
+             result[mask_remove] = "REMOVE_VALUE"  # type: ignore[call-overload, unused-ignore]
+         if mask_null is not None:
+             result[mask_null] = None
+         return result
+
+     @classmethod
+     def apply_hr_func(cls, left_series: Any, right_series: Any, hr_mode: str, func: Any) -> Any:
+         # To avoid applying the function to the whole series, we align the two series
+         # and apply the function only to the valid values, based on a validation mask.
+         # The function is applied to the aligned series and the result is combined
+         # with the original series.
+         left_series, right_series = left_series.align(right_series)
+         remove_result = cls.hr_func(left_series, right_series, hr_mode)
+         mask_valid = remove_result == True
+         result = pd.Series(remove_result, index=left_series.index)
+         result.loc[mask_valid] = left_series[mask_valid].combine(right_series[mask_valid], func)
+         return result
+
+     @classmethod
+     def validate(cls, left_operand: Dataset, right_operand: DataComponent, hr_mode: str) -> Dataset:
+         result_components = {
+             comp_name: copy(comp)
+             for comp_name, comp in left_operand.components.items()
+             if comp.role == Role.IDENTIFIER
+         }
+         result_components["bool_var"] = Component(
+             name="bool_var", data_type=Boolean, role=Role.MEASURE, nullable=True
+         )
+         result_components["imbalance"] = Component(
+             name="imbalance", data_type=Number, role=Role.MEASURE, nullable=True
+         )
+         return Dataset(
+             name=f"{left_operand.name}{cls.op}{right_operand.name}",
+             components=result_components,
+             data=None,
+         )
+
+     @classmethod
+     def evaluate(cls, left: Dataset, right: DataComponent, hr_mode: str) -> Dataset:  # type: ignore[override]
+         result = cls.validate(left, right, hr_mode)
+         result.data = left.data.copy() if left.data is not None else pd.DataFrame()
+         measure_name = left.get_measures_names()[0]
+
+         if left.data is not None and right.data is not None:
+             result.data["bool_var"] = cls.apply_hr_func(
+                 left.data[measure_name], right.data, hr_mode, cls.op_func
+             )
+             result.data["imbalance"] = cls.apply_hr_func(
+                 left.data[measure_name], right.data, hr_mode, cls.imbalance_func
+             )
+
+         # Removing datapoints that should not be returned
+         # (done after the imbalance calculation
+         # to avoid shape-mismatch errors)
+         result.data = result.data[result.data["bool_var"] != "REMOVE_VALUE"]
+         result.data.drop(measure_name, axis=1, inplace=True)
+         return result
+
+
+ class HREqual(HRComparison):
+     op = "="
+     py_op = operator.eq
+
+
+ class HRGreater(HRComparison):
+     op = ">"
+     py_op = operator.gt
+
+
+ class HRGreaterEqual(HRComparison):
+     op = ">="
+     py_op = operator.ge
+
+
+ class HRLess(HRComparison):
+     op = "<"
+     py_op = operator.lt
+
+
+ class HRLessEqual(HRComparison):
+     op = "<="
+     py_op = operator.le
+
+
+ class HRBinNumeric(Operators.Binary):
+     @classmethod
+     def op_func(cls, x: Any, y: Any) -> Any:
+         if not pd.isnull(x) and x == "REMOVE_VALUE":
+             return "REMOVE_VALUE"
+         return super().op_func(x, y)
+
+     @classmethod
+     def evaluate(cls, left: DataComponent, right: DataComponent) -> DataComponent:
+         result_data = cls.apply_operation_two_series(left.data, right.data)
+         return DataComponent(
+             name=f"{left.name}{cls.op}{right.name}",
+             data=result_data,
+             data_type=left.data_type,
+             role=left.role,
+             nullable=left.nullable,
+         )
+
+
+ class HRBinPlus(HRBinNumeric):
+     op = "+"
+     py_op = operator.add
+
+
+ class HRBinMinus(HRBinNumeric):
+     op = "-"
+     py_op = operator.sub
+
+
+ class HRUnNumeric(Operators.Unary):
+     @classmethod
+     def evaluate(cls, operand: DataComponent) -> DataComponent:  # type: ignore[override]
+         result_data = cls.apply_operation_component(operand.data)
+         return DataComponent(
+             name=f"{cls.op}({operand.name})",
+             data=result_data,
+             data_type=operand.data_type,
+             role=operand.role,
+             nullable=operand.nullable,
+         )
+
+
+ class HRUnPlus(HRUnNumeric):
+     op = "+"
+     py_op = operator.pos
+
+
+ class HRUnMinus(HRUnNumeric):
+     op = "-"
+     py_op = operator.neg
+
+
+ class HAAssignment(Operators.Binary):
+     @classmethod
+     def validate(cls, left: Dataset, right: DataComponent, hr_mode: str) -> Dataset:
+         result_components = {comp_name: copy(comp) for comp_name, comp in left.components.items()}
+         return Dataset(name=f"{left.name}", components=result_components, data=None)
+
+     @classmethod
+     def evaluate(  # type: ignore[override]
+         cls, left: Dataset, right: DataComponent, hr_mode: str
+     ) -> Dataset:
+         result = cls.validate(left, right, hr_mode)
+         measure_name = left.get_measures_names()[0]
+         result.data = left.data.copy() if left.data is not None else pd.DataFrame()
+         if right.data is not None:
+             result.data[measure_name] = right.data.map(lambda x: cls.handle_mode(x, hr_mode))
+         result.data = result.data[result.data[measure_name] != "REMOVE_VALUE"]
+         return result
+
+     @classmethod
+     def handle_mode(cls, x: Any, hr_mode: str) -> Any:
+         if not pd.isnull(x) and x == "REMOVE_VALUE":
+             return "REMOVE_VALUE"
+         if (hr_mode == "non_null" and pd.isnull(x)) or (hr_mode == "non_zero" and x == 0):
+             return "REMOVE_VALUE"
+         return x
+
+
+ class Hierarchy(Operators.Operator):
+     op = HIERARCHY
+
+     @staticmethod
+     def generate_computed_data(computed_dict: Dict[str, DataFrame]) -> DataFrame:
+         list_data = list(computed_dict.values())
+         df = pd.concat(list_data, axis=0)
+         df.reset_index(drop=True, inplace=True)
+         return df
+
+     @classmethod
+     def validate(
+         cls, dataset: Dataset, computed_dict: Dict[str, DataFrame], output: str
+     ) -> Dataset:
+         dataset_name = VirtualCounter._new_ds_name()
+         result_components = {
+             comp_name: copy(comp) for comp_name, comp in dataset.components.items()
+         }
+         return Dataset(name=dataset_name, components=result_components, data=None)
+
+     @classmethod
+     def evaluate(
+         cls, dataset: Dataset, computed_dict: Dict[str, DataFrame], output: str
+     ) -> Dataset:
+         result = cls.validate(dataset, computed_dict, output)
+         if len(computed_dict) == 0:
+             computed_data = pd.DataFrame(columns=dataset.get_components_names())
+         else:
+             computed_data = cls.generate_computed_data(computed_dict)
+         if output == "computed":
+             result.data = computed_data
+             return result
+
+         # union(setdiff(op, R), R), where R is the computed data.
+         # This is equivalent to union(op, R) followed by dropping duplicates,
+         # keeping the last occurrence available.
+         result.data = pd.concat([dataset.data, computed_data], axis=0, ignore_index=True)
+         result.data.drop_duplicates(
+             subset=dataset.get_identifiers_names(), keep="last", inplace=True
+         )
+         result.data.reset_index(drop=True, inplace=True)
+         return result
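
HRComparison.apply_hr_func above follows an align-mask-combine pattern: the two series are aligned on the union of their indexes, hr_func builds a validation mask marking rows to drop or nullify, and the comparison runs only on the rows that survive. A minimal sketch of the same pattern in plain pandas, independent of vtlengine (the index labels, values, and the choice of "non_null" mode are illustrative):

import operator

import pandas as pd

# Left dataset measure and right-hand computed component, partially overlapping.
left = pd.Series([10.0, 5.0, None], index=["A", "B", "C"])
right = pd.Series([10.0, 7.0], index=["A", "B"])

# Step 1: align on the union of indexes (missing positions become NaN).
left, right = left.align(right)

# Step 2: validation mask. In "non_null" mode, a null on either side
# marks the row for removal.
result = pd.Series(True, index=left.index, dtype=object)
result[left.isnull() | right.isnull()] = "REMOVE_VALUE"

# Step 3: apply the comparison only where the row is still valid.
mask_valid = result == True
result.loc[mask_valid] = left[mask_valid].combine(right[mask_valid], operator.eq)

print(result.to_dict())  # {'A': True, 'B': False, 'C': 'REMOVE_VALUE'}
# Row B would additionally yield imbalance 5.0 - 7.0 = -2.0 via imbalance_func.

vtlengine/Operators/Join.py +447 -0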
@@ -0,0 +1,447 @@
+ from copy import copy
+ from functools import reduce
+ from typing import Any, Dict, List, Optional, Tuple
+
+ # if os.environ.get("SPARK"):
+ #     import pyspark.pandas as pd
+ # else:
+ #     import pandas as pd
+ import pandas as pd
+
+ from vtlengine.AST import BinOp
+ from vtlengine.AST.Grammar.tokens import CROSS_JOIN, FULL_JOIN, INNER_JOIN, LEFT_JOIN
+ from vtlengine.DataTypes import binary_implicit_promotion
+ from vtlengine.Exceptions import SemanticError
+ from vtlengine.Model import Component, Dataset, Role
+ from vtlengine.Operators import Operator, _id_type_promotion_join_keys
+ from vtlengine.Utils.__Virtual_Assets import VirtualCounter
+
+
+ class Join(Operator):
+     how: str
+     reference_dataset: Dataset
+
+     @classmethod
+     def get_components_union(cls, datasets: List[Dataset]) -> List[Component]:
+         common: List[Any] = []
+         common.extend(
+             copy(comp)
+             for dataset in datasets
+             for comp in dataset.components.values()
+             if comp not in common
+         )
+         return common
+
+     @classmethod
+     def get_components_intersection(cls, operands: List[Any]) -> Any:
+         element_count: Dict[str, Any] = {}
+         for operand in operands:
+             operand_set = set(operand)
+             for element in operand_set:
+                 element_count[element] = element_count.get(element, 0) + 1
+         result = []
+         for element, count in element_count.items():
+             if count >= 2:
+                 result.append(element)
+         return result
+
+     @classmethod
+     def merge_components(
+         cls, operands: Any, using: Optional[List[str]] = None
+     ) -> Dict[str, Component]:
+         nullability = {}
+         merged_components = {}
+         using = using or []
+         common = cls.get_components_intersection([op.get_components_names() for op in operands])
+         totally_common = list(
+             reduce(
+                 lambda x, y: x & set(y.get_components_names()),  # type: ignore[operator]
+                 operands[1:],
+                 set(operands[0].get_components_names()),
+             )
+         )
+
+         for op in operands:
+             for comp in op.components.values():
+                 if comp.name in using:
+                     is_identifier = all(
+                         operand.components[comp.name].role == Role.IDENTIFIER
+                         for operand in operands
+                         if comp.name in operand.get_components_names()
+                     )
+                     comp.role = (
+                         Role.IDENTIFIER
+                         if is_identifier
+                         else Role.MEASURE
+                         if comp.role == Role.IDENTIFIER
+                         else comp.role
+                     )
+                 if comp.name not in nullability:
+                     nullability[comp.name] = copy(comp.nullable)
+                     if comp.role == Role.IDENTIFIER:
+                         nullability[comp.name] = False
+                 elif comp.name in totally_common:
+                     nullability[comp.name] |= copy(comp.nullable)
+                 elif cls.how == "outer" or (
+                     cls.how == "left"
+                     and comp.name not in cls.reference_dataset.get_components_names()
+                 ):
+                     nullability[comp.name] = True
+                 else:
+                     nullability[comp.name] = copy(comp.nullable)
+
+         for operand in operands:
+             operand_name = operand.name
+             components = {comp.name: copy(comp) for comp in operand.components.values()}
+
+             for component_name, component in components.items():
+                 component.nullable = nullability[component_name]
+
+                 if component_name in common and component_name not in using:
+                     if component.role != Role.IDENTIFIER or cls.how == "cross":
+                         new_name = f"{operand_name}#{component_name}"
+                         if new_name in merged_components:
+                             raise SemanticError("1-1-13-9", comp_name=new_name)
+                         while new_name in common:
+                             new_name += "_dup"
+                         merged_components[new_name] = component
+                         merged_components[new_name].name = new_name
+                     else:
+                         merged_components[component_name] = component
+                 else:
+                     if component_name in using and component_name in merged_components:
+                         data_type = binary_implicit_promotion(
+                             merged_components[component_name].data_type,
+                             component.data_type,
+                         )
+                         component.data_type = data_type
+                     merged_components[component_name] = component
+
+         return merged_components
+
+     @classmethod
+     def generate_result_components(
+         cls, operands: List[Dataset], using: Optional[List[str]] = None
+     ) -> Dict[str, Component]:
+         components = {}
+         inter_identifiers = cls.get_components_intersection(
+             [op.get_identifiers_names() for op in operands]
+         )
+
+         for op in operands:
+             ids = op.get_identifiers_names()
+             for id in inter_identifiers:
+                 components.update({id: copy(op.components[id])} if id in ids else {})
+         return components
+
+     @classmethod
+     def evaluate(cls, operands: List[Dataset], using: List[str]) -> Dataset:
+         result = cls.execute([copy(operand) for operand in operands], using)
+         if result.data is not None and sorted(result.get_components_names()) != sorted(
+             result.data.columns.tolist()
+         ):
+             missing = list(set(result.get_components_names()) - set(result.data.columns.tolist()))
+             if len(missing) == 0:
+                 missing.append("None")
+             raise SemanticError("1-1-1-10", comp_name=missing[0], dataset_name=result.name)
+         return result
+
+     @classmethod
+     def execute(cls, operands: List[Dataset], using: List[str]) -> Dataset:
+         result = cls.validate(operands, using)
+         using = using if using else []
+         if len(operands) == 1:
+             result.data = operands[0].data
+             return result
+
+         common_measures = cls.get_components_intersection(
+             [op.get_measures_names() + op.get_attributes_names() for op in operands]
+         )
+         for op in operands:
+             if op.data is not None:
+                 for column in op.data.columns.tolist():
+                     if column in common_measures and column not in using:
+                         op.data = op.data.rename(columns={column: op.name + "#" + column})
+         result.data = copy(cls.reference_dataset.data)
+
+         join_keys = using if using else result.get_identifiers_names()
+
+         for op in operands:
+             if op is not cls.reference_dataset:
+                 merge_join_keys = (
+                     [key for key in join_keys if key in op.data.columns.tolist()]
+                     if (op.data is not None)
+                     else []
+                 )
+                 if len(merge_join_keys) == 0:
+                     raise SemanticError("1-1-13-14", name=op.name)
+                 for join_key in merge_join_keys:
+                     _id_type_promotion_join_keys(
+                         result.get_component(join_key),
+                         op.get_component(join_key),
+                         join_key,
+                         result.data,
+                         op.data,
+                     )
+                 if op.data is not None and result.data is not None:
+                     result.data = pd.merge(
+                         result.data,
+                         op.data,
+                         how=cls.how,  # type: ignore[arg-type]
+                         on=merge_join_keys,
+                     )
+                 else:
+                     result.data = pd.DataFrame()
+         if result.data is not None:
+             result.data.reset_index(drop=True, inplace=True)
+         return result
+
+     @classmethod
+     def validate(cls, operands: List[Dataset], using: Optional[List[str]]) -> Dataset:
+         dataset_name = VirtualCounter._new_ds_name()
+         if len(operands) < 1 or sum([isinstance(op, Dataset) for op in operands]) < 1:
+             raise Exception("Join operator requires at least 1 dataset")
+         if not all(isinstance(op, Dataset) for op in operands):
+             raise SemanticError("1-1-13-10")
+         if len(operands) == 1 and isinstance(operands[0], Dataset):
+             return Dataset(name=dataset_name, components=operands[0].components, data=None)
+         for op in operands:
+             if len(op.get_identifiers()) == 0:
+                 raise SemanticError("1-2-10", op=cls.op)
+         cls.reference_dataset = (
+             max(operands, key=lambda x: len(x.get_identifiers_names()))
+             if cls.how not in ["cross", "left"]
+             else operands[0]
+         )
+         cls.identifiers_validation(operands, using)
+         components = cls.merge_components(operands, using)
+         if len(set(components.keys())) != len(components):
+             raise SemanticError("1-1-13-9", comp_name="")
+
+         return Dataset(name=dataset_name, components=components, data=None)
+
+     @classmethod
+     def identifiers_validation(cls, operands: List[Dataset], using: Optional[List[str]]) -> None:
+         # (Case A)
+         info = {op.name: op.get_identifiers_names() for op in operands}
+         for op_name, identifiers in info.items():
+             if len(identifiers) == 0:
+                 raise SemanticError("1-1-13-14", op=cls.op, name=op_name)
+
+         for op_name, identifiers in info.items():
+             if (
+                 using is None
+                 and op_name != cls.reference_dataset.name
+                 and not set(identifiers).issubset(set(info[cls.reference_dataset.name]))
+             ):
+                 missing_components = list(set(identifiers) - set(info[cls.reference_dataset.name]))
+                 raise SemanticError(
+                     "1-1-13-11",
+                     op=cls.op,
+                     dataset_reference=cls.reference_dataset.name,
+                     component=missing_components[0],
+                 )
+         if using is None:
+             return
+
+         # (Case B1)
+         if cls.reference_dataset is not None:
+             for op_name, identifiers in info.items():
+                 if op_name != cls.reference_dataset.name and not set(identifiers).issubset(using):
+                     raise SemanticError("1-1-13-4", op=cls.op, using_names=using, dataset=op_name)
+         reference_components = cls.reference_dataset.get_components_names()
+         if not set(using).issubset(reference_components):
+             raise SemanticError(
+                 "1-1-13-6",
+                 op=cls.op,
+                 using_components=using,
+                 reference=cls.reference_dataset.name,
+             )
+
+         for _, identifiers in info.items():
+             if not set(using).issubset(identifiers):
+                 # (Case B2)
+                 if not set(using).issubset(reference_components):
+                     raise SemanticError("1-1-13-5", op=cls.op, using_names=using)
+                 else:
+                     for op in operands:
+                         if op is not cls.reference_dataset:
+                             for component in using:
+                                 if component not in op.get_components_names():
+                                     raise SemanticError(
+                                         "1-1-1-10",
+                                         op=cls.op,
+                                         comp_name=component,
+                                         dataset_name=op.name,
+                                     )
+
+
+ class InnerJoin(Join):
+     op = INNER_JOIN
+     how = "inner"
+
+     @classmethod
+     def generate_result_components(
+         cls, operands: List[Dataset], using: Optional[List[str]] = None
+     ) -> Dict[str, Component]:
+         if using is None:
+             return super().generate_result_components(operands, using)
+
+         components = {}
+         for op in operands:
+             components.update(
+                 {id: op.components[id] for id in using if id in op.get_measures_names()}
+             )
+         for op in operands:
+             components.update({id: op.components[id] for id in op.get_identifiers_names()})
+         return components
+
+
+ class LeftJoin(Join):
+     op = LEFT_JOIN
+     how = "left"
+
+
+ class FullJoin(Join):
+     op = FULL_JOIN
+     how = "outer"
+
+     @classmethod
+     def identifiers_validation(
+         cls, operands: List[Dataset], using: Optional[List[str]] = None
+     ) -> None:
+         if using is not None:
+             raise SemanticError("1-1-13-8", op=cls.op)
+         for op in operands:
+             if op is cls.reference_dataset:
+                 continue
+             if len(op.get_identifiers_names()) != len(
+                 cls.reference_dataset.get_identifiers_names()
+             ):
+                 raise SemanticError("1-1-13-13", op=cls.op)
+             if op.get_identifiers_names() != cls.reference_dataset.get_identifiers_names():
+                 raise SemanticError("1-1-13-12", op=cls.op)
+
+
+ class CrossJoin(Join):
+     op = CROSS_JOIN
+     how = "cross"
+
+     @classmethod
+     def execute(cls, operands: List[Dataset], using: Optional[List[str]] = None) -> Dataset:
+         result = cls.validate(operands, using)
+         if len(operands) == 1:
+             result.data = operands[0].data
+             return result
+         common = cls.get_components_intersection([op.get_components_names() for op in operands])
+
+         for op in operands:
+             if op.data is None:
+                 op.data = pd.DataFrame(columns=op.get_components_names())
+             if op is operands[0]:
+                 result.data = op.data
+             else:
+                 if result.data is not None:
+                     result.data = pd.merge(
+                         result.data,
+                         op.data,
+                         how=cls.how,  # type: ignore[arg-type]
+                     )
+             if result.data is not None:
+                 result.data = result.data.rename(
+                     columns={
+                         column: op.name + "#" + column
+                         for column in result.data.columns.tolist()
+                         if column in common
+                     }
+                 )
+         if result.data is not None:
+             result.data.reset_index(drop=True, inplace=True)
+         return result
+
+     @classmethod
+     def identifiers_validation(
+         cls, operands: List[Dataset], using: Optional[List[str]] = None
+     ) -> None:
+         if using is not None:
+             raise SemanticError("1-1-13-8", op=cls.op)
+
+
+ class Apply(Operator):
+     @classmethod
+     def evaluate(cls, dataset: Dataset, expression: Any, op_map: Dict[str, Any]) -> Dataset:
+         for child in expression:
+             dataset = cls.execute(dataset, op_map[child.op], child.left.value, child.right.value)
+         return dataset
+
+     @classmethod
+     def execute(cls, dataset: Dataset, op: Any, left: str, right: str) -> Dataset:
+         left_dataset = cls.create_dataset("left", left, dataset)
+         right_dataset = cls.create_dataset("right", right, dataset)
+         left_dataset, right_dataset = cls.get_common_components(left_dataset, right_dataset)
+         return op.evaluate(left_dataset, right_dataset)
+
+     @classmethod
+     def validate(cls, dataset: Dataset, child: Any, op_map: Dict[str, Any]) -> None:
+         if not isinstance(child, BinOp):
+             raise Exception(
+                 f"Invalid expression {child} on apply operator. Only BinOp is accepted"
+             )
+         if child.op not in op_map:
+             raise Exception(f"Operator {child.op} not implemented")
+         if hasattr(child.left, "value") and hasattr(child.right, "value"):
+             left_components = [
+                 comp.name[len(child.left.value) + 1 :]
+                 for comp in dataset.components.values()
+                 if comp.name.startswith(child.left.value)
+             ]
+             right_components = [
+                 comp.name[len(child.right.value) + 1 :]
+                 for comp in dataset.components.values()
+                 if comp.name.startswith(child.right.value)
+             ]
+             if len(set(left_components) & set(right_components)) == 0:
+                 raise Exception(
+                     f"{child.left.value} and {child.right.value} "
+                     f"do not share any components in the dataset"
+                 )
+
+     @classmethod
+     def create_dataset(cls, name: str, prefix: str, dataset: Dataset) -> Dataset:
+         prefix += "#"
+         components = {
+             component.name: component
+             for component in dataset.components.values()
+             if component.name.startswith(prefix) or component.role is Role.IDENTIFIER
+         }
+         data = dataset.data[list(components.keys())] if dataset.data is not None else pd.DataFrame()
+
+         for component in components.values():
+             component.name = (
+                 component.name[len(prefix) :]
+                 if (component.name.startswith(prefix) and component.role is not Role.IDENTIFIER)
+                 else component.name
+             )
+         components = {component.name: component for component in components.values()}
+         data.rename(
+             columns={
+                 column: column[len(prefix) :]
+                 for column in data.columns
+                 if column.startswith(prefix)
+             },
+             inplace=True,
+         )
+         return Dataset(name=name, components=components, data=data)
+
+     @classmethod
+     def get_common_components(cls, left: Dataset, right: Dataset) -> Tuple[Dataset, Dataset]:
+         common = set(left.get_components_names()) & set(right.get_components_names())
+         left.components = {
+             comp.name: comp for comp in left.components.values() if comp.name in common
+         }
+         right.components = {
+             comp.name: comp for comp in right.components.values() if comp.name in common
+         }
+         left.data = left.data[list(common)] if left.data is not None else pd.DataFrame()
+         right.data = right.data[list(common)] if right.data is not None else pd.DataFrame()
+         return left, right
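
Join.get_components_intersection above counts a component as common when it appears in at least two operands, whereas totally_common in merge_components requires presence in every operand (a plain set intersection). A standalone sketch contrasting the two notions, with hypothetical component names:

from functools import reduce

ds1 = ["Id_1", "Id_2", "Me_1"]
ds2 = ["Id_1", "Id_2", "Me_2"]
ds3 = ["Id_1", "Me_1", "Me_3"]

# "common": present in at least two operands (get_components_intersection).
counts = {}
for names in (ds1, ds2, ds3):
    for name in set(names):
        counts[name] = counts.get(name, 0) + 1
common = [name for name, count in counts.items() if count >= 2]

# "totally_common": present in every operand (merge_components).
totally_common = reduce(lambda acc, names: acc & set(names), (ds2, ds3), set(ds1))

print(sorted(common))          # ['Id_1', 'Id_2', 'Me_1']
print(sorted(totally_common))  # ['Id_1']

During a join, only the names in common (and outside any using clause) are renamed to "operand#component"; a collision among the renamed components raises SemanticError 1-1-13-9.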