vtlengine 1.4.0rc2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vtlengine/API/_InternalApi.py +791 -0
- vtlengine/API/__init__.py +612 -0
- vtlengine/API/data/schema/external_routines_schema.json +34 -0
- vtlengine/API/data/schema/json_schema_2.1.json +116 -0
- vtlengine/API/data/schema/value_domain_schema.json +97 -0
- vtlengine/AST/ASTComment.py +57 -0
- vtlengine/AST/ASTConstructor.py +598 -0
- vtlengine/AST/ASTConstructorModules/Expr.py +1928 -0
- vtlengine/AST/ASTConstructorModules/ExprComponents.py +995 -0
- vtlengine/AST/ASTConstructorModules/Terminals.py +790 -0
- vtlengine/AST/ASTConstructorModules/__init__.py +50 -0
- vtlengine/AST/ASTDataExchange.py +10 -0
- vtlengine/AST/ASTEncoders.py +32 -0
- vtlengine/AST/ASTString.py +675 -0
- vtlengine/AST/ASTTemplate.py +558 -0
- vtlengine/AST/ASTVisitor.py +25 -0
- vtlengine/AST/DAG/__init__.py +479 -0
- vtlengine/AST/DAG/_words.py +10 -0
- vtlengine/AST/Grammar/Vtl.g4 +705 -0
- vtlengine/AST/Grammar/VtlTokens.g4 +409 -0
- vtlengine/AST/Grammar/__init__.py +0 -0
- vtlengine/AST/Grammar/lexer.py +2139 -0
- vtlengine/AST/Grammar/parser.py +16597 -0
- vtlengine/AST/Grammar/tokens.py +169 -0
- vtlengine/AST/VtlVisitor.py +824 -0
- vtlengine/AST/__init__.py +674 -0
- vtlengine/DataTypes/TimeHandling.py +562 -0
- vtlengine/DataTypes/__init__.py +863 -0
- vtlengine/DataTypes/_time_checking.py +135 -0
- vtlengine/Exceptions/__exception_file_generator.py +96 -0
- vtlengine/Exceptions/__init__.py +159 -0
- vtlengine/Exceptions/messages.py +1004 -0
- vtlengine/Interpreter/__init__.py +2048 -0
- vtlengine/Model/__init__.py +501 -0
- vtlengine/Operators/Aggregation.py +357 -0
- vtlengine/Operators/Analytic.py +455 -0
- vtlengine/Operators/Assignment.py +23 -0
- vtlengine/Operators/Boolean.py +106 -0
- vtlengine/Operators/CastOperator.py +451 -0
- vtlengine/Operators/Clause.py +366 -0
- vtlengine/Operators/Comparison.py +488 -0
- vtlengine/Operators/Conditional.py +495 -0
- vtlengine/Operators/General.py +191 -0
- vtlengine/Operators/HROperators.py +254 -0
- vtlengine/Operators/Join.py +447 -0
- vtlengine/Operators/Numeric.py +422 -0
- vtlengine/Operators/RoleSetter.py +77 -0
- vtlengine/Operators/Set.py +176 -0
- vtlengine/Operators/String.py +578 -0
- vtlengine/Operators/Time.py +1144 -0
- vtlengine/Operators/Validation.py +275 -0
- vtlengine/Operators/__init__.py +900 -0
- vtlengine/Utils/__Virtual_Assets.py +34 -0
- vtlengine/Utils/__init__.py +479 -0
- vtlengine/__extras_check.py +17 -0
- vtlengine/__init__.py +27 -0
- vtlengine/files/__init__.py +0 -0
- vtlengine/files/output/__init__.py +35 -0
- vtlengine/files/output/_time_period_representation.py +55 -0
- vtlengine/files/parser/__init__.py +240 -0
- vtlengine/files/parser/_rfc_dialect.py +22 -0
- vtlengine/py.typed +0 -0
- vtlengine-1.4.0rc2.dist-info/METADATA +89 -0
- vtlengine-1.4.0rc2.dist-info/RECORD +66 -0
- vtlengine-1.4.0rc2.dist-info/WHEEL +4 -0
- vtlengine-1.4.0rc2.dist-info/licenses/LICENSE.md +661 -0
|
@@ -0,0 +1,488 @@
|
|
|
1
|
+
import operator
|
|
2
|
+
import re
|
|
3
|
+
from copy import copy
|
|
4
|
+
from typing import Any, Optional, Union
|
|
5
|
+
|
|
6
|
+
# if os.environ.get("SPARK"):
|
|
7
|
+
# import pyspark.pandas as pd
|
|
8
|
+
# else:
|
|
9
|
+
# import pandas as pd
|
|
10
|
+
import pandas as pd
|
|
11
|
+
|
|
12
|
+
import vtlengine.Operators as Operator
|
|
13
|
+
from vtlengine.AST.Grammar.tokens import (
|
|
14
|
+
CHARSET_MATCH,
|
|
15
|
+
EQ,
|
|
16
|
+
GT,
|
|
17
|
+
GTE,
|
|
18
|
+
IN,
|
|
19
|
+
ISNULL,
|
|
20
|
+
LT,
|
|
21
|
+
LTE,
|
|
22
|
+
NEQ,
|
|
23
|
+
NOT_IN,
|
|
24
|
+
)
|
|
25
|
+
from vtlengine.DataTypes import COMP_NAME_MAPPING, Boolean, Null, Number, String
|
|
26
|
+
from vtlengine.Exceptions import SemanticError
|
|
27
|
+
from vtlengine.Model import Component, DataComponent, Dataset, Role, Scalar, ScalarSet
|
|
28
|
+
from vtlengine.Utils.__Virtual_Assets import VirtualCounter
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class Unary(Operator.Unary):
    """
    Unary comparison operator. It returns a boolean.
    """

    # Every unary comparison yields a Boolean measure, whatever the operand type.
    return_type = Boolean
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
class IsNull(Unary):
    """
    Implements the ``isnull`` comparison operator.

    Tests each value for nullity. Since the answer is always a concrete
    True/False, the validation methods mark the result as non-nullable.
    """

    op = ISNULL
    py_op = pd.isnull

    @classmethod
    def apply_operation_component(cls, series: Any) -> Any:
        # Vectorised null check over the whole series.
        return series.isnull()

    @classmethod
    def op_func(cls, x: Any) -> Any:
        # Scalar-level null check.
        return pd.isnull(x)

    @classmethod
    def dataset_validation(cls, operand: Dataset) -> Dataset:
        validated = super().dataset_validation(operand)
        # isnull never produces a null itself: force measures non-nullable.
        for comp in validated.get_measures():
            comp.nullable = False
        return validated

    @classmethod
    def component_validation(cls, operand: DataComponent) -> DataComponent:
        validated = super().component_validation(operand)
        # Same reasoning as for datasets: the result is always True/False.
        validated.nullable = False
        return validated
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
class Binary(Operator.Binary):
    """
    Binary comparison operator. It returns a boolean.
    """

    return_type = Boolean

    @classmethod
    def _cast_values(
        cls,
        x: Optional[Union[int, float, str, bool]],
        y: Optional[Union[int, float, str, bool]],
    ) -> Any:
        """Coerce ``x`` and ``y`` to mutually comparable types.

        str vs bool: the bool side is cast to String.
        str vs number: the string side is cast to Number.
        If the numeric cast raises ``ValueError`` (non-numeric string),
        both sides fall back to their ``str`` representation.
        Returns the (possibly converted) pair ``(x, y)``.
        """
        # Cast values to compatible types for comparison
        try:
            if isinstance(x, str) and isinstance(y, bool):
                y = String.cast(y)
            elif isinstance(x, bool) and isinstance(y, str):
                x = String.cast(x)
            elif isinstance(x, str) and isinstance(y, (int, float)):
                x = Number.cast(x)
            elif isinstance(x, (int, float)) and isinstance(y, str):
                y = Number.cast(y)
        except ValueError:
            # Numeric cast failed: compare as plain strings instead.
            x = str(x)
            y = str(y)

        return x, y

    @classmethod
    def op_func(cls, x: Any, y: Any) -> Any:
        """Scalar comparison with null propagation and type coercion."""
        # Return None if any of the values are NaN
        if pd.isnull(x) or pd.isnull(y):
            return None
        x, y = cls._cast_values(x, y)
        return cls.py_op(x, y)

    @classmethod
    def apply_operation_series_scalar(cls, series: Any, scalar: Any, series_left: bool) -> Any:
        """Compare a series against a scalar, element-wise.

        ``series_left`` says whether the series is the left operand.
        A null scalar short-circuits to an all-null result. Otherwise the
        scalar is coerced against the first non-null series element and the
        series dtype is aligned with that element's type before mapping.
        Nulls in the series are skipped via ``na_action="ignore"``.
        """
        if pd.isnull(scalar):
            return pd.Series(None, index=series.index)

        # Use a representative element to decide the coercion direction.
        first_non_null = series.dropna().iloc[0] if not series.dropna().empty else None
        if first_non_null is not None:
            scalar, first_non_null = cls._cast_values(scalar, first_non_null)

        series_type = pd.api.types.infer_dtype(series, skipna=True)
        first_non_null_type = pd.api.types.infer_dtype([first_non_null])

        # Align series dtype with the (possibly cast) representative element.
        # NOTE(review): astype(str) stringifies NaN values, so original nulls
        # may no longer be skipped by na_action below in that branch — verify.
        if series_type != first_non_null_type:
            if isinstance(first_non_null, str):
                series = series.astype(str)
            elif isinstance(first_non_null, (int, float)):
                series = series.astype(float)

        # Prefer the raw python operator when defined; fall back to op_func.
        op = cls.py_op if cls.py_op is not None else cls.op_func
        if series_left:
            result = series.map(lambda x: op(x, scalar), na_action="ignore")
        else:
            result = series.map(lambda x: op(scalar, x), na_action="ignore")

        return result

    @classmethod
    def apply_return_type_dataset(
        cls,
        result_dataset: Dataset,
        left_operand: Dataset,
        right_operand: Union[Dataset, Scalar, ScalarSet],
    ) -> None:
        """Rename the measure of a mono-measure result to the generic Boolean name.

        After the base-class promotion, a single-measure dataset gets its
        measure replaced by the standard component named
        ``COMP_NAME_MAPPING[Boolean]``, preserving nullability, and the
        backing data column is renamed accordingly.
        """
        super().apply_return_type_dataset(result_dataset, left_operand, right_operand)
        is_mono_measure = len(result_dataset.get_measures()) == 1
        if is_mono_measure:
            measure = result_dataset.get_measures()[0]
            component = Component(
                name=COMP_NAME_MAPPING[Boolean],
                data_type=Boolean,
                role=Role.MEASURE,
                nullable=measure.nullable,
            )
            result_dataset.delete_component(measure.name)
            result_dataset.add_component(component)
            if result_dataset.data is not None:
                result_dataset.data.rename(columns={measure.name: component.name}, inplace=True)
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
class Equal(Binary):
    """Equality comparison (``=``); null operands yield null (see Binary.op_func)."""

    op = EQ
    py_op = operator.eq
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
class NotEqual(Binary):
    """Inequality comparison (``<>``); null operands yield null (see Binary.op_func)."""

    op = NEQ
    py_op = operator.ne
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
class Greater(Binary):
    """Greater-than comparison (``>``); null operands yield null (see Binary.op_func)."""

    op = GT
    py_op = operator.gt
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
class GreaterEqual(Binary):
    """Greater-or-equal comparison (``>=``); null operands yield null (see Binary.op_func)."""

    op = GTE
    py_op = operator.ge
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
class Less(Binary):
    """Less-than comparison (``<``); null operands yield null (see Binary.op_func)."""

    op = LT
    py_op = operator.lt
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
class LessEqual(Binary):
    """Less-or-equal comparison (``<=``); null operands yield null (see Binary.op_func)."""

    op = LTE
    py_op = operator.le
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
class In(Binary):
    """
    Membership operator (``in``): tests each value against a scalar set.

    A set typed as Null can never produce a definite answer, so it maps
    everything to null.
    """

    op = IN

    @classmethod
    def apply_operation_two_series(cls, left_series: Any, right_series: ScalarSet) -> Any:
        # Null-typed set: no membership test is meaningful.
        if right_series.data_type == Null:
            return pd.Series(None, index=left_series.index)

        return left_series.map(lambda value: value in right_series, na_action="ignore")

    @classmethod
    def py_op(cls, x: Any, y: Any) -> Any:
        # Scalar membership, with the same Null-set guard as the vectorised path.
        return None if y.data_type == Null else operator.contains(y, x)
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
class NotIn(Binary):
    """
    Negated membership operator (``not_in``).

    Delegates the membership test to ``In`` and flips every non-null
    result, so nulls propagate unchanged.
    """

    op = NOT_IN

    @classmethod
    def apply_operation_two_series(cls, left_series: Any, right_series: Any) -> Any:
        membership = In.apply_operation_two_series(left_series, right_series)
        return membership.map(operator.not_, na_action="ignore")

    @classmethod
    def py_op(cls, x: Any, y: Any) -> Any:
        return not operator.contains(y, x)
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
class Match(Binary):
    """
    ``match_characters`` operator: tests whether a string fully matches a
    regular-expression pattern. Null operands yield null.
    """

    op = CHARSET_MATCH
    type_to_check = String

    @classmethod
    def op_func(cls, x: Optional[str], y: Optional[str]) -> Optional[bool]:
        """Return True when ``x`` fully matches pattern ``y``, None on nulls.

        The Series check must come before the null check: ``pd.isnull`` on a
        Series returns a Series, whose truth value inside ``or`` raises
        ``ValueError``, which previously made the vectorised branch
        unreachable for Series input.
        """
        # Vectorised path: pandas element-wise full match.
        if isinstance(x, pd.Series):
            return x.str.fullmatch(y)
        # Null propagation for scalar operands.
        if pd.isnull(x) or pd.isnull(y):
            return None
        return bool(re.fullmatch(str(y), str(x)))
|
|
227
|
+
|
|
228
|
+
|
|
229
|
+
class Between(Operator.Operator):
    return_type = Boolean
    """
    This comparison operator has the following class methods.

    Class methods:
        op_function: Sets the data to be manipulated.
        apply_operation_component: Returns a pandas dataframe with the operation,

        considering each component with the schema of op_function.

        apply_return_type_dataset: Because the result must be a boolean,
        this function evaluates if the measure is actually a boolean one.
    """

    @classmethod
    def op_func(
        cls,
        x: Optional[Union[int, float, bool, str]],
        y: Optional[Union[int, float, bool, str]],
        z: Optional[Union[int, float, bool, str]],
    ) -> Optional[bool]:
        """Return ``y <= x <= z``; any null operand yields null."""
        return (
            None if (pd.isnull(x) or pd.isnull(y) or pd.isnull(z)) else y <= x <= z  # type: ignore[operator]
        )

    @classmethod
    def apply_operation_component(cls, series: Any, from_data: Any, to_data: Any) -> Any:
        """Apply the between test over a series with scalar or series bounds.

        If either bound is a Series, both bounds are lifted to Series aligned
        on the operand's index and the test runs row-wise over a temporary
        DataFrame; otherwise the scalar bounds are applied element-wise.
        """
        control_any_series_from_to = isinstance(from_data, pd.Series) or isinstance(
            to_data, pd.Series
        )
        if control_any_series_from_to:
            # Lift a scalar bound to a constant Series so rows can be zipped.
            if not isinstance(from_data, pd.Series):
                from_data = pd.Series(from_data, index=series.index, dtype=object)
            if not isinstance(to_data, pd.Series):
                to_data = pd.Series(to_data, index=series.index)
            df = pd.DataFrame({"operand": series, "from_data": from_data, "to_data": to_data})
            return df.apply(
                lambda x: cls.op_func(x["operand"], x["from_data"], x["to_data"]),
                axis=1,
            )

        return series.map(lambda x: cls.op_func(x, from_data, to_data))

    @classmethod
    def apply_return_type_dataset(cls, result_dataset: Dataset, operand: Dataset) -> None:
        """Promote result measures to the validated (boolean) type.

        Mono-measure datasets whose type changed get the measure replaced by
        the generic component name; a type change on a multi-measure dataset
        is a semantic error (code 1-1-1-4).
        """
        is_mono_measure = len(operand.get_measures()) == 1
        for measure in result_dataset.get_measures():
            operand_type = operand.get_component(measure.name).data_type
            result_data_type = cls.type_validation(operand_type)
            if is_mono_measure and operand_type.promotion_changed_type(result_data_type):
                component = Component(
                    name=COMP_NAME_MAPPING[result_data_type],
                    data_type=result_data_type,
                    role=Role.MEASURE,
                    nullable=measure.nullable,
                )
                result_dataset.delete_component(measure.name)
                result_dataset.add_component(component)
                if result_dataset.data is not None:
                    result_dataset.data.rename(columns={measure.name: component.name}, inplace=True)
            elif is_mono_measure is False and operand_type.promotion_changed_type(result_data_type):
                raise SemanticError("1-1-1-4", op=cls.op)
            else:
                measure.data_type = result_data_type

    @classmethod
    def validate(
        cls,
        operand: Union[Dataset, DataComponent, Scalar],
        from_: Union[DataComponent, Scalar],
        to: Union[DataComponent, Scalar],
    ) -> Any:
        """Build the (dataless) result structure and check type compatibility.

        The result shape follows the operand: Dataset -> Dataset keeping
        identifiers and measures; DataComponent -> boolean DataComponent;
        Scalar with scalar bounds -> boolean Scalar; Scalar with at least
        one DataComponent bound -> boolean measure DataComponent.
        Raises SemanticError 1-1-1-8 for a dataset with no measures.
        """
        result: Union[Dataset, DataComponent, Scalar]
        if isinstance(operand, Dataset):
            if len(operand.get_measures()) == 0:
                raise SemanticError("1-1-1-8", op=cls.op, name=operand.name)
            result_components = {
                comp_name: copy(comp)
                for comp_name, comp in operand.components.items()
                if comp.role == Role.IDENTIFIER or comp.role == Role.MEASURE
            }
            result = Dataset(name=operand.name, components=result_components, data=None)
        elif isinstance(operand, DataComponent):
            result = DataComponent(
                name=operand.name,
                data=None,
                data_type=cls.return_type,
                role=operand.role,
                nullable=operand.nullable,
            )
        elif isinstance(from_, Scalar) and isinstance(to, Scalar):
            result = Scalar(name=operand.name, value=None, data_type=cls.return_type)
        else:
            # From or To is a DataComponent, or both
            result = DataComponent(
                name=operand.name,
                data=None,
                data_type=cls.return_type,
                role=Role.MEASURE,
            )

        # Each measure (or the component/scalar itself) must be comparable
        # with both bounds.
        if isinstance(operand, Dataset):
            for measure in operand.get_measures():
                cls.validate_type_compatibility(measure.data_type, from_.data_type)
                cls.validate_type_compatibility(measure.data_type, to.data_type)
            if isinstance(result, Dataset):
                cls.apply_return_type_dataset(result, operand)
        else:
            cls.validate_type_compatibility(operand.data_type, from_.data_type)
            cls.validate_type_compatibility(operand.data_type, to.data_type)

        return result

    @classmethod
    def evaluate(
        cls,
        operand: Union[DataComponent, Scalar],
        from_: Union[DataComponent, Scalar],
        to: Union[DataComponent, Scalar],
    ) -> Any:
        """Validate, then fill the result with the actual between computation.

        Raises ValueError when both bounds are Series of different lengths.
        """
        result = cls.validate(operand, from_, to)
        # Unwrap the raw data/value of each bound.
        from_data = from_.data if isinstance(from_, DataComponent) else from_.value
        to_data = to.data if isinstance(to, DataComponent) else to.value

        if (
            isinstance(from_data, pd.Series)
            and isinstance(to_data, pd.Series)
            and len(from_data) != len(to_data)
        ):
            raise ValueError("From and To must have the same length")

        if isinstance(operand, Dataset):
            result.data = operand.data.copy()
            for measure_name in operand.get_measures_names():
                result.data[measure_name] = cls.apply_operation_component(
                    operand.data[measure_name], from_data, to_data
                )
            # Mono-measure result columns are renamed to the generic name.
            if len(result.get_measures()) == 1:
                result.data[COMP_NAME_MAPPING[cls.return_type]] = result.data[measure_name]
                result.data = result.data.drop(columns=[measure_name])
            result.data = result.data[result.get_components_names()]
        if isinstance(operand, DataComponent):
            result.data = cls.apply_operation_component(operand.data, from_data, to_data)
        if isinstance(operand, Scalar) and isinstance(from_, Scalar) and isinstance(to, Scalar):
            if operand.value is None or from_data is None or to_data is None:
                result.value = None
            else:
                result.value = from_data <= operand.value <= to_data
        elif isinstance(operand, Scalar) and (
            isinstance(from_data, pd.Series) or isinstance(to_data, pd.Series)
        ):  # From or To is a DataComponent, or both
            # Broadcast the scalar operand over the bound's index.
            if isinstance(from_data, pd.Series):
                series = pd.Series(operand.value, index=from_data.index, dtype=object)
            elif isinstance(to_data, pd.Series):
                series = pd.Series(operand.value, index=to_data.index, dtype=object)
            result_series = cls.apply_operation_component(series, from_data, to_data)
            result = DataComponent(
                name=operand.name,
                data=result_series,
                data_type=cls.return_type,
                role=Role.MEASURE,
            )
        return result
|
|
393
|
+
|
|
394
|
+
|
|
395
|
+
class ExistIn(Operator.Operator):
    """
    Class methods:
        validate: Sets the identifiers and check if the left one exists in the right one.
        evaluate: Evaluates if the result data type is actually a boolean.
    """

    op = IN

    # noinspection PyTypeChecker
    @classmethod
    def validate(
        cls, dataset_1: Dataset, dataset_2: Dataset, retain_element: Optional[Boolean]
    ) -> Any:
        """Build the result structure: dataset_1's identifiers plus ``bool_var``.

        Requires that the identifier sets of the two datasets are nested
        (one is a subset of the other); otherwise raises ValueError.
        ``retain_element`` is not used during validation.
        """
        dataset_name = VirtualCounter._new_ds_name()
        left_identifiers = dataset_1.get_identifiers_names()
        right_identifiers = dataset_2.get_identifiers_names()

        is_subset_right = set(right_identifiers).issubset(left_identifiers)
        is_subset_left = set(left_identifiers).issubset(right_identifiers)
        if not (is_subset_left or is_subset_right):
            raise ValueError("Datasets must have common identifiers")

        result_components = {comp.name: copy(comp) for comp in dataset_1.get_identifiers()}
        result_dataset = Dataset(name=dataset_name, components=result_components, data=None)
        # Single non-nullable boolean measure carrying the existence flag.
        result_dataset.add_component(
            Component(name="bool_var", data_type=Boolean, role=Role.MEASURE, nullable=False)
        )
        return result_dataset

    @classmethod
    def evaluate(
        cls, dataset_1: Dataset, dataset_2: Dataset, retain_element: Optional[Boolean]
    ) -> Any:
        """Flag which rows of dataset_1 also exist in dataset_2.

        An inner merge over the shared identifiers finds the matches; a
        left merge back onto dataset_1 marks matched rows True and the rest
        False. If ``retain_element`` is given, only rows with that boolean
        value are kept.
        """
        result_dataset = cls.validate(dataset_1, dataset_2, retain_element)

        # Checking the subset
        left_id_names = dataset_1.get_identifiers_names()
        right_id_names = dataset_2.get_identifiers_names()
        is_subset_left = set(left_id_names).issubset(right_id_names)

        # Identifiers for the result dataset
        reference_identifiers_names = left_id_names

        # Checking if the left dataset is a subset of the right dataset
        common_columns = left_id_names if is_subset_left else right_id_names

        # Check if the common identifiers are equal between the two datasets
        if dataset_1.data is not None and dataset_2.data is not None:
            true_results = pd.merge(
                dataset_1.data,
                dataset_2.data,
                how="inner",
                left_on=common_columns,
                right_on=common_columns,
            )
            true_results = true_results[reference_identifiers_names]
        else:
            true_results = pd.DataFrame(columns=reference_identifiers_names)

        # Check for empty values
        if true_results.empty:
            true_results["bool_var"] = None
        else:
            true_results["bool_var"] = True
        if dataset_1.data is None:
            dataset_1.data = pd.DataFrame(columns=reference_identifiers_names)
        # Left merge keeps every row of dataset_1; unmatched rows get NaN
        # in bool_var, filled with False below.
        final_result = pd.merge(
            dataset_1.data,
            true_results,
            how="left",
            left_on=reference_identifiers_names,
            right_on=reference_identifiers_names,
        )
        final_result = final_result[reference_identifiers_names + ["bool_var"]]

        # No null values are returned, only True or False
        final_result["bool_var"] = final_result["bool_var"].fillna(False)

        # Adding to the result dataset
        result_dataset.data = final_result

        # Retain only the elements that are specified (True or False)
        if retain_element is not None:
            result_dataset.data = result_dataset.data[
                result_dataset.data["bool_var"] == retain_element
            ]
            result_dataset.data = result_dataset.data.reset_index(drop=True)

        return result_dataset

    @staticmethod
    def _check_all_columns(row: Any) -> bool:
        # True only when every column in the row equals True exactly
        # (truthy non-True values such as 1 also compare equal under ==).
        return all(col_value == True for col_value in row)
|