vtlengine 1.0.0__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of vtlengine might be problematic.
- vtlengine/API/_InternalApi.py +153 -100
- vtlengine/API/__init__.py +109 -67
- vtlengine/AST/ASTConstructor.py +188 -98
- vtlengine/AST/ASTConstructorModules/Expr.py +306 -200
- vtlengine/AST/ASTConstructorModules/ExprComponents.py +172 -102
- vtlengine/AST/ASTConstructorModules/Terminals.py +158 -95
- vtlengine/AST/ASTEncoders.py +1 -1
- vtlengine/AST/ASTTemplate.py +8 -9
- vtlengine/AST/ASTVisitor.py +8 -12
- vtlengine/AST/DAG/__init__.py +43 -35
- vtlengine/AST/DAG/_words.py +4 -4
- vtlengine/AST/Grammar/lexer.py +732 -142
- vtlengine/AST/Grammar/parser.py +2188 -826
- vtlengine/AST/Grammar/tokens.py +128 -128
- vtlengine/AST/VtlVisitor.py +7 -4
- vtlengine/AST/__init__.py +22 -11
- vtlengine/DataTypes/NumericTypesHandling.py +5 -4
- vtlengine/DataTypes/TimeHandling.py +194 -301
- vtlengine/DataTypes/__init__.py +304 -218
- vtlengine/Exceptions/__init__.py +52 -27
- vtlengine/Exceptions/messages.py +134 -62
- vtlengine/Interpreter/__init__.py +781 -487
- vtlengine/Model/__init__.py +165 -121
- vtlengine/Operators/Aggregation.py +156 -95
- vtlengine/Operators/Analytic.py +115 -59
- vtlengine/Operators/Assignment.py +7 -4
- vtlengine/Operators/Boolean.py +27 -32
- vtlengine/Operators/CastOperator.py +177 -131
- vtlengine/Operators/Clause.py +137 -99
- vtlengine/Operators/Comparison.py +148 -117
- vtlengine/Operators/Conditional.py +149 -98
- vtlengine/Operators/General.py +68 -47
- vtlengine/Operators/HROperators.py +91 -72
- vtlengine/Operators/Join.py +217 -118
- vtlengine/Operators/Numeric.py +89 -44
- vtlengine/Operators/RoleSetter.py +16 -15
- vtlengine/Operators/Set.py +61 -36
- vtlengine/Operators/String.py +213 -139
- vtlengine/Operators/Time.py +334 -216
- vtlengine/Operators/Validation.py +117 -76
- vtlengine/Operators/__init__.py +340 -213
- vtlengine/Utils/__init__.py +195 -40
- vtlengine/__init__.py +1 -1
- vtlengine/files/output/__init__.py +15 -6
- vtlengine/files/output/_time_period_representation.py +10 -9
- vtlengine/files/parser/__init__.py +77 -52
- vtlengine/files/parser/_rfc_dialect.py +6 -5
- vtlengine/files/parser/_time_checking.py +46 -37
- vtlengine-1.0.1.dist-info/METADATA +236 -0
- vtlengine-1.0.1.dist-info/RECORD +58 -0
- {vtlengine-1.0.dist-info → vtlengine-1.0.1.dist-info}/WHEEL +1 -1
- vtlengine-1.0.dist-info/METADATA +0 -104
- vtlengine-1.0.dist-info/RECORD +0 -58
- {vtlengine-1.0.dist-info → vtlengine-1.0.1.dist-info}/LICENSE.md +0 -0
vtlengine/Operators/Comparison.py

@@ -1,5 +1,4 @@
 import operator
-import os
 import re
 from copy import copy
 from typing import Any, Optional, Union
@@ -7,33 +6,47 @@ from typing import Any, Optional, Union
 from vtlengine.Exceptions import SemanticError
 from vtlengine.Model import Component, DataComponent, Dataset, Role, Scalar, ScalarSet

-if os.environ.get("SPARK"):
-    import pyspark.pandas as pd
-else:
-    import pandas as pd
-
-from vtlengine.AST.Grammar.tokens import CHARSET_MATCH, EQ, GT, GTE, IN, ISNULL, LT, LTE, \
-    NEQ, NOT_IN
+# if os.environ.get("SPARK"):
+#     import pyspark.pandas as pd
+# else:
+#     import pandas as pd
+import pandas as pd
+
+from vtlengine.AST.Grammar.tokens import (
+    CHARSET_MATCH,
+    EQ,
+    GT,
+    GTE,
+    IN,
+    ISNULL,
+    LT,
+    LTE,
+    NEQ,
+    NOT_IN,
+)
 from vtlengine.DataTypes import Boolean, COMP_NAME_MAPPING, String, Number, Null
 import vtlengine.Operators as Operator

+
 class Unary(Operator.Unary):
     """
     Unary comparison operator. It returns a boolean.
     """
+
     return_type = Boolean


 class IsNull(Unary):
     """
-    Class that allows to perform the isnull comparison operator. It has different class methods
-    to allow performing the operation with different datatypes.
+    Class that allows to perform the isnull comparison operator.
+    It has different class methods to allow performing the operation with different datatypes.
     """
+
     op = ISNULL
     py_op = pd.isnull

     @classmethod
-    def apply_operation_component(cls, series: Any):
+    def apply_operation_component(cls, series: Any) -> Any:
         return series.isnull()

     @classmethod
@@ -41,14 +54,14 @@ class IsNull(Unary):
         return pd.isnull(x)

     @classmethod
-    def dataset_validation(cls, operand: Dataset):
+    def dataset_validation(cls, operand: Dataset) -> Dataset:
         result = super().dataset_validation(operand)
         for measure in result.get_measures():
             measure.nullable = False
         return result

     @classmethod
-    def component_validation(cls, operand: DataComponent):
+    def component_validation(cls, operand: DataComponent) -> DataComponent:
         result = super().component_validation(operand)
         result.nullable = False
         return result
@@ -58,11 +71,13 @@ class Binary(Operator.Binary):
     """
     Binary comparison operator. It returns a boolean.
     """
+
     return_type = Boolean

     @classmethod
-    def _cast_values(cls, x: Optional[Union[int, float, str, bool]],
-                     y: Optional[Union[int, float, str, bool]]) -> Any:
+    def _cast_values(
+        cls, x: Optional[Union[int, float, str, bool]], y: Optional[Union[int, float, str, bool]]
+    ) -> Any:
         # Cast both values to the same data type
         # An integer can be considered a bool, we must check first boolean, then numbers
         try:
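The comment in `_cast_values` ("an integer can be considered a bool, we must check first boolean, then numbers") rests on a Python detail worth spelling out. A minimal standalone sketch, not part of vtlengine:

    # bool is a subclass of int in Python, so a check against numeric types
    # also matches True/False; testing Boolean before Number avoids casting
    # booleans to 1/0 by accident.
    x = True
    print(isinstance(x, int))   # True - a plain numeric check would match
    print(isinstance(x, bool))  # True - so the boolean check must come first
    print(int(x))               # 1   - the cast that the ordering prevents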
@@ -88,19 +103,20 @@ class Binary(Operator.Binary):
             return cls.py_op(x, y)

     @classmethod
-    def apply_operation_series_scalar(cls, series: Any, scalar: Any,
-                                      series_left: bool) -> Any:
+    def apply_operation_series_scalar(cls, series: Any, scalar: Any, series_left: bool) -> Any:
         if scalar is None:
             return pd.Series(None, index=series.index)
         if series_left:
-            return series.map(lambda x: cls.op_func(x, scalar), na_action='ignore')
+            return series.map(lambda x: cls.op_func(x, scalar), na_action="ignore")
         else:
-            return series.map(lambda x: cls.op_func(scalar, x), na_action='ignore')
+            return series.map(lambda x: cls.op_func(scalar, x), na_action="ignore")

     @classmethod
     def apply_return_type_dataset(
-        cls, result_dataset: Dataset, left_operand: Dataset,
-        right_operand: Union[Dataset, Scalar, ScalarSet]
+        cls,
+        result_dataset: Dataset,
+        left_operand: Dataset,
+        right_operand: Union[Dataset, Scalar, ScalarSet],
     ) -> None:
         super().apply_return_type_dataset(result_dataset, left_operand, right_operand)
         is_mono_measure = len(result_dataset.get_measures()) == 1
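The `na_action="ignore"` argument completed in this hunk is standard pandas behaviour: `Series.map` skips null entries instead of passing them to the callable, so a comparison against NULL stays NULL. A minimal sketch with made-up data, not taken from the package:

    import pandas as pd

    s = pd.Series([1, None, 3], dtype=object)

    # The null entry is never handed to the lambda; it propagates to the result,
    # matching the VTL rule that a comparison with NULL yields NULL.
    result = s.map(lambda x: x > 2, na_action="ignore")
    print(result.tolist())  # [False, None, True]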
@@ -110,7 +126,7 @@ class Binary(Operator.Binary):
                 name=COMP_NAME_MAPPING[Boolean],
                 data_type=Boolean,
                 role=Role.MEASURE,
-                nullable=measure.nullable
+                nullable=measure.nullable,
             )
             result_dataset.delete_component(measure.name)
             result_dataset.add_component(component)
@@ -152,16 +168,14 @@ class In(Binary):
     op = IN

     @classmethod
-    def apply_operation_two_series(cls,
-                                   left_series: Any,
-                                   right_series: ScalarSet) -> Any:
+    def apply_operation_two_series(cls, left_series: Any, right_series: ScalarSet) -> Any:
         if right_series.data_type == Null:
             return pd.Series(None, index=left_series.index)

-        return left_series.map(lambda x: x in right_series, na_action='ignore')
+        return left_series.map(lambda x: x in right_series, na_action="ignore")

     @classmethod
-    def py_op(cls, x, y):
+    def py_op(cls, x: Any, y: Any) -> Any:
         if y.data_type == Null:
             return None
         return operator.contains(y, x)
@@ -171,14 +185,12 @@ class NotIn(Binary):
     op = NOT_IN

     @classmethod
-    def apply_operation_two_series(cls,
-                                   left_series: Any,
-                                   right_series: list) -> Any:
+    def apply_operation_two_series(cls, left_series: Any, right_series: Any) -> Any:
         series_result = In.apply_operation_two_series(left_series, right_series)
-        return series_result.map(lambda x: not x, na_action='ignore')
+        return series_result.map(lambda x: not x, na_action="ignore")

     @classmethod
-    def py_op(cls, x, y):
+    def py_op(cls, x: Any, y: Any) -> Any:
         return not operator.contains(y, x)


@@ -187,7 +199,7 @@ class Match(Binary):
     type_to_check = String

     @classmethod
-    def op_func(cls, x, y):
+    def op_func(cls, x: Optional[str], y: Optional[str]) -> Optional[bool]:
         if pd.isnull(x) or pd.isnull(y):
             return None
         if isinstance(x, pd.Series):
@@ -199,41 +211,44 @@ class Between(Operator.Operator):
     return_type = Boolean
     """
     This comparison operator has the following class methods.
-
+
     Class methods:
         op_function: Sets the data to be manipulated.
-
-        apply_operation_component: Returns a pandas dataframe with the operation, considering each component with the
-        schema of op_function.
-
-        apply_return_type_dataset: Because the result must be a boolean, this function evaluates if the measure
-        is actually a boolean one.
-
-
+        apply_operation_component: Returns a pandas dataframe with the operation,
+
+        considering each component with the schema of op_function.
+
+        apply_return_type_dataset: Because the result must be a boolean,
+        this function evaluates if the measure is actually a boolean one.
     """

     @classmethod
-    def op_func(cls,
-                x: Optional[Union[int, float, bool, str]],
-                y: Optional[Union[int, float, bool, str]],
-                z: Optional[Union[int, float, bool, str]]) -> Optional[bool]:
-        return None if (pd.isnull(x) or pd.isnull(y) or pd.isnull(z)) else y <= x <= z
+    def op_func(
+        cls,
+        x: Optional[Union[int, float, bool, str]],
+        y: Optional[Union[int, float, bool, str]],
+        z: Optional[Union[int, float, bool, str]],
+    ) -> Optional[bool]:
+        return (
+            None
+            if (pd.isnull(x) or pd.isnull(y) or pd.isnull(z))
+            else y <= x <= z  # type: ignore[operator]
+        )

     @classmethod
-    def apply_operation_component(cls, series: Any,
-                                  from_data: Any,
-                                  to_data: Any) -> Any:
-
-        control_any_series_from_to = isinstance(from_data, pd.Series) or isinstance(to_data,
-                                                                                    pd.Series)
+    def apply_operation_component(cls, series: Any, from_data: Any, to_data: Any) -> Any:
+        control_any_series_from_to = isinstance(from_data, pd.Series) or isinstance(
+            to_data, pd.Series
+        )
         if control_any_series_from_to:
             if not isinstance(from_data, pd.Series):
                 from_data = pd.Series(from_data, index=series.index, dtype=object)
             if not isinstance(to_data, pd.Series):
                 to_data = pd.Series(to_data, index=series.index)
-            df = pd.DataFrame({'operand': series, 'from_data': from_data, 'to_data': to_data})
-            return df.apply(
-                lambda x: cls.op_func(x['operand'], x['from_data'], x['to_data']), axis=1)
+            df = pd.DataFrame({"operand": series, "from_data": from_data, "to_data": to_data})
+            return df.apply(
+                lambda x: cls.op_func(x["operand"], x["from_data"], x["to_data"]), axis=1
+            )

         return series.map(lambda x: cls.op_func(x, from_data, to_data))

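The reformatted `op_func` above keeps the usual three-valued logic for `between`: any null operand makes the result null, otherwise it is a chained comparison. A simplified standalone sketch of the same rule (using `is None` instead of `pd.isnull`), not the package implementation:

    from typing import Optional, Union

    Value = Optional[Union[int, float, bool, str]]

    def between(x: Value, low: Value, high: Value) -> Optional[bool]:
        # A null in any position makes the whole comparison null.
        if x is None or low is None or high is None:
            return None
        return low <= x <= high

    assert between(5, 1, 10) is True
    assert between("b", "a", "c") is True
    assert between(5, None, 10) is None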
@@ -242,14 +257,13 @@ class Between(Operator.Operator):
         is_mono_measure = len(operand.get_measures()) == 1
         for measure in result_dataset.get_measures():
             operand_type = operand.get_component(measure.name).data_type
-
             result_data_type = cls.type_validation(operand_type)
             if is_mono_measure and operand_type.promotion_changed_type(result_data_type):
                 component = Component(
                     name=COMP_NAME_MAPPING[result_data_type],
                     data_type=result_data_type,
                     role=Role.MEASURE,
-                    nullable=measure.nullable
+                    nullable=measure.nullable,
                 )
                 result_dataset.delete_component(measure.name)
                 result_dataset.add_component(component)
@@ -261,30 +275,39 @@ class Between(Operator.Operator):
                 measure.data_type = result_data_type

     @classmethod
-    def validate(cls, operand: Union[Dataset, DataComponent, Scalar],
-                 from_: Union[DataComponent, Scalar],
-                 to: Union[DataComponent, Scalar]):
+    def validate(
+        cls,
+        operand: Union[Dataset, DataComponent, Scalar],
+        from_: Union[DataComponent, Scalar],
+        to: Union[DataComponent, Scalar],
+    ) -> Any:
+        result: Union[Dataset, DataComponent, Scalar]
         if isinstance(operand, Dataset):
             if len(operand.get_measures()) == 0:
                 raise SemanticError("1-1-1-8", op=cls.op, name=operand.name)
-            result_components = {comp_name: copy(comp)
-                                 for comp_name, comp in operand.components.items()
-                                 if comp.role == Role.IDENTIFIER or comp.role == Role.MEASURE}
+            result_components = {
+                comp_name: copy(comp)
+                for comp_name, comp in operand.components.items()
+                if comp.role == Role.IDENTIFIER or comp.role == Role.MEASURE
+            }
             result = Dataset(name=operand.name, components=result_components, data=None)
         elif isinstance(operand, DataComponent):
-            result = DataComponent(name=operand.name, data=None,
-                                   data_type=cls.return_type, role=operand.role)
-        elif isinstance(from_, Scalar) and isinstance(to, Scalar):
+            result = DataComponent(
+                name=operand.name, data=None, data_type=cls.return_type, role=operand.role
+            )
+        elif isinstance(from_, Scalar) and isinstance(to, Scalar):
             result = Scalar(name=operand.name, value=None, data_type=cls.return_type)
         else:  # From or To is a DataComponent, or both
-            result = DataComponent(name=operand.name, data=None,
-                                   data_type=cls.return_type, role=Role.MEASURE)
+            result = DataComponent(
+                name=operand.name, data=None, data_type=cls.return_type, role=Role.MEASURE
+            )

         if isinstance(operand, Dataset):
             for measure in operand.get_measures():
                 cls.validate_type_compatibility(measure.data_type, from_.data_type)
                 cls.validate_type_compatibility(measure.data_type, to.data_type)
-            cls.apply_return_type_dataset(result, operand)
+            if isinstance(result, Dataset):
+                cls.apply_return_type_dataset(result, operand)
         else:
             cls.validate_type_compatibility(operand.data_type, from_.data_type)
             cls.validate_type_compatibility(operand.data_type, to.data_type)
@@ -292,18 +315,20 @@ class Between(Operator.Operator):
         return result

     @classmethod
-    def evaluate(cls, operand: Union[DataComponent, Scalar],
-                 from_: Union[DataComponent, Scalar],
-                 to: Union[DataComponent, Scalar]):
+    def evaluate(
+        cls,
+        operand: Union[DataComponent, Scalar],
+        from_: Union[DataComponent, Scalar],
+        to: Union[DataComponent, Scalar],
+    ) -> Any:
         result = cls.validate(operand, from_, to)
-
         from_data = from_.data if isinstance(from_, DataComponent) else from_.value
         to_data = to.data if isinstance(to, DataComponent) else to.value

         if (
-            isinstance(from_data, pd.Series) and
-            isinstance(to_data, pd.Series) and
-            len(from_data) != len(to_data)
+            isinstance(from_data, pd.Series)
+            and isinstance(to_data, pd.Series)
+            and len(from_data) != len(to_data)
         ):
             raise ValueError("From and To must have the same length")

@@ -311,38 +336,31 @@ class Between(Operator.Operator):
             result.data = operand.data.copy()
             for measure_name in operand.get_measures_names():
                 result.data[measure_name] = cls.apply_operation_component(
-                    operand.data[measure_name],
-                    from_data, to_data
+                    operand.data[measure_name], from_data, to_data
                 )
             if len(result.get_measures()) == 1:
                 result.data[COMP_NAME_MAPPING[cls.return_type]] = result.data[measure_name]
                 result.data = result.data.drop(columns=[measure_name])
             result.data = result.data[result.get_components_names()]
         if isinstance(operand, DataComponent):
-            result.data = cls.apply_operation_component(
-                operand.data,
-                from_data, to_data
-            )
+            result.data = cls.apply_operation_component(operand.data, from_data, to_data)
         if isinstance(operand, Scalar) and isinstance(from_, Scalar) and isinstance(to, Scalar):
             if operand.value is None or from_data is None or to_data is None:
                 result.value = None
             else:
                 result.value = from_data <= operand.value <= to_data
-        elif (
-            isinstance(operand, Scalar) and
-            (
-                isinstance(from_data, pd.Series) or
-                isinstance(to_data, pd.Series)
-            )
+        elif isinstance(operand, Scalar) and (
+            isinstance(from_data, pd.Series) or isinstance(to_data, pd.Series)
         ):  # From or To is a DataComponent, or both
+
             if isinstance(from_data, pd.Series):
                 series = pd.Series(operand.value, index=from_data.index, dtype=object)
-            if isinstance(to_data, pd.Series):
+            elif isinstance(to_data, pd.Series):
                 series = pd.Series(operand.value, index=to_data.index, dtype=object)
             result_series = cls.apply_operation_component(series, from_data, to_data)
-            result = DataComponent(name=operand.name,
-                                   data=result_series,
-                                   data_type=cls.return_type, role=Role.MEASURE)
+            result = DataComponent(
+                name=operand.name, data=result_series, data_type=cls.return_type, role=Role.MEASURE
+            )
         return result

@@ -352,12 +370,14 @@ class ExistIn(Operator.Operator):
     validate: Sets the identifiers and check if the left one exists in the right one.
     evaluate: Evaluates if the result data type is actually a boolean.
     """
+
     op = IN

     # noinspection PyTypeChecker
     @classmethod
-    def validate(cls, dataset_1: Dataset, dataset_2: Dataset,
-                 retain_element: Optional[Boolean]):
+    def validate(
+        cls, dataset_1: Dataset, dataset_2: Dataset, retain_element: Optional[Boolean]
+    ) -> Any:
         left_identifiers = dataset_1.get_identifiers_names()
         right_identifiers = dataset_2.get_identifiers_names()

@@ -368,17 +388,15 @@ class ExistIn(Operator.Operator):

         result_components = {comp.name: copy(comp) for comp in dataset_1.get_identifiers()}
         result_dataset = Dataset(name="result", components=result_components, data=None)
-        result_dataset.add_component(Component(
-            name='bool_var',
-            data_type=Boolean,
-            role=Role.MEASURE,
-            nullable=False
-        ))
+        result_dataset.add_component(
+            Component(name="bool_var", data_type=Boolean, role=Role.MEASURE, nullable=False)
+        )
         return result_dataset

     @classmethod
-    def evaluate(cls, dataset_1: Dataset, dataset_2: Dataset,
-                 retain_element: Optional[Boolean]):
+    def evaluate(
+        cls, dataset_1: Dataset, dataset_2: Dataset, retain_element: Optional[Boolean]
+    ) -> Any:
         result_dataset = cls.validate(dataset_1, dataset_2, retain_element)

         # Checking the subset
@@ -396,24 +414,36 @@ class ExistIn(Operator.Operator):
             common_columns = right_id_names

         # Check if the common identifiers are equal between the two datasets
-        true_results = pd.merge(dataset_1.data, dataset_2.data, how='inner',
-                                left_on=common_columns,
-                                right_on=common_columns)
-        true_results = true_results[reference_identifiers_names]
+        if dataset_1.data is not None and dataset_2.data is not None:
+            true_results = pd.merge(
+                dataset_1.data,
+                dataset_2.data,
+                how="inner",
+                left_on=common_columns,
+                right_on=common_columns,
+            )
+            true_results = true_results[reference_identifiers_names]
+        else:
+            true_results = pd.DataFrame(columns=reference_identifiers_names)

         # Check for empty values
         if true_results.empty:
-            true_results['bool_var'] = None
+            true_results["bool_var"] = None
         else:
-            true_results['bool_var'] = True
-        final_result = pd.merge(dataset_1.data, true_results, how='left',
-                                left_on=reference_identifiers_names,
-                                right_on=reference_identifiers_names)
-        final_result = final_result[reference_identifiers_names +
-                                    ['bool_var']]
+            true_results["bool_var"] = True
+        if dataset_1.data is None:
+            dataset_1.data = pd.DataFrame(columns=reference_identifiers_names)
+        final_result = pd.merge(
+            dataset_1.data,
+            true_results,
+            how="left",
+            left_on=reference_identifiers_names,
+            right_on=reference_identifiers_names,
+        )
+        final_result = final_result[reference_identifiers_names + ["bool_var"]]

         # No null values are returned, only True or False
-        final_result['bool_var'] = final_result['bool_var'].fillna(False)
+        final_result["bool_var"] = final_result["bool_var"].fillna(False)

         # Adding to the result dataset
         result_dataset.data = final_result
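The merge logic added in this hunk is a common pandas pattern for an "exists in" check: an inner merge keeps the identifier combinations present in both datasets, a left merge carries that flag back onto every row of the first dataset, and `fillna(False)` marks the rest. A self-contained sketch with invented identifiers, not the package code:

    import pandas as pd

    left = pd.DataFrame({"Id_1": ["A", "B", "C"], "Id_2": [1, 2, 3]})
    right = pd.DataFrame({"Id_1": ["A", "C"], "Id_2": [1, 3]})
    ids = ["Id_1", "Id_2"]

    # Identifier combinations of `left` that also appear in `right`.
    matches = pd.merge(left, right, how="inner", on=ids)[ids]
    matches["bool_var"] = True

    # Carry the flag back onto every row of `left`; rows without a match become False.
    result = pd.merge(left[ids], matches, how="left", on=ids)
    result["bool_var"] = result["bool_var"].fillna(False)
    print(result)
    #   Id_1  Id_2  bool_var
    # 0    A     1      True
    # 1    B     2     False
    # 2    C     3      True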
@@ -421,11 +451,12 @@ class ExistIn(Operator.Operator):
         # Retain only the elements that are specified (True or False)
         if retain_element is not None:
             result_dataset.data = result_dataset.data[
-                result_dataset.data['bool_var'] == retain_element]
+                result_dataset.data["bool_var"] == retain_element
+            ]
             result_dataset.data = result_dataset.data.reset_index(drop=True)

         return result_dataset

     @staticmethod
-    def _check_all_columns(row):
+    def _check_all_columns(row: Any) -> bool:
         return all(col_value == True for col_value in row)