vtlengine 1.4.0rc2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66) hide show
  1. vtlengine/API/_InternalApi.py +791 -0
  2. vtlengine/API/__init__.py +612 -0
  3. vtlengine/API/data/schema/external_routines_schema.json +34 -0
  4. vtlengine/API/data/schema/json_schema_2.1.json +116 -0
  5. vtlengine/API/data/schema/value_domain_schema.json +97 -0
  6. vtlengine/AST/ASTComment.py +57 -0
  7. vtlengine/AST/ASTConstructor.py +598 -0
  8. vtlengine/AST/ASTConstructorModules/Expr.py +1928 -0
  9. vtlengine/AST/ASTConstructorModules/ExprComponents.py +995 -0
  10. vtlengine/AST/ASTConstructorModules/Terminals.py +790 -0
  11. vtlengine/AST/ASTConstructorModules/__init__.py +50 -0
  12. vtlengine/AST/ASTDataExchange.py +10 -0
  13. vtlengine/AST/ASTEncoders.py +32 -0
  14. vtlengine/AST/ASTString.py +675 -0
  15. vtlengine/AST/ASTTemplate.py +558 -0
  16. vtlengine/AST/ASTVisitor.py +25 -0
  17. vtlengine/AST/DAG/__init__.py +479 -0
  18. vtlengine/AST/DAG/_words.py +10 -0
  19. vtlengine/AST/Grammar/Vtl.g4 +705 -0
  20. vtlengine/AST/Grammar/VtlTokens.g4 +409 -0
  21. vtlengine/AST/Grammar/__init__.py +0 -0
  22. vtlengine/AST/Grammar/lexer.py +2139 -0
  23. vtlengine/AST/Grammar/parser.py +16597 -0
  24. vtlengine/AST/Grammar/tokens.py +169 -0
  25. vtlengine/AST/VtlVisitor.py +824 -0
  26. vtlengine/AST/__init__.py +674 -0
  27. vtlengine/DataTypes/TimeHandling.py +562 -0
  28. vtlengine/DataTypes/__init__.py +863 -0
  29. vtlengine/DataTypes/_time_checking.py +135 -0
  30. vtlengine/Exceptions/__exception_file_generator.py +96 -0
  31. vtlengine/Exceptions/__init__.py +159 -0
  32. vtlengine/Exceptions/messages.py +1004 -0
  33. vtlengine/Interpreter/__init__.py +2048 -0
  34. vtlengine/Model/__init__.py +501 -0
  35. vtlengine/Operators/Aggregation.py +357 -0
  36. vtlengine/Operators/Analytic.py +455 -0
  37. vtlengine/Operators/Assignment.py +23 -0
  38. vtlengine/Operators/Boolean.py +106 -0
  39. vtlengine/Operators/CastOperator.py +451 -0
  40. vtlengine/Operators/Clause.py +366 -0
  41. vtlengine/Operators/Comparison.py +488 -0
  42. vtlengine/Operators/Conditional.py +495 -0
  43. vtlengine/Operators/General.py +191 -0
  44. vtlengine/Operators/HROperators.py +254 -0
  45. vtlengine/Operators/Join.py +447 -0
  46. vtlengine/Operators/Numeric.py +422 -0
  47. vtlengine/Operators/RoleSetter.py +77 -0
  48. vtlengine/Operators/Set.py +176 -0
  49. vtlengine/Operators/String.py +578 -0
  50. vtlengine/Operators/Time.py +1144 -0
  51. vtlengine/Operators/Validation.py +275 -0
  52. vtlengine/Operators/__init__.py +900 -0
  53. vtlengine/Utils/__Virtual_Assets.py +34 -0
  54. vtlengine/Utils/__init__.py +479 -0
  55. vtlengine/__extras_check.py +17 -0
  56. vtlengine/__init__.py +27 -0
  57. vtlengine/files/__init__.py +0 -0
  58. vtlengine/files/output/__init__.py +35 -0
  59. vtlengine/files/output/_time_period_representation.py +55 -0
  60. vtlengine/files/parser/__init__.py +240 -0
  61. vtlengine/files/parser/_rfc_dialect.py +22 -0
  62. vtlengine/py.typed +0 -0
  63. vtlengine-1.4.0rc2.dist-info/METADATA +89 -0
  64. vtlengine-1.4.0rc2.dist-info/RECORD +66 -0
  65. vtlengine-1.4.0rc2.dist-info/WHEEL +4 -0
  66. vtlengine-1.4.0rc2.dist-info/licenses/LICENSE.md +661 -0
@@ -0,0 +1,422 @@
1
+ import _random
2
+ import math
3
+ import operator
4
+ import warnings
5
+ from decimal import Decimal, getcontext
6
+ from typing import Any, Optional, Union
7
+
8
+ import pandas as pd
9
+
10
+ import vtlengine.Operators as Operator
11
+ from vtlengine.AST.Grammar.tokens import (
12
+ ABS,
13
+ CEIL,
14
+ DIV,
15
+ EXP,
16
+ FLOOR,
17
+ LN,
18
+ LOG,
19
+ MINUS,
20
+ MOD,
21
+ MULT,
22
+ PLUS,
23
+ POWER,
24
+ RANDOM,
25
+ ROUND,
26
+ SQRT,
27
+ TRUNC,
28
+ )
29
+ from vtlengine.DataTypes import Integer, Number, binary_implicit_promotion
30
+ from vtlengine.Exceptions import SemanticError
31
+ from vtlengine.Model import DataComponent, Dataset, Scalar
32
+ from vtlengine.Operators import ALL_MODEL_DATA_TYPES
33
+
34
+
35
class Unary(Operator.Unary):
    """Base class for unary numeric operators.

    Restricts the operand's data type to ``Number`` during validation.
    """

    type_to_check = Number
41
+
42
+
43
class Binary(Operator.Binary):
    """Base class for binary numeric operators.

    Restricts both operands to ``Number`` and evaluates the point-wise
    operation using ``Decimal`` arithmetic to avoid binary floating-point
    representation errors.
    """

    type_to_check = Number

    @classmethod
    def op_func(cls, x: Any, y: Any) -> Any:
        """Apply ``cls.py_op`` to two scalar values.

        Returns ``None`` when either operand is null. Raises
        ``SemanticError("2-1-15-6")`` on division by zero. An integral
        result is returned as ``int``, otherwise as ``float``.
        """
        if pd.isnull(x) or pd.isnull(y):
            return None
        # Single zero-divisor check (previously duplicated for the int case).
        if cls.op == DIV and y == 0:
            raise SemanticError("2-1-15-6", op=cls.op, value=y)
        if isinstance(x, int) and isinstance(y, int):
            # RANDOM consumes its integer arguments (seed, index) directly.
            if cls.op == RANDOM:
                return cls.py_op(x, y)
            x = float(x)
            y = float(y)
        # Bug fix: precision must be configured BEFORE the Decimal operation.
        # It was previously set afterwards, so the first invocation ran with
        # the default context precision and only subsequent calls used 10.
        getcontext().prec = 10
        decimal_value = cls.py_op(Decimal(x), Decimal(y))
        result = float(decimal_value)
        return int(result) if result.is_integer() else result
71
+
72
+
73
class UnPlus(Unary):
    """
    `Plus <https://sdmx.org/wp-content/uploads/VTL-2.1-Reference-Manual.pdf#page=94&zoom=100,72,142> `_ unary operator
    """  # noqa E501

    op = PLUS
    py_op = operator.pos

    @classmethod
    def apply_operation_component(cls, series: Any) -> Any:
        # Unary plus is the identity on numbers: return the series unchanged.
        return series
84
+
85
+
86
class UnMinus(Unary):
    """
    `Minus <https://sdmx.org/wp-content/uploads/VTL-2.1-Reference-Manual.pdf#page=95&zoom=100,72,414> `_unary operator

    Negates each numeric value (arithmetic negation).
    """  # noqa E501

    op = MINUS
    py_op = operator.neg
93
+
94
+
95
class AbsoluteValue(Unary):
    """
    `Absolute <https://sdmx.org/wp-content/uploads/VTL-2.1-Reference-Manual.pdf#page=112&zoom=100,72,801> `_ unary operator

    Returns the magnitude of each numeric value.
    """  # noqa E501

    op = ABS
    py_op = operator.abs
102
+
103
+
104
class Exponential(Unary):
    """
    `Exponential <https://sdmx.org/wp-content/uploads/VTL-2.1-Reference-Manual.pdf#page=114&zoom=100,72,94>`_ unary operator

    Computes ``e ** x``; the result is always a Number.
    """  # noqa E501

    op = EXP
    py_op = math.exp
    return_type = Number
112
+
113
+
114
class NaturalLogarithm(Unary):
    """
    `Natural logarithm <https://sdmx.org/wp-content/uploads/VTL-2.1-Reference-Manual.pdf#page=115&zoom=100,72,394> `_
    unary operator

    Computes ``ln(x)``; the result is always a Number.
    """  # noqa E501

    op = LN
    py_op = math.log
    return_type = Number
123
+
124
+
125
class SquareRoot(Unary):
    """
    `Square Root <https://sdmx.org/wp-content/uploads/VTL-2.1-Reference-Manual.pdf#page=119&zoom=100,72,556> '_
    unary operator

    Computes the non-negative square root; the result is always a Number.
    """  # noqa E501

    op = SQRT
    py_op = math.sqrt
    return_type = Number
134
+
135
+
136
class Ceil(Unary):
    """
    `Ceiling <https://sdmx.org/wp-content/uploads/VTL-2.1-Reference-Manual.pdf#page=110&zoom=100,72,94> `_ unary operator

    Rounds up to the nearest integer; the result is always an Integer.
    """  # noqa E501

    op = CEIL
    py_op = math.ceil
    return_type = Integer
144
+
145
+
146
class Floor(Unary):
    """
    `Floor <https://sdmx.org/wp-content/uploads/VTL-2.1-Reference-Manual.pdf#page=111&zoom=100,72,442> `_ unary operator

    Rounds down to the nearest integer; the result is always an Integer.
    """  # noqa E501

    op = FLOOR
    py_op = math.floor
    return_type = Integer
154
+
155
+
156
class BinPlus(Binary):
    """
    `Addition <https://sdmx.org/wp-content/uploads/VTL-2.1-Reference-Manual.pdf#page=96&zoom=100,72,692> `_ binary operator
    """  # noqa E501

    op = PLUS
    py_op = operator.add
    type_to_check = Number
164
+
165
+
166
class BinMinus(Binary):
    """
    `Subtraction <https://sdmx.org/wp-content/uploads/VTL-2.1-Reference-Manual.pdf#page=98&zoom=100,72,448> `_ binary operator
    """  # noqa E501

    op = MINUS
    py_op = operator.sub
    type_to_check = Number
174
+
175
+
176
class Mult(Binary):
    """
    `Multiplication <https://sdmx.org/wp-content/uploads/VTL-2.1-Reference-Manual.pdf#page=100&zoom=100,72,254>`_
    binary operator
    """  # noqa E501

    op = MULT
    py_op = operator.mul
184
+
185
+
186
class Div(Binary):
    """
    `Division <https://sdmx.org/wp-content/uploads/VTL-2.1-Reference-Manual.pdf#page=102&zoom=100,72,94>`_
    binary operator

    True division; the result is always a Number. Division by zero is
    rejected by ``Binary.op_func``.
    """  # noqa E501

    op = DIV
    py_op = operator.truediv
    return_type = Number
195
+
196
+
197
class Logarithm(Binary):
    """
    `Logarithm <https://sdmx.org/wp-content/uploads/VTL-2.1-Reference-Manual.pdf#page=118&zoom=100,72,228>`_ operator
    """  # noqa E501

    op = LOG
    return_type = Number

    @classmethod
    def py_op(cls, x: Any, param: Any) -> Any:
        """Logarithm of ``x`` in base ``param``.

        A null base propagates as ``None``; a non-positive base raises
        ``SemanticError("2-1-15-3")``.
        """
        if pd.isnull(param):
            return None
        if param > 0:
            return math.log(x, param)
        raise SemanticError("2-1-15-3", op=cls.op, value=param)
213
+
214
+
215
class Modulo(Binary):
    """
    `Modulo <https://sdmx.org/wp-content/uploads/VTL-2.1-Reference-Manual.pdf#page=104&zoom=100,72,94>`_ operator
    """  # noqa E501

    op = MOD
    py_op = operator.mod
222
+
223
+
224
class Power(Binary):
    """
    `Power <https://sdmx.org/wp-content/uploads/VTL-2.1-Reference-Manual.pdf#page=116&zoom=100,72,693>`_ operator
    """  # noqa E501

    op = POWER
    return_type = Number

    @classmethod
    def py_op(cls, x: Any, param: Any) -> Any:
        """Raise ``x`` to the exponent ``param``; a null exponent yields None."""
        return None if pd.isnull(param) else x**param
237
+
238
+
239
class Parameterized(Unary):
    """Base class for unary numeric operators with an optional parameter.

    Inherits from Unary to validate the operand's data type, then adds
    validation of the optional ``param`` (which may be a DataComponent or a
    Scalar, but never a Dataset). Used by ``round``, ``trunc`` and ``random``.
    """

    @classmethod
    def validate(
        cls,
        operand: Operator.ALL_MODEL_DATA_TYPES,
        param: Optional[Union[DataComponent, Scalar]] = None,
    ) -> Any:
        """Validate operand and parameter, then delegate to ``Unary.validate``.

        Raises SemanticError("1-1-15-8") when the parameter is a Dataset, or
        a DataComponent combined with a Scalar operand.

        NOTE(review): this mutates ``cls.return_type`` as a class-level side
        effect (Integer when no param, Number otherwise) — concurrent use of
        the same operator class would race on it; confirm single-threaded use.
        """
        if param is not None:
            if isinstance(param, Dataset):
                raise SemanticError("1-1-15-8", op=cls.op, comp_type="Dataset")
            if isinstance(param, DataComponent):
                # A component-valued param only makes sense when the operand
                # also carries per-row data.
                if isinstance(operand, Scalar):
                    raise SemanticError(
                        "1-1-15-8",
                        op=cls.op,
                        comp_type="DataComponent and an Scalar operand",
                    )
                cls.validate_type_compatibility(param.data_type)
            else:
                cls.validate_scalar_type(param)
        if param is None:
            cls.return_type = Integer
        else:
            cls.return_type = Number

        return super().validate(operand)

    @classmethod
    def op_func(cls, x: Any, param: Optional[Any]) -> Any:
        """Apply ``cls.py_op`` to one value, propagating a null operand."""
        return None if pd.isnull(x) else cls.py_op(x, param)

    @classmethod
    def apply_operation_two_series(cls, left_series: Any, right_series: Any) -> Any:
        """Element-wise application when the parameter is itself a series."""
        return left_series.combine(right_series, cls.op_func)

    @classmethod
    def apply_operation_series_scalar(cls, series: Any, param: Any) -> Any:
        """Element-wise application with a single scalar parameter value."""
        return series.map(lambda x: cls.op_func(x, param))

    @classmethod
    def dataset_evaluation(
        cls, operand: Dataset, param: Optional[Union[DataComponent, Scalar]] = None
    ) -> Dataset:
        """Apply the operation to every measure column of a Dataset.

        Raises SemanticError("2-1-15-1") if the underlying operation fails
        with a ValueError on any measure.
        """
        result = cls.validate(operand, param)
        result.data = operand.data.copy() if operand.data is not None else pd.DataFrame()
        for measure_name in result.get_measures_names():
            try:
                if isinstance(param, DataComponent):
                    result.data[measure_name] = cls.apply_operation_two_series(
                        result.data[measure_name], param.data
                    )
                else:
                    param_value = param.value if param is not None else None
                    result.data[measure_name] = cls.apply_operation_series_scalar(
                        result.data[measure_name], param_value
                    )
            except ValueError:
                # Re-raise with dataset context; `from None` drops the
                # original traceback deliberately.
                raise SemanticError(
                    "2-1-15-1",
                    op=cls.op,
                    comp_name=measure_name,
                    dataset_name=operand.name,
                ) from None
        # Keep only (and reorder to) the declared component columns.
        result.data = result.data[result.get_components_names()]
        return result

    @classmethod
    def component_evaluation(
        cls,
        operand: DataComponent,
        param: Optional[Union[DataComponent, Scalar]] = None,
    ) -> DataComponent:
        """Apply the operation to a single DataComponent's series."""
        result = cls.validate(operand, param)
        if operand.data is None:
            # Normalise missing data to an empty series before applying.
            operand.data = pd.Series()
        result.data = operand.data.copy()
        if isinstance(param, DataComponent):
            result.data = cls.apply_operation_two_series(operand.data, param.data)
        else:
            param_value = param.value if param is not None else None
            result.data = cls.apply_operation_series_scalar(operand.data, param_value)
        return result

    @classmethod
    def scalar_evaluation(cls, operand: Scalar, param: Optional[Any] = None) -> Scalar:
        """Apply the operation to a single scalar value."""
        result = cls.validate(operand, param)
        param_value = param.value if param is not None else None
        result.value = cls.op_func(operand.value, param_value)
        return result

    @classmethod
    def evaluate(
        cls,
        operand: ALL_MODEL_DATA_TYPES,
        param: Optional[Union[DataComponent, Scalar]] = None,
    ) -> Union[DataComponent, Dataset, Scalar]:
        """Dispatch evaluation based on the operand's model type."""
        if isinstance(operand, Dataset):
            return cls.dataset_evaluation(operand, param)
        elif isinstance(operand, DataComponent):
            return cls.component_evaluation(operand, param)
        else:
            return cls.scalar_evaluation(operand, param)
346
+
347
+
348
class Round(Parameterized):
    """
    `Round <https://sdmx.org/wp-content/uploads/VTL-2.1-Reference-Manual.pdf#page=106&zoom=100,72,94>`_ operator
    """  # noqa E501

    op = ROUND
    return_type = Integer

    @classmethod
    def py_op(cls, x: Any, param: Any) -> Any:
        """Round ``x`` half-away-from-zero to ``param`` decimal digits.

        When ``param`` is null the value is rounded to zero digits and
        returned as ``int``; otherwise a ``float`` is returned.
        """
        multiplier = 1.0
        if not pd.isnull(param):
            multiplier = 10**param

        # Half-away-from-zero rounding (Python's built-in round() is
        # half-to-even, which is not what VTL specifies).
        if x >= 0.0:
            rounded_value = math.floor(x * multiplier + 0.5) / multiplier
        else:
            rounded_value = math.ceil(x * multiplier - 0.5) / multiplier

        # Bug fix: use the same null test as the multiplier above (and as
        # Trunc.py_op). The previous ``param is not None`` check returned a
        # float for a NaN param even though the value had been rounded to
        # zero digits.
        if not pd.isnull(param):
            return rounded_value

        return int(rounded_value)
371
+
372
+
373
class Trunc(Parameterized):
    """
    `Trunc <https://sdmx.org/wp-content/uploads/VTL-2.1-Reference-Manual.pdf#page=108&zoom=100,72,94>`_ operator.
    """  # noqa E501

    op = TRUNC

    @classmethod
    def py_op(cls, x: float, param: Optional[float]) -> Any:
        """Truncate ``x`` toward zero at ``param`` decimal digits.

        When ``param`` is null the value is truncated to zero digits and
        returned as ``int``; otherwise a ``float`` is returned.
        """
        multiplier = 1.0
        # ``pd.isnull(None)`` is True, so the previous extra
        # ``param is not None`` conjunct was redundant and has been dropped.
        if not pd.isnull(param):
            multiplier = 10**param

        # int() truncates toward zero, matching VTL trunc semantics.
        truncated_value = int(x * multiplier) / multiplier

        if not pd.isnull(param):
            return truncated_value

        return int(truncated_value)
392
+
393
+
394
class PseudoRandom(_random.Random):
    """A ``_random.Random`` generator seeded deterministically at construction.

    Guarantees a reproducible sequence of values for a given seed.
    """

    def __init__(self, seed: Union[int, float]) -> None:
        super().__init__()
        self.seed(seed)
398
+
399
+
400
class Random(Parameterized):
    """RANDOM operator: deterministic pseudo-random number generation.

    ``seed`` initialises the generator and ``index`` selects the position
    in the generated sequence.
    """

    op = RANDOM
    return_type = Number

    @classmethod
    def validate(cls, seed: Any, index: Any = None) -> Any:
        """Validate the ``index`` parameter and delegate to ``Parameterized``.

        Raises SemanticError("2-1-15-2") for a negative index; warns when the
        index is large enough to noticeably affect performance.
        """
        # Bug fix: guard against index=None — the declared default previously
        # crashed with AttributeError on ``index.data_type``.
        if index is not None:
            if index.data_type != Integer:
                index.data_type = binary_implicit_promotion(index.data_type, Integer)
            if index.value < 0:
                raise SemanticError("2-1-15-2", op=cls.op, value=index)
            if index.value > 10000:
                warnings.warn(
                    "Random: The value of 'index' is very big. This can affect performance.",
                    UserWarning,
                )
        return super().validate(seed, index)

    @classmethod
    def py_op(cls, seed: Union[int, float], index: int) -> float:
        """Return the ``index``-th value of the seeded sequence, rounded to 6 digits."""
        instance: PseudoRandom = PseudoRandom(seed)
        # Advance the generator ``index`` positions before drawing the result.
        for _ in range(index):
            instance.random()
        return instance.random().__round__(6)
@@ -0,0 +1,77 @@
1
+ from copy import copy
2
+ from typing import Any, Union
3
+
4
+ # if os.environ.get("SPARK", False):
5
+ # import pyspark.pandas as pd
6
+ # else:
7
+ # import pandas as pd
8
+ import pandas as pd
9
+
10
+ from vtlengine.Exceptions import SemanticError
11
+ from vtlengine.Model import DataComponent, Role, Scalar
12
+ from vtlengine.Operators import Unary
13
+
14
+ ALLOWED_MODEL_TYPES = Union[DataComponent, Scalar]
15
+
16
+
17
class RoleSetter(Unary):
    """Base operator that reassigns the role of a component or scalar.

    Subclasses set ``role`` to the target Role (IDENTIFIER, MEASURE or
    ATTRIBUTE).
    """

    # Target role applied by this operator; set by each subclass.
    role: Role

    @classmethod
    def validate(cls, operand: ALLOWED_MODEL_TYPES, data_size: int = 0) -> DataComponent:
        """Build a DataComponent carrying the target role.

        A Scalar operand is wrapped into a new DataComponent; identifiers and
        non-null scalars are marked non-nullable.

        NOTE(review): for a DataComponent operand the role is assigned on the
        input object itself before the (shallow) copy, so the caller's operand
        is mutated in place — confirm this side effect is intended.
        """
        if isinstance(operand, Scalar):
            nullable = True
            if cls.role == Role.IDENTIFIER or operand.value is not None:
                nullable = False
            return DataComponent(
                name=operand.name,
                data_type=operand.data_type,
                role=cls.role,
                nullable=nullable,
                data=None,
            )
        operand.role = cls.role
        return copy(operand)

    @classmethod
    def evaluate(cls, operand: Any, data_size: int = 0) -> DataComponent:
        """Validate and attach data to the re-roled component.

        Raises SemanticError("1-1-1-16") when a non-nullable component
        contains nulls. A Scalar operand is broadcast to ``data_size`` rows.
        """
        if (
            isinstance(operand, DataComponent)
            and operand.data is not None
            and not operand.nullable
            and any(operand.data.isnull())
        ):
            raise SemanticError("1-1-1-16")
        result = cls.validate(operand, data_size)
        if isinstance(operand, Scalar):
            # Broadcast the scalar value; dtype=object preserves None values.
            result.data = pd.Series([operand.value] * data_size, dtype=object)
        else:
            result.data = operand.data
        return result
51
+
52
+
53
class Identifier(RoleSetter):
    """Assigns the IDENTIFIER role; identifiers may never be nullable."""

    role = Role.IDENTIFIER

    @classmethod
    def validate(cls, operand: ALLOWED_MODEL_TYPES, data_size: int = 0) -> DataComponent:
        result = super().validate(operand)
        # An identifier component must be non-nullable.
        if not result.nullable:
            return result
        raise SemanticError("1-1-1-16")

    @classmethod
    def evaluate(  # type: ignore[override]
        cls, operand: ALLOWED_MODEL_TYPES, data_size: int = 0
    ) -> DataComponent:
        # A null scalar value can never become an identifier.
        if isinstance(operand, Scalar) and operand.value is None:
            raise SemanticError("1-1-1-16")
        return super().evaluate(operand, data_size)
70
+
71
+
72
class Attribute(RoleSetter):
    """Assigns the ATTRIBUTE role to the operand."""

    role = Role.ATTRIBUTE
74
+
75
+
76
class Measure(RoleSetter):
    """Assigns the MEASURE role to the operand."""

    role = Role.MEASURE
@@ -0,0 +1,176 @@
1
+ from typing import Any, Dict, List
2
+
3
+ # if os.environ.get("SPARK"):
4
+ # import pyspark.pandas as pd
5
+ # else:
6
+ # import pandas as pd
7
+ import pandas as pd
8
+
9
+ from vtlengine.DataTypes import binary_implicit_promotion
10
+ from vtlengine.Exceptions import SemanticError
11
+ from vtlengine.Model import Dataset
12
+ from vtlengine.Operators import Operator
13
+
14
+
15
class Set(Operator):
    """Base class for VTL set operators (union, intersection, symdiff, setdiff)."""

    @classmethod
    def check_same_structure(cls, dataset_1: Dataset, dataset_2: Dataset) -> None:
        """Verify that two datasets share the same structure.

        Checks component count, presence of each component by name,
        type compatibility (via implicit promotion) and matching roles.
        Raises SemanticError("1-1-17-1") on a component-count mismatch.

        NOTE(review): missing components and role mismatches raise bare
        ``Exception`` instead of ``SemanticError`` — inconsistent with the
        rest of the module; confirm whether dedicated error codes exist.
        """
        if len(dataset_1.components) != len(dataset_2.components):
            raise SemanticError(
                "1-1-17-1",
                op=cls.op,
                dataset_1=dataset_1.name,
                dataset_2=dataset_2.name,
            )

        for comp in dataset_1.components.values():
            if comp.name not in dataset_2.components:
                raise Exception(f"Component {comp.name} not found in dataset {dataset_2.name}")
            second_comp = dataset_2.components[comp.name]
            # Raises if the two data types cannot be implicitly promoted.
            binary_implicit_promotion(
                comp.data_type,
                second_comp.data_type,
                cls.type_to_check,
                cls.return_type,
            )
            if comp.role != second_comp.role:
                raise Exception(
                    f"Component {comp.name} has different roles "
                    f"in datasets {dataset_1.name} and {dataset_2.name}"
                )

    @classmethod
    def validate(cls, operands: List[Dataset]) -> Dataset:
        """Check structural compatibility and build the result structure.

        The result's components use the promoted data type and the OR of the
        nullability of the corresponding components across all operands.
        """
        base_operand = operands[0]
        for operand in operands[1:]:
            cls.check_same_structure(base_operand, operand)

        result_components: Dict[str, Any] = {}
        for operand in operands:
            if len(result_components) == 0:
                # NOTE(review): this aliases (not copies) the first operand's
                # components dict, so the promotions below mutate the first
                # operand's component objects in place — confirm intended.
                result_components = operand.components
            else:
                for comp_name, comp in operand.components.items():
                    current_comp = result_components[comp_name]
                    result_components[comp_name].data_type = binary_implicit_promotion(
                        current_comp.data_type, comp.data_type
                    )
                    result_components[comp_name].nullable = current_comp.nullable or comp.nullable

        result = Dataset(name="result", components=result_components, data=None)
        return result
62
+
63
+
64
class Union(Set):
    """VTL union: concatenates datapoints, keeping the first occurrence of
    each identifier combination (left-most operand wins)."""

    @classmethod
    def evaluate(cls, operands: List[Dataset]) -> Dataset:
        result = cls.validate(operands)
        combined = pd.concat(
            [ds.data for ds in operands], sort=True, ignore_index=True
        )
        # Duplicate identifier tuples are resolved in favour of the
        # earliest operand ("first" occurrence wins).
        result.data = combined.drop_duplicates(
            subset=result.get_identifiers_names(), keep="first"
        )
        result.data.reset_index(drop=True, inplace=True)
        return result
74
+
75
+
76
class Intersection(Set):
    """VTL intersection: keeps datapoints whose identifiers appear in all operands."""

    @classmethod
    def evaluate(cls, operands: List[Dataset]) -> Dataset:
        """Inner-merge all operands on their identifier columns.

        If any operand has no data, the intersection is empty.

        NOTE(review): the ``_x`` suffix handling below assumes exactly one
        merge produced the suffixes; with more than two operands, or on the
        empty-data early exit, the ``col + "_x"`` columns may not exist —
        confirm operand cardinality guarantees upstream.
        """
        result = cls.validate(operands)
        all_datapoints = [ds.data for ds in operands]
        for data in all_datapoints:
            if result.data is None:
                result.data = data
            else:
                if data is None:
                    # One empty operand makes the whole intersection empty.
                    result.data = pd.DataFrame(columns=result.get_identifiers_names())
                    break
                result.data = result.data.merge(
                    data, how="inner", on=result.get_identifiers_names()
                )

        not_identifiers = result.get_measures_names() + result.get_attributes_names()

        # Keep the left-hand (first operand) values for measures/attributes.
        for col in not_identifiers:
            result.data[col] = result.data[col + "_x"]
        result.data = result.data[result.get_identifiers_names() + not_identifiers]
        if result.data is not None:
            result.data.reset_index(drop=True, inplace=True)
        return result
100
+
101
+
102
class Symdiff(Set):
    """VTL symmetric difference: datapoints present in exactly one operand."""

    @classmethod
    def evaluate(cls, operands: List[Dataset]) -> Dataset:
        """Outer-merge the operands and keep rows found on only one side.

        Presence on each side is inferred from nullness of the suffixed
        measure columns after the merge.

        NOTE(review): the ``_merge`` column is recomputed for every measure,
        so only the last measure's nullness decides row provenance; a
        legitimately-null measure value would also be misread as "absent".
        Confirm whether measures are guaranteed non-null here.
        """
        result = cls.validate(operands)
        all_datapoints = [ds.data for ds in operands]
        for data in all_datapoints:
            if data is None:
                # Normalise a missing operand to an empty identifier frame.
                data = pd.DataFrame(columns=result.get_identifiers_names())
            if result.data is None:
                result.data = data
            else:
                # Performs the equivalent operation in pyspark.pandas
                result.data = result.data.merge(
                    data,
                    how="outer",
                    on=result.get_identifiers_names(),
                    suffixes=("_x", "_y"),
                )

                # Classify each row by which side(s) it came from, using the
                # nullness of the suffixed measure columns.
                for measure in result.get_measures_names():
                    result.data["_merge"] = result.data.apply(
                        lambda row: (
                            "left_only"
                            if pd.isnull(row[f"{measure}_y"])
                            else ("right_only" if pd.isnull(row[f"{measure}_x"]) else "both")
                        ),
                        axis=1,
                    )

                # Pick the surviving side's value; rows present on both sides
                # get None and are removed by the dropna() below.
                not_identifiers = result.get_measures_names() + result.get_attributes_names()
                for col in not_identifiers:
                    result.data[col] = result.data.apply(
                        lambda x, c=col: (
                            x[c + "_x"]
                            if x["_merge"] == "left_only"
                            else (x[c + "_y"] if x["_merge"] == "right_only" else None)
                        ),
                        axis=1,
                    )
                result.data = result.data[result.get_identifiers_names() + not_identifiers].dropna()
        if result.data is not None:
            result.data = result.data.reset_index(drop=True)
        return result
145
+
146
+
147
class Setdiff(Set):
    """VTL set difference: datapoints of the first operand absent from the rest."""

    @staticmethod
    def has_null(row: Any) -> bool:
        # True when any cell of the row is null — used to detect rows that
        # found no match in the left merge below.
        return row.isnull().any()

    @classmethod
    def evaluate(cls, operands: List[Dataset]) -> Dataset:
        """Left-merge each subsequent operand and keep the unmatched rows.

        Unmatched rows are identified by null right-hand columns after the
        merge (an anti-join emulation).

        NOTE(review): rows whose left-hand values are legitimately null would
        also pass ``has_null`` and be kept/classified incorrectly — confirm
        null-measure datapoints cannot occur here.
        """
        result = cls.validate(operands)
        all_datapoints = [ds.data for ds in operands]
        for data in all_datapoints:
            if result.data is None:
                result.data = data
            else:
                if data is None:
                    # An empty operand removes nothing.
                    data = pd.DataFrame(columns=result.get_identifiers_names())
                result.data = result.data.merge(data, how="left", on=result.get_identifiers_names())
                if len(result.data) > 0:
                    # Keep only rows that did NOT match (null right side).
                    result.data = result.data[result.data.apply(cls.has_null, axis=1)]

                # Collapse the merge suffixes back to the original columns,
                # preferring the left-hand (first operand) values.
                not_identifiers = result.get_measures_names() + result.get_attributes_names()
                for col in not_identifiers:
                    if col + "_x" in result.data:
                        result.data[col] = result.data[col + "_x"]
                        del result.data[col + "_x"]
                    if col + "_y" in result.data:
                        del result.data[col + "_y"]
                result.data = result.data[result.get_identifiers_names() + not_identifiers]
        if result.data is not None:
            result.data.reset_index(drop=True, inplace=True)
        return result