vtlengine-1.4.0rc2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66)
  1. vtlengine/API/_InternalApi.py +791 -0
  2. vtlengine/API/__init__.py +612 -0
  3. vtlengine/API/data/schema/external_routines_schema.json +34 -0
  4. vtlengine/API/data/schema/json_schema_2.1.json +116 -0
  5. vtlengine/API/data/schema/value_domain_schema.json +97 -0
  6. vtlengine/AST/ASTComment.py +57 -0
  7. vtlengine/AST/ASTConstructor.py +598 -0
  8. vtlengine/AST/ASTConstructorModules/Expr.py +1928 -0
  9. vtlengine/AST/ASTConstructorModules/ExprComponents.py +995 -0
  10. vtlengine/AST/ASTConstructorModules/Terminals.py +790 -0
  11. vtlengine/AST/ASTConstructorModules/__init__.py +50 -0
  12. vtlengine/AST/ASTDataExchange.py +10 -0
  13. vtlengine/AST/ASTEncoders.py +32 -0
  14. vtlengine/AST/ASTString.py +675 -0
  15. vtlengine/AST/ASTTemplate.py +558 -0
  16. vtlengine/AST/ASTVisitor.py +25 -0
  17. vtlengine/AST/DAG/__init__.py +479 -0
  18. vtlengine/AST/DAG/_words.py +10 -0
  19. vtlengine/AST/Grammar/Vtl.g4 +705 -0
  20. vtlengine/AST/Grammar/VtlTokens.g4 +409 -0
  21. vtlengine/AST/Grammar/__init__.py +0 -0
  22. vtlengine/AST/Grammar/lexer.py +2139 -0
  23. vtlengine/AST/Grammar/parser.py +16597 -0
  24. vtlengine/AST/Grammar/tokens.py +169 -0
  25. vtlengine/AST/VtlVisitor.py +824 -0
  26. vtlengine/AST/__init__.py +674 -0
  27. vtlengine/DataTypes/TimeHandling.py +562 -0
  28. vtlengine/DataTypes/__init__.py +863 -0
  29. vtlengine/DataTypes/_time_checking.py +135 -0
  30. vtlengine/Exceptions/__exception_file_generator.py +96 -0
  31. vtlengine/Exceptions/__init__.py +159 -0
  32. vtlengine/Exceptions/messages.py +1004 -0
  33. vtlengine/Interpreter/__init__.py +2048 -0
  34. vtlengine/Model/__init__.py +501 -0
  35. vtlengine/Operators/Aggregation.py +357 -0
  36. vtlengine/Operators/Analytic.py +455 -0
  37. vtlengine/Operators/Assignment.py +23 -0
  38. vtlengine/Operators/Boolean.py +106 -0
  39. vtlengine/Operators/CastOperator.py +451 -0
  40. vtlengine/Operators/Clause.py +366 -0
  41. vtlengine/Operators/Comparison.py +488 -0
  42. vtlengine/Operators/Conditional.py +495 -0
  43. vtlengine/Operators/General.py +191 -0
  44. vtlengine/Operators/HROperators.py +254 -0
  45. vtlengine/Operators/Join.py +447 -0
  46. vtlengine/Operators/Numeric.py +422 -0
  47. vtlengine/Operators/RoleSetter.py +77 -0
  48. vtlengine/Operators/Set.py +176 -0
  49. vtlengine/Operators/String.py +578 -0
  50. vtlengine/Operators/Time.py +1144 -0
  51. vtlengine/Operators/Validation.py +275 -0
  52. vtlengine/Operators/__init__.py +900 -0
  53. vtlengine/Utils/__Virtual_Assets.py +34 -0
  54. vtlengine/Utils/__init__.py +479 -0
  55. vtlengine/__extras_check.py +17 -0
  56. vtlengine/__init__.py +27 -0
  57. vtlengine/files/__init__.py +0 -0
  58. vtlengine/files/output/__init__.py +35 -0
  59. vtlengine/files/output/_time_period_representation.py +55 -0
  60. vtlengine/files/parser/__init__.py +240 -0
  61. vtlengine/files/parser/_rfc_dialect.py +22 -0
  62. vtlengine/py.typed +0 -0
  63. vtlengine-1.4.0rc2.dist-info/METADATA +89 -0
  64. vtlengine-1.4.0rc2.dist-info/RECORD +66 -0
  65. vtlengine-1.4.0rc2.dist-info/WHEEL +4 -0
  66. vtlengine-1.4.0rc2.dist-info/licenses/LICENSE.md +661 -0
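
For orientation, the hunk below adds the interpreter entry point, the InterpreterAnalyzer dataclass. The following is a minimal driver sketch, assuming an AST.Start node and the input Dataset objects have already been built through the vtlengine.API layer (the public helpers for that live in vtlengine/API/__init__.py and are not shown in this hunk, so the surrounding setup is an assumption):

    # Hypothetical driver: field names and defaults are taken from the
    # InterpreterAnalyzer dataclass in the hunk below; `ast` and `datasets`
    # are assumed to come from the vtlengine.API / AST constructor layer.
    from vtlengine.Interpreter import InterpreterAnalyzer

    interpreter = InterpreterAnalyzer(
        datasets=datasets,        # Dict[str, Dataset] -- the only required field
        scalars=None,             # optional input scalars
        value_domains=None,       # only needed if the script references value domains
        external_routines=None,   # only needed if the script uses external routines
        only_semantic=True,       # structural/type checks only; no datapoints computed
    )
    results = interpreter.visit(ast)  # visit_Start returns {name: Dataset or Scalar}
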
vtlengine/Interpreter/__init__.py
@@ -0,0 +1,2048 @@
1
+ import csv
2
+ from copy import copy, deepcopy
3
+ from dataclasses import dataclass
4
+ from pathlib import Path
5
+ from typing import Any, Dict, List, Optional, Type, Union
6
+
7
+ import pandas as pd
8
+
9
+ import vtlengine.AST as AST
10
+ import vtlengine.Exceptions
11
+ import vtlengine.Operators as Operators
12
+ from vtlengine.AST import VarID
13
+ from vtlengine.AST.ASTTemplate import ASTTemplate
14
+ from vtlengine.AST.DAG import HRDAGAnalyzer
15
+ from vtlengine.AST.DAG._words import DELETE, GLOBAL, INSERT, PERSISTENT
16
+ from vtlengine.AST.Grammar.tokens import (
17
+ AGGREGATE,
18
+ ALL,
19
+ APPLY,
20
+ AS,
21
+ BETWEEN,
22
+ CALC,
23
+ CAST,
24
+ CHECK_DATAPOINT,
25
+ CHECK_HIERARCHY,
26
+ COUNT,
27
+ CURRENT_DATE,
28
+ DATE_ADD,
29
+ DROP,
30
+ EQ,
31
+ EXISTS_IN,
32
+ EXTERNAL,
33
+ FILL_TIME_SERIES,
34
+ FILTER,
35
+ HAVING,
36
+ HIERARCHY,
37
+ INSTR,
38
+ KEEP,
39
+ MEMBERSHIP,
40
+ REPLACE,
41
+ ROUND,
42
+ SUBSTR,
43
+ TRUNC,
44
+ WHEN,
45
+ )
46
+ from vtlengine.DataTypes import (
47
+ BASIC_TYPES,
48
+ SCALAR_TYPES_CLASS_REVERSE,
49
+ Boolean,
50
+ ScalarType,
51
+ check_unary_implicit_promotion,
52
+ )
53
+ from vtlengine.Exceptions import SemanticError
54
+ from vtlengine.files.output import save_datapoints
55
+ from vtlengine.files.output._time_period_representation import TimePeriodRepresentation
56
+ from vtlengine.files.parser import _fill_dataset_empty_data, load_datapoints
57
+ from vtlengine.Model import (
58
+ Component,
59
+ DataComponent,
60
+ Dataset,
61
+ ExternalRoutine,
62
+ Role,
63
+ Scalar,
64
+ ScalarSet,
65
+ ValueDomain,
66
+ )
67
+ from vtlengine.Operators.Aggregation import extract_grouping_identifiers
68
+ from vtlengine.Operators.Assignment import Assignment
69
+ from vtlengine.Operators.CastOperator import Cast
70
+ from vtlengine.Operators.Comparison import Between, ExistIn
71
+ from vtlengine.Operators.Conditional import Case, If
72
+ from vtlengine.Operators.General import Eval
73
+ from vtlengine.Operators.HROperators import (
74
+ HAAssignment,
75
+ Hierarchy,
76
+ get_measure_from_dataset,
77
+ )
78
+ from vtlengine.Operators.Numeric import Round, Trunc
79
+ from vtlengine.Operators.String import Instr, Replace, Substr
80
+ from vtlengine.Operators.Time import (
81
+ Current_Date,
82
+ Date_Add,
83
+ Fill_time_series,
84
+ Time_Aggregation,
85
+ )
86
+ from vtlengine.Operators.Validation import Check, Check_Datapoint, Check_Hierarchy
87
+ from vtlengine.Utils import (
88
+ AGGREGATION_MAPPING,
89
+ ANALYTIC_MAPPING,
90
+ BINARY_MAPPING,
91
+ HR_COMP_MAPPING,
92
+ HR_NUM_BINARY_MAPPING,
93
+ HR_UNARY_MAPPING,
94
+ JOIN_MAPPING,
95
+ REGULAR_AGGREGATION_MAPPING,
96
+ ROLE_SETTER_MAPPING,
97
+ SET_MAPPING,
98
+ THEN_ELSE,
99
+ UNARY_MAPPING,
100
+ )
101
+ from vtlengine.Utils.__Virtual_Assets import VirtualCounter
102
+
103
+
104
+ # noinspection PyTypeChecker
105
+ @dataclass
106
+ class InterpreterAnalyzer(ASTTemplate):
107
+ # Model elements
108
+ datasets: Dict[str, Dataset]
109
+ scalars: Optional[Dict[str, Scalar]] = None
110
+ value_domains: Optional[Dict[str, ValueDomain]] = None
111
+ external_routines: Optional[Dict[str, ExternalRoutine]] = None
112
+ # Analysis mode
113
+ only_semantic: bool = False
114
+ # Memory efficient
115
+ ds_analysis: Optional[Dict[str, Any]] = None
116
+ datapoints_paths: Optional[Dict[str, Path]] = None
117
+ output_path: Optional[Union[str, Path]] = None
118
+ # Time Period Representation
119
+ time_period_representation: Optional[TimePeriodRepresentation] = None
120
+ # Return only persistent
121
+ return_only_persistent: bool = True
122
+ # Flags to change behavior
123
+ nested_condition: Union[str, bool] = False
124
+ is_from_assignment: bool = False
125
+ is_from_component_assignment: bool = False
126
+ is_from_regular_aggregation: bool = False
127
+ is_from_grouping: bool = False
128
+ is_from_having: bool = False
129
+ is_from_if: bool = False
130
+ is_from_rule: bool = False
131
+ is_from_join: bool = False
132
+ is_from_condition: bool = False
133
+ is_from_hr_val: bool = False
134
+ is_from_hr_agg: bool = False
135
+ condition_stack: Optional[List[str]] = None
136
+ # Handlers for simplicity
137
+ regular_aggregation_dataset: Optional[Dataset] = None
138
+ aggregation_grouping: Optional[List[str]] = None
139
+ aggregation_dataset: Optional[Dataset] = None
140
+ then_condition_dataset: Optional[List[Any]] = None
141
+ else_condition_dataset: Optional[List[Any]] = None
142
+ ruleset_dataset: Optional[Dataset] = None
143
+ rule_data: Optional[pd.DataFrame] = None
144
+ ruleset_signature: Optional[Dict[str, str]] = None
145
+ udo_params: Optional[List[Dict[str, Any]]] = None
146
+ hr_agg_rules_computed: Optional[Dict[str, pd.DataFrame]] = None
147
+ ruleset_mode: Optional[str] = None
148
+ hr_input: Optional[str] = None
149
+ hr_partial_is_valid: Optional[List[bool]] = None
150
+ hr_condition: Optional[Dict[str, str]] = None
151
+ # DL
152
+ dprs: Optional[Dict[str, Optional[Dict[str, Any]]]] = None
153
+ udos: Optional[Dict[str, Optional[Dict[str, Any]]]] = None
154
+ hrs: Optional[Dict[str, Optional[Dict[str, Any]]]] = None
155
+ is_from_case_then: bool = False
156
+ signature_values: Optional[Dict[str, Any]] = None
157
+
158
+ def __post_init__(self) -> None:
159
+ self.datasets_inputs = set(self.datasets.keys())
160
+ self.scalars_inputs = set(self.scalars.keys()) if self.scalars else set()
161
+
162
+ # **********************************
163
+ # * *
164
+ # * Memory efficient *
165
+ # * *
166
+ # **********************************
167
+ def _load_datapoints_efficient(self, statement_num: int) -> None:
168
+ if self.datapoints_paths is None:
169
+ return
170
+ if self.ds_analysis is None:
171
+ return
172
+ if statement_num not in self.ds_analysis[INSERT]:
173
+ return
174
+ for ds_name in self.ds_analysis[INSERT][statement_num]:
175
+ if ds_name in self.datapoints_paths:
176
+ self.datasets[ds_name].data = load_datapoints(
177
+ self.datasets[ds_name].components,
178
+ ds_name,
179
+ self.datapoints_paths[ds_name],
180
+ )
181
+ elif ds_name in self.datasets and self.datasets[ds_name].data is None:
182
+ _fill_dataset_empty_data(self.datasets[ds_name])
183
+
184
+ def _save_datapoints_efficient(self, statement_num: int) -> None:
185
+ if self.output_path is None:
186
+ # Keeping the data in memory if no output path is provided
187
+ return
188
+ if self.ds_analysis is None:
189
+ return
190
+ if statement_num not in self.ds_analysis[DELETE]:
191
+ return
192
+ for ds_name in self.ds_analysis[DELETE][statement_num]:
193
+ if (
194
+ ds_name not in self.datasets
195
+ or not isinstance(self.datasets[ds_name], Dataset)
196
+ or self.datasets[ds_name].data is None
197
+ ):
198
+ continue
199
+ if ds_name in self.ds_analysis[GLOBAL]:
200
+ # We do not save global input datasets, only results of transformations
201
+ self.datasets[ds_name].data = None
202
+ continue
203
+ if self.return_only_persistent and ds_name not in self.ds_analysis[PERSISTENT]:
204
+ self.datasets[ds_name].data = None
205
+ continue
206
+ # Saving only datasets, no scalars
207
+ save_datapoints(
208
+ self.time_period_representation,
209
+ self.datasets[ds_name],
210
+ self.output_path,
211
+ )
212
+ self.datasets[ds_name].data = None
213
+
214
+ def _save_scalars_efficient(self, scalars: Dict[str, Scalar]) -> None:
215
+ output_path = Path(self.output_path) # type: ignore[arg-type]
216
+ output_path.mkdir(parents=True, exist_ok=True)
217
+ result_scalars = dict(scalars)
218
+ if result_scalars:
219
+ sorted(result_scalars.keys())
220
+ file_path = output_path / "_scalars.csv"
221
+ with open(file_path, "w", newline="", encoding="utf-8") as csv_file:
222
+ writer = csv.writer(csv_file)
223
+ writer.writerow(["Name", "Value"])
224
+ for name, scalar in sorted(result_scalars.items(), key=lambda item: item[0]):
225
+ value_to_write = "" if scalar.value is None else scalar.value
226
+ writer.writerow([name, str(value_to_write)])
227
+
228
+ # **********************************
229
+ # * *
230
+ # * AST Visitors *
231
+ # * *
232
+ # **********************************
233
+
234
+ def visit_Start(self, node: AST.Start) -> Any:
235
+ statement_num = 1
236
+ if self.only_semantic:
237
+ Operators.only_semantic = True
238
+ else:
239
+ Operators.only_semantic = False
240
+ results = {}
241
+ scalars_to_save = set()
242
+ invalid_dataset_outputs = []
243
+ invalid_scalar_outputs = []
244
+ for child in node.children:
245
+ if isinstance(child, (AST.Assignment, AST.PersistentAssignment)):
246
+ vtlengine.Exceptions.dataset_output = child.left.value # type: ignore[attr-defined]
247
+ self._load_datapoints_efficient(statement_num)
248
+ if not isinstance(
249
+ child, (AST.HRuleset, AST.DPRuleset, AST.Operator)
250
+ ) and not isinstance(child, (AST.Assignment, AST.PersistentAssignment)):
251
+ raise SemanticError("1-2-5")
252
+ result = self.visit(child)
253
+ if isinstance(result, Dataset) and result.name in self.datasets_inputs:
254
+ invalid_dataset_outputs.append(result.name)
255
+ if isinstance(result, Scalar) and result.name in self.scalars_inputs:
256
+ invalid_scalar_outputs.append(result.name)
257
+
258
+ # Reset some handlers (joins and if)
259
+ self.is_from_join = False
260
+ self.condition_stack = None
261
+ self.then_condition_dataset = None
262
+ self.else_condition_dataset = None
263
+ self.nested_condition = False
264
+
265
+ # Reset VirtualCounter
266
+ VirtualCounter.reset()
267
+
268
+ if result is None:
269
+ continue
270
+
271
+ # Removing output dataset
272
+ vtlengine.Exceptions.dataset_output = None
273
+ # Save results
274
+ self.datasets[result.name] = copy(result)
275
+ results[result.name] = result
276
+ if isinstance(result, Scalar):
277
+ scalars_to_save.add(result.name)
278
+ if self.scalars is None:
279
+ self.scalars = {}
280
+ self.scalars[result.name] = copy(result)
281
+ self._save_datapoints_efficient(statement_num)
282
+ statement_num += 1
283
+ if invalid_dataset_outputs:
284
+ raise SemanticError("0-1-2-8", names=", ".join(invalid_dataset_outputs))
285
+ if invalid_scalar_outputs:
286
+ raise SemanticError("0-1-2-8", names=", ".join(invalid_scalar_outputs))
287
+
288
+ if self.output_path is not None and scalars_to_save:
289
+ scalars_filtered = {
290
+ name: self.scalars[name] # type: ignore[index]
291
+ for name in scalars_to_save
292
+ if (not self.return_only_persistent or name in self.ds_analysis.get(PERSISTENT, [])) # type: ignore[union-attr]
293
+ }
294
+ self._save_scalars_efficient(scalars_filtered)
295
+
296
+ return results
297
+
298
+ # Definition Language
299
+
300
+ def visit_Operator(self, node: AST.Operator) -> None:
301
+ if self.udos is None:
302
+ self.udos = {}
303
+ elif node.op in self.udos:
304
+ raise ValueError(f"User Defined Operator {node.op} already exists")
305
+
306
+ param_info: List[Dict[str, Union[str, Type[ScalarType], AST.AST]]] = []
307
+ for param in node.parameters:
308
+ if param.name in [x["name"] for x in param_info]:
309
+ raise ValueError(f"Duplicated Parameter {param.name} in UDO {node.op}")
310
+ # We use a string for model types, but the data type class for basic types
311
+ # (Integer, Number, String, Boolean, ...)
312
+ if isinstance(param.type_, (Dataset, Component, Scalar)):
313
+ type_ = param.type_.__class__.__name__
314
+ else:
315
+ type_ = param.type_
316
+ param_info.append({"name": param.name, "type": type_})
317
+ if param.default is not None:
318
+ param_info[-1]["default"] = param.default
319
+ if len(param_info) > 1:
320
+ previous_default = param_info[0]
321
+ for i in [1, len(param_info) - 1]:
322
+ if previous_default and not param_info[i]:
323
+ raise SemanticError("1-3-12")
324
+ previous_default = param_info[i]
325
+
326
+ self.udos[node.op] = {
327
+ "params": param_info,
328
+ "expression": node.expression,
329
+ "output": node.output_type,
330
+ }
331
+
332
+ def visit_DPRuleset(self, node: AST.DPRuleset) -> None:
333
+ # Rule names are optional, if not provided, they are generated.
334
+ # If provided, all must be provided
335
+ rule_names = [rule.name for rule in node.rules if rule.name is not None]
336
+ if len(rule_names) != 0 and len(node.rules) != len(rule_names):
337
+ raise SemanticError("1-3-1-7", type="Datapoint Ruleset", name=node.name)
338
+ if len(rule_names) == 0:
339
+ for i, rule in enumerate(node.rules):
340
+ rule.name = (i + 1).__str__()
341
+
342
+ if len(rule_names) != len(set(rule_names)):
343
+ not_unique = [name for name in rule_names if rule_names.count(name) > 1]
344
+ raise SemanticError(
345
+ "1-3-1-5",
346
+ type="Datapoint Ruleset",
347
+ names=", ".join(not_unique),
348
+ ruleset_name=node.name,
349
+ )
350
+
351
+ # Signature has the actual parameters names or aliases if provided
352
+ signature_actual_names = {}
353
+ if not isinstance(node.params, AST.DefIdentifier):
354
+ for param in node.params:
355
+ if param.alias is not None:
356
+ signature_actual_names[param.alias] = param.value
357
+ else:
358
+ signature_actual_names[param.value] = param.value
359
+
360
+ ruleset_data = {
361
+ "rules": node.rules,
362
+ "signature": signature_actual_names,
363
+ "params": (
364
+ [x.value for x in node.params]
365
+ if not isinstance(node.params, AST.DefIdentifier)
366
+ else []
367
+ ),
368
+ "signature_type": node.signature_type,
369
+ }
370
+
371
+ # Adding the ruleset to the dprs dictionary
372
+ if self.dprs is None:
373
+ self.dprs = {}
374
+ elif node.name in self.dprs:
375
+ raise ValueError(f"Datapoint Ruleset {node.name} already exists")
376
+
377
+ self.dprs[node.name] = ruleset_data
378
+
379
+ def visit_HRuleset(self, node: AST.HRuleset) -> None:
380
+ if self.hrs is None:
381
+ self.hrs = {}
382
+
383
+ if node.name in self.hrs:
384
+ raise ValueError(f"Hierarchical Ruleset {node.name} already exists")
385
+
386
+ rule_names = [rule.name for rule in node.rules if rule.name is not None]
387
+ if len(rule_names) != 0 and len(node.rules) != len(rule_names):
388
+ raise ValueError("All rules must have a name, or none of them")
389
+ if len(rule_names) == 0:
390
+ for i, rule in enumerate(node.rules):
391
+ rule.name = (i + 1).__str__()
392
+
393
+ cond_comp: List[Any] = []
394
+ if isinstance(node.element, list):
395
+ cond_comp = [x.value for x in node.element[:-1]]
396
+ node.element = node.element[-1]
397
+
398
+ signature_actual_name = node.element.value
399
+
400
+ ruleset_data = {
401
+ "rules": node.rules,
402
+ "signature": signature_actual_name,
403
+ "condition": cond_comp,
404
+ "node": node,
405
+ }
406
+
407
+ self.hrs[node.name] = ruleset_data
408
+
409
+ # Execution Language
410
+ def visit_Assignment(self, node: AST.Assignment) -> Any:
411
+ if (
412
+ self.is_from_join
413
+ and isinstance(node.left, AST.Identifier)
414
+ and node.left.kind == "ComponentID"
415
+ ):
416
+ self.is_from_component_assignment = True
417
+ self.is_from_assignment = True
418
+ left_operand: str = self.visit(node.left)
419
+ self.is_from_assignment = False
420
+ right_operand: Union[Dataset, DataComponent] = self.visit(node.right)
421
+ self.is_from_component_assignment = False
422
+ result = Assignment.analyze(left_operand, right_operand)
423
+ if isinstance(result, (Dataset, Scalar)):
424
+ result.persistent = isinstance(node, AST.PersistentAssignment)
425
+ return result
426
+
427
+ def visit_PersistentAssignment(self, node: AST.PersistentAssignment) -> Any:
428
+ return self.visit_Assignment(node)
429
+
430
+ def visit_ParFunction(self, node: AST.ParFunction) -> Any:
431
+ return self.visit(node.operand)
432
+
433
+ def visit_BinOp(self, node: AST.BinOp) -> Any:
434
+ is_from_if = False
435
+ if (
436
+ not self.is_from_condition
437
+ and node.op != MEMBERSHIP
438
+ and self.condition_stack is not None
439
+ and len(self.condition_stack) > 0
440
+ ):
441
+ is_from_if = self.is_from_if
442
+ self.is_from_if = False
443
+
444
+ if (
445
+ self.is_from_join
446
+ and node.op in [MEMBERSHIP, AGGREGATE]
447
+ and hasattr(node.left, "value")
448
+ and hasattr(node.right, "value")
449
+ ):
450
+ if self.udo_params is not None and node.right.value in self.udo_params[-1]:
451
+ comp_name = f"{node.left.value}#{self.udo_params[-1][node.right.value]}"
452
+ else:
453
+ comp_name = f"{node.left.value}#{node.right.value}"
454
+ ast_var_id = AST.VarID(
455
+ value=comp_name,
456
+ line_start=node.right.line_start,
457
+ line_stop=node.right.line_stop,
458
+ column_start=node.right.column_start,
459
+ column_stop=node.right.column_stop,
460
+ )
461
+ return self.visit(ast_var_id)
462
+ left_operand = self.visit(node.left)
463
+ right_operand = self.visit(node.right)
464
+ if is_from_if:
465
+ left_operand, right_operand = self.merge_then_else_datasets(left_operand, right_operand)
466
+ if node.op == MEMBERSHIP:
467
+ if right_operand not in left_operand.components and "#" in right_operand:
468
+ right_operand = right_operand.split("#")[1]
469
+ if self.is_from_component_assignment:
470
+ return BINARY_MAPPING[node.op].analyze(
471
+ left_operand, right_operand, self.is_from_component_assignment
472
+ )
473
+ elif self.is_from_regular_aggregation:
474
+ raise SemanticError("1-1-6-6", dataset_name=left_operand, comp_name=right_operand)
475
+ elif len(left_operand.get_identifiers()) == 0:
476
+ raise SemanticError("1-2-10", op=node.op)
477
+ return BINARY_MAPPING[node.op].analyze(left_operand, right_operand)
478
+
479
+ def visit_UnaryOp(self, node: AST.UnaryOp) -> None:
480
+ operand = self.visit(node.operand)
481
+ if node.op not in UNARY_MAPPING and node.op not in ROLE_SETTER_MAPPING:
482
+ raise NotImplementedError
483
+ if (
484
+ self.is_from_regular_aggregation
485
+ and self.regular_aggregation_dataset is not None
486
+ and node.op in ROLE_SETTER_MAPPING
487
+ ):
488
+ if self.regular_aggregation_dataset.data is None:
489
+ data_size = 0
490
+ else:
491
+ data_size = len(self.regular_aggregation_dataset.data)
492
+ return ROLE_SETTER_MAPPING[node.op].analyze(operand, data_size)
493
+ return UNARY_MAPPING[node.op].analyze(operand)
494
+
495
+ def visit_Aggregation(self, node: AST.Aggregation) -> None:
496
+ # Having takes precedence as it is lower in the AST
497
+ if self.is_from_having:
498
+ if node.operand is not None:
499
+ self.visit(node.operand)
500
+ operand = self.aggregation_dataset
501
+ elif self.is_from_regular_aggregation and self.regular_aggregation_dataset is not None:
502
+ operand = self.regular_aggregation_dataset
503
+ if node.operand is not None and operand is not None:
504
+ op_comp: DataComponent = self.visit(node.operand)
505
+ comps_to_keep = {}
506
+ for (
507
+ comp_name,
508
+ comp,
509
+ ) in self.regular_aggregation_dataset.components.items():
510
+ if comp.role == Role.IDENTIFIER:
511
+ comps_to_keep[comp_name] = copy(comp)
512
+ comps_to_keep[op_comp.name] = Component(
513
+ name=op_comp.name,
514
+ data_type=op_comp.data_type,
515
+ role=op_comp.role,
516
+ nullable=op_comp.nullable,
517
+ )
518
+ if operand.data is not None:
519
+ data_to_keep = operand.data[operand.get_identifiers_names()]
520
+ data_to_keep[op_comp.name] = op_comp.data
521
+ else:
522
+ data_to_keep = None
523
+ operand = Dataset(name=operand.name, components=comps_to_keep, data=data_to_keep)
524
+ else:
525
+ operand = self.visit(node.operand)
526
+
527
+ if not isinstance(operand, Dataset):
528
+ raise SemanticError("2-3-4", op=node.op, comp="dataset")
529
+
530
+ for comp in operand.components.values():
531
+ if isinstance(comp.data_type, ScalarType):
532
+ raise SemanticError("2-1-12-1", op=node.op)
533
+
534
+ if node.having_clause is not None and node.grouping is None:
535
+ raise SemanticError("1-2-13")
536
+
537
+ groupings: Any = []
538
+ having = None
539
+ grouping_op = node.grouping_op
540
+ if node.grouping is not None:
541
+ if grouping_op == "group all":
542
+ data = None if self.only_semantic else copy(operand.data)
543
+ self.aggregation_dataset = Dataset(
544
+ name=operand.name, components=operand.components, data=data
545
+ )
546
+ # For Component handling in operators like time_agg
547
+ self.is_from_grouping = True
548
+ for x in node.grouping:
549
+ groupings.append(self.visit(x))
550
+ self.is_from_grouping = False
551
+ if grouping_op == "group all":
552
+ comp_grouped = groupings[0]
553
+ if (
554
+ operand.data is not None
555
+ and comp_grouped.data is not None
556
+ and len(comp_grouped.data) > 0
557
+ ):
558
+ operand.data[comp_grouped.name] = comp_grouped.data
559
+ groupings = [comp_grouped.name]
560
+ self.aggregation_dataset = None
561
+ if node.having_clause is not None:
562
+ self.aggregation_dataset = Dataset(
563
+ name=operand.name,
564
+ components=deepcopy(operand.components),
565
+ data=pd.DataFrame(columns=operand.get_components_names()),
566
+ )
567
+ self.aggregation_grouping = extract_grouping_identifiers(
568
+ operand.get_identifiers_names(), node.grouping_op, groupings
569
+ )
570
+ self.is_from_having = True
571
+ # Empty data analysis on having - we do not care about the result
572
+ self.visit(node.having_clause)
573
+ # Reset to default values
574
+ self.is_from_having = False
575
+ self.aggregation_grouping = None
576
+ self.aggregation_dataset = None
577
+ having = getattr(node.having_clause, "expr", "")
578
+ having = self._format_having_expression_udo(having)
579
+
580
+ elif self.is_from_having:
581
+ groupings = self.aggregation_grouping
582
+ # Setting here group by as we have already selected the identifiers we need
583
+ grouping_op = "group by"
584
+
585
+ result = AGGREGATION_MAPPING[node.op].analyze(operand, grouping_op, groupings, having)
586
+ if not self.is_from_regular_aggregation:
587
+ result.name = VirtualCounter._new_ds_name()
588
+ return result
589
+
590
+ def _format_having_expression_udo(self, having: str) -> str:
591
+ if self.udo_params is None:
592
+ return having
593
+ for k, v in self.udo_params[-1].items():
594
+ old_param = None
595
+ if f"{k} " in having:
596
+ old_param = f"{k} "
597
+ elif f" {k}" in having:
598
+ old_param = f" {k}"
599
+ if old_param is not None:
600
+ if isinstance(v, str):
601
+ new_param = f" {v}"
602
+ elif isinstance(v, (Dataset, Scalar)):
603
+ new_param = f" {v.name}"
604
+ else:
605
+ new_param = f" {v.value}"
606
+ having = having.replace(old_param, new_param)
607
+ return having
608
+
609
+ def visit_Analytic(self, node: AST.Analytic) -> Any: # noqa: C901
610
+ component_name = None
611
+ if self.is_from_regular_aggregation:
612
+ if self.regular_aggregation_dataset is None:
613
+ raise SemanticError("1-1-6-10")
614
+ if node.operand is None:
615
+ operand = self.regular_aggregation_dataset
616
+ else:
617
+ operand_comp = self.visit(node.operand)
618
+ component_name = operand_comp.name
619
+ id_names = self.regular_aggregation_dataset.get_identifiers_names()
620
+ measure_names = self.regular_aggregation_dataset.get_measures_names()
621
+ attribute_names = self.regular_aggregation_dataset.get_attributes_names()
622
+ dataset_components = self.regular_aggregation_dataset.components.copy()
623
+ for name in measure_names + attribute_names:
624
+ dataset_components.pop(name)
625
+
626
+ dataset_components[operand_comp.name] = Component(
627
+ name=operand_comp.name,
628
+ data_type=operand_comp.data_type,
629
+ role=operand_comp.role,
630
+ nullable=operand_comp.nullable,
631
+ )
632
+
633
+ if self.only_semantic or self.regular_aggregation_dataset.data is None:
634
+ data = None
635
+ else:
636
+ data = self.regular_aggregation_dataset.data[id_names].copy()
637
+ data[operand_comp.name] = operand_comp.data
638
+
639
+ operand = Dataset(
640
+ name=self.regular_aggregation_dataset.name,
641
+ components=dataset_components,
642
+ data=data,
643
+ )
644
+
645
+ else:
646
+ operand = self.visit(node.operand)
647
+ partitioning: Any = []
648
+ ordering = []
649
+ if self.udo_params is not None:
650
+ if node.partition_by is not None:
651
+ for comp_name in node.partition_by:
652
+ if comp_name in self.udo_params[-1]:
653
+ partitioning.append(self.udo_params[-1][comp_name])
654
+ elif comp_name in operand.get_identifiers_names():
655
+ partitioning.append(comp_name)
656
+ else:
657
+ raise SemanticError(
658
+ "2-3-9",
659
+ comp_type="Component",
660
+ comp_name=comp_name,
661
+ param="UDO parameters",
662
+ )
663
+ if node.order_by is not None:
664
+ for o in node.order_by:
665
+ if o.component in self.udo_params[-1]:
666
+ o.component = self.udo_params[-1][o.component]
667
+ elif o.component not in operand.get_identifiers_names():
668
+ raise SemanticError(
669
+ "2-3-9",
670
+ comp_type="Component",
671
+ comp_name=o.component,
672
+ param="UDO parameters",
673
+ )
674
+ ordering = node.order_by
675
+
676
+ else:
677
+ partitioning = node.partition_by
678
+ ordering = node.order_by if node.order_by is not None else []
679
+ if not isinstance(operand, Dataset):
680
+ raise SemanticError("2-3-4", op=node.op, comp="dataset")
681
+ if node.partition_by is None:
682
+ order_components = (
683
+ [x.component for x in node.order_by] if node.order_by is not None else []
684
+ )
685
+ partitioning = [x for x in operand.get_identifiers_names() if x not in order_components]
686
+
687
+ params = []
688
+ if node.params is not None:
689
+ for param in node.params:
690
+ if isinstance(param, AST.Constant):
691
+ params.append(param.value)
692
+ else:
693
+ params.append(param)
694
+
695
+ result = ANALYTIC_MAPPING[node.op].analyze(
696
+ operand=operand,
697
+ partitioning=partitioning,
698
+ ordering=ordering,
699
+ window=node.window,
700
+ params=params,
701
+ component_name=component_name,
702
+ )
703
+ if not self.is_from_regular_aggregation:
704
+ return result
705
+
706
+ # Extracting the components we need (only identifiers)
707
+ id_columns = (
708
+ self.regular_aggregation_dataset.get_identifiers_names()
709
+ if (self.regular_aggregation_dataset is not None)
710
+ else None
711
+ )
712
+
713
+ # # Extracting the component we need (only measure)
714
+ if component_name is None or node.op == COUNT:
715
+ measure_name = result.get_measures_names()[0]
716
+ else:
717
+ measure_name = component_name
718
+ # Joining the result with the original dataset
719
+ if self.only_semantic:
720
+ data = None
721
+ else:
722
+ if (
723
+ self.regular_aggregation_dataset is not None
724
+ and self.regular_aggregation_dataset.data is not None
725
+ ):
726
+ joined_result = pd.merge(
727
+ self.regular_aggregation_dataset.data[id_columns],
728
+ result.data,
729
+ on=id_columns,
730
+ how="inner",
731
+ )
732
+ data = joined_result[measure_name]
733
+ else:
734
+ data = None
735
+
736
+ return DataComponent(
737
+ name=measure_name,
738
+ data=data,
739
+ data_type=result.components[measure_name].data_type,
740
+ role=result.components[measure_name].role,
741
+ nullable=result.components[measure_name].nullable,
742
+ )
743
+
744
+ def visit_MulOp(self, node: AST.MulOp) -> None:
745
+ """
746
+ MulOp: (op, children)
747
+
748
+ op: BETWEEN : 'between'.
749
+
750
+ Basic usage:
751
+
752
+ for child in node.children:
753
+ self.visit(child)
754
+ """
755
+ # Comparison Operators
756
+ if node.op == BETWEEN:
757
+ operand_element = self.visit(node.children[0])
758
+ from_element = self.visit(node.children[1])
759
+ to_element = self.visit(node.children[2])
760
+
761
+ return Between.analyze(operand_element, from_element, to_element)
762
+
763
+ # Comparison Operators
764
+ elif node.op == EXISTS_IN:
765
+ dataset_1 = self.visit(node.children[0])
766
+ if not isinstance(dataset_1, Dataset):
767
+ raise SemanticError("2-3-11", pos="First")
768
+ dataset_2 = self.visit(node.children[1])
769
+ if not isinstance(dataset_2, Dataset):
770
+ raise SemanticError("2-3-11", pos="Second")
771
+
772
+ retain_element = None
773
+ if len(node.children) == 3:
774
+ retain_element = self.visit(node.children[2])
775
+ if isinstance(retain_element, Scalar):
776
+ retain_element = retain_element.value
777
+ if retain_element == ALL:
778
+ retain_element = None
779
+
780
+ return ExistIn.analyze(dataset_1, dataset_2, retain_element)
781
+
782
+ # Set Operators.
783
+ elif node.op in SET_MAPPING:
784
+ datasets = []
785
+ for child in node.children:
786
+ datasets.append(self.visit(child))
787
+
788
+ for ds in datasets:
789
+ if not isinstance(ds, Dataset):
790
+ raise ValueError(f"Expected dataset, got {type(ds).__name__}")
791
+
792
+ return SET_MAPPING[node.op].analyze(datasets)
793
+
794
+ elif node.op == CURRENT_DATE:
795
+ return Current_Date.analyze()
796
+
797
+ else:
798
+ raise SemanticError("1-3-5", op_type="MulOp", node_op=node.op)
799
+
800
+ def visit_VarID(self, node: AST.VarID) -> Any: # noqa: C901
801
+ if self.is_from_assignment:
802
+ return node.value
803
+ # Having takes precedence as it is lower in the AST
804
+ if self.udo_params is not None and node.value in self.udo_params[-1]:
805
+ udo_element = copy(self.udo_params[-1][node.value])
806
+ if isinstance(udo_element, (Scalar, Dataset, DataComponent)):
807
+ return udo_element
808
+ # If it is only the component or dataset name, we rename the node.value
809
+ node.value = udo_element
810
+ if self.aggregation_dataset is not None and (self.is_from_having or self.is_from_grouping):
811
+ if node.value not in self.aggregation_dataset.components:
812
+ raise SemanticError(
813
+ "1-1-1-10",
814
+ op=None,
815
+ comp_name=node.value,
816
+ dataset_name=self.aggregation_dataset.name,
817
+ )
818
+ if self.aggregation_dataset.data is None:
819
+ data = None
820
+ else:
821
+ data = copy(self.aggregation_dataset.data[node.value])
822
+ return DataComponent(
823
+ name=node.value,
824
+ data=data,
825
+ data_type=self.aggregation_dataset.components[node.value].data_type,
826
+ role=self.aggregation_dataset.components[node.value].role,
827
+ nullable=self.aggregation_dataset.components[node.value].nullable,
828
+ )
829
+ if self.is_from_regular_aggregation:
830
+ if self.is_from_join and node.value in self.datasets:
831
+ return copy(self.datasets[node.value])
832
+ if self.regular_aggregation_dataset is not None:
833
+ if self.scalars is not None and node.value in self.scalars:
834
+ if node.value in self.regular_aggregation_dataset.components:
835
+ raise SemanticError("1-1-6-11", comp_name=node.value)
836
+ return copy(self.scalars[node.value])
837
+ if self.regular_aggregation_dataset.data is not None:
838
+ if (
839
+ self.is_from_join
840
+ and node.value
841
+ not in self.regular_aggregation_dataset.get_components_names()
842
+ ):
843
+ is_partial_present = 0
844
+ found_comp = None
845
+ for comp_name in self.regular_aggregation_dataset.get_components_names():
846
+ if (
847
+ "#" in comp_name
848
+ and comp_name.split("#")[1] == node.value
849
+ or "#" in node.value
850
+ and node.value.split("#")[1] == comp_name
851
+ ):
852
+ is_partial_present += 1
853
+ found_comp = comp_name
854
+ if is_partial_present == 0:
855
+ raise SemanticError(
856
+ "1-1-1-10",
857
+ comp_name=node.value,
858
+ dataset_name=self.regular_aggregation_dataset.name,
859
+ )
860
+ elif is_partial_present == 2:
861
+ raise SemanticError("1-1-13-9", comp_name=node.value)
862
+ node.value = found_comp # type:ignore[assignment]
863
+ if node.value not in self.regular_aggregation_dataset.components:
864
+ raise SemanticError(
865
+ "1-1-1-10",
866
+ comp_name=node.value,
867
+ dataset_name=self.regular_aggregation_dataset.name,
868
+ )
869
+ data = copy(self.regular_aggregation_dataset.data[node.value])
870
+ else:
871
+ data = None
872
+ return DataComponent(
873
+ name=node.value,
874
+ data=data,
875
+ data_type=self.regular_aggregation_dataset.components[node.value].data_type,
876
+ role=self.regular_aggregation_dataset.components[node.value].role,
877
+ nullable=self.regular_aggregation_dataset.components[node.value].nullable,
878
+ )
879
+ if (
880
+ self.is_from_rule
881
+ and self.ruleset_dataset is not None
882
+ and self.ruleset_signature is not None
883
+ ):
884
+ if node.value not in self.ruleset_signature:
885
+ raise SemanticError("1-1-10-7", comp_name=node.value)
886
+ comp_name = self.ruleset_signature[node.value]
887
+ if comp_name not in self.ruleset_dataset.components:
888
+ raise SemanticError(
889
+ "1-1-1-10",
890
+ comp_name=node.value,
891
+ dataset_name=self.ruleset_dataset.name,
892
+ )
893
+ data = None if self.rule_data is None else self.rule_data[comp_name]
894
+ return DataComponent(
895
+ name=comp_name,
896
+ data=data,
897
+ data_type=self.ruleset_dataset.components[comp_name].data_type,
898
+ role=self.ruleset_dataset.components[comp_name].role,
899
+ nullable=self.ruleset_dataset.components[comp_name].nullable,
900
+ )
901
+ if self.scalars and node.value in self.scalars:
902
+ return copy(self.scalars[node.value])
903
+ if node.value not in self.datasets:
904
+ raise SemanticError("2-3-6", dataset_name=node.value)
905
+
906
+ return copy(self.datasets[node.value])
907
+
908
+ def visit_Collection(self, node: AST.Collection) -> Any:
909
+ if node.kind == "Set":
910
+ elements = []
911
+ duplicates = []
912
+ for child in node.children:
913
+ ref_element = child.children[1] if isinstance(child, AST.ParamOp) else child
914
+ if ref_element in elements:
915
+ duplicates.append(ref_element)
916
+ elements.append(self.visit(child).value)
917
+ if len(duplicates) > 0:
918
+ raise SemanticError("1-2-5", duplicates=duplicates)
919
+ for element in elements:
920
+ if type(element) is not type(elements[0]):
921
+ raise Exception("All elements in a set must be of the same type")
922
+ if len(elements) == 0:
923
+ raise Exception("A set must contain at least one element")
924
+ if len(elements) != len(set(elements)):
925
+ raise Exception("A set must not contain duplicates")
926
+ return ScalarSet(data_type=BASIC_TYPES[type(elements[0])], values=elements)
927
+ elif node.kind == "ValueDomain":
928
+ if self.value_domains is None:
929
+ raise SemanticError("2-3-10", comp_type="Value Domains")
930
+ if node.name not in self.value_domains:
931
+ raise SemanticError("1-2-8", name=node.name)
932
+ vd = self.value_domains[node.name]
933
+ return ScalarSet(data_type=vd.type, values=vd.setlist)
934
+ else:
935
+ raise SemanticError("1-2-9", name=node.name)
936
+
937
+ def visit_RegularAggregation(self, node: AST.RegularAggregation) -> None: # noqa: C901
938
+ operands = []
939
+ dataset = self.visit(node.dataset)
940
+ if isinstance(dataset, Scalar):
941
+ raise SemanticError("1-1-1-20", op=node.op)
942
+ self.regular_aggregation_dataset = dataset
943
+ if node.op == APPLY:
944
+ op_map = BINARY_MAPPING
945
+ return REGULAR_AGGREGATION_MAPPING[node.op].analyze(dataset, node.children, op_map)
946
+ for child in node.children:
947
+ self.is_from_regular_aggregation = True
948
+ operands.append(self.visit(child))
949
+ self.is_from_regular_aggregation = False
950
+ if node.op == CALC and any(isinstance(operand, Dataset) for operand in operands):
951
+ raise SemanticError("1-2-14", op=node.op)
952
+ if node.op == AGGREGATE:
953
+ # Extracting the role encoded inside the children assignments
954
+ role_info = {
955
+ child.left.value: child.left.role
956
+ for child in node.children
957
+ if hasattr(child, "left")
958
+ }
959
+ dataset = copy(operands[0])
960
+ if self.regular_aggregation_dataset is not None:
961
+ dataset.name = self.regular_aggregation_dataset.name
962
+ dataset.components = {
963
+ comp_name: comp
964
+ for comp_name, comp in dataset.components.items()
965
+ if comp.role != Role.MEASURE
966
+ }
967
+ if dataset.data is not None:
968
+ dataset.data = dataset.data[dataset.get_identifiers_names()]
969
+ aux_operands = []
970
+ for operand in operands:
971
+ measure = operand.get_component(operand.get_measures_names()[0])
972
+ data = operand.data[measure.name] if operand.data is not None else None
973
+ # Getting role from encoded information
974
+ # (handling also UDO params as it is present in the value of the mapping)
975
+ if self.udo_params is not None and operand.name in self.udo_params[-1].values():
976
+ role = None
977
+ for k, v in self.udo_params[-1].items():
978
+ if isinstance(v, str) and v == operand.name:
979
+ role_key = k
980
+ role = role_info[role_key]
981
+ else:
982
+ role = role_info[operand.name]
983
+ aux_operands.append(
984
+ DataComponent(
985
+ name=operand.name,
986
+ data=data,
987
+ data_type=measure.data_type,
988
+ role=role if role is not None else measure.role,
989
+ nullable=measure.nullable,
990
+ )
991
+ )
992
+ operands = aux_operands
993
+ self.regular_aggregation_dataset = None
994
+ if node.op == FILTER:
995
+ if not isinstance(operands[0], DataComponent) and hasattr(child, "left"):
996
+ measure = child.left.value
997
+ operands[0] = DataComponent(
998
+ name=measure,
999
+ data=operands[0].data[measure],
1000
+ data_type=operands[0].components[measure].data_type,
1001
+ role=operands[0].components[measure].role,
1002
+ nullable=operands[0].components[measure].nullable,
1003
+ )
1004
+ return REGULAR_AGGREGATION_MAPPING[node.op].analyze(operands[0], dataset)
1005
+ if self.is_from_join:
1006
+ if node.op in [DROP, KEEP]:
1007
+ operands = [
1008
+ (
1009
+ operand.get_measures_names()
1010
+ if isinstance(operand, Dataset)
1011
+ else (
1012
+ operand.name
1013
+ if isinstance(operand, DataComponent)
1014
+ and operand.role is not Role.IDENTIFIER
1015
+ else operand
1016
+ )
1017
+ )
1018
+ for operand in operands
1019
+ ]
1020
+ operands = list(
1021
+ set(
1022
+ [
1023
+ item
1024
+ for sublist in operands
1025
+ for item in (sublist if isinstance(sublist, list) else [sublist])
1026
+ ]
1027
+ )
1028
+ )
1029
+ result = REGULAR_AGGREGATION_MAPPING[node.op].analyze(operands, dataset)
1030
+ if node.isLast:
1031
+ if result.data is not None:
1032
+ result.data.rename(
1033
+ columns={col: col[col.find("#") + 1 :] for col in result.data.columns},
1034
+ inplace=True,
1035
+ )
1036
+ result.components = {
1037
+ comp_name[comp_name.find("#") + 1 :]: comp
1038
+ for comp_name, comp in result.components.items()
1039
+ }
1040
+ for comp in result.components.values():
1041
+ comp.name = comp.name[comp.name.find("#") + 1 :]
1042
+ if result.data is not None:
1043
+ result.data.reset_index(drop=True, inplace=True)
1044
+ self.is_from_join = False
1045
+ return result
1046
+ return REGULAR_AGGREGATION_MAPPING[node.op].analyze(operands, dataset)
1047
+
1048
+ def visit_If(self, node: AST.If) -> Dataset:
1049
+ self.is_from_condition = True
1050
+ condition = self.visit(node.condition)
1051
+ self.is_from_condition = False
1052
+
1053
+ if isinstance(condition, Scalar):
1054
+ thenValue = self.visit(node.thenOp)
1055
+ elseValue = self.visit(node.elseOp)
1056
+ if not isinstance(thenValue, Scalar) or not isinstance(elseValue, Scalar):
1057
+ raise SemanticError(
1058
+ "1-1-9-3",
1059
+ op="If_op",
1060
+ then_name=thenValue.name,
1061
+ else_name=elseValue.name,
1062
+ )
1063
+ if condition.value:
1064
+ return self.visit(node.thenOp)
1065
+ else:
1066
+ return self.visit(node.elseOp)
1067
+
1068
+ # Analysis for data component and dataset
1069
+ else:
1070
+ if self.condition_stack is None:
1071
+ self.condition_stack = []
1072
+ if self.then_condition_dataset is None:
1073
+ self.then_condition_dataset = []
1074
+ if self.else_condition_dataset is None:
1075
+ self.else_condition_dataset = []
1076
+ self.generate_then_else_datasets(copy(condition))
1077
+
1078
+ self.condition_stack.append(THEN_ELSE["then"])
1079
+ self.is_from_if = True
1080
+ self.nested_condition = "T" if isinstance(node.thenOp, AST.If) else False
1081
+ thenOp = self.visit(node.thenOp)
1082
+ if isinstance(thenOp, Scalar) or not isinstance(node.thenOp, AST.BinOp):
1083
+ self.then_condition_dataset.pop()
1084
+ self.condition_stack.pop()
1085
+
1086
+ self.condition_stack.append(THEN_ELSE["else"])
1087
+ self.is_from_if = True
1088
+ self.nested_condition = "E" if isinstance(node.elseOp, AST.If) else False
1089
+ elseOp = self.visit(node.elseOp)
1090
+ if isinstance(elseOp, Scalar) or (
1091
+ not isinstance(node.elseOp, AST.BinOp) and not isinstance(node.elseOp, AST.If)
1092
+ ):
1093
+ if len(self.else_condition_dataset) > 0:
1094
+ self.else_condition_dataset.pop()
1095
+ if len(self.condition_stack) > 0:
1096
+ self.condition_stack.pop()
1097
+
1098
+ return If.analyze(condition, thenOp, elseOp)
1099
+
1100
+ def visit_Case(self, node: AST.Case) -> Any:
1101
+ conditions: List[Any] = []
1102
+ thenOps: List[Any] = []
1103
+
1104
+ if self.condition_stack is None:
1105
+ self.condition_stack = []
1106
+ if self.then_condition_dataset is None:
1107
+ self.then_condition_dataset = []
1108
+ if self.else_condition_dataset is None:
1109
+ self.else_condition_dataset = []
1110
+
1111
+ for case in node.cases:
1112
+ self.is_from_condition = True
1113
+ cond = self.visit(case.condition)
1114
+ self.is_from_condition = False
1115
+
1116
+ conditions.append(cond)
1117
+ if isinstance(cond, Scalar):
1118
+ then_result = self.visit(case.thenOp)
1119
+ thenOps.append(then_result)
1120
+ continue
1121
+
1122
+ self.generate_then_else_datasets(copy(cond))
1123
+
1124
+ self.condition_stack.append(THEN_ELSE["then"])
1125
+ self.is_from_if = True
1126
+ self.is_from_case_then = True
1127
+
1128
+ then_result = self.visit(case.thenOp)
1129
+ thenOps.append(then_result)
1130
+
1131
+ self.is_from_case_then = False
1132
+ self.is_from_if = False
1133
+ if len(self.condition_stack) > 0:
1134
+ self.condition_stack.pop()
1135
+ if len(self.then_condition_dataset) > 0:
1136
+ self.then_condition_dataset.pop()
1137
+ if len(self.else_condition_dataset) > 0:
1138
+ self.else_condition_dataset.pop()
1139
+
1140
+ elseOp = self.visit(node.elseOp)
1141
+
1142
+ return Case.analyze(conditions, thenOps, elseOp)
1143
+
1144
+ def visit_RenameNode(self, node: AST.RenameNode) -> Any:
1145
+ if self.udo_params is not None:
1146
+ if "#" in node.old_name:
1147
+ if node.old_name.split("#")[1] in self.udo_params[-1]:
1148
+ comp_name = self.udo_params[-1][node.old_name.split("#")[1]]
1149
+ node.old_name = f"{node.old_name.split('#')[0]}#{comp_name}"
1150
+ else:
1151
+ if node.old_name in self.udo_params[-1]:
1152
+ node.old_name = self.udo_params[-1][node.old_name]
1153
+
1154
+ if (
1155
+ self.is_from_join
1156
+ and self.regular_aggregation_dataset is not None
1157
+ and node.old_name not in self.regular_aggregation_dataset.components
1158
+ ):
1159
+ node.old_name = node.old_name.split("#")[1]
1160
+
1161
+ return node
1162
+
1163
+ def visit_Constant(self, node: AST.Constant) -> Any:
1164
+ return Scalar(
1165
+ name=str(node.value),
1166
+ value=node.value,
1167
+ data_type=BASIC_TYPES[type(node.value)],
1168
+ )
1169
+
1170
+ def visit_JoinOp(self, node: AST.JoinOp) -> None:
1171
+ clause_elements = []
1172
+ for clause in node.clauses:
1173
+ clause_elements.append(self.visit(clause))
1174
+ if hasattr(clause, "op") and clause.op == AS:
1175
+ # TODO: We need to delete somewhere the join datasets with alias that are added here
1176
+ self.datasets[clause_elements[-1].name] = clause_elements[-1]
1177
+
1178
+ # No need to check using, regular aggregation is executed afterwards
1179
+ self.is_from_join = True
1180
+ return JOIN_MAPPING[node.op].analyze(clause_elements, node.using)
1181
+
1182
+ def visit_ParamConstant(self, node: AST.ParamConstant) -> str:
1183
+ return node.value
1184
+
1185
+ def visit_ParamOp(self, node: AST.ParamOp) -> None: # noqa: C901
1186
+ if node.op == ROUND:
1187
+ op_element = self.visit(node.children[0])
1188
+ param_element = self.visit(node.params[0]) if len(node.params) != 0 else None
1189
+ return Round.analyze(op_element, param_element)
1190
+
1191
+ # Numeric Operator
1192
+ elif node.op == TRUNC:
1193
+ op_element = self.visit(node.children[0])
1194
+ param_element = None
1195
+ if len(node.params) != 0:
1196
+ param_element = self.visit(node.params[0])
1197
+
1198
+ return Trunc.analyze(op_element, param_element)
1199
+
1200
+ elif node.op == SUBSTR or node.op == REPLACE or node.op == INSTR:
1201
+ params = [None, None, None]
1202
+ op_element = self.visit(node.children[0])
1203
+ for i, node_param in enumerate(node.params):
1204
+ params[i] = self.visit(node_param)
1205
+ param1, param2, param3 = tuple(params)
1206
+ if node.op == SUBSTR:
1207
+ return Substr.analyze(op_element, param1, param2)
1208
+ elif node.op == REPLACE:
1209
+ return Replace.analyze(op_element, param1, param2)
1210
+ elif node.op == INSTR:
1211
+ return Instr.analyze(op_element, param1, param2, param3)
1212
+ else:
1213
+ raise NotImplementedError
1214
+ elif node.op == HAVING:
1215
+ if self.aggregation_dataset is not None and self.aggregation_grouping is not None:
1216
+ for id_name in self.aggregation_grouping:
1217
+ if id_name not in self.aggregation_dataset.components:
1218
+ raise SemanticError("1-1-2-4", op=node.op, id_name=id_name)
1219
+ if len(self.aggregation_dataset.get_measures()) != 1:
1220
+ raise ValueError("Only one measure is allowed")
1221
+ # Deepcopy is necessary for components to avoid changing the original dataset
1222
+ self.aggregation_dataset.components = {
1223
+ comp_name: deepcopy(comp)
1224
+ for comp_name, comp in self.aggregation_dataset.components.items()
1225
+ if comp_name in self.aggregation_grouping or comp.role == Role.MEASURE
1226
+ }
1227
+
1228
+ self.aggregation_dataset.data = (
1229
+ self.aggregation_dataset.data[
1230
+ self.aggregation_dataset.get_identifiers_names()
1231
+ + self.aggregation_dataset.get_measures_names()
1232
+ ]
1233
+ if (self.aggregation_dataset.data is not None)
1234
+ else None
1235
+ )
1236
+ result = self.visit(node.params)
1237
+ measure = result.get_measures()[0]
1238
+ if measure.data_type != Boolean:
1239
+ raise SemanticError("1-1-2-3", type=SCALAR_TYPES_CLASS_REVERSE[Boolean])
1240
+ return None
1241
+ elif node.op == FILL_TIME_SERIES:
1242
+ mode = self.visit(node.params[0]) if len(node.params) == 1 else "all"
1243
+ return Fill_time_series.analyze(self.visit(node.children[0]), mode)
1244
+ elif node.op == DATE_ADD:
1245
+ params = [self.visit(node.params[0]), self.visit(node.params[1])]
1246
+ return Date_Add.analyze(self.visit(node.children[0]), params)
1247
+ elif node.op == CAST:
1248
+ operand = self.visit(node.children[0])
1249
+ scalar_type = node.children[1]
1250
+ mask = None
1251
+ if len(node.params) > 0:
1252
+ mask = self.visit(node.params[0])
1253
+ return Cast.analyze(operand, scalar_type, mask)
1254
+
1255
+ elif node.op == CHECK_DATAPOINT:
1256
+ if self.dprs is None:
1257
+ raise SemanticError("1-2-6", node_type="Datapoint Rulesets", node_value="")
1258
+ # Checking if ruleset exists
1259
+ dpr_name: Any = node.children[1]
1260
+ if dpr_name not in self.dprs:
1261
+ raise SemanticError("1-2-6", node_type="Datapoint Ruleset", node_value=dpr_name)
1262
+ dpr_info = self.dprs[dpr_name]
1263
+
1264
+ # Extracting dataset
1265
+ dataset_element = self.visit(node.children[0])
1266
+ if not isinstance(dataset_element, Dataset):
1267
+ raise SemanticError("1-1-1-20", op=node.op)
1268
+ # Checking if list of components supplied is valid
1269
+ if len(node.children) > 2:
1270
+ for comp_name in node.children[2:]:
1271
+ if comp_name.__str__() not in dataset_element.components:
1272
+ raise SemanticError(
1273
+ "1-1-1-10",
1274
+ comp_name=comp_name,
1275
+ dataset_name=dataset_element.name,
1276
+ )
1277
+ if dpr_info is not None and dpr_info["signature_type"] == "variable":
1278
+ for i, comp_name in enumerate(node.children[2:]):
1279
+ if comp_name != dpr_info["params"][i]:
1280
+ raise SemanticError(
1281
+ "1-1-10-3",
1282
+ op=node.op,
1283
+ expected=dpr_info["params"][i],
1284
+ found=comp_name,
1285
+ )
1286
+
1287
+ output: Any = node.params[0] # invalid, all_measures, all
1288
+ if dpr_info is None:
1289
+ dpr_info = {}
1290
+
1291
+ rule_output_values = {}
1292
+ self.ruleset_dataset = dataset_element
1293
+ self.ruleset_signature = dpr_info["signature"]
1294
+ self.ruleset_mode = output
1295
+ # Gather rule data, adding the ruleset dataset to the interpreter
1296
+ if dpr_info is not None:
1297
+ for rule in dpr_info["rules"]:
1298
+ rule_output_values[rule.name] = {
1299
+ "errorcode": rule.erCode,
1300
+ "errorlevel": rule.erLevel,
1301
+ "output": self.visit(rule),
1302
+ }
1303
+ self.ruleset_mode = None
1304
+ self.ruleset_signature = None
1305
+ self.ruleset_dataset = None
1306
+
1307
+ # Datapoint Ruleset final evaluation
1308
+ return Check_Datapoint.analyze(
1309
+ dataset_element=dataset_element,
1310
+ rule_info=rule_output_values,
1311
+ output=output,
1312
+ )
1313
+ elif node.op in (CHECK_HIERARCHY, HIERARCHY):
1314
+ component: Optional[str] = None
1315
+ if len(node.children) == 2:
1316
+ dataset, hr_name = (self.visit(x) for x in node.children)
1317
+ cond_components: List[str] = []
1318
+ elif len(node.children) == 3:
1319
+ dataset, component, hr_name = (self.visit(x) for x in node.children)
1320
+ cond_components = []
1321
+ else:
1322
+ children = [self.visit(x) for x in node.children]
1323
+ dataset = children[0]
1324
+ component = children[1]
1325
+ hr_name = children[2]
1326
+ cond_components = children[3:]
1327
+
1328
+ # Input is always dataset
1329
+ mode, input_, output = (self.visit(param) for param in node.params)
1330
+
1331
+ # Sanitise the hierarchical ruleset and the call
1332
+
1333
+ if self.hrs is None:
1334
+ raise SemanticError("1-2-6", node_type="Hierarchical Rulesets", node_value="")
1335
+ else:
1336
+ if hr_name not in self.hrs:
1337
+ raise SemanticError(
1338
+ "1-2-6", node_type="Hierarchical Ruleset", node_value=hr_name
1339
+ )
1340
+
1341
+ if not isinstance(dataset, Dataset):
1342
+ raise SemanticError("1-1-1-20", op=node.op)
1343
+
1344
+ hr_info = self.hrs[hr_name]
1345
+ if hr_info is not None:
1346
+ if len(cond_components) != len(hr_info["condition"]):
1347
+ raise SemanticError("1-1-10-2", op=node.op)
1348
+
1349
+ if (
1350
+ hr_info["node"].signature_type == "variable"
1351
+ and hr_info["signature"] != component
1352
+ ):
1353
+ raise SemanticError(
1354
+ "1-1-10-3",
1355
+ op=node.op,
1356
+ found=component,
1357
+ expected=hr_info["signature"],
1358
+ )
1359
+ elif hr_info["node"].signature_type == "valuedomain" and component is None:
1360
+ raise SemanticError("1-1-10-4", op=node.op)
1361
+ elif component is None:
1362
+ # TODO: Leaving this until refactor in Ruleset handling is done
1363
+ raise NotImplementedError(
1364
+ "Hierarchical Ruleset handling without component "
1365
+ "and signature type variable is not implemented yet."
1366
+ )
1367
+
1368
+ cond_info = {}
1369
+ for i, cond_comp in enumerate(hr_info["condition"]):
1370
+ if (
1371
+ hr_info["node"].signature_type == "variable"
1372
+ and cond_components[i] != cond_comp
1373
+ ):
1374
+ raise SemanticError(
1375
+ "1-1-10-6",
1376
+ op=node.op,
1377
+ expected=cond_comp,
1378
+ found=cond_components[i],
1379
+ )
1380
+ cond_info[cond_comp] = cond_components[i]
1381
+
1382
+ if node.op == HIERARCHY:
1383
+ aux = []
1384
+ for rule in hr_info["rules"]:
1385
+ if rule.rule.op == EQ or rule.rule.op == WHEN and rule.rule.right.op == EQ:
1386
+ aux.append(rule)
1387
+ # Filter only the rules with HRBinOP as =,
1388
+ # as they are the ones that will be computed
1389
+ if len(aux) == 0:
1390
+ raise SemanticError("1-1-10-5")
1391
+ hr_info["rules"] = aux
1392
+
1393
+ hierarchy_ast = AST.HRuleset(
1394
+ name=hr_name,
1395
+ signature_type=hr_info["node"].signature_type,
1396
+ element=hr_info["node"].element,
1397
+ rules=aux,
1398
+ line_start=node.line_start,
1399
+ line_stop=node.line_stop,
1400
+ column_start=node.column_start,
1401
+ column_stop=node.column_stop,
1402
+ )
1403
+ HRDAGAnalyzer().visit(hierarchy_ast)
1404
+
1405
+ Check_Hierarchy.validate_hr_dataset(dataset, component)
1406
+
1407
+ # Gather rule data, adding the necessary elements to the interpreter
1408
+ # for simplicity
1409
+ self.ruleset_dataset = dataset
1410
+ self.ruleset_signature = {**{"RULE_COMPONENT": component}, **cond_info}
1411
+ self.ruleset_mode = mode
1412
+ self.hr_input = input_
1413
+ rule_output_values = {}
1414
+ if node.op == HIERARCHY:
1415
+ self.is_from_hr_agg = True
1416
+ self.hr_agg_rules_computed = {}
1417
+ for rule in hr_info["rules"]:
1418
+ self.visit(rule)
1419
+ self.is_from_hr_agg = False
1420
+ else:
1421
+ self.is_from_hr_val = True
1422
+ for rule in hr_info["rules"]:
1423
+ rule_output_values[rule.name] = {
1424
+ "errorcode": rule.erCode,
1425
+ "errorlevel": rule.erLevel,
1426
+ "output": self.visit(rule),
1427
+ }
1428
+ self.is_from_hr_val = False
1429
+ self.ruleset_signature = None
1430
+ self.ruleset_dataset = None
1431
+ self.ruleset_mode = None
1432
+ self.hr_input = None
1433
+
1434
+ # Final evaluation
1435
+ if node.op == CHECK_HIERARCHY:
1436
+ result = Check_Hierarchy.analyze(
1437
+ dataset_element=dataset,
1438
+ rule_info=rule_output_values,
1439
+ output=output,
1440
+ )
1441
+ del rule_output_values
1442
+ else:
1443
+ result = Hierarchy.analyze(dataset, self.hr_agg_rules_computed, output)
1444
+ self.hr_agg_rules_computed = None
1445
+ return result
1446
+
1447
+ raise SemanticError("1-3-5", op_type="ParamOp", node_op=node.op)
1448
+
1449
+ def visit_DPRule(self, node: AST.DPRule) -> None:
1450
+ self.is_from_rule = True
1451
+ if self.ruleset_dataset is not None:
1452
+ if self.ruleset_dataset.data is None:
1453
+ self.rule_data = None
1454
+ else:
1455
+ self.rule_data = self.ruleset_dataset.data.copy()
1456
+ validation_data = self.visit(node.rule)
1457
+ if isinstance(validation_data, DataComponent):
1458
+ if self.rule_data is not None and self.ruleset_dataset is not None:
1459
+ aux = self.rule_data.loc[:, self.ruleset_dataset.get_components_names()]
1460
+ aux["bool_var"] = validation_data.data
1461
+ validation_data = aux
1462
+ else:
1463
+ validation_data = None
1464
+ if self.ruleset_mode == "invalid" and validation_data is not None:
1465
+ validation_data = validation_data[validation_data["bool_var"] == False]
1466
+ self.rule_data = None
1467
+ self.is_from_rule = False
1468
+ return validation_data
1469
+
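visit_DPRule above selects invalid datapoints with bool_var == False rather than ~bool_var, so rows whose rule result is null are not reported as invalid. A small pandas illustration of that distinction (column names invented):

import pandas as pd

df = pd.DataFrame({"Id_1": ["A", "B", "C"], "bool_var": [True, False, None]})

# "== False" keeps only rows that are explicitly False; the null row is excluded.
invalid = df[df["bool_var"] == False]  # noqa: E712
assert invalid["Id_1"].tolist() == ["B"]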
1470
+ def visit_HRule(self, node: AST.HRule) -> None:
1471
+ self.is_from_rule = True
1472
+ if self.ruleset_dataset is not None:
1473
+ self.rule_data = (
1474
+ None if self.ruleset_dataset.data is None else self.ruleset_dataset.data.copy()
1475
+ )
1476
+ rule_result = self.visit(node.rule)
1477
+ if rule_result is None:
1478
+ self.is_from_rule = False
1479
+ return None
1480
+ if self.is_from_hr_agg:
1481
+ measure_name = rule_result.get_measures_names()[0]
1482
+ if (
1483
+ self.hr_agg_rules_computed is not None
1484
+ and rule_result.data is not None
1485
+ and len(rule_result.data[measure_name]) > 0
1486
+ ):
1487
+ self.hr_agg_rules_computed[rule_result.name] = rule_result.data
1488
+ else:
1489
+ rule_result = rule_result.data
1490
+ self.rule_data = None
1491
+ self.is_from_rule = False
1492
+ return rule_result
1493
+
1494
+ def visit_HRBinOp(self, node: AST.HRBinOp) -> Any:
1495
+ if node.op == WHEN:
1496
+ filter_comp = self.visit(node.left)
1497
+ if self.rule_data is None:
1498
+ return None
1499
+ filtering_indexes = list(filter_comp.data[filter_comp.data == True].index)
1500
+ nan_indexes = list(filter_comp.data[filter_comp.data.isnull()].index)
1501
+ # If no filtering indexes, then all datapoints are valid on DPR and HR
1502
+ if len(filtering_indexes) == 0 and not (self.is_from_hr_agg or self.is_from_hr_val):
1503
+ self.rule_data["bool_var"] = True
1504
+ self.rule_data.loc[nan_indexes, "bool_var"] = None
1505
+ return self.rule_data
1506
+ non_filtering_indexes = list(set(filter_comp.data.index) - set(filtering_indexes))
1507
+
1508
+ original_data = self.rule_data.copy()
1509
+ self.rule_data = self.rule_data.iloc[filtering_indexes].reset_index(drop=True)
1510
+ result_validation = self.visit(node.right)
1511
+ if self.is_from_hr_agg or self.is_from_hr_val:
1512
+ # We only need to filter rule_data on DPR
1513
+ return result_validation
1514
+ self.rule_data["bool_var"] = result_validation.data
1515
+ original_data = original_data.merge(
1516
+ self.rule_data, how="left", on=original_data.columns.tolist()
1517
+ )
1518
+ original_data.loc[non_filtering_indexes, "bool_var"] = True
1519
+ original_data.loc[nan_indexes, "bool_var"] = None
1520
+ return original_data
1521
+ elif node.op in HR_COMP_MAPPING:
1522
+ self.is_from_assignment = True
1523
+ if self.ruleset_mode in ("partial_null", "partial_zero"):
1524
+ self.hr_partial_is_valid = []
1525
+ left_operand = self.visit(node.left)
1526
+ self.is_from_assignment = False
1527
+ right_operand = self.visit(node.right)
1528
+ if isinstance(right_operand, Dataset):
1529
+ right_operand = get_measure_from_dataset(right_operand, node.right.value)
1530
+
1531
+ if self.ruleset_mode in ("partial_null", "partial_zero"):
1532
+ # Discard the result if none of the referenced code items were present in the dataset
1533
+ if self.hr_partial_is_valid and not any(self.hr_partial_is_valid):
1534
+ right_operand.data = right_operand.data.map(lambda x: "REMOVE_VALUE")
1535
+ self.hr_partial_is_valid = []
1536
+
1537
+ if self.is_from_hr_agg:
1538
+ return HAAssignment.analyze(left_operand, right_operand, self.ruleset_mode)
1539
+ else:
1540
+ result = HR_COMP_MAPPING[node.op].analyze(
1541
+ left_operand, right_operand, self.ruleset_mode
1542
+ )
1543
+ left_measure = left_operand.get_measures()[0]
1544
+ if left_operand.data is None:
1545
+ result.data = None
1546
+ else:
1547
+ left_original_measure_data = left_operand.data[left_measure.name]
1548
+ result.data[left_measure.name] = left_original_measure_data
1549
+ result.components[left_measure.name] = left_measure
1550
+ return result
1551
+ else:
1552
+ left_operand = self.visit(node.left)
1553
+ right_operand = self.visit(node.right)
1554
+ if (
1555
+ isinstance(left_operand, Dataset)
1556
+ and isinstance(right_operand, Dataset)
1557
+ and self.ruleset_mode in ("partial_null", "partial_zero")
1558
+ and not self.only_semantic
1559
+ ):
1560
+ measure_name = left_operand.get_measures_names()[0]
1561
+ if left_operand.data is None:
1562
+ left_operand.data = pd.DataFrame({measure_name: []})
1563
+ if right_operand.data is None:
1564
+ right_operand.data = pd.DataFrame({measure_name: []})
1565
+ left_null_indexes = set(
1566
+ left_operand.data[left_operand.data[measure_name].isnull()].index
1567
+ )
1568
+ right_null_indexes = set(
1569
+ right_operand.data[right_operand.data[measure_name].isnull()].index
1570
+ )
1571
+ # Rows where both sides are null cannot be validated; if no index is shared, at least one side has a value
1572
+ invalid_indexes = list(left_null_indexes.intersection(right_null_indexes))
1573
+ if len(invalid_indexes) > 0:
1574
+ left_operand.data.loc[invalid_indexes, measure_name] = "REMOVE_VALUE"
1575
+ if isinstance(left_operand, Dataset):
1576
+ left_operand = get_measure_from_dataset(left_operand, node.left.value)
1577
+ if isinstance(right_operand, Dataset):
1578
+ right_operand = get_measure_from_dataset(right_operand, node.right.value)
1579
+ return HR_NUM_BINARY_MAPPING[node.op].analyze(left_operand, right_operand)
1580
+
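In the "when" branch of visit_HRBinOp, datapoints whose antecedent is false are marked valid, null antecedents stay null, and the consequent is evaluated only on the filtered rows before being merged back. A compact pandas sketch of that marking, with invented data and without the merge-back step:

import pandas as pd

data = pd.DataFrame({"Id_1": ["A", "B", "C", "D"]})
antecedent = pd.Series([True, False, None, True])           # "when" result per row

filtering = list(antecedent[antecedent == True].index)      # rows to validate
nan_idx = list(antecedent[antecedent.isnull()].index)       # antecedent unknown
non_filtering = sorted(set(antecedent.index) - set(filtering))

data["bool_var"] = None
data.loc[filtering, "bool_var"] = [True, False]             # pretend consequent results
data.loc[non_filtering, "bool_var"] = True                  # false antecedent -> rule holds
data.loc[nan_idx, "bool_var"] = None                        # null antecedent -> unknown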
1581
+ def visit_HRUnOp(self, node: AST.HRUnOp) -> None:
1582
+ operand = self.visit(node.operand)
1583
+ return HR_UNARY_MAPPING[node.op].analyze(operand)
1584
+
1585
+ def visit_Validation(self, node: AST.Validation) -> Dataset:
1586
+ validation_element = self.visit(node.validation)
1587
+ if not isinstance(validation_element, Dataset):
1588
+ raise ValueError(f"Expected dataset, got {type(validation_element).__name__}")
1589
+
1590
+ imbalance_element = None
1591
+ if node.imbalance is not None:
1592
+ imbalance_element = self.visit(node.imbalance)
1593
+ if not isinstance(imbalance_element, Dataset):
1594
+ raise ValueError(f"Expected dataset, got {type(imbalance_element).__name__}")
1595
+
1596
+ return Check.analyze(
1597
+ validation_element=validation_element,
1598
+ imbalance_element=imbalance_element,
1599
+ error_code=node.error_code,
1600
+ error_level=node.error_level,
1601
+ invalid=node.invalid,
1602
+ )
1603
+
1604
+ def visit_EvalOp(self, node: AST.EvalOp) -> Dataset:
1605
+ """
1606
+ EvalOp: (name, children, output, language)
1607
+
1608
+ Basic usage:
1609
+
1610
+ for child in node.children:
1611
+ self.visit(child)
1612
+ if node.output != None:
1613
+ self.visit(node.output)
1614
+
1615
+ """
1616
+ if node.language not in EXTERNAL:
1617
+ raise Exception(f"Language {node.language} not supported on Eval")
1618
+
1619
+ if self.external_routines is None:
1620
+ raise SemanticError("2-3-10", comp_type="External Routines")
1621
+
1622
+ if node.name not in self.external_routines:
1623
+ raise SemanticError("1-3-5", op_type="External Routine", node_op=node.name)
1624
+ external_routine = self.external_routines[node.name]
1625
+ operands = {}
1626
+ for operand in node.operands:
1627
+ element = self.visit(operand)
1628
+ if not isinstance(element, Dataset):
1629
+ raise ValueError(f"Expected dataset, got {type(element).__name__} as Eval Operand")
1630
+ operands[element.name.split(".")[1] if "." in element.name else element.name] = element
1631
+ output_to_check = node.output
1632
+ return Eval.analyze(operands, external_routine, output_to_check)
1633
+
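visit_EvalOp keys each operand by the part of the dataset name after the first dot, so an external routine written against short table names still finds its inputs. A tiny illustration of that renaming; the dataset names are invented and plain strings stand in for Dataset objects:

operands = {}
for full_name in ["DS_1", "schema.DS_2"]:
    short = full_name.split(".")[1] if "." in full_name else full_name
    operands[short] = full_name        # the interpreter stores the Dataset here

assert set(operands) == {"DS_1", "DS_2"}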
1634
+ def generate_then_else_datasets(self, condition: Union[Dataset, DataComponent]) -> None:
1635
+ components = {}
1636
+ if self.then_condition_dataset is None:
1637
+ self.then_condition_dataset = []
1638
+ if self.else_condition_dataset is None:
1639
+ self.else_condition_dataset = []
1640
+ if isinstance(condition, Dataset):
1641
+ if len(condition.get_measures()) != 1:
1642
+ raise SemanticError("1-1-1-4", op="condition")
1643
+ if condition.get_measures()[0].data_type != BASIC_TYPES[bool]:
1644
+ raise SemanticError("2-1-9-5", op="condition", name=condition.name)
1645
+ name = condition.get_measures_names()[0]
1646
+ if condition.data is None or condition.data.empty:
1647
+ data = None
1648
+ else:
1649
+ data = condition.data[name]
1650
+ components = {comp.name: comp for comp in condition.get_identifiers()}
1651
+
1652
+ else:
1653
+ if condition.data_type != BASIC_TYPES[bool]:
1654
+ raise SemanticError("2-1-9-4", op="condition", name=condition.name)
1655
+ name = condition.name
1656
+ data = None if condition.data is None else condition.data
1657
+
1658
+ if data is not None:
1659
+ if self.nested_condition and self.condition_stack is not None:
1660
+ merge_df = (
1661
+ self.then_condition_dataset[-1]
1662
+ if self.condition_stack[-1] == THEN_ELSE["then"]
1663
+ else self.else_condition_dataset[-1]
1664
+ )
1665
+ indexes = merge_df.data[merge_df.data.columns[-1]]
1666
+ else:
1667
+ indexes = data[data.notnull()].index
1668
+
1669
+ if isinstance(condition, Dataset):
1670
+ filtered_data = data.iloc[indexes]
1671
+ then_data: Any = (
1672
+ condition.data[condition.data[name] == True]
1673
+ if (condition.data is not None)
1674
+ else []
1675
+ )
1676
+ then_indexes: Any = list(filtered_data[filtered_data == True].index)
1677
+ if len(then_data) > len(then_indexes):
1678
+ then_data = then_data.iloc[then_indexes]
1679
+ then_data[name] = then_indexes
1680
+ else_data: Any = (
1681
+ condition.data[condition.data[name] != True]
1682
+ if (condition.data is not None)
1683
+ else []
1684
+ )
1685
+ else_indexes: Any = list(set(indexes) - set(then_indexes))
1686
+ if len(else_data) > len(else_indexes):
1687
+ else_data = else_data.iloc[else_indexes]
1688
+ else_data[name] = else_indexes
1689
+ else:
1690
+ filtered_data = data.iloc[indexes]
1691
+ then_indexes = list(filtered_data[filtered_data == True].index)
1692
+ else_indexes = list(set(indexes) - set(then_indexes))
1693
+ then_data = pd.DataFrame({name: then_indexes})
1694
+ else_data = pd.DataFrame({name: else_indexes})
1695
+ else:
1696
+ then_data = pd.DataFrame({name: []})
1697
+ else_data = pd.DataFrame({name: []})
1698
+ components.update(
1699
+ {
1700
+ name: Component(
1701
+ name=name,
1702
+ data_type=BASIC_TYPES[int],
1703
+ role=Role.MEASURE,
1704
+ nullable=True,
1705
+ )
1706
+ }
1707
+ )
1708
+
1709
+ if self.condition_stack and len(self.condition_stack) > 0:
1710
+ last_condition_dataset = (
1711
+ self.then_condition_dataset[-1]
1712
+ if self.condition_stack[-1] == THEN_ELSE["then"]
1713
+ else (self.else_condition_dataset[-1])
1714
+ )
1715
+ measure_name = last_condition_dataset.get_measures_names()[0]
1716
+ then_data = then_data[then_data[name].isin(last_condition_dataset.data[measure_name])]
1717
+ else_data = else_data[else_data[name].isin(last_condition_dataset.data[measure_name])]
1718
+ then_dataset = Dataset(name=name, components=components, data=then_data)
1719
+ else_dataset = Dataset(name=name, components=components, data=else_data)
1720
+
1721
+ self.then_condition_dataset.append(then_dataset)
1722
+ self.else_condition_dataset.append(else_dataset)
1723
+
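generate_then_else_datasets does not copy branch values; for each condition it records which row indexes fall into the then branch and which into the else branch, so nested if/case expressions can later be aligned with the full dataset. A standalone pandas sketch of that partition (names invented):

import pandas as pd

condition = pd.Series([True, False, None, True], name="bool_var")

known = condition[condition.notnull()].index
then_idx = list(condition[condition == True].index)     # rows entering "then"
else_idx = sorted(set(known) - set(then_idx))            # remaining non-null rows

then_marker = pd.DataFrame({"bool_var": then_idx})       # index bookkeeping datasets
else_marker = pd.DataFrame({"bool_var": else_idx})
assert then_idx == [0, 3] and else_idx == [1]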
1724
+ def merge_then_else_datasets(self, left_operand: Any, right_operand: Any) -> Any:
1725
+ if (
1726
+ self.then_condition_dataset is None
1727
+ or self.else_condition_dataset is None
1728
+ or self.condition_stack is None
1729
+ ):
1730
+ return left_operand, right_operand
1731
+
1732
+ if self.is_from_case_then:
1733
+ merge_dataset = (
1734
+ self.then_condition_dataset[-1]
1735
+ if self.condition_stack[-1] == THEN_ELSE["then"]
1736
+ else self.else_condition_dataset[-1]
1737
+ )
1738
+ else:
1739
+ merge_dataset = (
1740
+ self.then_condition_dataset.pop()
1741
+ if self.condition_stack.pop() == THEN_ELSE["then"]
1742
+ else (self.else_condition_dataset.pop())
1743
+ )
1744
+
1745
+ merge_index = merge_dataset.data[merge_dataset.get_measures_names()[0]].to_list()
1746
+ ids = merge_dataset.get_identifiers_names()
1747
+ if isinstance(left_operand, (Dataset, DataComponent)):
1748
+ if left_operand.data is None:
1749
+ return left_operand, right_operand
1750
+ if isinstance(left_operand, Dataset):
1751
+ dataset_index = left_operand.data.index[
1752
+ left_operand.data[ids]
1753
+ .apply(tuple, axis=1)
1754
+ .isin(merge_dataset.data[ids].apply(tuple, axis=1))
1755
+ ]
1756
+ left = left_operand.data[left_operand.get_measures_names()[0]]
1757
+ left_operand.data[left_operand.get_measures_names()[0]] = left.reindex(
1758
+ dataset_index, fill_value=None
1759
+ )
1760
+ else:
1761
+ left = left_operand.data
1762
+ left_operand.data = left.reindex(merge_index, fill_value=None)
1763
+ if isinstance(right_operand, (Dataset, DataComponent)):
1764
+ if right_operand.data is None:
1765
+ return left_operand, right_operand
1766
+ if isinstance(right_operand, Dataset):
1767
+ dataset_index = right_operand.data.index[
1768
+ right_operand.data[ids]
1769
+ .apply(tuple, axis=1)
1770
+ .isin(merge_dataset.data[ids].apply(tuple, axis=1))
1771
+ ]
1772
+ right = right_operand.data[right_operand.get_measures_names()[0]]
1773
+ right_operand.data[right_operand.get_measures_names()[0]] = right.reindex(
1774
+ dataset_index, fill_value=None
1775
+ )
1776
+ else:
1777
+ right = right_operand.data
1778
+ right_operand.data = right.reindex(merge_index, fill_value=None)
1779
+ return left_operand, right_operand
1780
+
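merge_then_else_datasets reindexes the evaluated branch onto the rows recorded for that branch, so only the datapoints that actually took the branch keep a value. A small pandas sketch of that reindex step (invented data):

import pandas as pd

full = pd.Series([10, 20, 30, 40])        # component data over all datapoints
branch_rows = [0, 3]                       # indexes recorded for the current branch

aligned = full.reindex(branch_rows)        # keep only the branch's datapoints
assert aligned.tolist() == [10, 40]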
1781
+ def visit_Identifier(self, node: AST.Identifier) -> Union[AST.AST, Dataset, str]:
1782
+ """
1783
+ Identifier: (value)
1784
+
1785
+ Basic usage:
1786
+
1787
+ return node.value
1788
+ """
1789
+
1790
+ if self.udo_params is not None and node.value in self.udo_params[-1]:
1791
+ return self.udo_params[-1][node.value]
1792
+
1793
+ if node.value in self.datasets:
1794
+ if self.is_from_assignment:
1795
+ return copy(self.datasets[node.value].name)
1796
+ return copy(self.datasets[node.value])
1797
+ return node.value
1798
+
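visit_Identifier resolves a name by looking first in the innermost UDO parameter frame, then in the known datasets (returning only the name when it is an assignment target), and finally falling back to the literal value. A reduced sketch of that lookup order with made-up names:

udo_frames = [{"x": 42}]                       # innermost UDO frame is the last element
datasets = {"DS_1": "<Dataset DS_1>"}          # stand-in for self.datasets

def resolve(value, from_assignment=False):
    if udo_frames and value in udo_frames[-1]:
        return udo_frames[-1][value]
    if value in datasets:
        return value if from_assignment else datasets[value]
    return value

assert resolve("x") == 42
assert resolve("DS_1", from_assignment=True) == "DS_1"
assert resolve("unknown") == "unknown"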
1799
+ def visit_DefIdentifier(self, node: AST.DefIdentifier) -> Any:
1800
+ """
1801
+ DefIdentifier: (value, kind)
1802
+
1803
+ Basic usage:
1804
+
1805
+ return node.value
1806
+ """
1807
+ partial_is_valid = True
1808
+ # Only for Hierarchical Rulesets
1809
+ if not (self.is_from_rule and node.kind == "CodeItemID"):
1810
+ return node.value
1811
+
1812
+ # Getting Dataset elements
1813
+ result_components = {
1814
+ comp_name: copy(comp)
1815
+ for comp_name, comp in self.ruleset_dataset.components.items() # type: ignore[union-attr]
1816
+ }
1817
+ if self.ruleset_signature is not None:
1818
+ hr_component = self.ruleset_signature["RULE_COMPONENT"]
1819
+ name = node.value
1820
+
1821
+ if self.rule_data is None:
1822
+ return Dataset(name=name, components=result_components, data=None)
1823
+
1824
+ condition = None
1825
+ if hasattr(node, "_right_condition"):
1826
+ condition: DataComponent = self.visit(node._right_condition) # type: ignore[no-redef]
1827
+ if condition is not None:
1828
+ condition = condition.data[condition.data == True].index
1829
+
1830
+ if (
1831
+ self.hr_agg_rules_computed is not None
1832
+ and self.hr_input == "rule"
1833
+ and node.value in self.hr_agg_rules_computed
1834
+ ):
1835
+ df = self.hr_agg_rules_computed[node.value].copy()
1836
+ return Dataset(name=name, components=result_components, data=df)
1837
+
1838
+ df = self.rule_data.copy()
1839
+ if condition is not None:
1840
+ df = df.loc[condition].reset_index(drop=True)
1841
+
1842
+ measure_name = self.ruleset_dataset.get_measures_names()[0] # type: ignore[union-attr]
1843
+ if node.value in df[hr_component].values:
1844
+ rest_identifiers = [
1845
+ comp.name
1846
+ for comp in result_components.values()
1847
+ if comp.role == Role.IDENTIFIER and comp.name != hr_component
1848
+ ]
1849
+ code_data = df[df[hr_component] == node.value].reset_index(drop=True)
1850
+ code_data = code_data.merge(df[rest_identifiers], how="right", on=rest_identifiers)
1851
+ code_data = code_data.drop_duplicates().reset_index(drop=True)
1852
+
1853
+ # If the value is in the dataset, we create a new row
1854
+ # based on the hierarchy mode
1855
+ # (Missing data points are considered,
1856
+ # lines 6483-6510 of the reference manual)
1857
+ if self.ruleset_mode in ("partial_null", "partial_zero"):
1858
+ # We do not care about the presence of the leftCodeItem in Hierarchy Roll-up
1859
+ if self.is_from_hr_agg and self.is_from_assignment:
1860
+ pass
1861
+ elif code_data[hr_component].isnull().any():
1862
+ partial_is_valid = False
1863
+
1864
+ if self.ruleset_mode in ("non_zero", "partial_zero", "always_zero"):
1865
+ fill_indexes = code_data[code_data[hr_component].isnull()].index
1866
+ code_data.loc[fill_indexes, measure_name] = 0
1867
+ code_data[hr_component] = node.value
1868
+ df = code_data
1869
+ else:
1870
+ # If the value is not in the dataset, we create a new row
1871
+ # based on the hierarchy mode
1872
+ # (Missing data points are considered,
1873
+ # lines 6483-6510 of the reference manual)
1874
+ if self.ruleset_mode in ("partial_null", "partial_zero"):
1875
+ # We do not care about the presence of the leftCodeItem in Hierarchy Roll-up
1876
+ if self.is_from_hr_agg and self.is_from_assignment:
1877
+ pass
1878
+ elif self.ruleset_mode == "partial_null":
1879
+ partial_is_valid = False
1880
+ df = df.head(1)
1881
+ df[hr_component] = node.value
1882
+ if self.ruleset_mode in ("non_zero", "partial_zero", "always_zero"):
1883
+ df[measure_name] = 0
1884
+ else: # For non_null, partial_null and always_null
1885
+ df[measure_name] = None
1886
+ if self.hr_partial_is_valid is not None and self.ruleset_mode in (
1887
+ "partial_null",
1888
+ "partial_zero",
1889
+ ):
1890
+ self.hr_partial_is_valid.append(partial_is_valid)
1891
+ return Dataset(name=name, components=result_components, data=df)
1892
+
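In visit_DefIdentifier, a code item referenced by a rule but absent from the data contributes zero under the *_zero modes and null under the *_null modes, and under the partial_* modes the absence is also recorded so that a rule whose operands are all missing can be discarded. A reduced, self-contained sketch of that value choice:

def missing_code_item_value(mode: str):
    """Value contributed by a code item with no datapoint, per hierarchy mode."""
    if mode in ("non_zero", "partial_zero", "always_zero"):
        return 0
    return None                 # non_null, partial_null, always_null

assert missing_code_item_value("partial_zero") == 0
assert missing_code_item_value("partial_null") is None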
1893
+ def visit_UDOCall(self, node: AST.UDOCall) -> None: # noqa: C901
1894
+ if self.udos is None:
1895
+ raise SemanticError("2-3-10", comp_type="User Defined Operators")
1896
+ elif node.op not in self.udos:
1897
+ raise SemanticError("1-2-3", node_op=node.op, op_type="User Defined Operator")
1898
+ if self.signature_values is None:
1899
+ self.signature_values = {}
1900
+
1901
+ operator = self.udos[node.op]
1902
+ signature_values = {}
1903
+
1904
+ if operator is None:
1905
+ raise SemanticError("1-2-3", node_op=node.op, op_type="User Defined Operator")
1906
+ if operator["output"] == "Component" and not (
1907
+ self.is_from_regular_aggregation or self.is_from_rule
1908
+ ):
1909
+ raise SemanticError("1-2-12", op=node.op)
1910
+
1911
+ for i, param in enumerate(operator["params"]):
1912
+ if i >= len(node.params):
1913
+ if "default" in param:
1914
+ value = self.visit(param["default"]).value
1915
+ signature_values[param["name"]] = Scalar(
1916
+ name=str(value), value=value, data_type=BASIC_TYPES[type(value)]
1917
+ )
1918
+ else:
1919
+ raise SemanticError(
1920
+ "1-2-11",
1921
+ op=node.op,
1922
+ received=len(node.params),
1923
+ expected=len(operator["params"]),
1924
+ )
1925
+ else:
1926
+ if isinstance(param["type"], str): # Scalar, Dataset, Component
1927
+ if param["type"] == "Scalar":
1928
+ signature_values[param["name"]] = self.visit(node.params[i])
1929
+ elif param["type"] in ["Dataset", "Component"]:
1930
+ if isinstance(node.params[i], AST.VarID):
1931
+ signature_values[param["name"]] = node.params[i].value # type: ignore[attr-defined]
1932
+ else:
1933
+ param_element = self.visit(node.params[i])
1934
+ if isinstance(param_element, Dataset):
1935
+ if param["type"] == "Component":
1936
+ raise SemanticError(
1937
+ "1-3-1-1",
1938
+ op=node.op,
1939
+ option=param["name"],
1940
+ type_1=param["type"],
1941
+ type_2="Dataset",
1942
+ )
1943
+ elif isinstance(param_element, Scalar) and param["type"] in [
1944
+ "Dataset",
1945
+ "Component",
1946
+ ]:
1947
+ raise SemanticError(
1948
+ "1-3-1-1",
1949
+ op=node.op,
1950
+ option=param["name"],
1951
+ type_1=param["type"],
1952
+ type_2="Scalar",
1953
+ )
1954
+ signature_values[param["name"]] = param_element
1955
+
1956
+ else:
1957
+ raise NotImplementedError
1958
+ elif issubclass(param["type"], ScalarType): # Basic types
1959
+ # For basic Scalar types (Integer, Float, String, Boolean)
1960
+ # We validate the type is correct and cast the value
1961
+ param_element = self.visit(node.params[i])
1962
+ if isinstance(param_element, (Dataset, DataComponent)):
1963
+ type_2 = "Dataset" if isinstance(param_element, Dataset) else "Component"
1964
+ raise SemanticError(
1965
+ "1-3-1-1",
1966
+ op=node.op,
1967
+ option=param["name"],
1968
+ type_1=param["type"],
1969
+ type_2=type_2,
1970
+ )
1971
+ scalar_type = param["type"]
1972
+ if not check_unary_implicit_promotion(param_element.data_type, scalar_type):
1973
+ raise SemanticError(
1974
+ "2-3-5",
1975
+ param_type=scalar_type,
1976
+ type_name=param_element.data_type,
1977
+ op=node.op,
1978
+ param_name=param["name"],
1979
+ )
1980
+ signature_values[param["name"]] = Scalar(
1981
+ name=param_element.name,
1982
+ value=scalar_type.cast(param_element.value),
1983
+ data_type=scalar_type,
1984
+ )
1985
+ else:
1986
+ raise NotImplementedError
1987
+
1988
+ # We set it here to a list to start the stack of UDO params
1989
+ if self.udo_params is None:
1990
+ self.udo_params = []
1991
+
1992
+ # Adding parameters to the stack
1993
+ for k, v in signature_values.items():
1994
+ if hasattr(v, "name"):
1995
+ v = v.name # type: ignore[assignment]
1996
+ if v in self.signature_values:
1997
+ signature_values[k] = self.signature_values[v] # type: ignore[index]
1998
+ self.signature_values.update(signature_values)
1999
+ self.udo_params.append(signature_values)
2000
+
2001
+ # Calling the UDO AST, we use deepcopy to avoid changing the original UDO AST
2002
+ if operator is not None:
2003
+ result = self.visit(deepcopy(operator["expression"]))
2004
+
2005
+ if self.is_from_regular_aggregation or self.is_from_rule:
2006
+ result_type = "Component" if isinstance(result, DataComponent) else "Scalar"
2007
+ else:
2008
+ result_type = "Scalar" if isinstance(result, Scalar) else "Dataset"
2009
+
2010
+ if result_type != operator["output"]:
2011
+ raise SemanticError(
2012
+ "1-3-1-1",
2013
+ op=node.op,
2014
+ option="output",
2015
+ type_1=operator["output"],
2016
+ type_2=result_type,
2017
+ )
2018
+
2019
+ # We pop the last element of the stack (current UDO params)
2020
+ # to avoid using them in the next UDO call
2021
+ self.udo_params.pop()
2022
+
2023
+ # We set to None if empty to ensure we do not use these params anymore
2024
+ if len(self.udo_params) == 0:
2025
+ self.udo_params = None
2026
+ return result
2027
+
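visit_UDOCall binds arguments to the operator signature in order: a missing trailing argument falls back to its declared default, otherwise the argument is checked against the declared type (Dataset, Component, Scalar or a basic scalar type). A simplified standalone sketch of the default-filling part; the signature layout mirrors the dicts used above but the values are invented:

params = [
    {"name": "ds", "type": "Dataset"},
    {"name": "threshold", "type": "Scalar", "default": 0.5},
]
call_args = ["DS_1"]                       # only the first argument was supplied

bound = {}
for i, param in enumerate(params):
    if i >= len(call_args):
        if "default" not in param:
            raise ValueError(f"missing argument for {param['name']}")
        bound[param["name"]] = param["default"]
    else:
        bound[param["name"]] = call_args[i]

assert bound == {"ds": "DS_1", "threshold": 0.5}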
2028
+ def visit_TimeAggregation(self, node: AST.TimeAggregation) -> None:
2029
+ if node.operand is not None:
2030
+ operand = self.visit(node.operand)
2031
+ else:
2032
+ if self.aggregation_dataset is None:
2033
+ raise SemanticError("1-1-19-11")
2034
+ component_name = Time_Aggregation._get_time_id(self.aggregation_dataset)
2035
+ ast_operand = VarID(
2036
+ value=component_name,
2037
+ line_start=node.line_start,
2038
+ line_stop=node.line_stop,
2039
+ column_start=node.column_start,
2040
+ column_stop=node.column_stop,
2041
+ )
2042
+ operand = self.visit(ast_operand)
2043
+ return Time_Aggregation.analyze(
2044
+ operand=operand,
2045
+ period_from=node.period_from,
2046
+ period_to=node.period_to,
2047
+ conf=node.conf,
2048
+ )
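visit_TimeAggregation falls back to the time identifier of the dataset currently being aggregated when time_agg is written without an operand. A minimal sketch of that fallback; the helper name and the role strings are invented, only the idea of picking the single time identifier mirrors the code above:

def pick_time_identifier(components: dict) -> str:
    """Return the name of the only time-typed identifier, used as the implicit operand."""
    time_ids = [name for name, role in components.items() if role == "time_identifier"]
    if len(time_ids) != 1:
        raise ValueError("time_agg without operand needs exactly one time identifier")
    return time_ids[0]

assert pick_time_identifier({"TIME_PERIOD": "time_identifier", "OBS_VALUE": "measure"}) == "TIME_PERIOD"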