vtlengine 1.1.1__py3-none-any.whl → 1.2.1rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

@@ -1,7 +1,7 @@
 import json
 import os
 from pathlib import Path
-from typing import Any, Dict, List, Literal, Optional, Union
+from typing import Any, Dict, List, Literal, Optional, Tuple, Union

 import jsonschema
 import pandas as pd
@@ -22,7 +22,11 @@ from vtlengine.__extras_check import __check_s3_extra
 from vtlengine.AST import Assignment, DPRuleset, HRuleset, Operator, PersistentAssignment, Start
 from vtlengine.AST.ASTString import ASTString
 from vtlengine.DataTypes import SCALAR_TYPES
-from vtlengine.Exceptions import InputValidationException, check_key
+from vtlengine.Exceptions import (
+    InputValidationException,
+    SemanticError,
+    check_key,
+)
 from vtlengine.files.parser import _fill_dataset_empty_data, _validate_pandas
 from vtlengine.Model import (
     Component as VTL_Component,
@@ -44,11 +48,14 @@ with open(schema_path / "json_schema_2.1.json", "r") as file:
     schema = json.load(file)


-def _load_dataset_from_structure(structures: Dict[str, Any]) -> Dict[str, Any]:
+def _load_dataset_from_structure(
+    structures: Dict[str, Any],
+) -> Tuple[Dict[str, Any], Dict[str, Any]]:
     """
     Loads a dataset with the structure given.
     """
     datasets = {}
+    scalars = {}

     if "datasets" in structures:
         for dataset_json in structures["datasets"]:
@@ -110,8 +117,8 @@ def _load_dataset_from_structure(structures: Dict[str, Any]) -> Dict[str, Any]:
                 data_type=SCALAR_TYPES[scalar_json["type"]],
                 value=None,
             )
-            datasets[scalar_name] = scalar  # type: ignore[assignment]
-    return datasets
+            scalars[scalar_name] = scalar
+    return datasets, scalars


 def _load_single_datapoint(datapoint: Union[str, Path]) -> Dict[str, Any]:
@@ -159,7 +166,9 @@ def _load_datapoints_path(
         return _load_single_datapoint(datapoints)


-def _load_datastructure_single(data_structure: Union[Dict[str, Any], Path]) -> Dict[str, Dataset]:
+def _load_datastructure_single(
+    data_structure: Union[Dict[str, Any], Path],
+) -> Tuple[Dict[str, Dataset], Dict[str, Scalar]]:
     """
     Loads a single data structure.
     """
@@ -170,13 +179,15 @@ def _load_datastructure_single(data_structure: Union[Dict[str, Any], Path]) -> D
     if not data_structure.exists():
         raise Exception("Invalid datastructure. Input does not exist")
     if data_structure.is_dir():
-        datasets: Dict[str, Any] = {}
+        datasets: Dict[str, Dataset] = {}
+        scalars: Dict[str, Scalar] = {}
         for f in data_structure.iterdir():
             if f.suffix != ".json":
                 continue
-            dataset = _load_datastructure_single(f)
-            datasets = {**datasets, **dataset}
-        return datasets
+            ds, sc = _load_datastructure_single(f)
+            datasets = {**datasets, **ds}
+            scalars = {**scalars, **sc}
+        return datasets, scalars
     else:
         if data_structure.suffix != ".json":
             raise Exception("Invalid datastructure. Must have .json extension")
@@ -187,7 +198,7 @@ def _load_datastructure_single(data_structure: Union[Dict[str, Any], Path]) -> D

 def load_datasets(
     data_structure: Union[Dict[str, Any], Path, List[Dict[str, Any]], List[Path]],
-) -> Dict[str, Dataset]:
+) -> Tuple[Dict[str, Dataset], Dict[str, Scalar]]:
     """
     Loads multiple datasets.

@@ -205,21 +216,42 @@ def load_datasets(
     if isinstance(data_structure, dict):
         return _load_datastructure_single(data_structure)
     if isinstance(data_structure, list):
-        ds_structures: Dict[str, Any] = {}
+        ds_structures: Dict[str, Dataset] = {}
+        scalar_structures: Dict[str, Scalar] = {}
         for x in data_structure:
-            result = _load_datastructure_single(x)
-            ds_structures = {**ds_structures, **result}  # Overwrite ds_structures dict.
-        return ds_structures
+            ds, sc = _load_datastructure_single(x)
+            ds_structures = {**ds_structures, **ds}  # Overwrite ds_structures dict.
+            scalar_structures = {**scalar_structures, **sc}  # Overwrite scalar_structures dict.
+        return ds_structures, scalar_structures
     return _load_datastructure_single(data_structure)


-def load_datasets_with_data(data_structures: Any, datapoints: Optional[Any] = None) -> Any:
+def _handle_scalars_values(
+    scalars: Dict[str, Scalar],
+    scalar_values: Optional[Dict[str, Optional[Union[int, str, bool, float]]]] = None,
+) -> None:
+    if scalar_values is None:
+        return
+    # Handling scalar values with the scalar dict
+    for name, value in scalar_values.items():
+        if name not in scalars:
+            raise Exception(f"Not found scalar {name} in datastructures")
+        # Casting value to scalar data type
+        scalars[name].value = scalars[name].data_type.cast(value)
+
+
+def load_datasets_with_data(
+    data_structures: Any,
+    datapoints: Optional[Any] = None,
+    scalar_values: Optional[Dict[str, Optional[Union[int, str, bool, float]]]] = None,
+) -> Any:
     """
     Loads the dataset structures and fills them with the data contained in the datapoints.

     Args:
         data_structures: Dict, Path or a List of dicts or Paths.
         datapoints: Dict, Path or a List of Paths.
+        scalar_values: Dict with the scalar values.

     Returns:
         A dict with the structure and a pandas dataframe with the data.
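
The new _handle_scalars_values helper mutates the scalars dict in place: each supplied value is looked up by name and cast with the scalar's declared data type. A minimal, self-contained sketch of that casting behaviour, using stand-in classes rather than vtlengine's own Scalar and data-type objects:

# Stand-in classes; only the casting logic mirrors _handle_scalars_values above.
class FakeNumberType:
    @staticmethod
    def cast(value):
        return float(value)

class FakeScalar:
    def __init__(self, data_type):
        self.data_type = data_type
        self.value = None  # empty until scalar values are applied

scalars = {"sc_rate": FakeScalar(FakeNumberType)}
scalar_values = {"sc_rate": "0.25"}  # user-supplied raw value

for name, value in scalar_values.items():
    if name not in scalars:
        raise Exception(f"Not found scalar {name} in datastructures")
    scalars[name].value = scalars[name].data_type.cast(value)

print(scalars["sc_rate"].value)  # 0.25 (cast from the string "0.25")
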
@@ -227,17 +259,18 @@ def load_datasets_with_data(data_structures: Any, datapoints: Optional[Any] = No
     Raises:
         Exception: If the Path is wrong or the file is invalid.
     """
-    datasets = load_datasets(data_structures)
+    datasets, scalars = load_datasets(data_structures)
     if datapoints is None:
         for dataset in datasets.values():
             if isinstance(dataset, Dataset):
                 _fill_dataset_empty_data(dataset)
-        return datasets, None
+        _handle_scalars_values(scalars, scalar_values)
+        return datasets, scalars, None
     if isinstance(datapoints, dict):
         # Handling dictionary of Pandas Dataframes
         for dataset_name, data in datapoints.items():
             if dataset_name not in datasets:
-                raise Exception(f"Not found dataset {dataset_name}")
+                raise Exception(f"Not found dataset {dataset_name} in datastructures.")
             datasets[dataset_name].data = _validate_pandas(
                 datasets[dataset_name].components, data, dataset_name
             )
@@ -246,14 +279,17 @@ def load_datasets_with_data(data_structures: Any, datapoints: Optional[Any] = No
                 datasets[dataset_name].data = pd.DataFrame(
                     columns=list(datasets[dataset_name].components.keys())
                 )
-        return datasets, None
+        _handle_scalars_values(scalars, scalar_values)
+        return datasets, scalars, None
     # Handling dictionary of paths
     dict_datapoints = _load_datapoints_path(datapoints)
     for dataset_name, _ in dict_datapoints.items():
         if dataset_name not in datasets:
-            raise Exception(f"Not found dataset {dataset_name}")
+            raise Exception(f"Not found dataset {dataset_name} in datastructures.")
+
+    _handle_scalars_values(scalars, scalar_values)

-    return datasets, dict_datapoints
+    return datasets, scalars, dict_datapoints


 def load_vtl(input: Union[str, Path]) -> str:
@@ -362,8 +398,8 @@ def load_external_routines(input: Union[Dict[str, Any], Path, str]) -> Any:


 def _return_only_persistent_datasets(
-    datasets: Dict[str, Dataset], ast: Start
-) -> Dict[str, Dataset]:
+    datasets: Dict[str, Union[Dataset, Scalar]], ast: Start
+) -> Dict[str, Union[Dataset, Scalar]]:
     """
     Returns only the datasets with a persistent assignment.
     """
@@ -606,11 +642,9 @@ def _check_script(script: Union[str, TransformationScheme, Path]) -> str:
     Check if the TransformationScheme object is valid to generate a vtl script.
     """
     if not isinstance(script, (str, TransformationScheme, Path)):
-        raise Exception(
-            "Invalid script format. Input must be a string, TransformationScheme or Path object"
-        )
+        raise SemanticError("0-1-1-1", format_=type(script).__name__)
     if isinstance(script, TransformationScheme):
-        from pysdmx.toolkit.vtl.generate_vtl_script import (
+        from pysdmx.toolkit.vtl import (
             generate_vtl_script,
         )

vtlengine/API/__init__.py CHANGED
@@ -35,7 +35,7 @@ from vtlengine.files.output._time_period_representation import (
     format_time_period_external_representation,
 )
 from vtlengine.Interpreter import InterpreterAnalyzer
-from vtlengine.Model import Dataset
+from vtlengine.Model import Dataset, Scalar

 pd.options.mode.chained_assignment = None

@@ -180,7 +180,7 @@ def semantic_analysis(
     ast = create_ast(vtl)

     # Loading datasets
-    structures = load_datasets(data_structures)
+    datasets, scalars = load_datasets(data_structures)

     # Handling of library items
     vd = None
@@ -192,9 +192,10 @@ def semantic_analysis(

     # Running the interpreter
     interpreter = InterpreterAnalyzer(
-        datasets=structures,
+        datasets=datasets,
         value_domains=vd,
         external_routines=ext_routines,
+        scalars=scalars,
         only_semantic=True,
     )
     result = interpreter.visit(ast)
@@ -210,7 +211,8 @@ def run(
     time_period_output_format: str = "vtl",
     return_only_persistent: bool = True,
     output_folder: Optional[Union[str, Path]] = None,
-) -> Dict[str, Dataset]:
+    scalar_values: Optional[Dict[str, Optional[Union[int, str, bool, float]]]] = None,
+) -> Dict[str, Union[Dataset, Scalar]]:
     """
     Run is the main function of the ``API``, which mission is to execute
     the vtl operation over the data.
@@ -276,6 +278,8 @@ def run(

         output_folder: Path or S3 URI to the output folder. (default: None)

+        scalar_values: Dict with the scalar values to be used in the VTL script. \
+

     Returns:
         The datasets are produced without data if the output folder is defined.
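
A hedged usage sketch of the new scalar_values argument on run(). The VTL line, the structure dict (including the "scalars" key) and the top-level import are assumptions for illustration; only the scalar_values and return_only_persistent parameters are taken directly from this diff.

# Illustrative sketch: dataset/scalar names and the structure layout are made up;
# scalar_values entries are cast to each scalar's declared type before execution.
import pandas as pd
from vtlengine import run  # top-level import assumed

script = "DS_r := DS_1 * sc_factor;"  # sc_factor is declared as a scalar below

data_structures = {
    "datasets": [
        {
            "name": "DS_1",
            "DataStructure": [
                {"name": "Id_1", "type": "Integer", "role": "Identifier", "nullable": False},
                {"name": "Me_1", "type": "Number", "role": "Measure", "nullable": True},
            ],
        }
    ],
    "scalars": [{"name": "sc_factor", "type": "Number"}],  # key name assumed
}

datapoints = {"DS_1": pd.DataFrame({"Id_1": [1, 2], "Me_1": [10.0, 20.0]})}

results = run(
    script=script,
    data_structures=data_structures,
    datapoints=datapoints,
    scalar_values={"sc_factor": 2},  # new in 1.2.1rc1
    return_only_persistent=False,
)
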
@@ -292,7 +296,9 @@ def run(
     ast = create_ast(vtl)

     # Loading datasets and datapoints
-    datasets, path_dict = load_datasets_with_data(data_structures, datapoints)
+    datasets, scalars, path_dict = load_datasets_with_data(
+        data_structures, datapoints, scalar_values
+    )

     # Handling of library items
     vd = None
@@ -322,13 +328,15 @@ def run(
         output_path=output_folder,
         time_period_representation=time_period_representation,
         return_only_persistent=return_only_persistent,
+        scalars=scalars,
     )
     result = interpreter.visit(ast)

     # Applying time period output format
     if output_folder is None:
-        for dataset in result.values():
-            format_time_period_external_representation(dataset, time_period_representation)
+        for obj in result.values():
+            if isinstance(obj, (Dataset, Scalar)):
+                format_time_period_external_representation(obj, time_period_representation)

     # Returning only persistent datasets
     if return_only_persistent:
@@ -345,7 +353,7 @@ def run_sdmx( # noqa: C901
     time_period_output_format: str = "vtl",
     return_only_persistent: bool = True,
     output_folder: Optional[Union[str, Path]] = None,
-) -> Dict[str, Dataset]:
+) -> Dict[str, Union[Dataset, Scalar]]:
     """
     Executes a VTL script using a list of pysdmx `PandasDataset` objects.

@@ -403,8 +411,16 @@ def run_sdmx( # noqa: C901
     mapping_dict = {}
     input_names = _extract_input_datasets(script)

-    # Mapping handling
+    if not isinstance(datasets, (list, set)) or any(
+        not isinstance(ds, PandasDataset) for ds in datasets
+    ):
+        type_ = type(datasets).__name__
+        if isinstance(datasets, (list, set)):
+            object_typing = {type(o).__name__ for o in datasets}
+            type_ = f"{type_}[{', '.join(object_typing)}]"
+        raise SemanticError("0-1-3-7", type_=type_)

+    # Mapping handling
     if mappings is None:
         if len(datasets) != 1:
             raise SemanticError("0-1-3-3")
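
With this check, run_sdmx fails fast when its datasets argument is not a list or set of pysdmx PandasDataset objects, raising SemanticError 0-1-3-7 with the offending type spelled out. A hedged sketch of what that looks like from the caller's side (the top-level run_sdmx import and the script are illustrative, not taken from this diff):

from vtlengine import run_sdmx  # top-level import assumed
from vtlengine.Exceptions import SemanticError

try:
    # A plain string is not a PandasDataset, so the new validation should
    # reject the input up front instead of failing later in mapping handling.
    run_sdmx("DS_r <- DS_1;", ["not a PandasDataset"])
except SemanticError as err:
    print(err)  # expected to report list[str] under error code 0-1-3-7
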
@@ -840,8 +840,8 @@ class Expr(VtlVisitor):
                 Parser.DayOfYearAtomContext,
                 Parser.DayToYearAtomContext,
                 Parser.DayToMonthAtomContext,
-                Parser.YearTodayAtomContext,
-                Parser.MonthTodayAtomContext,
+                Parser.YearToDayAtomContext,
+                Parser.MonthToDayAtomContext,
             ),
         ):
             return self.visitTimeUnaryAtom(ctx)
@@ -1901,7 +1901,10 @@ class Expr(VtlVisitor):

         left_node = Terminals().visitVarID(ctx_list[0])
         op_node = ctx_list[1].getSymbol().text
-        right_node = Terminals().visitScalarItem(ctx_list[2])
+        if isinstance(ctx_list[2], Parser.ScalarItemContext):
+            right_node = Terminals().visitScalarItem(ctx_list[2])
+        else:
+            right_node = Terminals().visitVarID(ctx_list[2])
         return BinOp(left=left_node, op=op_node, right=right_node, **extract_token_info(ctx))

     def visitOptionalExpr(self, ctx: Parser.OptionalExprContext):
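
Together with the subspaceClauseItem grammar change at the end of this diff, the visitor now accepts a variable reference (varID) on the right-hand side of a sub clause, not only a scalar literal. Two illustrative VTL lines, held here as plain Python strings (the dataset, identifier and scalar names are made up):

# Right-hand side of "sub" before and after this release (illustrative names).
vtl_literal_only = 'DS_r := DS_1 [ sub Id_1 = "ES" ];'      # accepted in 1.1.1
vtl_with_varid = "DS_r := DS_1 [ sub Id_1 = sc_country ];"  # also accepted in 1.2.1rc1
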
@@ -32,8 +32,8 @@ from vtlengine.AST import (
     VarID,
 )
 from vtlengine.AST.ASTTemplate import ASTTemplate
-from vtlengine.AST.DAG._words import DELETE, GLOBAL, INPUTS, INSERT, OUTPUTS, PERSISTENT
-from vtlengine.AST.Grammar.tokens import AS, MEMBERSHIP, TO
+from vtlengine.AST.DAG._words import DELETE, GLOBAL, INPUTS, INSERT, OUTPUTS, PERSISTENT, UNKNOWN
+from vtlengine.AST.Grammar.tokens import AS, DROP, KEEP, MEMBERSHIP, RENAME, TO
 from vtlengine.Exceptions import SemanticError


@@ -61,6 +61,8 @@ class DAGAnalyzer(ASTTemplate):
     inputs: Optional[list] = None
     outputs: Optional[list] = None
     persistent: Optional[list] = None
+    unknown_variables: Optional[list] = None
+    unknown_variables_statement: Optional[list] = None

     def __post_init__(self):
         self.dependencies = {}
@@ -72,6 +74,8 @@ class DAGAnalyzer(ASTTemplate):
         self.outputs = []
         self.persistent = []
         self.alias = []
+        self.unknown_variables = []
+        self.unknown_variables_statement = []

     @classmethod
     def ds_structure(cls, ast: AST):
@@ -176,7 +180,7 @@ class DAGAnalyzer(ASTTemplate):
         """ """
         # For each vertex
         for key, statement in self.dependencies.items():
-            output = statement[OUTPUTS] + statement[PERSISTENT]
+            output = statement[OUTPUTS] + statement[PERSISTENT] + statement[UNKNOWN]
             # If the statement has no := or -> symbol there is no vertex to add.
             if len(output) != 0:
                 self.vertex[key] = output[0]
@@ -245,12 +249,15 @@ class DAGAnalyzer(ASTTemplate):
         inputs = list(set(self.inputs))
         outputs = list(set(self.outputs))
         persistent = list(set(self.persistent))
+        unknown = list(set(self.unknown_variables_statement))

         # Remove inputs that are outputs of some statement.
         inputsF = [inputf for inputf in inputs if inputf not in outputs]

-        dict_ = {INPUTS: inputsF, OUTPUTS: outputs, PERSISTENT: persistent}
-
+        dict_ = {INPUTS: inputsF, OUTPUTS: outputs, PERSISTENT: persistent, UNKNOWN: unknown}
+        for variable in self.unknown_variables_statement:
+            if variable not in self.unknown_variables:
+                self.unknown_variables.append(variable)
         return dict_

     """______________________________________________________________________________________
@@ -293,6 +300,19 @@ class DAGAnalyzer(ASTTemplate):
         self.inputs = []
         self.outputs = []
         self.persistent = []
+        self.unknown_variables_statement = []
+        aux = copy.copy(self.unknown_variables)
+        for variable in aux:
+            for _number_of_statement, dependency in self.dependencies.items():
+                if variable in dependency[OUTPUTS]:
+                    if variable in self.unknown_variables:
+                        self.unknown_variables.remove(variable)
+                    for _number_of_statement, dependency in self.dependencies.items():
+                        if variable in dependency[UNKNOWN]:
+                            dependency[UNKNOWN].remove(variable)
+                            dependency[INPUTS].append(variable)
+                    if variable not in self.inputs:
+                        self.inputs.append(variable)

     def visit_Assignment(self, node: Assignment) -> None:
         if self.isFirstAssignment:
@@ -310,6 +330,8 @@

     def visit_RegularAggregation(self, node: RegularAggregation) -> None:
         self.visit(node.dataset)
+        if node.op in [KEEP, DROP, RENAME]:
+            return
         for child in node.children:
             self.isFromRegularAggregation = True
             self.visit(child)
@@ -331,6 +353,13 @@
     def visit_VarID(self, node: VarID) -> None:
         if (not self.isFromRegularAggregation or self.isDataset) and node.value not in self.alias:
             self.inputs.append(node.value)
+        elif (
+            self.isFromRegularAggregation
+            and node.value not in self.alias
+            and not self.isDataset
+            and node.value not in self.unknown_variables_statement
+        ):
+            self.unknown_variables_statement.append(node.value)

     def visit_Identifier(self, node: Identifier) -> None:
         if node.kind == "DatasetID" and node.value not in self.alias:
@@ -7,3 +7,4 @@ INPUTS = "inputs"
 OUTPUTS = "outputs"
 PERSISTENT = "persistent"
 STATEMENT_ = "statement"
+UNKNOWN = "unknown_variables"
@@ -219,11 +219,11 @@ timeOperators:
     | YEAR_OP LPAREN expr RPAREN # yearAtom
     | MONTH_OP LPAREN expr RPAREN # monthAtom
     | DAYOFMONTH LPAREN expr RPAREN # dayOfMonthAtom
-    | DAYOFYEAR LPAREN expr RPAREN # datOfYearAtom
+    | DAYOFYEAR LPAREN expr RPAREN # dayOfYearAtom
     | DAYTOYEAR LPAREN expr RPAREN # dayToYearAtom
     | DAYTOMONTH LPAREN expr RPAREN # dayToMonthAtom
-    | YEARTODAY LPAREN expr RPAREN # yearTodayAtom
-    | MONTHTODAY LPAREN expr RPAREN # monthTodayAtom
+    | YEARTODAY LPAREN expr RPAREN # yearToDayAtom
+    | MONTHTODAY LPAREN expr RPAREN # monthToDayAtom
     ;

 timeOperatorsComponent:
@@ -238,11 +238,11 @@ timeOperatorsComponent:
     | YEAR_OP LPAREN exprComponent RPAREN # yearAtomComponent
     | MONTH_OP LPAREN exprComponent RPAREN # monthAtomComponent
     | DAYOFMONTH LPAREN exprComponent RPAREN # dayOfMonthAtomComponent
-    | DAYOFYEAR LPAREN exprComponent RPAREN # datOfYearAtomComponent
+    | DAYOFYEAR LPAREN exprComponent RPAREN # dayOfYearAtomComponent
     | DAYTOYEAR LPAREN exprComponent RPAREN # dayToYearAtomComponent
     | DAYTOMONTH LPAREN exprComponent RPAREN # dayToMonthAtomComponent
-    | YEARTODAY LPAREN exprComponent RPAREN # yearTodayAtomComponent
-    | MONTHTODAY LPAREN exprComponent RPAREN # monthTodayAtomComponent
+    | YEARTODAY LPAREN exprComponent RPAREN # yearToDayAtomComponent
+    | MONTHTODAY LPAREN exprComponent RPAREN # monthToDayAtomComponent
     ;

 setOperators:
@@ -363,7 +363,7 @@ calcClauseItem:

 /*SUBSPACE CLAUSE*/
 subspaceClauseItem:
-    componentID EQ scalarItem
+    componentID EQ (scalarItem | varID)
     ;

 scalarItem: