vtlengine-1.1rc2-py3-none-any.whl → vtlengine-1.1.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

vtlengine/API/__init__.py CHANGED
@@ -1,29 +1,41 @@
+ import warnings
  from pathlib import Path
- from typing import Any, Dict, List, Optional, Union
+ from typing import Any, Dict, List, Optional, Sequence, Union
 
  import pandas as pd
  from antlr4 import CommonTokenStream, InputStream # type: ignore[import-untyped]
  from antlr4.error.ErrorListener import ErrorListener # type: ignore[import-untyped]
+ from pysdmx.io.pd import PandasDataset
+ from pysdmx.model import DataflowRef, Reference, TransformationScheme
+ from pysdmx.model.dataflow import Dataflow, Schema
+ from pysdmx.model.vtl import VtlDataflowMapping
+ from pysdmx.util import parse_urn
 
  from vtlengine.API._InternalApi import (
      _check_output_folder,
+     _check_script,
      _return_only_persistent_datasets,
+     ast_to_sdmx,
      load_datasets,
      load_datasets_with_data,
      load_external_routines,
      load_value_domains,
      load_vtl,
+     to_vtl_json,
  )
  from vtlengine.AST import Start
  from vtlengine.AST.ASTConstructor import ASTVisitor
+ from vtlengine.AST.ASTString import ASTString
  from vtlengine.AST.DAG import DAGAnalyzer
  from vtlengine.AST.Grammar.lexer import Lexer
  from vtlengine.AST.Grammar.parser import Parser
+ from vtlengine.Exceptions import SemanticError
  from vtlengine.files.output._time_period_representation import (
      TimePeriodRepresentation,
      format_time_period_external_representation,
  )
  from vtlengine.Interpreter import InterpreterAnalyzer
+ from vtlengine.Model import Dataset
 
  pd.options.mode.chained_assignment = None
 
@@ -68,6 +80,38 @@ def _parser(stream: CommonTokenStream) -> Any:
      return vtl_parser.start()
 
 
+ def _extract_input_datasets(script: Union[str, TransformationScheme, Path]) -> List[str]:
+     if isinstance(script, TransformationScheme):
+         vtl_script = _check_script(script)
+     elif isinstance(script, (str, Path)):
+         vtl_script = load_vtl(script)
+     else:
+         raise TypeError("Unsupported script type.")
+
+     ast = create_ast(vtl_script)
+     dag_inputs = DAGAnalyzer.ds_structure(ast)["global_inputs"]
+
+     return dag_inputs
+
+
+ def prettify(script: Union[str, TransformationScheme, Path]) -> str:
+     """
+     Function that prettifies the given VTL script.
+
+     Args:
+         script: VTL script as a string, a TransformationScheme object or a Path to the VTL script.
+
+     Returns:
+         A str with the prettified VTL script.
+     """
+     from vtlengine.AST.ASTComment import create_ast_with_comments
+
+     checking = _check_script(script)
+     vtl = load_vtl(checking)
+     ast = create_ast_with_comments(vtl)
+     return ASTString(pretty=True).render(ast)
+
+
  def create_ast(text: str) -> Start:
      """
      Function that creates the AST object.
@@ -90,30 +134,19 @@ def create_ast(text: str) -> Start:
 
 
  def semantic_analysis(
-     script: Union[str, Path],
-     data_structures: Union[Dict[str, Any], Path, List[Union[Dict[str, Any], Path]]],
+     script: Union[str, TransformationScheme, Path],
+     data_structures: Union[Dict[str, Any], Path, List[Dict[str, Any]], List[Path]],
      value_domains: Optional[Union[Dict[str, Any], Path]] = None,
      external_routines: Optional[Union[Dict[str, Any], Path]] = None,
- ) -> Any:
+ ) -> Dict[str, Dataset]:
      """
-     Checks if the vtl operation can be done.To do that, it generates the AST with the vtl script
-     given and also reviews if the data structure given can fit with it.
+     Checks whether the VTL script and its related data structures are valid. For compatibility
+     with the pysdmx library, the script can also be a TransformationScheme object, which is
+     serialized to a VTL script string before the analysis.
 
-     This vtl script can be a string with the actual expression or a filepath to the folder
-     that contains the vtl file.
-
-     Moreover, the data structure can be a dictionary or a filepath to the folder that contains it.
-
-     If there are any value domains or external routines, this data is taken into account.
-     Both can be loaded the same way as data structures or vtl scripts are.
-
-     Finally, the :obj:`Interpreter <vtl-engine-spark.Interpreter.InterpreterAnalyzer>`
-     class takes all of this information and checks it with the ast generated to
-     return the semantic analysis result.
-
-     Concepts you may know:
+     Concepts you may need to know:
 
-     - Vtl script: The expression that shows the operation to be done.
+     - Vtl script: The script that shows the set of operations to be executed.
 
      - Data Structure: JSON file that contains the structure and the name for the dataset(s) \
        (and/or scalar) about the datatype (String, integer or number), \
@@ -126,7 +159,8 @@ def semantic_analysis(
      This function has the following params:
 
      Args:
-         script: String or Path of the vtl expression.
+         script: VTL script as a string, a TransformationScheme object or a Path to the folder \
+             that holds the VTL script.
          data_structures: Dict or Path (file or folder), \
              or List of Dicts or Paths with the data structures JSON files.
          value_domains: Dict or Path of the value domains JSON files. (default: None)
@@ -139,8 +173,10 @@ def semantic_analysis(
          Exception: If the files have the wrong format, or they do not exist, \
              or their Paths are invalid.
      """
+
      # AST generation
-     vtl = load_vtl(script)
+     checking = _check_script(script)
+     vtl = load_vtl(checking)
      ast = create_ast(vtl)
 
      # Loading datasets
@@ -166,30 +202,34 @@ def semantic_analysis(
 
 
  def run(
-     script: Union[str, Path],
-     data_structures: Union[Dict[str, Any], Path, List[Union[Dict[str, Any], Path]]],
-     datapoints: Union[Dict[str, Any], str, Path, List[Union[str, Path]]],
+     script: Union[str, TransformationScheme, Path],
+     data_structures: Union[Dict[str, Any], Path, List[Dict[str, Any]], List[Path]],
+     datapoints: Union[Dict[str, pd.DataFrame], str, Path, List[Dict[str, Any]], List[Path]],
      value_domains: Optional[Union[Dict[str, Any], Path]] = None,
      external_routines: Optional[Union[str, Path]] = None,
      time_period_output_format: str = "vtl",
-     return_only_persistent: bool = False,
+     return_only_persistent: bool = True,
      output_folder: Optional[Union[str, Path]] = None,
- ) -> Any:
+ ) -> Dict[str, Dataset]:
      """
-     Run is the main function of the ``API``, which mission is to ensure the vtl operation is ready
-     to be performed.
-     When the vtl expression is given, an AST object is created.
-     This vtl script can be given as a string or a path with the folder or file that contains it.
-     At the same time, data structures are loaded with its datapoints.
+     Run is the main function of the ``API``, whose mission is to execute
+     the VTL operations over the data.
+
+     Concepts you may need to know:
 
-     The data structure information is contained in the JSON file given,
-     and establish the datatype (string, integer or number),
-     and the role that each component is going to have (Identifier, Attribute or Measure).
-     It can be a dictionary or a path to the JSON file or folder that contains it.
+     - Vtl script: The script that shows the set of operations to be executed.
 
-     Moreover, a csv file with the data to operate with is going to be loaded.
-     It can be given with a dictionary (dataset name : pandas Dataframe),
-     a path or S3 URI to the folder, path or S3 to the csv file that contains the data.
+     - Data Structure: JSON file that contains the structure and the name for the dataset(s) \
+       (and/or scalar) about the datatype (String, integer or number), \
+       the role (Identifier, Attribute or Measure) and the nullability each component has.
+
+     - Data point: `Pandas Dataframe \
+       <https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html>`_ \
+       that holds the data related to the Dataset.
+
+     - Value domains: Collection of unique values of the same datatype.
+
+     - External routines: SQL query used to transform a dataset.
 
      .. important::
          The data structure and the data points must have the same dataset
@@ -212,35 +252,12 @@ def run(
          For more details, see
          `s3fs documentation <https://s3fs.readthedocs.io/en/latest/index.html#credentials>`_.
 
-     Before the execution, the DAG analysis reviews if the VTL script is a direct acyclic graphs.
-
-
-     If value domain data or external routines are required, the function loads this information
-     and integrates them into the
-     :obj:`Interpreter <vtl-engine-spark.Interpreter.InterpreterAnalyzer>` class.
-
-     Moreover, if any component has a Time Period component, the external representation
-     is passed to the Interpreter class.
-
-     Concepts you may need to know:
-
-     - Vtl script: The expression that shows the operation to be done.
-
-     - Data Structure: JSON file that contains the structure and the name for the dataset(s) \
-       (and/or scalar) about the datatype (String, integer or number), \
-       the role (Identifier, Attribute or Measure) and the nullability each component has.
-
-     - Data point: Pointer to the data. It will be loaded as a `Pandas Dataframe \
-       <https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html>`_.
-
-     - Value domains: Collection of unique values that have the same datatype.
-
-     - External routines: SQL query used to transform a dataset.
+     Before the execution, the DAG analysis checks that the VTL script is a directed acyclic graph.
 
      This function has the following params:
 
      Args:
-         script: String or Path with the vtl expression.
+         script: VTL script as a string, a TransformationScheme object or a Path to the VTL script.
 
          data_structures: Dict, Path or a List of Dicts or Paths with the data structures.
 
@@ -255,7 +272,7 @@ def run(
              Time Period components.
 
          return_only_persistent: If True, run function will only return the results of \
-             Persistent Assignments. (default: False)
+             Persistent Assignments. (default: True)
 
          output_folder: Path or S3 URI to the output folder. (default: None)
 
@@ -268,7 +285,9 @@ def run(
              or their Paths are invalid.
 
      """
+
      # AST generation
+     script = _check_script(script)
      vtl = load_vtl(script)
      ast = create_ast(vtl)
 
@@ -302,6 +321,7 @@ def run(
          datapoints_paths=path_dict,
          output_path=output_folder,
          time_period_representation=time_period_representation,
+         return_only_persistent=return_only_persistent,
      )
      result = interpreter.visit(ast)
 
@@ -314,3 +334,174 @@ def run(
      if return_only_persistent:
          return _return_only_persistent_datasets(result, ast)
      return result
+
+
+ def run_sdmx( # noqa: C901
+     script: Union[str, TransformationScheme, Path],
+     datasets: Sequence[PandasDataset],
+     mappings: Optional[Union[VtlDataflowMapping, Dict[str, str]]] = None,
+     value_domains: Optional[Union[Dict[str, Any], Path]] = None,
+     external_routines: Optional[Union[str, Path]] = None,
+     time_period_output_format: str = "vtl",
+     return_only_persistent: bool = True,
+     output_folder: Optional[Union[str, Path]] = None,
+ ) -> Dict[str, Dataset]:
+     """
+     Executes a VTL script using a list of pysdmx `PandasDataset` objects.
+
+     This function prepares the required VTL data structures and datapoints from
+     the given list of pysdmx `PandasDataset` objects. It validates that each
+     `PandasDataset` uses a valid `Schema` instance as its structure. Each `Schema` is converted
+     to the appropriate VTL JSON data structure, and the Pandas Dataframe is extracted.
+
+     .. important::
+         We recommend using this function in combination with the
+         `get_datasets <https://py.sdmx.io/howto/data_rw.html#pysdmx.io.get_datasets>`_
+         pysdmx method.
+
+     .. important::
+         The mapping between a pysdmx `PandasDataset
+         <https://py.sdmx.io/howto/data_rw.html#pysdmx.io.pd.PandasDataset>`_ \
+         and a VTL dataset is done using the `Schema` instance of the `PandasDataset`.
+         The Schema ID is used as the dataset name:
+
+         DataStructure=MD:TEST_DS(1.0) -> TEST_DS
+
+     The function then calls the :obj:`run <vtlengine.API>` function with the provided VTL
+     script and prepared inputs.
+
+     Before the execution, the DAG analysis checks that the generated VTL script is a directed
+     acyclic graph.
+
+     Args:
+         script: VTL script as a string, a TransformationScheme object or a Path to the VTL script.
+
+         datasets: A list of PandasDataset objects.
+
+         mappings: A dictionary or VtlDataflowMapping object that maps the dataset names.
+
+         value_domains: Dict or Path of the value domains JSON files. (default: None)
+
+         external_routines: String or Path of the external routines SQL files. (default: None)
+
+         time_period_output_format: String with the possible values \
+             ("sdmx_gregorian", "sdmx_reporting", "vtl") for the representation of the \
+             Time Period components.
+
+         return_only_persistent: If True, run function will only return the results of \
+             Persistent Assignments. (default: True)
+
+         output_folder: Path or S3 URI to the output folder. (default: None)
+
+     Returns:
+         The datasets are produced without data if the output folder is defined.
+
+     Raises:
+         SemanticError: If any dataset does not contain a valid `Schema` instance as its structure.
+
+     """
+     mapping_dict = {}
+     input_names = _extract_input_datasets(script)
+
+     # Mapping handling
+
+     if mappings is None:
+         if len(datasets) != 1:
+             raise SemanticError("0-1-3-3")
+         if len(input_names) != 1:
+             raise SemanticError("0-1-3-1", number_datasets=len(input_names))
+         schema = datasets[0].structure
+         if not isinstance(schema, Schema):
+             raise SemanticError("0-1-3-2", schema=schema)
+         mapping_dict = {schema.short_urn: input_names[0]}
+     elif isinstance(mappings, Dict):
+         mapping_dict = mappings
+     elif isinstance(mappings, VtlDataflowMapping):
+         if mappings.to_vtl_mapping_method is not None:
+             warnings.warn(
+                 "to_vtl_mapping_method is not implemented yet; the Basic "
+                 "mapping method will be used instead."
+             )
+         if mappings.from_vtl_mapping_method is not None:
+             warnings.warn(
+                 "from_vtl_mapping_method is not implemented yet; the Basic "
+                 "mapping method will be used instead."
+             )
+         if isinstance(mappings.dataflow, str):
+             short_urn = str(parse_urn(mappings.dataflow))
+         elif isinstance(mappings.dataflow, (Reference, DataflowRef)):
+             short_urn = str(mappings.dataflow)
+         elif isinstance(mappings.dataflow, Dataflow):
+             short_urn = mappings.dataflow.short_urn
+         else:
+             raise TypeError(
+                 "Expected str, Reference, DataflowRef or Dataflow type for dataflow in "
+                 "VtlDataflowMapping."
+             )
+
+         mapping_dict = {short_urn: mappings.dataflow_alias}
+     else:
+         raise TypeError("Expected dict or VtlDataflowMapping type for mappings.")
+
+     for vtl_name in mapping_dict.values():
+         if vtl_name not in input_names:
+             raise SemanticError("0-1-3-5", dataset_name=vtl_name)
+
+     datapoints = {}
+     data_structures = []
+     for dataset in datasets:
+         schema = dataset.structure
+         if not isinstance(schema, Schema):
+             raise SemanticError("0-1-3-2", schema=schema)
+         if schema.short_urn not in mapping_dict:
+             raise SemanticError("0-1-3-4", short_urn=schema.short_urn)
+         # Generating VTL Datastructure and Datapoints.
+         dataset_name = mapping_dict[schema.short_urn]
+         vtl_structure = to_vtl_json(schema, dataset_name)
+         data_structures.append(vtl_structure)
+         datapoints[dataset_name] = dataset.data
+
+     missing = [name for name in input_names if name not in mapping_dict.values()]
+     if missing:
+         raise SemanticError("0-1-3-6", missing=missing)
+
+     result = run(
+         script=script,
+         data_structures=data_structures,
+         datapoints=datapoints,
+         value_domains=value_domains,
+         external_routines=external_routines,
+         time_period_output_format=time_period_output_format,
+         return_only_persistent=return_only_persistent,
+         output_folder=output_folder,
+     )
+     return result
+
+
+ def generate_sdmx(
+     script: Union[str, Path], agency_id: str, id: str, version: str = "1.0"
+ ) -> TransformationScheme:
+     """
+     Function that generates a TransformationScheme object from a VTL script.
+
+     The TransformationScheme object is the SDMX representation of the VTL script. \
+     For more details please check the `SDMX IM VTL objects \
+     <https://sdmx.org/wp-content/uploads/SDMX_3-0-0_SECTION_2_FINAL-1_0.pdf#page=146>`_, line 2266.
+
+     Args:
+         script: A string or Path with the VTL script.
+         agency_id: The Agency ID used in the generated `TransformationScheme` object.
+         id: The given id of the generated `TransformationScheme` object.
+         version: The Version used in the generated `TransformationScheme` object. (default: "1.0")
+
+     Returns:
+         The generated Transformation Scheme object.
+     """
+     vtl = load_vtl(script)
+     ast = create_ast(vtl)
+     result = ast_to_sdmx(ast, agency_id, id, version)
+     return result
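
The additions are easiest to read end to end. Below is a minimal sketch of the intended flow, not a definitive recipe: the file names, the DS_1 alias and the one-statement script are hypothetical, and get_datasets is the pysdmx reader the run_sdmx docstring above recommends.

    from pysdmx.io import get_datasets

    from vtlengine.API import generate_sdmx, prettify, run_sdmx

    # Read SDMX data plus its structures into PandasDataset objects
    # (hypothetical paths).
    datasets = get_datasets("data.xml", "structures.xml")

    # Map each dataset's Schema short URN to the VTL input name used in the
    # script (hypothetical names). With a single dataset and a single script
    # input, the mapping can be omitted and is inferred from the Schema.
    script = "DS_r <- DS_1 + 1;"
    mappings = {"DataStructure=MD:TEST_DS(1.0)": "DS_1"}

    # As of this release, only persistent assignments are returned by default.
    results = run_sdmx(script, datasets, mappings=mappings)

    # Normalize the script's formatting, and export it as an SDMX
    # TransformationScheme.
    print(prettify(script))
    scheme = generate_sdmx(script, agency_id="MD", id="TS1", version="1.0")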

vtlengine/AST/ASTComment.py ADDED
@@ -0,0 +1,56 @@
+ from antlr4 import CommonTokenStream, InputStream
+ from antlr4.Token import CommonToken
+
+ from vtlengine.API import create_ast
+ from vtlengine.AST import Comment, Start
+ from vtlengine.AST.ASTConstructorModules import extract_token_info
+ from vtlengine.AST.Grammar.lexer import Lexer
+
+
+ def generate_ast_comment(token: CommonToken) -> Comment:
+     """
+     Parses a token belonging to a comment and returns a Comment AST object.
+
+     Args:
+         token (CommonToken): The comment token to parse.
+
+     Returns:
+         Comment: A Comment AST object.
+     """
+     token_info = extract_token_info(token)
+     text = token.text
+     if token.type == Lexer.SL_COMMENT:
+         text = token.text[:-1]  # Remove the trailing newline character
+     return Comment(value=text, **token_info)
+
+
+ def create_ast_with_comments(text: str) -> Start:
+     """
+     Parses a VTL script and returns an AST with comments.
+
+     Args:
+         text (str): The VTL script to parse.
+
+     Returns:
+         Start: The generated AST with comments.
+     """
+     # Call the create_ast function to generate the AST from channel 0
+     ast = create_ast(text)
+
+     # Read the script on channel 2 to get the comments
+     lexer_ = Lexer(InputStream(text))
+     stream = CommonTokenStream(lexer_, channel=2)
+
+     # Fill the stream buffer with tokens
+     stream.fill()
+
+     # Extract comments from the stream
+     comments = [generate_ast_comment(token) for token in stream.tokens if token.channel == 2]
+
+     # Add comments to the AST
+     ast.children.extend(comments)
+
+     # Sort the AST children based on their start line and column
+     ast.children.sort(key=lambda x: (x.line_start, x.column_start))
+
+     return ast
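
This comment handling is what lets prettify round-trip comments: a plain create_ast parse reads only channel 0, so comment tokens would be dropped, while create_ast_with_comments re-attaches them before ASTString renders the script back out. A small sketch with a hypothetical script:

    from vtlengine.API import prettify

    # "//" starts a single-line VTL comment; the lexer emits it on channel 2,
    # and create_ast_with_comments merges it back into the AST in source
    # order, so the prettified output keeps it.
    script = "// adds one to every measure\nDS_r <- DS_1 + 1;"

    print(prettify(script))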