vtlengine 1.1rc2-py3-none-any.whl → 1.2.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Note: this release of vtlengine has been flagged as potentially problematic.

Files changed (44)
  1. vtlengine/API/_InternalApi.py +288 -29
  2. vtlengine/API/__init__.py +277 -70
  3. vtlengine/AST/ASTComment.py +56 -0
  4. vtlengine/AST/ASTConstructor.py +71 -18
  5. vtlengine/AST/ASTConstructorModules/Expr.py +197 -75
  6. vtlengine/AST/ASTConstructorModules/ExprComponents.py +81 -38
  7. vtlengine/AST/ASTConstructorModules/Terminals.py +76 -31
  8. vtlengine/AST/ASTConstructorModules/__init__.py +50 -0
  9. vtlengine/AST/ASTEncoders.py +4 -0
  10. vtlengine/AST/ASTString.py +622 -0
  11. vtlengine/AST/ASTTemplate.py +28 -2
  12. vtlengine/AST/DAG/__init__.py +44 -6
  13. vtlengine/AST/DAG/_words.py +1 -0
  14. vtlengine/AST/Grammar/Vtl.g4 +7 -7
  15. vtlengine/AST/Grammar/lexer.py +19759 -1112
  16. vtlengine/AST/Grammar/parser.py +17996 -3199
  17. vtlengine/AST/__init__.py +127 -14
  18. vtlengine/Exceptions/messages.py +14 -2
  19. vtlengine/Interpreter/__init__.py +90 -11
  20. vtlengine/Model/__init__.py +9 -4
  21. vtlengine/Operators/Aggregation.py +13 -6
  22. vtlengine/Operators/Analytic.py +19 -13
  23. vtlengine/Operators/CastOperator.py +5 -2
  24. vtlengine/Operators/Clause.py +26 -18
  25. vtlengine/Operators/Comparison.py +3 -1
  26. vtlengine/Operators/Conditional.py +40 -18
  27. vtlengine/Operators/General.py +3 -1
  28. vtlengine/Operators/HROperators.py +3 -1
  29. vtlengine/Operators/Join.py +4 -2
  30. vtlengine/Operators/Time.py +22 -15
  31. vtlengine/Operators/Validation.py +5 -2
  32. vtlengine/Operators/__init__.py +15 -8
  33. vtlengine/Utils/__Virtual_Assets.py +34 -0
  34. vtlengine/Utils/__init__.py +49 -0
  35. vtlengine/__init__.py +4 -2
  36. vtlengine/files/parser/__init__.py +16 -26
  37. vtlengine/files/parser/_rfc_dialect.py +1 -1
  38. vtlengine/py.typed +0 -0
  39. vtlengine-1.2.0.dist-info/METADATA +92 -0
  40. vtlengine-1.2.0.dist-info/RECORD +63 -0
  41. {vtlengine-1.1rc2.dist-info → vtlengine-1.2.0.dist-info}/WHEEL +1 -1
  42. vtlengine-1.1rc2.dist-info/METADATA +0 -248
  43. vtlengine-1.1rc2.dist-info/RECORD +0 -59
  44. {vtlengine-1.1rc2.dist-info → vtlengine-1.2.0.dist-info}/LICENSE.md +0 -0
vtlengine/API/__init__.py CHANGED
@@ -1,29 +1,41 @@
+import warnings
 from pathlib import Path
-from typing import Any, Dict, List, Optional, Union
+from typing import Any, Dict, List, Optional, Sequence, Union
 
 import pandas as pd
 from antlr4 import CommonTokenStream, InputStream  # type: ignore[import-untyped]
 from antlr4.error.ErrorListener import ErrorListener  # type: ignore[import-untyped]
+from pysdmx.io.pd import PandasDataset
+from pysdmx.model import DataflowRef, Reference, TransformationScheme
+from pysdmx.model.dataflow import Dataflow, Schema
+from pysdmx.model.vtl import VtlDataflowMapping
+from pysdmx.util import parse_urn
 
 from vtlengine.API._InternalApi import (
     _check_output_folder,
+    _check_script,
     _return_only_persistent_datasets,
+    ast_to_sdmx,
     load_datasets,
     load_datasets_with_data,
     load_external_routines,
     load_value_domains,
     load_vtl,
+    to_vtl_json,
 )
 from vtlengine.AST import Start
 from vtlengine.AST.ASTConstructor import ASTVisitor
+from vtlengine.AST.ASTString import ASTString
 from vtlengine.AST.DAG import DAGAnalyzer
 from vtlengine.AST.Grammar.lexer import Lexer
 from vtlengine.AST.Grammar.parser import Parser
+from vtlengine.Exceptions import SemanticError
 from vtlengine.files.output._time_period_representation import (
     TimePeriodRepresentation,
     format_time_period_external_representation,
 )
 from vtlengine.Interpreter import InterpreterAnalyzer
+from vtlengine.Model import Dataset, Scalar
 
 pd.options.mode.chained_assignment = None
 
@@ -68,6 +80,38 @@ def _parser(stream: CommonTokenStream) -> Any:
     return vtl_parser.start()
 
 
+def _extract_input_datasets(script: Union[str, TransformationScheme, Path]) -> List[str]:
+    if isinstance(script, TransformationScheme):
+        vtl_script = _check_script(script)
+    elif isinstance(script, (str, Path)):
+        vtl_script = load_vtl(script)
+    else:
+        raise TypeError("Unsupported script type.")
+
+    ast = create_ast(vtl_script)
+    dag_inputs = DAGAnalyzer.ds_structure(ast)["global_inputs"]
+
+    return dag_inputs
+
+
+def prettify(script: Union[str, TransformationScheme, Path]) -> str:
+    """
+    Function that prettifies the given VTL script.
+
+    Args:
+        script: VTL script as a string, a TransformationScheme object or a Path to the VTL script.
+
+    Returns:
+        A str with the prettified VTL script.
+    """
+    from vtlengine.AST.ASTComment import create_ast_with_comments
+
+    checking = _check_script(script)
+    vtl = load_vtl(checking)
+    ast = create_ast_with_comments(vtl)
+    return ASTString(pretty=True).render(ast)
+
+
 def create_ast(text: str) -> Start:
     """
     Function that creates the AST object.
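The new prettify entry point round-trips a script through the comment-aware parser and the ASTString renderer. A minimal sketch of how it could be called (the one-line VTL snippet is hypothetical; any valid script works):

    from vtlengine.API import prettify

    # Collapsed input comes back with canonical spacing and line breaks
    print(prettify("DS_r<-DS_1+DS_2;"))
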
@@ -90,30 +134,19 @@ def create_ast(text: str) -> Start:
 
 
 def semantic_analysis(
-    script: Union[str, Path],
-    data_structures: Union[Dict[str, Any], Path, List[Union[Dict[str, Any], Path]]],
+    script: Union[str, TransformationScheme, Path],
+    data_structures: Union[Dict[str, Any], Path, List[Dict[str, Any]], List[Path]],
     value_domains: Optional[Union[Dict[str, Any], Path]] = None,
     external_routines: Optional[Union[Dict[str, Any], Path]] = None,
-) -> Any:
+) -> Dict[str, Dataset]:
     """
-    Checks if the vtl operation can be done. To do that, it generates the AST with the vtl script
-    given and also reviews if the data structure given can fit with it.
+    Checks that the VTL script and its related data structures are valid. For compatibility
+    with the pysdmx library, the VTL script can also be a TransformationScheme object, which
+    is serialized to a string VTL script before analysis.
 
-    This vtl script can be a string with the actual expression or a filepath to the folder
-    that contains the vtl file.
-
-    Moreover, the data structure can be a dictionary or a filepath to the folder that contains it.
-
-    If there are any value domains or external routines, this data is taken into account.
-    Both can be loaded the same way as data structures or vtl scripts are.
-
-    Finally, the :obj:`Interpreter <vtl-engine-spark.Interpreter.InterpreterAnalyzer>`
-    class takes all of this information and checks it with the ast generated to
-    return the semantic analysis result.
-
-    Concepts you may know:
+    Concepts you may need to know:
 
-    - Vtl script: The expression that shows the operation to be done.
+    - Vtl script: The script that shows the set of operations to be executed.
 
     - Data Structure: JSON file that contains the structure and the name for the dataset(s) \
       (and/or scalar) about the datatype (String, integer or number), \
@@ -126,7 +159,8 @@ def semantic_analysis(
     This function has the following params:
 
     Args:
-        script: String or Path of the vtl expression.
+        script: VTL script as a string, a TransformationScheme object or a Path to the folder \
+            that holds the VTL script.
         data_structures: Dict or Path (file or folder), \
             or List of Dicts or Paths with the data structures JSON files.
         value_domains: Dict or Path of the value domains JSON files. (default: None)
@@ -139,12 +173,14 @@ def semantic_analysis(
         Exception: If the files have the wrong format, or they do not exist, \
             or their Paths are invalid.
     """
+
     # AST generation
-    vtl = load_vtl(script)
+    checking = _check_script(script)
+    vtl = load_vtl(checking)
     ast = create_ast(vtl)
 
     # Loading datasets
-    structures = load_datasets(data_structures)
+    datasets, scalars = load_datasets(data_structures)
 
     # Handling of library items
     vd = None
@@ -156,9 +192,10 @@ def semantic_analysis(
 
     # Running the interpreter
     interpreter = InterpreterAnalyzer(
-        datasets=structures,
+        datasets=datasets,
         value_domains=vd,
         external_routines=ext_routines,
+        scalars=scalars,
         only_semantic=True,
     )
     result = interpreter.visit(ast)
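With the 1.2.0 signature, semantic_analysis returns a Dict[str, Dataset] and tracks scalars alongside datasets. A minimal sketch of a call, assuming the VTL JSON data-structure layout documented by vtlengine (the dataset and component names are hypothetical):

    from vtlengine.API import semantic_analysis

    data_structure = {
        "datasets": [
            {
                "name": "DS_1",
                "DataStructure": [
                    {"name": "Id_1", "type": "Integer", "role": "Identifier", "nullable": False},
                    {"name": "Me_1", "type": "Number", "role": "Measure", "nullable": True},
                ],
            }
        ]
    }

    # Computes the structure of DS_r without touching any data
    result = semantic_analysis(script="DS_r <- DS_1;", data_structures=data_structure)
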
@@ -166,30 +203,35 @@
 
 
 def run(
-    script: Union[str, Path],
-    data_structures: Union[Dict[str, Any], Path, List[Union[Dict[str, Any], Path]]],
-    datapoints: Union[Dict[str, Any], str, Path, List[Union[str, Path]]],
+    script: Union[str, TransformationScheme, Path],
+    data_structures: Union[Dict[str, Any], Path, List[Dict[str, Any]], List[Path]],
+    datapoints: Union[Dict[str, pd.DataFrame], str, Path, List[Dict[str, Any]], List[Path]],
     value_domains: Optional[Union[Dict[str, Any], Path]] = None,
     external_routines: Optional[Union[str, Path]] = None,
     time_period_output_format: str = "vtl",
-    return_only_persistent: bool = False,
+    return_only_persistent: bool = True,
     output_folder: Optional[Union[str, Path]] = None,
-) -> Any:
+    scalar_values: Optional[Dict[str, Optional[Union[int, str, bool, float]]]] = None,
+) -> Dict[str, Union[Dataset, Scalar]]:
     """
-    Run is the main function of the ``API``, which mission is to ensure the vtl operation is ready
-    to be performed.
-    When the vtl expression is given, an AST object is created.
-    This vtl script can be given as a string or a path with the folder or file that contains it.
-    At the same time, data structures are loaded with its datapoints.
+    Run is the main function of the ``API``, whose mission is to execute
+    the VTL operations over the data.
+
+    Concepts you may need to know:
+
+    - Vtl script: The script that shows the set of operations to be executed.
+
+    - Data Structure: JSON file that contains the structure and the name for the dataset(s) \
+      (and/or scalar) about the datatype (String, integer or number), \
+      the role (Identifier, Attribute or Measure) and the nullability each component has.
 
-    The data structure information is contained in the JSON file given,
-    and establish the datatype (string, integer or number),
-    and the role that each component is going to have (Identifier, Attribute or Measure).
-    It can be a dictionary or a path to the JSON file or folder that contains it.
+    - Data point: `Pandas Dataframe \
+      <https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html>`_ \
+      that holds the data related to the Dataset.
 
-    Moreover, a csv file with the data to operate with is going to be loaded.
-    It can be given with a dictionary (dataset name : pandas Dataframe),
-    a path or S3 URI to the folder, path or S3 to the csv file that contains the data.
+    - Value domains: Collection of unique values on the same datatype.
+
+    - External routines: SQL query used to transform a dataset.
 
     .. important::
         The data structure and the data points must have the same dataset
@@ -212,35 +254,12 @@ def run(
         For more details, see
         `s3fs documentation <https://s3fs.readthedocs.io/en/latest/index.html#credentials>`_.
 
-    Before the execution, the DAG analysis reviews if the VTL script is a direct acyclic graphs.
-
-
-    If value domain data or external routines are required, the function loads this information
-    and integrates them into the
-    :obj:`Interpreter <vtl-engine-spark.Interpreter.InterpreterAnalyzer>` class.
-
-    Moreover, if any component has a Time Period component, the external representation
-    is passed to the Interpreter class.
-
-    Concepts you may need to know:
-
-    - Vtl script: The expression that shows the operation to be done.
-
-    - Data Structure: JSON file that contains the structure and the name for the dataset(s) \
-      (and/or scalar) about the datatype (String, integer or number), \
-      the role (Identifier, Attribute or Measure) and the nullability each component has.
-
-    - Data point: Pointer to the data. It will be loaded as a `Pandas Dataframe \
-      <https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html>`_.
-
-    - Value domains: Collection of unique values that have the same datatype.
-
-    - External routines: SQL query used to transform a dataset.
+    Before the execution, the DAG analysis reviews if the VTL script is a directed acyclic graph.
 
     This function has the following params:
 
     Args:
-        script: String or Path with the vtl expression.
+        script: VTL script as a string, a TransformationScheme object or Path with the VTL script.
 
         data_structures: Dict, Path or a List of Dicts or Paths with the data structures.
 
@@ -255,10 +274,12 @@ def run(
             Time Period components.
 
         return_only_persistent: If True, run function will only return the results of \
-            Persistent Assignments. (default: False)
+            Persistent Assignments. (default: True)
 
         output_folder: Path or S3 URI to the output folder. (default: None)
 
+        scalar_values: Dict with the scalar values to be used in the VTL script. \
+            (default: None)
 
     Returns:
         The datasets are produced without data if the output folder is defined.
@@ -268,12 +289,16 @@ def run(
             or their Paths are invalid.
 
     """
+
     # AST generation
+    script = _check_script(script)
     vtl = load_vtl(script)
     ast = create_ast(vtl)
 
     # Loading datasets and datapoints
-    datasets, path_dict = load_datasets_with_data(data_structures, datapoints)
+    datasets, scalars, path_dict = load_datasets_with_data(
+        data_structures, datapoints, scalar_values
+    )
 
     # Handling of library items
     vd = None
@@ -302,15 +327,197 @@ def run(
         datapoints_paths=path_dict,
         output_path=output_folder,
         time_period_representation=time_period_representation,
+        return_only_persistent=return_only_persistent,
+        scalars=scalars,
     )
     result = interpreter.visit(ast)
 
     # Applying time period output format
     if output_folder is None:
-        for dataset in result.values():
-            format_time_period_external_representation(dataset, time_period_representation)
+        for obj in result.values():
+            if isinstance(obj, (Dataset, Scalar)):
+                format_time_period_external_representation(obj, time_period_representation)
 
     # Returning only persistent datasets
     if return_only_persistent:
         return _return_only_persistent_datasets(result, ast)
     return result
+
+
+def run_sdmx(  # noqa: C901
+    script: Union[str, TransformationScheme, Path],
+    datasets: Sequence[PandasDataset],
+    mappings: Optional[Union[VtlDataflowMapping, Dict[str, str]]] = None,
+    value_domains: Optional[Union[Dict[str, Any], Path]] = None,
+    external_routines: Optional[Union[str, Path]] = None,
+    time_period_output_format: str = "vtl",
+    return_only_persistent: bool = True,
+    output_folder: Optional[Union[str, Path]] = None,
+) -> Dict[str, Union[Dataset, Scalar]]:
+    """
+    Executes a VTL script using a list of pysdmx `PandasDataset` objects.
+
+    This function prepares the required VTL data structures and datapoints from
+    the given list of pysdmx `PandasDataset` objects. It validates that each
+    `PandasDataset` uses a valid `Schema` instance as its structure. Each `Schema` is
+    converted to the appropriate VTL JSON data structure, and the Pandas Dataframe is
+    extracted.
+
+    .. important::
+        We recommend using this function in combination with the
+        `get_datasets <https://py.sdmx.io/howto/data_rw.html#pysdmx.io.get_datasets>`_
+        pysdmx method.
+
+    .. important::
+        The mapping between pysdmx `PandasDataset
+        <https://py.sdmx.io/howto/data_rw.html#pysdmx.io.pd.PandasDataset>`_ \
+        and VTL datasets is done using the `Schema` instance of the `PandasDataset`.
+        The Schema ID is used as the dataset name.
+
+        DataStructure=MD:TEST_DS(1.0) -> TEST_DS
+
+    The function then calls the :obj:`run <vtlengine.API>` function with the provided VTL
+    script and prepared inputs.
+
+    Before the execution, the DAG analysis reviews if the generated VTL script is a directed
+    acyclic graph.
+
+    Args:
+        script: VTL script as a string, a TransformationScheme object or Path with the VTL script.
+
+        datasets: A list of PandasDataset.
+
+        mappings: A dictionary or VtlDataflowMapping object that maps the dataset names.
+
+        value_domains: Dict or Path of the value domains JSON files. (default: None)
+
+        external_routines: String or Path of the external routines SQL files. (default: None)
+
+        time_period_output_format: String with the possible values \
+            ("sdmx_gregorian", "sdmx_reporting", "vtl") for the representation of the \
+            Time Period components.
+
+        return_only_persistent: If True, run function will only return the results of \
+            Persistent Assignments. (default: True)
+
+        output_folder: Path or S3 URI to the output folder. (default: None)
+
+    Returns:
+        The datasets are produced without data if the output folder is defined.
+
+    Raises:
+        SemanticError: If any dataset does not contain a valid `Schema` instance as its structure.
+    """
+    mapping_dict = {}
+    input_names = _extract_input_datasets(script)
+
+    if not isinstance(datasets, (list, set)) or any(
+        not isinstance(ds, PandasDataset) for ds in datasets
+    ):
+        type_ = type(datasets).__name__
+        if isinstance(datasets, (list, set)):
+            object_typing = {type(o).__name__ for o in datasets}
+            type_ = f"{type_}[{', '.join(object_typing)}]"
+        raise SemanticError("0-1-3-7", type_=type_)
+
+    # Mapping handling
+    if mappings is None:
+        if len(datasets) != 1:
+            raise SemanticError("0-1-3-3")
+        if len(datasets) == 1:
+            if len(input_names) != 1:
+                raise SemanticError("0-1-3-1", number_datasets=len(input_names))
+            schema = datasets[0].structure
+            if not isinstance(schema, Schema):
+                raise SemanticError("0-1-3-2", schema=schema)
+            mapping_dict = {schema.short_urn: input_names[0]}
+    elif isinstance(mappings, Dict):
+        mapping_dict = mappings
+    elif isinstance(mappings, VtlDataflowMapping):
+        if mappings.to_vtl_mapping_method is not None:
+            warnings.warn(
+                "To_vtl_mapping_method is not implemented yet, we will use the Basic "
+                "method with old data."
+            )
+        if mappings.from_vtl_mapping_method is not None:
+            warnings.warn(
+                "From_vtl_mapping_method is not implemented yet, we will use the Basic "
+                "method with old data."
+            )
+        if isinstance(mappings.dataflow, str):
+            short_urn = str(parse_urn(mappings.dataflow))
+        elif isinstance(mappings.dataflow, (Reference, DataflowRef)):
+            short_urn = str(mappings.dataflow)
+        elif isinstance(mappings.dataflow, Dataflow):
+            short_urn = mappings.dataflow.short_urn
+        else:
+            raise TypeError(
+                "Expected str, Reference, DataflowRef or Dataflow type for dataflow in "
+                "VtlDataflowMapping."
+            )
+
+        mapping_dict = {short_urn: mappings.dataflow_alias}
+    else:
+        raise TypeError("Expected dict or VtlDataflowMapping type for mappings.")
+
+    for vtl_name in mapping_dict.values():
+        if vtl_name not in input_names:
+            raise SemanticError("0-1-3-5", dataset_name=vtl_name)
+
+    datapoints = {}
+    data_structures = []
+    for dataset in datasets:
+        schema = dataset.structure
+        if not isinstance(schema, Schema):
+            raise SemanticError("0-1-3-2", schema=schema)
+        if schema.short_urn not in mapping_dict:
+            raise SemanticError("0-1-3-4", short_urn=schema.short_urn)
+        # Generating VTL Datastructure and Datapoints.
+        dataset_name = mapping_dict[schema.short_urn]
+        vtl_structure = to_vtl_json(schema, dataset_name)
+        data_structures.append(vtl_structure)
+        datapoints[dataset_name] = dataset.data
+
+    missing = []
+    for input_name in input_names:
+        if input_name not in mapping_dict.values():
+            missing.append(input_name)
+    if missing:
+        raise SemanticError("0-1-3-6", missing=missing)
+
+    result = run(
+        script=script,
+        data_structures=data_structures,
+        datapoints=datapoints,
+        value_domains=value_domains,
+        external_routines=external_routines,
+        time_period_output_format=time_period_output_format,
+        return_only_persistent=return_only_persistent,
+        output_folder=output_folder,
+    )
+    return result
+
+
+def generate_sdmx(
+    script: Union[str, Path], agency_id: str, id: str, version: str = "1.0"
+) -> TransformationScheme:
+    """
+    Function that generates a TransformationScheme object from a VTL script.
+
+    The TransformationScheme object is the SDMX representation of the VTL script. \
+    For more details please check the `SDMX IM VTL objects \
+    <https://sdmx.org/wp-content/uploads/SDMX_3-0-0_SECTION_2_FINAL-1_0.pdf#page=146>`_, line 2266.
+
+    Args:
+        script: A string with the VTL script.
+        agency_id: The Agency ID used in the generated `TransformationScheme` object.
+        id: The given id of the generated `TransformationScheme` object.
+        version: The Version used in the generated `TransformationScheme` object. (default: "1.0")
+
+    Returns:
+        The generated TransformationScheme object.
+    """
+    vtl = load_vtl(script)
+    ast = create_ast(vtl)
+    result = ast_to_sdmx(ast, agency_id, id, version)
+    return result
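To illustrate the new SDMX-facing entry points, here is a sketch of the intended workflow under the get_datasets recommendation above. The file paths are hypothetical, and the get_datasets call assumes the pysdmx API linked in the docstring:

    from pysdmx.io import get_datasets
    from vtlengine.API import generate_sdmx, run_sdmx

    # Read SDMX data plus its structure into PandasDataset objects
    datasets = get_datasets("data.xml", "structures.xml")

    # Single dataset, single input: no explicit mapping is needed, the
    # dataset is bound to the script's only input name (TEST_DS here)
    result = run_sdmx("DS_r <- TEST_DS;", datasets=datasets)

    # The opposite direction: wrap a VTL script in a TransformationScheme
    scheme = generate_sdmx("DS_r <- TEST_DS;", agency_id="MD", id="TS1", version="1.0")
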
vtlengine/AST/ASTComment.py ADDED
@@ -0,0 +1,56 @@
+from antlr4 import CommonTokenStream, InputStream
+from antlr4.Token import CommonToken
+
+from vtlengine.API import create_ast
+from vtlengine.AST import Comment, Start
+from vtlengine.AST.ASTConstructorModules import extract_token_info
+from vtlengine.AST.Grammar.lexer import Lexer
+
+
+def generate_ast_comment(token: CommonToken) -> Comment:
+    """
+    Parses a token belonging to a comment and returns a Comment AST object.
+
+    Args:
+        token (CommonToken): The comment token to parse.
+
+    Returns:
+        Comment: A Comment AST object.
+    """
+    token_info = extract_token_info(token)
+    text = token.text
+    if token.type == Lexer.SL_COMMENT:
+        text = token.text[:-1]  # Remove the trailing newline character
+    return Comment(value=text, **token_info)
+
+
+def create_ast_with_comments(text: str) -> Start:
+    """
+    Parses a VTL script and returns an AST with comments.
+
+    Args:
+        text (str): The VTL script to parse.
+
+    Returns:
+        Start: The generated AST with comments.
+    """
+    # Call the create_ast function to generate the AST from channel 0
+    ast = create_ast(text)
+
+    # Read the script on channel 2 to get the comments
+    lexer_ = Lexer(InputStream(text))
+    stream = CommonTokenStream(lexer_, channel=2)
+
+    # Fill the stream's token buffer
+    stream.fill()
+
+    # Extract comments from the stream
+    comments = [generate_ast_comment(token) for token in stream.tokens if token.channel == 2]
+
+    # Add comments to the AST
+    ast.children.extend(comments)
+
+    # Sort the AST children based on their start line and column
+    ast.children.sort(key=lambda x: (x.line_start, x.column_start))
+
+    return ast
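A small sketch of the comment-aware parse that prettify builds on (the script literal is hypothetical):

    from vtlengine.AST.ASTComment import create_ast_with_comments

    ast = create_ast_with_comments("/* double the input */\nDS_r <- DS_1 * 2;")

    # Comment nodes are merged with the statements in source order
    print([type(child).__name__ for child in ast.children])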