vtlengine 1.4.0rc2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vtlengine/API/_InternalApi.py +791 -0
- vtlengine/API/__init__.py +612 -0
- vtlengine/API/data/schema/external_routines_schema.json +34 -0
- vtlengine/API/data/schema/json_schema_2.1.json +116 -0
- vtlengine/API/data/schema/value_domain_schema.json +97 -0
- vtlengine/AST/ASTComment.py +57 -0
- vtlengine/AST/ASTConstructor.py +598 -0
- vtlengine/AST/ASTConstructorModules/Expr.py +1928 -0
- vtlengine/AST/ASTConstructorModules/ExprComponents.py +995 -0
- vtlengine/AST/ASTConstructorModules/Terminals.py +790 -0
- vtlengine/AST/ASTConstructorModules/__init__.py +50 -0
- vtlengine/AST/ASTDataExchange.py +10 -0
- vtlengine/AST/ASTEncoders.py +32 -0
- vtlengine/AST/ASTString.py +675 -0
- vtlengine/AST/ASTTemplate.py +558 -0
- vtlengine/AST/ASTVisitor.py +25 -0
- vtlengine/AST/DAG/__init__.py +479 -0
- vtlengine/AST/DAG/_words.py +10 -0
- vtlengine/AST/Grammar/Vtl.g4 +705 -0
- vtlengine/AST/Grammar/VtlTokens.g4 +409 -0
- vtlengine/AST/Grammar/__init__.py +0 -0
- vtlengine/AST/Grammar/lexer.py +2139 -0
- vtlengine/AST/Grammar/parser.py +16597 -0
- vtlengine/AST/Grammar/tokens.py +169 -0
- vtlengine/AST/VtlVisitor.py +824 -0
- vtlengine/AST/__init__.py +674 -0
- vtlengine/DataTypes/TimeHandling.py +562 -0
- vtlengine/DataTypes/__init__.py +863 -0
- vtlengine/DataTypes/_time_checking.py +135 -0
- vtlengine/Exceptions/__exception_file_generator.py +96 -0
- vtlengine/Exceptions/__init__.py +159 -0
- vtlengine/Exceptions/messages.py +1004 -0
- vtlengine/Interpreter/__init__.py +2048 -0
- vtlengine/Model/__init__.py +501 -0
- vtlengine/Operators/Aggregation.py +357 -0
- vtlengine/Operators/Analytic.py +455 -0
- vtlengine/Operators/Assignment.py +23 -0
- vtlengine/Operators/Boolean.py +106 -0
- vtlengine/Operators/CastOperator.py +451 -0
- vtlengine/Operators/Clause.py +366 -0
- vtlengine/Operators/Comparison.py +488 -0
- vtlengine/Operators/Conditional.py +495 -0
- vtlengine/Operators/General.py +191 -0
- vtlengine/Operators/HROperators.py +254 -0
- vtlengine/Operators/Join.py +447 -0
- vtlengine/Operators/Numeric.py +422 -0
- vtlengine/Operators/RoleSetter.py +77 -0
- vtlengine/Operators/Set.py +176 -0
- vtlengine/Operators/String.py +578 -0
- vtlengine/Operators/Time.py +1144 -0
- vtlengine/Operators/Validation.py +275 -0
- vtlengine/Operators/__init__.py +900 -0
- vtlengine/Utils/__Virtual_Assets.py +34 -0
- vtlengine/Utils/__init__.py +479 -0
- vtlengine/__extras_check.py +17 -0
- vtlengine/__init__.py +27 -0
- vtlengine/files/__init__.py +0 -0
- vtlengine/files/output/__init__.py +35 -0
- vtlengine/files/output/_time_period_representation.py +55 -0
- vtlengine/files/parser/__init__.py +240 -0
- vtlengine/files/parser/_rfc_dialect.py +22 -0
- vtlengine/py.typed +0 -0
- vtlengine-1.4.0rc2.dist-info/METADATA +89 -0
- vtlengine-1.4.0rc2.dist-info/RECORD +66 -0
- vtlengine-1.4.0rc2.dist-info/WHEEL +4 -0
- vtlengine-1.4.0rc2.dist-info/licenses/LICENSE.md +661 -0
vtlengine/API/__init__.py
@@ -0,0 +1,612 @@
import warnings
from pathlib import Path
from typing import Any, Dict, List, Optional, Sequence, Union

import pandas as pd
from antlr4 import CommonTokenStream, InputStream  # type: ignore[import-untyped]
from antlr4.error.ErrorListener import ErrorListener  # type: ignore[import-untyped]
from pysdmx.io.pd import PandasDataset
from pysdmx.model import DataflowRef, Reference, TransformationScheme
from pysdmx.model.dataflow import Dataflow, Schema
from pysdmx.model.vtl import VtlDataflowMapping
from pysdmx.util import parse_urn

from vtlengine.API._InternalApi import (
    _check_output_folder,
    _check_script,
    _return_only_persistent_datasets,
    ast_to_sdmx,
    load_datasets,
    load_datasets_with_data,
    load_external_routines,
    load_value_domains,
    load_vtl,
    to_vtl_json,
)
from vtlengine.AST import Start
from vtlengine.AST.ASTConstructor import ASTVisitor
from vtlengine.AST.ASTString import ASTString
from vtlengine.AST.DAG import DAGAnalyzer
from vtlengine.AST.Grammar.lexer import Lexer
from vtlengine.AST.Grammar.parser import Parser
from vtlengine.Exceptions import InputValidationException
from vtlengine.files.output._time_period_representation import (
    TimePeriodRepresentation,
    format_time_period_external_representation,
)
from vtlengine.Interpreter import InterpreterAnalyzer
from vtlengine.Model import Dataset, Scalar

pd.options.mode.chained_assignment = None


class __VTLSingleErrorListener(ErrorListener):  # type: ignore[misc]
    """ """

    def syntaxError(
        self,
        recognizer: Any,
        offendingSymbol: str,
        line: str,
        column: str,
        msg: str,
        e: Any,
    ) -> None:
        raise Exception(
            f"Not valid VTL Syntax \n "
            f"offendingSymbol: {offendingSymbol} \n "
            f"msg: {msg} \n "
            f"line: {line}"
        )


def _lexer(text: str) -> CommonTokenStream:
    """
    Lexing
    """
    lexer_ = Lexer(InputStream(text))
    lexer_._listeners = [__VTLSingleErrorListener()]
    stream = CommonTokenStream(lexer_)

    return stream


def _parser(stream: CommonTokenStream) -> Any:
    """
    Parse the expression
    """
    vtl_parser = Parser(stream)
    vtl_parser._listeners = [__VTLSingleErrorListener()]
    return vtl_parser.start()


def _extract_input_datasets(script: Union[str, TransformationScheme, Path]) -> str:
    if isinstance(script, TransformationScheme):
        vtl_script = _check_script(script)
    elif isinstance(script, (str, Path)):
        vtl_script = load_vtl(script)
    else:
        raise TypeError("Unsupported script type.")

    ast = create_ast(vtl_script)
    dag_inputs = DAGAnalyzer.ds_structure(ast)["global_inputs"]

    return dag_inputs


def prettify(script: Union[str, TransformationScheme, Path]) -> str:
    """
    Function that prettifies the VTL script given.

    Args:
        script: VTL script as a string, a Transformation Scheme object or Path with the VTL script.

    Returns:
        A str with the prettified VTL script.
    """
    from vtlengine.AST.ASTComment import create_ast_with_comments

    checking = _check_script(script)
    vtl = load_vtl(checking)
    ast = create_ast_with_comments(vtl)
    return ASTString(pretty=True).render(ast)


def create_ast(text: str) -> Start:
    """
    Function that creates the AST object.

    Args:
        text: Vtl string expression that will be used to create the AST object.

    Returns:
        The ast object.

    Raises:
        Exception: When the vtl syntax expression is wrong.
    """
    text = text + "\n"
    stream = _lexer(text)
    cst = _parser(stream)
    visitor = ASTVisitor()
    ast = visitor.visitStart(cst)
    DAGAnalyzer.createDAG(ast)
    return ast
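
A minimal usage sketch for the two public helpers above, assuming the module is importable as vtlengine.API and using an illustrative one-statement script:

    from vtlengine.API import create_ast, prettify

    # Hypothetical VTL script, for illustration only.
    script = "DS_r := DS_1 + DS_2;"

    print(prettify(script))   # script re-rendered with normalized layout
    ast = create_ast(script)  # Start node of the AST; raises on invalid VTL syntax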


def validate_dataset(
    data_structures: Union[Dict[str, Any], Path, List[Union[Dict[str, Any], Path]]],
    datapoints: Optional[
        Union[Dict[str, Union[pd.DataFrame, Path, str]], List[Union[str, Path]], Path, str]
    ] = None,
    scalar_values: Optional[Dict[str, Optional[Union[int, str, bool, float]]]] = None,
) -> None:
    """
    Validate that datasets can be loaded from the given data_structures and optional datapoints.

    Args:
        data_structures: Dict, Path, or List of Dict/Path objects representing data structures.
        datapoints: Optional Dict, Path, or List of Dict/Path objects representing datapoints.
        scalar_values: Optional Dict with scalar values to be used in the datasets.

    Raises:
        Exception: If the data structures or datapoints are invalid or cannot be loaded.
    """
    load_datasets_with_data(data_structures, datapoints, scalar_values)


def validate_value_domain(
    input: Union[Dict[str, Any], Path, List[Union[Dict[str, Any], Path]]],
) -> None:
    """
    Validate ValueDomain(s) using JSON Schema.

    Args:
        input: Dict, Path, or List of Dict/Path objects representing value domain definitions.

    Raises:
        Exception: If the input file is invalid, does not exist,
            or the JSON content does not follow the schema.
    """
    load_value_domains(input)


def validate_external_routine(
    input: Union[Dict[str, Any], Path, List[Union[Dict[str, Any], Path]]],
) -> None:
    """
    Validate External Routine(s) using JSON Schema and SQLGlot.

    Args:
        input: Dict, Path, or List of Dict/Path objects representing external routines.

    Raises:
        Exception: If JSON schema validation fails,
            SQL syntax is invalid, or file type is wrong.
    """
    load_external_routines(input)
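
A hedged sketch of the dataset validator above; the paths are hypothetical placeholders for files accepted by load_datasets_with_data:

    from pathlib import Path

    from vtlengine.API import validate_dataset

    # Hypothetical locations for a data-structure JSON file and a CSV datapoints file.
    validate_dataset(
        data_structures=Path("structures/DS_1.json"),
        datapoints=Path("datapoints/DS_1.csv"),
    )  # returns None on success, raises if anything cannot be loaded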


def semantic_analysis(
    script: Union[str, TransformationScheme, Path],
    data_structures: Union[Dict[str, Any], Path, List[Dict[str, Any]], List[Path]],
    value_domains: Optional[Union[Dict[str, Any], Path, List[Union[Dict[str, Any], Path]]]] = None,
    external_routines: Optional[
        Union[Dict[str, Any], Path, List[Union[Dict[str, Any], Path]]]
    ] = None,
) -> Dict[str, Dataset]:
    """
    Checks whether the VTL script and its related data structures are valid. For compatibility
    with the pysdmx library, the VTL script can also be a Transformation Scheme object, which is
    serialized to a VTL script string before analysis.

    Concepts you may need to know:

    - Vtl script: The script that shows the set of operations to be executed.

    - Data Structure: JSON file that contains the structure and the name for the dataset(s) \
      (and/or scalar) about the datatype (String, integer or number), \
      the role (Identifier, Attribute or Measure) and the nullability each component has.

    - Value domains: Collection of unique values on the same datatype.

    - External routines: SQL query used to transform a dataset.

    This function has the following params:

    Args:
        script: Vtl script as a string, Transformation Scheme object or Path to the folder \
            that holds the vtl script.
        data_structures: Dict or Path (file or folder), \
            or List of Dicts or Paths with the data structures JSON files.
        value_domains: Dict or Path, or List of Dicts or Paths of the \
            value domains JSON files. (default: None) It is passed as an object, that can be read \
            from a Path or from a dictionary. Furthermore, a list of those objects can be passed. \
            Check the following example: \
            :ref:`Example 5 <example_5_run_with_multiple_value_domains_and_external_routines>`.

        external_routines: String or Path, or List of Strings or Paths of the \
            external routines SQL files. (default: None) It is passed as an object, that can be read \
            from a Path or from a dictionary. Furthermore, a list of those objects can be passed. \
            Check the following example: \
            :ref:`Example 5 <example_5_run_with_multiple_value_domains_and_external_routines>`.

    Returns:
        The computed datasets.

    Raises:
        Exception: If the files have the wrong format, or they do not exist, \
            or their Paths are invalid.
    """

    # AST generation
    checking = _check_script(script)
    vtl = load_vtl(checking)
    ast = create_ast(vtl)

    # Loading datasets
    datasets, scalars = load_datasets(data_structures)

    # Handling of library items
    vd = None
    if value_domains is not None:
        vd = load_value_domains(value_domains)
    ext_routines = None
    if external_routines is not None:
        ext_routines = load_external_routines(external_routines)

    # Running the interpreter
    interpreter = InterpreterAnalyzer(
        datasets=datasets,
        value_domains=vd,
        external_routines=ext_routines,
        scalars=scalars,
        only_semantic=True,
    )
    result = interpreter.visit(ast)
    return result
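
A hedged usage sketch of semantic_analysis, assuming the module is importable as vtlengine.API; the paths are hypothetical placeholders for a VTL script file and a folder of data-structure JSON files:

    from pathlib import Path

    from vtlengine.API import semantic_analysis

    structures = semantic_analysis(
        script=Path("scripts/transformation.vtl"),
        data_structures=Path("structures/"),
    )
    # Mapping of dataset names to their validated (data-free) structures.
    print(list(structures))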


def run(
    script: Union[str, TransformationScheme, Path],
    data_structures: Union[Dict[str, Any], Path, List[Dict[str, Any]], List[Path]],
    datapoints: Union[Dict[str, Union[pd.DataFrame, str, Path]], List[Union[str, Path]], str, Path],
    value_domains: Optional[Union[Dict[str, Any], Path, List[Union[Dict[str, Any], Path]]]] = None,
    external_routines: Optional[
        Union[Dict[str, Any], Path, List[Union[Dict[str, Any], Path]]]
    ] = None,
    time_period_output_format: str = "vtl",
    return_only_persistent: bool = True,
    output_folder: Optional[Union[str, Path]] = None,
    scalar_values: Optional[Dict[str, Optional[Union[int, str, bool, float]]]] = None,
) -> Dict[str, Union[Dataset, Scalar]]:
    """
    Run is the main function of the ``API``, whose purpose is to execute
    the VTL operations over the data.

    Concepts you may need to know:

    - Vtl script: The script that shows the set of operations to be executed.

    - Data Structure: JSON file that contains the structure and the name for the dataset(s) \
      (and/or scalar) about the datatype (String, integer or number), \
      the role (Identifier, Attribute or Measure) and the nullability each component has.

    - Data point: `Pandas Dataframe \
      <https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html>`_ \
      that holds the data related to the Dataset.

    - Value domains: Collection of unique values on the same datatype.

    - External routines: SQL query used to transform a dataset.

    .. important::
        The data structure and the data points must have the same dataset
        name to be loaded correctly.

    .. important::
        If pointing to a Path or an S3 URI, dataset_name will be taken from the file name.
        Example: If the path is 'path/to/data.csv', the dataset name will be 'data'.

    .. important::
        If using an S3 URI, the path must be in the format:

        s3://bucket-name/path/to/data.csv

        The following environment variables must be set (from the AWS account):

        - AWS_ACCESS_KEY_ID
        - AWS_SECRET_ACCESS_KEY

        For more details, see
        `s3fs documentation <https://s3fs.readthedocs.io/en/latest/index.html#credentials>`_.

    Before the execution, the DAG analysis reviews if the VTL script is a directed acyclic graph.

    This function has the following params:

    Args:
        script: VTL script as a string, a Transformation Scheme object or Path with the VTL script.

        data_structures: Dict, Path or a List of Dicts or Paths with the data structures.

        datapoints: Dict, Path, S3 URI or List of S3 URIs or Paths with data. \
            You can also use a custom name for the dataset by passing a dictionary with \
            the dataset name as key and the Path, S3 URI or DataFrame as value. \
            Check the following example: \
            :ref:`Example 6 <example_6_run_using_paths>`.

        value_domains: Dict or Path, or List of Dicts or Paths of the \
            value domains JSON files. (default: None) It is passed as an object, that can be read \
            from a Path or from a dictionary. Furthermore, a list of those objects can be passed. \
            Check the following example: \
            :ref:`Example 5 <example_5_run_with_multiple_value_domains_and_external_routines>`.

        external_routines: String or Path, or List of Strings or Paths of the \
            external routines JSON files. (default: None) It is passed as an object, that can be \
            read from a Path or from a dictionary. Furthermore, a list of those objects can be \
            passed. Check the following example: \
            :ref:`Example 5 <example_5_run_with_multiple_value_domains_and_external_routines>`.

        time_period_output_format: String with the possible values \
            ("sdmx_gregorian", "sdmx_reporting", "vtl") for the representation of the \
            Time Period components.

        return_only_persistent: If True, run function will only return the results of \
            Persistent Assignments. (default: True)

        output_folder: Path or S3 URI to the output folder. (default: None)

        scalar_values: Dict with the scalar values to be used in the VTL script. (default: None)

    Returns:
        The computed datasets. They are produced without data if the output folder is defined.

    Raises:
        Exception: If the files have the wrong format, or they do not exist, \
            or their Paths are invalid.
    """

    # AST generation
    script = _check_script(script)
    vtl = load_vtl(script)
    ast = create_ast(vtl)

    # Loading datasets and datapoints
    datasets, scalars, path_dict = load_datasets_with_data(
        data_structures, datapoints, scalar_values
    )

    # Handling of library items
    vd = None
    if value_domains is not None:
        vd = load_value_domains(value_domains)
    ext_routines = None
    if external_routines is not None:
        ext_routines = load_external_routines(external_routines)

    # Checking time period output format value
    time_period_representation = TimePeriodRepresentation.check_value(time_period_output_format)

    # VTL Efficient analysis
    ds_analysis = DAGAnalyzer.ds_structure(ast)

    # Checking the output path to be a Path object to a directory
    if output_folder is not None:
        _check_output_folder(output_folder)

    # Running the interpreter
    interpreter = InterpreterAnalyzer(
        datasets=datasets,
        value_domains=vd,
        external_routines=ext_routines,
        ds_analysis=ds_analysis,
        datapoints_paths=path_dict,
        output_path=output_folder,
        time_period_representation=time_period_representation,
        return_only_persistent=return_only_persistent,
        scalars=scalars,
    )
    result = interpreter.visit(ast)

    # Applying time period output format
    if output_folder is None:
        for obj in result.values():
            if isinstance(obj, (Dataset, Scalar)):
                format_time_period_external_representation(obj, time_period_representation)

    # Returning only persistent datasets
    if return_only_persistent:
        return _return_only_persistent_datasets(result, ast)
    return result
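
A minimal run sketch, assuming the module is importable as vtlengine.API; the structure file path, component names and script are illustrative and must agree with each other:

    from pathlib import Path

    import pandas as pd

    from vtlengine.API import run

    # In-memory datapoints keyed by dataset name; the name must match the structure file.
    datapoints = {"DS_1": pd.DataFrame({"Id_1": [1, 2], "Me_1": [10.0, 20.0]})}

    results = run(
        script="DS_r <- DS_1[calc Me_2 := Me_1 * 2];",
        data_structures=Path("structures/DS_1.json"),
        datapoints=datapoints,
        return_only_persistent=True,
    )
    # results maps "DS_r" to a Dataset whose data is a pandas DataFrame.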


def run_sdmx(  # noqa: C901
    script: Union[str, TransformationScheme, Path],
    datasets: Sequence[PandasDataset],
    mappings: Optional[Union[VtlDataflowMapping, Dict[str, str]]] = None,
    value_domains: Optional[Union[Dict[str, Any], Path, List[Union[Dict[str, Any], Path]]]] = None,
    external_routines: Optional[
        Union[Dict[str, Any], Path, List[Union[Dict[str, Any], Path]]]
    ] = None,
    time_period_output_format: str = "vtl",
    return_only_persistent: bool = True,
    output_folder: Optional[Union[str, Path]] = None,
) -> Dict[str, Union[Dataset, Scalar]]:
    """
    Executes a VTL script using a list of pysdmx `PandasDataset` objects.

    This function prepares the required VTL data structures and datapoints from
    the given list of pysdmx `PandasDataset` objects. It validates that each
    `PandasDataset` uses a valid `Schema` instance as its structure. Each `Schema` is converted
    to the appropriate VTL JSON data structure, and the Pandas Dataframe is extracted.

    .. important::
        We recommend using this function in combination with the
        `get_datasets <https://py.sdmx.io/howto/data_rw.html#pysdmx.io.get_datasets>`_
        pysdmx method.

    .. important::
        The mapping between pysdmx `PandasDataset
        <https://py.sdmx.io/howto/data_rw.html#pysdmx.io.pd.PandasDataset>`_ \
        and VTL datasets is done using the `Schema` instance of the `PandasDataset`.
        The Schema ID is used as the dataset name.

        DataStructure=MD:TEST_DS(1.0) -> TEST_DS

    The function then calls the :obj:`run <vtlengine.API>` function with the provided VTL
    script and prepared inputs.

    Before the execution, the DAG analysis reviews if the generated VTL script is a directed
    acyclic graph.

    Args:
        script: VTL script as a string, a Transformation Scheme object or Path with the VTL script.

        datasets: A list of PandasDataset objects.

        mappings: A dictionary or VtlDataflowMapping object that maps the dataset names.

        value_domains: Dict or Path, or List of Dicts or Paths of the \
            value domains JSON files. (default: None) It is passed as an object, that can be read \
            from a Path or from a dictionary. Furthermore, a list of those objects can be passed. \
            Check the following example: \
            :ref:`Example 5 <example_5_run_with_multiple_value_domains_and_external_routines>`.

        external_routines: String or Path, or List of Strings or Paths of the \
            external routines JSON files. (default: None) It is passed as an object, that can be \
            read from a Path or from a dictionary. Furthermore, a list of those objects can be \
            passed. Check the following example: \
            :ref:`Example 5 <example_5_run_with_multiple_value_domains_and_external_routines>`.

        time_period_output_format: String with the possible values \
            ("sdmx_gregorian", "sdmx_reporting", "vtl") for the representation of the \
            Time Period components.

        return_only_persistent: If True, run function will only return the results of \
            Persistent Assignments. (default: True)

        output_folder: Path or S3 URI to the output folder. (default: None)

    Returns:
        The computed datasets. They are produced without data if the output folder is defined.

    Raises:
        SemanticError: If any dataset does not contain a valid `Schema` instance as its structure.
    """
    mapping_dict = {}
    input_names = _extract_input_datasets(script)

    if not isinstance(datasets, (list, set)) or any(
        not isinstance(ds, PandasDataset) for ds in datasets
    ):
        type_ = type(datasets).__name__
        if isinstance(datasets, (list, set)):
            object_typing = {type(o).__name__ for o in datasets}
            type_ = f"{type_}[{', '.join(object_typing)}]"
        raise InputValidationException("0-1-3-7", type_=type_)

    # Mapping handling
    if mappings is None:
        if len(datasets) != 1:
            raise InputValidationException("0-1-3-3")
        if len(datasets) == 1:
            if len(input_names) != 1:
                raise InputValidationException("0-1-3-1", number_datasets=len(input_names))
            schema = datasets[0].structure
            if not isinstance(schema, Schema):
                raise InputValidationException("0-1-3-2", schema=schema)
            mapping_dict = {schema.short_urn: input_names[0]}
    elif isinstance(mappings, Dict):
        mapping_dict = mappings
    elif isinstance(mappings, VtlDataflowMapping):
        if mappings.to_vtl_mapping_method is not None:
            warnings.warn(
                "To_vtl_mapping_method is not implemented yet, we will use the Basic "
                "method with old data."
            )
        if mappings.from_vtl_mapping_method is not None:
            warnings.warn(
                "From_vtl_mapping_method is not implemented yet, we will use the Basic "
                "method with old data."
            )
        if isinstance(mappings.dataflow, str):
            short_urn = str(parse_urn(mappings.dataflow))
        elif isinstance(mappings.dataflow, (Reference, DataflowRef)):
            short_urn = str(mappings.dataflow)
        elif isinstance(mappings.dataflow, Dataflow):
            short_urn = mappings.dataflow.short_urn
        else:
            raise InputValidationException(
                "Expected str, Reference, DataflowRef or Dataflow type for dataflow in "
                "VtlDataflowMapping."
            )

        mapping_dict = {short_urn: mappings.dataflow_alias}
    else:
        raise InputValidationException("Expected dict or VtlDataflowMapping type for mappings.")

    for vtl_name in mapping_dict.values():
        if vtl_name not in input_names:
            raise InputValidationException("0-1-3-5", dataset_name=vtl_name)

    datapoints = {}
    data_structures = []
    for dataset in datasets:
        schema = dataset.structure
        if not isinstance(schema, Schema):
            raise InputValidationException("0-1-3-2", schema=schema)
        if schema.short_urn not in mapping_dict:
            raise InputValidationException("0-1-3-4", short_urn=schema.short_urn)
        # Generating VTL Datastructure and Datapoints.
        dataset_name = mapping_dict[schema.short_urn]
        vtl_structure = to_vtl_json(schema, dataset_name)
        data_structures.append(vtl_structure)
        datapoints[dataset_name] = dataset.data

    missing = []
    for input_name in input_names:
        if input_name not in mapping_dict.values():
            missing.append(input_name)
    if missing:
        raise InputValidationException("0-1-3-6", missing=missing)

    result = run(
        script=script,
        data_structures=data_structures,
        datapoints=datapoints,
        value_domains=value_domains,
        external_routines=external_routines,
        time_period_output_format=time_period_output_format,
        return_only_persistent=return_only_persistent,
        output_folder=output_folder,
    )
    return result
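
A hedged sketch of run_sdmx fed from pysdmx, assuming the module is importable as vtlengine.API; the file names, dataset short URN and script are illustrative, and the exact get_datasets arguments should be taken from the pysdmx documentation linked in the docstring above:

    from pysdmx.io import get_datasets

    from vtlengine.API import run_sdmx

    # Hypothetical SDMX-ML files: one data message and its structure metadata.
    sdmx_datasets = get_datasets("data.xml", "structures.xml")

    results = run_sdmx(
        script="DS_r <- TEST_DS;",
        datasets=sdmx_datasets,
        # Maps the Schema short URN of each PandasDataset to the VTL dataset name.
        mappings={"DataStructure=MD:TEST_DS(1.0)": "TEST_DS"},
    )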


def generate_sdmx(
    script: Union[str, Path], agency_id: str, id: str, version: str = "1.0"
) -> TransformationScheme:
    """
    Function that generates a TransformationScheme object from a VTL script.

    The TransformationScheme object is the SDMX representation of the VTL script. \
    For more details please check the `SDMX IM VTL objects \
    <https://sdmx.org/wp-content/uploads/SDMX_3-0-0_SECTION_2_FINAL-1_0.pdf#page=146>`_, line 2266.

    Args:
        script: A string or Path with the VTL script.
        agency_id: The Agency ID used in the generated `TransformationScheme` object.
        id: The given id of the generated `TransformationScheme` object.
        version: The Version used in the generated `TransformationScheme` object. (default: "1.0")

    Returns:
        The generated Transformation Scheme object.
    """
    vtl = load_vtl(script)
    ast = create_ast(vtl)
    result = ast_to_sdmx(ast, agency_id, id, version)
    return result
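
A short sketch of generate_sdmx under the same importability assumption; the two-statement script, agency and id are illustrative:

    from vtlengine.API import generate_sdmx

    scheme = generate_sdmx(
        script="DS_A := DS_1 * 2; DS_r <- DS_A + 1;",
        agency_id="MD",
        id="TS1",
        version="1.0",
    )
    # scheme is a pysdmx TransformationScheme representing the script in SDMX terms.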

vtlengine/API/data/schema/external_routines_schema.json
@@ -0,0 +1,34 @@
{
  "$schema": "http://json-schema.org/draft-07/schema#",
  "definitions": {
    "sqlQuery": {
      "type": "object",
      "properties": {
        "name": {
          "type": "string",
          "minLength": 1,
          "description": "Identifier for the SQL query"
        },
        "query": {
          "type": "string",
          "minLength": 1,
          "description": "SQL query statement"
        }
      },
      "required": ["name", "query"],
      "additionalProperties": false
    }
  },
  "oneOf": [
    {
      "$ref": "#/definitions/sqlQuery"
    },
    {
      "type": "array",
      "items": {
        "$ref": "#/definitions/sqlQuery"
      },
      "minItems": 1
    }
  ]
}