vtlengine 1.4.0rc2__py3-none-any.whl
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in the public registries.
- vtlengine/API/_InternalApi.py +791 -0
- vtlengine/API/__init__.py +612 -0
- vtlengine/API/data/schema/external_routines_schema.json +34 -0
- vtlengine/API/data/schema/json_schema_2.1.json +116 -0
- vtlengine/API/data/schema/value_domain_schema.json +97 -0
- vtlengine/AST/ASTComment.py +57 -0
- vtlengine/AST/ASTConstructor.py +598 -0
- vtlengine/AST/ASTConstructorModules/Expr.py +1928 -0
- vtlengine/AST/ASTConstructorModules/ExprComponents.py +995 -0
- vtlengine/AST/ASTConstructorModules/Terminals.py +790 -0
- vtlengine/AST/ASTConstructorModules/__init__.py +50 -0
- vtlengine/AST/ASTDataExchange.py +10 -0
- vtlengine/AST/ASTEncoders.py +32 -0
- vtlengine/AST/ASTString.py +675 -0
- vtlengine/AST/ASTTemplate.py +558 -0
- vtlengine/AST/ASTVisitor.py +25 -0
- vtlengine/AST/DAG/__init__.py +479 -0
- vtlengine/AST/DAG/_words.py +10 -0
- vtlengine/AST/Grammar/Vtl.g4 +705 -0
- vtlengine/AST/Grammar/VtlTokens.g4 +409 -0
- vtlengine/AST/Grammar/__init__.py +0 -0
- vtlengine/AST/Grammar/lexer.py +2139 -0
- vtlengine/AST/Grammar/parser.py +16597 -0
- vtlengine/AST/Grammar/tokens.py +169 -0
- vtlengine/AST/VtlVisitor.py +824 -0
- vtlengine/AST/__init__.py +674 -0
- vtlengine/DataTypes/TimeHandling.py +562 -0
- vtlengine/DataTypes/__init__.py +863 -0
- vtlengine/DataTypes/_time_checking.py +135 -0
- vtlengine/Exceptions/__exception_file_generator.py +96 -0
- vtlengine/Exceptions/__init__.py +159 -0
- vtlengine/Exceptions/messages.py +1004 -0
- vtlengine/Interpreter/__init__.py +2048 -0
- vtlengine/Model/__init__.py +501 -0
- vtlengine/Operators/Aggregation.py +357 -0
- vtlengine/Operators/Analytic.py +455 -0
- vtlengine/Operators/Assignment.py +23 -0
- vtlengine/Operators/Boolean.py +106 -0
- vtlengine/Operators/CastOperator.py +451 -0
- vtlengine/Operators/Clause.py +366 -0
- vtlengine/Operators/Comparison.py +488 -0
- vtlengine/Operators/Conditional.py +495 -0
- vtlengine/Operators/General.py +191 -0
- vtlengine/Operators/HROperators.py +254 -0
- vtlengine/Operators/Join.py +447 -0
- vtlengine/Operators/Numeric.py +422 -0
- vtlengine/Operators/RoleSetter.py +77 -0
- vtlengine/Operators/Set.py +176 -0
- vtlengine/Operators/String.py +578 -0
- vtlengine/Operators/Time.py +1144 -0
- vtlengine/Operators/Validation.py +275 -0
- vtlengine/Operators/__init__.py +900 -0
- vtlengine/Utils/__Virtual_Assets.py +34 -0
- vtlengine/Utils/__init__.py +479 -0
- vtlengine/__extras_check.py +17 -0
- vtlengine/__init__.py +27 -0
- vtlengine/files/__init__.py +0 -0
- vtlengine/files/output/__init__.py +35 -0
- vtlengine/files/output/_time_period_representation.py +55 -0
- vtlengine/files/parser/__init__.py +240 -0
- vtlengine/files/parser/_rfc_dialect.py +22 -0
- vtlengine/py.typed +0 -0
- vtlengine-1.4.0rc2.dist-info/METADATA +89 -0
- vtlengine-1.4.0rc2.dist-info/RECORD +66 -0
- vtlengine-1.4.0rc2.dist-info/WHEEL +4 -0
- vtlengine-1.4.0rc2.dist-info/licenses/LICENSE.md +661 -0
@@ -0,0 +1,791 @@
+import gc
+import json
+import os
+from pathlib import Path
+from typing import Any, Dict, List, Literal, Optional, Tuple, Union
+
+import jsonschema
+import pandas as pd
+from pysdmx.model.dataflow import Component as SDMXComponent
+from pysdmx.model.dataflow import DataStructureDefinition, Schema
+from pysdmx.model.dataflow import Role as SDMX_Role
+from pysdmx.model.vtl import (
+    Ruleset,
+    RulesetScheme,
+    Transformation,
+    TransformationScheme,
+    UserDefinedOperator,
+    UserDefinedOperatorScheme,
+)
+
+from vtlengine import AST as AST
+from vtlengine.__extras_check import __check_s3_extra
+from vtlengine.AST import Assignment, DPRuleset, HRuleset, Operator, PersistentAssignment, Start
+from vtlengine.AST.ASTString import ASTString
+from vtlengine.DataTypes import SCALAR_TYPES
+from vtlengine.Exceptions import (
+    DataLoadError,
+    InputValidationException,
+    check_key,
+)
+from vtlengine.files.parser import (
+    _fill_dataset_empty_data,
+    _validate_pandas,
+    load_datapoints,
+)
+from vtlengine.Model import (
+    Component as VTL_Component,
+)
+from vtlengine.Model import (
+    Dataset,
+    ExternalRoutine,
+    Role,
+    Role_keys,
+    Scalar,
+    ValueDomain,
+)
+from vtlengine.Utils import VTL_DTYPES_MAPPING, VTL_ROLE_MAPPING
+
+base_path = Path(__file__).parent
+schema_path = base_path / "data" / "schema"
+sdmx_csv_path = base_path / "data" / "sdmx_csv"
+with open(schema_path / "json_schema_2.1.json", "r") as file:
+    schema = json.load(file)
+with open(schema_path / "value_domain_schema.json", "r") as file:
+    vd_schema = json.load(file)
+with open(schema_path / "external_routines_schema.json", "r") as file:
+    external_routine_schema = json.load(file)
+
+
+def _load_dataset_from_structure(
+    structures: Dict[str, Any],
+) -> Tuple[Dict[str, Any], Dict[str, Any]]:
+    """
+    Loads a dataset with the structure given.
+    """
+    datasets = {}
+    scalars = {}
+
+    if "datasets" in structures:
+        for dataset_json in structures["datasets"]:
+            dataset_name = dataset_json["name"]
+            components = {}
+
+            if "structure" in dataset_json:
+                structure_name = dataset_json["structure"]
+                structure_json = None
+                for s in structures["structures"]:
+                    if s["name"] == structure_name:
+                        structure_json = s
+                if structure_json is None:
+                    raise InputValidationException(code="0-2-1-2", message="Structure not found.")
+                try:
+                    jsonschema.validate(instance=structure_json, schema=schema)
+                except jsonschema.exceptions.ValidationError as e:
+                    raise InputValidationException(code="0-2-1-2", message=e.message)
+
+                for component in structure_json["components"]:
+                    check_key("data_type", SCALAR_TYPES.keys(), component["data_type"])
+                    if component["role"] == "ViralAttribute":
+                        component["role"] = "Attribute"
+
+                    check_key("role", Role_keys, component["role"])
+
+                    if "nullable" not in component:
+                        if Role(component["role"]) == Role.IDENTIFIER:
+                            component["nullable"] = False
+                        elif Role(component["role"]) in (Role.MEASURE, Role.ATTRIBUTE):
+                            component["nullable"] = True
+                        else:
+                            component["nullable"] = False
+
+                    components[component["name"]] = VTL_Component(
+                        name=component["name"],
+                        data_type=SCALAR_TYPES[component["data_type"]],
+                        role=Role(component["role"]),
+                        nullable=component["nullable"],
+                    )
+
+            if "DataStructure" in dataset_json:
+                for component in dataset_json["DataStructure"]:
+                    check_key("data_type", SCALAR_TYPES.keys(), component["type"])
+                    check_key("role", Role_keys, component["role"])
+                    components[component["name"]] = VTL_Component(
+                        name=component["name"],
+                        data_type=SCALAR_TYPES[component["type"]],
+                        role=Role(component["role"]),
+                        nullable=component["nullable"],
+                    )
+
+            datasets[dataset_name] = Dataset(name=dataset_name, components=components, data=None)
+    if "scalars" in structures:
+        for scalar_json in structures["scalars"]:
+            scalar_name = scalar_json["name"]
+            check_key("type", SCALAR_TYPES.keys(), scalar_json["type"])
+            scalar = Scalar(
+                name=scalar_name,
+                data_type=SCALAR_TYPES[scalar_json["type"]],
+                value=None,
+            )
+            scalars[scalar_name] = scalar
+    return datasets, scalars
+
+
+def _generate_single_path_dict(
+    datapoint: Path,
+) -> Dict[str, Path]:
+    """
+    Generates a dict with one dataset name and its path. The dataset name is extracted
+    from the filename without the .csv extension.
+    """
+    dataset_name = datapoint.name.removesuffix(".csv")
+    dict_paths = {dataset_name: datapoint}
+    return dict_paths
+
+
+def _load_single_datapoint(datapoint: Union[str, Path]) -> Dict[str, Union[str, Path]]:
+    """
+    Returns a dict with the data given from one dataset.
+    """
+    if not isinstance(datapoint, (str, Path)):
+        raise InputValidationException(
+            code="0-1-1-2", input=datapoint, message="Input must be a Path or an S3 URI"
+        )
+    # Handling of str values
+    if isinstance(datapoint, str):
+        if "s3://" in datapoint:
+            __check_s3_extra()
+            dataset_name = datapoint.split("/")[-1].removesuffix(".csv")
+            return {dataset_name: datapoint}
+        # Converting to Path object if it is not an S3 URI
+        try:
+            datapoint = Path(datapoint)
+        except Exception:
+            raise InputValidationException(
+                code="0-1-1-2", input=datapoint, message="Input must refer to a Path or an S3 URI"
+            )
+    # Validation of Path object
+    if not datapoint.exists():
+        raise DataLoadError(code="0-3-1-1", file=datapoint)
+
+    # Generation of datapoints dictionary with Path objects
+    dict_paths: Dict[str, Path] = {}
+    if datapoint.is_dir():
+        for f in datapoint.iterdir():
+            if f.suffix != ".csv":
+                continue
+            dict_paths.update(_generate_single_path_dict(f))
+    else:
+        dict_paths = _generate_single_path_dict(datapoint)
+    return dict_paths  # type: ignore[return-value]
+
+
+def _check_unique_datapoints(
+    datapoints_to_add: List[str],
+    datapoints_present: List[str],
+) -> None:
+    """
+    Checks we don´t add duplicate dataset names in the datapoints.
+    """
+    for x in datapoints_to_add:
+        if x in datapoints_present:
+            raise InputValidationException(
+                f"Duplicate dataset name found in datapoints: {x}. "
+                f"Please check file names and dictionary keys in datapoints."
+            )
+
+
+def _load_datapoints_path(
+    datapoints: Union[Dict[str, Union[str, Path]], List[Union[str, Path]], str, Path],
+) -> Dict[str, Union[str, Path]]:
+    """
+    Returns a dict with the data given from a Path.
+    """
+    dict_datapoints: Dict[str, Union[str, Path]] = {}
+    if isinstance(datapoints, dict):
+        for dataset_name, datapoint in datapoints.items():
+            if not isinstance(dataset_name, str):
+                raise InputValidationException(
+                    code="0-1-1-2",
+                    input=dataset_name,
+                    message="Datapoints dictionary keys must be strings.",
+                )
+            if not isinstance(datapoint, (str, Path)):
+                raise InputValidationException(
+                    code="0-1-1-2",
+                    input=datapoint,
+                    message="Datapoints dictionary values must be Paths or S3 URIs.",
+                )
+            single_datapoint = _load_single_datapoint(datapoint)
+            first_datapoint = list(single_datapoint.values())[0]
+            _check_unique_datapoints([dataset_name], list(dict_datapoints.keys()))
+            dict_datapoints[dataset_name] = first_datapoint
+        return dict_datapoints
+    if isinstance(datapoints, list):
+        for x in datapoints:
+            single_datapoint = _load_single_datapoint(x)
+            _check_unique_datapoints(list(single_datapoint.keys()), list(dict_datapoints.keys()))
+            dict_datapoints.update(single_datapoint)
+        return dict_datapoints
+    return _load_single_datapoint(datapoints)
+
+
+def _load_datastructure_single(
+    data_structure: Union[Dict[str, Any], Path],
+) -> Tuple[Dict[str, Dataset], Dict[str, Scalar]]:
+    """
+    Loads a single data structure.
+    """
+    if isinstance(data_structure, dict):
+        return _load_dataset_from_structure(data_structure)
+    if not isinstance(data_structure, Path):
+        raise InputValidationException(
+            code="0-1-1-2", input=data_structure, message="Input must be a dict or Path object"
+        )
+    if not data_structure.exists():
+        raise DataLoadError(code="0-3-1-1", file=data_structure)
+    if data_structure.is_dir():
+        datasets: Dict[str, Dataset] = {}
+        scalars: Dict[str, Scalar] = {}
+        for f in data_structure.iterdir():
+            if f.suffix != ".json":
+                continue
+            ds, sc = _load_datastructure_single(f)
+            datasets = {**datasets, **ds}
+            scalars = {**scalars, **sc}
+        return datasets, scalars
+    else:
+        if data_structure.suffix != ".json":
+            raise InputValidationException(
+                code="0-1-1-3", expected_ext=".json", ext=data_structure.suffix
+            )
+        with open(data_structure, "r") as file:
+            structures = json.load(file)
+        return _load_dataset_from_structure(structures)
+
+
+def load_datasets(
+    data_structure: Union[Dict[str, Any], Path, List[Dict[str, Any]], List[Path]],
+) -> Tuple[Dict[str, Dataset], Dict[str, Scalar]]:
+    """
+    Loads multiple datasets.
+
+    Args:
+        data_structure: Dict, Path or a List of dicts or Paths.
+
+    Returns:
+        The datastructure as a dict or a list of datastructures as dicts. \
+        These dicts will have as keys the name, role, \
+        type and nullable of the data contained in the dataset.
+
+    Raises:
+        Exception: If the Path is invalid or datastructure has a wrong format.
+    """
+    if isinstance(data_structure, dict):
+        return _load_datastructure_single(data_structure)
+    if isinstance(data_structure, list):
+        ds_structures: Dict[str, Dataset] = {}
+        scalar_structures: Dict[str, Scalar] = {}
+        for x in data_structure:
+            ds, sc = _load_datastructure_single(x)
+            ds_structures = {**ds_structures, **ds}  # Overwrite ds_structures dict.
+            scalar_structures = {**scalar_structures, **sc}  # Overwrite scalar_structures dict.
+        return ds_structures, scalar_structures
+    return _load_datastructure_single(data_structure)
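For orientation, a minimal usage sketch of `load_datasets` as defined above, using the flat "DataStructure" layout it accepts. The dataset and component names are hypothetical, and the type keys ("Integer", "Number") and role labels ("Identifier", "Measure") are assumed to be valid entries of SCALAR_TYPES and Role_keys:

from vtlengine.API._InternalApi import load_datasets

structure = {
    "datasets": [
        {
            "name": "DS_1",  # hypothetical dataset name
            "DataStructure": [
                {"name": "Id_1", "type": "Integer", "role": "Identifier", "nullable": False},
                {"name": "Me_1", "type": "Number", "role": "Measure", "nullable": True},
            ],
        }
    ]
}
datasets, scalars = load_datasets(structure)  # Dataset objects without data, keyed by name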
+
+
+def _handle_scalars_values(
+    scalars: Dict[str, Scalar],
+    scalar_values: Optional[Dict[str, Optional[Union[int, str, bool, float]]]] = None,
+) -> None:
+    if scalar_values is None:
+        return
+    # Handling scalar values with the scalar dict
+    for name, value in scalar_values.items():
+        if name not in scalars:
+            raise InputValidationException(code="0-1-2-6", name=name)
+        # Casting value to scalar data type
+        if not scalars[name].data_type.check(value):
+            raise InputValidationException(
+                code="0-1-2-7",
+                value=value,
+                type_=scalars[name].data_type.__name__,
+                op_type=type(scalars[name]).__name__,
+                name=name,
+            )
+        scalars[name].value = scalars[name].data_type.cast(value)
+
+
+def _handle_empty_datasets(datasets: Dict[str, Dataset]) -> None:
+    for dataset in datasets.values():
+        if dataset.data is None:
+            _fill_dataset_empty_data(dataset)
+
+
+def load_datasets_with_data(
+    data_structures: Any,
+    datapoints: Optional[
+        Union[Dict[str, Union[pd.DataFrame, Path, str]], List[Union[str, Path]], Path, str]
+    ] = None,
+    scalar_values: Optional[Dict[str, Optional[Union[int, str, bool, float]]]] = None,
+) -> Any:
+    """
+    Loads the dataset structures and fills them with the data contained in the datapoints.
+
+    Args:
+        data_structures: Dict, Path or a List of dicts or Paths.
+        datapoints: Dict, Path or a List of Paths.
+        scalar_values: Dict with the scalar values.
+
+    Returns:
+        A dict with the structure and a pandas dataframe with the data.
+
+    Raises:
+        Exception: If the Path is wrong or the file is invalid.
+    """
+    # Load the datasets without data
+    datasets, scalars = load_datasets(data_structures)
+    # Handle empty datasets and scalar values if no datapoints are given
+    if datapoints is None:
+        _handle_empty_datasets(datasets)
+        _handle_scalars_values(scalars, scalar_values)
+        return datasets, scalars, None
+
+    # Handling dictionary of Pandas Dataframes
+    if isinstance(datapoints, dict) and all(
+        isinstance(v, pd.DataFrame) for v in datapoints.values()
+    ):
+        for dataset_name, data in datapoints.items():
+            if dataset_name not in datasets:
+                raise InputValidationException(
+                    f"Not found dataset {dataset_name} in datastructures."
+                )
+            # This exception is not needed due to the all() check above, but it is left for safety
+            if not isinstance(data, pd.DataFrame):
+                raise InputValidationException(
+                    f"Invalid datapoint for dataset {dataset_name}. Must be a Pandas Dataframe."
+                )
+            datasets[dataset_name].data = _validate_pandas(
+                datasets[dataset_name].components, data, dataset_name
+            )
+        # Handle empty datasets and scalar values for remaining datasets
+        _handle_empty_datasets(datasets)
+        _handle_scalars_values(scalars, scalar_values)
+        return datasets, scalars, None
+
+    # Checking mixed types in the dictionary
+    if isinstance(datapoints, dict) and any(
+        not isinstance(v, (str, Path)) for v in datapoints.values()
+    ):
+        raise InputValidationException(
+            "Invalid datapoints. All values in the dictionary must be Paths or S3 URIs, "
+            "or all values must be Pandas Dataframes."
+        )
+
+    # Handling Individual, List or Dict of Paths or S3 URIs
+    # NOTE: Adding type: ignore[arg-type] due to mypy issue with Union types
+    datapoints_path = _load_datapoints_path(datapoints)  # type: ignore[arg-type]
+    for dataset_name, csv_pointer in datapoints_path.items():
+        # Check if dataset exists in datastructures
+        if dataset_name not in datasets:
+            raise InputValidationException(f"Not found dataset {dataset_name} in datastructures.")
+        # Validate csv path for this dataset
+        components = datasets[dataset_name].components
+        _ = load_datapoints(components=components, dataset_name=dataset_name, csv_path=csv_pointer)
+        gc.collect()  # Garbage collector to free memory after we loaded everything and discarded them
+
+    _handle_empty_datasets(datasets)
+    _handle_scalars_values(scalars, scalar_values)
+
+    return datasets, scalars, datapoints_path
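A sketch of `load_datasets_with_data` fed with an in-memory pandas DataFrame, reusing the hypothetical `structure` dict from the previous sketch; the column names must match the declared components:

import pandas as pd

from vtlengine.API._InternalApi import load_datasets_with_data

df = pd.DataFrame({"Id_1": [1, 2], "Me_1": [10.5, 20.0]})
datasets, scalars, paths = load_datasets_with_data(
    data_structures=structure,  # the dict from the previous sketch
    datapoints={"DS_1": df},    # all-DataFrame dicts are validated via _validate_pandas
)
# paths is None for DataFrame input; it is only filled for CSV paths or S3 URIs.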
+
+
+def load_vtl(input: Union[str, Path]) -> str:
+    """
+    Reads the vtl expression.
+
+    Args:
+        input: String or Path of the vtl expression.
+
+    Returns:
+        If it is a string, it will return the input as a string. \
+        If it is a Path, it will return the expression contained in the file as a string.
+
+    Raises:
+        Exception: If the vtl does not exist, if the Path is wrong, or if it is not a vtl file.
+    """
+    if isinstance(input, str):
+        if os.path.exists(input):
+            input = Path(input)
+        else:
+            return input
+    if not isinstance(input, Path):
+        raise InputValidationException(
+            code="0-1-1-2", input=input, message="Input is not a Path object"
+        )
+    if not input.exists():
+        raise DataLoadError(code="0-3-1-1", file=input)
+    if input.suffix != ".vtl":
+        raise InputValidationException(code="0-1-1-3", expected_ext=".vtl", ext=input.suffix)
+    with open(input, "r") as f:
+        return f.read()
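`load_vtl` accepts either a script string or a path to a `.vtl` file; a minimal sketch (the file name is illustrative):

from pathlib import Path

from vtlengine.API._InternalApi import load_vtl

script = load_vtl("DS_r := DS_1 + 1;")  # strings that are not existing paths are returned as-is
from_file = load_vtl(Path("transformations.vtl"))  # must exist and carry the .vtl extension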
+
+
+def _validate_json(data: Dict[str, Any], schema: Dict[str, Any]) -> None:
+    try:
+        jsonschema.validate(instance=data, schema=schema)
+    except jsonschema.ValidationError as e:
+        raise InputValidationException(code="0-2-1-1", message=f"{e}")
+
+
+def _load_single_value_domain(input: Path) -> Dict[str, ValueDomain]:
+    if input.suffix != ".json":
+        raise InputValidationException(code="0-1-1-3", expected_ext=".json", ext=input.suffix)
+    with open(input, "r") as f:
+        data = json.load(f)
+    _validate_json(data, vd_schema)
+    vd = ValueDomain.from_dict(data)
+    return {vd.name: vd}
+
+
+def load_value_domains(
+    input: Union[Dict[str, Any], Path, List[Union[Dict[str, Any], Path]]],
+) -> Dict[str, ValueDomain]:
+    """
+    Loads the value domains.
+
+    Args:
+        input: Dict or Path, or a list of them \
+        of the json file that contains the value domains data.
+
+    Returns:
+        A dictionary with the value domains data, or a list of dictionaries with them.
+
+    Raises:
+        Exception: If the value domains file is wrong, the Path is invalid, \
+        or the value domains file does not exist.
+    """
+    if isinstance(input, dict):
+        _validate_json(input, vd_schema)
+        vd = ValueDomain.from_dict(input)
+        return {vd.name: vd}
+    if isinstance(input, list):
+        value_domains: Dict[str, Any] = {}
+        for item in input:
+            value_domains.update(load_value_domains(item))
+        return value_domains
+    if not isinstance(input, Path):
+        raise InputValidationException(
+            code="0-1-1-2", input=input, message="Input is not a Path object"
+        )
+    if not input.exists():
+        raise DataLoadError(code="0-3-1-1", file=input)
+    if input.is_dir():
+        value_domains = {}
+        for f in input.iterdir():
+            vd = _load_single_value_domain(f)
+            value_domains = {**value_domains, **vd}
+        return value_domains
+    if input.suffix != ".json":
+        raise InputValidationException(code="0-1-1-3", expected_ext=".json", ext=input.suffix)
+    return _load_single_value_domain(input)
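A minimal sketch of `load_value_domains` using the Path form (the directory name is illustrative); each JSON file is validated against `value_domain_schema.json` before being turned into a `ValueDomain` via `from_dict`:

from pathlib import Path

from vtlengine.API._InternalApi import load_value_domains

# A directory is walked file by file; a single .json file or an already-parsed dict also works.
value_domains = load_value_domains(Path("value_domains/"))
print(list(value_domains))  # ValueDomain objects keyed by their name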
+
+
+def load_external_routines(
+    input: Union[Dict[str, Any], Path, str, List[Union[Dict[str, Any], Path]]],
+) -> Any:
+    """
+    Load the external routines.
+
+    Args:
+        input: Dict or Path, or a list of them, \
+        of the JSON file that contains the external routine data.
+
+    Returns:
+        A dictionary with the external routine data, or a list with \
+        the dictionaries from the Path given.
+
+    Raises:
+        Exception: If the JSON file does not exist, the Path is wrong, or the file is not a \
+        JSON one.
+    """
+    external_routines = {}
+    if isinstance(input, dict):
+        _validate_json(input, external_routine_schema)
+        ext_routine = ExternalRoutine.from_sql_query(input["name"], input["query"])
+        external_routines[ext_routine.name] = ext_routine
+        return external_routines
+    if isinstance(input, list):
+        ext_routines = {}
+        for item in input:
+            ext_routines.update(load_external_routines(item))
+        return ext_routines
+    if not isinstance(input, Path):
+        raise InputValidationException(
+            code="0-1-1-2", input=input, message="Input must be a json file."
+        )
+    if not input.exists():
+        raise DataLoadError(code="0-3-1-1", file=input)
+    if input.is_dir():
+        for f in input.iterdir():
+            if f.suffix != ".sql":
+                continue
+            ext_rout = _load_single_external_routine_from_file(f)
+            external_routines[ext_rout.name] = ext_rout
+        return external_routines
+    ext_rout = _load_single_external_routine_from_file(input)
+    external_routines[ext_rout.name] = ext_rout
+    return external_routines
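A sketch of `load_external_routines` with an inline dict; the routine name and SQL text are illustrative, and the payload is assumed to need only the "name" and "query" fields that the function reads after validation against `external_routines_schema.json`:

from vtlengine.API._InternalApi import load_external_routines

routine = {"name": "filter_ds", "query": "SELECT * FROM DS_1 WHERE Me_1 > 0"}
external_routines = load_external_routines(routine)
# -> {"filter_ds": ExternalRoutine built via ExternalRoutine.from_sql_query}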
+
+
+def _return_only_persistent_datasets(
+    datasets: Dict[str, Union[Dataset, Scalar]], ast: Start
+) -> Dict[str, Union[Dataset, Scalar]]:
+    """
+    Returns only the datasets with a persistent assignment.
+    """
+    return {dataset.name: dataset for dataset in datasets.values() if dataset.persistent}
+
+
+def _load_single_external_routine_from_file(input: Path) -> Any:
+    if not isinstance(input, Path):
+        raise InputValidationException(code="0-1-1-2", input=input)
+    if not input.exists():
+        raise DataLoadError(code="0-3-1-1", file=input)
+    if input.suffix != ".json":
+        raise InputValidationException(code="0-1-1-3", expected_ext=".json", ext=input.suffix)
+    routine_name = input.stem
+    with open(input, "r") as f:
+        data = json.load(f)
+    _validate_json(data, external_routine_schema)
+    ext_rout = ExternalRoutine.from_sql_query(routine_name, data["query"])
+    return ext_rout
+
+
+def _check_output_folder(output_folder: Union[str, Path]) -> None:
+    """
+    Check if the output folder exists. If not, it will create it.
+    """
+    if isinstance(output_folder, str):
+        if "s3://" in output_folder:
+            __check_s3_extra()
+            if not output_folder.endswith("/"):
+                raise DataLoadError("0-3-1-2", folder=str(output_folder))
+            return
+        try:
+            output_folder = Path(output_folder)
+        except Exception:
+            raise DataLoadError("0-3-1-2", folder=str(output_folder))
+
+    if not isinstance(output_folder, Path):
+        raise DataLoadError("0-3-1-2", folder=str(output_folder))
+    if not output_folder.exists():
+        if output_folder.suffix != "":
+            raise DataLoadError("0-3-1-2", folder=str(output_folder))
+        os.mkdir(output_folder)
+
+
+def to_vtl_json(dsd: Union[DataStructureDefinition, Schema], dataset_name: str) -> Dict[str, Any]:
+    """
+    Converts a pysdmx `DataStructureDefinition` or `Schema` into a VTL-compatible JSON
+    representation.
+
+    This function extracts and transforms the components (dimensions, measures, and attributes)
+    from the given SDMX data structure and maps them into a dictionary format that conforms
+    to the expected VTL data structure json schema.
+
+    Args:
+        dsd: An instance of `DataStructureDefinition` or `Schema` from the `pysdmx` model.
+        dataset_name: The name of the resulting VTL dataset.
+
+    Returns:
+        A dictionary representing the dataset in VTL format, with keys for dataset name and its
+        components, including their name, role, data type, and nullability.
+    """
+    components = []
+    NAME = "name"
+    ROLE = "role"
+    TYPE = "type"
+    NULLABLE = "nullable"
+
+    _components: List[SDMXComponent] = []
+    _components.extend(dsd.components.dimensions)
+    _components.extend(dsd.components.measures)
+    _components.extend(dsd.components.attributes)
+
+    for c in _components:
+        _type = VTL_DTYPES_MAPPING[c.dtype]
+        _nullability = c.role != SDMX_Role.DIMENSION
+        _role = VTL_ROLE_MAPPING[c.role]
+
+        component = {
+            NAME: c.id,
+            ROLE: _role,
+            TYPE: _type,
+            NULLABLE: _nullability,
+        }
+
+        components.append(component)
+
+    result = {"datasets": [{"name": dataset_name, "DataStructure": components}]}
+
+    return result
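A sketch of `to_vtl_json`, assuming `dsd` is a pysdmx `DataStructureDefinition` or `Schema` obtained elsewhere (the retrieval step is not shown):

from vtlengine.API._InternalApi import load_datasets, to_vtl_json

# `dsd` is assumed to be a pysdmx DataStructureDefinition or Schema fetched elsewhere.
vtl_structure = to_vtl_json(dsd, dataset_name="DS_1")
# -> {"datasets": [{"name": "DS_1", "DataStructure": [...]}]}
datasets, scalars = load_datasets(vtl_structure)  # feeds directly into the loaders above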
+
+
+def __generate_transformation(
+    child: Union[Assignment, PersistentAssignment], is_persistent: bool, count: int
+) -> Transformation:
+    expression = ASTString().render(ast=child.right)
+    result = child.left.value  # type: ignore[attr-defined]
+    return Transformation(
+        id=f"T{count}",
+        expression=expression,
+        is_persistent=is_persistent,
+        result=result,
+        name=f"Transformation {result}",
+    )
+
+
+def __generate_udo(child: Operator, count: int) -> UserDefinedOperator:
+    operator_definition = ASTString().render(ast=child)
+    return UserDefinedOperator(
+        id=f"UDO{count}",
+        operator_definition=operator_definition,
+        name=f"UserDefinedOperator {child.op}",
+    )
+
+
+def __generate_ruleset(child: Union[DPRuleset, HRuleset], count: int) -> Ruleset:
+    ruleset_definition = ASTString().render(ast=child)
+    ruleset_type: Literal["datapoint", "hierarchical"] = (
+        "datapoint" if isinstance(child, DPRuleset) else "hierarchical"
+    )
+    ruleset_scope: Literal["variable", "valuedomain"] = (
+        "variable" if child.signature_type == "variable" else "valuedomain"
+    )
+    return Ruleset(
+        id=f"R{count}",
+        ruleset_definition=ruleset_definition,
+        ruleset_type=ruleset_type,
+        ruleset_scope=ruleset_scope,
+        name=f"{ruleset_type.capitalize()} ruleset {child.name}",
+    )
+
+
+def ast_to_sdmx(ast: AST.Start, agency_id: str, id: str, version: str) -> TransformationScheme:
+    """
+    Converts a vtl AST into an SDMX compatible `TransformationScheme` object, following
+    the pysdmx model.
+
+    This function iterates over the child nodes of the given AST and categorizes each into one of
+    the following types:
+    - `PersistentAssignment`: Represents a persistent transformation. These are added to the
+      transformation list with a persistence flag.
+    - `Assignment`: Represents a temporary (non-persistent) transformation. These are added to the
+      transformation list without the persistence flag
+    - `DPRuleset` or `HRuleset`: Represent validation rule sets.
+      These are collected and wrapped into a `RulesetScheme` object.
+    - `Operator`: Defines user-defined operators. These are collected
+      into a `UserDefinedOperatorScheme` object.
+
+    After parsing all AST elements:
+    - If any rulesets were found, a `RulesetScheme` is created and added to the references.
+    - If any user-defined operators were found, a `UserDefinedOperatorScheme` is created and added
+      to the references.
+    - A `TransformationScheme` object is constructed with all collected transformations and any
+      additional references.
+
+    Args:
+        ast: The root node of the vtl ast representing the set of
+        vtl expressions.
+        agency_id: The identifier of the agency defining the SDMX structure as a string.
+        id: The identifier of the transformation scheme as a string.
+        version: The version of the transformation scheme given as a string.
+
+    Returns:
+        TransformationScheme: A fully constructed transformation scheme that includes
+        transformations, and optionally rule sets and user-defined operator schemes,
+        suitable for SDMX.
+
+    """
+    list_transformation = []
+    list_udos = []
+    list_rulesets = []
+    count_transformation = 0
+    count_udo = 0
+    count_ruleset = 0
+
+    for child in ast.children:
+        if isinstance(child, PersistentAssignment):
+            count_transformation += 1
+            list_transformation.append(
+                __generate_transformation(
+                    child=child, is_persistent=True, count=count_transformation
+                )
+            )
+        elif isinstance(child, Assignment):
+            count_transformation += 1
+            list_transformation.append(
+                __generate_transformation(
+                    child=child, is_persistent=False, count=count_transformation
+                )
+            )
+        elif isinstance(child, (DPRuleset, HRuleset)):
+            count_ruleset += 1
+            list_rulesets.append(__generate_ruleset(child=child, count=count_ruleset))
+        elif isinstance(child, Operator):
+            count_udo += 1
+            list_udos.append(__generate_udo(child=child, count=count_udo))
+
+    references: Any = {}
+    if list_rulesets:
+        references["ruleset_schemes"] = [
+            RulesetScheme(
+                items=list_rulesets,
+                agency=agency_id,
+                id="RS1",
+                vtl_version="2.1",
+                version=version,
+                name=f"RulesetScheme {id}-RS",
+            )
+        ]
+    if list_udos:
+        references["user_defined_operator_schemes"] = [
+            UserDefinedOperatorScheme(
+                items=list_udos,
+                agency=agency_id,
+                id="UDS1",
+                vtl_version="2.1",
+                version=version,
+                name=f"UserDefinedOperatorScheme {id}-UDS",
+            )
+        ]
+
+    transformation_scheme = TransformationScheme(
+        items=list_transformation,
+        agency=agency_id,
+        id="TS1",
+        vtl_version="2.1",
+        version=version,
+        name=f"TransformationScheme {id}",
+        **references,
+    )
+
+    return transformation_scheme
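A sketch of `ast_to_sdmx`, assuming `start` is an `AST.Start` node already produced by the engine's parser (parsing is outside this module); the agency, id and version values are illustrative:

from vtlengine.API._InternalApi import ast_to_sdmx

# `start` is assumed to be a vtlengine AST.Start node built elsewhere.
scheme = ast_to_sdmx(start, agency_id="MD", id="TS_EXAMPLE", version="1.0")
print(scheme.id)  # always "TS1"; the `id` argument only feeds the scheme name
# Ruleset and user-defined-operator schemes are attached only when the script defines them.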
+
+
+def _check_script(script: Union[str, TransformationScheme, Path]) -> str:
+    """
+    Check if the TransformationScheme object is valid to generate a vtl script.
+    """
+    if not isinstance(script, (str, TransformationScheme, Path)):
+        raise InputValidationException("0-1-1-1", format_=type(script).__name__)
+    if isinstance(script, TransformationScheme):
+        from pysdmx.toolkit.vtl import (
+            generate_vtl_script,
+        )
+
+        vtl_script = generate_vtl_script(script, model_validation=True)
+        return vtl_script
+    else:
+        return str(script)