vtlengine-1.0-py3-none-any.whl → vtlengine-1.0.1-py3-none-any.whl
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
Potentially problematic release: this version of vtlengine might be problematic.
- vtlengine/API/_InternalApi.py +153 -100
- vtlengine/API/__init__.py +109 -67
- vtlengine/AST/ASTConstructor.py +188 -98
- vtlengine/AST/ASTConstructorModules/Expr.py +306 -200
- vtlengine/AST/ASTConstructorModules/ExprComponents.py +172 -102
- vtlengine/AST/ASTConstructorModules/Terminals.py +158 -95
- vtlengine/AST/ASTEncoders.py +1 -1
- vtlengine/AST/ASTTemplate.py +8 -9
- vtlengine/AST/ASTVisitor.py +8 -12
- vtlengine/AST/DAG/__init__.py +43 -35
- vtlengine/AST/DAG/_words.py +4 -4
- vtlengine/AST/Grammar/lexer.py +732 -142
- vtlengine/AST/Grammar/parser.py +2188 -826
- vtlengine/AST/Grammar/tokens.py +128 -128
- vtlengine/AST/VtlVisitor.py +7 -4
- vtlengine/AST/__init__.py +22 -11
- vtlengine/DataTypes/NumericTypesHandling.py +5 -4
- vtlengine/DataTypes/TimeHandling.py +194 -301
- vtlengine/DataTypes/__init__.py +304 -218
- vtlengine/Exceptions/__init__.py +52 -27
- vtlengine/Exceptions/messages.py +134 -62
- vtlengine/Interpreter/__init__.py +781 -487
- vtlengine/Model/__init__.py +165 -121
- vtlengine/Operators/Aggregation.py +156 -95
- vtlengine/Operators/Analytic.py +115 -59
- vtlengine/Operators/Assignment.py +7 -4
- vtlengine/Operators/Boolean.py +27 -32
- vtlengine/Operators/CastOperator.py +177 -131
- vtlengine/Operators/Clause.py +137 -99
- vtlengine/Operators/Comparison.py +148 -117
- vtlengine/Operators/Conditional.py +149 -98
- vtlengine/Operators/General.py +68 -47
- vtlengine/Operators/HROperators.py +91 -72
- vtlengine/Operators/Join.py +217 -118
- vtlengine/Operators/Numeric.py +89 -44
- vtlengine/Operators/RoleSetter.py +16 -15
- vtlengine/Operators/Set.py +61 -36
- vtlengine/Operators/String.py +213 -139
- vtlengine/Operators/Time.py +334 -216
- vtlengine/Operators/Validation.py +117 -76
- vtlengine/Operators/__init__.py +340 -213
- vtlengine/Utils/__init__.py +195 -40
- vtlengine/__init__.py +1 -1
- vtlengine/files/output/__init__.py +15 -6
- vtlengine/files/output/_time_period_representation.py +10 -9
- vtlengine/files/parser/__init__.py +77 -52
- vtlengine/files/parser/_rfc_dialect.py +6 -5
- vtlengine/files/parser/_time_checking.py +46 -37
- vtlengine-1.0.1.dist-info/METADATA +236 -0
- vtlengine-1.0.1.dist-info/RECORD +58 -0
- {vtlengine-1.0.dist-info → vtlengine-1.0.1.dist-info}/WHEEL +1 -1
- vtlengine-1.0.dist-info/METADATA +0 -104
- vtlengine-1.0.dist-info/RECORD +0 -58
- {vtlengine-1.0.dist-info → vtlengine-1.0.1.dist-info}/LICENSE.md +0 -0
vtlengine/Utils/__init__.py
CHANGED
@@ -1,32 +1,196 @@
-from vtlengine.Operators.Aggregation import (
-
-
-
-
-
-
-
-
-
+from vtlengine.Operators.Aggregation import (
+    Avg,
+    Count,
+    Max,
+    Median,
+    Min,
+    PopulationStandardDeviation,
+    PopulationVariance,
+    SampleStandardDeviation,
+    SampleVariance,
+    Sum,
+)
+from vtlengine.Operators.Analytic import (
+    Avg as AvgAnalytic,
+    Count as CountAnalytic,
+    FirstValue,
+    Lag,
+    LastValue,
+    Lead,
+    Max as MaxAnalytic,
+    Median as MedianAnalytic,
+    Min as MinAnalytic,
+    PopulationStandardDeviation as PopulationStandardDeviationAnalytic,
+    PopulationVariance as PopulationVarianceAnalytic,
+    Rank,
+    RatioToReport,
+    SampleStandardDeviation as SampleStandardDeviationAnalytic,
+    SampleVariance as SampleVarianceAnalytic,
+    Sum as SumAnalytic,
+)
 from vtlengine.Operators.Boolean import And, Not, Or, Xor
-from vtlengine.Operators.Clause import
-
-
+from vtlengine.Operators.Clause import (
+    Aggregate,
+    Calc,
+    Drop,
+    Filter,
+    Keep,
+    Pivot,
+    Rename,
+    Sub,
+    Unpivot,
+)
+from vtlengine.Operators.Comparison import (
+    Equal,
+    Greater,
+    GreaterEqual,
+    In,
+    IsNull,
+    Less,
+    LessEqual,
+    NotEqual,
+    NotIn,
+    Match,
+)
 from vtlengine.Operators.Conditional import Nvl
 from vtlengine.Operators.General import Alias, Membership
-from vtlengine.Operators.HROperators import
-
+from vtlengine.Operators.HROperators import (
+    HREqual,
+    HRGreater,
+    HRGreaterEqual,
+    HRLess,
+    HRLessEqual,
+    HRBinPlus,
+    HRBinMinus,
+    HRUnPlus,
+    HRUnMinus,
+)
 from vtlengine.Operators.Join import Apply, CrossJoin, FullJoin, InnerJoin, LeftJoin
-from vtlengine.Operators.Numeric import
-
+from vtlengine.Operators.Numeric import (
+    AbsoluteValue,
+    BinMinus,
+    BinPlus,
+    Ceil,
+    Div,
+    Exponential,
+    Floor,
+    Logarithm,
+    Modulo,
+    Mult,
+    NaturalLogarithm,
+    Power,
+    Round,
+    SquareRoot,
+    Trunc,
+    UnMinus,
+    UnPlus,
+)
 from vtlengine.Operators.RoleSetter import Attribute, Identifier, Measure
 from vtlengine.Operators.Set import Intersection, Setdiff, Symdiff, Union
-from vtlengine.Operators.String import
-
-
+from vtlengine.Operators.String import (
+    Concatenate,
+    Length,
+    Lower,
+    Ltrim,
+    Replace,
+    Rtrim,
+    Substr,
+    Trim,
+    Upper,
+)
+from vtlengine.Operators.Time import (
+    Flow_to_stock,
+    Period_indicator,
+    Stock_to_flow,
+    Fill_time_series,
+    Time_Shift,
+)
 
-from vtlengine.AST.Grammar.tokens import
+from vtlengine.AST.Grammar.tokens import (
+    MEMBERSHIP,
+    AND,
+    OR,
+    XOR,
+    EQ,
+    NEQ,
+    GT,
+    GTE,
+    LT,
+    LTE,
+    IN,
+    NOT_IN,
+    NVL,
+    PLUS,
+    MINUS,
+    MULT,
+    LOG,
+    MOD,
+    POWER,
+    DIV,
+    AS,
+    CONCAT,
+    TIMESHIFT,
+    CHARSET_MATCH,
+    NOT,
+    ABS,
+    EXP,
+    LN,
+    SQRT,
+    CEIL,
+    FLOOR,
+    ISNULL,
+    PERIOD_INDICATOR,
+    LEN,
+    LCASE,
+    LTRIM,
+    RTRIM,
+    TRIM,
+    UCASE,
+    FLOW_TO_STOCK,
+    STOCK_TO_FLOW,
+    ROUND,
+    TRUNC,
+    SUBSTR,
+    REPLACE,
+    FILL_TIME_SERIES,
+    IDENTIFIER,
+    ATTRIBUTE,
+    MEASURE,
+    CALC,
+    FILTER,
+    KEEP,
+    DROP,
+    RENAME,
+    PIVOT,
+    UNPIVOT,
+    SUBSPACE,
+    AGGREGATE,
+    APPLY,
+    UNION,
+    INTERSECT,
+    SYMDIFF,
+    SETDIFF,
+    MAX,
+    MIN,
+    SUM,
+    COUNT,
+    AVG,
+    MEDIAN,
+    STDDEV_POP,
+    STDDEV_SAMP,
+    VAR_POP,
+    VAR_SAMP,
+    LAG,
+    LEAD,
+    FIRST_VALUE,
+    LAST_VALUE,
+    RATIO_TO_REPORT,
+    RANK,
+    INNER_JOIN,
+    LEFT_JOIN,
+    FULL_JOIN,
+    CROSS_JOIN,
+)
 
 BINARY_MAPPING = {
     # General
@@ -60,7 +224,7 @@ BINARY_MAPPING = {
     CONCAT: Concatenate,
     # Time
     TIMESHIFT: Time_Shift,
-    CHARSET_MATCH: Match
+    CHARSET_MATCH: Match,
 }
 
 UNARY_MAPPING = {
@@ -88,7 +252,7 @@ UNARY_MAPPING = {
     # Time
     PERIOD_INDICATOR: Period_indicator,
     FLOW_TO_STOCK: Flow_to_stock,
-    STOCK_TO_FLOW: Stock_to_flow
+    STOCK_TO_FLOW: Stock_to_flow,
 }
 
 PARAMETRIC_MAPPING = {
@@ -118,15 +282,10 @@ REGULAR_AGGREGATION_MAPPING = {
     UNPIVOT: Unpivot,
     SUBSPACE: Sub,
     AGGREGATE: Aggregate,
-    APPLY: Apply
+    APPLY: Apply,
 }
 
-SET_MAPPING = {
-    UNION: Union,
-    INTERSECT: Intersection,
-    SYMDIFF: Symdiff,
-    SETDIFF: Setdiff
-}
+SET_MAPPING = {UNION: Union, INTERSECT: Intersection, SYMDIFF: Symdiff, SETDIFF: Setdiff}
 
 AGGREGATION_MAPPING = {
     MAX: Max,
@@ -139,7 +298,6 @@ AGGREGATION_MAPPING = {
     STDDEV_SAMP: SampleStandardDeviation,
     VAR_POP: PopulationVariance,
     VAR_SAMP: SampleVariance,
-
 }
 
 ANALYTIC_MAPPING = {
@@ -158,18 +316,15 @@ ANALYTIC_MAPPING = {
     FIRST_VALUE: FirstValue,
     LAST_VALUE: LastValue,
     RATIO_TO_REPORT: RatioToReport,
-    RANK: Rank
+    RANK: Rank,
 }
 
-THEN_ELSE = {
-    'then': 'T',
-    'else': 'E'
-}
+THEN_ELSE = {"then": "T", "else": "E"}
 JOIN_MAPPING = {
     INNER_JOIN: InnerJoin,
     LEFT_JOIN: LeftJoin,
     FULL_JOIN: FullJoin,
-    CROSS_JOIN: CrossJoin
+    CROSS_JOIN: CrossJoin,
 }
 
 HR_COMP_MAPPING = {
@@ -190,7 +345,7 @@ HR_NUM_BINARY_MAPPING = {
 HR_UNARY_MAPPING = {
     # Numeric
     PLUS: HRUnPlus,
-    MINUS: HRUnMinus
+    MINUS: HRUnMinus,
 }
 
 HA_COMP_MAPPING = {
@@ -211,5 +366,5 @@ HA_NUM_BINARY_MAPPING = {
 HA_UNARY_MAPPING = {
     # Numeric
     PLUS: HRUnPlus,
-    MINUS: HRUnMinus
+    MINUS: HRUnMinus,
 }
vtlengine/__init__.py
CHANGED
@@ -1,16 +1,25 @@
 from pathlib import Path
-
+from typing import Optional, Union
+
+import pandas as pd
 
 from vtlengine.Model import Dataset
-from vtlengine.files.output._time_period_representation import
-    format_time_period_external_representation,
+from vtlengine.files.output._time_period_representation import (
+    format_time_period_external_representation,
+    TimePeriodRepresentation,
+)
+
 
+def save_datapoints(
+    time_period_representation: Optional[TimePeriodRepresentation],
+    dataset: Dataset,
+    output_path: Union[str, Path],
+) -> None:
 
-
-
+    if dataset.data is None:
+        dataset.data = pd.DataFrame()
     if time_period_representation is not None:
         format_time_period_external_representation(dataset, time_period_representation)
-
     if isinstance(output_path, str):
         if output_path.endswith("/"):
             s3_file_output = output_path + f"{dataset.name}.csv"
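The new `save_datapoints` helper takes an optional time-period representation, backfills an empty DataFrame when `dataset.data` is None, and writes the result to `output_path` (string paths ending in "/" are treated as S3-style prefixes). A usage sketch; the `Dataset` constructor arguments are assumptions inferred from the attributes this diff touches (`name`, `components`, `data`):

# Usage sketch for the new save_datapoints signature; Dataset
# construction details are assumed, not taken from the library docs.
from pathlib import Path

import pandas as pd

from vtlengine import save_datapoints
from vtlengine.files.output._time_period_representation import TimePeriodRepresentation
from vtlengine.Model import Dataset

ds = Dataset(name="DS_1", components={}, data=pd.DataFrame())  # hypothetical args
save_datapoints(
    time_period_representation=TimePeriodRepresentation.check_value("vtl"),
    dataset=ds,
    output_path=Path("output"),
)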
vtlengine/files/output/_time_period_representation.py
CHANGED
@@ -8,23 +8,24 @@ from vtlengine.Model import Dataset, Scalar
 
 class TimePeriodRepresentation(Enum):
     # Time Period output format
-    SDMX_GREGORIAN =
-    SDMX_REPORTING =
-    VTL =
+    SDMX_GREGORIAN = "sdmx_gregorian"
+    SDMX_REPORTING = "sdmx_reporting"
+    VTL = "vtl"
 
     @classmethod
-    def check_value(cls, value: str):
+    def check_value(cls, value: str) -> "TimePeriodRepresentation":
         if value not in cls._value2member_map_:
             raise Exception("Invalid Time Period Representation")
         return cls(value)
 
 
-def _format_vtl_representation(value: str):
+def _format_vtl_representation(value: str) -> str:
     return TimePeriodHandler(value).vtl_representation()
 
 
-def format_time_period_external_representation(
-
+def format_time_period_external_representation(
+    dataset: Dataset | Scalar, mode: TimePeriodRepresentation
+) -> None:
     """
     From SDMX time period representation to standard VTL representation (no hyphen).
     'A': 'nothing to do',
@@ -48,7 +49,7 @@ def format_time_period_external_representation(dataset: Dataset | Scalar,
     for comp in dataset.components.values():
         if comp.data_type == TimePeriod:
             dataset.data[comp.name] = dataset.data[comp.name].map(
-                _format_vtl_representation,
-
+                _format_vtl_representation, na_action="ignore"
+            )
 
     return
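Two behavioural details in this file are easy to miss among the annotation changes: `check_value` returns the enum member rather than the raw string, and the `.map(...)` call now passes `na_action="ignore"`, which leaves missing values untouched instead of handing them to the formatter. A small self-contained illustration of both:

# check_value validates and converts; na_action="ignore" keeps missing
# values missing instead of feeding them to the mapped function.
import pandas as pd

from vtlengine.files.output._time_period_representation import TimePeriodRepresentation

mode = TimePeriodRepresentation.check_value("sdmx_reporting")
assert mode is TimePeriodRepresentation.SDMX_REPORTING

s = pd.Series(["2001-q1", None])
print(s.map(str.upper, na_action="ignore"))  # None passes through untouched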
vtlengine/files/parser/__init__.py
CHANGED
@@ -1,12 +1,22 @@
+import warnings
 from csv import DictReader
 from pathlib import Path
-
-from typing import Optional, Dict, Union
+
+from typing import Optional, Dict, Union, Any, Type, List
 
 import numpy as np
 import pandas as pd
-from vtlengine.DataTypes import
-
+from vtlengine.DataTypes import (
+    Date,
+    TimePeriod,
+    TimeInterval,
+    Integer,
+    Number,
+    Boolean,
+    Duration,
+    SCALAR_TYPES_CLASS_REVERSE,
+    ScalarType,
+)
 from vtlengine.DataTypes.TimeHandling import DURATION_MAPPING
 from vtlengine.files.parser._rfc_dialect import register_rfc
 from vtlengine.files.parser._time_checking import check_date, check_time_period, check_time
@@ -14,14 +24,14 @@ from vtlengine.files.parser._time_checking import check_date, check_time_period,
 from vtlengine.Exceptions import InputValidationException, SemanticError
 from vtlengine.Model import Component, Role, Dataset
 
-TIME_CHECKS_MAPPING = {
+TIME_CHECKS_MAPPING: Dict[Type[ScalarType], Any] = {
     Date: check_date,
     TimePeriod: check_time_period,
-    TimeInterval: check_time
+    TimeInterval: check_time,
 }
 
 
-def _validate_csv_path(components: Dict[str, Component], csv_path: Path):
+def _validate_csv_path(components: Dict[str, Component], csv_path: Path) -> None:
     # GE1 check if the file is empty
     if not csv_path.exists():
         raise Exception(f"Path {csv_path} does not exist.")
@@ -29,8 +39,8 @@ def _validate_csv_path(components: Dict[str, Component], csv_path: Path):
         raise Exception(f"Path {csv_path} is not a file.")
     register_rfc()
     try:
-        with open(csv_path,
-            reader = DictReader(f, dialect=
+        with open(csv_path, "r") as f:
+            reader = DictReader(f, dialect="rfc")
         csv_columns = reader.fieldnames
 
     except UnicodeDecodeError as error:
@@ -45,21 +55,24 @@ def _validate_csv_path(components: Dict[str, Component], csv_path: Path):
         ) from None
 
     if not csv_columns:
-        raise InputValidationException(code=
+        raise InputValidationException(code="0-1-1-6", file=csv_path)
 
     if len(list(set(csv_columns))) != len(csv_columns):
         duplicates = list(set([item for item in csv_columns if csv_columns.count(item) > 1]))
         raise Exception(f"Duplicated columns {', '.join(duplicates)} found in file.")
 
     comp_names = set([c.name for c in components.values() if c.role == Role.IDENTIFIER])
-    comps_missing
+    comps_missing: Union[str, List[str]] = (
+        [id_m for id_m in comp_names if id_m not in reader.fieldnames] if reader.fieldnames else []
+    )
     if comps_missing:
         comps_missing = ", ".join(comps_missing)
-        raise InputValidationException(code=
+        raise InputValidationException(code="0-1-1-8", ids=comps_missing, file=str(csv_path.name))
 
 
-def _sanitize_pandas_columns(
-
+def _sanitize_pandas_columns(
+    components: Dict[str, Component], csv_path: Union[str, Path], data: pd.DataFrame
+) -> pd.DataFrame:
     # Fast loading from SDMX-CSV
     if "DATAFLOW" in data.columns and data.columns[0] == "DATAFLOW":
         if "DATAFLOW" not in components:
@@ -75,11 +88,11 @@ def _sanitize_pandas_columns(components: Dict[str, Component],
 
     # Validate identifiers
     comp_names = set([c.name for c in components.values() if c.role == Role.IDENTIFIER])
-    comps_missing = [id_m for id_m in comp_names if id_m not in data.columns]
+    comps_missing: Union[str, List[str]] = [id_m for id_m in comp_names if id_m not in data.columns]
     if comps_missing:
         comps_missing = ", ".join(comps_missing)
         file = csv_path if isinstance(csv_path, str) else csv_path.name
-        raise InputValidationException(code=
+        raise InputValidationException(code="0-1-1-7", ids=comps_missing, file=file)
 
     # Fill rest of components with null values
     for comp_name, comp in components.items():
@@ -94,47 +107,52 @@ def _pandas_load_csv(components: Dict[str, Component], csv_path: Path) -> pd.DataFrame:
     obj_dtypes = {comp_name: np.object_ for comp_name, comp in components.items()}
 
     try:
-        data = pd.read_csv(
-
-
-
-    except UnicodeDecodeError as error:
+        data = pd.read_csv(
+            csv_path, dtype=obj_dtypes, engine="c", keep_default_na=False, na_values=[""]
+        )
+    except UnicodeDecodeError:
         raise InputValidationException(code="0-1-2-5", file=csv_path.name)
 
     return _sanitize_pandas_columns(components, csv_path, data)
 
+
 def _pandas_load_s3_csv(components: Dict[str, Component], csv_path: str) -> pd.DataFrame:
     obj_dtypes = {comp_name: np.object_ for comp_name, comp in components.items()}
 
     # start = time()
     try:
-        data = pd.read_csv(
-
-
-            na_values=[''])
+        data = pd.read_csv(
+            csv_path, dtype=obj_dtypes, engine="c", keep_default_na=False, na_values=[""]
+        )
 
-    except UnicodeDecodeError
+    except UnicodeDecodeError:
         raise InputValidationException(code="0-1-2-5", file=csv_path)
     except Exception as e:
         raise InputValidationException(f"ERROR: {str(e)}, review file {str(csv_path)}")
-
-    # print(f"Data loaded from {csv_path}, shape: {data.shape}")
-    # end = time()
-    # print(f"Time to load data from s3 URI: {end - start}")
-
     return _sanitize_pandas_columns(components, csv_path, data)
 
-
+
+def _parse_boolean(value: str) -> bool:
     if value.lower() == "true" or value == "1":
         return True
     return False
 
 
-def _validate_pandas(
-
+def _validate_pandas(
+    components: Dict[str, Component], data: pd.DataFrame, dataset_name: str
+) -> pd.DataFrame:
+    warnings.filterwarnings("ignore", category=FutureWarning)
     # Identifier checking
+
     id_names = [comp_name for comp_name, comp in components.items() if comp.role == Role.IDENTIFIER]
 
+    missing_columns = [name for name in components.keys() if name not in data.columns.tolist()]
+    if missing_columns:
+        for name in missing_columns:
+            if components[name].nullable is False:
+                raise SemanticError("0-1-1-10", name=dataset_name, comp_name=name)
+            data[name] = None
+
     for id_name in id_names:
         if data[id_name].isnull().any():
             raise SemanticError("0-1-1-4", null_identifier=id_name, name=dataset_name)
@@ -150,35 +168,42 @@ def _validate_pandas(components: Dict[str, Component], data: pd.DataFrame,
 
     for comp_name, comp in components.items():
         if comp.data_type in (Date, TimePeriod, TimeInterval):
-            data[comp_name] = data[comp_name].map(
-
+            data[comp_name] = data[comp_name].map(
+                TIME_CHECKS_MAPPING[comp.data_type], na_action="ignore"
+            )
         elif comp.data_type == Integer:
-            data[comp_name] = data[comp_name].map(
-
+            data[comp_name] = data[comp_name].map(
+                lambda x: Integer.cast(float(x)), na_action="ignore"
+            )
         elif comp.data_type == Number:
-            data[comp_name] = data[comp_name].map(lambda x: float(x), na_action=
+            data[comp_name] = data[comp_name].map(lambda x: float(x), na_action="ignore")
         elif comp.data_type == Boolean:
-            data[comp_name] = data[comp_name].map(
-
+            data[comp_name] = data[comp_name].map(
+                lambda x: _parse_boolean(x), na_action="ignore"
+            )
         elif comp.data_type == Duration:
-            values_correct =
-
+            values_correct = (
+                data[comp_name]
+                .map(lambda x: x.replace(" ", "") in DURATION_MAPPING, na_action="ignore")
+                .all()
+            )
             if not values_correct:
                 raise ValueError(f"Duration values are not correct in column {comp_name}")
         else:
-            data[comp_name] = data[comp_name].map(
-
-
-
-
+            data[comp_name] = data[comp_name].map(
+                lambda x: str(x).replace('"', ""), na_action="ignore"
+            )
+            data[comp_name] = data[comp_name].astype(np.object_, errors="raise")
+        except ValueError:
+            str_comp = SCALAR_TYPES_CLASS_REVERSE[comp.data_type] if comp else "Null"
             raise SemanticError("0-1-1-12", name=dataset_name, column=comp_name, type=str_comp)
 
     return data
 
 
-def load_datapoints(
-
-
+def load_datapoints(
+    components: Dict[str, Component], dataset_name: str, csv_path: Optional[Union[Path, str]] = None
+) -> pd.DataFrame:
     if csv_path is None or (isinstance(csv_path, Path) and not csv_path.exists()):
         return pd.DataFrame(columns=list(components.keys()))
     elif isinstance(csv_path, str):
@@ -193,5 +218,5 @@ def load_datapoints(components: Dict[str, Component],
     return data
 
 
-def _fill_dataset_empty_data(dataset: Dataset):
+def _fill_dataset_empty_data(dataset: Dataset) -> None:
     dataset.data = pd.DataFrame(columns=list(dataset.components.keys()))
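The `_validate_pandas` rewrite applies one pattern throughout: load every column as `object` dtype, then coerce it per declared scalar type with `Series.map(..., na_action="ignore")` so nulls survive the cast. A standalone sketch of that pattern; the sample column and `parse_boolean` helper are illustrative stand-ins for the library's `_parse_boolean`:

# Standalone illustration of the per-type coercion pattern used in
# _validate_pandas: parse values while na_action="ignore" skips nulls.
import numpy as np
import pandas as pd


def parse_boolean(value: str) -> bool:
    return value.lower() == "true" or value == "1"


raw = pd.Series(["1", "true", None, "false"], dtype=np.object_)
parsed = raw.map(parse_boolean, na_action="ignore")
print(parsed.tolist())  # [True, True, None, False]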
vtlengine/files/parser/_rfc_dialect.py
CHANGED
@@ -6,9 +6,10 @@ class RFCDialect(csv.Dialect):
     https://docs.python.org/3/library/csv.html#csv.Dialect
     https://tools.ietf.org/html/rfc4180
     """
-
+
+    delimiter = ","
     doublequote = True
-    lineterminator =
+    lineterminator = "\r\n"
     quotechar = '"'
     quoting = csv.QUOTE_MINIMAL
     strict = True
@@ -16,6 +17,6 @@ class RFCDialect(csv.Dialect):
     skipinitialspace = False
 
 
-def register_rfc():
-    """
-    csv.register_dialect(
+def register_rfc() -> None:
+    """Register the RFC dialect."""
+    csv.register_dialect("rfc", RFCDialect)