vtlengine 1.0-py3-none-any.whl → 1.0.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of vtlengine might be problematic.
- vtlengine/API/_InternalApi.py +159 -102
- vtlengine/API/__init__.py +110 -68
- vtlengine/AST/ASTConstructor.py +188 -98
- vtlengine/AST/ASTConstructorModules/Expr.py +402 -205
- vtlengine/AST/ASTConstructorModules/ExprComponents.py +248 -104
- vtlengine/AST/ASTConstructorModules/Terminals.py +158 -95
- vtlengine/AST/ASTEncoders.py +1 -1
- vtlengine/AST/ASTTemplate.py +24 -9
- vtlengine/AST/ASTVisitor.py +8 -12
- vtlengine/AST/DAG/__init__.py +43 -35
- vtlengine/AST/DAG/_words.py +4 -4
- vtlengine/AST/Grammar/Vtl.g4 +49 -20
- vtlengine/AST/Grammar/VtlTokens.g4 +13 -1
- vtlengine/AST/Grammar/lexer.py +2012 -1312
- vtlengine/AST/Grammar/parser.py +7524 -4343
- vtlengine/AST/Grammar/tokens.py +140 -128
- vtlengine/AST/VtlVisitor.py +16 -5
- vtlengine/AST/__init__.py +41 -11
- vtlengine/DataTypes/NumericTypesHandling.py +5 -4
- vtlengine/DataTypes/TimeHandling.py +196 -301
- vtlengine/DataTypes/__init__.py +304 -218
- vtlengine/Exceptions/__init__.py +96 -27
- vtlengine/Exceptions/messages.py +149 -69
- vtlengine/Interpreter/__init__.py +817 -497
- vtlengine/Model/__init__.py +172 -121
- vtlengine/Operators/Aggregation.py +156 -95
- vtlengine/Operators/Analytic.py +167 -79
- vtlengine/Operators/Assignment.py +7 -4
- vtlengine/Operators/Boolean.py +27 -32
- vtlengine/Operators/CastOperator.py +177 -131
- vtlengine/Operators/Clause.py +137 -99
- vtlengine/Operators/Comparison.py +148 -117
- vtlengine/Operators/Conditional.py +290 -98
- vtlengine/Operators/General.py +68 -47
- vtlengine/Operators/HROperators.py +91 -72
- vtlengine/Operators/Join.py +217 -118
- vtlengine/Operators/Numeric.py +129 -46
- vtlengine/Operators/RoleSetter.py +16 -15
- vtlengine/Operators/Set.py +61 -36
- vtlengine/Operators/String.py +213 -139
- vtlengine/Operators/Time.py +467 -215
- vtlengine/Operators/Validation.py +117 -76
- vtlengine/Operators/__init__.py +340 -213
- vtlengine/Utils/__init__.py +232 -41
- vtlengine/__init__.py +1 -1
- vtlengine/files/output/__init__.py +15 -6
- vtlengine/files/output/_time_period_representation.py +10 -9
- vtlengine/files/parser/__init__.py +79 -52
- vtlengine/files/parser/_rfc_dialect.py +6 -5
- vtlengine/files/parser/_time_checking.py +48 -37
- vtlengine-1.0.2.dist-info/METADATA +245 -0
- vtlengine-1.0.2.dist-info/RECORD +58 -0
- {vtlengine-1.0.dist-info → vtlengine-1.0.2.dist-info}/WHEEL +1 -1
- vtlengine-1.0.dist-info/METADATA +0 -104
- vtlengine-1.0.dist-info/RECORD +0 -58
- {vtlengine-1.0.dist-info → vtlengine-1.0.2.dist-info}/LICENSE.md +0 -0
vtlengine/Utils/__init__.py
CHANGED
@@ -1,34 +1,222 @@
-from
+from typing import Any, Dict
+
+from vtlengine.Operators.Aggregation import (
+    Avg,
+    Count,
+    Max,
+    Median,
+    Min,
+    PopulationStandardDeviation,
+    PopulationVariance,
+    SampleStandardDeviation,
+    SampleVariance,
+    Sum,
+)
+from vtlengine.Operators.Analytic import (
+    Avg as AvgAnalytic,
+    Count as CountAnalytic,
+    FirstValue,
+    Lag,
+    LastValue,
+    Lead,
+    Max as MaxAnalytic,
+    Median as MedianAnalytic,
+    Min as MinAnalytic,
+    PopulationStandardDeviation as PopulationStandardDeviationAnalytic,
+    PopulationVariance as PopulationVarianceAnalytic,
+    Rank,
+    RatioToReport,
+    SampleStandardDeviation as SampleStandardDeviationAnalytic,
+    SampleVariance as SampleVarianceAnalytic,
+    Sum as SumAnalytic,
+)
 from vtlengine.Operators.Boolean import And, Not, Or, Xor
-from vtlengine.Operators.Clause import
+from vtlengine.Operators.Clause import (
+    Aggregate,
+    Calc,
+    Drop,
+    Filter,
+    Keep,
+    Pivot,
+    Rename,
+    Sub,
+    Unpivot,
+)
+from vtlengine.Operators.Comparison import (
+    Equal,
+    Greater,
+    GreaterEqual,
+    In,
+    IsNull,
+    Less,
+    LessEqual,
+    NotEqual,
+    NotIn,
+    Match,
+)
 from vtlengine.Operators.Conditional import Nvl
 from vtlengine.Operators.General import Alias, Membership
-from vtlengine.Operators.HROperators import
+from vtlengine.Operators.HROperators import (
+    HREqual,
+    HRGreater,
+    HRGreaterEqual,
+    HRLess,
+    HRLessEqual,
+    HRBinPlus,
+    HRBinMinus,
+    HRUnPlus,
+    HRUnMinus,
+)
 from vtlengine.Operators.Join import Apply, CrossJoin, FullJoin, InnerJoin, LeftJoin
-from vtlengine.Operators.Numeric import
+from vtlengine.Operators.Numeric import (
+    AbsoluteValue,
+    BinMinus,
+    BinPlus,
+    Ceil,
+    Div,
+    Exponential,
+    Floor,
+    Logarithm,
+    Modulo,
+    Mult,
+    NaturalLogarithm,
+    Power,
+    Round,
+    SquareRoot,
+    Trunc,
+    UnMinus,
+    UnPlus,
+    Random,
+)
 from vtlengine.Operators.RoleSetter import Attribute, Identifier, Measure
 from vtlengine.Operators.Set import Intersection, Setdiff, Symdiff, Union
-from vtlengine.Operators.String import
+from vtlengine.Operators.String import (
+    Concatenate,
+    Length,
+    Lower,
+    Ltrim,
+    Replace,
+    Rtrim,
+    Substr,
+    Trim,
+    Upper,
+)
+from vtlengine.Operators.Time import (
+    Flow_to_stock,
+    Period_indicator,
+    Stock_to_flow,
+    Fill_time_series,
+    Time_Shift,
+    Year,
+    Month,
+    Day_of_Month,
+    Day_of_Year,
+    Day_to_Year,
+    Day_to_Month,
+    Year_to_Day,
+    Month_to_Day,
+    Date_Diff,
+    Date_Add,
+)
 
-from vtlengine.AST.Grammar.tokens import
+from vtlengine.AST.Grammar.tokens import (
+    MEMBERSHIP,
+    AND,
+    OR,
+    XOR,
+    EQ,
+    NEQ,
+    GT,
+    GTE,
+    LT,
+    LTE,
+    IN,
+    NOT_IN,
+    NVL,
+    PLUS,
+    MINUS,
+    MULT,
+    LOG,
+    MOD,
+    POWER,
+    DIV,
+    AS,
+    CONCAT,
+    TIMESHIFT,
+    CHARSET_MATCH,
+    NOT,
+    ABS,
+    EXP,
+    LN,
+    SQRT,
+    CEIL,
+    FLOOR,
+    ISNULL,
+    PERIOD_INDICATOR,
+    LEN,
+    LCASE,
+    LTRIM,
+    RTRIM,
+    TRIM,
+    UCASE,
+    FLOW_TO_STOCK,
+    STOCK_TO_FLOW,
+    ROUND,
+    TRUNC,
+    SUBSTR,
+    REPLACE,
+    FILL_TIME_SERIES,
+    IDENTIFIER,
+    ATTRIBUTE,
+    MEASURE,
+    CALC,
+    FILTER,
+    KEEP,
+    DROP,
+    RENAME,
+    PIVOT,
+    UNPIVOT,
+    SUBSPACE,
+    AGGREGATE,
+    APPLY,
+    UNION,
+    INTERSECT,
+    SYMDIFF,
+    SETDIFF,
+    MAX,
+    MIN,
+    SUM,
+    COUNT,
+    AVG,
+    MEDIAN,
+    STDDEV_POP,
+    STDDEV_SAMP,
+    VAR_POP,
+    VAR_SAMP,
+    LAG,
+    LEAD,
+    FIRST_VALUE,
+    LAST_VALUE,
+    RATIO_TO_REPORT,
+    RANK,
+    INNER_JOIN,
+    LEFT_JOIN,
+    FULL_JOIN,
+    CROSS_JOIN,
+    RANDOM,
+    DAYOFYEAR,
+    DAYOFMONTH,
+    MONTH,
+    YEAR,
+    DAYTOYEAR,
+    DAYTOMONTH,
+    YEARTODAY,
+    MONTHTODAY,
+    DATE_DIFF,
+    DATE_ADD,
+)
 
-BINARY_MAPPING = {
+BINARY_MAPPING: Dict[Any, Any] = {
     # General
     MEMBERSHIP: Membership,
     # Boolean
@@ -54,13 +242,15 @@ BINARY_MAPPING = {
     MOD: Modulo,
     POWER: Power,
     DIV: Div,
+    RANDOM: Random,
     # General
     AS: Alias,
     # String
     CONCAT: Concatenate,
     # Time
     TIMESHIFT: Time_Shift,
-    CHARSET_MATCH: Match
+    CHARSET_MATCH: Match,
+    DATE_DIFF: Date_Diff,
 }
 
 UNARY_MAPPING = {
@@ -88,7 +278,15 @@ UNARY_MAPPING = {
     # Time
     PERIOD_INDICATOR: Period_indicator,
     FLOW_TO_STOCK: Flow_to_stock,
-    STOCK_TO_FLOW: Stock_to_flow
+    STOCK_TO_FLOW: Stock_to_flow,
+    YEAR: Year,
+    MONTH: Month,
+    DAYOFMONTH: Day_of_Month,
+    DAYOFYEAR: Day_of_Year,
+    DAYTOYEAR: Day_to_Year,
+    DAYTOMONTH: Day_to_Month,
+    YEARTODAY: Year_to_Day,
+    MONTHTODAY: Month_to_Day,
 }
 
 PARAMETRIC_MAPPING = {
@@ -100,6 +298,7 @@ PARAMETRIC_MAPPING = {
     REPLACE: Replace,
     # Time
     FILL_TIME_SERIES: Fill_time_series,
+    DATE_ADD: Date_Add,
 }
 
 ROLE_SETTER_MAPPING = {
@@ -118,15 +317,10 @@ REGULAR_AGGREGATION_MAPPING = {
     UNPIVOT: Unpivot,
     SUBSPACE: Sub,
     AGGREGATE: Aggregate,
-    APPLY: Apply
+    APPLY: Apply,
 }
 
-SET_MAPPING = {
-    UNION: Union,
-    INTERSECT: Intersection,
-    SYMDIFF: Symdiff,
-    SETDIFF: Setdiff
-}
+SET_MAPPING = {UNION: Union, INTERSECT: Intersection, SYMDIFF: Symdiff, SETDIFF: Setdiff}
 
 AGGREGATION_MAPPING = {
     MAX: Max,
@@ -139,7 +333,6 @@ AGGREGATION_MAPPING = {
     STDDEV_SAMP: SampleStandardDeviation,
     VAR_POP: PopulationVariance,
     VAR_SAMP: SampleVariance,
-
 }
 
 ANALYTIC_MAPPING = {
@@ -158,18 +351,16 @@ ANALYTIC_MAPPING = {
     FIRST_VALUE: FirstValue,
     LAST_VALUE: LastValue,
     RATIO_TO_REPORT: RatioToReport,
-    RANK: Rank
+    RANK: Rank,
 }
 
-THEN_ELSE = {
-    'then': 'T',
-    'else': 'E'
-}
+THEN_ELSE = {"then": "T", "else": "E"}
+
 JOIN_MAPPING = {
     INNER_JOIN: InnerJoin,
     LEFT_JOIN: LeftJoin,
     FULL_JOIN: FullJoin,
-    CROSS_JOIN: CrossJoin
+    CROSS_JOIN: CrossJoin,
 }
 
 HR_COMP_MAPPING = {
@@ -190,7 +381,7 @@ HR_NUM_BINARY_MAPPING = {
 HR_UNARY_MAPPING = {
     # Numeric
     PLUS: HRUnPlus,
-    MINUS: HRUnMinus
+    MINUS: HRUnMinus,
 }
 
 HA_COMP_MAPPING = {
@@ -211,5 +402,5 @@ HA_NUM_BINARY_MAPPING = {
 HA_UNARY_MAPPING = {
     # Numeric
     PLUS: HRUnPlus,
-    MINUS: HRUnMinus
+    MINUS: HRUnMinus,
 }
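These module-level tables drive the interpreter's token dispatch: each VTL grammar token is bound to the operator class that implements it, and 1.0.2 extends them with RANDOM and DATE_DIFF in BINARY_MAPPING, the date-part extractors (YEAR, MONTH, DAYOFMONTH, and the rest) in UNARY_MAPPING, and DATE_ADD in PARAMETRIC_MAPPING. A minimal sketch of the lookup pattern such tables enable follows; the dispatch() helper and the Sum stub are hypothetical stand-ins, not vtlengine's API.

# Hypothetical token-to-operator dispatch; only the table idea comes from
# the diff above, every name below is a local stand-in.
from typing import Any, Dict, List

class Sum:
    @classmethod
    def evaluate(cls, operand: List[float]) -> float:
        return sum(operand)

AGGREGATION_MAPPING: Dict[str, Any] = {"sum": Sum}

def dispatch(token: str, operand: List[float]) -> float:
    # Look the grammar token up and delegate to the bound operator class.
    if token not in AGGREGATION_MAPPING:
        raise NotImplementedError(f"no operator bound to token {token!r}")
    return AGGREGATION_MAPPING[token].evaluate(operand)

print(dispatch("sum", [1.0, 2.0, 3.0]))  # 6.0

Keeping the tables as plain dicts means supporting a new operator, as this release does for date arithmetic, is a one-line change per token.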
vtlengine/__init__.py
CHANGED
@@ -1,16 +1,25 @@
 from pathlib import Path
-
+from typing import Optional, Union
+
+import pandas as pd
 
 from vtlengine.Model import Dataset
-from vtlengine.files.output._time_period_representation import
-    format_time_period_external_representation,
+from vtlengine.files.output._time_period_representation import (
+    format_time_period_external_representation,
+    TimePeriodRepresentation,
+)
+
 
+def save_datapoints(
+    time_period_representation: Optional[TimePeriodRepresentation],
+    dataset: Dataset,
+    output_path: Union[str, Path],
+) -> None:
 
-
-
+    if dataset.data is None:
+        dataset.data = pd.DataFrame()
     if time_period_representation is not None:
         format_time_period_external_representation(dataset, time_period_representation)
-
     if isinstance(output_path, str):
         if output_path.endswith("/"):
             s3_file_output = output_path + f"{dataset.name}.csv"
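The reworked save_datapoints above also gains a None guard: a Dataset whose data payload is missing is written as an empty frame instead of failing. A stand-alone sketch of that guard, using a hypothetical _Dataset class in place of vtlengine.Model.Dataset:

import pandas as pd

class _Dataset:
    # Hypothetical stand-in for vtlengine.Model.Dataset, for this sketch only.
    def __init__(self) -> None:
        self.name = "DS_1"
        self.data = None

ds = _Dataset()
if ds.data is None:  # the 1.0.2 guard: never write from a None payload
    ds.data = pd.DataFrame()
print(ds.data.empty)  # True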
vtlengine/files/output/_time_period_representation.py
CHANGED

@@ -8,23 +8,24 @@ from vtlengine.Model import Dataset, Scalar
 
 class TimePeriodRepresentation(Enum):
     # Time Period output format
-    SDMX_GREGORIAN =
-    SDMX_REPORTING =
-    VTL =
+    SDMX_GREGORIAN = "sdmx_gregorian"
+    SDMX_REPORTING = "sdmx_reporting"
+    VTL = "vtl"
 
     @classmethod
-    def check_value(cls, value: str):
+    def check_value(cls, value: str) -> "TimePeriodRepresentation":
         if value not in cls._value2member_map_:
             raise Exception("Invalid Time Period Representation")
         return cls(value)
 
 
-def _format_vtl_representation(value: str):
+def _format_vtl_representation(value: str) -> str:
     return TimePeriodHandler(value).vtl_representation()
 
 
-def format_time_period_external_representation(
+def format_time_period_external_representation(
+    dataset: Dataset | Scalar, mode: TimePeriodRepresentation
+) -> None:
     """
     From SDMX time period representation to standard VTL representation (no hyphen).
     'A': 'nothing to do',
@@ -48,7 +49,7 @@ def format_time_period_external_representation(dataset: Dataset | Scalar,
     for comp in dataset.components.values():
         if comp.data_type == TimePeriod:
             dataset.data[comp.name] = dataset.data[comp.name].map(
-                _format_vtl_representation,
+                _format_vtl_representation, na_action="ignore"
+            )
 
     return
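With the explicit return annotation, check_value reads as a validating constructor: it either returns the matching member or raises. A self-contained replica of the pattern above (a local copy for illustration, not an import from vtlengine):

from enum import Enum

class TimePeriodRepresentation(Enum):
    SDMX_GREGORIAN = "sdmx_gregorian"
    SDMX_REPORTING = "sdmx_reporting"
    VTL = "vtl"

    @classmethod
    def check_value(cls, value: str) -> "TimePeriodRepresentation":
        # _value2member_map_ is Enum's internal value -> member index.
        if value not in cls._value2member_map_:
            raise Exception("Invalid Time Period Representation")
        return cls(value)

print(TimePeriodRepresentation.check_value("vtl"))  # TimePeriodRepresentation.VTL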
vtlengine/files/parser/__init__.py
CHANGED

@@ -1,12 +1,22 @@
+import warnings
 from csv import DictReader
 from pathlib import Path
-
-from typing import Optional, Dict, Union
+
+from typing import Optional, Dict, Union, Any, Type, List
 
 import numpy as np
 import pandas as pd
-from vtlengine.DataTypes import
+from vtlengine.DataTypes import (
+    Date,
+    TimePeriod,
+    TimeInterval,
+    Integer,
+    Number,
+    Boolean,
+    Duration,
+    SCALAR_TYPES_CLASS_REVERSE,
+    ScalarType,
+)
 from vtlengine.DataTypes.TimeHandling import DURATION_MAPPING
 from vtlengine.files.parser._rfc_dialect import register_rfc
 from vtlengine.files.parser._time_checking import check_date, check_time_period, check_time
@@ -14,14 +24,14 @@ from vtlengine.files.parser._time_checking import check_date, check_time_period, check_time
 from vtlengine.Exceptions import InputValidationException, SemanticError
 from vtlengine.Model import Component, Role, Dataset
 
-TIME_CHECKS_MAPPING = {
+TIME_CHECKS_MAPPING: Dict[Type[ScalarType], Any] = {
     Date: check_date,
     TimePeriod: check_time_period,
-    TimeInterval: check_time
+    TimeInterval: check_time,
 }
 
 
-def _validate_csv_path(components: Dict[str, Component], csv_path: Path):
+def _validate_csv_path(components: Dict[str, Component], csv_path: Path) -> None:
     # GE1 check if the file is empty
     if not csv_path.exists():
         raise Exception(f"Path {csv_path} does not exist.")
@@ -29,8 +39,8 @@ def _validate_csv_path(components: Dict[str, Component], csv_path: Path):
         raise Exception(f"Path {csv_path} is not a file.")
     register_rfc()
     try:
-        with open(csv_path,
-            reader = DictReader(f, dialect=
+        with open(csv_path, "r") as f:
+            reader = DictReader(f, dialect="rfc")
             csv_columns = reader.fieldnames
 
     except UnicodeDecodeError as error:
@@ -45,21 +55,24 @@ def _validate_csv_path(components: Dict[str, Component], csv_path: Path):
         ) from None
 
     if not csv_columns:
-        raise InputValidationException(code=
+        raise InputValidationException(code="0-1-1-6", file=csv_path)
 
     if len(list(set(csv_columns))) != len(csv_columns):
         duplicates = list(set([item for item in csv_columns if csv_columns.count(item) > 1]))
         raise Exception(f"Duplicated columns {', '.join(duplicates)} found in file.")
 
     comp_names = set([c.name for c in components.values() if c.role == Role.IDENTIFIER])
-    comps_missing
+    comps_missing: Union[str, List[str]] = (
+        [id_m for id_m in comp_names if id_m not in reader.fieldnames] if reader.fieldnames else []
+    )
     if comps_missing:
         comps_missing = ", ".join(comps_missing)
-        raise InputValidationException(code=
+        raise InputValidationException(code="0-1-1-8", ids=comps_missing, file=str(csv_path.name))
 
 
-def _sanitize_pandas_columns(
+def _sanitize_pandas_columns(
+    components: Dict[str, Component], csv_path: Union[str, Path], data: pd.DataFrame
+) -> pd.DataFrame:
     # Fast loading from SDMX-CSV
     if "DATAFLOW" in data.columns and data.columns[0] == "DATAFLOW":
         if "DATAFLOW" not in components:
@@ -75,11 +88,11 @@ def _sanitize_pandas_columns(components: Dict[str, Component],
 
     # Validate identifiers
     comp_names = set([c.name for c in components.values() if c.role == Role.IDENTIFIER])
-    comps_missing = [id_m for id_m in comp_names if id_m not in data.columns]
+    comps_missing: Union[str, List[str]] = [id_m for id_m in comp_names if id_m not in data.columns]
     if comps_missing:
         comps_missing = ", ".join(comps_missing)
         file = csv_path if isinstance(csv_path, str) else csv_path.name
-        raise InputValidationException(code=
+        raise InputValidationException(code="0-1-1-7", ids=comps_missing, file=file)
 
     # Fill rest of components with null values
     for comp_name, comp in components.items():
@@ -94,47 +107,54 @@ def _pandas_load_csv(components: Dict[str, Component], csv_path: Path) -> pd.DataFrame:
     obj_dtypes = {comp_name: np.object_ for comp_name, comp in components.items()}
 
     try:
-        data = pd.read_csv(
-    except UnicodeDecodeError as error:
+        data = pd.read_csv(
+            csv_path, dtype=obj_dtypes, engine="c", keep_default_na=False, na_values=[""]
+        )
+    except UnicodeDecodeError:
         raise InputValidationException(code="0-1-2-5", file=csv_path.name)
 
     return _sanitize_pandas_columns(components, csv_path, data)
 
+
 def _pandas_load_s3_csv(components: Dict[str, Component], csv_path: str) -> pd.DataFrame:
     obj_dtypes = {comp_name: np.object_ for comp_name, comp in components.items()}
 
     # start = time()
     try:
-        data = pd.read_csv(
-            na_values=[''])
+        data = pd.read_csv(
+            csv_path, dtype=obj_dtypes, engine="c", keep_default_na=False, na_values=[""]
+        )
 
-    except UnicodeDecodeError
+    except UnicodeDecodeError:
         raise InputValidationException(code="0-1-2-5", file=csv_path)
     except Exception as e:
         raise InputValidationException(f"ERROR: {str(e)}, review file {str(csv_path)}")
-
-    # print(f"Data loaded from {csv_path}, shape: {data.shape}")
-    # end = time()
-    # print(f"Time to load data from s3 URI: {end - start}")
-
     return _sanitize_pandas_columns(components, csv_path, data)
 
 
+def _parse_boolean(value: str) -> bool:
+    if isinstance(value, bool):
+        return value
     if value.lower() == "true" or value == "1":
         return True
     return False
 
 
-def _validate_pandas(
+def _validate_pandas(
+    components: Dict[str, Component], data: pd.DataFrame, dataset_name: str
+) -> pd.DataFrame:
+    warnings.filterwarnings("ignore", category=FutureWarning)
     # Identifier checking
+
     id_names = [comp_name for comp_name, comp in components.items() if comp.role == Role.IDENTIFIER]
 
+    missing_columns = [name for name in components.keys() if name not in data.columns.tolist()]
+    if missing_columns:
+        for name in missing_columns:
+            if components[name].nullable is False:
+                raise SemanticError("0-1-1-10", name=dataset_name, comp_name=name)
+            data[name] = None
+
     for id_name in id_names:
         if data[id_name].isnull().any():
             raise SemanticError("0-1-1-4", null_identifier=id_name, name=dataset_name)
@@ -150,35 +170,42 @@ def _validate_pandas(components: Dict[str, Component], data: pd.DataFrame,
 
     for comp_name, comp in components.items():
         if comp.data_type in (Date, TimePeriod, TimeInterval):
-            data[comp_name] = data[comp_name].map(
+            data[comp_name] = data[comp_name].map(
+                TIME_CHECKS_MAPPING[comp.data_type], na_action="ignore"
+            )
         elif comp.data_type == Integer:
-            data[comp_name] = data[comp_name].map(
+            data[comp_name] = data[comp_name].map(
+                lambda x: Integer.cast(float(x)), na_action="ignore"
+            )
         elif comp.data_type == Number:
-            data[comp_name] = data[comp_name].map(lambda x: float(x), na_action=
+            data[comp_name] = data[comp_name].map(lambda x: float(x), na_action="ignore")
         elif comp.data_type == Boolean:
-            data[comp_name] = data[comp_name].map(
+            data[comp_name] = data[comp_name].map(
+                lambda x: _parse_boolean(x), na_action="ignore"
+            )
         elif comp.data_type == Duration:
-            values_correct =
+            values_correct = (
+                data[comp_name]
+                .map(lambda x: x.replace(" ", "") in DURATION_MAPPING, na_action="ignore")
+                .all()
+            )
             if not values_correct:
                 raise ValueError(f"Duration values are not correct in column {comp_name}")
         else:
-            data[comp_name] = data[comp_name].map(
+            data[comp_name] = data[comp_name].map(
+                lambda x: str(x).replace('"', ""), na_action="ignore"
+            )
+            data[comp_name] = data[comp_name].astype(np.object_, errors="raise")
+        except ValueError:
+            str_comp = SCALAR_TYPES_CLASS_REVERSE[comp.data_type] if comp else "Null"
             raise SemanticError("0-1-1-12", name=dataset_name, column=comp_name, type=str_comp)
 
     return data
 
 
-def load_datapoints(
+def load_datapoints(
+    components: Dict[str, Component], dataset_name: str, csv_path: Optional[Union[Path, str]] = None
+) -> pd.DataFrame:
     if csv_path is None or (isinstance(csv_path, Path) and not csv_path.exists()):
         return pd.DataFrame(columns=list(components.keys()))
     elif isinstance(csv_path, str):
@@ -193,5 +220,5 @@ def load_datapoints(components: Dict[str, Component],
     return data
 
 
-def _fill_dataset_empty_data(dataset: Dataset):
+def _fill_dataset_empty_data(dataset: Dataset) -> None:
     dataset.data = pd.DataFrame(columns=list(dataset.components.keys()))
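One idiom recurs through _validate_pandas above: every per-column cast goes through map(..., na_action="ignore"), so null cells bypass the cast and survive as NaN instead of raising. A small pandas sketch of the same idiom, reusing the _parse_boolean logic from the diff:

import numpy as np
import pandas as pd

def _parse_boolean(value) -> bool:
    # Mirrors the parser above: "true"/"1" are truthy, anything else is False.
    if isinstance(value, bool):
        return value
    return value.lower() == "true" or value == "1"

col = pd.Series(["true", "0", np.nan], dtype=object)
print(col.map(_parse_boolean, na_action="ignore").tolist())  # [True, False, nan]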
vtlengine/files/parser/_rfc_dialect.py
CHANGED

@@ -6,9 +6,10 @@ class RFCDialect(csv.Dialect):
     https://docs.python.org/3/library/csv.html#csv.Dialect
     https://tools.ietf.org/html/rfc4180
     """
-
+
+    delimiter = ","
     doublequote = True
-    lineterminator =
+    lineterminator = "\r\n"
     quotechar = '"'
     quoting = csv.QUOTE_MINIMAL
     strict = True
@@ -16,6 +17,6 @@ class RFCDialect(csv.Dialect):
     skipinitialspace = False
 
 
-def register_rfc():
-    """
-    csv.register_dialect(
+def register_rfc() -> None:
+    """Register the RFC dialect."""
+    csv.register_dialect("rfc", RFCDialect)
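Once register_rfc() has run, any reader can request the dialect by name, which is how the DictReader(f, dialect="rfc") call in _validate_csv_path resolves. A self-contained sketch with a local copy of the dialect class:

import csv
import io

class RFCDialect(csv.Dialect):
    # Mirrors the class above: RFC 4180 line endings and minimal quoting.
    delimiter = ","
    doublequote = True
    lineterminator = "\r\n"
    quotechar = '"'
    quoting = csv.QUOTE_MINIMAL
    strict = True
    skipinitialspace = False

csv.register_dialect("rfc", RFCDialect)
rows = list(csv.DictReader(io.StringIO('a,b\r\n1,"x,y"\r\n'), dialect="rfc"))
print(rows)  # [{'a': '1', 'b': 'x,y'}]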