vtlengine 1.0-py3-none-any.whl → 1.0.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of vtlengine might be problematic.

Files changed (56)
  1. vtlengine/API/_InternalApi.py +159 -102
  2. vtlengine/API/__init__.py +110 -68
  3. vtlengine/AST/ASTConstructor.py +188 -98
  4. vtlengine/AST/ASTConstructorModules/Expr.py +402 -205
  5. vtlengine/AST/ASTConstructorModules/ExprComponents.py +248 -104
  6. vtlengine/AST/ASTConstructorModules/Terminals.py +158 -95
  7. vtlengine/AST/ASTEncoders.py +1 -1
  8. vtlengine/AST/ASTTemplate.py +24 -9
  9. vtlengine/AST/ASTVisitor.py +8 -12
  10. vtlengine/AST/DAG/__init__.py +43 -35
  11. vtlengine/AST/DAG/_words.py +4 -4
  12. vtlengine/AST/Grammar/Vtl.g4 +49 -20
  13. vtlengine/AST/Grammar/VtlTokens.g4 +13 -1
  14. vtlengine/AST/Grammar/lexer.py +2012 -1312
  15. vtlengine/AST/Grammar/parser.py +7524 -4343
  16. vtlengine/AST/Grammar/tokens.py +140 -128
  17. vtlengine/AST/VtlVisitor.py +16 -5
  18. vtlengine/AST/__init__.py +41 -11
  19. vtlengine/DataTypes/NumericTypesHandling.py +5 -4
  20. vtlengine/DataTypes/TimeHandling.py +196 -301
  21. vtlengine/DataTypes/__init__.py +304 -218
  22. vtlengine/Exceptions/__init__.py +96 -27
  23. vtlengine/Exceptions/messages.py +149 -69
  24. vtlengine/Interpreter/__init__.py +817 -497
  25. vtlengine/Model/__init__.py +172 -121
  26. vtlengine/Operators/Aggregation.py +156 -95
  27. vtlengine/Operators/Analytic.py +167 -79
  28. vtlengine/Operators/Assignment.py +7 -4
  29. vtlengine/Operators/Boolean.py +27 -32
  30. vtlengine/Operators/CastOperator.py +177 -131
  31. vtlengine/Operators/Clause.py +137 -99
  32. vtlengine/Operators/Comparison.py +148 -117
  33. vtlengine/Operators/Conditional.py +290 -98
  34. vtlengine/Operators/General.py +68 -47
  35. vtlengine/Operators/HROperators.py +91 -72
  36. vtlengine/Operators/Join.py +217 -118
  37. vtlengine/Operators/Numeric.py +129 -46
  38. vtlengine/Operators/RoleSetter.py +16 -15
  39. vtlengine/Operators/Set.py +61 -36
  40. vtlengine/Operators/String.py +213 -139
  41. vtlengine/Operators/Time.py +467 -215
  42. vtlengine/Operators/Validation.py +117 -76
  43. vtlengine/Operators/__init__.py +340 -213
  44. vtlengine/Utils/__init__.py +232 -41
  45. vtlengine/__init__.py +1 -1
  46. vtlengine/files/output/__init__.py +15 -6
  47. vtlengine/files/output/_time_period_representation.py +10 -9
  48. vtlengine/files/parser/__init__.py +79 -52
  49. vtlengine/files/parser/_rfc_dialect.py +6 -5
  50. vtlengine/files/parser/_time_checking.py +48 -37
  51. vtlengine-1.0.2.dist-info/METADATA +245 -0
  52. vtlengine-1.0.2.dist-info/RECORD +58 -0
  53. {vtlengine-1.0.dist-info → vtlengine-1.0.2.dist-info}/WHEEL +1 -1
  54. vtlengine-1.0.dist-info/METADATA +0 -104
  55. vtlengine-1.0.dist-info/RECORD +0 -58
  56. {vtlengine-1.0.dist-info → vtlengine-1.0.2.dist-info}/LICENSE.md +0 -0
@@ -1,34 +1,222 @@
- from vtlengine.Operators.Aggregation import (Avg, Count, Max, Median, Min,
- PopulationStandardDeviation,
- PopulationVariance, SampleStandardDeviation, SampleVariance, Sum)
- from vtlengine.Operators.Analytic import (Avg as AvgAnalytic, Count as CountAnalytic, FirstValue, Lag,
- LastValue, Lead, Max as MaxAnalytic, Median as MedianAnalytic,
- Min as MinAnalytic,
- PopulationStandardDeviation as PopulationStandardDeviationAnalytic,
- PopulationVariance as PopulationVarianceAnalytic, Rank,
- RatioToReport,
- SampleStandardDeviation as SampleStandardDeviationAnalytic,
- SampleVariance as SampleVarianceAnalytic, Sum as SumAnalytic)
+ from typing import Any, Dict
+
+ from vtlengine.Operators.Aggregation import (
+ Avg,
+ Count,
+ Max,
+ Median,
+ Min,
+ PopulationStandardDeviation,
+ PopulationVariance,
+ SampleStandardDeviation,
+ SampleVariance,
+ Sum,
+ )
+ from vtlengine.Operators.Analytic import (
+ Avg as AvgAnalytic,
+ Count as CountAnalytic,
+ FirstValue,
+ Lag,
+ LastValue,
+ Lead,
+ Max as MaxAnalytic,
+ Median as MedianAnalytic,
+ Min as MinAnalytic,
+ PopulationStandardDeviation as PopulationStandardDeviationAnalytic,
+ PopulationVariance as PopulationVarianceAnalytic,
+ Rank,
+ RatioToReport,
+ SampleStandardDeviation as SampleStandardDeviationAnalytic,
+ SampleVariance as SampleVarianceAnalytic,
+ Sum as SumAnalytic,
+ )
  from vtlengine.Operators.Boolean import And, Not, Or, Xor
- from vtlengine.Operators.Clause import Aggregate, Calc, Drop, Filter, Keep, Pivot, Rename, Sub, Unpivot
- from vtlengine.Operators.Comparison import Equal, Greater, GreaterEqual, In, IsNull, Less, LessEqual, \
- NotEqual, NotIn, Match
+ from vtlengine.Operators.Clause import (
+ Aggregate,
+ Calc,
+ Drop,
+ Filter,
+ Keep,
+ Pivot,
+ Rename,
+ Sub,
+ Unpivot,
+ )
+ from vtlengine.Operators.Comparison import (
+ Equal,
+ Greater,
+ GreaterEqual,
+ In,
+ IsNull,
+ Less,
+ LessEqual,
+ NotEqual,
+ NotIn,
+ Match,
+ )
  from vtlengine.Operators.Conditional import Nvl
  from vtlengine.Operators.General import Alias, Membership
- from vtlengine.Operators.HROperators import HREqual, HRGreater, HRGreaterEqual, HRLess, HRLessEqual, \
- HRBinPlus, HRBinMinus, HRUnPlus, HRUnMinus
+ from vtlengine.Operators.HROperators import (
+ HREqual,
+ HRGreater,
+ HRGreaterEqual,
+ HRLess,
+ HRLessEqual,
+ HRBinPlus,
+ HRBinMinus,
+ HRUnPlus,
+ HRUnMinus,
+ )
  from vtlengine.Operators.Join import Apply, CrossJoin, FullJoin, InnerJoin, LeftJoin
- from vtlengine.Operators.Numeric import AbsoluteValue, BinMinus, BinPlus, Ceil, Div, Exponential, Floor, \
- Logarithm, Modulo, Mult, NaturalLogarithm, Power, Round, SquareRoot, Trunc, UnMinus, UnPlus
+ from vtlengine.Operators.Numeric import (
+ AbsoluteValue,
+ BinMinus,
+ BinPlus,
+ Ceil,
+ Div,
+ Exponential,
+ Floor,
+ Logarithm,
+ Modulo,
+ Mult,
+ NaturalLogarithm,
+ Power,
+ Round,
+ SquareRoot,
+ Trunc,
+ UnMinus,
+ UnPlus,
+ Random,
+ )
  from vtlengine.Operators.RoleSetter import Attribute, Identifier, Measure
  from vtlengine.Operators.Set import Intersection, Setdiff, Symdiff, Union
- from vtlengine.Operators.String import Concatenate, Length, Lower, Ltrim, Replace, Rtrim, Substr, Trim, Upper
- from vtlengine.Operators.Time import Flow_to_stock, Period_indicator, Stock_to_flow, Fill_time_series, \
- Time_Shift
+ from vtlengine.Operators.String import (
+ Concatenate,
+ Length,
+ Lower,
+ Ltrim,
+ Replace,
+ Rtrim,
+ Substr,
+ Trim,
+ Upper,
+ )
+ from vtlengine.Operators.Time import (
+ Flow_to_stock,
+ Period_indicator,
+ Stock_to_flow,
+ Fill_time_series,
+ Time_Shift,
+ Year,
+ Month,
+ Day_of_Month,
+ Day_of_Year,
+ Day_to_Year,
+ Day_to_Month,
+ Year_to_Day,
+ Month_to_Day,
+ Date_Diff,
+ Date_Add,
+ )
 
- from vtlengine.AST.Grammar.tokens import *
+ from vtlengine.AST.Grammar.tokens import (
+ MEMBERSHIP,
+ AND,
+ OR,
+ XOR,
+ EQ,
+ NEQ,
+ GT,
+ GTE,
+ LT,
+ LTE,
+ IN,
+ NOT_IN,
+ NVL,
+ PLUS,
+ MINUS,
+ MULT,
+ LOG,
+ MOD,
+ POWER,
+ DIV,
+ AS,
+ CONCAT,
+ TIMESHIFT,
+ CHARSET_MATCH,
+ NOT,
+ ABS,
+ EXP,
+ LN,
+ SQRT,
+ CEIL,
+ FLOOR,
+ ISNULL,
+ PERIOD_INDICATOR,
+ LEN,
+ LCASE,
+ LTRIM,
+ RTRIM,
+ TRIM,
+ UCASE,
+ FLOW_TO_STOCK,
+ STOCK_TO_FLOW,
+ ROUND,
+ TRUNC,
+ SUBSTR,
+ REPLACE,
+ FILL_TIME_SERIES,
+ IDENTIFIER,
+ ATTRIBUTE,
+ MEASURE,
+ CALC,
+ FILTER,
+ KEEP,
+ DROP,
+ RENAME,
+ PIVOT,
+ UNPIVOT,
+ SUBSPACE,
+ AGGREGATE,
+ APPLY,
+ UNION,
+ INTERSECT,
+ SYMDIFF,
+ SETDIFF,
+ MAX,
+ MIN,
+ SUM,
+ COUNT,
+ AVG,
+ MEDIAN,
+ STDDEV_POP,
+ STDDEV_SAMP,
+ VAR_POP,
+ VAR_SAMP,
+ LAG,
+ LEAD,
+ FIRST_VALUE,
+ LAST_VALUE,
+ RATIO_TO_REPORT,
+ RANK,
+ INNER_JOIN,
+ LEFT_JOIN,
+ FULL_JOIN,
+ CROSS_JOIN,
+ RANDOM,
+ DAYOFYEAR,
+ DAYOFMONTH,
+ MONTH,
+ YEAR,
+ DAYTOYEAR,
+ DAYTOMONTH,
+ YEARTODAY,
+ MONTHTODAY,
+ DATE_DIFF,
+ DATE_ADD,
+ )
 
- BINARY_MAPPING = {
+ BINARY_MAPPING: Dict[Any, Any] = {
  # General
  MEMBERSHIP: Membership,
  # Boolean
@@ -54,13 +242,15 @@ BINARY_MAPPING = {
  MOD: Modulo,
  POWER: Power,
  DIV: Div,
+ RANDOM: Random,
  # General
  AS: Alias,
  # String
  CONCAT: Concatenate,
  # Time
  TIMESHIFT: Time_Shift,
- CHARSET_MATCH: Match
+ CHARSET_MATCH: Match,
+ DATE_DIFF: Date_Diff,
  }
 
  UNARY_MAPPING = {
@@ -88,7 +278,15 @@ UNARY_MAPPING = {
  # Time
  PERIOD_INDICATOR: Period_indicator,
  FLOW_TO_STOCK: Flow_to_stock,
- STOCK_TO_FLOW: Stock_to_flow
+ STOCK_TO_FLOW: Stock_to_flow,
+ YEAR: Year,
+ MONTH: Month,
+ DAYOFMONTH: Day_of_Month,
+ DAYOFYEAR: Day_of_Year,
+ DAYTOYEAR: Day_to_Year,
+ DAYTOMONTH: Day_to_Month,
+ YEARTODAY: Year_to_Day,
+ MONTHTODAY: Month_to_Day,
  }
 
  PARAMETRIC_MAPPING = {
@@ -100,6 +298,7 @@ PARAMETRIC_MAPPING = {
  REPLACE: Replace,
  # Time
  FILL_TIME_SERIES: Fill_time_series,
+ DATE_ADD: Date_Add,
  }
 
  ROLE_SETTER_MAPPING = {
@@ -118,15 +317,10 @@ REGULAR_AGGREGATION_MAPPING = {
  UNPIVOT: Unpivot,
  SUBSPACE: Sub,
  AGGREGATE: Aggregate,
- APPLY: Apply
+ APPLY: Apply,
  }
 
- SET_MAPPING = {
- UNION: Union,
- INTERSECT: Intersection,
- SYMDIFF: Symdiff,
- SETDIFF: Setdiff
- }
+ SET_MAPPING = {UNION: Union, INTERSECT: Intersection, SYMDIFF: Symdiff, SETDIFF: Setdiff}
 
  AGGREGATION_MAPPING = {
  MAX: Max,
@@ -139,7 +333,6 @@ AGGREGATION_MAPPING = {
  STDDEV_SAMP: SampleStandardDeviation,
  VAR_POP: PopulationVariance,
  VAR_SAMP: SampleVariance,
-
  }
 
  ANALYTIC_MAPPING = {
@@ -158,18 +351,16 @@ ANALYTIC_MAPPING = {
  FIRST_VALUE: FirstValue,
  LAST_VALUE: LastValue,
  RATIO_TO_REPORT: RatioToReport,
- RANK: Rank
+ RANK: Rank,
  }
 
- THEN_ELSE = {
- 'then': 'T',
- 'else': 'E'
- }
+ THEN_ELSE = {"then": "T", "else": "E"}
+
  JOIN_MAPPING = {
  INNER_JOIN: InnerJoin,
  LEFT_JOIN: LeftJoin,
  FULL_JOIN: FullJoin,
- CROSS_JOIN: CrossJoin
+ CROSS_JOIN: CrossJoin,
  }
 
  HR_COMP_MAPPING = {
@@ -190,7 +381,7 @@ HR_NUM_BINARY_MAPPING = {
  HR_UNARY_MAPPING = {
  # Numeric
  PLUS: HRUnPlus,
- MINUS: HRUnMinus
+ MINUS: HRUnMinus,
  }
 
  HA_COMP_MAPPING = {
@@ -211,5 +402,5 @@ HA_NUM_BINARY_MAPPING = {
  HA_UNARY_MAPPING = {
  # Numeric
  PLUS: HRUnPlus,
- MINUS: HRUnMinus
+ MINUS: HRUnMinus,
  }
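
The hunks above (apparently vtlengine/Utils/__init__.py, judging by the file list) replace the wildcard token import with an explicit list, annotate BINARY_MAPPING, and register the new operators (Random, Date_Diff, Date_Add, Year, Month, Day_of_Month, Day_of_Year and the period conversions) in the dispatch tables. Below is a minimal sketch of how such a token-to-operator-class table is consumed; the helper and the import path are assumptions for illustration, and it presumes the operator classes expose a class-level evaluate as elsewhere in vtlengine.Operators:

# Hypothetical dispatch helper, not part of the package.
from vtlengine.Utils import BINARY_MAPPING, UNARY_MAPPING  # assumed module path
from vtlengine.AST.Grammar.tokens import DATE_DIFF, YEAR

def dispatch(token, *operands):
    # Pick the table by arity and resolve the grammar token to its operator class.
    table = BINARY_MAPPING if len(operands) == 2 else UNARY_MAPPING
    if token not in table:
        raise NotImplementedError(f"No operator registered for token {token!r}")
    # Assumption: operator classes expose a class-level evaluate(), as in vtlengine.Operators.
    return table[token].evaluate(*operands)

# e.g. dispatch(DATE_DIFF, left, right) or dispatch(YEAR, operand)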
vtlengine/__init__.py CHANGED
@@ -1,3 +1,3 @@
  from vtlengine.API import semantic_analysis, run
 
- __all__ = ['semantic_analysis', 'run']
+ __all__ = ["semantic_analysis", "run"]
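
vtlengine/__init__.py keeps re-exporting the same two entry points; only the quoting changes. A hedged usage sketch of that public API follows; the diff confirms only the names semantic_analysis and run, so the keyword arguments and the data-structure schema below are assumptions for illustration:

import pandas as pd
from vtlengine import run, semantic_analysis

script = "DS_r := DS_1 + 1;"
data_structures = {  # assumed schema, not taken from this diff
    "datasets": [
        {
            "name": "DS_1",
            "DataStructure": [
                {"name": "Id_1", "type": "Integer", "role": "Identifier", "nullable": False},
                {"name": "Me_1", "type": "Number", "role": "Measure", "nullable": True},
            ],
        }
    ]
}
datapoints = {"DS_1": pd.DataFrame({"Id_1": [1, 2], "Me_1": [10.0, 20.0]})}

semantic_analysis(script=script, data_structures=data_structures)
results = run(script=script, data_structures=data_structures, datapoints=datapoints)
print(results["DS_r"].data)  # assuming run returns the computed datasets keyed by name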
@@ -1,16 +1,25 @@
  from pathlib import Path
- # from time import time
+ from typing import Optional, Union
+
+ import pandas as pd
 
  from vtlengine.Model import Dataset
- from vtlengine.files.output._time_period_representation import \
- format_time_period_external_representation, TimePeriodRepresentation
+ from vtlengine.files.output._time_period_representation import (
+ format_time_period_external_representation,
+ TimePeriodRepresentation,
+ )
+
 
+ def save_datapoints(
+ time_period_representation: Optional[TimePeriodRepresentation],
+ dataset: Dataset,
+ output_path: Union[str, Path],
+ ) -> None:
 
- def save_datapoints(time_period_representation: TimePeriodRepresentation,
- dataset: Dataset, output_path: str | Path):
+ if dataset.data is None:
+ dataset.data = pd.DataFrame()
  if time_period_representation is not None:
  format_time_period_external_representation(dataset, time_period_representation)
-
  if isinstance(output_path, str):
  if output_path.endswith("/"):
  s3_file_output = output_path + f"{dataset.name}.csv"
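
This hunk loosens save_datapoints: time_period_representation becomes Optional, output_path accepts Union[str, Path], and dataset.data is defaulted to an empty DataFrame before formatting. The visible path handling treats a string ending in "/" as a folder or S3-style prefix and appends "<dataset>.csv"; below is a re-statement of just that branch for illustration (the rest of the function, which actually writes the CSV, is truncated above):

# Illustration only: mirrors the output-path branch shown in the hunk.
def _resolve_output_file(output_path: str, dataset_name: str) -> str:
    if output_path.endswith("/"):
        # e.g. "s3://bucket/out/" -> "s3://bucket/out/DS_1.csv"
        return output_path + f"{dataset_name}.csv"
    return output_path  # behaviour for other inputs is not visible in this hunk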
@@ -8,23 +8,24 @@ from vtlengine.Model import Dataset, Scalar
 
  class TimePeriodRepresentation(Enum):
  # Time Period output format
- SDMX_GREGORIAN = 'sdmx_gregorian'
- SDMX_REPORTING = 'sdmx_reporting'
- VTL = 'vtl'
+ SDMX_GREGORIAN = "sdmx_gregorian"
+ SDMX_REPORTING = "sdmx_reporting"
+ VTL = "vtl"
 
  @classmethod
- def check_value(cls, value: str):
+ def check_value(cls, value: str) -> "TimePeriodRepresentation":
  if value not in cls._value2member_map_:
  raise Exception("Invalid Time Period Representation")
  return cls(value)
 
 
- def _format_vtl_representation(value: str):
+ def _format_vtl_representation(value: str) -> str:
  return TimePeriodHandler(value).vtl_representation()
 
 
- def format_time_period_external_representation(dataset: Dataset | Scalar,
- mode: TimePeriodRepresentation):
+ def format_time_period_external_representation(
+ dataset: Dataset | Scalar, mode: TimePeriodRepresentation
+ ) -> None:
  """
  From SDMX time period representation to standard VTL representation (no hyphen).
  'A': 'nothing to do',
@@ -48,7 +49,7 @@ def format_time_period_external_representation(dataset: Dataset | Scalar,
  for comp in dataset.components.values():
  if comp.data_type == TimePeriod:
  dataset.data[comp.name] = dataset.data[comp.name].map(
- _format_vtl_representation,
- na_action='ignore')
+ _format_vtl_representation, na_action="ignore"
+ )
 
  return
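
check_value now carries a return annotation and still hands back the enum member, so callers can use identity checks on the result. A small usage sketch based only on what the hunks show:

from vtlengine.files.output._time_period_representation import TimePeriodRepresentation

mode = TimePeriodRepresentation.check_value("sdmx_reporting")
assert mode is TimePeriodRepresentation.SDMX_REPORTING

try:
    TimePeriodRepresentation.check_value("iso8601")  # not one of the three accepted values
except Exception as err:
    print(err)  # "Invalid Time Period Representation"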
@@ -1,12 +1,22 @@
+ import warnings
  from csv import DictReader
  from pathlib import Path
- # from time import time
- from typing import Optional, Dict, Union
+
+ from typing import Optional, Dict, Union, Any, Type, List
 
  import numpy as np
  import pandas as pd
- from vtlengine.DataTypes import Date, TimePeriod, TimeInterval, Integer, Number, Boolean, Duration, \
- SCALAR_TYPES_CLASS_REVERSE
+ from vtlengine.DataTypes import (
+ Date,
+ TimePeriod,
+ TimeInterval,
+ Integer,
+ Number,
+ Boolean,
+ Duration,
+ SCALAR_TYPES_CLASS_REVERSE,
+ ScalarType,
+ )
  from vtlengine.DataTypes.TimeHandling import DURATION_MAPPING
  from vtlengine.files.parser._rfc_dialect import register_rfc
  from vtlengine.files.parser._time_checking import check_date, check_time_period, check_time
@@ -14,14 +24,14 @@ from vtlengine.files.parser._time_checking import check_date, check_time_period,
  from vtlengine.Exceptions import InputValidationException, SemanticError
  from vtlengine.Model import Component, Role, Dataset
 
- TIME_CHECKS_MAPPING = {
+ TIME_CHECKS_MAPPING: Dict[Type[ScalarType], Any] = {
  Date: check_date,
  TimePeriod: check_time_period,
- TimeInterval: check_time
+ TimeInterval: check_time,
  }
 
 
- def _validate_csv_path(components: Dict[str, Component], csv_path: Path):
+ def _validate_csv_path(components: Dict[str, Component], csv_path: Path) -> None:
  # GE1 check if the file is empty
  if not csv_path.exists():
  raise Exception(f"Path {csv_path} does not exist.")
@@ -29,8 +39,8 @@ def _validate_csv_path(components: Dict[str, Component], csv_path: Path):
  raise Exception(f"Path {csv_path} is not a file.")
  register_rfc()
  try:
- with open(csv_path, 'r') as f:
- reader = DictReader(f, dialect='rfc')
+ with open(csv_path, "r") as f:
+ reader = DictReader(f, dialect="rfc")
  csv_columns = reader.fieldnames
 
  except UnicodeDecodeError as error:
@@ -45,21 +55,24 @@ def _validate_csv_path(components: Dict[str, Component], csv_path: Path):
  ) from None
 
  if not csv_columns:
- raise InputValidationException(code='0-1-1-6', file=csv_path)
+ raise InputValidationException(code="0-1-1-6", file=csv_path)
 
  if len(list(set(csv_columns))) != len(csv_columns):
  duplicates = list(set([item for item in csv_columns if csv_columns.count(item) > 1]))
  raise Exception(f"Duplicated columns {', '.join(duplicates)} found in file.")
 
  comp_names = set([c.name for c in components.values() if c.role == Role.IDENTIFIER])
- comps_missing = [id_m for id_m in comp_names if id_m not in reader.fieldnames]
+ comps_missing: Union[str, List[str]] = (
+ [id_m for id_m in comp_names if id_m not in reader.fieldnames] if reader.fieldnames else []
+ )
  if comps_missing:
  comps_missing = ", ".join(comps_missing)
- raise InputValidationException(code='0-1-1-8', ids=comps_missing, file=str(csv_path.name))
+ raise InputValidationException(code="0-1-1-8", ids=comps_missing, file=str(csv_path.name))
 
 
- def _sanitize_pandas_columns(components: Dict[str, Component],
- csv_path: Union[str, Path], data: pd.DataFrame) -> pd.DataFrame:
+ def _sanitize_pandas_columns(
+ components: Dict[str, Component], csv_path: Union[str, Path], data: pd.DataFrame
+ ) -> pd.DataFrame:
  # Fast loading from SDMX-CSV
  if "DATAFLOW" in data.columns and data.columns[0] == "DATAFLOW":
  if "DATAFLOW" not in components:
@@ -75,11 +88,11 @@ def _sanitize_pandas_columns(components: Dict[str, Component],
 
  # Validate identifiers
  comp_names = set([c.name for c in components.values() if c.role == Role.IDENTIFIER])
- comps_missing = [id_m for id_m in comp_names if id_m not in data.columns]
+ comps_missing: Union[str, List[str]] = [id_m for id_m in comp_names if id_m not in data.columns]
  if comps_missing:
  comps_missing = ", ".join(comps_missing)
  file = csv_path if isinstance(csv_path, str) else csv_path.name
- raise InputValidationException(code='0-1-1-7', ids=comps_missing, file=file)
+ raise InputValidationException(code="0-1-1-7", ids=comps_missing, file=file)
 
  # Fill rest of components with null values
  for comp_name, comp in components.items():
@@ -94,47 +107,54 @@ def _pandas_load_csv(components: Dict[str, Component], csv_path: Path) -> pd.Dat
  obj_dtypes = {comp_name: np.object_ for comp_name, comp in components.items()}
 
  try:
- data = pd.read_csv(csv_path, dtype=obj_dtypes,
- engine='c',
- keep_default_na=False,
- na_values=[''])
- except UnicodeDecodeError as error:
+ data = pd.read_csv(
+ csv_path, dtype=obj_dtypes, engine="c", keep_default_na=False, na_values=[""]
+ )
+ except UnicodeDecodeError:
  raise InputValidationException(code="0-1-2-5", file=csv_path.name)
 
  return _sanitize_pandas_columns(components, csv_path, data)
 
+
  def _pandas_load_s3_csv(components: Dict[str, Component], csv_path: str) -> pd.DataFrame:
  obj_dtypes = {comp_name: np.object_ for comp_name, comp in components.items()}
 
  # start = time()
  try:
- data = pd.read_csv(csv_path, dtype=obj_dtypes,
- engine='c',
- keep_default_na=False,
- na_values=[''])
+ data = pd.read_csv(
+ csv_path, dtype=obj_dtypes, engine="c", keep_default_na=False, na_values=[""]
+ )
 
- except UnicodeDecodeError as error:
+ except UnicodeDecodeError:
  raise InputValidationException(code="0-1-2-5", file=csv_path)
  except Exception as e:
  raise InputValidationException(f"ERROR: {str(e)}, review file {str(csv_path)}")
-
- # print(f"Data loaded from {csv_path}, shape: {data.shape}")
- # end = time()
- # print(f"Time to load data from s3 URI: {end - start}")
-
  return _sanitize_pandas_columns(components, csv_path, data)
 
- def _parse_boolean(value: str):
+
+ def _parse_boolean(value: str) -> bool:
+ if isinstance(value, bool):
+ return value
  if value.lower() == "true" or value == "1":
  return True
  return False
 
 
- def _validate_pandas(components: Dict[str, Component], data: pd.DataFrame,
- dataset_name: str) -> pd.DataFrame:
+ def _validate_pandas(
+ components: Dict[str, Component], data: pd.DataFrame, dataset_name: str
+ ) -> pd.DataFrame:
+ warnings.filterwarnings("ignore", category=FutureWarning)
  # Identifier checking
+
  id_names = [comp_name for comp_name, comp in components.items() if comp.role == Role.IDENTIFIER]
 
+ missing_columns = [name for name in components.keys() if name not in data.columns.tolist()]
+ if missing_columns:
+ for name in missing_columns:
+ if components[name].nullable is False:
+ raise SemanticError("0-1-1-10", name=dataset_name, comp_name=name)
+ data[name] = None
+
  for id_name in id_names:
  if data[id_name].isnull().any():
  raise SemanticError("0-1-1-4", null_identifier=id_name, name=dataset_name)
@@ -150,35 +170,42 @@ def _validate_pandas(components: Dict[str, Component], data: pd.DataFrame,
 
  for comp_name, comp in components.items():
  if comp.data_type in (Date, TimePeriod, TimeInterval):
- data[comp_name] = data[comp_name].map(TIME_CHECKS_MAPPING[comp.data_type],
- na_action='ignore')
+ data[comp_name] = data[comp_name].map(
+ TIME_CHECKS_MAPPING[comp.data_type], na_action="ignore"
+ )
  elif comp.data_type == Integer:
- data[comp_name] = data[comp_name].map(lambda x: Integer.cast(float(x)),
- na_action='ignore')
+ data[comp_name] = data[comp_name].map(
+ lambda x: Integer.cast(float(x)), na_action="ignore"
+ )
  elif comp.data_type == Number:
- data[comp_name] = data[comp_name].map(lambda x: float(x), na_action='ignore')
+ data[comp_name] = data[comp_name].map(lambda x: float(x), na_action="ignore")
  elif comp.data_type == Boolean:
- data[comp_name] = data[comp_name].map(lambda x: _parse_boolean(x),
- na_action='ignore')
+ data[comp_name] = data[comp_name].map(
+ lambda x: _parse_boolean(x), na_action="ignore"
+ )
  elif comp.data_type == Duration:
- values_correct = data[comp_name].map(
- lambda x: x.replace(" ", "") in DURATION_MAPPING, na_action='ignore').all()
+ values_correct = (
+ data[comp_name]
+ .map(lambda x: x.replace(" ", "") in DURATION_MAPPING, na_action="ignore")
+ .all()
+ )
  if not values_correct:
  raise ValueError(f"Duration values are not correct in column {comp_name}")
  else:
- data[comp_name] = data[comp_name].map(lambda x: str(x).replace('"', ''),
- na_action='ignore')
- data[comp_name] = data[comp_name].astype(np.object_, errors='raise')
- except ValueError as e:
- str_comp = SCALAR_TYPES_CLASS_REVERSE[comp.data_type]
+ data[comp_name] = data[comp_name].map(
+ lambda x: str(x).replace('"', ""), na_action="ignore"
+ )
+ data[comp_name] = data[comp_name].astype(np.object_, errors="raise")
+ except ValueError:
+ str_comp = SCALAR_TYPES_CLASS_REVERSE[comp.data_type] if comp else "Null"
  raise SemanticError("0-1-1-12", name=dataset_name, column=comp_name, type=str_comp)
 
  return data
 
 
- def load_datapoints(components: Dict[str, Component],
- dataset_name: str,
- csv_path: Optional[Union[Path, str]] = None):
+ def load_datapoints(
+ components: Dict[str, Component], dataset_name: str, csv_path: Optional[Union[Path, str]] = None
+ ) -> pd.DataFrame:
  if csv_path is None or (isinstance(csv_path, Path) and not csv_path.exists()):
  return pd.DataFrame(columns=list(components.keys()))
  elif isinstance(csv_path, str):
@@ -193,5 +220,5 @@ load_datapoints(components: Dict[str, Component],
  return data
 
 
- def _fill_dataset_empty_data(dataset: Dataset):
+ def _fill_dataset_empty_data(dataset: Dataset) -> None:
  dataset.data = pd.DataFrame(columns=list(dataset.components.keys()))
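
Besides quoting and typing, the parser hunks change behaviour: _parse_boolean passes real booleans through, _validate_pandas now backfills columns missing from the data (raising error 0-1-1-10 when such a column is non-nullable), and load_datapoints returns an empty frame when no csv_path is supplied. A behaviour sketch of two of these helpers, grounded in the code above (they are private; imported here only for illustration):

import pandas as pd
from vtlengine.files.parser import _parse_boolean, load_datapoints

# Only "true" (any case) and "1" are truthy; everything else is False, and real bools pass through.
assert _parse_boolean("TRUE") is True and _parse_boolean("1") is True
assert _parse_boolean("yes") is False and _parse_boolean(False) is False

# With no csv_path, an empty DataFrame with one column per component comes back.
empty = load_datapoints(components={}, dataset_name="DS_1", csv_path=None)
assert isinstance(empty, pd.DataFrame) and list(empty.columns) == []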
@@ -6,9 +6,10 @@ class RFCDialect(csv.Dialect):
  https://docs.python.org/3/library/csv.html#csv.Dialect
  https://tools.ietf.org/html/rfc4180
  """
- delimiter = ','
+
+ delimiter = ","
  doublequote = True
- lineterminator = '\r\n'
+ lineterminator = "\r\n"
  quotechar = '"'
  quoting = csv.QUOTE_MINIMAL
  strict = True
@@ -16,6 +17,6 @@ class RFCDialect(csv.Dialect):
  skipinitialspace = False
 
 
- def register_rfc():
- """ Register the RFC dialect. """
- csv.register_dialect('rfc', RFCDialect)
+ def register_rfc() -> None:
+ """Register the RFC dialect."""
+ csv.register_dialect("rfc", RFCDialect)
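
register_rfc (now annotated) registers the strict RFC 4180 dialect under the name "rfc", which is exactly the dialect _validate_csv_path passes to DictReader above. A minimal sketch of reading with it:

import csv
import io

from vtlengine.files.parser._rfc_dialect import register_rfc

register_rfc()
buffer = io.StringIO('Id_1,Me_1\r\n"1","A"\r\n')
rows = list(csv.DictReader(buffer, dialect="rfc"))
print(rows)  # [{'Id_1': '1', 'Me_1': 'A'}]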