vtlengine 1.0.0__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.

Note: this release of vtlengine has been flagged as potentially problematic.

Files changed (54)
  1. vtlengine/API/_InternalApi.py +153 -100
  2. vtlengine/API/__init__.py +109 -67
  3. vtlengine/AST/ASTConstructor.py +188 -98
  4. vtlengine/AST/ASTConstructorModules/Expr.py +306 -200
  5. vtlengine/AST/ASTConstructorModules/ExprComponents.py +172 -102
  6. vtlengine/AST/ASTConstructorModules/Terminals.py +158 -95
  7. vtlengine/AST/ASTEncoders.py +1 -1
  8. vtlengine/AST/ASTTemplate.py +8 -9
  9. vtlengine/AST/ASTVisitor.py +8 -12
  10. vtlengine/AST/DAG/__init__.py +43 -35
  11. vtlengine/AST/DAG/_words.py +4 -4
  12. vtlengine/AST/Grammar/lexer.py +732 -142
  13. vtlengine/AST/Grammar/parser.py +2188 -826
  14. vtlengine/AST/Grammar/tokens.py +128 -128
  15. vtlengine/AST/VtlVisitor.py +7 -4
  16. vtlengine/AST/__init__.py +22 -11
  17. vtlengine/DataTypes/NumericTypesHandling.py +5 -4
  18. vtlengine/DataTypes/TimeHandling.py +194 -301
  19. vtlengine/DataTypes/__init__.py +304 -218
  20. vtlengine/Exceptions/__init__.py +52 -27
  21. vtlengine/Exceptions/messages.py +134 -62
  22. vtlengine/Interpreter/__init__.py +781 -487
  23. vtlengine/Model/__init__.py +165 -121
  24. vtlengine/Operators/Aggregation.py +156 -95
  25. vtlengine/Operators/Analytic.py +115 -59
  26. vtlengine/Operators/Assignment.py +7 -4
  27. vtlengine/Operators/Boolean.py +27 -32
  28. vtlengine/Operators/CastOperator.py +177 -131
  29. vtlengine/Operators/Clause.py +137 -99
  30. vtlengine/Operators/Comparison.py +148 -117
  31. vtlengine/Operators/Conditional.py +149 -98
  32. vtlengine/Operators/General.py +68 -47
  33. vtlengine/Operators/HROperators.py +91 -72
  34. vtlengine/Operators/Join.py +217 -118
  35. vtlengine/Operators/Numeric.py +89 -44
  36. vtlengine/Operators/RoleSetter.py +16 -15
  37. vtlengine/Operators/Set.py +61 -36
  38. vtlengine/Operators/String.py +213 -139
  39. vtlengine/Operators/Time.py +334 -216
  40. vtlengine/Operators/Validation.py +117 -76
  41. vtlengine/Operators/__init__.py +340 -213
  42. vtlengine/Utils/__init__.py +195 -40
  43. vtlengine/__init__.py +1 -1
  44. vtlengine/files/output/__init__.py +15 -6
  45. vtlengine/files/output/_time_period_representation.py +10 -9
  46. vtlengine/files/parser/__init__.py +77 -52
  47. vtlengine/files/parser/_rfc_dialect.py +6 -5
  48. vtlengine/files/parser/_time_checking.py +46 -37
  49. vtlengine-1.0.1.dist-info/METADATA +236 -0
  50. vtlengine-1.0.1.dist-info/RECORD +58 -0
  51. {vtlengine-1.0.dist-info → vtlengine-1.0.1.dist-info}/WHEEL +1 -1
  52. vtlengine-1.0.dist-info/METADATA +0 -104
  53. vtlengine-1.0.dist-info/RECORD +0 -58
  54. {vtlengine-1.0.dist-info → vtlengine-1.0.1.dist-info}/LICENSE.md +0 -0
@@ -1,32 +1,196 @@
- from vtlengine.Operators.Aggregation import (Avg, Count, Max, Median, Min,
- PopulationStandardDeviation,
- PopulationVariance, SampleStandardDeviation, SampleVariance, Sum)
- from vtlengine.Operators.Analytic import (Avg as AvgAnalytic, Count as CountAnalytic, FirstValue, Lag,
- LastValue, Lead, Max as MaxAnalytic, Median as MedianAnalytic,
- Min as MinAnalytic,
- PopulationStandardDeviation as PopulationStandardDeviationAnalytic,
- PopulationVariance as PopulationVarianceAnalytic, Rank,
- RatioToReport,
- SampleStandardDeviation as SampleStandardDeviationAnalytic,
- SampleVariance as SampleVarianceAnalytic, Sum as SumAnalytic)
+ from vtlengine.Operators.Aggregation import (
+ Avg,
+ Count,
+ Max,
+ Median,
+ Min,
+ PopulationStandardDeviation,
+ PopulationVariance,
+ SampleStandardDeviation,
+ SampleVariance,
+ Sum,
+ )
+ from vtlengine.Operators.Analytic import (
+ Avg as AvgAnalytic,
+ Count as CountAnalytic,
+ FirstValue,
+ Lag,
+ LastValue,
+ Lead,
+ Max as MaxAnalytic,
+ Median as MedianAnalytic,
+ Min as MinAnalytic,
+ PopulationStandardDeviation as PopulationStandardDeviationAnalytic,
+ PopulationVariance as PopulationVarianceAnalytic,
+ Rank,
+ RatioToReport,
+ SampleStandardDeviation as SampleStandardDeviationAnalytic,
+ SampleVariance as SampleVarianceAnalytic,
+ Sum as SumAnalytic,
+ )
  from vtlengine.Operators.Boolean import And, Not, Or, Xor
- from vtlengine.Operators.Clause import Aggregate, Calc, Drop, Filter, Keep, Pivot, Rename, Sub, Unpivot
- from vtlengine.Operators.Comparison import Equal, Greater, GreaterEqual, In, IsNull, Less, LessEqual, \
- NotEqual, NotIn, Match
+ from vtlengine.Operators.Clause import (
+ Aggregate,
+ Calc,
+ Drop,
+ Filter,
+ Keep,
+ Pivot,
+ Rename,
+ Sub,
+ Unpivot,
+ )
+ from vtlengine.Operators.Comparison import (
+ Equal,
+ Greater,
+ GreaterEqual,
+ In,
+ IsNull,
+ Less,
+ LessEqual,
+ NotEqual,
+ NotIn,
+ Match,
+ )
  from vtlengine.Operators.Conditional import Nvl
  from vtlengine.Operators.General import Alias, Membership
- from vtlengine.Operators.HROperators import HREqual, HRGreater, HRGreaterEqual, HRLess, HRLessEqual, \
- HRBinPlus, HRBinMinus, HRUnPlus, HRUnMinus
+ from vtlengine.Operators.HROperators import (
+ HREqual,
+ HRGreater,
+ HRGreaterEqual,
+ HRLess,
+ HRLessEqual,
+ HRBinPlus,
+ HRBinMinus,
+ HRUnPlus,
+ HRUnMinus,
+ )
  from vtlengine.Operators.Join import Apply, CrossJoin, FullJoin, InnerJoin, LeftJoin
- from vtlengine.Operators.Numeric import AbsoluteValue, BinMinus, BinPlus, Ceil, Div, Exponential, Floor, \
- Logarithm, Modulo, Mult, NaturalLogarithm, Power, Round, SquareRoot, Trunc, UnMinus, UnPlus
+ from vtlengine.Operators.Numeric import (
+ AbsoluteValue,
+ BinMinus,
+ BinPlus,
+ Ceil,
+ Div,
+ Exponential,
+ Floor,
+ Logarithm,
+ Modulo,
+ Mult,
+ NaturalLogarithm,
+ Power,
+ Round,
+ SquareRoot,
+ Trunc,
+ UnMinus,
+ UnPlus,
+ )
  from vtlengine.Operators.RoleSetter import Attribute, Identifier, Measure
  from vtlengine.Operators.Set import Intersection, Setdiff, Symdiff, Union
- from vtlengine.Operators.String import Concatenate, Length, Lower, Ltrim, Replace, Rtrim, Substr, Trim, Upper
- from vtlengine.Operators.Time import Flow_to_stock, Period_indicator, Stock_to_flow, Fill_time_series, \
- Time_Shift
+ from vtlengine.Operators.String import (
+ Concatenate,
+ Length,
+ Lower,
+ Ltrim,
+ Replace,
+ Rtrim,
+ Substr,
+ Trim,
+ Upper,
+ )
+ from vtlengine.Operators.Time import (
+ Flow_to_stock,
+ Period_indicator,
+ Stock_to_flow,
+ Fill_time_series,
+ Time_Shift,
+ )

- from vtlengine.AST.Grammar.tokens import *
+ from vtlengine.AST.Grammar.tokens import (
+ MEMBERSHIP,
+ AND,
+ OR,
+ XOR,
+ EQ,
+ NEQ,
+ GT,
+ GTE,
+ LT,
+ LTE,
+ IN,
+ NOT_IN,
+ NVL,
+ PLUS,
+ MINUS,
+ MULT,
+ LOG,
+ MOD,
+ POWER,
+ DIV,
+ AS,
+ CONCAT,
+ TIMESHIFT,
+ CHARSET_MATCH,
+ NOT,
+ ABS,
+ EXP,
+ LN,
+ SQRT,
+ CEIL,
+ FLOOR,
+ ISNULL,
+ PERIOD_INDICATOR,
+ LEN,
+ LCASE,
+ LTRIM,
+ RTRIM,
+ TRIM,
+ UCASE,
+ FLOW_TO_STOCK,
+ STOCK_TO_FLOW,
+ ROUND,
+ TRUNC,
+ SUBSTR,
+ REPLACE,
+ FILL_TIME_SERIES,
+ IDENTIFIER,
+ ATTRIBUTE,
+ MEASURE,
+ CALC,
+ FILTER,
+ KEEP,
+ DROP,
+ RENAME,
+ PIVOT,
+ UNPIVOT,
+ SUBSPACE,
+ AGGREGATE,
+ APPLY,
+ UNION,
+ INTERSECT,
+ SYMDIFF,
+ SETDIFF,
+ MAX,
+ MIN,
+ SUM,
+ COUNT,
+ AVG,
+ MEDIAN,
+ STDDEV_POP,
+ STDDEV_SAMP,
+ VAR_POP,
+ VAR_SAMP,
+ LAG,
+ LEAD,
+ FIRST_VALUE,
+ LAST_VALUE,
+ RATIO_TO_REPORT,
+ RANK,
+ INNER_JOIN,
+ LEFT_JOIN,
+ FULL_JOIN,
+ CROSS_JOIN,
+ )

  BINARY_MAPPING = {
  # General
@@ -60,7 +224,7 @@ BINARY_MAPPING = {
  CONCAT: Concatenate,
  # Time
  TIMESHIFT: Time_Shift,
- CHARSET_MATCH: Match
+ CHARSET_MATCH: Match,
  }

  UNARY_MAPPING = {
@@ -88,7 +252,7 @@ UNARY_MAPPING = {
  # Time
  PERIOD_INDICATOR: Period_indicator,
  FLOW_TO_STOCK: Flow_to_stock,
- STOCK_TO_FLOW: Stock_to_flow
+ STOCK_TO_FLOW: Stock_to_flow,
  }

  PARAMETRIC_MAPPING = {
@@ -118,15 +282,10 @@ REGULAR_AGGREGATION_MAPPING = {
  UNPIVOT: Unpivot,
  SUBSPACE: Sub,
  AGGREGATE: Aggregate,
- APPLY: Apply
+ APPLY: Apply,
  }

- SET_MAPPING = {
- UNION: Union,
- INTERSECT: Intersection,
- SYMDIFF: Symdiff,
- SETDIFF: Setdiff
- }
+ SET_MAPPING = {UNION: Union, INTERSECT: Intersection, SYMDIFF: Symdiff, SETDIFF: Setdiff}

  AGGREGATION_MAPPING = {
  MAX: Max,
@@ -139,7 +298,6 @@ AGGREGATION_MAPPING = {
  STDDEV_SAMP: SampleStandardDeviation,
  VAR_POP: PopulationVariance,
  VAR_SAMP: SampleVariance,
-
  }

  ANALYTIC_MAPPING = {
@@ -158,18 +316,15 @@ ANALYTIC_MAPPING = {
  FIRST_VALUE: FirstValue,
  LAST_VALUE: LastValue,
  RATIO_TO_REPORT: RatioToReport,
- RANK: Rank
+ RANK: Rank,
  }

- THEN_ELSE = {
- 'then': 'T',
- 'else': 'E'
- }
+ THEN_ELSE = {"then": "T", "else": "E"}
  JOIN_MAPPING = {
  INNER_JOIN: InnerJoin,
  LEFT_JOIN: LeftJoin,
  FULL_JOIN: FullJoin,
- CROSS_JOIN: CrossJoin
+ CROSS_JOIN: CrossJoin,
  }

  HR_COMP_MAPPING = {
@@ -190,7 +345,7 @@ HR_NUM_BINARY_MAPPING = {
  HR_UNARY_MAPPING = {
  # Numeric
  PLUS: HRUnPlus,
- MINUS: HRUnMinus
+ MINUS: HRUnMinus,
  }

  HA_COMP_MAPPING = {
@@ -211,5 +366,5 @@ HA_NUM_BINARY_MAPPING = {
  HA_UNARY_MAPPING = {
  # Numeric
  PLUS: HRUnPlus,
- MINUS: HRUnMinus
+ MINUS: HRUnMinus,
  }
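
The mapping tables above tie VTL grammar tokens to the operator classes that implement them, so the interpreter can dispatch purely by dictionary lookup. A minimal, self-contained sketch of that dispatch pattern (toy tokens and classes, not vtlengine's real interfaces):

# Token-keyed operator dispatch, illustrated with hypothetical operator classes.
PLUS, MINUS = "+", "-"

class BinPlus:
    @staticmethod
    def evaluate(left, right):
        return left + right

class BinMinus:
    @staticmethod
    def evaluate(left, right):
        return left - right

BINARY_MAPPING = {PLUS: BinPlus, MINUS: BinMinus}

def apply_binary(token, left, right):
    # Look up the operator class registered for the token and delegate to it.
    return BINARY_MAPPING[token].evaluate(left, right)

print(apply_binary(PLUS, 2, 3))   # 5
print(apply_binary(MINUS, 2, 3))  # -1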
vtlengine/__init__.py CHANGED
@@ -1,3 +1,3 @@
  from vtlengine.API import semantic_analysis, run

- __all__ = ['semantic_analysis', 'run']
+ __all__ = ["semantic_analysis", "run"]
@@ -1,16 +1,25 @@
  from pathlib import Path
- # from time import time
+ from typing import Optional, Union
+
+ import pandas as pd

  from vtlengine.Model import Dataset
- from vtlengine.files.output._time_period_representation import \
- format_time_period_external_representation, TimePeriodRepresentation
+ from vtlengine.files.output._time_period_representation import (
+ format_time_period_external_representation,
+ TimePeriodRepresentation,
+ )
+

+ def save_datapoints(
+ time_period_representation: Optional[TimePeriodRepresentation],
+ dataset: Dataset,
+ output_path: Union[str, Path],
+ ) -> None:

- def save_datapoints(time_period_representation: TimePeriodRepresentation,
- dataset: Dataset, output_path: str | Path):
+ if dataset.data is None:
+ dataset.data = pd.DataFrame()
  if time_period_representation is not None:
  format_time_period_external_representation(dataset, time_period_representation)
-
  if isinstance(output_path, str):
  if output_path.endswith("/"):
  s3_file_output = output_path + f"{dataset.name}.csv"
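
The rewritten save_datapoints now normalises a missing DataFrame to an empty one before formatting, and keeps the branch that treats a trailing "/" on a string path as a prefix (e.g. an S3 folder) to which the dataset name is appended. A rough sketch of the same guard-and-branch pattern with a stand-in dataset class (illustrative names, not the package's own code):

import tempfile
from pathlib import Path
from typing import Optional, Union
import pandas as pd

class ToyDataset:  # stand-in for vtlengine.Model.Dataset
    def __init__(self, name: str, data: Optional[pd.DataFrame] = None):
        self.name = name
        self.data = data

def save_datapoints_sketch(dataset: ToyDataset, output_path: Union[str, Path]) -> None:
    if dataset.data is None:            # new guard: never write from a None DataFrame
        dataset.data = pd.DataFrame()
    if isinstance(output_path, str) and output_path.endswith("/"):
        target = output_path + f"{dataset.name}.csv"             # prefix-style target
    else:
        target = str(Path(output_path) / f"{dataset.name}.csv")  # assumed local-path branch
    dataset.data.to_csv(target, index=False)

save_datapoints_sketch(ToyDataset("DS_1"), Path(tempfile.mkdtemp()))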
@@ -8,23 +8,24 @@ from vtlengine.Model import Dataset, Scalar

  class TimePeriodRepresentation(Enum):
  # Time Period output format
- SDMX_GREGORIAN = 'sdmx_gregorian'
- SDMX_REPORTING = 'sdmx_reporting'
- VTL = 'vtl'
+ SDMX_GREGORIAN = "sdmx_gregorian"
+ SDMX_REPORTING = "sdmx_reporting"
+ VTL = "vtl"

  @classmethod
- def check_value(cls, value: str):
+ def check_value(cls, value: str) -> "TimePeriodRepresentation":
  if value not in cls._value2member_map_:
  raise Exception("Invalid Time Period Representation")
  return cls(value)


- def _format_vtl_representation(value: str):
+ def _format_vtl_representation(value: str) -> str:
  return TimePeriodHandler(value).vtl_representation()


- def format_time_period_external_representation(dataset: Dataset | Scalar,
- mode: TimePeriodRepresentation):
+ def format_time_period_external_representation(
+ dataset: Dataset | Scalar, mode: TimePeriodRepresentation
+ ) -> None:
  """
  From SDMX time period representation to standard VTL representation (no hyphen).
  'A': 'nothing to do',
@@ -48,7 +49,7 @@ def format_time_period_external_representation(dataset: Dataset | Scalar,
  for comp in dataset.components.values():
  if comp.data_type == TimePeriod:
  dataset.data[comp.name] = dataset.data[comp.name].map(
- _format_vtl_representation,
- na_action='ignore')
+ _format_vtl_representation, na_action="ignore"
+ )

  return
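
check_value validates a raw string against the enum's registered values before constructing a member, using Enum's built-in value-to-member lookup. The same pattern on a toy enum (sketch only):

from enum import Enum

class Mode(Enum):  # toy stand-in for TimePeriodRepresentation
    SDMX_REPORTING = "sdmx_reporting"
    VTL = "vtl"

    @classmethod
    def check_value(cls, value: str) -> "Mode":
        if value not in cls._value2member_map_:   # mapping of declared values to members
            raise ValueError(f"Invalid mode: {value!r}")
        return cls(value)

print(Mode.check_value("vtl"))      # Mode.VTL
# Mode.check_value("gregorian")    # would raise ValueError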
@@ -1,12 +1,22 @@
+ import warnings
  from csv import DictReader
  from pathlib import Path
- # from time import time
- from typing import Optional, Dict, Union
+
+ from typing import Optional, Dict, Union, Any, Type, List

  import numpy as np
  import pandas as pd
- from vtlengine.DataTypes import Date, TimePeriod, TimeInterval, Integer, Number, Boolean, Duration, \
- SCALAR_TYPES_CLASS_REVERSE
+ from vtlengine.DataTypes import (
+ Date,
+ TimePeriod,
+ TimeInterval,
+ Integer,
+ Number,
+ Boolean,
+ Duration,
+ SCALAR_TYPES_CLASS_REVERSE,
+ ScalarType,
+ )
  from vtlengine.DataTypes.TimeHandling import DURATION_MAPPING
  from vtlengine.files.parser._rfc_dialect import register_rfc
  from vtlengine.files.parser._time_checking import check_date, check_time_period, check_time
@@ -14,14 +24,14 @@ from vtlengine.files.parser._time_checking import check_date, check_time_period,
  from vtlengine.Exceptions import InputValidationException, SemanticError
  from vtlengine.Model import Component, Role, Dataset

- TIME_CHECKS_MAPPING = {
+ TIME_CHECKS_MAPPING: Dict[Type[ScalarType], Any] = {
  Date: check_date,
  TimePeriod: check_time_period,
- TimeInterval: check_time
+ TimeInterval: check_time,
  }


- def _validate_csv_path(components: Dict[str, Component], csv_path: Path):
+ def _validate_csv_path(components: Dict[str, Component], csv_path: Path) -> None:
  # GE1 check if the file is empty
  if not csv_path.exists():
  raise Exception(f"Path {csv_path} does not exist.")
@@ -29,8 +39,8 @@ def _validate_csv_path(components: Dict[str, Component], csv_path: Path):
  raise Exception(f"Path {csv_path} is not a file.")
  register_rfc()
  try:
- with open(csv_path, 'r') as f:
- reader = DictReader(f, dialect='rfc')
+ with open(csv_path, "r") as f:
+ reader = DictReader(f, dialect="rfc")
  csv_columns = reader.fieldnames

  except UnicodeDecodeError as error:
@@ -45,21 +55,24 @@ def _validate_csv_path(components: Dict[str, Component], csv_path: Path):
  ) from None

  if not csv_columns:
- raise InputValidationException(code='0-1-1-6', file=csv_path)
+ raise InputValidationException(code="0-1-1-6", file=csv_path)

  if len(list(set(csv_columns))) != len(csv_columns):
  duplicates = list(set([item for item in csv_columns if csv_columns.count(item) > 1]))
  raise Exception(f"Duplicated columns {', '.join(duplicates)} found in file.")

  comp_names = set([c.name for c in components.values() if c.role == Role.IDENTIFIER])
- comps_missing = [id_m for id_m in comp_names if id_m not in reader.fieldnames]
+ comps_missing: Union[str, List[str]] = (
+ [id_m for id_m in comp_names if id_m not in reader.fieldnames] if reader.fieldnames else []
+ )
  if comps_missing:
  comps_missing = ", ".join(comps_missing)
- raise InputValidationException(code='0-1-1-8', ids=comps_missing, file=str(csv_path.name))
+ raise InputValidationException(code="0-1-1-8", ids=comps_missing, file=str(csv_path.name))


- def _sanitize_pandas_columns(components: Dict[str, Component],
- csv_path: Union[str, Path], data: pd.DataFrame) -> pd.DataFrame:
+ def _sanitize_pandas_columns(
+ components: Dict[str, Component], csv_path: Union[str, Path], data: pd.DataFrame
+ ) -> pd.DataFrame:
  # Fast loading from SDMX-CSV
  if "DATAFLOW" in data.columns and data.columns[0] == "DATAFLOW":
  if "DATAFLOW" not in components:
@@ -75,11 +88,11 @@ def _sanitize_pandas_columns(components: Dict[str, Component],

  # Validate identifiers
  comp_names = set([c.name for c in components.values() if c.role == Role.IDENTIFIER])
- comps_missing = [id_m for id_m in comp_names if id_m not in data.columns]
+ comps_missing: Union[str, List[str]] = [id_m for id_m in comp_names if id_m not in data.columns]
  if comps_missing:
  comps_missing = ", ".join(comps_missing)
  file = csv_path if isinstance(csv_path, str) else csv_path.name
- raise InputValidationException(code='0-1-1-7', ids=comps_missing, file=file)
+ raise InputValidationException(code="0-1-1-7", ids=comps_missing, file=file)

  # Fill rest of components with null values
  for comp_name, comp in components.items():
@@ -94,47 +107,52 @@ def _pandas_load_csv(components: Dict[str, Component], csv_path: Path) -> pd.Dat
  obj_dtypes = {comp_name: np.object_ for comp_name, comp in components.items()}

  try:
- data = pd.read_csv(csv_path, dtype=obj_dtypes,
- engine='c',
- keep_default_na=False,
- na_values=[''])
- except UnicodeDecodeError as error:
+ data = pd.read_csv(
+ csv_path, dtype=obj_dtypes, engine="c", keep_default_na=False, na_values=[""]
+ )
+ except UnicodeDecodeError:
  raise InputValidationException(code="0-1-2-5", file=csv_path.name)

  return _sanitize_pandas_columns(components, csv_path, data)

+
  def _pandas_load_s3_csv(components: Dict[str, Component], csv_path: str) -> pd.DataFrame:
  obj_dtypes = {comp_name: np.object_ for comp_name, comp in components.items()}

  # start = time()
  try:
- data = pd.read_csv(csv_path, dtype=obj_dtypes,
- engine='c',
- keep_default_na=False,
- na_values=[''])
+ data = pd.read_csv(
+ csv_path, dtype=obj_dtypes, engine="c", keep_default_na=False, na_values=[""]
+ )

- except UnicodeDecodeError as error:
+ except UnicodeDecodeError:
  raise InputValidationException(code="0-1-2-5", file=csv_path)
  except Exception as e:
  raise InputValidationException(f"ERROR: {str(e)}, review file {str(csv_path)}")
-
- # print(f"Data loaded from {csv_path}, shape: {data.shape}")
- # end = time()
- # print(f"Time to load data from s3 URI: {end - start}")
-
  return _sanitize_pandas_columns(components, csv_path, data)

- def _parse_boolean(value: str):
+
+ def _parse_boolean(value: str) -> bool:
  if value.lower() == "true" or value == "1":
  return True
  return False


- def _validate_pandas(components: Dict[str, Component], data: pd.DataFrame,
- dataset_name: str) -> pd.DataFrame:
+ def _validate_pandas(
+ components: Dict[str, Component], data: pd.DataFrame, dataset_name: str
+ ) -> pd.DataFrame:
+ warnings.filterwarnings("ignore", category=FutureWarning)
  # Identifier checking
+
  id_names = [comp_name for comp_name, comp in components.items() if comp.role == Role.IDENTIFIER]

+ missing_columns = [name for name in components.keys() if name not in data.columns.tolist()]
+ if missing_columns:
+ for name in missing_columns:
+ if components[name].nullable is False:
+ raise SemanticError("0-1-1-10", name=dataset_name, comp_name=name)
+ data[name] = None
+
  for id_name in id_names:
  if data[id_name].isnull().any():
  raise SemanticError("0-1-1-4", null_identifier=id_name, name=dataset_name)
@@ -150,35 +168,42 @@ def _validate_pandas(components: Dict[str, Component], data: pd.DataFrame,

  for comp_name, comp in components.items():
  if comp.data_type in (Date, TimePeriod, TimeInterval):
- data[comp_name] = data[comp_name].map(TIME_CHECKS_MAPPING[comp.data_type],
- na_action='ignore')
+ data[comp_name] = data[comp_name].map(
+ TIME_CHECKS_MAPPING[comp.data_type], na_action="ignore"
+ )
  elif comp.data_type == Integer:
- data[comp_name] = data[comp_name].map(lambda x: Integer.cast(float(x)),
- na_action='ignore')
+ data[comp_name] = data[comp_name].map(
+ lambda x: Integer.cast(float(x)), na_action="ignore"
+ )
  elif comp.data_type == Number:
- data[comp_name] = data[comp_name].map(lambda x: float(x), na_action='ignore')
+ data[comp_name] = data[comp_name].map(lambda x: float(x), na_action="ignore")
  elif comp.data_type == Boolean:
- data[comp_name] = data[comp_name].map(lambda x: _parse_boolean(x),
- na_action='ignore')
+ data[comp_name] = data[comp_name].map(
+ lambda x: _parse_boolean(x), na_action="ignore"
+ )
  elif comp.data_type == Duration:
- values_correct = data[comp_name].map(
- lambda x: x.replace(" ", "") in DURATION_MAPPING, na_action='ignore').all()
+ values_correct = (
+ data[comp_name]
+ .map(lambda x: x.replace(" ", "") in DURATION_MAPPING, na_action="ignore")
+ .all()
+ )
  if not values_correct:
  raise ValueError(f"Duration values are not correct in column {comp_name}")
  else:
- data[comp_name] = data[comp_name].map(lambda x: str(x).replace('"', ''),
- na_action='ignore')
- data[comp_name] = data[comp_name].astype(np.object_, errors='raise')
- except ValueError as e:
- str_comp = SCALAR_TYPES_CLASS_REVERSE[comp.data_type]
+ data[comp_name] = data[comp_name].map(
+ lambda x: str(x).replace('"', ""), na_action="ignore"
+ )
+ data[comp_name] = data[comp_name].astype(np.object_, errors="raise")
+ except ValueError:
+ str_comp = SCALAR_TYPES_CLASS_REVERSE[comp.data_type] if comp else "Null"
  raise SemanticError("0-1-1-12", name=dataset_name, column=comp_name, type=str_comp)

  return data


- def load_datapoints(components: Dict[str, Component],
- dataset_name: str,
- csv_path: Optional[Union[Path, str]] = None):
+ def load_datapoints(
+ components: Dict[str, Component], dataset_name: str, csv_path: Optional[Union[Path, str]] = None
+ ) -> pd.DataFrame:
  if csv_path is None or (isinstance(csv_path, Path) and not csv_path.exists()):
  return pd.DataFrame(columns=list(components.keys()))
  elif isinstance(csv_path, str):
@@ -193,5 +218,5 @@ def load_datapoints(components: Dict[str, Component],
  return data


- def _fill_dataset_empty_data(dataset: Dataset):
+ def _fill_dataset_empty_data(dataset: Dataset) -> None:
  dataset.data = pd.DataFrame(columns=list(dataset.components.keys()))
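
Two pandas idioms recur throughout this loader: reading every column as object dtype with only the empty string treated as missing (keep_default_na=False plus na_values=[""]), and applying casts with na_action="ignore" so nulls pass through untouched. A small self-contained illustration with toy data (not the package's files):

import io
import numpy as np
import pandas as pd

csv_text = "Id_1,Me_1\nA,1\nB,\n"     # second row has an empty measure value
data = pd.read_csv(
    io.StringIO(csv_text),
    dtype={"Id_1": np.object_, "Me_1": np.object_},
    engine="c",
    keep_default_na=False,            # strings like "NA" or "null" stay literal
    na_values=[""],                   # only the empty string becomes NaN
)

# Cast the measure without touching the missing value.
data["Me_1"] = data["Me_1"].map(lambda x: float(x), na_action="ignore")
print(data)   # row A gets 1.0 in Me_1; row B keeps NaN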
@@ -6,9 +6,10 @@ class RFCDialect(csv.Dialect):
  https://docs.python.org/3/library/csv.html#csv.Dialect
  https://tools.ietf.org/html/rfc4180
  """
- delimiter = ','
+
+ delimiter = ","
  doublequote = True
- lineterminator = '\r\n'
+ lineterminator = "\r\n"
  quotechar = '"'
  quoting = csv.QUOTE_MINIMAL
  strict = True
@@ -16,6 +17,6 @@ class RFCDialect(csv.Dialect):
  skipinitialspace = False


- def register_rfc():
- """ Register the RFC dialect. """
- csv.register_dialect('rfc', RFCDialect)
+ def register_rfc() -> None:
+ """Register the RFC dialect."""
+ csv.register_dialect("rfc", RFCDialect)
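
register_rfc registers the dialect under the name "rfc" so later readers can refer to it by name, as _validate_csv_path does with DictReader(f, dialect="rfc"). A standalone sketch with the standard csv module (the dialect mirrors the one shown in the diff):

import csv
import io

class RFCDialect(csv.Dialect):
    # RFC 4180-style CSV: comma separated, CRLF line endings, minimal quoting.
    delimiter = ","
    doublequote = True
    lineterminator = "\r\n"
    quotechar = '"'
    quoting = csv.QUOTE_MINIMAL
    strict = True
    skipinitialspace = False

def register_rfc() -> None:
    csv.register_dialect("rfc", RFCDialect)

register_rfc()
reader = csv.DictReader(io.StringIO('Id_1,Me_1\r\n"A",1\r\n'), dialect="rfc")
print(list(reader))   # [{'Id_1': 'A', 'Me_1': '1'}]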