sibi-dst 0.3.11__tar.gz → 0.3.12__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. {sibi_dst-0.3.11 → sibi_dst-0.3.12}/PKG-INFO +4 -1
  2. {sibi_dst-0.3.11 → sibi_dst-0.3.12}/pyproject.toml +4 -1
  3. {sibi_dst-0.3.11 → sibi_dst-0.3.12}/sibi_dst/df_helper/_df_helper.py +5 -2
  4. {sibi_dst-0.3.11 → sibi_dst-0.3.12}/sibi_dst/df_helper/plugins/http/_http_config.py +2 -3
  5. {sibi_dst-0.3.11 → sibi_dst-0.3.12}/sibi_dst/df_helper/plugins/parquet/_parquet_filter_handler.py +4 -0
  6. {sibi_dst-0.3.11 → sibi_dst-0.3.12}/sibi_dst/df_helper/plugins/parquet/_parquet_options.py +2 -0
  7. {sibi_dst-0.3.11 → sibi_dst-0.3.12}/sibi_dst/utils/__init__.py +2 -1
  8. {sibi_dst-0.3.11 → sibi_dst-0.3.12}/sibi_dst/utils/_data_utils.py +91 -62
  9. sibi_dst-0.3.12/sibi_dst/utils/_data_wrapper.py +238 -0
  10. {sibi_dst-0.3.11 → sibi_dst-0.3.12}/sibi_dst/utils/_date_utils.py +130 -0
  11. {sibi_dst-0.3.11 → sibi_dst-0.3.12}/sibi_dst/utils/_df_utils.py +91 -0
  12. sibi_dst-0.3.11/sibi_dst/utils/_data_wrapper.py +0 -556
  13. {sibi_dst-0.3.11 → sibi_dst-0.3.12}/README.md +0 -0
  14. {sibi_dst-0.3.11 → sibi_dst-0.3.12}/sibi_dst/__init__.py +0 -0
  15. {sibi_dst-0.3.11 → sibi_dst-0.3.12}/sibi_dst/df_helper/__init__.py +0 -0
  16. {sibi_dst-0.3.11 → sibi_dst-0.3.12}/sibi_dst/df_helper/_parquet_artifact.py +0 -0
  17. {sibi_dst-0.3.11 → sibi_dst-0.3.12}/sibi_dst/df_helper/core/__init__.py +0 -0
  18. {sibi_dst-0.3.11 → sibi_dst-0.3.12}/sibi_dst/df_helper/core/_defaults.py +0 -0
  19. {sibi_dst-0.3.11 → sibi_dst-0.3.12}/sibi_dst/df_helper/core/_params_config.py +0 -0
  20. {sibi_dst-0.3.11 → sibi_dst-0.3.12}/sibi_dst/df_helper/core/_query_config.py +0 -0
  21. {sibi_dst-0.3.11 → sibi_dst-0.3.12}/sibi_dst/df_helper/plugins/__init__.py +0 -0
  22. {sibi_dst-0.3.11 → sibi_dst-0.3.12}/sibi_dst/df_helper/plugins/django/__init__.py +0 -0
  23. {sibi_dst-0.3.11 → sibi_dst-0.3.12}/sibi_dst/df_helper/plugins/django/_django_db_connection.py +0 -0
  24. {sibi_dst-0.3.11 → sibi_dst-0.3.12}/sibi_dst/df_helper/plugins/django/_django_load_from_db.py +0 -0
  25. {sibi_dst-0.3.11 → sibi_dst-0.3.12}/sibi_dst/df_helper/plugins/django/_django_sql_model_builder.py +0 -0
  26. {sibi_dst-0.3.11 → sibi_dst-0.3.12}/sibi_dst/df_helper/plugins/django/_io_dask.py +0 -0
  27. {sibi_dst-0.3.11 → sibi_dst-0.3.12}/sibi_dst/df_helper/plugins/django/_io_dask_alt.py +0 -0
  28. {sibi_dst-0.3.11 → sibi_dst-0.3.12}/sibi_dst/df_helper/plugins/http/__init__.py +0 -0
  29. {sibi_dst-0.3.11 → sibi_dst-0.3.12}/sibi_dst/df_helper/plugins/parquet/__init__.py +0 -0
  30. {sibi_dst-0.3.11 → sibi_dst-0.3.12}/sibi_dst/df_helper/plugins/sql_alchemy/__init__.py +0 -0
  31. {sibi_dst-0.3.11 → sibi_dst-0.3.12}/sibi_dst/df_helper/plugins/sql_alchemy/_io_sqlalchemy_dask.py +0 -0
  32. {sibi_dst-0.3.11 → sibi_dst-0.3.12}/sibi_dst/df_helper/plugins/sql_alchemy/_sqlachemy_filter_handler.py +0 -0
  33. {sibi_dst-0.3.11 → sibi_dst-0.3.12}/sibi_dst/df_helper/plugins/sql_alchemy/_sqlalchemy_db_connection.py +0 -0
  34. {sibi_dst-0.3.11 → sibi_dst-0.3.12}/sibi_dst/df_helper/plugins/sql_alchemy/_sqlalchemy_load_from_db.py +0 -0
  35. {sibi_dst-0.3.11 → sibi_dst-0.3.12}/sibi_dst/df_helper/plugins/sql_alchemy/_sqlalchemy_model_builder.py +0 -0
  36. {sibi_dst-0.3.11 → sibi_dst-0.3.12}/sibi_dst/df_helper/plugins/sql_model/__init__.py +0 -0
  37. {sibi_dst-0.3.11 → sibi_dst-0.3.12}/sibi_dst/df_helper/plugins/sql_model/_sqlmodel_db_connection.py +0 -0
  38. {sibi_dst-0.3.11 → sibi_dst-0.3.12}/sibi_dst/df_helper/plugins/sql_model/_sqlmodel_load_from_db.py +0 -0
  39. {sibi_dst-0.3.11 → sibi_dst-0.3.12}/sibi_dst/utils/_airflow_manager.py +0 -0
  40. {sibi_dst-0.3.11 → sibi_dst-0.3.12}/sibi_dst/utils/_clickhouse_writer.py +0 -0
  41. {sibi_dst-0.3.11 → sibi_dst-0.3.12}/sibi_dst/utils/_credentials.py +0 -0
  42. {sibi_dst-0.3.11 → sibi_dst-0.3.12}/sibi_dst/utils/_file_utils.py +0 -0
  43. {sibi_dst-0.3.11 → sibi_dst-0.3.12}/sibi_dst/utils/_filepath_generator.py +0 -0
  44. {sibi_dst-0.3.11 → sibi_dst-0.3.12}/sibi_dst/utils/_log_utils.py +0 -0
  45. {sibi_dst-0.3.11 → sibi_dst-0.3.12}/sibi_dst/utils/_parquet_saver.py +0 -0
  46. {sibi_dst-0.3.11 → sibi_dst-0.3.12}/sibi_dst/utils/_storage_manager.py +0 -0

PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sibi-dst
-Version: 0.3.11
+Version: 0.3.12
 Summary: Data Science Toolkit
 Author: Luis Valverde
 Author-email: lvalverdeb@gmail.com
@@ -9,6 +9,8 @@ Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Requires-Dist: apache-airflow-client (>=2.10.0,<3.0.0)
+Requires-Dist: chardet (>=5.2.0,<6.0.0)
+Requires-Dist: charset-normalizer (>=3.4.0,<4.0.0)
 Requires-Dist: clickhouse-connect (>=0.8.7,<0.9.0)
 Requires-Dist: clickhouse-driver (>=0.2.9,<0.3.0)
 Requires-Dist: dask[complete] (>=2024.11.1,<2025.0.0)
@@ -29,6 +31,7 @@ Requires-Dist: python-dotenv (>=1.0.1,<2.0.0)
 Requires-Dist: sqlmodel (>=0.0.22,<0.0.23)
 Requires-Dist: tornado (>=6.4.1,<7.0.0)
 Requires-Dist: tqdm (>=4.67.0,<5.0.0)
+Requires-Dist: uvicorn (>=0.32.1,<0.33.0)
 Description-Content-Type: text/markdown

 # sibi-dst

pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "sibi-dst"
-version = "0.3.11"
+version = "0.3.12"
 description = "Data Science Toolkit"
 authors = ["Luis Valverde <lvalverdeb@gmail.com>"]
 readme = "README.md"
@@ -29,6 +29,9 @@ pytest = "^8.3.3"
 clickhouse-connect = "^0.8.7"
 clickhouse-driver = "^0.2.9"
 paramiko = "^3.5.0"
+chardet = "^5.2.0"
+charset-normalizer = "^3.4.0"
+uvicorn = "^0.32.1"


 [build-system]

sibi_dst/df_helper/_df_helper.py
@@ -42,6 +42,7 @@ class DfHelper:
         self.dt_field=kwargs.setdefault("dt_field", None)
         self.as_pandas = kwargs.setdefault("as_pandas", False)
         kwargs.setdefault("live", True)
+        kwargs.setdefault("logger", self.logger)
         self.post_init(**kwargs)


@@ -211,6 +212,7 @@ class DfHelper:
     def save_to_parquet(self, parquet_filename: Optional[str] = None):
         ps = ParquetSaver(self.df, self.parquet_storage_path, self.logger)
         ps.save_to_parquet(parquet_filename)
+        self.logger.info(f"Parquet saved to {parquet_filename} in parquet storage: {self.parquet_storage_path}.")

     def save_to_clickhouse(self, database, table, order_by=None, **credentials):
         click_config ={
@@ -219,13 +221,14 @@ class DfHelper:
             'order_by': order_by or 'id',
         }
         credentials = {**credentials, **click_config}
-        cs=ClickHouseWriter(**credentials)
+        cs=ClickHouseWriter(logger=self.logger, **credentials)
         cs.save_to_clickhouse(self.df)
+        self.logger.info("Save to ClickHouse completed.")

     def _load_from_parquet(self, **options) -> Union[pd.DataFrame, dd.DataFrame]:
         self.df = self.plugin_parquet.load_files()
         if options:
-            self.df = ParquetFilterHandler().apply_filters_dask(self.df, options)
+            self.df = ParquetFilterHandler(logger=self.logger).apply_filters_dask(self.df, options)
         return self.df

     def load_period(self, **kwargs):
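
The hunks above, together with the plugin changes below, share one idea: each component now accepts an optional logger and falls back to a class-named default. A minimal sketch of that pattern follows; `SomePlugin` is a stand-in class for illustration, not an actual sibi_dst plugin:

    from sibi_dst.utils import Logger

    class SomePlugin:
        # Illustrative only: mirrors the `logger or Logger.default_logger(...)`
        # idiom this release applies to ClickHouseWriter, ParquetFilterHandler,
        # HttpConfig and ParquetConfig.
        def __init__(self, logger=None, **kwargs):
            self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)

    shared_logger = Logger.default_logger(logger_name="pipeline")
    plugin = SomePlugin(logger=shared_logger)  # one logger reused across components

Passing `logger=self.logger` from DfHelper, as the hunks above do, keeps all plugin output in a single log stream instead of each class creating its own logger.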

sibi_dst/df_helper/plugins/http/_http_config.py
@@ -13,11 +13,10 @@ class HttpConfig(BaseModel):
     api_key: Optional[SecretStr] = None
     model_config = ConfigDict(arbitrary_types_allowed=True)

-    def __init__(self, **data):
+    def __init__(self, logger=None, **data):
         super().__init__(**data)
         # Initialize the logger if not provided
-        if not self.logger:
-            self.logger = Logger(log_dir='./logs/', logger_name="HttpDataSource", log_file='http_data_source.log')
+        self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)

     async def fetch_data(self, **options) -> dd.DataFrame:
         """Asynchronously fetch JSON data from HTTP endpoint, substituting options into the URL path."""

sibi_dst/df_helper/plugins/parquet/_parquet_filter_handler.py
@@ -1,7 +1,11 @@
 import pandas as pd
 import dask.dataframe as dd
+from sibi_dst.utils import Logger

 class ParquetFilterHandler(object):
+    def __init__(self, logger=None):
+        self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
+
     @staticmethod
     def apply_filters_dask(df, filters):
         dt_operators = ['date', 'time']

sibi_dst/df_helper/plugins/parquet/_parquet_options.py
@@ -25,6 +25,8 @@ class ParquetConfig(BaseModel):
     @model_validator(mode='after')
     def check_parquet_params(self):
         # Configure paths based on fsspec
+        if self.logger is None:
+            self.logger = Logger.default_logger(logger_name=self.__class__.__name__)
         self.fs = fsspec.filesystem("file") if "://" not in str(self.parquet_storage_path) else fsspec.filesystem(str(self.parquet_storage_path).split("://")[0])

         # Validation for parquet path

sibi_dst/utils/__init__.py
@@ -1,7 +1,7 @@
 from __future__ import annotations
 from ._credentials import ConfigManager, ConfigLoader
 from ._log_utils import Logger
-from ._date_utils import DateUtils
+from ._date_utils import DateUtils, BusinessDays
 from ._data_utils import DataUtils
 from ._file_utils import FileUtils
 from ._filepath_generator import FilePathGenerator
@@ -17,6 +17,7 @@ __all__=[
     "ConfigLoader",
     "Logger",
     "DateUtils",
+    "BusinessDays",
     "FileUtils",
     "DataWrapper",
     "DataUtils",

sibi_dst/utils/_data_utils.py
@@ -7,6 +7,27 @@ class DataUtils:
     def __init__(self, logger=None):
         self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)

+    def transform_numeric_cols(self, df, columns, fill_value=0, dtype=int):
+        if not columns:
+            self.logger.warning('No columns specified')
+
+        columns = [column for column in columns if column in df.columns]
+        for col in columns:
+            if isinstance(df, dd.DataFrame):
+                # Replace NaN with 0, then convert to boolean
+                df[col] = df[col].map_partitions(
+                    lambda s: pd.to_numeric(s, errors='coerce')  # Convert to numeric, invalid to NaN
+                    .fillna(fill_value)  # Replace NaN with 0
+                    .astype(dtype),
+                    meta=(col, dtype)
+                )
+            else:
+                # For Pandas DataFrame, handle mixed types and invalid values
+                df[col] = pd.to_numeric(df[col], errors='coerce')  # Convert to numeric, invalid to NaN
+                df[col] = df[col].fillna(fill_value).astype(dtype)
+
+        return df
+
     @staticmethod
     def transform_numeric_columns(df, columns=None, fill_value=0, transform_func=None):
         """
@@ -14,6 +35,7 @@ class DataUtils:

         Parameters:
         - df (pandas.DataFrame or dask.dataframe.DataFrame): The DataFrame.
+        - columns (list of str, optional): Specific columns to transform. If None, all numeric columns are transformed.
         - fill_value (int or float): The value to replace NA values with.
         - transform_func (callable, optional): The transformation function to apply.
           If None, no additional transformation is applied.
@@ -28,31 +50,28 @@
         if not columns:
             return df

+        columns = [column for column in columns if column in df.columns]
         # Default transformation function (identity) if none is provided
         if transform_func is None:
             transform_func = lambda x: x

-        # Apply transformations
-        for col in columns:
-            dtype = df[col].dtype
-            if pd.api.types.is_integer_dtype(dtype):
-                meta_type = 'int64'
-            elif pd.api.types.is_float_dtype(dtype):
-                meta_type = 'float64'
-            else:
-                continue  # Skip non-numeric columns
+        # Batch processing for Dask
+        if isinstance(df, dd.DataFrame):
+            def transform_partition(partition):
+                # Apply transformations for all numeric columns in a single pass
+                partition[columns] = partition[columns].fillna(fill_value).map(transform_func)
+                return partition
+
+            # Apply the transformation function to all specified columns
+            df = df.map_partitions(transform_partition, meta=df)
+        else:
+            # Pandas: Vectorized operations for all specified columns
+            df[columns] = df[columns].fillna(fill_value).map(transform_func)

-            df[col] = df[col].fillna(fill_value).astype(meta_type)
-            if isinstance(df, dd.DataFrame):
-                df[col] = df[col].map_partitions(
-                    lambda s: s.apply(transform_func), meta=(col, meta_type)
-                )
-            else:
-                df[col] = df[col].apply(transform_func)
         return df

     @staticmethod
-    def transform_boolean_columns(df, columns=None, sample_size=100):
+    def transform_boolean_columns(df, columns=None):
         """
         Detect if the provided columns in a DataFrame (Pandas or Dask) contain only 0 and 1
         and convert them to boolean. Detection is performed using a sample.
@@ -96,73 +115,67 @@
         Returns:
         - pandas.DataFrame or dask.dataframe.DataFrame: Updated DataFrame with merged lookup data.
         """
-        # Check if the DataFrame is empty
+        # Return early if the DataFrame is empty
         if self.is_dataframe_empty(df):
             return df

-        # Extract required parameters with default values
-        source_col = kwargs.pop('source_col', None)
-        lookup_col = kwargs.pop('lookup_col', None)
-        lookup_description_col = kwargs.pop('lookup_description_col', None)
-        source_description_alias = kwargs.pop('source_description_alias', None)
-        fillna_source_description_alias = kwargs.pop('fillna_source_description_alias', False)
-        fieldnames = kwargs.get('fieldnames', None)
-        column_names = kwargs.get('column_names', None)
+        # Extract and validate required parameters
+        required_params = ['source_col', 'lookup_col', 'lookup_description_col', 'source_description_alias']
+        missing_params = [param for param in required_params if param not in kwargs]
+        if missing_params:
+            raise ValueError(f"Missing required parameters: {', '.join(missing_params)}")

-        # Validate required parameters
-        if not all([source_col, lookup_col, lookup_description_col, source_description_alias]):
-            raise ValueError(
-                'source_col, lookup_col, lookup_description_col, and source_description_alias must be specified'
-            )
+        source_col = kwargs.pop('source_col')
+        lookup_col = kwargs.pop('lookup_col')
+        lookup_description_col = kwargs.pop('lookup_description_col')
+        source_description_alias = kwargs.pop('source_description_alias')
+
+        # Optional parameters with default values
+        fillna_source_description_alias = kwargs.pop('fillna_source_description_alias', False)
+        fieldnames = kwargs.pop('fieldnames', (lookup_col, lookup_description_col))
+        column_names = kwargs.pop('column_names', ['temp_join_col', source_description_alias])

         if source_col not in df.columns:
-            self.logger.info(f'{source_col} not in DataFrame columns')
+            self.logger.info(f"{source_col} not in DataFrame columns")
             return df

         # Get unique IDs from source column
         ids = df[source_col].dropna().unique()
         if isinstance(ids, dd.Series):
             ids = ids.compute()
-        ids = ids.tolist()
-
-        if not ids:
-            self.logger.info(f'No IDs found in the source column: {source_col}')
+        if not len(ids):
+            self.logger.info(f"No IDs found in the source column: {source_col}")
             return df
-
-        # Set default fieldnames and column_names if not provided
-        if fieldnames is None:
-            kwargs['fieldnames'] = (lookup_col, lookup_description_col)
-        if column_names is None:
-            kwargs['column_names'] = ['temp_join_col', source_description_alias]
-
+        ids = sorted(ids.tolist())
         # Prepare kwargs for loading lookup data
         load_kwargs = kwargs.copy()
-        load_kwargs[f'{lookup_col}__in'] = ids
-
+        load_kwargs.update({
+            'fieldnames': fieldnames,
+            'column_names': column_names,
+            f'{lookup_col}__in': ids
+        })
         # Load lookup data
-        lookup_instance = classname()
+        lookup_instance = classname(debug=True, verbose_debug=True)
         result = lookup_instance.load(**load_kwargs)
-
+        if len(result.index) == 0:
+            self.logger.info(f"No IDs found in the source column: {source_col}")
+            return df
         # Determine the join column on the result DataFrame
-        if 'temp_join_col' in kwargs.get("column_names", []):
-            temp_join_col = 'temp_join_col'
-        else:
-            temp_join_col = lookup_col
+        temp_join_col = 'temp_join_col' if 'temp_join_col' in column_names else lookup_col

         # Merge DataFrames
         df = df.merge(result, how='left', left_on=source_col, right_on=temp_join_col)

         if fillna_source_description_alias and source_description_alias in df.columns:
-            df[source_description_alias] = df[source_description_alias].fillna('')
+            df[source_description_alias]=df[source_description_alias].fillna('')

         # Drop temp_join_col if present
-        if 'temp_join_col' in df.columns:
-            df = df.drop(columns='temp_join_col')
+        df = df.drop(columns='temp_join_col', errors='ignore')

         return df

-    @staticmethod
-    def is_dataframe_empty(df):
+
+    def is_dataframe_empty(self, df):
         """
         Check if a DataFrame (Pandas or Dask) is empty.

@@ -173,14 +186,30 @@
         - bool: True if the DataFrame is empty, False otherwise.
         """
         if isinstance(df, dd.DataFrame):
-            df_size = df.map_partitions(len).sum().compute()
-            return df_size == 0
-        else:
+            try:
+                return len(df.index) == 0
+            except Exception as e:
+                self.logger.error(f"Error while processing Dask DataFrame: {e}")
+                return False
+        elif isinstance(df, pd.DataFrame):
             return df.empty
+        else:
+            self.logger.error("Input must be a pandas or dask DataFrame.")
+            return False

     @staticmethod
-    def convert_to_datetime(df, date_fields):
+    def convert_to_datetime_dask(df, date_fields):
+        """
+        Convert specified columns in a Dask DataFrame to datetime, handling errors gracefully.
+
+        Parameters:
+        - df (dask.dataframe.DataFrame): The Dask DataFrame containing the columns.
+        - date_fields (list of str): List of column names to convert to datetime.
+
+        Returns:
+        - dask.dataframe.DataFrame: Updated DataFrame with specified columns converted to datetime.
+        """
         for col in date_fields:
             if col in df.columns:
-                df[col] = pd.to_datetime(df[col], errors='coerce')
-        return df
+                df[col] = df[col].map_partitions(pd.to_datetime, errors="coerce", meta=(col, "datetime64[ns]"))
+        return df
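
For orientation, a minimal sketch of how the reworked DataUtils helpers might be called; the column names and sample data below are invented for illustration and are not part of the package:

    import dask.dataframe as dd
    import pandas as pd
    from sibi_dst.utils import DataUtils

    # Toy frame with messy numerics and a text-typed date column (illustrative data).
    pdf = pd.DataFrame({
        "qty": ["1", "2", None, "x"],
        "created_at": ["2024-01-01", "not-a-date", None, "2024-02-15"],
    })
    ddf = dd.from_pandas(pdf, npartitions=2)

    du = DataUtils()
    # Coerce to numeric, fill invalid/missing values with 0, cast to int.
    ddf = du.transform_numeric_cols(ddf, columns=["qty"], fill_value=0, dtype=int)
    # Parse dates per partition; unparseable values become NaT.
    ddf = DataUtils.convert_to_datetime_dask(ddf, date_fields=["created_at"])
    print(ddf.compute())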

sibi_dst/utils/_data_wrapper.py (new in 0.3.12)
@@ -0,0 +1,238 @@
+import datetime
+from typing import Type, Any, Dict, Optional
+import fsspec
+import pandas as pd
+from IPython.display import display
+from sibi_dst.utils import Logger
+from tqdm import tqdm
+from sibi_dst.utils import ParquetSaver
+
+class DataWrapper:
+    DEFAULT_MAX_AGE_MINUTES = 1440
+    DEFAULT_HISTORY_DAYS_THRESHOLD = 30
+
+    def __init__(self,
+                 dataclass: Type,
+                 date_field: str,
+                 data_path: str,
+                 parquet_filename: str,
+                 start_date: Any,
+                 end_date: Any,
+                 filesystem_type: str = "file",
+                 filesystem_options: Optional[Dict] = None,
+                 verbose: bool = False,
+                 class_params: Optional[Dict] = None,
+                 load_params: Optional[Dict] = None,
+                 reverse_order: bool = False,
+                 overwrite: bool = False,
+                 ignore_missing: bool = False,
+                 logger: Optional[Logger] = None,
+                 max_age_minutes: int = DEFAULT_MAX_AGE_MINUTES,
+                 history_days_threshold: int = DEFAULT_HISTORY_DAYS_THRESHOLD,
+                 show_progress: bool = False):
+        self.dataclass = dataclass
+        self.date_field = date_field
+        self.data_path = self.ensure_forward_slash(data_path)
+        self.parquet_filename = parquet_filename
+        self.filesystem_type = filesystem_type
+        self.filesystem_options = filesystem_options or {}
+        self.fs = fsspec.filesystem(filesystem_type, **self.filesystem_options)
+        self.verbose = verbose
+        self.class_params = class_params or {}
+        self.load_params = load_params or {}
+        self.reverse_order = reverse_order
+        self.overwrite = overwrite
+        self.ignore_missing = ignore_missing
+        self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
+        self.max_age_minutes = max_age_minutes
+        self.history_days_threshold = history_days_threshold
+        self.show_progress = show_progress
+
+        self.start_date = self.convert_to_date(start_date)
+        self.end_date = self.convert_to_date(end_date)
+
+    @staticmethod
+    def convert_to_date(date: Any) -> datetime.date:
+        if isinstance(date, datetime.date):
+            return date
+        try:
+            return pd.to_datetime(date).date()
+        except ValueError as e:
+            raise ValueError(f"Error converting {date} to datetime: {e}")
+
+    @staticmethod
+    def ensure_forward_slash(path: str) -> str:
+        return path if path.endswith('/') else path + '/'
+
+    def generate_date_range(self):
+        """Generate a range of dates between start_date and end_date."""
+        date_range = pd.date_range(self.start_date, self.end_date, freq='D')
+        if self.reverse_order:
+            date_range = date_range[::-1]
+        for date in date_range:
+            yield date.date()
+
+    def process(self):
+        """Execute the update plan following the specified hierarchy."""
+        update_plan_table = self.generate_update_plan_with_conditions()
+
+        # Display the update plan table to the user if show_progress is True
+        if self.show_progress:
+            display(update_plan_table)
+
+        # Process files according to the hierarchy, considering only `update_required` dates
+        for category, description in [
+            ("overwrite", "Processing files due to overwrite=True"),
+            ("history_days", "Processing files within history_days_threshold"),
+            ("missing_files", "Processing missing files")
+        ]:
+            # Filter dates in the category where `update_required` is True
+            dates_to_process = update_plan_table[
+                (update_plan_table["update_category"] == category) & (update_plan_table["update_required"])
+            ]["date"].tolist()
+
+            date_iterator = dates_to_process
+            if self.show_progress:
+                date_iterator = tqdm(date_iterator, desc=description, unit="date")
+
+            for current_date in date_iterator:
+                self.process_date(current_date)
+
+    def is_file_older_than(self, file_path: str) -> bool:
+        """
+        Check if a file is older than the specified max_age_minutes.
+        """
+        try:
+            info = self.fs.info(file_path)
+            file_modification_time = info['mtime']
+            file_modification_datetime = datetime.datetime.fromtimestamp(
+                file_modification_time, tz=datetime.timezone.utc
+            )
+            current_time = datetime.datetime.now(datetime.timezone.utc)
+            file_age_minutes = (current_time - file_modification_datetime).total_seconds() / 60
+
+            if self.verbose:
+                self.logger.info(
+                    f"File {file_path} is {round(file_age_minutes, 2)} minutes old "
+                    f"(threshold: {self.max_age_minutes} minutes)"
+                )
+
+            return file_age_minutes > self.max_age_minutes
+        except FileNotFoundError:
+            return True  # Treat missing files as old
+
+    def process_date(self, date: datetime.date):
+        """Process a specific date by regenerating data as necessary."""
+        folder = f'{self.data_path}{date.year}/{date.month:02d}/{date.day:02d}/'
+        full_parquet_filename = f"{folder}{self.parquet_filename}"
+
+        start_time = datetime.datetime.now()
+
+        if self.verbose:
+            self.logger.info(f"Processing {full_parquet_filename}...")
+
+        data_object = self.dataclass(**self.class_params)
+        df = data_object.load_period(dt_field=self.date_field, start=date, end=date)
+
+        if len(df.index)==0:
+            if self.verbose:
+                self.logger.info("No data found for the specified date.")
+            return
+
+        parquet_saver = ParquetSaver(df, folder, self.logger)
+        parquet_saver.save_to_parquet(self.parquet_filename, clear_existing=True)
+
+        end_time = datetime.datetime.now()
+        duration_seconds = (end_time - start_time).total_seconds()
+
+        if self.verbose:
+            self.logger.info(
+                f"Data saved to {full_parquet_filename}. Processing time: {duration_seconds:.2f} seconds"
+            )
+
+    def generate_update_plan_with_conditions(self):
+        """
+        Generate an update plan that evaluates files based on the specified hierarchy:
+        1. Overwrite (all files regenerated).
+        2. History threshold: Files within `history_days_threshold` are evaluated for `max_age_minutes`.
+        3. Missing files: Detect missing files, ignoring future dates.
+        """
+        rows = []
+
+        today = datetime.date.today()
+        history_start_date = today - datetime.timedelta(days=self.history_days_threshold)
+
+        date_range = self.generate_date_range()
+        if self.show_progress:
+            date_range = tqdm(date_range, desc=f"Evaluating update plan {self.__class__.__name__}", unit="date")
+
+        for current_date in date_range:
+            folder = f'{self.data_path}{current_date.year}/{current_date.month:02d}/{current_date.day:02d}/'
+            full_parquet_filename = f"{folder}{self.parquet_filename}"
+
+            file_exists = self.fs.exists(full_parquet_filename)
+            within_history = history_start_date <= current_date <= today
+            missing_file = not file_exists and not self.ignore_missing
+            category = None
+
+            # Hierarchy 1: Overwrite
+            if self.overwrite:
+                category = "overwrite"
+                update_required = True
+            # Hierarchy 2: History threshold evaluation
+            elif within_history:
+                if self.is_file_older_than(full_parquet_filename):
+                    category = "history_days"
+                    update_required = True
+                else:
+                    update_required = False
+            # Hierarchy 3: Missing files
+            elif missing_file and current_date <= today:
+                category = "missing_files"
+                update_required = True
+            else:
+                update_required = False
+
+            # Collect condition descriptions for the update plan table
+            rows.append({
+                "date": current_date,
+                "file_exists": file_exists,
+                "within_history": within_history,
+                "missing_file": missing_file,
+                "update_required": update_required,
+                "update_category": category
+            })
+
+        update_plan_table = pd.DataFrame(rows)
+        return update_plan_table
+
+
+
+# # Usage:
+# # wrapper = DataWrapper(
+# #     dataclass=YourDataClass,
+# #     date_field="created_at",
+# #     data_path="/path/to/data",
+# #     parquet_filename="data.parquet",
+# #     start_date="2022-01-01",
+# #     end_date="2022-12-31",
+# #     filesystem_type="file",
+# #     verbose=True
+# # )
+# # wrapper.process()
+# # wrapper = DataWrapper(
+# #     dataclass=YourDataClass,
+# #     date_field="created_at",
+# #     data_path="s3://your-bucket-name/path/to/data",
+# #     parquet_filename="data.parquet",
+# #     start_date="2022-01-01",
+# #     end_date="2022-12-31",
+# #     filesystem_type="s3",
+# #     filesystem_options={
+# #         "key": "your_aws_access_key",
+# #         "secret": "your_aws_secret_key",
+# #         "client_kwargs": {"endpoint_url": "https://s3.amazonaws.com"}
+# #     },
+# #     verbose=True
+# #)
+# #wrapper.process()