sibi_dst-2025.1.9-py3-none-any.whl → sibi_dst-2025.1.10-py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
@@ -49,6 +49,7 @@ class ArtifactUpdaterMultiWrapperThreaded(ManagedResource):
         self.completion_times: Dict[str, float] = {}
         self.failed: List[str] = []
         self.original_classes: List[Type] = []
+        self.logger.info("ArtifactUpdaterMultiWrapperThreaded initialized")
 
     def get_artifact_classes(self, data_type: str) -> List[Type]:
         """Retrieve artifact classes by data type."""
@@ -270,6 +271,7 @@ class ArtifactUpdaterMultiWrapperAsync(ManagedResource):
         self.completion_times: Dict[str, float] = {}
         self.failed: List[str] = []
         self.original_classes: List[Type] = []
+        self.logger.info("ArtifactUpdaterMultiWrapperAsync initialized")
 
     def get_artifact_classes(self, data_type: str) -> List[Type]:
         """
@@ -28,6 +28,7 @@ class BaseBackend:
         self.logger = helper.logger
         self.debug = helper.debug
         self.total_records = helper.total_records  # no records loaded yet
+        self._entered = helper._entered  # Track if the helper is used in a context manager
 
     def load(self, **options) -> tuple[Any, Any] | Union[dd.DataFrame | pd.DataFrame]:
        """Synchronous data loading method. Must be implemented by sync backends."""
@@ -67,7 +68,7 @@ class ParquetBackend(BaseBackend):
             df = self.helper.backend_parquet.load_files()
             if options and df is not None:
                 df = FilterHandler('dask', logger=self.logger, debug=False).apply_filters(df, filters=options)
-            self.total_records = len(df)
+            self.total_records = len(df) or -1  # If df is empty, set total_records to -1
             return self.total_records, df
         except Exception as e:
             self.total_records = -1  # Reset total_records on failure
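Note: the `or -1` relies on `0` being falsy in Python, so an empty result is reported with the same `-1` sentinel as a failed load, while positive counts pass through unchanged:

    # `count or -1` substitutes -1 exactly when count == 0.
    def records_or_sentinel(count: int) -> int:
        return count or -1

    assert records_or_sentinel(0) == -1   # empty frame -> sentinel
    assert records_or_sentinel(42) == 42  # real counts unchanged

One consequence is that callers can no longer distinguish an empty-but-successful load from the exception path, which also sets `-1`.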
@@ -105,6 +106,12 @@ class DfHelper(ManagedResource):
         'http': HttpBackend,
     }
 
+    _BACKEND_ATTR_MAP = {
+        'sqlalchemy': 'backend_db_connection',
+        'parquet': 'backend_parquet',
+        'http': 'backend_http',
+    }
+
     default_config: Dict = None
 
     def __init__(self, backend='sqlalchemy', **kwargs):
@@ -140,9 +147,15 @@ class DfHelper(ManagedResource):
         super().__exit__(exc_type, exc_value, traceback)
 
     def _cleanup(self):
-        active_config = getattr(self, f"backend_{self.backend}", None)
+        attr_name = self._BACKEND_ATTR_MAP.get(self.backend)
+        if not attr_name:
+            self.logger.warning(f"No attribute mapping found for backend '{self.backend}'. Cleanup skipped.")
+            return
+        # Get the actual config object (e.g., self.backend_db_connection)
+        active_config = getattr(self, attr_name, None)
+
         if active_config and hasattr(active_config, "close"):
-            self.logger.debug(f"Closing resources for '{self.backend}' backend.")
+            self.logger.debug(f"Closing resources for '{self.backend}' backend using attribute '{attr_name}'.")
             active_config.close()
 
     def _get_config(self, model: T, kwargs: Dict[str, Any]) -> T:
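Note: the old f-string lookup assumed every backend stored its config at `backend_<name>`, but `_BACKEND_ATTR_MAP` above shows the sqlalchemy config actually lives at `backend_db_connection`, which suggests the old `getattr(self, "backend_sqlalchemy", None)` returned `None` and the connection was never closed. A stand-alone illustration of the fix (names are stand-ins):

    _BACKEND_ATTR_MAP = {'sqlalchemy': 'backend_db_connection'}

    class Demo:
        backend = 'sqlalchemy'
        backend_db_connection = object()  # stand-in for the real config

    demo = Demo()
    # Old lookup misses the attribute entirely:
    assert getattr(demo, f"backend_{demo.backend}", None) is None
    # Explicit mapping finds it:
    assert getattr(demo, _BACKEND_ATTR_MAP[demo.backend], None) is not None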
@@ -156,6 +169,10 @@ class DfHelper(ManagedResource):
         self.total_records, df = self.backend_strategy.load(**options)
         df = self._process_loaded_data(df)
         df = self._post_process_df(df)
+        if not self._entered:
+            self.logger.warning(
+                "DfHelper instance was not used in a context manager; cleanup is being called manually.")
+            self._cleanup()
         return df.compute() if as_pandas else df
 
     async def aload(self, as_pandas=False, **options) -> Union[pd.DataFrame, dd.DataFrame]:
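Note: with this change, the context-manager form remains the intended usage, while a bare `load()` call now warns and cleans up after itself. An illustrative usage sketch (the constructor keywords shown are examples, not taken from this diff):

    # Preferred: cleanup happens in __exit__, no warning is logged.
    with DfHelper(backend='parquet', parquet_storage_path='/data/events') as helper:
        df = helper.load(as_pandas=True)

    # Tolerated: load() detects the missing context manager, logs the
    # warning above, and calls _cleanup() itself before returning.
    helper = DfHelper(backend='parquet', parquet_storage_path='/data/events')
    df = helper.load(as_pandas=True)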
@@ -200,7 +217,11 @@ class DfHelper(ManagedResource):
             self.logger.warning("Cannot save to parquet; DataFrame is empty.")
             return
         fs = kwargs.pop('fs', self.fs)
-        path = kwargs.pop('parquet_storage_path', self.backend_parquet.parquet_storage_path)
+        if not fs:
+            raise ValueError("Filesystem (fs) must be provided to save to parquet.")
+        path = kwargs.pop('parquet_storage_path', None)
+        if not path:
+            raise ValueError("parquet_storage_path must be provided to save to parquet.")
         writer_config = {
             'df_result': df,
             'parquet_storage_path': path,
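Note: the save path no longer falls back to `self.backend_parquet.parquet_storage_path`, which only exists when the parquet backend is active; both the filesystem and the target path must now be supplied explicitly. A hedged call sketch (the method name is not visible in this hunk; `save_to_parquet` is assumed):

    import fsspec

    fs = fsspec.filesystem('file')  # or s3, gcs, etc.
    helper.save_to_parquet(df, fs=fs, parquet_storage_path='/data/output/events')

    # Omitting the path now fails fast instead of reusing backend state:
    # ValueError: parquet_storage_path must be provided to save to parquet.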
@@ -4,8 +4,8 @@ from typing import Optional, List
 
 import dask.dataframe as dd
 import fsspec
-from pydantic import BaseModel, model_validator, DirectoryPath, FilePath, ConfigDict
-
+import pandas as pd
+from pydantic import BaseModel, model_validator, ConfigDict
 from sibi_dst.utils import FilePathGenerator
 from sibi_dst.utils import Logger
 
@@ -93,7 +93,7 @@ class ParquetConfig(BaseModel):
         self.parquet_storage_path = self.parquet_storage_path.rstrip('/')
         if not self.fs.exists(self.parquet_storage_path):
             self.fs.mkdirs(self.parquet_storage_path, exist_ok=True)
-            #raise ValueError('Parquet storage path does not exist')
+            # raise ValueError('Parquet storage path does not exist')
         self.load_parquet = False
         if self.parquet_filename is not None:
             self.parquet_full_path = self.ensure_file_extension(
@@ -184,11 +184,36 @@ class ParquetConfig(BaseModel):
         :return: A Dask DataFrame containing loaded parquet file data.
         :rtype: dask.dataframe.DataFrame
         """
-        if self.load_parquet:
-            if self.parquet_folder_list:
-                return dd.read_parquet(self.parquet_folder_list, engine="pyarrow", filesystem=self.fs)
-            else:
-                return dd.read_parquet(self.parquet_full_path, engine="pyarrow", filesystem=self.fs)
+        if not self.load_parquet:
+            self.logger.warning("Parquet loading is disabled. Returning empty DataFrame.")
+            return dd.from_pandas(pd.DataFrame(), npartitions=1)
+
+        paths_to_load = []
+        if self.parquet_folder_list:
+            # Filter out any None values from the list
+            paths_to_load = [p for p in self.parquet_folder_list if p is not None]
+        elif self.parquet_full_path:
+            # Treat the single path as a list with one item
+            paths_to_load = [self.parquet_full_path]
+
+        if not paths_to_load:
+            self.logger.warning("No valid parquet file paths were provided. Returning empty DataFrame.")
+            return dd.from_pandas(pd.DataFrame(), npartitions=1)
+
+        try:
+            self.logger.debug(f"Attempting to load Parquet data from: {paths_to_load}")
+            return dd.read_parquet(
+                paths_to_load,
+                engine="pyarrow",
+                filesystem=self.fs,
+                exclude=["_*", ".*"]
+            )
+        except Exception as e:
+            # This robust error handling is excellent.
+            self.logger.error(f"Parquet loading failed for paths {paths_to_load}: {e}", exc_info=True)
+            self.logger.warning("Returning empty DataFrame due to loading error.")
+            return dd.from_pandas(pd.DataFrame(), npartitions=1)
+
 
     @staticmethod
     def ensure_file_extension(filepath: str, extension: str) -> str:
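Note: every exit path of `load_files` now returns a Dask DataFrame, degrading to an empty frame instead of returning `None` or raising. The fallback is the one-liner used three times above:

    import pandas as pd
    import dask.dataframe as dd

    # An empty single-partition Dask frame: downstream .compute(), len(),
    # and filter calls keep working instead of hitting a None.
    def empty_ddf() -> dd.DataFrame:
        return dd.from_pandas(pd.DataFrame(), npartitions=1)

    assert len(empty_ddf().compute()) == 0

Combined with the `ParquetBackend` change above, an empty frame surfaces as `total_records = -1` rather than an exception.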
@@ -172,7 +172,7 @@ class SqlAlchemyConnectionConfig(BaseModel):
             return
 
         engine_wrapper['ref_count'] -= 1
-        self.logger.debug(f"Closing config. Ref count is now {engine_wrapper['ref_count']}.")
+        self.logger.debug(f"Closing connection within engine wrapper. Ref count is now {engine_wrapper['ref_count']}.")
 
         if engine_wrapper['ref_count'] <= 0:
             self.logger.debug(f"Disposing engine as reference count is zero. Key: {key}")
@@ -153,44 +153,44 @@ class DataWrapper(ManagedResource):
         # Create a copy to avoid mutating the shared instance dictionary
         local_load_params = self.load_params.copy()
         local_load_params.update(date_filter)
-        local_class_instance = self.dataclass(**self.class_params)
-        df = local_class_instance.load(**local_load_params)
-        load_time = time.perf_counter() - load_start
-
-        if hasattr(local_class_instance, "total_records"):
-            self.logger.debug(
-                f"Total records loaded by {local_class_instance.__class__.__name__}: {local_class_instance.total_records}")
-            if int(local_class_instance.total_records) == 0:  # If no records were loaded but not due to an error
-                if self.mmanifest:
-                    self.mmanifest.record(
+        with self.dataclass(**self.class_params) as local_class_instance:
+            df = local_class_instance.load(**local_load_params)
+            load_time = time.perf_counter() - load_start
+
+            if hasattr(local_class_instance, "total_records"):
+                self.logger.debug(
+                    f"Total records loaded by {local_class_instance.__class__.__name__}: {local_class_instance.total_records}")
+                if int(local_class_instance.total_records) == 0:  # If no records were loaded but not due to an error
+                    if self.mmanifest:
+                        self.mmanifest.record(
                             full_path=path
                         )
-                self.logger.info(f"No data found for {full_path}. Logged to missing manifest.")
-            elif int(local_class_instance.total_records) < 0:
-                self.logger.warning(
-                    f"Negative record count ({local_class_instance.total_records}) for {full_path}. "
-                    "This may indicate an error in the data loading process."
-                )
-            else:
-                save_start = time.perf_counter()
-                parquet_params = {
-                    "df_result": df,
-                    "parquet_storage_path": path,
-                    "fs": self.fs,
-                    "logger": self.logger,
-                    "debug": self.debug,
-                }
-                with ParquetSaver(**parquet_params) as ps:
-                    ps.save_to_parquet(self.parquet_filename, overwrite=True)
-                save_time = time.perf_counter() - save_start
-
-                total_time = time.perf_counter() - overall_start
-                self.benchmarks[date] = {
-                    "load_duration": load_time,
-                    "save_duration": save_time,
-                    "total_duration": total_time
-                }
-                self._log_success(date, total_time, full_path)
+                    self.logger.info(f"No data found for {full_path}. Logged to missing manifest.")
+                elif int(local_class_instance.total_records) < 0:
+                    self.logger.warning(
+                        f"Negative record count ({local_class_instance.total_records}) for {full_path}. "
+                        "This may indicate an error in the data loading process."
+                    )
+                else:
+                    save_start = time.perf_counter()
+                    parquet_params = {
+                        "df_result": df,
+                        "parquet_storage_path": path,
+                        "fs": self.fs,
+                        "logger": self.logger,
+                        "debug": self.debug,
+                    }
+                    with ParquetSaver(**parquet_params) as ps:
+                        ps.save_to_parquet(self.parquet_filename, overwrite=True)
+                    save_time = time.perf_counter() - save_start
+
+                    total_time = time.perf_counter() - overall_start
+                    self.benchmarks[date] = {
+                        "load_duration": load_time,
+                        "save_duration": save_time,
+                        "total_duration": total_time
+                    }
+                    self._log_success(date, total_time, full_path)
         except Exception as e:
             self._log_failure(date, e)
             raise
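Note: wrapping the whole load-and-save body in `with self.dataclass(**self.class_params) as local_class_instance:` guarantees the helper's `__exit__` runs even when `load()` or the parquet save raises; the `except` clause still sees the original exception. The guarantee reduces to the standard context-manager contract, sketched here with a stand-in class:

    class StubHelper:  # stand-in for self.dataclass(**self.class_params)
        def __enter__(self):
            return self

        def __exit__(self, exc_type, exc_value, traceback):
            print('cleanup runs even on failure')
            return False  # propagate the exception

    try:
        with StubHelper() as helper:
            raise RuntimeError('load failed')
    except RuntimeError:
        pass  # cleanup already ran before the handler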
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sibi-dst
-Version: 2025.1.9
+Version: 2025.1.10
 Summary: Data Science Toolkit
 Author: Luis Valverde
 Author-email: lvalverdeb@gmail.com
@@ -1,7 +1,7 @@
 sibi_dst/__init__.py,sha256=j8lZpGCJlxlLgEgeIMxZnWdqJ0g3MCs7-gsnbvPn_KY,285
 sibi_dst/df_helper/__init__.py,sha256=Jur_MO8RGPkVw0CS3XH5YIWv-d922DC_FwRDTvHHV6Y,432
-sibi_dst/df_helper/_artifact_updater_multi_wrapper.py,sha256=10EkCYEfoWwTQbS-ahYWo6TvbtNXM8p0UqqDu0gTuyI,17426
-sibi_dst/df_helper/_df_helper.py,sha256=iBoWz2iVgLzQ3hA1EwllL62dkraKamRx2sXseu30FVI,11914
+sibi_dst/df_helper/_artifact_updater_multi_wrapper.py,sha256=pSSw3N_ZNZCZHAiChbsF_ECyCmz0L2xCgvt9srHtPOM,17575
+sibi_dst/df_helper/_df_helper.py,sha256=BbpP0BOLDGCOE8oAxqP5ODN_HqYohQcGsh-8Dx2-sks,12885
 sibi_dst/df_helper/_parquet_artifact.py,sha256=dCvUA2bytv0wY0pFI8lxbcLwXlgGpHndS36iKfEmjLw,14310
 sibi_dst/df_helper/_parquet_reader.py,sha256=m98C0TZRroOXvVc2LpEuElrJnquGlR81E1gjI7v1hi4,3102
 sibi_dst/df_helper/backends/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -9,9 +9,9 @@ sibi_dst/df_helper/backends/http/__init__.py,sha256=d1pfgYxbiYg7E0Iw8RbJ7xfqIfJS
 sibi_dst/df_helper/backends/http/_http_config.py,sha256=eGPFdqZ5M3Tscqx2P93B6XoBEEzlmdt7yNg7PXUQnNQ,4726
 sibi_dst/df_helper/backends/parquet/__init__.py,sha256=esWJ9aSuYC26d-T01z9dPrJ1uqJzvdaPNTYRb5qXTlQ,182
 sibi_dst/df_helper/backends/parquet/_filter_handler.py,sha256=TvDf0RXta7mwJv11GNQttYJsXgFf2XDj4oLIjt4xTzA,5219
-sibi_dst/df_helper/backends/parquet/_parquet_options.py,sha256=lrDn2-BbgxDor5g71LAu5LDg2g3ApGAPiQfbFTB2xNA,10702
+sibi_dst/df_helper/backends/parquet/_parquet_options.py,sha256=FWExRRTlhGrOhGPyzL1tucxgoHa3nJenLLs87I2gs-I,11776
 sibi_dst/df_helper/backends/sqlalchemy/__init__.py,sha256=LjWm9B7CweTvlvFOgB90XjSe0lVLILAIYMWKPkFXFm8,265
-sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py,sha256=Rsvh1nfVtqzfMhv968vNTYYIqVxYsEs4PB-O5CTSYdk,10935
+sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py,sha256=AOYvWw1vxd1CwXpIakQNFln7PHzFLfp7oaOsGaG0UN8,10961
 sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py,sha256=NqBSHqeYv_1vHt6J0tez0GdMwKrP_sIRcXYXu869ZkY,13313
 sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py,sha256=ibxeVqpIEsSVusP2bgcd1MNV_wJIoNgXwacltUbwTas,3194
 sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py,sha256=d_-ip-dQnWOlM8btCjoywAXpaiSuN6AaavkTGJsVQfY,3576
@@ -38,7 +38,7 @@ sibi_dst/utils/clickhouse_writer.py,sha256=mNUJoYOreIdRrEFv2mQ6pdtLi1Iz_2rALDyO6
 sibi_dst/utils/credentials.py,sha256=cHJPPsmVyijqbUQIq7WWPe-lIallA-mI5RAy3YUuRME,1724
 sibi_dst/utils/data_from_http_source.py,sha256=AcpKNsqTgN2ClNwuhgUpuNCx62r5_DdsAiKY8vcHEBA,1867
 sibi_dst/utils/data_utils.py,sha256=MqbwXk33BuANWeKKmsabHouhb8GZswSmbM-VetWWE-M,10357
-sibi_dst/utils/data_wrapper.py,sha256=deUz2760T_v42Ni1twLUcGS4ucIQM63vJnC6p8sWsb4,9470
+sibi_dst/utils/data_wrapper.py,sha256=9aYXorbrqDX53NVJ5oUnNQy6FbXYhs5osxzeMcdZpC4,9609
 sibi_dst/utils/date_utils.py,sha256=8fwPpOYqSdM3nHeNykh7Ftk-uPdFa44cEAy5S8iUNw4,18667
 sibi_dst/utils/df_utils.py,sha256=TzIAUCLbgOn3bvCFvzkc1S9YU-OlZTImdCj-88dtg8g,11401
 sibi_dst/utils/file_utils.py,sha256=Z99CZ_4nPDIaZqbCfzzUDfAYJjSudWDj-mwEO8grhbc,1253
@@ -71,6 +71,6 @@ sibi_dst/v2/df_helper/core/_params_config.py,sha256=DYx2drDz3uF-lSPzizPkchhy-kxR
 sibi_dst/v2/df_helper/core/_query_config.py,sha256=Y8LVSyaKuVkrPluRDkQoOwuXHQxner1pFWG3HPfnDHM,441
 sibi_dst/v2/utils/__init__.py,sha256=6H4cvhqTiFufnFPETBF0f8beVVMpfJfvUs6Ne0TQZNY,58
 sibi_dst/v2/utils/log_utils.py,sha256=rfk5VsLAt-FKpv6aPTC1FToIPiyrnHAFFBAkHme24po,4123
-sibi_dst-2025.1.9.dist-info/METADATA,sha256=aGk1rY4nTE2KjIYLgIobb0ER3DhtncHp_GTqlXxxizg,2610
-sibi_dst-2025.1.9.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
-sibi_dst-2025.1.9.dist-info/RECORD,,
+sibi_dst-2025.1.10.dist-info/METADATA,sha256=8vs8tux9EiNETH_j1d-2JMDfWfhN7DysoBAa9HtJk1w,2611
+sibi_dst-2025.1.10.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
+sibi_dst-2025.1.10.dist-info/RECORD,,