sibi-dst 2025.1.9__py3-none-any.whl → 2025.1.11__py3-none-any.whl

This diff shows the content changes between two publicly released versions of the package, as they appear in their respective public registries, and is provided for informational purposes only.
@@ -49,6 +49,7 @@ class ArtifactUpdaterMultiWrapperThreaded(ManagedResource):
49
49
  self.completion_times: Dict[str, float] = {}
50
50
  self.failed: List[str] = []
51
51
  self.original_classes: List[Type] = []
52
+ self.logger.info("ArtifactUpdaterMultiWrapperThreaded initialized")
52
53
 
53
54
  def get_artifact_classes(self, data_type: str) -> List[Type]:
54
55
  """Retrieve artifact classes by data type."""
@@ -270,6 +271,7 @@ class ArtifactUpdaterMultiWrapperAsync(ManagedResource):
270
271
  self.completion_times: Dict[str, float] = {}
271
272
  self.failed: List[str] = []
272
273
  self.original_classes: List[Type] = []
274
+ self.logger.info("ArtifactUpdaterMultiWrapperAsync initialized")
273
275
 
274
276
  def get_artifact_classes(self, data_type: str) -> List[Type]:
275
277
  """
@@ -28,6 +28,7 @@ class BaseBackend:
28
28
  self.logger = helper.logger
29
29
  self.debug = helper.debug
30
30
  self.total_records = helper.total_records # no records loaded yet
31
+ self._entered = helper._entered # Track if the helper is used in a context manager
31
32
 
32
33
  def load(self, **options) -> tuple[Any, Any] | Union[dd.DataFrame | pd.DataFrame]:
33
34
  """Synchronous data loading method. Must be implemented by sync backends."""
@@ -67,7 +68,10 @@ class ParquetBackend(BaseBackend):
67
68
  df = self.helper.backend_parquet.load_files()
68
69
  if options and df is not None:
69
70
  df = FilterHandler('dask', logger=self.logger, debug=False).apply_filters(df, filters=options)
70
- self.total_records = len(df)
71
+
72
+ df = df.persist()
73
+
74
+ self.total_records = len(df) or -1 # If df is empty, set total_records to -1
71
75
  return self.total_records, df
72
76
  except Exception as e:
73
77
  self.total_records = -1 # Reset total_records on failure
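
Worth noting in the ParquetBackend hunk above: `len(df)` triggers a Dask computation, so calling `df.persist()` first lets the row count and any later `.compute()` by the caller reuse the materialized partitions instead of re-reading the parquet source. A minimal, self-contained illustration with a toy frame (not part of sibi_dst):

import pandas as pd
import dask.dataframe as dd

# Toy frame standing in for the loaded parquet data.
ddf = dd.from_pandas(pd.DataFrame({"x": range(10)}), npartitions=2)

ddf = ddf.persist()      # materialize the partitions once
total = len(ddf) or -1   # len() reuses the persisted partitions; an empty frame yields the -1 sentinel
print(total)             # 10
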
@@ -105,6 +109,12 @@ class DfHelper(ManagedResource):
105
109
  'http': HttpBackend,
106
110
  }
107
111
 
112
+ _BACKEND_ATTR_MAP = {
113
+ 'sqlalchemy': 'backend_db_connection',
114
+ 'parquet': 'backend_parquet',
115
+ 'http': 'backend_http',
116
+ }
117
+
108
118
  default_config: Dict = None
109
119
 
110
120
  def __init__(self, backend='sqlalchemy', **kwargs):
@@ -140,9 +150,15 @@ class DfHelper(ManagedResource):
140
150
  super().__exit__(exc_type, exc_value, traceback)
141
151
 
142
152
  def _cleanup(self):
143
- active_config = getattr(self, f"backend_{self.backend}", None)
153
+ attr_name = self._BACKEND_ATTR_MAP.get(self.backend)
154
+ if not attr_name:
155
+ self.logger.warning(f"No attribute mapping found for backend '{self.backend}'. Cleanup skipped.")
156
+ return
157
+ # Get the actual config object (e.g., self.backend_db_connection)
158
+ active_config = getattr(self, attr_name, None)
159
+
144
160
  if active_config and hasattr(active_config, "close"):
145
- self.logger.debug(f"Closing resources for '{self.backend}' backend.")
161
+ self.logger.debug(f"Closing resources for '{self.backend}' backend using attribute '{attr_name}'.")
146
162
  active_config.close()
147
163
 
148
164
  def _get_config(self, model: T, kwargs: Dict[str, Any]) -> T:
@@ -156,6 +172,10 @@ class DfHelper(ManagedResource):
156
172
  self.total_records, df = self.backend_strategy.load(**options)
157
173
  df = self._process_loaded_data(df)
158
174
  df = self._post_process_df(df)
175
+ if not self._entered:
176
+ self.logger.warning(
177
+ "DfHelper instance was not used in a context manager; cleanup is being called manually.")
178
+ self._cleanup()
159
179
  return df.compute() if as_pandas else df
160
180
 
161
181
  async def aload(self, as_pandas=False, **options) -> Union[pd.DataFrame, dd.DataFrame]:
@@ -200,7 +220,11 @@ class DfHelper(ManagedResource):
200
220
  self.logger.warning("Cannot save to parquet; DataFrame is empty.")
201
221
  return
202
222
  fs = kwargs.pop('fs', self.fs)
203
- path = kwargs.pop('parquet_storage_path', self.backend_parquet.parquet_storage_path)
223
+ if not fs:
224
+ raise ValueError("Filesystem (fs) must be provided to save to parquet.")
225
+ path = kwargs.pop('parquet_storage_path', None)
226
+ if not path:
227
+ raise ValueError("parquet_storage_path must be provided to save to parquet.")
204
228
  writer_config = {
205
229
  'df_result': df,
206
230
  'parquet_storage_path': path,
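
For context on the DfHelper changes above, the new `_BACKEND_ATTR_MAP`, the `_entered` warning, and the stricter `save_to_parquet` checks all push callers toward the context-manager path, where `__exit__` runs `_cleanup()` and closes the active backend config. A usage sketch under that assumption; the routing of `parquet_storage_path` to `ParquetConfig` and the exact option names are taken from the hunks above but not verified against the full source:

from sibi_dst.df_helper import DfHelper  # import path assumed from the package layout

# Hypothetical storage path; real deployments pass their own fs/config kwargs.
with DfHelper(backend='parquet', parquet_storage_path='s3://bucket/data') as helper:
    df = helper.load(as_pandas=False)   # Dask DataFrame; as_pandas=True would .compute()
    print(helper.total_records)
# __exit__ calls _cleanup(), which resolves _BACKEND_ATTR_MAP['parquet'] to
# 'backend_parquet' and closes it if it exposes close(). Calling load() outside
# a `with` block now logs a warning and runs _cleanup() manually instead.
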
@@ -4,8 +4,8 @@ from typing import Optional, List
4
4
 
5
5
  import dask.dataframe as dd
6
6
  import fsspec
7
- from pydantic import BaseModel, model_validator, DirectoryPath, FilePath, ConfigDict
8
-
7
+ import pandas as pd
8
+ from pydantic import BaseModel, model_validator, ConfigDict
9
9
  from sibi_dst.utils import FilePathGenerator
10
10
  from sibi_dst.utils import Logger
11
11
 
@@ -93,7 +93,7 @@ class ParquetConfig(BaseModel):
93
93
  self.parquet_storage_path = self.parquet_storage_path.rstrip('/')
94
94
  if not self.fs.exists(self.parquet_storage_path):
95
95
  self.fs.mkdirs(self.parquet_storage_path, exist_ok=True)
96
- #raise ValueError('Parquet storage path does not exist')
96
+ # raise ValueError('Parquet storage path does not exist')
97
97
  self.load_parquet = False
98
98
  if self.parquet_filename is not None:
99
99
  self.parquet_full_path = self.ensure_file_extension(
@@ -184,11 +184,36 @@ class ParquetConfig(BaseModel):
184
184
  :return: A Dask DataFrame containing loaded parquet file data.
185
185
  :rtype: dask.dataframe.DataFrame
186
186
  """
187
- if self.load_parquet:
188
- if self.parquet_folder_list:
189
- return dd.read_parquet(self.parquet_folder_list, engine="pyarrow", filesystem=self.fs)
190
- else:
191
- return dd.read_parquet(self.parquet_full_path, engine="pyarrow", filesystem=self.fs)
187
+ if not self.load_parquet:
188
+ self.logger.warning("Parquet loading is disabled. Returning empty DataFrame.")
189
+ return dd.from_pandas(pd.DataFrame(), npartitions=1)
190
+
191
+ paths_to_load = []
192
+ if self.parquet_folder_list:
193
+ # Filter out any None values from the list
194
+ paths_to_load = [p for p in self.parquet_folder_list if p is not None]
195
+ elif self.parquet_full_path:
196
+ # Treat the single path as a list with one item
197
+ paths_to_load = [self.parquet_full_path]
198
+
199
+ if not paths_to_load:
200
+ self.logger.warning("No valid parquet file paths were provided. Returning empty DataFrame.")
201
+ return dd.from_pandas(pd.DataFrame(), npartitions=1)
202
+
203
+ try:
204
+ self.logger.debug(f"Attempting to load Parquet data from: {paths_to_load}")
205
+ return dd.read_parquet(
206
+ paths_to_load,
207
+ engine="pyarrow",
208
+ filesystem=self.fs,
209
+ exclude=["_*", ".*"]
210
+ )
211
+ except Exception as e:
212
+ # Log the failure and fall back to an empty DataFrame so callers are not interrupted.
213
+ self.logger.error(f"Parquet loading failed for paths {paths_to_load}: {e}", exc_info=True)
214
+ self.logger.warning("Returning empty DataFrame due to loading error.")
215
+ return dd.from_pandas(pd.DataFrame(), npartitions=1)
216
+
192
217
 
193
218
  @staticmethod
194
219
  def ensure_file_extension(filepath: str, extension: str) -> str:
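
The rewritten `load_files` no longer raises to the caller: disabled loading, paths that are all filtered out, and read errors each return an empty Dask DataFrame. A standalone sketch of that fallback object, showing it behaves like any other frame downstream:

import pandas as pd
import dask.dataframe as dd

# The same fallback construct used in all three failure branches above.
empty = dd.from_pandas(pd.DataFrame(), npartitions=1)

print(len(empty))             # 0 rows
print(empty.compute().shape)  # (0, 0): a real, computable frame, so callers never crash
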
@@ -15,7 +15,7 @@ from sqlalchemy.engine import url as sqlalchemy_url
15
15
  from sqlalchemy.engine import Engine
16
16
  from sqlalchemy.exc import OperationalError, SQLAlchemyError
17
17
  from sqlalchemy.orm import sessionmaker, Session
18
- from sqlalchemy.pool import QueuePool, NullPool, StaticPool
18
+ from sqlalchemy.pool import QueuePool, NullPool, StaticPool, Pool
19
19
 
20
20
  # Assuming these are your project's internal modules
21
21
  from sibi_dst.utils import Logger
@@ -54,7 +54,7 @@ class SqlAlchemyConnectionConfig(BaseModel):
54
54
  pool_timeout: int = int(os.environ.get("DB_POOL_TIMEOUT", 30))
55
55
  pool_recycle: int = int(os.environ.get("DB_POOL_RECYCLE", 1800))
56
56
  pool_pre_ping: bool = True
57
- poolclass: Type[QueuePool] = QueuePool
57
+ poolclass: Type[Pool] = QueuePool
58
58
 
59
59
  # --- Internal & Runtime State ---
60
60
  model: Optional[Type[Any]] = None
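
The widened `poolclass: Type[Pool]` annotation is a validation fix: Pydantic checks `Type[...]` fields with an is-subclass test, and `NullPool`/`StaticPool` are siblings of `QueuePool` under `Pool`, so the old `Type[QueuePool]` annotation rejected them. A minimal sketch of the effect, using a throwaway model rather than the real `SqlAlchemyConnectionConfig`:

from typing import Type
from pydantic import BaseModel
from sqlalchemy.pool import Pool, QueuePool, NullPool

class PoolDemo(BaseModel):
    # Mirrors the new annotation: any SQLAlchemy Pool subclass validates.
    poolclass: Type[Pool] = QueuePool

PoolDemo(poolclass=NullPool)   # accepted with Type[Pool]
# With the previous Type[QueuePool] annotation this raised a ValidationError,
# since NullPool does not subclass QueuePool.
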
@@ -172,7 +172,7 @@ class SqlAlchemyConnectionConfig(BaseModel):
172
172
  return
173
173
 
174
174
  engine_wrapper['ref_count'] -= 1
175
- self.logger.debug(f"Closing config. Ref count is now {engine_wrapper['ref_count']}.")
175
+ self.logger.debug(f"Closing connection within engine wrapper. Ref count is now {engine_wrapper['ref_count']}.")
176
176
 
177
177
  if engine_wrapper['ref_count'] <= 0:
178
178
  self.logger.debug(f"Disposing engine as reference count is zero. Key: {key}")
@@ -195,7 +195,6 @@ class SqlAlchemyConnectionConfig(BaseModel):
195
195
  wrapper = self._engine_registry.get(self._engine_key_instance)
196
196
  if wrapper:
197
197
  wrapper['active_connections'] += 1
198
- # self.logger.debug(f"Connection checked out. Active: {self.active_connections}")
199
198
 
200
199
  def _on_checkin(self, *args) -> None:
201
200
  """Event listener for when a connection is returned to the pool."""
@@ -203,7 +202,6 @@ class SqlAlchemyConnectionConfig(BaseModel):
203
202
  wrapper = self._engine_registry.get(self._engine_key_instance)
204
203
  if wrapper:
205
204
  wrapper['active_connections'] = max(0, wrapper['active_connections'] - 1)
206
- # self.logger.debug(f"Connection checked in. Active: {self.active_connections}")
207
205
 
208
206
  @property
209
207
  def active_connections(self) -> int:
@@ -153,44 +153,44 @@ class DataWrapper(ManagedResource):
153
153
  # Create a copy to avoid mutating the shared instance dictionary
154
154
  local_load_params = self.load_params.copy()
155
155
  local_load_params.update(date_filter)
156
- local_class_instance = self.dataclass(**self.class_params)
157
- df = local_class_instance.load(**local_load_params)
158
- load_time = time.perf_counter() - load_start
159
-
160
- if hasattr(local_class_instance, "total_records"):
161
- self.logger.debug(
162
- f"Total records loaded by {local_class_instance.__class__.__name__}: {local_class_instance.total_records}")
163
- if int(local_class_instance.total_records) == 0: # If no records were loaded but not due to an error
164
- if self.mmanifest:
165
- self.mmanifest.record(
156
+ with self.dataclass(**self.class_params) as local_class_instance:
157
+ df = local_class_instance.load(**local_load_params)
158
+ load_time = time.perf_counter() - load_start
159
+
160
+ if hasattr(local_class_instance, "total_records"):
161
+ self.logger.debug(
162
+ f"Total records loaded by {local_class_instance.__class__.__name__}: {local_class_instance.total_records}")
163
+ if int(local_class_instance.total_records) == 0: # If no records were loaded but not due to an error
164
+ if self.mmanifest:
165
+ self.mmanifest.record(
166
166
  full_path=path
167
167
  )
168
- self.logger.info(f"No data found for {full_path}. Logged to missing manifest.")
169
- elif int(local_class_instance.total_records) < 0:
170
- self.logger.warning(
171
- f"Negative record count ({local_class_instance.total_records}) for {full_path}. "
172
- "This may indicate an error in the data loading process."
173
- )
174
- else:
175
- save_start = time.perf_counter()
176
- parquet_params ={
177
- "df_result": df,
178
- "parquet_storage_path": path,
179
- "fs": self.fs,
180
- "logger": self.logger,
181
- "debug": self.debug,
182
- }
183
- with ParquetSaver(**parquet_params) as ps:
184
- ps.save_to_parquet(self.parquet_filename, overwrite=True)
185
- save_time = time.perf_counter() - save_start
186
-
187
- total_time = time.perf_counter() - overall_start
188
- self.benchmarks[date] = {
189
- "load_duration": load_time,
190
- "save_duration": save_time,
191
- "total_duration": total_time
192
- }
193
- self._log_success(date, total_time, full_path)
168
+ self.logger.info(f"No data found for {full_path}. Logged to missing manifest.")
169
+ elif int(local_class_instance.total_records) < 0:
170
+ self.logger.warning(
171
+ f"Negative record count ({local_class_instance.total_records}) for {full_path}. "
172
+ "This may indicate an error in the data loading process."
173
+ )
174
+ else:
175
+ save_start = time.perf_counter()
176
+ parquet_params ={
177
+ "df_result": df,
178
+ "parquet_storage_path": path,
179
+ "fs": self.fs,
180
+ "logger": self.logger,
181
+ "debug": self.debug,
182
+ }
183
+ with ParquetSaver(**parquet_params) as ps:
184
+ ps.save_to_parquet(self.parquet_filename, overwrite=True)
185
+ save_time = time.perf_counter() - save_start
186
+
187
+ total_time = time.perf_counter() - overall_start
188
+ self.benchmarks[date] = {
189
+ "load_duration": load_time,
190
+ "save_duration": save_time,
191
+ "total_duration": total_time
192
+ }
193
+ self._log_success(date, total_time, full_path)
194
194
  except Exception as e:
195
195
  self._log_failure(date, e)
196
196
  raise
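
The structural change in the DataWrapper hunk is that the per-date work now runs inside `with self.dataclass(**self.class_params) as local_class_instance:`, so the helper's `__exit__` cleanup fires even when loading or saving raises. A stand-in sketch of that guarantee (`FakeHelper` is hypothetical, not part of sibi_dst):

class FakeHelper:
    """Stand-in for the dataclass wrapped by DataWrapper; not the real API."""
    total_records = 0

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc, tb):
        # Mirrors a ManagedResource-style __exit__: release connections, handles, etc.
        print("cleanup ran, even on failure")
        return False  # propagate exceptions, as DataWrapper's except/raise expects

    def load(self, **options):
        raise RuntimeError("simulated load failure")

try:
    with FakeHelper() as instance:
        instance.load()
except RuntimeError:
    pass  # DataWrapper logs the failure and re-raises; cleanup has already run
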
@@ -4,7 +4,7 @@ from typing import Union, Tuple, Callable, Dict, Optional
4
4
  import fsspec
5
5
  import numpy as np
6
6
  import pandas as pd
7
-
7
+ import dask.dataframe as dd
8
8
  from .log_utils import Logger
9
9
 
10
10
 
@@ -305,154 +305,153 @@ class FileAgeChecker:
305
305
  raise ValueError(f"Unsupported modification time format for {file_path}") from e
306
306
 
307
307
 
308
+ # --- Vectorized Helper Functions ---
309
+ # These replace the slow, row-by-row .apply() logic. They operate
310
+ # on entire DataFrame partitions for maximum efficiency.
311
+
312
+ def _vectorized_busday_count(
313
+ partition: pd.DataFrame,
314
+ begin_col: str,
315
+ end_col: str,
316
+ holidays: list
317
+ ) -> pd.Series:
318
+ """Vectorized function to count business days on a DataFrame partition."""
319
+ if partition.empty:
320
+ return pd.Series([], dtype=float)
321
+
322
+ # Convert entire columns to datetime at once, coercing errors to NaT
323
+ start_dates = pd.to_datetime(partition[begin_col], errors='coerce').dt.date
324
+ end_dates = pd.to_datetime(partition[end_col], errors='coerce').dt.date
325
+
326
+ # Create a result series filled with NaN to handle rows with invalid dates
327
+ result = pd.Series(np.nan, index=partition.index, dtype=float)
328
+
329
+ # Create a boolean mask for valid, non-NaT date pairs
330
+ valid_mask = pd.notna(start_dates) & pd.notna(end_dates)
331
+
332
+ # Perform the vectorized calculation only on the valid subset of dates
333
+ result.loc[valid_mask] = np.busday_count(
334
+ start_dates[valid_mask],
335
+ end_dates[valid_mask],
336
+ holidays=holidays
337
+ )
338
+ return result
339
+
340
+
341
+ def _vectorized_sla_end_date(
342
+ partition: pd.DataFrame,
343
+ start_col: str,
344
+ n_days_col: str,
345
+ holidays: list
346
+ ) -> pd.Series:
347
+ """Vectorized function to calculate the SLA end date on a DataFrame partition."""
348
+ if partition.empty:
349
+ return pd.Series([], dtype='datetime64[ns]')
350
+
351
+ start_dates = pd.to_datetime(partition[start_col], errors='coerce').dt.date
352
+ sla_days = partition[n_days_col]
353
+
354
+ # Create a result series filled with NaT for rows with invalid start dates
355
+ result = pd.Series(pd.NaT, index=partition.index, dtype='datetime64[ns]')
356
+
357
+ # Create a boolean mask for valid start dates and sla_days
358
+ valid_mask = pd.notna(start_dates) & pd.notna(sla_days)
359
+
360
+ # Perform the vectorized calculation only on the valid subset
361
+ result.loc[valid_mask] = np.busday_offset(
362
+ start_dates[valid_mask],
363
+ sla_days[valid_mask].astype(int), # Ensure days are integers
364
+ roll='forward',
365
+ holidays=holidays
366
+ )
367
+ return result
368
+
369
+
370
+ # --- Refactored BusinessDays Class ---
371
+
308
372
  class BusinessDays:
309
373
  """
310
- Provides functionality for handling business days calculations with a custom
311
- holiday list. The class includes methods for calculating the number of
312
- business days, modifying dates by adding business days, and applying these
313
- operations to Dask DataFrames.
314
-
315
- :ivar logger: Logger instance for logging error, warning, and debug messages.
316
- :type logger: logging.Logger
317
- :ivar HOLIDAY_LIST: Dictionary mapping years to lists of holiday dates.
318
- :type HOLIDAY_LIST: dict
319
- :ivar bd_cal: Numpy busdaycalendar object containing holidays and week mask.
320
- :type bd_cal: numpy.busdaycalendar
321
- :ivar holidays: Array of holiday dates used by the business day calendar.
322
- :type holidays: numpy.ndarray
323
- :ivar week_mask: Boolean array indicating working days within a week.
324
- :type week_mask: numpy.ndarray
374
+ Business days calculations with a custom holiday list.
375
+ Supports scalar and efficient, vectorized Dask DataFrame operations.
325
376
  """
326
377
 
327
- def __init__(self, holiday_list, logger):
328
- """
329
- Initialize a BusinessDays object with a given holiday list.
330
- """
378
+ def __init__(self, holiday_list: dict[str, list[str]], logger) -> None:
331
379
  self.logger = logger
332
380
  self.HOLIDAY_LIST = holiday_list
333
- bd_holidays = [day for year in self.HOLIDAY_LIST for day in self.HOLIDAY_LIST[year]]
334
- self.bd_cal = np.busdaycalendar(holidays=bd_holidays, weekmask="1111100")
335
- self.holidays = self.bd_cal.holidays
336
- self.week_mask = self.bd_cal.weekmask
337
381
 
338
- def get_business_days_count(self, begin_date, end_date):
339
- """
340
- Calculate the number of business days between two dates.
341
- """
342
- try:
343
- begin_date = pd.to_datetime(begin_date)
344
- end_date = pd.to_datetime(end_date)
345
- except Exception as e:
346
- raise ValueError(f"Invalid date format: {e}")
347
-
348
- years = [str(year) for year in range(begin_date.year, end_date.year + 1)]
349
- if not all(year in self.HOLIDAY_LIST for year in years):
350
- raise ValueError("Not all years in date range are in the holiday list")
351
-
352
- return np.busday_count(
353
- begin_date.strftime("%Y-%m-%d"),
354
- end_date.strftime("%Y-%m-%d"),
355
- busdaycal=self.bd_cal,
356
- )
382
+ # Flatten and store as tuple for determinism
383
+ bd_holidays = [day for year in self.HOLIDAY_LIST for day in self.HOLIDAY_LIST[year]]
384
+ self.holidays = tuple(bd_holidays)
357
385
 
358
- def calc_business_days_from_df(self, df, begin_date_col, end_date_col, result_col="business_days"):
359
- """
360
- Add a column to a Dask DataFrame with the number of business days between two date columns.
361
- """
362
- if not all(col in df.columns for col in [begin_date_col, end_date_col]):
363
- self.logger.error("Column names not found in DataFrame")
364
- raise ValueError("Required columns are missing")
365
-
366
- # Extract holidays and weekmask to recreate the busdaycalendar
367
- holidays = self.bd_cal.holidays
368
- weekmask = self.bd_cal.weekmask
369
-
370
- # Define a function to calculate business days
371
- def calculate_business_days(row, holidays, weekmask):
372
- begin_date = pd.to_datetime(row[begin_date_col])
373
- end_date = pd.to_datetime(row[end_date_col])
374
- if pd.isna(begin_date) or pd.isna(end_date):
375
- return np.nan
376
- busdaycal = np.busdaycalendar(holidays=holidays, weekmask=weekmask)
377
- return np.busday_count(
378
- begin_date.strftime("%Y-%m-%d"),
379
- end_date.strftime("%Y-%m-%d"),
380
- busdaycal=busdaycal,
381
- )
382
-
383
- # Define a wrapper function for partition-wise operations
384
- def apply_business_days(partition, holidays, weekmask):
385
- return partition.apply(
386
- calculate_business_days, axis=1, holidays=holidays, weekmask=weekmask
387
- )
388
-
389
- # Apply the function using map_partitions
390
- df[result_col] = df.map_partitions(
391
- apply_business_days,
392
- holidays,
393
- weekmask,
394
- meta=(result_col, "int64"),
386
+ def get_business_days_count(
387
+ self,
388
+ begin_date: str | datetime.date | pd.Timestamp,
389
+ end_date: str | datetime.date | pd.Timestamp,
390
+ ) -> int:
391
+ """Scalar method to count business days between two dates."""
392
+ begin = pd.to_datetime(begin_date)
393
+ end = pd.to_datetime(end_date)
394
+ return int(np.busday_count(begin.date(), end.date(), holidays=list(self.holidays)))
395
+
396
+ def calc_business_days_from_df(
397
+ self,
398
+ df: dd.DataFrame,
399
+ begin_date_col: str,
400
+ end_date_col: str,
401
+ result_col: str = "business_days",
402
+ ) -> dd.DataFrame:
403
+ """Calculates business days between two columns in a Dask DataFrame."""
404
+ missing = {begin_date_col, end_date_col} - set(df.columns)
405
+ if missing:
406
+ self.logger.error(f"Missing columns: {missing}")
407
+ raise ValueError("Required columns are missing from DataFrame")
408
+
409
+ return df.assign(
410
+ **{result_col: df.map_partitions(
411
+ _vectorized_busday_count,
412
+ begin_col=begin_date_col,
413
+ end_col=end_date_col,
414
+ holidays=list(self.holidays),
415
+ meta=(result_col, 'f8') # f8 is float64
416
+ )}
395
417
  )
396
418
 
397
- return df
398
-
399
- def add_business_days(self, start_date, n_days):
400
- """
401
- Add n_days business days to start_date.
402
- """
403
- try:
404
- start_date = datetime.datetime.strptime(start_date, "%Y-%m-%d")
405
- except ValueError:
406
- raise ValueError("Date should be a string in the format YYYY-MM-DD")
407
-
408
- if str(start_date.year) not in self.HOLIDAY_LIST:
409
- self.logger.warning(f"Year {start_date.year} is not in the holiday list")
410
-
419
+ def add_business_days(
420
+ self,
421
+ start_date: str | datetime.date | pd.Timestamp,
422
+ n_days: int,
423
+ ) -> np.datetime64:
424
+ """Scalar method to add N business days to a start date."""
425
+ start = pd.to_datetime(start_date)
411
426
  return np.busday_offset(
412
- start_date.strftime("%Y-%m-%d"),
427
+ start.date(),
413
428
  n_days,
414
- roll="forward",
415
- busdaycal=self.bd_cal,
429
+ roll='forward',
430
+ holidays=list(self.holidays),
416
431
  )
417
432
 
418
- def calc_sla_end_date(self, df, start_date_col, n_days_col, result_col="sla_end_date"):
419
- """
420
- Add a column to a Dask DataFrame with SLA end dates based on start date and SLA days.
421
- """
422
- if not all(col in df.columns for col in [start_date_col, n_days_col]):
423
- raise ValueError("Column names not found in DataFrame")
424
-
425
- # Extract holidays and weekmask to recreate the busdaycalendar
426
- holidays = self.bd_cal.holidays
427
- weekmask = self.bd_cal.weekmask
428
-
429
- # Define a function to calculate SLA end dates
430
- def calculate_sla_end_date(row, holidays, weekmask):
431
- start_date = pd.to_datetime(row[start_date_col])
432
- n_days = row[n_days_col]
433
- busdaycal = np.busdaycalendar(holidays=holidays, weekmask=weekmask)
434
- return np.busday_offset(
435
- start_date.strftime("%Y-%m-%d"),
436
- n_days,
437
- roll="forward",
438
- busdaycal=busdaycal,
439
- )
440
-
441
- # Define a wrapper for partition-wise operation
442
- def apply_sla_end_date(partition, holidays, weekmask):
443
- return partition.apply(
444
- calculate_sla_end_date, axis=1, holidays=holidays, weekmask=weekmask
445
- )
446
-
447
- # Apply the function using map_partitions
448
- df[result_col] = df.map_partitions(
449
- apply_sla_end_date,
450
- holidays,
451
- weekmask,
452
- meta=(result_col, "object"),
433
+ def calc_sla_end_date(
434
+ self,
435
+ df: dd.DataFrame,
436
+ start_date_col: str,
437
+ n_days_col: str,
438
+ result_col: str = "sla_end_date",
439
+ ) -> dd.DataFrame:
440
+ """Calculates an SLA end date column for a Dask DataFrame."""
441
+ missing = {start_date_col, n_days_col} - set(df.columns)
442
+ if missing:
443
+ self.logger.error(f"Missing columns: {missing}")
444
+ raise ValueError("Required columns are missing from DataFrame")
445
+
446
+ return df.assign(
447
+ **{result_col: df.map_partitions(
448
+ _vectorized_sla_end_date,
449
+ start_col=start_date_col,
450
+ n_days_col=n_days_col,
451
+ holidays=list(self.holidays),
452
+ meta=(result_col, 'datetime64[ns]')
453
+ )}
453
454
  )
454
-
455
- return df
456
455
  # Class enhancements
457
456
  # DateUtils.register_period('next_week', lambda: (datetime.date.today() + datetime.timedelta(days=7),
458
457
  # datetime.date.today() + datetime.timedelta(days=13)))
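
A usage sketch for the refactored, vectorized BusinessDays API; the holiday dates and column names are invented, and a standard `logging` logger stands in for `sibi_dst.utils.Logger` since the class only calls `self.logger.error`:

import logging

import dask.dataframe as dd
import pandas as pd

from sibi_dst.utils.date_utils import BusinessDays  # module path taken from the RECORD below

holidays = {"2024": ["2024-01-01", "2024-12-25"]}
bd = BusinessDays(holiday_list=holidays, logger=logging.getLogger("bd-demo"))

pdf = pd.DataFrame({
    "opened":   ["2024-01-02", "2024-01-05", None],
    "closed":   ["2024-01-10", "2024-01-08", "2024-01-12"],
    "sla_days": [3, 5, 2],
})
ddf = dd.from_pandas(pdf, npartitions=1)

ddf = bd.calc_business_days_from_df(ddf, "opened", "closed")  # adds 'business_days'
ddf = bd.calc_sla_end_date(ddf, "opened", "sla_days")         # adds 'sla_end_date'
print(ddf.compute())  # the row with a missing start date gets NaN / NaT
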
@@ -1,7 +1,8 @@
1
1
  import itertools
2
2
  import dask.dataframe as dd
3
3
  import pandas as pd
4
- from sqlmodel import create_engine, Session, select
4
+
5
+ #from sqlmodel import create_engine, Session, select
5
6
  from sibi_dst.v2.df_helper.core import FilterHandler
6
7
  from sibi_dst.v2.utils import Logger
7
8
 
@@ -116,7 +117,7 @@ class SQLModelDask:
116
117
  return dask_df
117
118
 
118
119
  except Exception as e:
119
- self.logger.error(f"Error executing query: {str(e)}")
120
- self.logger.error(self.query)
120
+ self.logger.error(f"_io_dask:Error executing query: {str(e)}")
121
+ self.logger.error(f"_io_dask:{self.query})
121
122
  # In case of error, return an empty Dask DataFrame with the expected columns.
122
123
  return dd.from_pandas(pd.DataFrame(columns=ordered_columns), npartitions=1)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: sibi-dst
3
- Version: 2025.1.9
3
+ Version: 2025.1.11
4
4
  Summary: Data Science Toolkit
5
5
  Author: Luis Valverde
6
6
  Author-email: lvalverdeb@gmail.com
@@ -1,7 +1,7 @@
1
1
  sibi_dst/__init__.py,sha256=j8lZpGCJlxlLgEgeIMxZnWdqJ0g3MCs7-gsnbvPn_KY,285
2
2
  sibi_dst/df_helper/__init__.py,sha256=Jur_MO8RGPkVw0CS3XH5YIWv-d922DC_FwRDTvHHV6Y,432
3
- sibi_dst/df_helper/_artifact_updater_multi_wrapper.py,sha256=10EkCYEfoWwTQbS-ahYWo6TvbtNXM8p0UqqDu0gTuyI,17426
4
- sibi_dst/df_helper/_df_helper.py,sha256=iBoWz2iVgLzQ3hA1EwllL62dkraKamRx2sXseu30FVI,11914
3
+ sibi_dst/df_helper/_artifact_updater_multi_wrapper.py,sha256=pSSw3N_ZNZCZHAiChbsF_ECyCmz0L2xCgvt9srHtPOM,17575
4
+ sibi_dst/df_helper/_df_helper.py,sha256=PNoN0nlzRwo_4JiaVyzmOM--LRrsJ0jB9pZqDi_kkRA,12917
5
5
  sibi_dst/df_helper/_parquet_artifact.py,sha256=dCvUA2bytv0wY0pFI8lxbcLwXlgGpHndS36iKfEmjLw,14310
6
6
  sibi_dst/df_helper/_parquet_reader.py,sha256=m98C0TZRroOXvVc2LpEuElrJnquGlR81E1gjI7v1hi4,3102
7
7
  sibi_dst/df_helper/backends/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -9,9 +9,9 @@ sibi_dst/df_helper/backends/http/__init__.py,sha256=d1pfgYxbiYg7E0Iw8RbJ7xfqIfJS
9
9
  sibi_dst/df_helper/backends/http/_http_config.py,sha256=eGPFdqZ5M3Tscqx2P93B6XoBEEzlmdt7yNg7PXUQnNQ,4726
10
10
  sibi_dst/df_helper/backends/parquet/__init__.py,sha256=esWJ9aSuYC26d-T01z9dPrJ1uqJzvdaPNTYRb5qXTlQ,182
11
11
  sibi_dst/df_helper/backends/parquet/_filter_handler.py,sha256=TvDf0RXta7mwJv11GNQttYJsXgFf2XDj4oLIjt4xTzA,5219
12
- sibi_dst/df_helper/backends/parquet/_parquet_options.py,sha256=lrDn2-BbgxDor5g71LAu5LDg2g3ApGAPiQfbFTB2xNA,10702
12
+ sibi_dst/df_helper/backends/parquet/_parquet_options.py,sha256=FWExRRTlhGrOhGPyzL1tucxgoHa3nJenLLs87I2gs-I,11776
13
13
  sibi_dst/df_helper/backends/sqlalchemy/__init__.py,sha256=LjWm9B7CweTvlvFOgB90XjSe0lVLILAIYMWKPkFXFm8,265
14
- sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py,sha256=Rsvh1nfVtqzfMhv968vNTYYIqVxYsEs4PB-O5CTSYdk,10935
14
+ sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py,sha256=ycjnkhD1lWMKnLFy1bycle__jbfaWH6oI7m9ymX59c4,10783
15
15
  sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py,sha256=NqBSHqeYv_1vHt6J0tez0GdMwKrP_sIRcXYXu869ZkY,13313
16
16
  sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py,sha256=ibxeVqpIEsSVusP2bgcd1MNV_wJIoNgXwacltUbwTas,3194
17
17
  sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py,sha256=d_-ip-dQnWOlM8btCjoywAXpaiSuN6AaavkTGJsVQfY,3576
@@ -38,8 +38,8 @@ sibi_dst/utils/clickhouse_writer.py,sha256=mNUJoYOreIdRrEFv2mQ6pdtLi1Iz_2rALDyO6
38
38
  sibi_dst/utils/credentials.py,sha256=cHJPPsmVyijqbUQIq7WWPe-lIallA-mI5RAy3YUuRME,1724
39
39
  sibi_dst/utils/data_from_http_source.py,sha256=AcpKNsqTgN2ClNwuhgUpuNCx62r5_DdsAiKY8vcHEBA,1867
40
40
  sibi_dst/utils/data_utils.py,sha256=MqbwXk33BuANWeKKmsabHouhb8GZswSmbM-VetWWE-M,10357
41
- sibi_dst/utils/data_wrapper.py,sha256=deUz2760T_v42Ni1twLUcGS4ucIQM63vJnC6p8sWsb4,9470
42
- sibi_dst/utils/date_utils.py,sha256=8fwPpOYqSdM3nHeNykh7Ftk-uPdFa44cEAy5S8iUNw4,18667
41
+ sibi_dst/utils/data_wrapper.py,sha256=9aYXorbrqDX53NVJ5oUnNQy6FbXYhs5osxzeMcdZpC4,9609
42
+ sibi_dst/utils/date_utils.py,sha256=T0uXNIG2IQfgs0AyQNsF9S6-cTujtA4GDC1IalvZVSU,18040
43
43
  sibi_dst/utils/df_utils.py,sha256=TzIAUCLbgOn3bvCFvzkc1S9YU-OlZTImdCj-88dtg8g,11401
44
44
  sibi_dst/utils/file_utils.py,sha256=Z99CZ_4nPDIaZqbCfzzUDfAYJjSudWDj-mwEO8grhbc,1253
45
45
  sibi_dst/utils/filepath_generator.py,sha256=-HHO0U-PR8fysDDFwnWdHRlgqksh_RkmgBZLWv9hM7s,6669
@@ -62,7 +62,7 @@ sibi_dst/v2/df_helper/backends/sqlalchemy/_load_from_db.py,sha256=jhgN0OO5Sk1zQF
62
62
  sibi_dst/v2/df_helper/backends/sqlalchemy/_model_builder.py,sha256=jX_mQAzl_6xdh7CTYw4uvUIX2wMp3NzXMlfbC5alOzs,13632
63
63
  sibi_dst/v2/df_helper/backends/sqlmodel/__init__.py,sha256=LcwJjVVxxrnVZalWqnz5m7r77i9tmJR0-U2k8eSQ-m8,249
64
64
  sibi_dst/v2/df_helper/backends/sqlmodel/_db_connection.py,sha256=n3CDbda0OY3X7eTeu_PR2KcZ5hYyEJL7Hroo8yQkjG8,15435
65
- sibi_dst/v2/df_helper/backends/sqlmodel/_io_dask.py,sha256=wVgNPo5V75aLtlZr_SIQ-yteyXq-Rg93eMfR8JCfkSo,5422
65
+ sibi_dst/v2/df_helper/backends/sqlmodel/_io_dask.py,sha256=VyhSGZGSN0gpsGhHHpY07NkmeAvPmMyQi3ewAaE79VM,5446
66
66
  sibi_dst/v2/df_helper/backends/sqlmodel/_load_from_db.py,sha256=FIs6UrNxdJ7eDHDvTv-cJuybIue2-oCRedhW-MNe7CU,6285
67
67
  sibi_dst/v2/df_helper/backends/sqlmodel/_model_builder.py,sha256=k0dnMLkLMMvkDYDYWkGFgibW5UD8pJgB3YrEg_R7pj8,13556
68
68
  sibi_dst/v2/df_helper/core/__init__.py,sha256=rZhBh32Rgcxj4MBii-KsYVJQmrT9egiWKXk68gWKblo,197
@@ -71,6 +71,6 @@ sibi_dst/v2/df_helper/core/_params_config.py,sha256=DYx2drDz3uF-lSPzizPkchhy-kxR
71
71
  sibi_dst/v2/df_helper/core/_query_config.py,sha256=Y8LVSyaKuVkrPluRDkQoOwuXHQxner1pFWG3HPfnDHM,441
72
72
  sibi_dst/v2/utils/__init__.py,sha256=6H4cvhqTiFufnFPETBF0f8beVVMpfJfvUs6Ne0TQZNY,58
73
73
  sibi_dst/v2/utils/log_utils.py,sha256=rfk5VsLAt-FKpv6aPTC1FToIPiyrnHAFFBAkHme24po,4123
74
- sibi_dst-2025.1.9.dist-info/METADATA,sha256=aGk1rY4nTE2KjIYLgIobb0ER3DhtncHp_GTqlXxxizg,2610
75
- sibi_dst-2025.1.9.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
76
- sibi_dst-2025.1.9.dist-info/RECORD,,
74
+ sibi_dst-2025.1.11.dist-info/METADATA,sha256=7iwn7RFfaDF_9dfpWvnNl2Al_8NHWu7l8vGhzO9BAac,2611
75
+ sibi_dst-2025.1.11.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
76
+ sibi_dst-2025.1.11.dist-info/RECORD,,