sibi-dst 2025.1.10__tar.gz → 2025.1.12__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76)
  1. {sibi_dst-2025.1.10 → sibi_dst-2025.1.12}/PKG-INFO +1 -1
  2. {sibi_dst-2025.1.10 → sibi_dst-2025.1.12}/pyproject.toml +1 -1
  3. {sibi_dst-2025.1.10 → sibi_dst-2025.1.12}/sibi_dst/df_helper/_df_helper.py +3 -0
  4. {sibi_dst-2025.1.10 → sibi_dst-2025.1.12}/sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +2 -4
  5. {sibi_dst-2025.1.10 → sibi_dst-2025.1.12}/sibi_dst/utils/date_utils.py +132 -132
  6. {sibi_dst-2025.1.10 → sibi_dst-2025.1.12}/sibi_dst/v2/df_helper/backends/sqlmodel/_io_dask.py +4 -3
  7. {sibi_dst-2025.1.10 → sibi_dst-2025.1.12}/README.md +0 -0
  8. {sibi_dst-2025.1.10 → sibi_dst-2025.1.12}/sibi_dst/__init__.py +0 -0
  9. {sibi_dst-2025.1.10 → sibi_dst-2025.1.12}/sibi_dst/df_helper/__init__.py +0 -0
  10. {sibi_dst-2025.1.10 → sibi_dst-2025.1.12}/sibi_dst/df_helper/_artifact_updater_multi_wrapper.py +0 -0
  11. {sibi_dst-2025.1.10 → sibi_dst-2025.1.12}/sibi_dst/df_helper/_parquet_artifact.py +0 -0
  12. {sibi_dst-2025.1.10 → sibi_dst-2025.1.12}/sibi_dst/df_helper/_parquet_reader.py +0 -0
  13. {sibi_dst-2025.1.10 → sibi_dst-2025.1.12}/sibi_dst/df_helper/backends/__init__.py +0 -0
  14. {sibi_dst-2025.1.10 → sibi_dst-2025.1.12}/sibi_dst/df_helper/backends/http/__init__.py +0 -0
  15. {sibi_dst-2025.1.10 → sibi_dst-2025.1.12}/sibi_dst/df_helper/backends/http/_http_config.py +0 -0
  16. {sibi_dst-2025.1.10 → sibi_dst-2025.1.12}/sibi_dst/df_helper/backends/parquet/__init__.py +0 -0
  17. {sibi_dst-2025.1.10 → sibi_dst-2025.1.12}/sibi_dst/df_helper/backends/parquet/_filter_handler.py +0 -0
  18. {sibi_dst-2025.1.10 → sibi_dst-2025.1.12}/sibi_dst/df_helper/backends/parquet/_parquet_options.py +0 -0
  19. {sibi_dst-2025.1.10 → sibi_dst-2025.1.12}/sibi_dst/df_helper/backends/sqlalchemy/__init__.py +0 -0
  20. {sibi_dst-2025.1.10 → sibi_dst-2025.1.12}/sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +0 -0
  21. {sibi_dst-2025.1.10 → sibi_dst-2025.1.12}/sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +0 -0
  22. {sibi_dst-2025.1.10 → sibi_dst-2025.1.12}/sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +0 -0
  23. {sibi_dst-2025.1.10 → sibi_dst-2025.1.12}/sibi_dst/df_helper/core/__init__.py +0 -0
  24. {sibi_dst-2025.1.10 → sibi_dst-2025.1.12}/sibi_dst/df_helper/core/_defaults.py +0 -0
  25. {sibi_dst-2025.1.10 → sibi_dst-2025.1.12}/sibi_dst/df_helper/core/_filter_handler.py +0 -0
  26. {sibi_dst-2025.1.10 → sibi_dst-2025.1.12}/sibi_dst/df_helper/core/_params_config.py +0 -0
  27. {sibi_dst-2025.1.10 → sibi_dst-2025.1.12}/sibi_dst/df_helper/core/_query_config.py +0 -0
  28. {sibi_dst-2025.1.10 → sibi_dst-2025.1.12}/sibi_dst/df_helper/data_cleaner.py +0 -0
  29. {sibi_dst-2025.1.10 → sibi_dst-2025.1.12}/sibi_dst/geopy_helper/__init__.py +0 -0
  30. {sibi_dst-2025.1.10 → sibi_dst-2025.1.12}/sibi_dst/geopy_helper/geo_location_service.py +0 -0
  31. {sibi_dst-2025.1.10 → sibi_dst-2025.1.12}/sibi_dst/geopy_helper/utils.py +0 -0
  32. {sibi_dst-2025.1.10 → sibi_dst-2025.1.12}/sibi_dst/osmnx_helper/__init__.py +0 -0
  33. {sibi_dst-2025.1.10 → sibi_dst-2025.1.12}/sibi_dst/osmnx_helper/base_osm_map.py +0 -0
  34. {sibi_dst-2025.1.10 → sibi_dst-2025.1.12}/sibi_dst/osmnx_helper/basemaps/__init__.py +0 -0
  35. {sibi_dst-2025.1.10 → sibi_dst-2025.1.12}/sibi_dst/osmnx_helper/basemaps/calendar_html.py +0 -0
  36. {sibi_dst-2025.1.10 → sibi_dst-2025.1.12}/sibi_dst/osmnx_helper/basemaps/router_plotter.py +0 -0
  37. {sibi_dst-2025.1.10 → sibi_dst-2025.1.12}/sibi_dst/osmnx_helper/utils.py +0 -0
  38. {sibi_dst-2025.1.10 → sibi_dst-2025.1.12}/sibi_dst/tests/__init__.py +0 -0
  39. {sibi_dst-2025.1.10 → sibi_dst-2025.1.12}/sibi_dst/tests/test_data_wrapper_class.py +0 -0
  40. {sibi_dst-2025.1.10 → sibi_dst-2025.1.12}/sibi_dst/utils/__init__.py +0 -0
  41. {sibi_dst-2025.1.10 → sibi_dst-2025.1.12}/sibi_dst/utils/base.py +0 -0
  42. {sibi_dst-2025.1.10 → sibi_dst-2025.1.12}/sibi_dst/utils/clickhouse_writer.py +0 -0
  43. {sibi_dst-2025.1.10 → sibi_dst-2025.1.12}/sibi_dst/utils/credentials.py +0 -0
  44. {sibi_dst-2025.1.10 → sibi_dst-2025.1.12}/sibi_dst/utils/data_from_http_source.py +0 -0
  45. {sibi_dst-2025.1.10 → sibi_dst-2025.1.12}/sibi_dst/utils/data_utils.py +0 -0
  46. {sibi_dst-2025.1.10 → sibi_dst-2025.1.12}/sibi_dst/utils/data_wrapper.py +0 -0
  47. {sibi_dst-2025.1.10 → sibi_dst-2025.1.12}/sibi_dst/utils/df_utils.py +0 -0
  48. {sibi_dst-2025.1.10 → sibi_dst-2025.1.12}/sibi_dst/utils/file_utils.py +0 -0
  49. {sibi_dst-2025.1.10 → sibi_dst-2025.1.12}/sibi_dst/utils/filepath_generator.py +0 -0
  50. {sibi_dst-2025.1.10 → sibi_dst-2025.1.12}/sibi_dst/utils/log_utils.py +0 -0
  51. {sibi_dst-2025.1.10 → sibi_dst-2025.1.12}/sibi_dst/utils/manifest_manager.py +0 -0
  52. {sibi_dst-2025.1.10 → sibi_dst-2025.1.12}/sibi_dst/utils/parquet_saver.py +0 -0
  53. {sibi_dst-2025.1.10 → sibi_dst-2025.1.12}/sibi_dst/utils/phone_formatter.py +0 -0
  54. {sibi_dst-2025.1.10 → sibi_dst-2025.1.12}/sibi_dst/utils/storage_config.py +0 -0
  55. {sibi_dst-2025.1.10 → sibi_dst-2025.1.12}/sibi_dst/utils/storage_manager.py +0 -0
  56. {sibi_dst-2025.1.10 → sibi_dst-2025.1.12}/sibi_dst/utils/update_planner.py +0 -0
  57. {sibi_dst-2025.1.10 → sibi_dst-2025.1.12}/sibi_dst/utils/webdav_client.py +0 -0
  58. {sibi_dst-2025.1.10 → sibi_dst-2025.1.12}/sibi_dst/v2/__init__.py +0 -0
  59. {sibi_dst-2025.1.10 → sibi_dst-2025.1.12}/sibi_dst/v2/df_helper/__init__.py +0 -0
  60. {sibi_dst-2025.1.10 → sibi_dst-2025.1.12}/sibi_dst/v2/df_helper/_df_helper.py +0 -0
  61. {sibi_dst-2025.1.10 → sibi_dst-2025.1.12}/sibi_dst/v2/df_helper/backends/__init__.py +0 -0
  62. {sibi_dst-2025.1.10 → sibi_dst-2025.1.12}/sibi_dst/v2/df_helper/backends/sqlalchemy/__init__.py +0 -0
  63. {sibi_dst-2025.1.10 → sibi_dst-2025.1.12}/sibi_dst/v2/df_helper/backends/sqlalchemy/_db_connection.py +0 -0
  64. {sibi_dst-2025.1.10 → sibi_dst-2025.1.12}/sibi_dst/v2/df_helper/backends/sqlalchemy/_io_dask.py +0 -0
  65. {sibi_dst-2025.1.10 → sibi_dst-2025.1.12}/sibi_dst/v2/df_helper/backends/sqlalchemy/_load_from_db.py +0 -0
  66. {sibi_dst-2025.1.10 → sibi_dst-2025.1.12}/sibi_dst/v2/df_helper/backends/sqlalchemy/_model_builder.py +0 -0
  67. {sibi_dst-2025.1.10 → sibi_dst-2025.1.12}/sibi_dst/v2/df_helper/backends/sqlmodel/__init__.py +0 -0
  68. {sibi_dst-2025.1.10 → sibi_dst-2025.1.12}/sibi_dst/v2/df_helper/backends/sqlmodel/_db_connection.py +0 -0
  69. {sibi_dst-2025.1.10 → sibi_dst-2025.1.12}/sibi_dst/v2/df_helper/backends/sqlmodel/_load_from_db.py +0 -0
  70. {sibi_dst-2025.1.10 → sibi_dst-2025.1.12}/sibi_dst/v2/df_helper/backends/sqlmodel/_model_builder.py +0 -0
  71. {sibi_dst-2025.1.10 → sibi_dst-2025.1.12}/sibi_dst/v2/df_helper/core/__init__.py +0 -0
  72. {sibi_dst-2025.1.10 → sibi_dst-2025.1.12}/sibi_dst/v2/df_helper/core/_filter_handler.py +0 -0
  73. {sibi_dst-2025.1.10 → sibi_dst-2025.1.12}/sibi_dst/v2/df_helper/core/_params_config.py +0 -0
  74. {sibi_dst-2025.1.10 → sibi_dst-2025.1.12}/sibi_dst/v2/df_helper/core/_query_config.py +0 -0
  75. {sibi_dst-2025.1.10 → sibi_dst-2025.1.12}/sibi_dst/v2/utils/__init__.py +0 -0
  76. {sibi_dst-2025.1.10 → sibi_dst-2025.1.12}/sibi_dst/v2/utils/log_utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: sibi-dst
3
- Version: 2025.1.10
3
+ Version: 2025.1.12
4
4
  Summary: Data Science Toolkit
5
5
  Author: Luis Valverde
6
6
  Author-email: lvalverdeb@gmail.com
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "sibi-dst"
3
- version = "2025.1.10"
3
+ version = "2025.1.12"
4
4
  description = "Data Science Toolkit"
5
5
  authors = ["Luis Valverde <lvalverdeb@gmail.com>"]
6
6
  readme = "README.md"
@@ -68,6 +68,9 @@ class ParquetBackend(BaseBackend):
68
68
  df = self.helper.backend_parquet.load_files()
69
69
  if options and df is not None:
70
70
  df = FilterHandler('dask', logger=self.logger, debug=False).apply_filters(df, filters=options)
71
+
72
+ df = df.persist()
73
+
71
74
  self.total_records = len(df) or -1 # If df is empty, set total_records to -1
72
75
  return self.total_records, df
73
76
  except Exception as e:
@@ -15,7 +15,7 @@ from sqlalchemy.engine import url as sqlalchemy_url
15
15
  from sqlalchemy.engine import Engine
16
16
  from sqlalchemy.exc import OperationalError, SQLAlchemyError
17
17
  from sqlalchemy.orm import sessionmaker, Session
18
- from sqlalchemy.pool import QueuePool, NullPool, StaticPool
18
+ from sqlalchemy.pool import QueuePool, NullPool, StaticPool, Pool
19
19
 
20
20
  # Assuming these are your project's internal modules
21
21
  from sibi_dst.utils import Logger
@@ -54,7 +54,7 @@ class SqlAlchemyConnectionConfig(BaseModel):
54
54
  pool_timeout: int = int(os.environ.get("DB_POOL_TIMEOUT", 30))
55
55
  pool_recycle: int = int(os.environ.get("DB_POOL_RECYCLE", 1800))
56
56
  pool_pre_ping: bool = True
57
- poolclass: Type[QueuePool] = QueuePool
57
+ poolclass: Type[Pool] = QueuePool
58
58
 
59
59
  # --- Internal & Runtime State ---
60
60
  model: Optional[Type[Any]] = None
@@ -195,7 +195,6 @@ class SqlAlchemyConnectionConfig(BaseModel):
195
195
  wrapper = self._engine_registry.get(self._engine_key_instance)
196
196
  if wrapper:
197
197
  wrapper['active_connections'] += 1
198
- # self.logger.debug(f"Connection checked out. Active: {self.active_connections}")
199
198
 
200
199
  def _on_checkin(self, *args) -> None:
201
200
  """Event listener for when a connection is returned to the pool."""
@@ -203,7 +202,6 @@ class SqlAlchemyConnectionConfig(BaseModel):
203
202
  wrapper = self._engine_registry.get(self._engine_key_instance)
204
203
  if wrapper:
205
204
  wrapper['active_connections'] = max(0, wrapper['active_connections'] - 1)
206
- # self.logger.debug(f"Connection checked in. Active: {self.active_connections}")
207
205
 
208
206
  @property
209
207
  def active_connections(self) -> int:
@@ -1,10 +1,12 @@
1
+ from __future__ import annotations
2
+
1
3
  import datetime
2
4
  from typing import Union, Tuple, Callable, Dict, Optional
3
5
 
4
6
  import fsspec
5
7
  import numpy as np
6
8
  import pandas as pd
7
-
9
+ import dask.dataframe as dd
8
10
  from .log_utils import Logger
9
11
 
10
12
 
@@ -305,154 +307,152 @@ class FileAgeChecker:
305
307
  raise ValueError(f"Unsupported modification time format for {file_path}") from e
306
308
 
307
309
 
308
- class BusinessDays:
310
+ # --- Vectorized Helper Functions ---
311
+
312
+ def _vectorized_busday_count(partition, begin_col, end_col, holidays):
309
313
  """
310
- Provides functionality for handling business days calculations with a custom
311
- holiday list. The class includes methods for calculating the number of
312
- business days, modifying dates by adding business days, and applying these
313
- operations to Dask DataFrames.
314
-
315
- :ivar logger: Logger instance for logging error, warning, and debug messages.
316
- :type logger: logging.Logger
317
- :ivar HOLIDAY_LIST: Dictionary mapping years to lists of holiday dates.
318
- :type HOLIDAY_LIST: dict
319
- :ivar bd_cal: Numpy busdaycalendar object containing holidays and week mask.
320
- :type bd_cal: numpy.busdaycalendar
321
- :ivar holidays: Array of holiday dates used by the business day calendar.
322
- :type holidays: numpy.ndarray
323
- :ivar week_mask: Boolean array indicating working days within a week.
324
- :type week_mask: numpy.ndarray
314
+ Calculates the number of business days between a start and end date.
325
315
  """
316
+ # Extract the raw columns
317
+ start_dates_raw = partition[begin_col]
318
+ end_dates_raw = partition[end_col]
326
319
 
327
- def __init__(self, holiday_list, logger):
328
- """
329
- Initialize a BusinessDays object with a given holiday list.
330
- """
331
- self.logger = logger
332
- self.HOLIDAY_LIST = holiday_list
333
- bd_holidays = [day for year in self.HOLIDAY_LIST for day in self.HOLIDAY_LIST[year]]
334
- self.bd_cal = np.busdaycalendar(holidays=bd_holidays, weekmask="1111100")
335
- self.holidays = self.bd_cal.holidays
336
- self.week_mask = self.bd_cal.weekmask
337
320
 
338
- def get_business_days_count(self, begin_date, end_date):
339
- """
340
- Calculate the number of business days between two dates.
341
- """
342
- try:
343
- begin_date = pd.to_datetime(begin_date)
344
- end_date = pd.to_datetime(end_date)
345
- except Exception as e:
346
- raise ValueError(f"Invalid date format: {e}")
321
+ start_dates = pd.to_datetime(start_dates_raw, errors='coerce')
322
+ end_dates = pd.to_datetime(end_dates_raw, errors='coerce')
347
323
 
348
- years = [str(year) for year in range(begin_date.year, end_date.year + 1)]
349
- if not all(year in self.HOLIDAY_LIST for year in years):
350
- raise ValueError("Not all years in date range are in the holiday list")
324
+ # Initialize the result Series with NaN, as the output is a number
325
+ result = pd.Series(np.nan, index=partition.index)
351
326
 
352
- return np.busday_count(
353
- begin_date.strftime("%Y-%m-%d"),
354
- end_date.strftime("%Y-%m-%d"),
355
- busdaycal=self.bd_cal,
356
- )
327
+ # Create a mask for rows where both start and end dates are valid
328
+ valid_mask = pd.notna(start_dates) & pd.notna(end_dates)
357
329
 
358
- def calc_business_days_from_df(self, df, begin_date_col, end_date_col, result_col="business_days"):
359
- """
360
- Add a column to a Dask DataFrame with the number of business days between two date columns.
361
- """
362
- if not all(col in df.columns for col in [begin_date_col, end_date_col]):
363
- self.logger.error("Column names not found in DataFrame")
364
- raise ValueError("Required columns are missing")
365
-
366
- # Extract holidays and weekmask to recreate the busdaycalendar
367
- holidays = self.bd_cal.holidays
368
- weekmask = self.bd_cal.weekmask
369
-
370
- # Define a function to calculate business days
371
- def calculate_business_days(row, holidays, weekmask):
372
- begin_date = pd.to_datetime(row[begin_date_col])
373
- end_date = pd.to_datetime(row[end_date_col])
374
- if pd.isna(begin_date) or pd.isna(end_date):
375
- return np.nan
376
- busdaycal = np.busdaycalendar(holidays=holidays, weekmask=weekmask)
377
- return np.busday_count(
378
- begin_date.strftime("%Y-%m-%d"),
379
- end_date.strftime("%Y-%m-%d"),
380
- busdaycal=busdaycal,
381
- )
382
-
383
- # Define a wrapper function for partition-wise operations
384
- def apply_business_days(partition, holidays, weekmask):
385
- return partition.apply(
386
- calculate_business_days, axis=1, holidays=holidays, weekmask=weekmask
387
- )
388
-
389
- # Apply the function using map_partitions
390
- df[result_col] = df.map_partitions(
391
- apply_business_days,
392
- holidays,
393
- weekmask,
394
- meta=(result_col, "int64"),
395
- )
330
+ # Perform the vectorized calculation only on the valid subset
331
+ # Convert to NumPy arrays of date type for the calculation
332
+ result.loc[valid_mask] = np.busday_count(
333
+ start_dates[valid_mask].values.astype('datetime64[D]'),
334
+ end_dates[valid_mask].values.astype('datetime64[D]'),
335
+ holidays=holidays
336
+ )
396
337
 
397
- return df
338
+ return result
339
+
340
+
341
+ def _vectorized_sla_end_date(partition, start_col, n_days_col, holidays):
342
+ """
343
+ Calculates the end date of an SLA, skipping weekends and holidays.
344
+ """
345
+ # Extract the relevant columns as pandas Series
346
+ start_dates_raw = partition[start_col]
347
+ sla_days = partition[n_days_col]
398
348
 
399
- def add_business_days(self, start_date, n_days):
400
- """
401
- Add n_days business days to start_date.
402
- """
403
- try:
404
- start_date = datetime.datetime.strptime(start_date, "%Y-%m-%d")
405
- except ValueError:
406
- raise ValueError("Date should be a string in the format YYYY-MM-DD")
407
349
 
408
- if str(start_date.year) not in self.HOLIDAY_LIST:
409
- self.logger.warning(f"Year {start_date.year} is not in the holiday list")
350
+ start_dates = pd.to_datetime(start_dates_raw, errors='coerce')
410
351
 
352
+ # Initialize the result Series with NaT (Not a Time)
353
+ result = pd.Series(pd.NaT, index=partition.index, dtype='datetime64[ns]')
354
+
355
+ # Create a mask for rows that have valid start dates and SLA days
356
+ valid_mask = pd.notna(start_dates) & pd.notna(sla_days)
357
+
358
+ # Perform the vectorized calculation only on the valid subset
359
+ # Note: np.busday_offset requires a NumPy array, so we use .values
360
+ result.loc[valid_mask] = np.busday_offset(
361
+ start_dates[valid_mask].values.astype('datetime64[D]'), # Convert to numpy array of dates
362
+ sla_days[valid_mask].astype(int), # Ensure days are integers
363
+ roll='forward',
364
+ holidays=holidays
365
+ )
366
+
367
+ return result
368
+
369
+
370
+ # --- Refactored BusinessDays Class ---
371
+
372
+ class BusinessDays:
373
+ """
374
+ Business days calculations with a custom holiday list.
375
+ Supports scalar and efficient, vectorized Dask DataFrame operations.
376
+ """
377
+
378
+ def __init__(self, holiday_list: dict[str, list[str]], logger) -> None:
379
+ self.logger = logger
380
+ self.HOLIDAY_LIST = holiday_list
381
+
382
+ # Flatten and store as tuple for determinism
383
+ bd_holidays = [day for year in self.HOLIDAY_LIST for day in self.HOLIDAY_LIST[year]]
384
+ self.holidays = tuple(bd_holidays)
385
+
386
+ def get_business_days_count(
387
+ self,
388
+ begin_date: str | datetime.date | pd.Timestamp,
389
+ end_date: str | datetime.date | pd.Timestamp,
390
+ ) -> int:
391
+ """Scalar method to count business days between two dates."""
392
+ begin = pd.to_datetime(begin_date)
393
+ end = pd.to_datetime(end_date)
394
+ return int(np.busday_count(begin.date(), end.date(), holidays=list(self.holidays)))
395
+
396
+ def calc_business_days_from_df(
397
+ self,
398
+ df: dd.DataFrame,
399
+ begin_date_col: str,
400
+ end_date_col: str,
401
+ result_col: str = "business_days",
402
+ ) -> dd.DataFrame:
403
+ """Calculates business days between two columns in a Dask DataFrame."""
404
+ missing = {begin_date_col, end_date_col} - set(df.columns)
405
+ if missing:
406
+ self.logger.error(f"Missing columns: {missing}")
407
+ raise ValueError("Required columns are missing from DataFrame")
408
+
409
+ return df.assign(
410
+ **{result_col: df.map_partitions(
411
+ _vectorized_busday_count,
412
+ begin_col=begin_date_col,
413
+ end_col=end_date_col,
414
+ holidays=list(self.holidays),
415
+ meta=(result_col, 'f8') # f8 is float64
416
+ )}
417
+ )
418
+
419
+ def add_business_days(
420
+ self,
421
+ start_date: str | datetime.date | pd.Timestamp,
422
+ n_days: int,
423
+ ) -> np.datetime64:
424
+ """Scalar method to add N business days to a start date."""
425
+ start = pd.to_datetime(start_date)
411
426
  return np.busday_offset(
412
- start_date.strftime("%Y-%m-%d"),
427
+ start.date(),
413
428
  n_days,
414
- roll="forward",
415
- busdaycal=self.bd_cal,
429
+ roll='forward',
430
+ holidays=list(self.holidays),
416
431
  )
417
432
 
418
- def calc_sla_end_date(self, df, start_date_col, n_days_col, result_col="sla_end_date"):
419
- """
420
- Add a column to a Dask DataFrame with SLA end dates based on start date and SLA days.
421
- """
422
- if not all(col in df.columns for col in [start_date_col, n_days_col]):
423
- raise ValueError("Column names not found in DataFrame")
424
-
425
- # Extract holidays and weekmask to recreate the busdaycalendar
426
- holidays = self.bd_cal.holidays
427
- weekmask = self.bd_cal.weekmask
428
-
429
- # Define a function to calculate SLA end dates
430
- def calculate_sla_end_date(row, holidays, weekmask):
431
- start_date = pd.to_datetime(row[start_date_col])
432
- n_days = row[n_days_col]
433
- busdaycal = np.busdaycalendar(holidays=holidays, weekmask=weekmask)
434
- return np.busday_offset(
435
- start_date.strftime("%Y-%m-%d"),
436
- n_days,
437
- roll="forward",
438
- busdaycal=busdaycal,
439
- )
440
-
441
- # Define a wrapper for partition-wise operation
442
- def apply_sla_end_date(partition, holidays, weekmask):
443
- return partition.apply(
444
- calculate_sla_end_date, axis=1, holidays=holidays, weekmask=weekmask
445
- )
446
-
447
- # Apply the function using map_partitions
448
- df[result_col] = df.map_partitions(
449
- apply_sla_end_date,
450
- holidays,
451
- weekmask,
452
- meta=(result_col, "object"),
433
+ def calc_sla_end_date(
434
+ self,
435
+ df: dd.DataFrame,
436
+ start_date_col: str,
437
+ n_days_col: str,
438
+ result_col: str = "sla_end_date",
439
+ ) -> dd.DataFrame:
440
+ """Calculates an SLA end date column for a Dask DataFrame."""
441
+ missing = {start_date_col, n_days_col} - set(df.columns)
442
+ if missing:
443
+ self.logger.error(f"Missing columns: {missing}")
444
+ raise ValueError("Required columns are missing from DataFrame")
445
+
446
+ return df.assign(
447
+ **{result_col: df.map_partitions(
448
+ _vectorized_sla_end_date,
449
+ start_col=start_date_col,
450
+ n_days_col=n_days_col,
451
+ holidays=list(self.holidays),
452
+ meta=(result_col, 'datetime64[ns]')
453
+ )}
453
454
  )
454
455
 
455
- return df
456
456
  # Class enhancements
457
457
  # DateUtils.register_period('next_week', lambda: (datetime.date.today() + datetime.timedelta(days=7),
458
458
  # datetime.date.today() + datetime.timedelta(days=13)))
@@ -1,7 +1,8 @@
1
1
  import itertools
2
2
  import dask.dataframe as dd
3
3
  import pandas as pd
4
- from sqlmodel import create_engine, Session, select
4
+
5
+ #from sqlmodel import create_engine, Session, select
5
6
  from sibi_dst.v2.df_helper.core import FilterHandler
6
7
  from sibi_dst.v2.utils import Logger
7
8
 
@@ -116,7 +117,7 @@ class SQLModelDask:
116
117
  return dask_df
117
118
 
118
119
  except Exception as e:
119
- self.logger.error(f"Error executing query: {str(e)}")
120
- self.logger.error(self.query)
120
+ self.logger.error(f"_io_dask:Error executing query: {str(e)}")
121
+ self.logger.error(f"_io_dask:{self.query})
121
122
  # In case of error, return an empty Dask DataFrame with the expected columns.
122
123
  return dd.from_pandas(pd.DataFrame(columns=ordered_columns), npartitions=1)
File without changes