sibi-dst 2025.1.9.tar.gz → 2025.1.11.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/PKG-INFO +1 -1
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/pyproject.toml +1 -1
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/df_helper/_artifact_updater_multi_wrapper.py +2 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/df_helper/_df_helper.py +28 -4
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/df_helper/backends/parquet/_parquet_options.py +33 -8
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +3 -5
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/utils/data_wrapper.py +36 -36
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/utils/date_utils.py +133 -134
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/v2/df_helper/backends/sqlmodel/_io_dask.py +4 -3
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/README.md +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/__init__.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/df_helper/__init__.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/df_helper/_parquet_artifact.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/df_helper/_parquet_reader.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/df_helper/backends/__init__.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/df_helper/backends/http/__init__.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/df_helper/backends/http/_http_config.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/df_helper/backends/parquet/__init__.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/df_helper/backends/parquet/_filter_handler.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/df_helper/backends/sqlalchemy/__init__.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/df_helper/core/__init__.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/df_helper/core/_defaults.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/df_helper/core/_filter_handler.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/df_helper/core/_params_config.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/df_helper/core/_query_config.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/df_helper/data_cleaner.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/geopy_helper/__init__.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/geopy_helper/geo_location_service.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/geopy_helper/utils.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/osmnx_helper/__init__.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/osmnx_helper/base_osm_map.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/osmnx_helper/basemaps/__init__.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/osmnx_helper/basemaps/calendar_html.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/osmnx_helper/basemaps/router_plotter.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/osmnx_helper/utils.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/tests/__init__.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/tests/test_data_wrapper_class.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/utils/__init__.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/utils/base.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/utils/clickhouse_writer.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/utils/credentials.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/utils/data_from_http_source.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/utils/data_utils.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/utils/df_utils.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/utils/file_utils.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/utils/filepath_generator.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/utils/log_utils.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/utils/manifest_manager.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/utils/parquet_saver.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/utils/phone_formatter.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/utils/storage_config.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/utils/storage_manager.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/utils/update_planner.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/utils/webdav_client.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/v2/__init__.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/v2/df_helper/__init__.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/v2/df_helper/_df_helper.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/v2/df_helper/backends/__init__.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/v2/df_helper/backends/sqlalchemy/__init__.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/v2/df_helper/backends/sqlalchemy/_db_connection.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/v2/df_helper/backends/sqlalchemy/_io_dask.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/v2/df_helper/backends/sqlalchemy/_load_from_db.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/v2/df_helper/backends/sqlalchemy/_model_builder.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/v2/df_helper/backends/sqlmodel/__init__.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/v2/df_helper/backends/sqlmodel/_db_connection.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/v2/df_helper/backends/sqlmodel/_load_from_db.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/v2/df_helper/backends/sqlmodel/_model_builder.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/v2/df_helper/core/__init__.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/v2/df_helper/core/_filter_handler.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/v2/df_helper/core/_params_config.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/v2/df_helper/core/_query_config.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/v2/utils/__init__.py +0 -0
- {sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/v2/utils/log_utils.py +0 -0
{sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/df_helper/_artifact_updater_multi_wrapper.py
RENAMED
@@ -49,6 +49,7 @@ class ArtifactUpdaterMultiWrapperThreaded(ManagedResource):
         self.completion_times: Dict[str, float] = {}
         self.failed: List[str] = []
         self.original_classes: List[Type] = []
+        self.logger.info("ArtifactUpdaterMultiWrapperThreaded initialized")
 
     def get_artifact_classes(self, data_type: str) -> List[Type]:
         """Retrieve artifact classes by data type."""
@@ -270,6 +271,7 @@ class ArtifactUpdaterMultiWrapperAsync(ManagedResource):
         self.completion_times: Dict[str, float] = {}
         self.failed: List[str] = []
         self.original_classes: List[Type] = []
+        self.logger.info("ArtifactUpdaterMultiWrapperAsync initialized")
 
     def get_artifact_classes(self, data_type: str) -> List[Type]:
         """
{sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/df_helper/_df_helper.py
RENAMED
@@ -28,6 +28,7 @@ class BaseBackend:
         self.logger = helper.logger
         self.debug = helper.debug
         self.total_records = helper.total_records  # no records loaded yet
+        self._entered = helper._entered  # Track if the helper is used in a context manager
 
     def load(self, **options) -> tuple[Any, Any] | Union[dd.DataFrame | pd.DataFrame]:
         """Synchronous data loading method. Must be implemented by sync backends."""
@@ -67,7 +68,10 @@ class ParquetBackend(BaseBackend):
             df = self.helper.backend_parquet.load_files()
             if options and df is not None:
                 df = FilterHandler('dask', logger=self.logger, debug=False).apply_filters(df, filters=options)
-
+
+            df = df.persist()
+
+            self.total_records = len(df) or -1  # If df is empty, set total_records to -1
             return self.total_records, df
         except Exception as e:
             self.total_records = -1  # Reset total_records on failure
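Persisting the filtered frame before counting it lets the load/filter graph run once; the materialized result then serves both the row count and the caller. A minimal sketch of the same pattern, independent of sibi_dst:

    import pandas as pd
    import dask.dataframe as dd

    # Small stand-in for the DataFrame returned by load_files().
    pdf = pd.DataFrame({"id": range(10), "value": [v * 2 for v in range(10)]})
    df = dd.from_pandas(pdf, npartitions=2)

    # Filter lazily, then persist so the graph is computed once and kept in memory.
    df = df[df["value"] > 4].persist()

    # len() on a persisted frame counts rows without re-running the filter;
    # mirroring the diff, an empty result is reported as -1.
    total_records = len(df) or -1
    print(total_records)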
@@ -105,6 +109,12 @@ class DfHelper(ManagedResource):
         'http': HttpBackend,
     }
 
+    _BACKEND_ATTR_MAP = {
+        'sqlalchemy': 'backend_db_connection',
+        'parquet': 'backend_parquet',
+        'http': 'backend_http',
+    }
+
     default_config: Dict = None
 
     def __init__(self, backend='sqlalchemy', **kwargs):
@@ -140,9 +150,15 @@ class DfHelper(ManagedResource):
         super().__exit__(exc_type, exc_value, traceback)
 
     def _cleanup(self):
-
+        attr_name = self._BACKEND_ATTR_MAP.get(self.backend)
+        if not attr_name:
+            self.logger.warning(f"No attribute mapping found for backend '{self.backend}'. Cleanup skipped.")
+            return
+        # Get the actual config object (e.g., self.backend_db_connection)
+        active_config = getattr(self, attr_name, None)
+
         if active_config and hasattr(active_config, "close"):
-            self.logger.debug(f"Closing resources for '{self.backend}' backend.")
+            self.logger.debug(f"Closing resources for '{self.backend}' backend using attribute '{attr_name}'.")
             active_config.close()
 
     def _get_config(self, model: T, kwargs: Dict[str, Any]) -> T:
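The new _BACKEND_ATTR_MAP lets _cleanup() look up which attribute holds the active backend config instead of hard-coding it. A rough sketch of that lookup-and-close pattern; the class and attribute names below are illustrative stand-ins, not the package's API:

    class _Closable:
        """Stand-in for a backend config object that owns a closable resource."""
        def close(self) -> None:
            print("resource closed")

    class HelperSketch:
        # Backend name -> attribute that stores its config object.
        _BACKEND_ATTR_MAP = {
            "sqlalchemy": "backend_db_connection",
            "parquet": "backend_parquet",
            "http": "backend_http",
        }

        def __init__(self, backend: str) -> None:
            self.backend = backend
            self.backend_db_connection = _Closable()

        def _cleanup(self) -> None:
            attr_name = self._BACKEND_ATTR_MAP.get(self.backend)
            if not attr_name:
                return  # unknown backend: nothing to close
            active_config = getattr(self, attr_name, None)
            if active_config and hasattr(active_config, "close"):
                active_config.close()

    HelperSketch("sqlalchemy")._cleanup()  # prints "resource closed"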
@@ -156,6 +172,10 @@ class DfHelper(ManagedResource):
         self.total_records, df = self.backend_strategy.load(**options)
         df = self._process_loaded_data(df)
         df = self._post_process_df(df)
+        if not self._entered:
+            self.logger.warning(
+                "DfHelper instance was not used in a context manager; cleanup is being called manually.")
+            self._cleanup()
         return df.compute() if as_pandas else df
 
     async def aload(self, as_pandas=False, **options) -> Union[pd.DataFrame, dd.DataFrame]:
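load() now cleans up after itself when the helper was never entered as a context manager, using the _entered flag that is also propagated to the backends. A hedged sketch of that pattern with a hypothetical class:

    class ManagedSketch:
        """Illustrative only: tracks whether the instance was entered via `with`."""
        def __init__(self) -> None:
            self._entered = False

        def __enter__(self):
            self._entered = True
            return self

        def __exit__(self, exc_type, exc, tb) -> None:
            self._cleanup()

        def _cleanup(self) -> None:
            print("cleanup")

        def load(self) -> str:
            result = "data"
            if not self._entered:
                # Caller skipped the context manager, so release resources here.
                self._cleanup()
            return result

    ManagedSketch().load()      # cleanup runs inside load()
    with ManagedSketch() as h:  # cleanup deferred to __exit__
        h.load()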
@@ -200,7 +220,11 @@ class DfHelper(ManagedResource):
             self.logger.warning("Cannot save to parquet; DataFrame is empty.")
             return
         fs = kwargs.pop('fs', self.fs)
-
+        if not fs:
+            raise ValueError("Filesystem (fs) must be provided to save to parquet.")
+        path = kwargs.pop('parquet_storage_path', None)
+        if not path:
+            raise ValueError("parquet_storage_path must be provided to save to parquet.")
         writer_config = {
             'df_result': df,
             'parquet_storage_path': path,
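The added guards make the parquet writer fail fast when no filesystem or target path is supplied instead of failing later inside the writer. A small sketch of the same checks using fsspec's local filesystem (the function name and path are illustrative, and writing requires a parquet engine such as pyarrow):

    import fsspec
    import pandas as pd

    def save_to_parquet_sketch(df: pd.DataFrame, fs=None, parquet_storage_path: str | None = None) -> None:
        if df is None or len(df.index) == 0:
            print("Cannot save to parquet; DataFrame is empty.")
            return
        if not fs:
            raise ValueError("Filesystem (fs) must be provided to save to parquet.")
        if not parquet_storage_path:
            raise ValueError("parquet_storage_path must be provided to save to parquet.")
        with fs.open(parquet_storage_path, "wb") as fh:
            df.to_parquet(fh)

    fs = fsspec.filesystem("file")
    save_to_parquet_sketch(pd.DataFrame({"a": [1, 2]}), fs=fs, parquet_storage_path="/tmp/example.parquet")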
{sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/df_helper/backends/parquet/_parquet_options.py
RENAMED
@@ -4,8 +4,8 @@ from typing import Optional, List
 
 import dask.dataframe as dd
 import fsspec
-
-
+import pandas as pd
+from pydantic import BaseModel, model_validator, ConfigDict
 from sibi_dst.utils import FilePathGenerator
 from sibi_dst.utils import Logger
 
@@ -93,7 +93,7 @@ class ParquetConfig(BaseModel):
         self.parquet_storage_path = self.parquet_storage_path.rstrip('/')
         if not self.fs.exists(self.parquet_storage_path):
             self.fs.mkdirs(self.parquet_storage_path, exist_ok=True)
-            #raise ValueError('Parquet storage path does not exist')
+            # raise ValueError('Parquet storage path does not exist')
         self.load_parquet = False
         if self.parquet_filename is not None:
             self.parquet_full_path = self.ensure_file_extension(
@@ -184,11 +184,36 @@ class ParquetConfig(BaseModel):
         :return: A Dask DataFrame containing loaded parquet file data.
         :rtype: dask.dataframe.DataFrame
         """
-        if self.load_parquet:
- … (4 removed lines not rendered in the source diff)
+        if not self.load_parquet:
+            self.logger.warning("Parquet loading is disabled. Returning empty DataFrame.")
+            return dd.from_pandas(pd.DataFrame(), npartitions=1)
+
+        paths_to_load = []
+        if self.parquet_folder_list:
+            # Filter out any None values from the list
+            paths_to_load = [p for p in self.parquet_folder_list if p is not None]
+        elif self.parquet_full_path:
+            # Treat the single path as a list with one item
+            paths_to_load = [self.parquet_full_path]
+
+        if not paths_to_load:
+            self.logger.warning("No valid parquet file paths were provided. Returning empty DataFrame.")
+            return dd.from_pandas(pd.DataFrame(), npartitions=1)
+
+        try:
+            self.logger.debug(f"Attempting to load Parquet data from: {paths_to_load}")
+            return dd.read_parquet(
+                paths_to_load,
+                engine="pyarrow",
+                filesystem=self.fs,
+                exclude=["_*", ".*"]
+            )
+        except Exception as e:
+            # This robust error handling is excellent.
+            self.logger.error(f"Parquet loading failed for paths {paths_to_load}: {e}", exc_info=True)
+            self.logger.warning("Returning empty DataFrame due to loading error.")
+            return dd.from_pandas(pd.DataFrame(), npartitions=1)
+
 
     @staticmethod
     def ensure_file_extension(filepath: str, extension: str) -> str:
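The rewritten load_files() normalizes its inputs to a list of paths and always falls back to an empty single-partition Dask DataFrame rather than raising. A hedged sketch of that control flow; the function and argument names are placeholders, and a real call would pass the configured fsspec filesystem to dd.read_parquet:

    import logging
    import pandas as pd
    import dask.dataframe as dd

    logger = logging.getLogger("load_files_sketch")

    def empty_ddf() -> dd.DataFrame:
        # Same fallback shape as in the diff: an empty, single-partition frame.
        return dd.from_pandas(pd.DataFrame(), npartitions=1)

    def load_files_sketch(folder_list=None, full_path=None) -> dd.DataFrame:
        paths = [p for p in (folder_list or []) if p is not None] or ([full_path] if full_path else [])
        if not paths:
            logger.warning("No valid parquet file paths were provided. Returning empty DataFrame.")
            return empty_ddf()
        try:
            return dd.read_parquet(paths, engine="pyarrow")
        except Exception:
            logger.exception("Parquet loading failed for paths %s", paths)
            return empty_ddf()

    print(len(load_files_sketch()))  # 0 rows from the empty fallback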
{sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py
RENAMED
@@ -15,7 +15,7 @@ from sqlalchemy.engine import url as sqlalchemy_url
 from sqlalchemy.engine import Engine
 from sqlalchemy.exc import OperationalError, SQLAlchemyError
 from sqlalchemy.orm import sessionmaker, Session
-from sqlalchemy.pool import QueuePool, NullPool, StaticPool
+from sqlalchemy.pool import QueuePool, NullPool, StaticPool, Pool
 
 # Assuming these are your project's internal modules
 from sibi_dst.utils import Logger
@@ -54,7 +54,7 @@ class SqlAlchemyConnectionConfig(BaseModel):
     pool_timeout: int = int(os.environ.get("DB_POOL_TIMEOUT", 30))
     pool_recycle: int = int(os.environ.get("DB_POOL_RECYCLE", 1800))
     pool_pre_ping: bool = True
-    poolclass: Type[
+    poolclass: Type[Pool] = QueuePool
 
     # --- Internal & Runtime State ---
     model: Optional[Type[Any]] = None
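Completing the previously truncated annotation as poolclass: Type[Pool] = QueuePool means any SQLAlchemy Pool subclass is an acceptable value, with QueuePool as the default. A short sketch of handing such a value to create_engine (in-memory SQLite keeps it self-contained; the settings are illustrative):

    from sqlalchemy import create_engine
    from sqlalchemy.pool import Pool, QueuePool

    poolclass: type[Pool] = QueuePool  # any Pool subclass satisfies the annotation

    engine = create_engine(
        "sqlite:///:memory:",
        poolclass=poolclass,
        pool_pre_ping=True,
    )
    print(type(engine.pool).__name__)  # QueuePool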
@@ -172,7 +172,7 @@ class SqlAlchemyConnectionConfig(BaseModel):
             return
 
         engine_wrapper['ref_count'] -= 1
-        self.logger.debug(f"Closing
+        self.logger.debug(f"Closing connection within engine wrapper. Ref count is now {engine_wrapper['ref_count']}.")
 
         if engine_wrapper['ref_count'] <= 0:
             self.logger.debug(f"Disposing engine as reference count is zero. Key: {key}")
@@ -195,7 +195,6 @@ class SqlAlchemyConnectionConfig(BaseModel):
         wrapper = self._engine_registry.get(self._engine_key_instance)
         if wrapper:
             wrapper['active_connections'] += 1
-            # self.logger.debug(f"Connection checked out. Active: {self.active_connections}")
 
     def _on_checkin(self, *args) -> None:
         """Event listener for when a connection is returned to the pool."""
|
@@ -203,7 +202,6 @@ class SqlAlchemyConnectionConfig(BaseModel):
|
|
203
202
|
wrapper = self._engine_registry.get(self._engine_key_instance)
|
204
203
|
if wrapper:
|
205
204
|
wrapper['active_connections'] = max(0, wrapper['active_connections'] - 1)
|
206
|
-
# self.logger.debug(f"Connection checked in. Active: {self.active_connections}")
|
207
205
|
|
208
206
|
@property
|
209
207
|
def active_connections(self) -> int:
|
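The two deletions above only drop leftover per-event debug comments; the wrapper still tracks active connections through SQLAlchemy's pool checkout/checkin events. A minimal, self-contained sketch of that mechanism:

    from sqlalchemy import create_engine, event, text

    engine = create_engine("sqlite:///:memory:")
    counters = {"active_connections": 0}

    @event.listens_for(engine, "checkout")
    def _on_checkout(dbapi_conn, connection_record, connection_proxy):
        counters["active_connections"] += 1

    @event.listens_for(engine, "checkin")
    def _on_checkin(dbapi_conn, connection_record):
        counters["active_connections"] = max(0, counters["active_connections"] - 1)

    with engine.connect() as conn:
        conn.execute(text("SELECT 1"))
    print(counters["active_connections"])  # back to 0 once the connection is returned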
{sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/utils/data_wrapper.py
RENAMED
@@ -153,44 +153,44 @@ class DataWrapper(ManagedResource):
             # Create a copy to avoid mutating the shared instance dictionary
             local_load_params = self.load_params.copy()
             local_load_params.update(date_filter)
- … (10 removed lines not rendered in the source diff)
+            with self.dataclass(**self.class_params) as local_class_instance:
+                df = local_class_instance.load(**local_load_params)
+                load_time = time.perf_counter() - load_start
+
+                if hasattr(local_class_instance, "total_records"):
+                    self.logger.debug(
+                        f"Total records loaded by {local_class_instance.__class__.__name__}: {local_class_instance.total_records}")
+                    if int(local_class_instance.total_records) == 0:  # If no records were loaded but not due to an error
+                        if self.mmanifest:
+                            self.mmanifest.record(
                                 full_path=path
                             )
- … (26 removed lines not rendered in the source diff)
+                            self.logger.info(f"No data found for {full_path}. Logged to missing manifest.")
+                    elif int(local_class_instance.total_records) < 0:
+                        self.logger.warning(
+                            f"Negative record count ({local_class_instance.total_records}) for {full_path}. "
+                            "This may indicate an error in the data loading process."
+                        )
+                    else:
+                        save_start = time.perf_counter()
+                        parquet_params ={
+                            "df_result": df,
+                            "parquet_storage_path": path,
+                            "fs": self.fs,
+                            "logger": self.logger,
+                            "debug": self.debug,
+                        }
+                        with ParquetSaver(**parquet_params) as ps:
+                            ps.save_to_parquet(self.parquet_filename, overwrite=True)
+                        save_time = time.perf_counter() - save_start
+
+                        total_time = time.perf_counter() - overall_start
+                        self.benchmarks[date] = {
+                            "load_duration": load_time,
+                            "save_duration": save_time,
+                            "total_duration": total_time
+                        }
+                        self._log_success(date, total_time, full_path)
         except Exception as e:
             self._log_failure(date, e)
             raise
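The DataWrapper refactor runs each date's load inside a with block so every iteration gets a fresh, self-cleaning helper, and it times the load and save phases with time.perf_counter(). A hedged sketch of that per-iteration pattern using hypothetical stand-ins rather than the package's classes:

    import time
    from contextlib import contextmanager

    @contextmanager
    def helper_sketch(**params):
        """Hypothetical stand-in for the per-date data class used by DataWrapper."""
        try:
            yield lambda **filters: list(range(3))  # pretend loader returning 3 records
        finally:
            pass  # per-date resources would be released here on exit

    benchmarks = {}
    for date in ("2025-01-01", "2025-01-02"):
        start = time.perf_counter()
        with helper_sketch() as load:   # fresh helper per date, cleaned up on exit
            records = load(date=date)
        benchmarks[date] = {"total_duration": time.perf_counter() - start,
                            "records": len(records)}
    print(benchmarks)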
{sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/utils/date_utils.py
RENAMED
@@ -4,7 +4,7 @@ from typing import Union, Tuple, Callable, Dict, Optional
 import fsspec
 import numpy as np
 import pandas as pd
-
+import dask.dataframe as dd
 from .log_utils import Logger
 
 
@@ -305,154 +305,153 @@ class FileAgeChecker:
             raise ValueError(f"Unsupported modification time format for {file_path}") from e
 
 
+# --- Vectorized Helper Functions ---
+# These replace the slow, row-by-row .apply() logic. They operate
+# on entire DataFrame partitions for maximum efficiency.
+
+def _vectorized_busday_count(
+    partition: pd.DataFrame,
+    begin_col: str,
+    end_col: str,
+    holidays: list
+) -> pd.Series:
+    """Vectorized function to count business days on a DataFrame partition."""
+    if partition.empty:
+        return pd.Series([], dtype=float)
+
+    # Convert entire columns to datetime at once, coercing errors to NaT
+    start_dates = pd.to_datetime(partition[begin_col], errors='coerce').dt.date
+    end_dates = pd.to_datetime(partition[end_col], errors='coerce').dt.date
+
+    # Create a result series filled with NaN to handle rows with invalid dates
+    result = pd.Series(np.nan, index=partition.index, dtype=float)
+
+    # Create a boolean mask for valid, non-NaT date pairs
+    valid_mask = pd.notna(start_dates) & pd.notna(end_dates)
+
+    # Perform the vectorized calculation only on the valid subset of dates
+    result.loc[valid_mask] = np.busday_count(
+        start_dates[valid_mask],
+        end_dates[valid_mask],
+        holidays=holidays
+    )
+    return result
+
+
+def _vectorized_sla_end_date(
+    partition: pd.DataFrame,
+    start_col: str,
+    n_days_col: str,
+    holidays: list
+) -> pd.Series:
+    """Vectorized function to calculate the SLA end date on a DataFrame partition."""
+    if partition.empty:
+        return pd.Series([], dtype='datetime64[ns]')
+
+    start_dates = pd.to_datetime(partition[start_col], errors='coerce').dt.date
+    sla_days = partition[n_days_col]
+
+    # Create a result series filled with NaT for rows with invalid start dates
+    result = pd.Series(pd.NaT, index=partition.index, dtype='datetime64[ns]')
+
+    # Create a boolean mask for valid start dates and sla_days
+    valid_mask = pd.notna(start_dates) & pd.notna(sla_days)
+
+    # Perform the vectorized calculation only on the valid subset
+    result.loc[valid_mask] = np.busday_offset(
+        start_dates[valid_mask],
+        sla_days[valid_mask].astype(int),  # Ensure days are integers
+        roll='forward',
+        holidays=holidays
+    )
+    return result
+
+
+# --- Refactored BusinessDays Class ---
+
 class BusinessDays:
     """
-
-
-    business days, modifying dates by adding business days, and applying these
-    operations to Dask DataFrames.
-
-    :ivar logger: Logger instance for logging error, warning, and debug messages.
-    :type logger: logging.Logger
-    :ivar HOLIDAY_LIST: Dictionary mapping years to lists of holiday dates.
-    :type HOLIDAY_LIST: dict
-    :ivar bd_cal: Numpy busdaycalendar object containing holidays and week mask.
-    :type bd_cal: numpy.busdaycalendar
-    :ivar holidays: Array of holiday dates used by the business day calendar.
-    :type holidays: numpy.ndarray
-    :ivar week_mask: Boolean array indicating working days within a week.
-    :type week_mask: numpy.ndarray
+    Business days calculations with a custom holiday list.
+    Supports scalar and efficient, vectorized Dask DataFrame operations.
     """
 
-    def __init__(self, holiday_list, logger):
-        """
-        Initialize a BusinessDays object with a given holiday list.
-        """
+    def __init__(self, holiday_list: dict[str, list[str]], logger) -> None:
         self.logger = logger
         self.HOLIDAY_LIST = holiday_list
-        bd_holidays = [day for year in self.HOLIDAY_LIST for day in self.HOLIDAY_LIST[year]]
-        self.bd_cal = np.busdaycalendar(holidays=bd_holidays, weekmask="1111100")
-        self.holidays = self.bd_cal.holidays
-        self.week_mask = self.bd_cal.weekmask
 
-
-
-
-        """
-        try:
-            begin_date = pd.to_datetime(begin_date)
-            end_date = pd.to_datetime(end_date)
-        except Exception as e:
-            raise ValueError(f"Invalid date format: {e}")
-
-        years = [str(year) for year in range(begin_date.year, end_date.year + 1)]
-        if not all(year in self.HOLIDAY_LIST for year in years):
-            raise ValueError("Not all years in date range are in the holiday list")
-
-        return np.busday_count(
-            begin_date.strftime("%Y-%m-%d"),
-            end_date.strftime("%Y-%m-%d"),
-            busdaycal=self.bd_cal,
-        )
+        # Flatten and store as tuple for determinism
+        bd_holidays = [day for year in self.HOLIDAY_LIST for day in self.HOLIDAY_LIST[year]]
+        self.holidays = tuple(bd_holidays)
 
-    def
- … (30 removed lines not rendered in the source diff)
-        # Apply the function using map_partitions
-        df[result_col] = df.map_partitions(
-            apply_business_days,
-            holidays,
-            weekmask,
-            meta=(result_col, "int64"),
+    def get_business_days_count(
+            self,
+            begin_date: str | datetime.date | pd.Timestamp,
+            end_date: str | datetime.date | pd.Timestamp,
+    ) -> int:
+        """Scalar method to count business days between two dates."""
+        begin = pd.to_datetime(begin_date)
+        end = pd.to_datetime(end_date)
+        return int(np.busday_count(begin.date(), end.date(), holidays=list(self.holidays)))
+
+    def calc_business_days_from_df(
+            self,
+            df: dd.DataFrame,
+            begin_date_col: str,
+            end_date_col: str,
+            result_col: str = "business_days",
+    ) -> dd.DataFrame:
+        """Calculates business days between two columns in a Dask DataFrame."""
+        missing = {begin_date_col, end_date_col} - set(df.columns)
+        if missing:
+            self.logger.error(f"Missing columns: {missing}")
+            raise ValueError("Required columns are missing from DataFrame")
+
+        return df.assign(
+            **{result_col: df.map_partitions(
+                _vectorized_busday_count,
+                begin_col=begin_date_col,
+                end_col=end_date_col,
+                holidays=list(self.holidays),
+                meta=(result_col, 'f8')  # f8 is float64
+            )}
         )
 
- … (5 removed lines not rendered in the source diff)
-        """
-
-        start_date = datetime.datetime.strptime(start_date, "%Y-%m-%d")
-        except ValueError:
-            raise ValueError("Date should be a string in the format YYYY-MM-DD")
-
-        if str(start_date.year) not in self.HOLIDAY_LIST:
-            self.logger.warning(f"Year {start_date.year} is not in the holiday list")
-
+    def add_business_days(
+            self,
+            start_date: str | datetime.date | pd.Timestamp,
+            n_days: int,
+    ) -> np.datetime64:
+        """Scalar method to add N business days to a start date."""
+        start = pd.to_datetime(start_date)
         return np.busday_offset(
-
+            start.date(),
             n_days,
-            roll=
-
+            roll='forward',
+            holidays=list(self.holidays),
         )
 
-    def calc_sla_end_date(
- … (20 removed lines not rendered in the source diff)
-        )
-
-        # Define a wrapper for partition-wise operation
-        def apply_sla_end_date(partition, holidays, weekmask):
-            return partition.apply(
-                calculate_sla_end_date, axis=1, holidays=holidays, weekmask=weekmask
-            )
-
-        # Apply the function using map_partitions
-        df[result_col] = df.map_partitions(
-            apply_sla_end_date,
-            holidays,
-            weekmask,
-            meta=(result_col, "object"),
+    def calc_sla_end_date(
+            self,
+            df: dd.DataFrame,
+            start_date_col: str,
+            n_days_col: str,
+            result_col: str = "sla_end_date",
+    ) -> dd.DataFrame:
+        """Calculates an SLA end date column for a Dask DataFrame."""
+        missing = {start_date_col, n_days_col} - set(df.columns)
+        if missing:
+            self.logger.error(f"Missing columns: {missing}")
+            raise ValueError("Required columns are missing from DataFrame")
+
+        return df.assign(
+            **{result_col: df.map_partitions(
+                _vectorized_sla_end_date,
+                start_col=start_date_col,
+                n_days_col=n_days_col,
+                holidays=list(self.holidays),
+                meta=(result_col, 'datetime64[ns]')
+            )}
         )
-
-        return df
 # Class enhancements
 # DateUtils.register_period('next_week', lambda: (datetime.date.today() + datetime.timedelta(days=7),
 #                                                 datetime.date.today() + datetime.timedelta(days=13)))
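The vectorized helpers replace row-by-row .apply() with one np.busday_count / np.busday_offset call per partition, applied through map_partitions. A self-contained sketch of the business-day count on a small Dask DataFrame; the column names and holiday list are illustrative:

    import numpy as np
    import pandas as pd
    import dask.dataframe as dd

    holidays = ["2025-01-01"]  # illustrative holiday list

    def busday_count_partition(partition: pd.DataFrame) -> pd.Series:
        start = pd.to_datetime(partition["opened"], errors="coerce")
        end = pd.to_datetime(partition["closed"], errors="coerce")
        result = pd.Series(np.nan, index=partition.index, dtype=float)
        valid = start.notna() & end.notna()
        # One vectorized call per partition instead of one Python call per row.
        result.loc[valid] = np.busday_count(
            start[valid].values.astype("datetime64[D]"),
            end[valid].values.astype("datetime64[D]"),
            holidays=holidays,
        )
        return result

    pdf = pd.DataFrame({"opened": ["2024-12-30", "2025-01-02", None],
                        "closed": ["2025-01-03", "2025-01-06", "2025-01-07"]})
    ddf = dd.from_pandas(pdf, npartitions=2)
    ddf = ddf.assign(business_days=ddf.map_partitions(busday_count_partition,
                                                      meta=("business_days", "f8")))
    print(ddf.compute())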
{sibi_dst-2025.1.9 → sibi_dst-2025.1.11}/sibi_dst/v2/df_helper/backends/sqlmodel/_io_dask.py
RENAMED
@@ -1,7 +1,8 @@
 import itertools
 import dask.dataframe as dd
 import pandas as pd
-
+
+#from sqlmodel import create_engine, Session, select
 from sibi_dst.v2.df_helper.core import FilterHandler
 from sibi_dst.v2.utils import Logger
 
@@ -116,7 +117,7 @@ class SQLModelDask:
             return dask_df
 
         except Exception as e:
-            self.logger.error(f"Error executing query: {str(e)}")
-            self.logger.error(self.query)
+            self.logger.error(f"_io_dask:Error executing query: {str(e)}")
+            self.logger.error(f"_io_dask:{self.query}")
             # In case of error, return an empty Dask DataFrame with the expected columns.
             return dd.from_pandas(pd.DataFrame(columns=ordered_columns), npartitions=1)