sibi-dst 2025.1.13-py3-none-any.whl → 2025.8.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sibi_dst/__init__.py +7 -1
- sibi_dst/df_helper/__init__.py +3 -2
- sibi_dst/df_helper/_artifact_updater_async.py +238 -0
- sibi_dst/df_helper/_artifact_updater_threaded.py +195 -0
- sibi_dst/df_helper/_df_helper.py +418 -118
- sibi_dst/df_helper/_parquet_artifact.py +275 -283
- sibi_dst/df_helper/_parquet_reader.py +9 -10
- sibi_dst/df_helper/backends/parquet/_parquet_options.py +8 -4
- sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +68 -107
- sibi_dst/df_helper/backends/sqlalchemy/_db_gatekeeper.py +15 -0
- sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +105 -255
- sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +90 -42
- sibi_dst/df_helper/backends/sqlalchemy/_model_registry.py +192 -0
- sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +122 -72
- sibi_dst/osmnx_helper/route_path_builder.py +45 -46
- sibi_dst/utils/__init__.py +2 -0
- sibi_dst/utils/base.py +235 -100
- sibi_dst/utils/business_days.py +248 -0
- sibi_dst/utils/clickhouse_writer.py +472 -206
- sibi_dst/utils/data_utils.py +139 -186
- sibi_dst/utils/data_wrapper.py +392 -88
- sibi_dst/utils/date_utils.py +711 -393
- sibi_dst/utils/df_utils.py +193 -213
- sibi_dst/utils/file_age_checker.py +301 -0
- sibi_dst/utils/file_utils.py +3 -2
- sibi_dst/utils/filepath_generator.py +314 -152
- sibi_dst/utils/log_utils.py +581 -242
- sibi_dst/utils/manifest_manager.py +60 -76
- sibi_dst/utils/parquet_saver.py +33 -27
- sibi_dst/utils/periods.py +42 -0
- sibi_dst/utils/phone_formatter.py +88 -95
- sibi_dst/utils/update_planner.py +180 -178
- sibi_dst/utils/webdav_client.py +116 -166
- {sibi_dst-2025.1.13.dist-info → sibi_dst-2025.8.2.dist-info}/METADATA +1 -1
- {sibi_dst-2025.1.13.dist-info → sibi_dst-2025.8.2.dist-info}/RECORD +36 -30
- sibi_dst/df_helper/_artifact_updater_multi_wrapper.py +0 -422
- {sibi_dst-2025.1.13.dist-info → sibi_dst-2025.8.2.dist-info}/WHEEL +0 -0
**sibi_dst/df_helper/_parquet_reader.py**

```diff
@@ -1,7 +1,8 @@
-from typing import Optional, ClassVar, Dict
+from typing import Optional, ClassVar, Dict, Any, Union
 
 import dask.dataframe as dd
 import fsspec
+import pandas as pd
 
 from sibi_dst.df_helper import DfHelper
 
@@ -23,7 +24,7 @@ class ParquetReader(DfHelper):
     :type config: dict
     :ivar df: Stores the loaded Dask DataFrame after the `load()` method is
         invoked. Initially set to None.
-    :type df: Optional[dd.DataFrame]
+    :type df: Optional[dd.DataFrame | pd.DataFrame]
     :ivar parquet_storage_path: The path to the Parquet storage directory.
     :type parquet_storage_path: str
     :ivar parquet_start_date: Start date for Parquet data selection. Must
@@ -32,19 +33,15 @@ class ParquetReader(DfHelper):
     :ivar parquet_end_date: End date for Parquet data selection. Must be
         set in the configuration.
     :type parquet_end_date: str
-    :ivar filesystem_type: The type of filesystem the Parquet file is
-        stored on (e.g., "file", "s3").
-    :type filesystem_type: str
-    :ivar filesystem_options: Any additional options required for the
-        specified filesystem type.
-    :type filesystem_options: dict
+
     :ivar fs: Instance of `fsspec` filesystem used to interact with the
         Parquet storage.
     :type fs: fsspec.AbstractFileSystem
     """
-    DEFAULT_CONFIG: ClassVar[Dict[str,
+    DEFAULT_CONFIG: ClassVar[Dict[str, Any]] = {
        'backend': 'parquet'
    }
+    df: Optional[Union[dd.DataFrame, pd.DataFrame]] = None
 
    def __init__(self, **kwargs):
        self.config = {
@@ -63,11 +60,13 @@ class ParquetReader(DfHelper):
        self.parquet_end_date = self.config.setdefault('parquet_end_date', None)
        if self.parquet_end_date is None:
            raise ValueError('parquet_end_date must be set')
+        if self.fs is None:
+            raise ValueError('Parquet Reader mush be supplied a fs instance')
 
        if not self.directory_exists():
            raise ValueError(f"{self.parquet_storage_path} does not exist")
 
-    def load(self, **kwargs):
+    def load(self, **kwargs) -> Union[pd.DataFrame, dd.DataFrame]:
        self.df = super().load(**kwargs)
        return self.df
 
```
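Net effect for `ParquetReader`: an `fsspec` filesystem is now a hard requirement, `df` may be a pandas or Dask frame, and `load()` carries an explicit return annotation. A minimal usage sketch; the storage path and dates are illustrative values, not defaults shipped by the package:

```python
import fsspec

from sibi_dst.df_helper import ParquetReader

# Illustrative values; only the parameter names visible in the diff above
# (parquet_storage_path, parquet_start_date, parquet_end_date, fs) are
# confirmed by this release.
reader = ParquetReader(
    parquet_storage_path="/data/warehouse",  # must already exist
    parquet_start_date="2025-01-01",
    parquet_end_date="2025-01-31",
    fs=fsspec.filesystem("file"),            # omitting fs now raises ValueError
)
df = reader.load()  # -> Union[pd.DataFrame, dd.DataFrame]
```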
**sibi_dst/df_helper/backends/parquet/_parquet_options.py**

```diff
@@ -202,16 +202,20 @@ class ParquetConfig(BaseModel):
 
         try:
             self.logger.debug(f"Attempting to load Parquet data from: {paths_to_load}")
-            return dd.read_parquet(
+            dd_result=dd.read_parquet(
                 paths_to_load,
                 engine="pyarrow",
                 filesystem=self.fs,
                 exclude=["_*", ".*"]
             )
+            return dd_result
+        except FileNotFoundError as e:
+            self.logger.debug(f"Parquet files not found at paths {paths_to_load}: {e}")
+            self.logger.debug("Returning empty DataFrame due to missing parquet files.")
+            return dd.from_pandas(pd.DataFrame(), npartitions=1)
         except Exception as e:
-
-            self.logger.
-            self.logger.warning("Returning empty DataFrame due to loading error.")
+            self.logger.debug(f"Parquet loading failed for paths {paths_to_load}: {e}")
+            self.logger.debug("Returning empty DataFrame due to loading error.")
             return dd.from_pandas(pd.DataFrame(), npartitions=1)
 
 
```
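Both failure paths now log at `debug` (previously `warning`) and return an empty Dask DataFrame instead of raising, so callers must detect emptiness themselves. A sketch of one way to recognize that sentinel; `is_empty_fallback` is a hypothetical helper, not part of the package:

```python
import dask.dataframe as dd
import pandas as pd

def is_empty_fallback(df: dd.DataFrame) -> bool:
    # The fallback frame is built from an empty pandas DataFrame, so it
    # has no columns at all -- unlike a real result that merely has zero rows.
    return len(df.columns) == 0

sentinel = dd.from_pandas(pd.DataFrame(), npartitions=1)
assert is_empty_fallback(sentinel)
```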
**sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py**

```diff
@@ -1,15 +1,11 @@
+
 from __future__ import annotations
 import os
 import threading
 from contextlib import contextmanager
 from typing import Any, Optional, ClassVar, Generator, Type, Dict
 
-from pydantic import (
-    BaseModel,
-    field_validator,
-    model_validator,
-    ConfigDict,
-)
+from pydantic import BaseModel, field_validator, model_validator, ConfigDict
 from sqlalchemy import create_engine, event, text
 from sqlalchemy.engine import url as sqlalchemy_url
 from sqlalchemy.engine import Engine
@@ -17,32 +13,18 @@ from sqlalchemy.exc import OperationalError, SQLAlchemyError
 from sqlalchemy.orm import sessionmaker, Session
 from sqlalchemy.pool import QueuePool, NullPool, StaticPool, Pool
 
-# Assuming these are your project's internal modules
 from sibi_dst.utils import Logger
 from ._sql_model_builder import SqlAlchemyModelBuilder
 
+_ENGINE_REGISTRY_LOCK = threading.RLock()
+_ENGINE_REGISTRY: Dict[tuple, Dict[str, Any]] = {}
+
 
 class SqlAlchemyConnectionConfig(BaseModel):
     """
-
-
-    This class encapsulates database connection configuration and provides robust,
-    shared resource management. It is designed to be used as a context manager
-    to ensure resources are always released correctly.
-
-    Recommended Usage is via the `with` statement.
-        with SqlAlchemyConnectionConfig(...) as config:
-            session = config.get_session()
-            # ... do work ...
-        # config.close() is called automatically upon exiting the block.
-
-    Key Features:
-    - Context Manager Support: Guarantees resource cleanup.
-    - Shared Engine & Pool: Reuses a single SQLAlchemy Engine for identical
-      database URLs and pool settings, improving application performance.
-    - Reference Counting: Safely manages the lifecycle of the shared engine,
-      disposing of it only when the last user has closed its connection config.
+    Thread-safe, registry-backed SQLAlchemy connection manager.
     """
+
     # --- Public Configuration ---
     connection_url: str
     table: Optional[str] = None
@@ -50,36 +32,28 @@ class SqlAlchemyConnectionConfig(BaseModel):
 
     # --- Pool Configuration ---
     pool_size: int = int(os.environ.get("DB_POOL_SIZE", 5))
-    max_overflow: int = int(os.environ.get("DB_MAX_OVERFLOW",10))
+    max_overflow: int = int(os.environ.get("DB_MAX_OVERFLOW", 10))
     pool_timeout: int = int(os.environ.get("DB_POOL_TIMEOUT", 30))
    pool_recycle: int = int(os.environ.get("DB_POOL_RECYCLE", 1800))
    pool_pre_ping: bool = True
    poolclass: Type[Pool] = QueuePool
 
-    # --- Internal & Runtime State ---
+    # --- Internal & Runtime State (normal fields; Pydantic allowed) ---
    model: Optional[Type[Any]] = None
    engine: Optional[Engine] = None
    logger: Optional[Logger] = None
-    _own_logger: bool = False
+    _own_logger: bool = False
    session_factory: Optional[sessionmaker] = None
 
-    # --- Private State ---
+    # --- Private State (plain Python values only) ---
    _engine_key_instance: tuple = ()
-    _closed: bool = False  #
-
-    # --- Class-level Shared Resources ---
-    _engine_registry: ClassVar[Dict[tuple, Dict[str, Any]]] = {}
-    _registry_lock: ClassVar[threading.Lock] = threading.Lock()
-
+    _closed: bool = False  # prevent double-closing
    model_config = ConfigDict(arbitrary_types_allowed=True)
 
-
-    def __enter__(self) -> SqlAlchemyConnectionConfig:
-        """Enter the runtime context, returning self."""
+    def __enter__(self) -> "SqlAlchemyConnectionConfig":
        return self
 
    def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
-        """Exit the runtime context, ensuring that close() is called."""
        self.close()
 
    @field_validator("pool_size", "max_overflow", "pool_timeout", "pool_recycle")
@@ -90,8 +64,7 @@ class SqlAlchemyConnectionConfig(BaseModel):
        return v
 
    @model_validator(mode="after")
-    def _init_all(self) -> SqlAlchemyConnectionConfig:
-        """Orchestrates the initialization process after Pydantic validation."""
+    def _init_all(self) -> "SqlAlchemyConnectionConfig":
        self._init_logger()
        self._engine_key_instance = self._get_engine_key()
        self._init_engine()
@@ -102,17 +75,12 @@ class SqlAlchemyConnectionConfig(BaseModel):
        return self
 
    def _init_logger(self) -> None:
-        """Initializes the logger for this instance."""
-        # This is not a ManagedResource subclass, so we handle logger initialization directly.
-        # unless a logger is provided, we create our own.
        if self.logger is None:
            self._own_logger = True
            self.logger = Logger.default_logger(logger_name=self.__class__.__name__)
-
-        self.logger.set_level(log_level)
+        self.logger.set_level(Logger.DEBUG if self.debug else Logger.INFO)
 
    def _get_engine_key(self) -> tuple:
-        """Generates a unique, normalized key for an engine configuration."""
        parsed = sqlalchemy_url.make_url(self.connection_url)
        query = {k: v for k, v in parsed.query.items() if not k.startswith("pool_")}
        normalized_url = parsed.set(query=query)
@@ -125,104 +93,97 @@ class SqlAlchemyConnectionConfig(BaseModel):
        return tuple(key_parts)
 
    def _init_engine(self) -> None:
-
-
-
-
-
-
-
+        with _ENGINE_REGISTRY_LOCK:
+            wrapper = _ENGINE_REGISTRY.get(self._engine_key_instance)
+            if wrapper:
+                self.engine = wrapper["engine"]
+                wrapper["ref_count"] += 1
+                if self.debug:
+                    self.logger.debug(f"Reusing engine. Ref count: {wrapper['ref_count']}.")
            else:
-                self.
+                if self.debug:
+                    self.logger.debug(f"Creating new engine for key: {self._engine_key_instance}")
                try:
                    new_engine = create_engine(
-                        self.connection_url,
-
-
+                        self.connection_url,
+                        pool_size=self.pool_size,
+                        max_overflow=self.max_overflow,
+                        pool_timeout=self.pool_timeout,
+                        pool_recycle=self.pool_recycle,
+                        pool_pre_ping=self.pool_pre_ping,
                        poolclass=self.poolclass,
                    )
                    self.engine = new_engine
                    self._attach_events()
-
-
+                    _ENGINE_REGISTRY[self._engine_key_instance] = {
+                        "engine": new_engine,
+                        "ref_count": 1,
+                        "active_connections": 0,
                    }
                except Exception as e:
                    self.logger.error(f"Failed to create engine: {e}")
                    raise SQLAlchemyError(f"Engine creation failed: {e}") from e
 
-        #self.logger.debug(f"Connections Active: {self.active_connections}")
-
    def close(self) -> None:
-        """
-        Decrements the engine's reference count and disposes of the engine
-        if the count reaches zero. This is now typically called automatically
-        when exiting a `with` block.
-        """
-        # Prevent the method from running more than once per instance.
        if self._closed:
-            self.
+            if self.debug:
+                self.logger.debug("Attempted to close an already-closed config instance.")
            return
 
-        with
+        with _ENGINE_REGISTRY_LOCK:
            key = self._engine_key_instance
-
-
-            if not engine_wrapper:
+            wrapper = _ENGINE_REGISTRY.get(key)
+            if not wrapper:
                self.logger.warning("Attempted to close a config whose engine is not in the registry.")
-
-
-
-
-
-
-
-
-
-
-
+            else:
+                wrapper["ref_count"] -= 1
+                if self.debug:
+                    self.logger.debug(f"Closing connection. Ref count now {wrapper['ref_count']}.")
+                if wrapper["ref_count"] <= 0:
+                    if self.debug:
+                        self.logger.debug(f"Disposing engine as reference count is zero. Key: {key}")
+                    try:
+                        wrapper["engine"].dispose()
+                    finally:
+                        del _ENGINE_REGISTRY[key]
        self._closed = True
 
-
    def _attach_events(self) -> None:
-
-
-
-
+        if not self.engine:
+            return
+        event.listen(self.engine, "checkout", self._on_checkout)
+        event.listen(self.engine, "checkin", self._on_checkin)
 
    def _on_checkout(self, *args) -> None:
-
-
-        wrapper = self._engine_registry.get(self._engine_key_instance)
+        with _ENGINE_REGISTRY_LOCK:
+            wrapper = _ENGINE_REGISTRY.get(self._engine_key_instance)
        if wrapper:
-            wrapper[
+            wrapper["active_connections"] += 1
 
    def _on_checkin(self, *args) -> None:
-
-
-        wrapper = self._engine_registry.get(self._engine_key_instance)
+        with _ENGINE_REGISTRY_LOCK:
+            wrapper = _ENGINE_REGISTRY.get(self._engine_key_instance)
        if wrapper:
-            wrapper[
+            wrapper["active_connections"] = max(0, wrapper["active_connections"] - 1)
 
    @property
    def active_connections(self) -> int:
-
-
-        wrapper
-        return wrapper['active_connections'] if wrapper else 0
+        with _ENGINE_REGISTRY_LOCK:
+            wrapper = _ENGINE_REGISTRY.get(self._engine_key_instance)
+            return wrapper["active_connections"] if wrapper else 0
 
    def _validate_conn(self) -> None:
-        """Tests the database connection by executing a simple query."""
        try:
            with self.managed_connection() as conn:
                conn.execute(text("SELECT 1"))
-            self.
+            if self.debug:
+                self.logger.debug("Database connection validated successfully.")
        except OperationalError as e:
            self.logger.error(f"Database connection failed: {e}")
            raise ValueError(f"DB connection failed: {e}") from e
 
    @contextmanager
    def managed_connection(self) -> Generator[Any, None, None]:
-        """Provides a single database connection from the engine pool."""
        if not self.engine:
            raise RuntimeError("Engine not initialized. Cannot get a connection.")
        conn = self.engine.connect()
@@ -232,19 +193,19 @@ class SqlAlchemyConnectionConfig(BaseModel):
            conn.close()
 
    def get_session(self) -> Session:
-        """Returns a new SQLAlchemy Session from the session factory."""
        if not self.session_factory:
            raise RuntimeError("Session factory not initialized. Cannot get a session.")
        return self.session_factory()
 
    def _build_model(self) -> None:
-        """Dynamically builds an ORM model if `self.table` is set."""
        if not self.table or not self.engine:
            return
        try:
            builder = SqlAlchemyModelBuilder(self.engine, self.table)
            self.model = builder.build_model()
-            self.
+            if self.debug:
+                self.logger.debug(f"Successfully built ORM model for table: {self.table}")
        except Exception as e:
            self.logger.error(f"Failed to build ORM model for table '{self.table}': {e}")
            raise ValueError(f"Model construction failed for table '{self.table}': {e}") from e
+
```
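The rewrite replaces the class-level `_engine_registry`/`_registry_lock` with module-level `_ENGINE_REGISTRY` and `_ENGINE_REGISTRY_LOCK`: each registry entry holds the shared engine, a `ref_count`, and an `active_connections` counter, and `dispose()` runs only when the last `close()` drops the count to zero. A standalone sketch of that acquire/release pattern, independent of the package's classes:

```python
import threading
from typing import Any, Callable, Dict

_REGISTRY_LOCK = threading.RLock()
_REGISTRY: Dict[tuple, Dict[str, Any]] = {}

def acquire_engine(key: tuple, factory: Callable[[], Any]) -> Any:
    """Return the shared engine for `key`, creating it on first use."""
    with _REGISTRY_LOCK:
        wrapper = _REGISTRY.get(key)
        if wrapper is None:
            wrapper = {"engine": factory(), "ref_count": 0, "active_connections": 0}
            _REGISTRY[key] = wrapper
        wrapper["ref_count"] += 1
        return wrapper["engine"]

def release_engine(key: tuple) -> None:
    """Drop one reference; dispose and deregister on the last release."""
    with _REGISTRY_LOCK:
        wrapper = _REGISTRY.get(key)
        if wrapper is None:
            return
        wrapper["ref_count"] -= 1
        if wrapper["ref_count"] <= 0:
            try:
                wrapper["engine"].dispose()  # SQLAlchemy Engine.dispose()
            finally:
                del _REGISTRY[key]
```

Identical connection URLs thus share one engine and pool, and the engine outlives any individual config until the last holder closes it.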
**sibi_dst/df_helper/backends/sqlalchemy/_db_gatekeeper.py** (new file)

```diff
@@ -0,0 +1,15 @@
+import threading
+
+
+class DBGatekeeper:
+    _locks = {}
+    _global_lock = threading.Lock()
+
+    @classmethod
+    def get(cls, key: str, max_concurrency: int):
+        with cls._global_lock:
+            sem = cls._locks.get(key)
+            if sem is None:
+                sem = threading.BoundedSemaphore(max_concurrency)
+                cls._locks[key] = sem
+            return sem
```
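`DBGatekeeper` is a small new concurrency guard: one `BoundedSemaphore` per key, created lazily under a global lock. Note that `max_concurrency` only takes effect on the first `get()` for a given key; later calls reuse the existing semaphore. A usage sketch with an illustrative key and limit:

```python
import threading

from sibi_dst.df_helper.backends.sqlalchemy._db_gatekeeper import DBGatekeeper

def worker(n: int) -> None:
    # All callers passing the same key share one semaphore, so at most
    # four of these workers hold a slot at any moment.
    sem = DBGatekeeper.get("mysql://reports", max_concurrency=4)
    with sem:
        print(f"worker {n} is querying")

threads = [threading.Thread(target=worker, args=(i,)) for i in range(10)]
for t in threads:
    t.start()
for t in threads:
    t.join()
```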