sibi-dst 2025.1.13__py3-none-any.whl → 2025.8.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. sibi_dst/__init__.py +7 -1
  2. sibi_dst/df_helper/__init__.py +3 -2
  3. sibi_dst/df_helper/_artifact_updater_async.py +238 -0
  4. sibi_dst/df_helper/_artifact_updater_threaded.py +195 -0
  5. sibi_dst/df_helper/_df_helper.py +418 -118
  6. sibi_dst/df_helper/_parquet_artifact.py +275 -283
  7. sibi_dst/df_helper/_parquet_reader.py +9 -10
  8. sibi_dst/df_helper/backends/parquet/_parquet_options.py +8 -4
  9. sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +68 -107
  10. sibi_dst/df_helper/backends/sqlalchemy/_db_gatekeeper.py +15 -0
  11. sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +105 -255
  12. sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +90 -42
  13. sibi_dst/df_helper/backends/sqlalchemy/_model_registry.py +192 -0
  14. sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +122 -72
  15. sibi_dst/osmnx_helper/route_path_builder.py +45 -46
  16. sibi_dst/utils/__init__.py +2 -0
  17. sibi_dst/utils/base.py +235 -100
  18. sibi_dst/utils/business_days.py +248 -0
  19. sibi_dst/utils/clickhouse_writer.py +472 -206
  20. sibi_dst/utils/data_utils.py +139 -186
  21. sibi_dst/utils/data_wrapper.py +392 -88
  22. sibi_dst/utils/date_utils.py +711 -393
  23. sibi_dst/utils/df_utils.py +193 -213
  24. sibi_dst/utils/file_age_checker.py +301 -0
  25. sibi_dst/utils/file_utils.py +3 -2
  26. sibi_dst/utils/filepath_generator.py +314 -152
  27. sibi_dst/utils/log_utils.py +581 -242
  28. sibi_dst/utils/manifest_manager.py +60 -76
  29. sibi_dst/utils/parquet_saver.py +33 -27
  30. sibi_dst/utils/periods.py +42 -0
  31. sibi_dst/utils/phone_formatter.py +88 -95
  32. sibi_dst/utils/update_planner.py +180 -178
  33. sibi_dst/utils/webdav_client.py +116 -166
  34. {sibi_dst-2025.1.13.dist-info → sibi_dst-2025.8.2.dist-info}/METADATA +1 -1
  35. {sibi_dst-2025.1.13.dist-info → sibi_dst-2025.8.2.dist-info}/RECORD +36 -30
  36. sibi_dst/df_helper/_artifact_updater_multi_wrapper.py +0 -422
  37. {sibi_dst-2025.1.13.dist-info → sibi_dst-2025.8.2.dist-info}/WHEEL +0 -0
sibi_dst/df_helper/_parquet_reader.py
@@ -1,7 +1,8 @@
-from typing import Optional, ClassVar, Dict
+from typing import Optional, ClassVar, Dict, Any, Union
 
 import dask.dataframe as dd
 import fsspec
+import pandas as pd
 
 from sibi_dst.df_helper import DfHelper
 
@@ -23,7 +24,7 @@ class ParquetReader(DfHelper):
     :type config: dict
     :ivar df: Stores the loaded Dask DataFrame after the `load()` method is
         invoked. Initially set to None.
-    :type df: Optional[dd.DataFrame]
+    :type df: Optional[dd.DataFrame | pd.DataFrame]
     :ivar parquet_storage_path: The path to the Parquet storage directory.
     :type parquet_storage_path: str
     :ivar parquet_start_date: Start date for Parquet data selection. Must
@@ -32,19 +33,15 @@ class ParquetReader(DfHelper):
     :ivar parquet_end_date: End date for Parquet data selection. Must be
         set in the configuration.
     :type parquet_end_date: str
-    :ivar filesystem_type: The type of filesystem the Parquet files are
-        stored on (e.g., "file", "s3").
-    :type filesystem_type: str
-    :ivar filesystem_options: Any additional options required for the
-        specified filesystem type.
-    :type filesystem_options: dict
+
     :ivar fs: Instance of `fsspec` filesystem used to interact with the
         Parquet storage.
     :type fs: fsspec.AbstractFileSystem
     """
-    DEFAULT_CONFIG: ClassVar[Dict[str, int]] = {
+    DEFAULT_CONFIG: ClassVar[Dict[str, Any]] = {
         'backend': 'parquet'
     }
+    df: Optional[Union[dd.DataFrame, pd.DataFrame]] = None
 
     def __init__(self, **kwargs):
         self.config = {
@@ -63,11 +60,13 @@ class ParquetReader(DfHelper):
         self.parquet_end_date = self.config.setdefault('parquet_end_date', None)
         if self.parquet_end_date is None:
             raise ValueError('parquet_end_date must be set')
+        if self.fs is None:
+            raise ValueError('Parquet Reader mush be supplied a fs instance')
 
         if not self.directory_exists():
             raise ValueError(f"{self.parquet_storage_path} does not exist")
 
-    def load(self, **kwargs):
+    def load(self, **kwargs) -> Union[pd.DataFrame, dd.DataFrame]:
         self.df = super().load(**kwargs)
         return self.df
 
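For orientation, a minimal usage sketch of the reader after these changes. This assumes ParquetReader is exported from sibi_dst.df_helper and accepts the config keys shown above as keyword arguments; the path and dates are illustrative:

    # Hedged sketch: values are illustrative; an fsspec filesystem is now mandatory.
    import fsspec
    from sibi_dst.df_helper import ParquetReader

    fs = fsspec.filesystem("file")  # any fsspec.AbstractFileSystem
    reader = ParquetReader(
        fs=fs,
        parquet_storage_path="/data/parquet/orders",  # hypothetical path
        parquet_start_date="2025-01-01",
        parquet_end_date="2025-01-31",
    )
    df = reader.load()  # dd.DataFrame (or pd.DataFrame, per the new annotation)

Note that the constructor now fails fast when fs is missing or the storage path does not exist, rather than failing later inside load().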
sibi_dst/df_helper/backends/parquet/_parquet_options.py
@@ -202,16 +202,20 @@ class ParquetConfig(BaseModel):
 
         try:
             self.logger.debug(f"Attempting to load Parquet data from: {paths_to_load}")
-            return dd.read_parquet(
+            dd_result=dd.read_parquet(
                 paths_to_load,
                 engine="pyarrow",
                 filesystem=self.fs,
                 exclude=["_*", ".*"]
             )
+            return dd_result
+        except FileNotFoundError as e:
+            self.logger.debug(f"Parquet files not found at paths {paths_to_load}: {e}")
+            self.logger.debug("Returning empty DataFrame due to missing parquet files.")
+            return dd.from_pandas(pd.DataFrame(), npartitions=1)
         except Exception as e:
-            # This robust error handling is excellent.
-            self.logger.error(f"Parquet loading failed for paths {paths_to_load}: {e}", exc_info=True)
-            self.logger.warning("Returning empty DataFrame due to loading error.")
+            self.logger.debug(f"Parquet loading failed for paths {paths_to_load}: {e}")
+            self.logger.debug("Returning empty DataFrame due to loading error.")
             return dd.from_pandas(pd.DataFrame(), npartitions=1)
 
 
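A consequence for callers: missing parquet files no longer surface as errors. Both the new FileNotFoundError branch and the generic handler log at debug level and return an empty Dask DataFrame, so downstream code should test for emptiness instead of catching exceptions. A minimal sketch of that check, assuming the loader returns the fallback frame built by dd.from_pandas(pd.DataFrame(), npartitions=1):

    import pandas as pd
    import dask.dataframe as dd

    def is_empty_fallback(df: dd.DataFrame) -> bool:
        # The fallback frame has no columns at all, which real data never has.
        return len(df.columns) == 0

    # df = <result of the parquet load>
    # if is_empty_fallback(df): skip downstream processing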
sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py
@@ -1,15 +1,11 @@
+
 from __future__ import annotations
 import os
 import threading
 from contextlib import contextmanager
 from typing import Any, Optional, ClassVar, Generator, Type, Dict
 
-from pydantic import (
-    BaseModel,
-    field_validator,
-    model_validator,
-    ConfigDict,
-)
+from pydantic import BaseModel, field_validator, model_validator, ConfigDict
 from sqlalchemy import create_engine, event, text
 from sqlalchemy.engine import url as sqlalchemy_url
 from sqlalchemy.engine import Engine
@@ -17,32 +13,18 @@ from sqlalchemy.exc import OperationalError, SQLAlchemyError
 from sqlalchemy.orm import sessionmaker, Session
 from sqlalchemy.pool import QueuePool, NullPool, StaticPool, Pool
 
-# Assuming these are your project's internal modules
 from sibi_dst.utils import Logger
 from ._sql_model_builder import SqlAlchemyModelBuilder
 
+_ENGINE_REGISTRY_LOCK = threading.RLock()
+_ENGINE_REGISTRY: Dict[tuple, Dict[str, Any]] = {}
+
 
 class SqlAlchemyConnectionConfig(BaseModel):
     """
-    A thread-safe, registry-backed SQLAlchemy connection manager.
-
-    This class encapsulates database connection configuration and provides robust,
-    shared resource management. It is designed to be used as a context manager
-    to ensure resources are always released correctly.
-
-    Recommended Usage is via the `with` statement.
-        with SqlAlchemyConnectionConfig(...) as config:
-            session = config.get_session()
-            # ... do work ...
-        # config.close() is called automatically upon exiting the block.
-
-    Key Features:
-    - Context Manager Support: Guarantees resource cleanup.
-    - Shared Engine & Pool: Reuses a single SQLAlchemy Engine for identical
-      database URLs and pool settings, improving application performance.
-    - Reference Counting: Safely manages the lifecycle of the shared engine,
-      disposing of it only when the last user has closed its connection config.
+    Thread-safe, registry-backed SQLAlchemy connection manager.
     """
+
     # --- Public Configuration ---
     connection_url: str
     table: Optional[str] = None
@@ -50,36 +32,28 @@ class SqlAlchemyConnectionConfig(BaseModel):
 
     # --- Pool Configuration ---
     pool_size: int = int(os.environ.get("DB_POOL_SIZE", 5))
-    max_overflow: int = int(os.environ.get("DB_MAX_OVERFLOW",10))
+    max_overflow: int = int(os.environ.get("DB_MAX_OVERFLOW", 10))
     pool_timeout: int = int(os.environ.get("DB_POOL_TIMEOUT", 30))
     pool_recycle: int = int(os.environ.get("DB_POOL_RECYCLE", 1800))
     pool_pre_ping: bool = True
     poolclass: Type[Pool] = QueuePool
 
-    # --- Internal & Runtime State ---
+    # --- Internal & Runtime State (normal fields; Pydantic allowed) ---
    model: Optional[Type[Any]] = None
     engine: Optional[Engine] = None
     logger: Optional[Logger] = None
-    _own_logger: bool = False  # Indicates if this instance owns the logger.
+    _own_logger: bool = False
     session_factory: Optional[sessionmaker] = None
 
-    # --- Private State ---
+    # --- Private State (plain Python values only) ---
     _engine_key_instance: tuple = ()
-    _closed: bool = False  # Flag to prevent double-closing.
-
-    # --- Class-level Shared Resources ---
-    _engine_registry: ClassVar[Dict[tuple, Dict[str, Any]]] = {}
-    _registry_lock: ClassVar[threading.Lock] = threading.Lock()
-
+    _closed: bool = False  # prevent double-closing
     model_config = ConfigDict(arbitrary_types_allowed=True)
 
-    # Add __enter__ and __exit__ for context manager protocol
-    def __enter__(self) -> SqlAlchemyConnectionConfig:
-        """Enter the runtime context, returning self."""
+    def __enter__(self) -> "SqlAlchemyConnectionConfig":
         return self
 
     def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
-        """Exit the runtime context, ensuring that close() is called."""
         self.close()
 
     @field_validator("pool_size", "max_overflow", "pool_timeout", "pool_recycle")
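Because the pool defaults call os.environ.get(...) in the class body, they are captured once, when the module is first imported. A sketch of the required ordering (values are examples):

    import os

    # Set overrides before sibi_dst's sqlalchemy backend is imported;
    # later changes to the environment will not affect the defaults.
    os.environ["DB_POOL_SIZE"] = "20"
    os.environ["DB_MAX_OVERFLOW"] = "40"

    from sibi_dst.df_helper.backends.sqlalchemy._db_connection import (
        SqlAlchemyConnectionConfig,
    )
    # Instances now default to pool_size=20, max_overflow=40.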
@@ -90,8 +64,7 @@ class SqlAlchemyConnectionConfig(BaseModel):
         return v
 
     @model_validator(mode="after")
-    def _init_all(self) -> SqlAlchemyConnectionConfig:
-        """Orchestrates the initialization process after Pydantic validation."""
+    def _init_all(self) -> "SqlAlchemyConnectionConfig":
         self._init_logger()
         self._engine_key_instance = self._get_engine_key()
         self._init_engine()
102
75
  return self
103
76
 
104
77
  def _init_logger(self) -> None:
105
- """Initializes the logger for this instance."""
106
- # This is not a ManagedResource subclass, so we handle logger initialization directly.
107
- # unless a logger is provided, we create our own.
108
78
  if self.logger is None:
109
79
  self._own_logger = True
110
80
  self.logger = Logger.default_logger(logger_name=self.__class__.__name__)
111
- log_level = Logger.DEBUG if self.debug else Logger.INFO
112
- self.logger.set_level(log_level)
81
+ self.logger.set_level(Logger.DEBUG if self.debug else Logger.INFO)
113
82
 
114
83
  def _get_engine_key(self) -> tuple:
115
- """Generates a unique, normalized key for an engine configuration."""
116
84
  parsed = sqlalchemy_url.make_url(self.connection_url)
117
85
  query = {k: v for k, v in parsed.query.items() if not k.startswith("pool_")}
118
86
  normalized_url = parsed.set(query=query)
@@ -125,104 +93,97 @@ class SqlAlchemyConnectionConfig(BaseModel):
125
93
  return tuple(key_parts)
126
94
 
127
95
  def _init_engine(self) -> None:
128
- """Initializes or reuses a shared SQLAlchemy Engine."""
129
- with self._registry_lock:
130
- engine_wrapper = self._engine_registry.get(self._engine_key_instance)
131
- if engine_wrapper:
132
- self.engine = engine_wrapper['engine']
133
- engine_wrapper['ref_count'] += 1
134
- self.logger.debug(f"Reusing engine. Ref count: {engine_wrapper['ref_count']}.")
96
+ with _ENGINE_REGISTRY_LOCK:
97
+ wrapper = _ENGINE_REGISTRY.get(self._engine_key_instance)
98
+ if wrapper:
99
+ self.engine = wrapper["engine"]
100
+ wrapper["ref_count"] += 1
101
+ if self.debug:
102
+ self.logger.debug(f"Reusing engine. Ref count: {wrapper['ref_count']}.")
135
103
  else:
136
- self.logger.debug(f"Creating new engine for key: {self._engine_key_instance}")
104
+ if self.debug:
105
+ self.logger.debug(f"Creating new engine for key: {self._engine_key_instance}")
137
106
  try:
138
107
  new_engine = create_engine(
139
- self.connection_url, pool_size=self.pool_size,
140
- max_overflow=self.max_overflow, pool_timeout=self.pool_timeout,
141
- pool_recycle=self.pool_recycle, pool_pre_ping=self.pool_pre_ping,
108
+ self.connection_url,
109
+ pool_size=self.pool_size,
110
+ max_overflow=self.max_overflow,
111
+ pool_timeout=self.pool_timeout,
112
+ pool_recycle=self.pool_recycle,
113
+ pool_pre_ping=self.pool_pre_ping,
142
114
  poolclass=self.poolclass,
143
115
  )
144
116
  self.engine = new_engine
145
117
  self._attach_events()
146
- self._engine_registry[self._engine_key_instance] = {
147
- 'engine': new_engine, 'ref_count': 1, 'active_connections': 0
118
+ _ENGINE_REGISTRY[self._engine_key_instance] = {
119
+ "engine": new_engine,
120
+ "ref_count": 1,
121
+ "active_connections": 0,
148
122
  }
149
123
  except Exception as e:
150
124
  self.logger.error(f"Failed to create engine: {e}")
151
125
  raise SQLAlchemyError(f"Engine creation failed: {e}") from e
152
126
 
153
- #self.logger.debug(f"Connections Active: {self.active_connections}")
154
-
155
127
  def close(self) -> None:
156
- """
157
- Decrements the engine's reference count and disposes of the engine
158
- if the count reaches zero. This is now typically called automatically
159
- when exiting a `with` block.
160
- """
161
- # Prevent the method from running more than once per instance.
162
128
  if self._closed:
163
- self.logger.debug("Attempted to close an already-closed config instance.")
129
+ if self.debug:
130
+ self.logger.debug("Attempted to close an already-closed config instance.")
164
131
  return
165
132
 
166
- with self._registry_lock:
133
+ with _ENGINE_REGISTRY_LOCK:
167
134
  key = self._engine_key_instance
168
- engine_wrapper = self._engine_registry.get(key)
169
-
170
- if not engine_wrapper:
135
+ wrapper = _ENGINE_REGISTRY.get(key)
136
+ if not wrapper:
171
137
  self.logger.warning("Attempted to close a config whose engine is not in the registry.")
172
- return
173
-
174
- engine_wrapper['ref_count'] -= 1
175
- self.logger.debug(f"Closing connection within engine wrapper. Ref count is now {engine_wrapper['ref_count']}.")
176
-
177
- if engine_wrapper['ref_count'] <= 0:
178
- self.logger.debug(f"Disposing engine as reference count is zero. Key: {key}")
179
- engine_wrapper['engine'].dispose()
180
- del self._engine_registry[key]
181
-
182
- # Mark this instance as closed to prevent subsequent calls.
138
+ else:
139
+ wrapper["ref_count"] -= 1
140
+ if self.debug:
141
+ self.logger.debug(f"Closing connection. Ref count now {wrapper['ref_count']}.")
142
+ if wrapper["ref_count"] <= 0:
143
+ if self.debug:
144
+ self.logger.debug(f"Disposing engine as reference count is zero. Key: {key}")
145
+ try:
146
+ wrapper["engine"].dispose()
147
+ finally:
148
+ del _ENGINE_REGISTRY[key]
183
149
  self._closed = True
184
150
 
185
-
186
151
  def _attach_events(self) -> None:
187
- """Attaches checkout/checkin events to the engine for connection tracking."""
188
- if self.engine:
189
- event.listen(self.engine, "checkout", self._on_checkout)
190
- event.listen(self.engine, "checkin", self._on_checkin)
152
+ if not self.engine:
153
+ return
154
+ event.listen(self.engine, "checkout", self._on_checkout)
155
+ event.listen(self.engine, "checkin", self._on_checkin)
191
156
 
192
157
  def _on_checkout(self, *args) -> None:
193
- """Event listener for when a connection is checked out from the pool."""
194
- with self._registry_lock:
195
- wrapper = self._engine_registry.get(self._engine_key_instance)
158
+ with _ENGINE_REGISTRY_LOCK:
159
+ wrapper = _ENGINE_REGISTRY.get(self._engine_key_instance)
196
160
  if wrapper:
197
- wrapper['active_connections'] += 1
161
+ wrapper["active_connections"] += 1
198
162
 
199
163
  def _on_checkin(self, *args) -> None:
200
- """Event listener for when a connection is returned to the pool."""
201
- with self._registry_lock:
202
- wrapper = self._engine_registry.get(self._engine_key_instance)
164
+ with _ENGINE_REGISTRY_LOCK:
165
+ wrapper = _ENGINE_REGISTRY.get(self._engine_key_instance)
203
166
  if wrapper:
204
- wrapper['active_connections'] = max(0, wrapper['active_connections'] - 1)
167
+ wrapper["active_connections"] = max(0, wrapper["active_connections"] - 1)
205
168
 
206
169
  @property
207
170
  def active_connections(self) -> int:
208
- """Returns the number of active connections for this instance's engine."""
209
- with self._registry_lock:
210
- wrapper = self._engine_registry.get(self._engine_key_instance)
211
- return wrapper['active_connections'] if wrapper else 0
171
+ with _ENGINE_REGISTRY_LOCK:
172
+ wrapper = _ENGINE_REGISTRY.get(self._engine_key_instance)
173
+ return wrapper["active_connections"] if wrapper else 0
212
174
 
213
175
  def _validate_conn(self) -> None:
214
- """Tests the database connection by executing a simple query."""
215
176
  try:
216
177
  with self.managed_connection() as conn:
217
178
  conn.execute(text("SELECT 1"))
218
- self.logger.debug("Database connection validated successfully.")
179
+ if self.debug:
180
+ self.logger.debug("Database connection validated successfully.")
219
181
  except OperationalError as e:
220
182
  self.logger.error(f"Database connection failed: {e}")
221
183
  raise ValueError(f"DB connection failed: {e}") from e
222
184
 
223
185
  @contextmanager
224
186
  def managed_connection(self) -> Generator[Any, None, None]:
225
- """Provides a single database connection from the engine pool."""
226
187
  if not self.engine:
227
188
  raise RuntimeError("Engine not initialized. Cannot get a connection.")
228
189
  conn = self.engine.connect()
@@ -232,19 +193,19 @@ class SqlAlchemyConnectionConfig(BaseModel):
232
193
  conn.close()
233
194
 
234
195
  def get_session(self) -> Session:
235
- """Returns a new SQLAlchemy Session from the session factory."""
236
196
  if not self.session_factory:
237
197
  raise RuntimeError("Session factory not initialized. Cannot get a session.")
238
198
  return self.session_factory()
239
199
 
240
200
  def _build_model(self) -> None:
241
- """Dynamically builds an ORM model if `self.table` is set."""
242
201
  if not self.table or not self.engine:
243
202
  return
244
203
  try:
245
204
  builder = SqlAlchemyModelBuilder(self.engine, self.table)
246
205
  self.model = builder.build_model()
247
- self.logger.debug(f"Successfully built ORM model for table: {self.table}")
206
+ if self.debug:
207
+ self.logger.debug(f"Successfully built ORM model for table: {self.table}")
248
208
  except Exception as e:
249
209
  self.logger.error(f"Failed to build ORM model for table '{self.table}': {e}")
250
210
  raise ValueError(f"Model construction failed for table '{self.table}': {e}") from e
211
+
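Taken together, identical normalized URLs share one Engine; ref_count tracks the number of live configs and the engine is disposed only when the last one closes. A hedged usage sketch (the connection string is illustrative and assumes a reachable database):

    from sibi_dst.df_helper.backends.sqlalchemy._db_connection import (
        SqlAlchemyConnectionConfig,
    )

    URL = "postgresql://user:secret@db.example.com/app"  # illustrative

    with SqlAlchemyConnectionConfig(connection_url=URL) as a:
        with SqlAlchemyConnectionConfig(connection_url=URL) as b:
            assert a.engine is b.engine  # shared registry entry, ref_count == 2
            with a.managed_connection() as conn:
                ...  # run queries on a pooled connection
    # both exits decrement ref_count; at zero the shared engine is disposed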
sibi_dst/df_helper/backends/sqlalchemy/_db_gatekeeper.py (new file)
@@ -0,0 +1,15 @@
+import threading
+
+
+class DBGatekeeper:
+    _locks = {}
+    _global_lock = threading.Lock()
+
+    @classmethod
+    def get(cls, key: str, max_concurrency: int):
+        with cls._global_lock:
+            sem = cls._locks.get(key)
+            if sem is None:
+                sem = threading.BoundedSemaphore(max_concurrency)
+                cls._locks[key] = sem
+            return sem
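DBGatekeeper hands out one process-wide BoundedSemaphore per key, capping how many threads may hit a given database at once. A small usage sketch (key and limit are illustrative):

    from sibi_dst.df_helper.backends.sqlalchemy._db_gatekeeper import DBGatekeeper

    sem = DBGatekeeper.get("db.example.com/app", max_concurrency=8)

    def run_query():
        with sem:  # at most 8 concurrent holders per key
            ...  # perform the database work

Note that max_concurrency only takes effect the first time a key is seen; later calls with a different limit return the already-created semaphore unchanged.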