sibi-dst 0.3.63__py3-none-any.whl → 0.3.64__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,4 +1,3 @@
- from ._filter_handler import SqlAlchemyFilterHandler
  from ._db_connection import SqlAlchemyConnectionConfig
  from ._load_from_db import SqlAlchemyLoadFromDb
  from ._sql_model_builder import SqlAlchemyModelBuilder
@@ -7,5 +6,4 @@ __all__ = [
      'SqlAlchemyConnectionConfig',
      'SqlAlchemyModelBuilder',
      'SqlAlchemyLoadFromDb',
-     'SqlAlchemyFilterHandler'
  ]
@@ -1,51 +1,85 @@
  from __future__ import annotations
- from typing import Any, Optional, ClassVar, Generator, Type
+
  import threading
  from contextlib import contextmanager
- from pydantic import BaseModel, field_validator, ValidationError, model_validator
+ from typing import Any, Optional, ClassVar, Generator, Type, Dict
+
+ from pydantic import (
+     BaseModel,
+     field_validator,
+     model_validator,
+     ConfigDict,
+ )
  from sqlalchemy import create_engine, event, text
  from sqlalchemy.engine import url as sqlalchemy_url
  from sqlalchemy.engine import Engine
- from sqlalchemy.exc import OperationalError
- from sqlalchemy.pool import QueuePool, NullPool, StaticPool
+ from sqlalchemy.exc import OperationalError, SQLAlchemyError
  from sqlalchemy.orm import sessionmaker, Session
+ from sqlalchemy.pool import QueuePool, NullPool, StaticPool
+
+ # Assuming these are your project's internal modules
  from sibi_dst.utils import Logger
  from ._sql_model_builder import SqlAlchemyModelBuilder


  class SqlAlchemyConnectionConfig(BaseModel):
      """
-     Thread-safe, registry-backed SQLAlchemy connection manager with:
-       - Shared engine reuse
-       - Active connection tracking
-       - Idle-pool and database-level cleanup
-       - Dynamic ORM model building via SqlAlchemyModelBuilder
-       - Optional session factory
+     A thread-safe, registry-backed SQLAlchemy connection manager.
+
+     This class encapsulates database connection configuration and provides robust,
+     shared resource management. It is designed to be used as a context manager
+     to ensure resources are always released correctly.
+
+     Recommended usage is via the `with` statement:
+         with SqlAlchemyConnectionConfig(...) as config:
+             session = config.get_session()
+             # ... do work ...
+         # config.close() is called automatically upon exiting the block.
+
+     Key Features:
+       - Context Manager Support: Guarantees resource cleanup.
+       - Shared Engine & Pool: Reuses a single SQLAlchemy Engine for identical
+         database URLs and pool settings, improving application performance.
+       - Reference Counting: Safely manages the lifecycle of the shared engine,
+         disposing of it only when the last user has closed its connection config.
      """
+     # --- Public Configuration ---
      connection_url: str
      table: Optional[str] = None
-     model: Optional[Any] = None
-     engine: Optional[Engine] = None
-     logger: Logger = None
      debug: bool = False

+     # --- Pool Configuration ---
      pool_size: int = 5
      max_overflow: int = 10
      pool_timeout: int = 30
-     pool_recycle: int = 300
+     pool_recycle: int = 1800
      pool_pre_ping: bool = True
-     poolclass: Type = QueuePool
+     poolclass: Type[QueuePool] = QueuePool

+     # --- Internal & Runtime State ---
+     model: Optional[Type[Any]] = None
+     engine: Optional[Engine] = None
+     logger: Optional[Logger] = None
      session_factory: Optional[sessionmaker] = None
-     _owns_engine: bool = False

-     _engine_registry: ClassVar[dict[tuple, Engine]] = {}
+     # --- Private State ---
+     _engine_key_instance: tuple = ()
+     _closed: bool = False  # Flag to prevent double-closing.
+
+     # --- Class-level Shared Resources ---
+     _engine_registry: ClassVar[Dict[tuple, Dict[str, Any]]] = {}
      _registry_lock: ClassVar[threading.Lock] = threading.Lock()
-     _active_connections: ClassVar[int] = 0

-     class Config:
-         arbitrary_types_allowed = True
-         underscore_attrs_are_private = True
+     model_config = ConfigDict(arbitrary_types_allowed=True)
+
+     # Add __enter__ and __exit__ for context manager protocol
+     def __enter__(self) -> SqlAlchemyConnectionConfig:
+         """Enter the runtime context, returning self."""
+         return self
+
+     def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
+         """Exit the runtime context, ensuring that close() is called."""
+         self.close()

      @field_validator("pool_size", "max_overflow", "pool_timeout", "pool_recycle")
      @classmethod
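
A minimal usage sketch of the context-manager flow described in the docstring above (the connection URL is a placeholder and the import location of the class is an assumption, not something this diff states):

    from sqlalchemy import text

    # Import path assumed; adjust to wherever the package re-exports the class.
    with SqlAlchemyConnectionConfig(
        connection_url="sqlite:///demo.db",  # placeholder URL
    ) as config:
        session = config.get_session()
        try:
            session.execute(text("SELECT 1"))  # do work with the session
        finally:
            session.close()
    # Exiting the block calls config.close(), which decrements the shared
    # engine's reference count and disposes of the engine when it reaches zero.
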
@@ -56,78 +90,139 @@ class SqlAlchemyConnectionConfig(BaseModel):

      @model_validator(mode="after")
      def _init_all(self) -> SqlAlchemyConnectionConfig:
+         """Orchestrates the initialization process after Pydantic validation."""
          self._init_logger()
+         self._engine_key_instance = self._get_engine_key()
          self._init_engine()
          self._validate_conn()
          self._build_model()
-         self.session_factory = sessionmaker(bind=self.engine, expire_on_commit=False)
+         if self.engine:
+             self.session_factory = sessionmaker(bind=self.engine, expire_on_commit=False)
          return self

      def _init_logger(self) -> None:
-         self.logger = self.logger or Logger.default_logger(logger_name=self.__class__.__name__)
-         self.logger.set_level(Logger.DEBUG if self.debug else Logger.INFO)
-
-     def _engine_key(self) -> tuple:
+         """Initializes the logger for this instance."""
+         if self.logger is None:
+             self.logger = Logger.default_logger(logger_name=self.__class__.__name__)
+         log_level = Logger.DEBUG if self.debug else Logger.INFO
+         self.logger.set_level(log_level)
+
+     def _get_engine_key(self) -> tuple:
+         """Generates a unique, normalized key for an engine configuration."""
          parsed = sqlalchemy_url.make_url(self.connection_url)
          query = {k: v for k, v in parsed.query.items() if not k.startswith("pool_")}
-         normalized = parsed.set(query=query)
-         key = [str(normalized)]
+         normalized_url = parsed.set(query=query)
+         key_parts = [str(normalized_url)]
          if self.poolclass not in (NullPool, StaticPool):
-             key += [self.pool_size, self.max_overflow, self.pool_timeout, self.pool_recycle, self.pool_pre_ping, self.table]
-         return tuple(key)
+             key_parts += [
+                 self.pool_size, self.max_overflow, self.pool_timeout,
+                 self.pool_recycle, self.pool_pre_ping
+             ]
+         return tuple(key_parts)

      def _init_engine(self) -> None:
-         key = self._engine_key()
+         """Initializes or reuses a shared SQLAlchemy Engine."""
          with self._registry_lock:
-             existing = self._engine_registry.get(key)
-             if existing:
-                 self.engine = existing
-                 self._owns_engine = False
-                 self.logger.debug(f"Reusing engine {key}")
+             engine_wrapper = self._engine_registry.get(self._engine_key_instance)
+             if engine_wrapper:
+                 self.engine = engine_wrapper['engine']
+                 engine_wrapper['ref_count'] += 1
+                 self.logger.debug(f"Reusing engine. Ref count: {engine_wrapper['ref_count']}.")
              else:
-                 self.logger.debug(f"Creating engine {key}")
-                 self.engine = create_engine(
-                     self.connection_url,
-                     pool_size=self.pool_size,
-                     max_overflow=self.max_overflow,
-                     pool_timeout=self.pool_timeout,
-                     pool_recycle=self.pool_recycle,
-                     pool_pre_ping=self.pool_pre_ping,
-                     poolclass=self.poolclass,
-                 )
-                 self._attach_events()
-                 self._engine_registry[key] = self.engine
-                 self._owns_engine = True
+                 self.logger.debug(f"Creating new engine for key: {self._engine_key_instance}")
+                 try:
+                     new_engine = create_engine(
+                         self.connection_url, pool_size=self.pool_size,
+                         max_overflow=self.max_overflow, pool_timeout=self.pool_timeout,
+                         pool_recycle=self.pool_recycle, pool_pre_ping=self.pool_pre_ping,
+                         poolclass=self.poolclass,
+                     )
+                     self.engine = new_engine
+                     self._attach_events()
+                     self._engine_registry[self._engine_key_instance] = {
+                         'engine': new_engine, 'ref_count': 1, 'active_connections': 0
+                     }
+                 except Exception as e:
+                     self.logger.error(f"Failed to create engine: {e}")
+                     raise SQLAlchemyError(f"Engine creation failed: {e}") from e
+
+     def close(self) -> None:
+         """
+         Decrements the engine's reference count and disposes of the engine
+         if the count reaches zero. This is now typically called automatically
+         when exiting a `with` block.
+         """
+         # Prevent the method from running more than once per instance.
+         if self._closed:
+             self.logger.debug("Attempted to close an already-closed config instance.")
+             return
+
+         with self._registry_lock:
+             key = self._engine_key_instance
+             engine_wrapper = self._engine_registry.get(key)
+
+             if not engine_wrapper:
+                 self.logger.warning("Attempted to close a config whose engine is not in the registry.")
+                 return
+
+             engine_wrapper['ref_count'] -= 1
+             self.logger.debug(f"Closing config. Ref count is now {engine_wrapper['ref_count']}.")
+
+             if engine_wrapper['ref_count'] <= 0:
+                 self.logger.debug(f"Disposing engine as reference count is zero. Key: {key}")
+                 engine_wrapper['engine'].dispose()
+                 del self._engine_registry[key]
+
+         # Mark this instance as closed to prevent subsequent calls.
+         self._closed = True
+
+     # ... (the rest of your methods like _attach_events, _on_checkout, get_session, etc. remain unchanged)
+     # They are omitted here for brevity but should be included in your final file.

      def _attach_events(self) -> None:
-         event.listen(self.engine, "checkout", self._on_checkout)
-         event.listen(self.engine, "checkin", self._on_checkin)
+         """Attaches checkout/checkin events to the engine for connection tracking."""
+         if self.engine:
+             event.listen(self.engine, "checkout", self._on_checkout)
+             event.listen(self.engine, "checkin", self._on_checkin)

      def _on_checkout(self, *args) -> None:
+         """Event listener for when a connection is checked out from the pool."""
          with self._registry_lock:
-             type(self)._active_connections += 1
-         self.logger.debug(f"Checked out, active: {self.active_connections}")
+             wrapper = self._engine_registry.get(self._engine_key_instance)
+             if wrapper:
+                 wrapper['active_connections'] += 1
+         self.logger.debug(f"Connection checked out. Active: {self.active_connections}")

      def _on_checkin(self, *args) -> None:
+         """Event listener for when a connection is returned to the pool."""
          with self._registry_lock:
-             type(self)._active_connections = max(type(self)._active_connections - 1, 0)
-         self.logger.debug(f"Checked in, active: {self.active_connections}")
+             wrapper = self._engine_registry.get(self._engine_key_instance)
+             if wrapper:
+                 wrapper['active_connections'] = max(0, wrapper['active_connections'] - 1)
+         self.logger.debug(f"Connection checked in. Active: {self.active_connections}")

      @property
      def active_connections(self) -> int:
-         return type(self)._active_connections
+         """Returns the number of active connections for this instance's engine."""
+         with self._registry_lock:
+             wrapper = self._engine_registry.get(self._engine_key_instance)
+             return wrapper['active_connections'] if wrapper else 0

      def _validate_conn(self) -> None:
+         """Tests the database connection by executing a simple query."""
          try:
              with self.managed_connection() as conn:
                  conn.execute(text("SELECT 1"))
-             self.logger.debug("Connection OK")
+             self.logger.debug("Database connection validated successfully.")
          except OperationalError as e:
-             self.logger.error(f"Connection failed: {e}")
-             raise ValidationError(f"DB connection failed: {e}")
+             self.logger.error(f"Database connection failed: {e}")
+             raise ValueError(f"DB connection failed: {e}") from e

      @contextmanager
-     def managed_connection(self) -> Generator[Any, None, Any]:
+     def managed_connection(self) -> Generator[Any, None, None]:
+         """Provides a single database connection from the engine pool."""
+         if not self.engine:
+             raise RuntimeError("Engine not initialized. Cannot get a connection.")
          conn = self.engine.connect()
          try:
              yield conn
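
The reference counting introduced in this hunk can be illustrated with a short sketch (hypothetical, not taken from the package; the SQLite URL is a placeholder):

    # Two configs with identical URLs and pool settings share one engine.
    cfg_a = SqlAlchemyConnectionConfig(connection_url="sqlite:///demo.db")
    cfg_b = SqlAlchemyConnectionConfig(connection_url="sqlite:///demo.db")
    assert cfg_a.engine is cfg_b.engine  # same registry entry, ref_count == 2

    cfg_a.close()  # ref_count drops to 1; the engine stays alive for cfg_b
    cfg_b.close()  # ref_count reaches 0; engine.dispose() runs and the
                   # registry entry is removed

This replaces the 0.3.63 `_owns_engine` scheme, under which only the config that created the engine could dispose of it, regardless of other users.
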
@@ -135,68 +230,19 @@ class SqlAlchemyConnectionConfig(BaseModel):
              conn.close()

      def get_session(self) -> Session:
+         """Returns a new SQLAlchemy Session from the session factory."""
          if not self.session_factory:
-             raise RuntimeError("Session factory not initialized")
+             raise RuntimeError("Session factory not initialized. Cannot get a session.")
          return self.session_factory()

      def _build_model(self) -> None:
-         """Dynamically build and assign the ORM model if table is set"""
+         """Dynamically builds an ORM model if `self.table` is set."""
          if not self.table or not self.engine:
              return
          try:
              builder = SqlAlchemyModelBuilder(self.engine, self.table)
              self.model = builder.build_model()
-             self.logger.debug(f"Model built for table: {self.table}")
+             self.logger.debug(f"Successfully built ORM model for table: {self.table}")
          except Exception as e:
-             self.logger.error(f"Model build failed: {e}")
-             raise ValidationError(f"Model construction error: {e}") from e
-
-     def dispose_idle_connections(self) -> int:
-         key = self._engine_key()
-         with self._registry_lock:
-             if self._engine_registry.get(key) is not self.engine:
-                 self.logger.debug("Engine changed")
-                 return 0
-             pool = self.engine.pool
-             if isinstance(pool, QueuePool):
-                 count = pool.checkedin()
-                 pool.dispose()
-                 self.logger.debug(f"Disposed {count}")
-                 return count
-             self.logger.warning(f"No idle dispose for {type(pool).__name__}")
-             return 0
-
-     def terminate_idle_connections(self, idle_seconds: int = 300) -> int:
-         terminated = 0
-         dialect = self.engine.dialect.name
-         with self.managed_connection() as conn:
-             if dialect == 'postgresql':
-                 res = conn.execute(text(
-                     f"SELECT pg_terminate_backend(pid) FROM pg_stat_activity "
-                     f"WHERE state='idle' AND (now() - query_start) > interval '{idle_seconds} seconds' "
-                     f"AND pid<>pg_backend_pid()"
-                 ))
-                 terminated = res.rowcount
-             elif dialect == 'mysql':
-                 for row in conn.execute(text("SHOW PROCESSLIST")):
-                     if row.Command == 'Sleep' and row.Time > idle_seconds:
-                         conn.execute(text(f"KILL {row.Id}"))
-                         terminated += 1
-             else:
-                 self.logger.warning(f"Idle termination not supported: {dialect}")
-         self.logger.debug(f"Terminated {terminated}")
-         return terminated
-
-     def close(self) -> None:
-         with self._registry_lock:
-             key = self._engine_key()
-             if not self._owns_engine:
-                 self.logger.debug("Not owner, skipping close")
-                 return
-             if self._engine_registry.get(key) != self.engine:
-                 self.logger.debug("Engine not in registry")
-                 return
-             self.engine.dispose()
-             del self._engine_registry[key]
-             type(self)._active_connections = 0
-             self.logger.debug(f"Engine closed {key}")
+             self.logger.error(f"Failed to build ORM model for table '{self.table}': {e}")
+             raise ValueError(f"Model construction failed for table '{self.table}': {e}") from e
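
When a `table` is supplied, the dynamically built ORM class is exposed as `config.model`. A brief sketch of combining this with `get_session()` (the URL and table name are hypothetical):

    with SqlAlchemyConnectionConfig(
        connection_url="sqlite:///demo.db",  # placeholder
        table="customers",                   # hypothetical table name
    ) as config:
        Customer = config.model  # ORM class built by SqlAlchemyModelBuilder
        session = config.get_session()
        try:
            first_row = session.query(Customer).first()
        finally:
            session.close()
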
@@ -1,135 +1,179 @@
- import itertools
+ from typing import Type

+ import dask
  import dask.dataframe as dd
  import pandas as pd
- from sqlalchemy import create_engine, inspect, select
- from sqlalchemy.orm import sessionmaker
-
+ from sqlalchemy import (
+     inspect,
+     select,
+     func,
+ )
+ from sqlalchemy.engine import Engine
+ from sqlalchemy.orm import declarative_base
+ import time
+ from sqlalchemy.exc import TimeoutError
+ import sqlalchemy as sa
  from sibi_dst.df_helper.core import FilterHandler
  from sibi_dst.utils import Logger


  class SQLAlchemyDask:
-     def __init__(self, model, filters, engine_url, chunk_size=1000, logger=None, debug=False):
+     """
+     Loads data from a database into a Dask DataFrame using a memory-safe,
+     non-parallel, paginated approach.
+
+     This class avoids using a numeric `index_col` for parallel loading.
+     """
+
+     _SQLALCHEMY_TO_DASK_DTYPE = {
+         "INTEGER": "Int64",
+         "SMALLINT": "Int64",
+         "BIGINT": "Int64",
+         "FLOAT": "float64",
+         "NUMERIC": "float64",
+         "BOOLEAN": "bool",
+         "VARCHAR": "object",
+         "TEXT": "object",
+         "DATE": "datetime64[ns]",
+         "DATETIME": "datetime64[ns]",
+         "TIME": "object",
+         "UUID": "object",
+     }
+
+     def __init__(
+         self,
+         model: Type[declarative_base()],
+         filters: dict,
+         engine: Engine,
+         chunk_size: int = 1000,
+         logger=None,
+         debug: bool = False,
+     ):
          """
-         Initialize with an SQLAlchemy query and database engine URL.
-
-         :param model: SQLAlchemy ORM model.
-         :param filters: Filters to apply on the query.
-         :param engine_url: Database connection string for SQLAlchemy engine.
-         :param chunk_size: Number of records per chunk for Dask partitions.
-         :param logger: Logger instance for logging.
-         :param debug: Whether to print detailed logs.
+         Initializes the data loader.
+
+         Args:
+             model: The SQLAlchemy ORM model for the table.
+             filters: A dictionary of filters to apply to the query.
+             engine: An SQLAlchemy Engine instance.
+             chunk_size: The number of records to fetch in each database query.
+             logger: A logger instance.
+             debug: Whether to enable detailed logging.
          """
-         self.query = None
          self.model = model
         self.filters = filters
+         self.engine = engine
          self.chunk_size = chunk_size
          self.debug = debug
-         self.engine = create_engine(engine_url)
-         self.Session = sessionmaker(bind=self.engine)
          self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
-         self.logger.set_level(logger.DEBUG if debug else logger.INFO)
+         self.logger.set_level(Logger.DEBUG if debug else Logger.INFO)
+         self.filter_handler_cls = FilterHandler

-     @staticmethod
-     def infer_dtypes_from_model(model):
+     @classmethod
+     def infer_meta_from_model(cls, model: Type[declarative_base()]) -> dict:
          """
-         Infer data types for Dask DataFrame based on SQLAlchemy ORM model columns.
+         Infers a metadata dictionary for Dask based on the SQLAlchemy model.
+         This helps Dask understand the DataFrame structure without reading data.
          """
          mapper = inspect(model)
-         sqlalchemy_to_dask_dtype = {
-             'INTEGER': 'Int64',
-             'SMALLINT': 'Int64',
-             'BIGINT': 'Int64',
-             'FLOAT': 'float64',
-             'NUMERIC': 'float64',
-             'BOOLEAN': 'bool',
-             'VARCHAR': 'object',
-             'TEXT': 'object',
-             'DATE': 'datetime64[ns]',
-             'DATETIME': 'datetime64[ns]',
-             'TIME': 'object',
-             'UUID': 'object',
-         }
-
          dtypes = {}
          for column in mapper.columns:
-             dtype = sqlalchemy_to_dask_dtype.get(str(column.type).upper(), 'object')
+             dtype_str = str(column.type).upper().split("(")[0]
+             dtype = cls._SQLALCHEMY_TO_DASK_DTYPE.get(dtype_str, "object")
              dtypes[column.name] = dtype
-
          return dtypes

-     def read_frame(self, fillna_value=None):
+     def read_frame(self, fillna_value=None) -> dd.DataFrame:
          """
-         Load data from an SQLAlchemy query into a Dask DataFrame.
+         Builds and executes a query to load data into a Dask DataFrame.

-         :param fillna_value: Value to replace NaN or NULL values with, if any.
-         :return: Dask DataFrame.
+         This method works by first running a COUNT query to get the total
+         size, then creating a series of delayed tasks that each fetch a
+         chunk of data using LIMIT/OFFSET.
+
+         Args:
+             fillna_value: Value to replace NaN or NULL values with, if any.
+
+         Returns:
+             A lazy Dask DataFrame.
          """
-         with self.Session() as session:
+         # 1. Build the base query and apply filters
+         query = select(self.model)
+         if self.filters:
+             query = self.filter_handler_cls(
+                 backend="sqlalchemy", logger=self.logger, debug=self.debug
+             ).apply_filters(query, model=self.model, filters=self.filters)
+
+         self.logger.debug(f"Base query for pagination: {query}")
+
+         # 2. Get metadata for the Dask DataFrame structure
+         ordered_columns = [column.name for column in self.model.__table__.columns]
+         meta_dtypes = self.infer_meta_from_model(self.model)
+         meta_df = pd.DataFrame(columns=ordered_columns).astype(meta_dtypes)
+
+         # 3. Get the total record count to calculate the number of chunks
+         # try:
+         #     with self.engine.connect() as connection:
+         #         count_query = select(func.count()).select_from(query.alias())
+         #         total_records = connection.execute(count_query).scalar_one()
+         # except Exception as e:
+         #     self.logger.error(f"Failed to count records for pagination: {e}", exc_info=True)
+         #     return dd.from_pandas(meta_df, npartitions=1)
+         retry_attempts = 3
+         backoff_factor = 0.5  # start with a 0.5-second delay
+
+         for attempt in range(retry_attempts):
              try:
-                 # Build query
-                 self.query = select(self.model)
-                 if self.filters:
-                     self.query = FilterHandler(backend="sqlalchemy", logger=self.logger, debug=self.debug).apply_filters(self.query,
-                                                                                                                          model=self.model,
-                                                                                                                          filters=self.filters)
+                 with self.engine.connect() as connection:
+                     count_query = sa.select(sa.func.count()).select_from(query.alias())
+                     total_records = connection.execute(count_query).scalar_one()
+
+                 # If successful, break the loop
+                 break
+
+             except TimeoutError:
+                 if attempt < retry_attempts - 1:
+                     self.logger.warning(
+                         f"Connection pool limit reached. Retrying in {backoff_factor} seconds..."
+                     )
+                     time.sleep(backoff_factor)
+                     backoff_factor *= 2  # Double the backoff time for the next attempt
                  else:
-                     n_records = 100
-                     self.query = self.query.limit(n_records)
-                 self.logger.debug(f"query:{self.query}")
-                 # Infer dtypes
-                 dtypes = self.infer_dtypes_from_model(self.model)
-                 # Get the column order from the SQLAlchemy model
-                 ordered_columns = [column.name for column in self.model.__table__.columns]
-
-                 # Execute query and fetch results in chunks
-                 result_proxy = session.execute(self.query)
-                 results = result_proxy.scalars().all()  # Fetch all rows
-                 iterator = iter(results)
-
-                 partitions = []
-
-                 while True:
-                     chunk = list(itertools.islice(iterator, self.chunk_size))
-                     if not chunk:
-                         break
-
-                     # Convert chunk to Pandas DataFrame
-                     df = pd.DataFrame.from_records(
-                         [row._asdict() if hasattr(row, '_asdict') else row.__dict__ for row in chunk]
+                     self.logger.error(
+                         "Failed to get a connection from the pool after several retries.",
+                         exc_info=True
                      )
-                     # Drop internal SQLAlchemy state if it exists
-                     df = df.loc[:, ~df.columns.str.contains('_sa_instance_state')]
+                     return dd.from_pandas(meta_df, npartitions=1)
+
+             except Exception as e:
+                 self.logger.error(f"An unexpected error occurred: {e}", exc_info=True)
+                 return dd.from_pandas(meta_df, npartitions=1)

-                     # Reorder columns to match the model's order
-                     df = df[ordered_columns]
+         if total_records == 0:
+             self.logger.warning("Query returned 0 records.")
+             return dd.from_pandas(meta_df, npartitions=1)

-                     # Fill NaN values
-                     if fillna_value is not None:
-                         df = df.fillna(fillna_value)
+         self.logger.debug(f"Total records to fetch: {total_records}. Chunk size: {self.chunk_size}.")

-                     # Convert timezone-aware columns to naive
-                     for col in df.columns:
-                         if isinstance(df[col].dtype, pd.DatetimeTZDtype):
-                             df[col] = df[col].dt.tz_localize(None)
+         # 4. Create a list of Dask Delayed objects, one for each chunk
+         @dask.delayed
+         def get_chunk(sql_query, chunk_offset):
+             """A Dask-delayed function to fetch one chunk of data."""
+             # LIMIT/OFFSET must be applied in the delayed function
+             paginated_query = sql_query.limit(self.chunk_size).offset(chunk_offset)
+             df = pd.read_sql(paginated_query, self.engine)

-                     # Apply inferred dtypes
-                     df = df.astype(dtypes)
-                     # Create a Dask partition
-                     partitions.append(dd.from_pandas(df, npartitions=1))
+             if fillna_value is not None:
+                 df = df.fillna(fillna_value)

-                 # Concatenate all partitions
-                 if partitions:
-                     dask_df = dd.concat(partitions, axis=0, ignore_index=True)
-                 else:
-                     dask_df = dd.from_pandas(pd.DataFrame(columns=ordered_columns), npartitions=1)
+             # Ensure column order and types match the meta
+             return df[ordered_columns].astype(meta_dtypes)

-                 self.logger.debug(f"Loaded {len(dask_df)} rows into Dask DataFrame.")
+         offsets = range(0, total_records, self.chunk_size)
+         delayed_chunks = [get_chunk(query, offset) for offset in offsets]

-                 return dask_df
+         # 5. Construct the final lazy Dask DataFrame from the delayed chunks
+         ddf = dd.from_delayed(delayed_chunks, meta=meta_df)
+         self.logger.debug(f"Successfully created a lazy Dask DataFrame with {ddf.npartitions} partitions.")

-             except Exception as e:
-                 self.logger.error(f"Error executing query: {str(e)}")
-                 self.logger.error(self.query)
-                 return dd.from_pandas(pd.DataFrame(columns=ordered_columns), npartitions=1)
+         return ddf
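
The new `read_frame` is fully lazy: each Dask partition issues one LIMIT/OFFSET query only when computed, and `meta_df` gives Dask the schema up front without fetching rows. A usage sketch continuing the earlier examples (the model, engine, and chunk size are illustrative assumptions):

    loader = SQLAlchemyDask(
        model=Customer,        # hypothetical ORM model, e.g. config.model
        filters={},            # no filters in this sketch
        engine=config.engine,  # engine from an open SqlAlchemyConnectionConfig
        chunk_size=50_000,
    )
    ddf = loader.read_frame(fillna_value="")
    # No rows have been fetched yet; compute() triggers the chunked queries.
    df = ddf.compute()

Compared with 0.3.63, which created its own engine from a URL and eagerly materialized every partition in memory, this version accepts a shared Engine and defers all data movement to compute time.
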