sibi-dst 2025.1.3-py3-none-any.whl → 2025.1.5-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -4,12 +4,12 @@ import warnings
 from typing import Any, Dict, Optional, Union, TypeVar
 
 import dask.dataframe as dd
-import fsspec
 import pandas as pd
 from pydantic import BaseModel
 
 from sibi_dst.df_helper.core import QueryConfig, ParamsConfig, FilterHandler
-from sibi_dst.utils import Logger, ParquetSaver, ClickHouseWriter
+from sibi_dst.utils import ManagedResource
+from sibi_dst.utils import ParquetSaver, ClickHouseWriter
 from .backends.http import HttpConfig
 from .backends.parquet import ParquetConfig
 from .backends.sqlalchemy import SqlAlchemyConnectionConfig, SqlAlchemyLoadFromDb
@@ -27,18 +27,19 @@ class BaseBackend:
         self.helper = helper
         self.logger = helper.logger
         self.debug = helper.debug
+        self.total_records = helper.total_records # no records loaded yet
 
-    def load(self, **options) -> dd.DataFrame | pd.DataFrame:
+    def load(self, **options) -> tuple[Any, Any] | Union[dd.DataFrame | pd.DataFrame]:
         """Synchronous data loading method. Must be implemented by sync backends."""
         raise NotImplementedError(f"Backend '{self.__class__.__name__}' does not support synchronous loading.")
 
-    async def aload(self, **options) -> dd.DataFrame | pd.DataFrame:
+    async def aload(self, **options) -> tuple[Any, Any] | Union[dd.DataFrame | pd.DataFrame]:
         """Asynchronous data loading method. By default, it calls the sync version."""
         return self.load(**options)
 
 
 class SqlAlchemyBackend(BaseBackend):
-    def load(self, **options) -> dd.DataFrame:
+    def load(self, **options) -> tuple[Any, Any] | Union[dd.DataFrame | pd.DataFrame]:
         try:
             # Process incoming filter options into the ParamsConfig object
             if options and hasattr(self.helper._backend_params, 'parse_params'):
@@ -49,45 +50,51 @@ class SqlAlchemyBackend(BaseBackend):
                 plugin_query=self.helper._backend_query,
                 plugin_params=self.helper._backend_params,
                 logger=self.logger,
-                debug= self.debug
+                debug=self.debug
             )
-            return db_loader.build_and_load()
+            self.total_records, result = db_loader.build_and_load()
+            return self.total_records, result
         except Exception as e:
             self.logger.error(f"Failed to load data from sqlalchemy: {e}", exc_info=self.debug)
-            return dd.from_pandas(pd.DataFrame(), npartitions=1)
+            return -1, dd.from_pandas(pd.DataFrame(), npartitions=1)
 
 
 class ParquetBackend(BaseBackend):
     """This backend is also purely synchronous."""
 
-    def load(self, **options) -> dd.DataFrame | pd.DataFrame:
+    def load(self, **options) -> tuple[Any, Any] | Union[dd.DataFrame | pd.DataFrame]:
         try:
             df = self.helper.backend_parquet.load_files()
             if options and df is not None:
                 df = FilterHandler('dask', logger=self.logger, debug=False).apply_filters(df, filters=options)
-            return df
+            self.total_records = len(df)
+            return self.total_records, df
         except Exception as e:
+            self.total_records = -1 # Reset total_records on failure
            self.logger.error(f"Failed to load data from parquet: {e}", exc_info=True)
-            return dd.from_pandas(pd.DataFrame(), npartitions=1)
+            return -1, dd.from_pandas(pd.DataFrame(), npartitions=1)
 
 
 class HttpBackend(BaseBackend):
     """This backend is purely asynchronous."""
 
-    def load(self, **options) -> dd.DataFrame | pd.DataFrame:
+    def load(self, **options) -> tuple[Any, Any] | Union[dd.DataFrame | pd.DataFrame]:
         # This will correctly fail by raising NotImplementedError from the base class.
         return self.helper.backend_http.fetch_data(**options)
 
-    async def aload(self, **options) -> Union[pd.DataFrame, dd.DataFrame]:
+    async def aload(self, **options) -> tuple[Any, Any] | Union[pd.DataFrame, dd.DataFrame]:
         if not self.helper.backend_http:
             self.logger.warning("HTTP plugin not configured properly.")
-            return dd.from_pandas(pd.DataFrame(), npartitions=1)
-        return await self.helper.backend_http.fetch_data(**options)
+            self.total_records = -1
+            return self.total_records, dd.from_pandas(pd.DataFrame(), npartitions=1)
+        result = await self.helper.backend_http.fetch_data(**options)
+        self.total_records = len(result)
+        return self.total_records, result
 
 
 # --- Main DfHelper Facade Class ---
 
-class DfHelper:
+class DfHelper(ManagedResource):
     """
     A reusable utility for loading data. It provides both sync (`load`) and
     async (`aload`) methods to accommodate different backends.
@@ -103,13 +110,14 @@ class DfHelper:
     def __init__(self, backend='sqlalchemy', **kwargs):
         self.default_config = self.default_config or {}
         kwargs = {**self.default_config.copy(), **kwargs}
+        super().__init__(**kwargs)
         self.backend = backend
-        self.debug = kwargs.get("debug", False)
-        self.logger = kwargs.get("logger", Logger.default_logger(logger_name=self.__class__.__name__))
-        self.logger.set_level(Logger.DEBUG if self.debug else Logger.INFO)
-        self.fs = kwargs.get("fs", fsspec.filesystem('file'))
+
+        # Need to set default values for backend-specific configurations
+        kwargs.setdefault("debug", self.debug)
         kwargs.setdefault("fs", self.fs)
         kwargs.setdefault("logger", self.logger)
+        self.total_records = -1 # Initialize total_records to -1 to indicate no records loaded yet
         self._backend_query = self._get_config(QueryConfig, kwargs)
         self._backend_params = self._get_config(ParamsConfig, kwargs)
         self.backend_db_connection: Optional[SqlAlchemyConnectionConfig] = None
@@ -127,11 +135,9 @@ class DfHelper:
         if not strategy_class: raise ValueError(f"Unsupported backend: {self.backend}")
         self.backend_strategy = strategy_class(self)
 
-    def __enter__(self):
-        return self
-
     def __exit__(self, exc_type, exc_value, traceback):
         self._cleanup()
+        super().__exit__(exc_type, exc_value, traceback)
 
     def _cleanup(self):
         active_config = getattr(self, f"backend_{self.backend}", None)
@@ -147,14 +153,14 @@ class DfHelper:
     def load(self, as_pandas=False, **options) -> Union[pd.DataFrame, dd.DataFrame]:
         """Loads data synchronously. Fails if backend is async-only."""
         self.logger.debug(f"Loading data from {self.backend} backend with options: {options}")
-        df = self.backend_strategy.load(**options)
+        self.total_records, df = self.backend_strategy.load(**options)
         df = self._process_loaded_data(df)
         df = self._post_process_df(df)
         return df.compute() if as_pandas else df
 
     async def aload(self, as_pandas=False, **options) -> Union[pd.DataFrame, dd.DataFrame]:
         """Loads data asynchronously from any backend."""
-        df = await self.backend_strategy.aload(**options)
+        self.total_records, df = await self.backend_strategy.aload(**options)
         df = self._process_loaded_data(df)
         df = self._post_process_df(df)
         return df.compute() if as_pandas else df
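Note on the hunk above: every backend's load()/aload() now returns a (total_records, DataFrame) pair, which DfHelper unpacks into self.total_records while still handing only the DataFrame back to callers. A minimal usage sketch, assuming ManagedResource supplies the context-manager __enter__ that DfHelper itself no longer defines; real connection kwargs are omitted and the 'status' filter key is purely illustrative:

from sibi_dst.df_helper import DfHelper

with DfHelper(backend='sqlalchemy', debug=True) as helper:  # connection kwargs omitted (assumed)
    df = helper.load(as_pandas=False, status='active')      # 'status' is a made-up filter key
    print(helper.total_records)                              # row count, or -1 if the backend failed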
@@ -195,15 +201,27 @@ class DfHelper:
             return
         fs = kwargs.pop('fs', self.fs)
         path = kwargs.pop('parquet_storage_path', self.backend_parquet.parquet_storage_path)
-        ParquetSaver(df, path, self.logger, fs).save_to_parquet(parquet_filename)
-        self.logger.debug(f"Parquet saved to {parquet_filename} in path: {path}.")
+        writer_config = {
+            'df_result': df,
+            'parquet_storage_path': path,
+            'fs': fs,
+            'debug': self.debug,
+            'logger': self.logger,
+            'verbose': self.verbose,
+        }
+        with ParquetSaver(**writer_config) as saver:
+            saver.save_to_parquet(parquet_filename)
+
+        self.logger.debug(f"Parquet saved to {parquet_filename} in {path}.")
 
     def save_to_clickhouse(self, df: dd.DataFrame, **credentials):
         if hasattr(df, 'npartitions') and df.npartitions == 1 and not len(df.head(1)):
             self.logger.warning("Cannot write to ClickHouse; DataFrame is empty.")
             return
-        ClickHouseWriter(self.logger, **credentials).save_to_clickhouse(df)
-        self.logger.debug("Save to ClickHouse completed.")
+
+        with ClickHouseWriter(debug=self.debug, logger=self.logger, verbose=self.verbose, **credentials) as writer:
+            writer.save_to_clickhouse(df)
+            self.logger.debug("Save to ClickHouse completed.")
 
     def load_period(self, dt_field: str, start: str, end: str, **kwargs) -> Union[pd.DataFrame, dd.DataFrame]:
         """Synchronous convenience method for loading a date range."""
@@ -229,4 +247,3 @@ class DfHelper:
             kwargs[f"{mapped_field}__date__range"] = [start_date, end_date]
         self.logger.debug(f"Period load generated filters: {kwargs}")
         return kwargs
-
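Note on the save_to_parquet / save_to_clickhouse hunk above: both writers are now driven as context managers and receive debug/logger/verbose from the helper. A hedged sketch of the caller side, assuming the existing signatures save_to_parquet(df, parquet_filename=...) and save_to_clickhouse(df, **credentials); the ClickHouse credential names below are illustrative, not taken from this diff:

df = helper.load(status='active')                                    # continuing the sketch above
helper.save_to_parquet(df, parquet_filename='active_records.parquet')
helper.save_to_clickhouse(df, host='localhost', port=8123, database='analytics')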
@@ -1,3 +1,4 @@
+from __future__ import annotations
 import datetime
 import logging
 import threading
@@ -7,7 +8,7 @@ import dask.dataframe as dd
 import fsspec
 
 from sibi_dst.df_helper import DfHelper
-from sibi_dst.utils import DataWrapper, DateUtils, Logger, ParquetSaver, UpdatePlanner
+from sibi_dst.utils import DataWrapper, DateUtils, UpdatePlanner
 from sibi_dst.utils import MissingManifestManager
 
 
@@ -58,14 +59,6 @@ class ParquetArtifact(DfHelper):
         'backend': 'parquet'
     }
 
-    # DEFAULT_UPDATE_PLANNER_CONFIG = {
-    #     'reverse_order': True,
-    #     'overwrite': False,
-    #     'ignore_missing': True,
-    #     'history_days_threshold': 30,
-    #     'max_age_minutes': 10,
-    #     'show_progress': False
-    # }
 
     def __init__(self, data_wrapper_class, **kwargs):
         """
@@ -97,7 +90,9 @@ class ParquetArtifact(DfHelper):
             **kwargs,
         }
         self.df: Optional[dd.DataFrame] = None
-        self._setup_logging()
+        super().__init__(**self.config)
+        #self._own_logger = False
+        #self._setup_logging()
         self.data_wrapper_class = data_wrapper_class
 
         self.date_field = self._validate_required('date_field')
@@ -107,35 +102,37 @@ class ParquetArtifact(DfHelper):
         self.parquet_end_date = self._validate_required('parquet_end_date')
 
         # Filesystem setup
-        self.filesystem_type = self.config.setdefault('filesystem_type', 'file')
-        self.filesystem_options = self.config.setdefault('filesystem_options', {})
-        self.fs = self.config.setdefault('fs', None)
-        self._own_fs = self.fs is None
-        if self.fs is None:
-            self.fs = fsspec.filesystem(self.filesystem_type, **self.filesystem_options)
-            self._own_fs = True
-        self.config.setdefault('fs', self.fs)
+        #self.filesystem_type = self.config.setdefault('filesystem_type', 'file')
+        #self.filesystem_options = self.config.setdefault('filesystem_options', {})
+        #self.fs = self.config.setdefault('fs', None)
+        #self._own_fs = self.fs is None
+        #if self.fs is None:
+        #    self.fs = fsspec.filesystem(self.filesystem_type, **self.filesystem_options)
+        #    self._own_fs = True
+        #self.config.setdefault('fs', self.fs)
         ## Populate to parameters to pass to data_wrapper_class
         self.class_params = self.config.pop('class_params', {
             'debug': self.debug,
             'logger': self.logger,
             'fs': self.fs,
+            'verbose': self.verbose,
         })
         # Populate parameters to pass to load method of DataWrapper class
         self.load_params = self.config.setdefault('load_params', {})
         # Ensure the directory exists
         self.ensure_directory_exists(self.parquet_storage_path)
-        super().__init__(**self.config)
+        #super().__init__(**self.config)
         self.update_planner_params = {}
         self.datawrapper_params = {}
 
-    def _setup_logging(self):
-        """Initialize logger and debug settings."""
-        self.debug = self.config.get('debug', False)
-        self.logger = self.config.get('logger',
-                                      Logger.default_logger(
-                                          logger_name=f'Parquet_Artifact_InstanceOf_{self.__class__.__name__}'))
-        self.logger.set_level(Logger.DEBUG if self.debug else Logger.INFO)
+    #def _setup_logging(self):
+    #    """Initialize logger and debug settings."""
+    #    self.debug = self.config.get('debug', False)
+    #    logger = self.config.get('logger', None)
+    #    self._own_logger = logger is None
+    #    self.logger = logger or Logger.default_logger(
+    #        logger_name=f'Parquet_Artifact_InstanceOf_{self.__class__.__name__}')
+    #    self.logger.set_level(Logger.DEBUG if self.debug else Logger.INFO)
 
     def _validate_required(self, key: str) -> Any:
         """Validate required configuration fields."""
@@ -150,7 +147,9 @@ class ParquetArtifact(DfHelper):
         self.mmanifest = MissingManifestManager(
             fs=self.fs,
             manifest_path=self.missing_manifest_path,
-            clear_existing=overwrite
+            clear_existing=overwrite,
+            debug= self.debug,
+            logger=self.logger
         )
 
         # Initialize skipped files
@@ -158,7 +157,7 @@ class ParquetArtifact(DfHelper):
         if not manifest_exists:
             self.logger.info(f"Creating new manifest at {self.missing_manifest_path}")
             self.mmanifest.save()
-            self.mmanifest.cleanup_temp_manifests()
+            #self.mmanifest.cleanup_temp_manifests()
         else:
             self.logger.info(f"Manifest already exists at {self.missing_manifest_path}")
 
@@ -172,7 +171,7 @@ class ParquetArtifact(DfHelper):
     def _setup_update_planner(self, **kwargs) -> None:
         self._prepare_update_params(**kwargs)
         self.update_planner = UpdatePlanner(**self.update_planner_params)
-        self.update_planner.generate_plan(self.start_date, self.end_date)
+        self.update_planner.generate_plan(start=self.start_date,end= self.end_date)
 
     def load(self, **kwargs):
         with self._lock:
@@ -197,36 +196,45 @@ class ParquetArtifact(DfHelper):
         with DataWrapper(self.data_wrapper_class, **params) as dw:
             dw.process()
 
-    def __enter__(self):
-        if getattr(self, "_entered", False):
-            return self
-        self._entered = True
-        return self
-
     def __exit__(self, exc_type, exc_value, traceback):
         try:
             if self.mmanifest and self.mmanifest._new_records:
                 self.mmanifest.save()
-                self.mmanifest.cleanup_temp_manifests()
-            if getattr(self, "_entered", False) and self.fs and self._own_fs:
-                self.fs.close()
         except Exception as e:
             self.logger.warning(f"Error closing filesystem: {e}")
         finally:
-            self._entered = False
+            super().__exit__(exc_type, exc_value, traceback)
         # return False so exceptions aren’t suppressed
         return False
 
-    @classmethod
-    def get_size_estimate(cls, parquet_path: str, **kwargs) -> int:
+    def get_size_estimate(self, **kwargs) -> int:
         """
-        Estimate complexity as total bytes of all .parquet files under parquet_path.
-        Returns size in megabytes (so you can cap or scale priority sensibly).
+        Synchronously estimates artifact size for use in multi-threaded environments.
+
+        This method uses the filesystem's own .sync() method to safely execute
+        asynchronous I/O operations from a synchronous context, preventing
+        event loop conflicts.
         """
-        fs, _, paths = cls.fs.get_fs_token_paths(parquet_path)
-        files = fs.glob(f"{parquet_path}/*.parquet")
-        total_bytes = sum(fs.size(f) for f in files)
-        # convert to “units” (e.g. MB) so priorities stay in a reasonable range
+
+        async def _get_total_bytes_async():
+            """A helper async coroutine to perform the I/O."""
+            import asyncio
+
+            # Use the async versions of fsspec methods (e.g., _glob, _size)
+            files = await self.fs._glob(f"{self.parquet_storage_path}/*.parquet")
+            if not files:
+                return 0
+
+            # Concurrently gather the size of all files for performance
+            size_tasks = [self.fs._size(f) for f in files]
+            sizes = await asyncio.gather(*size_tasks)
+            return sum(s for s in sizes if s is not None)
+
+        # Use the filesystem's own built-in sync method. This is the most
+        # reliable way to bridge the sync/async gap for fsspec.
+        total_bytes = self.fs.sync(_get_total_bytes_async())
+
+        # Convert to megabytes, ensuring a minimum of 1
         return max(1, int(total_bytes / (1024 ** 2)))
 
     def update_parquet(self, period: str = 'today', **kwargs) -> None:
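Note on get_size_estimate above: the new implementation assumes the fsspec filesystem exposes async counterparts (_glob, _size) plus a sync bridge, which holds for AsyncFileSystem implementations such as s3fs but not for the plain local filesystem. For reference, a hedged sketch of the same computation using fsspec's module-level bridge helper fsspec.asyn.sync; the "s3" protocol and "bucket/prefix" path are illustrative and require s3fs to be installed:

import fsspec
from fsspec.asyn import sync  # runs a coroutine function on the filesystem's own event loop

async def _total_bytes(fs, prefix):
    files = await fs._glob(f"{prefix}/*.parquet")   # async counterpart of fs.glob
    sizes = [await fs._size(f) for f in files]       # async counterpart of fs.size
    return sum(s for s in sizes if s)

fs = fsspec.filesystem("s3")                          # any AsyncFileSystem-backed protocol
size_mb = max(1, int(sync(fs.loop, _total_bytes, fs, "bucket/prefix") / (1024 ** 2)))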
@@ -316,6 +324,8 @@ class ParquetArtifact(DfHelper):
             'parquet_filename': self.parquet_filename,
             'data_path': self.parquet_storage_path,
             'fs': self.fs,
+            'debug': self.debug,
+            'logger': self.logger,
             'class_params': self.class_params,
             'date_field': self.date_field,
             'load_params': self.load_params,
@@ -54,9 +54,9 @@ class ParquetReader(DfHelper):
             **kwargs,
         }
         self.df: Optional[dd.DataFrame] = None
-        self.debug = self.config.setdefault('debug', False)
-        self.logger = self.config.setdefault('logger', Logger.default_logger(logger_name=self.__class__.__name__))
-        self.logger.set_level(logging.DEBUG if self.debug else logging.INFO)
+        #self.debug = self.config.setdefault('debug', False)
+        #self.logger = self.config.setdefault('logger', Logger.default_logger(logger_name=self.__class__.__name__))
+        #self.logger.set_level(logging.DEBUG if self.debug else logging.INFO)
         self.parquet_storage_path = self.config.setdefault('parquet_storage_path', None)
         if self.parquet_storage_path is None:
             raise ValueError('parquet_storage_path must be set')
@@ -69,12 +69,12 @@ class ParquetReader(DfHelper):
             raise ValueError('parquet_end_date must be set')
 
         # Filesystem setup
-        self.filesystem_type = filesystem_type
-        self.filesystem_options = filesystem_options or {}
-        self.fs = self.config.setdefault('fs', None)
-        if self.fs is None:
-            self.fs = fsspec.filesystem(self.filesystem_type, **self.filesystem_options)
-        self.config.setdefault('fs', self.fs)
+        #self.filesystem_type = filesystem_type
+        #self.filesystem_options = filesystem_options or {}
+        #self.fs = self.config.setdefault('fs', None)
+        #if self.fs is None:
+        #    self.fs = fsspec.filesystem(self.filesystem_type, **self.filesystem_options)
+        #self.config.setdefault('fs', self.fs)
 
         if not self.directory_exists():
             raise ValueError(f"{self.parquet_storage_path} does not exist")
@@ -92,7 +92,3 @@ class ParquetReader(DfHelper):
         except FileNotFoundError:
             return False
 
-    def __exit__(self, exc_type, exc_value, traceback):
-        # Ensure resources are cleaned up
-        if self.fs:
-            self.fs.close()
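Note: ParquetReader no longer closes the filesystem in its own __exit__; teardown is presumably delegated to the shared ManagedResource/DfHelper base. A hedged sketch of typical use, with a hypothetical storage path, assuming the config keys shown in these hunks and an assumed import path:

from sibi_dst.df_helper import ParquetReader  # import path assumed

with ParquetReader(
    parquet_storage_path='s3://bucket/artifacts/orders',  # hypothetical location
    parquet_start_date='2025-01-01',
    parquet_end_date='2025-01-31',
) as reader:
    df = reader.load()  # inherited from DfHelper; returns a Dask DataFrame by default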
@@ -1,5 +1,5 @@
 from __future__ import annotations
-
+import os
 import threading
 from contextlib import contextmanager
 from typing import Any, Optional, ClassVar, Generator, Type, Dict
@@ -49,10 +49,10 @@ class SqlAlchemyConnectionConfig(BaseModel):
     debug: bool = False
 
     # --- Pool Configuration ---
-    pool_size: int = 5
-    max_overflow: int = 10
-    pool_timeout: int = 30
-    pool_recycle: int = 1800
+    pool_size: int = int(os.environ.get("DB_POOL_SIZE", 5))
+    max_overflow: int = int(os.environ.get("DB_MAX_OVERFLOW",10))
+    pool_timeout: int = int(os.environ.get("DB_POOL_TIMEOUT", 30))
+    pool_recycle: int = int(os.environ.get("DB_POOL_RECYCLE", 1800))
     pool_pre_ping: bool = True
     poolclass: Type[QueuePool] = QueuePool
 
@@ -60,6 +60,7 @@ class SqlAlchemyConnectionConfig(BaseModel):
     model: Optional[Type[Any]] = None
     engine: Optional[Engine] = None
     logger: Optional[Logger] = None
+    _own_logger: bool = False # Indicates if this instance owns the logger.
     session_factory: Optional[sessionmaker] = None
 
     # --- Private State ---
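Note on the pool settings two hunks above: the limits now come from DB_POOL_SIZE, DB_MAX_OVERFLOW, DB_POOL_TIMEOUT and DB_POOL_RECYCLE, with the previous values as fallbacks. Because pydantic evaluates these defaults when the class body is defined, the variables must be set before the module is imported; a minimal sketch with illustrative values:

import os

# Must run before sibi_dst imports SqlAlchemyConnectionConfig, since the
# defaults are read at class-definition (import) time.
os.environ.setdefault("DB_POOL_SIZE", "20")
os.environ.setdefault("DB_MAX_OVERFLOW", "30")
os.environ.setdefault("DB_POOL_TIMEOUT", "60")
os.environ.setdefault("DB_POOL_RECYCLE", "900")

from sibi_dst.df_helper import DfHelper  # noqa: E402  (import only after the env is prepared)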
@@ -102,10 +103,13 @@ class SqlAlchemyConnectionConfig(BaseModel):
 
     def _init_logger(self) -> None:
         """Initializes the logger for this instance."""
+        # This is not a ManagedResource subclass, so we handle logger initialization directly.
+        # unless a logger is provided, we create our own.
         if self.logger is None:
+            self._own_logger = True
             self.logger = Logger.default_logger(logger_name=self.__class__.__name__)
-        log_level = Logger.DEBUG if self.debug else Logger.INFO
-        self.logger.set_level(log_level)
+            log_level = Logger.DEBUG if self.debug else Logger.INFO
+            self.logger.set_level(log_level)
 
     def _get_engine_key(self) -> tuple:
         """Generates a unique, normalized key for an engine configuration."""
@@ -146,6 +150,8 @@ class SqlAlchemyConnectionConfig(BaseModel):
             self.logger.error(f"Failed to create engine: {e}")
             raise SQLAlchemyError(f"Engine creation failed: {e}") from e
 
+        #self.logger.debug(f"Connections Active: {self.active_connections}")
+
     def close(self) -> None:
         """
         Decrements the engine's reference count and disposes of the engine
@@ -176,8 +182,6 @@ class SqlAlchemyConnectionConfig(BaseModel):
         # Mark this instance as closed to prevent subsequent calls.
         self._closed = True
 
-    # ... (the rest of your methods like _attach_events, _on_checkout, get_session, etc. remain unchanged)
-    # They are omitted here for brevity but should be included in your final file.
 
     def _attach_events(self) -> None:
         """Attaches checkout/checkin events to the engine for connection tracking."""
@@ -191,7 +195,7 @@ class SqlAlchemyConnectionConfig(BaseModel):
         wrapper = self._engine_registry.get(self._engine_key_instance)
         if wrapper:
             wrapper['active_connections'] += 1
-        self.logger.debug(f"Connection checked out. Active: {self.active_connections}")
+        # self.logger.debug(f"Connection checked out. Active: {self.active_connections}")
 
     def _on_checkin(self, *args) -> None:
         """Event listener for when a connection is returned to the pool."""
@@ -199,7 +203,7 @@ class SqlAlchemyConnectionConfig(BaseModel):
         wrapper = self._engine_registry.get(self._engine_key_instance)
         if wrapper:
             wrapper['active_connections'] = max(0, wrapper['active_connections'] - 1)
-        self.logger.debug(f"Connection checked in. Active: {self.active_connections}")
+        # self.logger.debug(f"Connection checked in. Active: {self.active_connections}")
 
     @property
     def active_connections(self) -> int:
@@ -1,5 +1,6 @@
+from __future__ import annotations
 
-from typing import Type
+from typing import Type, Any
 
 import dask
 import dask.dataframe as dd
@@ -13,11 +14,12 @@ from sqlalchemy.orm import declarative_base
 import time
 from sqlalchemy.exc import TimeoutError as SASQLTimeoutError, OperationalError
 import sqlalchemy as sa
+
+from sibi_dst.utils import ManagedResource
 from sibi_dst.df_helper.core import FilterHandler
-from sibi_dst.utils import Logger
 
 
-class SQLAlchemyDask:
+class SQLAlchemyDask(ManagedResource):
     """
     Loads data from a database into a Dask DataFrame using a memory-safe,
     non-parallel, paginated approach.
@@ -46,8 +48,7 @@ class SQLAlchemyDask:
         filters: dict,
         engine: Engine,
         chunk_size: int = 1000,
-        logger=None,
-        debug: bool = False,
+        **kwargs
     ):
         """
         Initializes the data loader.
@@ -60,14 +61,13 @@ class SQLAlchemyDask:
             logger: A logger instance.
             debug: Whether to enable detailed logging.
         """
+        super().__init__(**kwargs)
         self.model = model
         self.filters = filters
         self.engine = engine
         self.chunk_size = chunk_size
-        self.debug = debug
-        self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
-        self.logger.set_level(Logger.DEBUG if debug else Logger.INFO)
         self.filter_handler_cls = FilterHandler
+        self.total_records = -1 # Initialize to -1 to indicate uncounted
 
     @classmethod
     def infer_meta_from_model(cls, model: Type[declarative_base()]) -> dict:
@@ -83,7 +83,7 @@ class SQLAlchemyDask:
             dtypes[column.name] = dtype
         return dtypes
 
-    def read_frame(self, fillna_value=None) -> dd.DataFrame:
+    def read_frame(self, fillna_value=None) -> tuple[int | Any, Any] | Any:
         """
         Builds and executes a query to load data into a Dask DataFrame.
 
@@ -105,7 +105,8 @@ class SQLAlchemyDask:
             ).apply_filters(query, model=self.model, filters=self.filters)
         else:
             query = query.limit(self.chunk_size)
-        self.logger.debug(f"Base query for pagination: {query}")
+        if self.verbose:
+            self.logger.debug(f"Base query for pagination: {query}")
 
         # 2. Get metadata for the Dask DataFrame structure
         ordered_columns = [column.name for column in self.model.__table__.columns]
@@ -116,6 +117,7 @@ class SQLAlchemyDask:
 
         retry_attempts = 3
         backoff_factor = 0.5 # start with a 0.5-second delay
+        total_records = 0
 
         for attempt in range(retry_attempts):
             try:
@@ -134,11 +136,12 @@ class SQLAlchemyDask:
                     time.sleep(backoff_factor)
                     backoff_factor *= 2 # Double the backoff time for the next attempt
                 else:
+                    self.total_records = -1 # Indicate failure to count records
                     self.logger.error(
                         "Failed to get a connection from the pool after several retries.",
                         exc_info=True
                     )
-                    return dd.from_pandas(meta_df, npartitions=1)
+                    return self.total_records, dd.from_pandas(meta_df, npartitions=1)
             except OperationalError as oe:
                 # sometimes the DB driver wraps timeouts in OperationalError
                 if "timeout" in str(oe).lower():
@@ -147,15 +150,18 @@ class SQLAlchemyDask:
                     backoff_factor *= 2
                     continue
                 else:
+                    self.total_records = -1 # Indicate failure to count records
                     self.logger.error("OperationalError", exc_info=True)
-                    return dd.from_pandas(meta_df, npartitions=1)
+                    return self.total_records, dd.from_pandas(meta_df, npartitions=1)
             except Exception as e:
+                self.total_records = -1 # Indicate failure to count records
                 self.logger.error(f"An unexpected error occurred: {e}", exc_info=True)
-                return dd.from_pandas(meta_df, npartitions=1)
+                return self.total_records, dd.from_pandas(meta_df, npartitions=1)
 
+        self.total_records = total_records
         if total_records == 0:
             self.logger.warning("Query returned 0 records.")
-            return dd.from_pandas(meta_df, npartitions=1)
+            return self.total_records, dd.from_pandas(meta_df, npartitions=1)
 
         self.logger.debug(f"Total records to fetch: {total_records}. Chunk size: {self.chunk_size}.")
 
@@ -179,8 +185,9 @@ class SQLAlchemyDask:
         # 5. Construct the final lazy Dask DataFrame from the delayed chunks
         ddf = dd.from_delayed(delayed_chunks, meta=meta_df)
         self.logger.debug(f"Successfully created a lazy Dask DataFrame with {ddf.npartitions} partitions.")
-
-        return ddf
+        if not self._entered:
+            super().cleanup()
+        return self.total_records, ddf
 
     ## Dask-Only Solution to test in better hardware
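Note: read_frame() now reports the record count alongside the lazy frame, and the constructor forwards **kwargs (debug, logger, verbose, ...) to ManagedResource. A hedged sketch of direct use with a stand-in model and a placeholder database URL; the filter spec is illustrative and the import of SQLAlchemyDask is omitted because its module path is not shown in this diff:

from sqlalchemy import Column, Integer, String, create_engine
from sqlalchemy.orm import declarative_base

Base = declarative_base()

class Order(Base):                       # stand-in model for this sketch
    __tablename__ = "orders"
    id = Column(Integer, primary_key=True)
    status = Column(String(20))

engine = create_engine("sqlite:///example.db")        # placeholder URL
loader = SQLAlchemyDask(                              # import omitted; module path not shown here
    model=Order,
    filters={"status": "active"},                     # illustrative filter spec
    engine=engine,
    chunk_size=1000,
    debug=True,                                       # forwarded to ManagedResource via **kwargs
)
total, ddf = loader.read_frame()
print(total, ddf.npartitions)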