sibi-dst 2025.1.4__py3-none-any.whl → 2025.1.6__py3-none-any.whl

This diff compares two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between the package versions exactly as they appear in the public registry.
@@ -4,12 +4,12 @@ import warnings
 from typing import Any, Dict, Optional, Union, TypeVar

 import dask.dataframe as dd
-import fsspec
 import pandas as pd
 from pydantic import BaseModel

 from sibi_dst.df_helper.core import QueryConfig, ParamsConfig, FilterHandler
-from sibi_dst.utils import Logger, ParquetSaver, ClickHouseWriter
+from sibi_dst.utils import ManagedResource
+from sibi_dst.utils import ParquetSaver, ClickHouseWriter
 from .backends.http import HttpConfig
 from .backends.parquet import ParquetConfig
 from .backends.sqlalchemy import SqlAlchemyConnectionConfig, SqlAlchemyLoadFromDb
@@ -27,18 +27,19 @@ class BaseBackend:
         self.helper = helper
         self.logger = helper.logger
         self.debug = helper.debug
+        self.total_records = helper.total_records # no records loaded yet

-    def load(self, **options) -> dd.DataFrame | pd.DataFrame:
+    def load(self, **options) -> tuple[Any, Any] | Union[dd.DataFrame | pd.DataFrame]:
         """Synchronous data loading method. Must be implemented by sync backends."""
         raise NotImplementedError(f"Backend '{self.__class__.__name__}' does not support synchronous loading.")

-    async def aload(self, **options) -> dd.DataFrame | pd.DataFrame:
+    async def aload(self, **options) -> tuple[Any, Any] | Union[dd.DataFrame | pd.DataFrame]:
         """Asynchronous data loading method. By default, it calls the sync version."""
         return self.load(**options)


 class SqlAlchemyBackend(BaseBackend):
-    def load(self, **options) -> dd.DataFrame:
+    def load(self, **options) -> tuple[Any, Any] | Union[dd.DataFrame | pd.DataFrame]:
         try:
             # Process incoming filter options into the ParamsConfig object
             if options and hasattr(self.helper._backend_params, 'parse_params'):
@@ -49,45 +50,51 @@ class SqlAlchemyBackend(BaseBackend):
                 plugin_query=self.helper._backend_query,
                 plugin_params=self.helper._backend_params,
                 logger=self.logger,
-                debug= self.debug
+                debug=self.debug
             )
-            return db_loader.build_and_load()
+            self.total_records, result = db_loader.build_and_load()
+            return self.total_records, result
         except Exception as e:
             self.logger.error(f"Failed to load data from sqlalchemy: {e}", exc_info=self.debug)
-            return dd.from_pandas(pd.DataFrame(), npartitions=1)
+            return -1, dd.from_pandas(pd.DataFrame(), npartitions=1)


 class ParquetBackend(BaseBackend):
     """This backend is also purely synchronous."""

-    def load(self, **options) -> dd.DataFrame | pd.DataFrame:
+    def load(self, **options) -> tuple[Any, Any] | Union[dd.DataFrame | pd.DataFrame]:
         try:
             df = self.helper.backend_parquet.load_files()
             if options and df is not None:
                 df = FilterHandler('dask', logger=self.logger, debug=False).apply_filters(df, filters=options)
-            return df
+            self.total_records = len(df)
+            return self.total_records, df
         except Exception as e:
+            self.total_records = -1 # Reset total_records on failure
             self.logger.error(f"Failed to load data from parquet: {e}", exc_info=True)
-            return dd.from_pandas(pd.DataFrame(), npartitions=1)
+            return -1, dd.from_pandas(pd.DataFrame(), npartitions=1)


 class HttpBackend(BaseBackend):
     """This backend is purely asynchronous."""

-    def load(self, **options) -> dd.DataFrame | pd.DataFrame:
+    def load(self, **options) -> tuple[Any, Any] | Union[dd.DataFrame | pd.DataFrame]:
         # This will correctly fail by raising NotImplementedError from the base class.
         return self.helper.backend_http.fetch_data(**options)

-    async def aload(self, **options) -> Union[pd.DataFrame, dd.DataFrame]:
+    async def aload(self, **options) -> tuple[Any, Any] | Union[pd.DataFrame, dd.DataFrame]:
         if not self.helper.backend_http:
             self.logger.warning("HTTP plugin not configured properly.")
-            return dd.from_pandas(pd.DataFrame(), npartitions=1)
-        return await self.helper.backend_http.fetch_data(**options)
+            self.total_records = -1
+            return self.total_records, dd.from_pandas(pd.DataFrame(), npartitions=1)
+        result = await self.helper.backend_http.fetch_data(**options)
+        self.total_records = len(result)
+        return self.total_records, result


 # --- Main DfHelper Facade Class ---

-class DfHelper:
+class DfHelper(ManagedResource):
     """
     A reusable utility for loading data. It provides both sync (`load`) and
     async (`aload`) methods to accommodate different backends.
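Every backend's `load`/`aload` now returns a `(total_records, dataframe)` pair, with `-1` signalling a failed or uncounted load, and the count is mirrored on `self.total_records`. A minimal sketch of a hypothetical extra backend following that contract (the class and the "records" option are illustrative, not part of the package):

```python
import dask.dataframe as dd
import pandas as pd


class InMemoryBackend(BaseBackend):
    """Hypothetical backend used only to illustrate the (total_records, df) contract."""

    def load(self, **options):
        try:
            # "records" is an illustrative option name, not a sibi-dst API.
            pdf = pd.DataFrame(options.get("records", []))
            self.total_records = len(pdf)
            return self.total_records, dd.from_pandas(pdf, npartitions=1)
        except Exception as e:
            self.logger.error(f"Failed to load in-memory data: {e}", exc_info=self.debug)
            return -1, dd.from_pandas(pd.DataFrame(), npartitions=1)
```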
@@ -103,13 +110,14 @@ class DfHelper:
     def __init__(self, backend='sqlalchemy', **kwargs):
         self.default_config = self.default_config or {}
         kwargs = {**self.default_config.copy(), **kwargs}
+        super().__init__(**kwargs)
         self.backend = backend
-        self.debug = kwargs.get("debug", False)
-        self.logger = kwargs.get("logger", Logger.default_logger(logger_name=self.__class__.__name__))
-        self.logger.set_level(Logger.DEBUG if self.debug else Logger.INFO)
-        self.fs = kwargs.get("fs", fsspec.filesystem('file'))
+
+        # Need to set default values for backend-specific configurations
+        kwargs.setdefault("debug", self.debug)
         kwargs.setdefault("fs", self.fs)
         kwargs.setdefault("logger", self.logger)
+        self.total_records = -1 # Initialize total_records to -1 to indicate no records loaded yet
         self._backend_query = self._get_config(QueryConfig, kwargs)
         self._backend_params = self._get_config(ParamsConfig, kwargs)
         self.backend_db_connection: Optional[SqlAlchemyConnectionConfig] = None
@@ -127,11 +135,9 @@ class DfHelper:
         if not strategy_class: raise ValueError(f"Unsupported backend: {self.backend}")
         self.backend_strategy = strategy_class(self)

-    def __enter__(self):
-        return self
-
     def __exit__(self, exc_type, exc_value, traceback):
         self._cleanup()
+        super().__exit__(exc_type, exc_value, traceback)

     def _cleanup(self):
         active_config = getattr(self, f"backend_{self.backend}", None)
@@ -147,14 +153,14 @@ class DfHelper:
     def load(self, as_pandas=False, **options) -> Union[pd.DataFrame, dd.DataFrame]:
         """Loads data synchronously. Fails if backend is async-only."""
         self.logger.debug(f"Loading data from {self.backend} backend with options: {options}")
-        df = self.backend_strategy.load(**options)
+        self.total_records, df = self.backend_strategy.load(**options)
         df = self._process_loaded_data(df)
         df = self._post_process_df(df)
         return df.compute() if as_pandas else df

     async def aload(self, as_pandas=False, **options) -> Union[pd.DataFrame, dd.DataFrame]:
         """Loads data asynchronously from any backend."""
-        df = await self.backend_strategy.aload(**options)
+        self.total_records, df = await self.backend_strategy.aload(**options)
         df = self._process_loaded_data(df)
         df = self._post_process_df(df)
         return df.compute() if as_pandas else df
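With `DfHelper` now inheriting from `ManagedResource` and recording `total_records` on every load, a call site might look like the sketch below; the connection URL and filter key are placeholders, and the exact kwargs depend on the backend configuration classes:

```python
from sibi_dst.df_helper import DfHelper

# Placeholder configuration; the real kwargs depend on the configured backend.
with DfHelper(backend="sqlalchemy", connection_url="sqlite:///example.db", debug=True) as helper:
    df = helper.load(as_pandas=True, status="active")  # "status" is a placeholder filter key
    print(helper.total_records)  # -1 when the backend load failed or was uncounted
```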
@@ -195,15 +201,27 @@ class DfHelper:
             return
         fs = kwargs.pop('fs', self.fs)
         path = kwargs.pop('parquet_storage_path', self.backend_parquet.parquet_storage_path)
-        ParquetSaver(df, path, self.logger, fs).save_to_parquet(parquet_filename)
-        self.logger.debug(f"Parquet saved to {parquet_filename} in path: {path}.")
+        writer_config = {
+            'df_result': df,
+            'parquet_storage_path': path,
+            'fs': fs,
+            'debug': self.debug,
+            'logger': self.logger,
+            'verbose': self.verbose,
+        }
+        with ParquetSaver(**writer_config) as saver:
+            saver.save_to_parquet(parquet_filename)
+
+        self.logger.debug(f"Parquet saved to {parquet_filename} in {path}.")

     def save_to_clickhouse(self, df: dd.DataFrame, **credentials):
         if hasattr(df, 'npartitions') and df.npartitions == 1 and not len(df.head(1)):
             self.logger.warning("Cannot write to ClickHouse; DataFrame is empty.")
             return
-        ClickHouseWriter(self.logger, **credentials).save_to_clickhouse(df)
-        self.logger.debug("Save to ClickHouse completed.")
+
+        with ClickHouseWriter(debug=self.debug, logger=self.logger, verbose=self.verbose, **credentials) as writer:
+            writer.save_to_clickhouse(df)
+            self.logger.debug("Save to ClickHouse completed.")

     def load_period(self, dt_field: str, start: str, end: str, **kwargs) -> Union[pd.DataFrame, dd.DataFrame]:
         """Synchronous convenience method for loading a date range."""
@@ -229,4 +247,3 @@ class DfHelper:
         kwargs[f"{mapped_field}__date__range"] = [start_date, end_date]
         self.logger.debug(f"Period load generated filters: {kwargs}")
         return kwargs
-
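For reference, `load_period` simply turns a date range into the filter kwargs shown above before delegating to `load`. A hedged sketch; the field name is illustrative and the exact keys depend on the configured field map:

```python
# Hypothetical call; "created_at" stands in for a real model field.
df = helper.load_period(dt_field="created_at", start="2025-01-01", end="2025-01-31")
# Internally this produces filters roughly equivalent to:
#   {"created_at__date__range": ["2025-01-01", "2025-01-31"]}
```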
@@ -1,13 +1,13 @@
+from __future__ import annotations
 import datetime
-import logging
 import threading
-from typing import Optional, Any, Dict
+from typing import Optional, Any, Dict, ClassVar

 import dask.dataframe as dd
 import fsspec

 from sibi_dst.df_helper import DfHelper
-from sibi_dst.utils import DataWrapper, DateUtils, Logger, ParquetSaver, UpdatePlanner
+from sibi_dst.utils import DataWrapper, DateUtils, UpdatePlanner
 from sibi_dst.utils import MissingManifestManager


@@ -54,18 +54,10 @@ class ParquetArtifact(DfHelper):
     :ivar fs: Filesystem object used for storage operations.
     :type fs: fsspec.AbstractFileSystem
     """
-    DEFAULT_CONFIG = {
+    DEFAULT_CONFIG: ClassVar[Dict[str, str]] = {
         'backend': 'parquet'
     }

-    # DEFAULT_UPDATE_PLANNER_CONFIG = {
-    #     'reverse_order': True,
-    #     'overwrite': False,
-    #     'ignore_missing': True,
-    #     'history_days_threshold': 30,
-    #     'max_age_minutes': 10,
-    #     'show_progress': False
-    # }

     def __init__(self, data_wrapper_class, **kwargs):
         """
@@ -97,7 +89,7 @@ class ParquetArtifact(DfHelper):
             **kwargs,
         }
         self.df: Optional[dd.DataFrame] = None
-        self._setup_logging()
+        super().__init__(**self.config)
         self.data_wrapper_class = data_wrapper_class

         self.date_field = self._validate_required('date_field')
@@ -106,37 +98,20 @@ class ParquetArtifact(DfHelper):
         self.parquet_start_date = self._validate_required('parquet_start_date')
         self.parquet_end_date = self._validate_required('parquet_end_date')

-        # Filesystem setup
-        self.filesystem_type = self.config.setdefault('filesystem_type', 'file')
-        self.filesystem_options = self.config.setdefault('filesystem_options', {})
-        self.fs = self.config.setdefault('fs', None)
-        self._own_fs = self.fs is None
-        if self.fs is None:
-            self.fs = fsspec.filesystem(self.filesystem_type, **self.filesystem_options)
-            self._own_fs = True
-        self.config.setdefault('fs', self.fs)
-        ## Populate to parameters to pass to data_wrapper_class
         self.class_params = self.config.pop('class_params', {
             'debug': self.debug,
             'logger': self.logger,
             'fs': self.fs,
+            'verbose': self.verbose,
         })
         # Populate parameters to pass to load method of DataWrapper class
         self.load_params = self.config.setdefault('load_params', {})
         # Ensure the directory exists
         self.ensure_directory_exists(self.parquet_storage_path)
-        super().__init__(**self.config)
+        #super().__init__(**self.config)
         self.update_planner_params = {}
         self.datawrapper_params = {}

-    def _setup_logging(self):
-        """Initialize logger and debug settings."""
-        self.debug = self.config.get('debug', False)
-        self.logger = self.config.get('logger',
-                                      Logger.default_logger(
-                                          logger_name=f'Parquet_Artifact_InstanceOf_{self.__class__.__name__}'))
-        self.logger.set_level(Logger.DEBUG if self.debug else Logger.INFO)
-
     def _validate_required(self, key: str) -> Any:
         """Validate required configuration fields."""
         value = self.config.setdefault(key, None)
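Since logging and filesystem setup are now inherited from `ManagedResource` via `DfHelper`, a `ParquetArtifact` is configured purely through kwargs. A hedged construction sketch; the wrapper class, field name, dates, and paths are placeholders, and additional keys may be required by the concrete `DataWrapper`:

```python
import fsspec

artifact = ParquetArtifact(
    data_wrapper_class=MyDataWrapper,          # placeholder DataWrapper subclass
    date_field="created_at",                   # placeholder date field
    parquet_storage_path="/data/my_artifact",  # placeholder path
    parquet_filename="my_artifact.parquet",    # placeholder file name
    parquet_start_date="2025-01-01",
    parquet_end_date="2025-01-31",
    fs=fsspec.filesystem("file"),
    debug=True,
)
```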
@@ -150,7 +125,9 @@ class ParquetArtifact(DfHelper):
         self.mmanifest = MissingManifestManager(
             fs=self.fs,
             manifest_path=self.missing_manifest_path,
-            clear_existing=overwrite
+            clear_existing=overwrite,
+            debug= self.debug,
+            logger=self.logger
         )

         # Initialize skipped files
@@ -158,7 +135,7 @@ class ParquetArtifact(DfHelper):
         if not manifest_exists:
             self.logger.info(f"Creating new manifest at {self.missing_manifest_path}")
             self.mmanifest.save()
-            self.mmanifest.cleanup_temp_manifests()
+            #self.mmanifest.cleanup_temp_manifests()
         else:
             self.logger.info(f"Manifest already exists at {self.missing_manifest_path}")

@@ -172,7 +149,7 @@ class ParquetArtifact(DfHelper):
     def _setup_update_planner(self, **kwargs) -> None:
         self._prepare_update_params(**kwargs)
         self.update_planner = UpdatePlanner(**self.update_planner_params)
-        self.update_planner.generate_plan(self.start_date, self.end_date)
+        self.update_planner.generate_plan(start=self.start_date,end= self.end_date)

     def load(self, **kwargs):
         with self._lock:
@@ -197,36 +174,45 @@ class ParquetArtifact(DfHelper):
         with DataWrapper(self.data_wrapper_class, **params) as dw:
             dw.process()

-    def __enter__(self):
-        if getattr(self, "_entered", False):
-            return self
-        self._entered = True
-        return self
-
     def __exit__(self, exc_type, exc_value, traceback):
         try:
             if self.mmanifest and self.mmanifest._new_records:
                 self.mmanifest.save()
-                self.mmanifest.cleanup_temp_manifests()
-            if getattr(self, "_entered", False) and self.fs and self._own_fs:
-                self.fs.close()
         except Exception as e:
             self.logger.warning(f"Error closing filesystem: {e}")
         finally:
-            self._entered = False
+            super().__exit__(exc_type, exc_value, traceback)
         # return False so exceptions aren’t suppressed
         return False

-    @classmethod
-    def get_size_estimate(cls, parquet_path: str, **kwargs) -> int:
+    def get_size_estimate(self, **kwargs) -> int:
         """
-        Estimate complexity as total bytes of all .parquet files under parquet_path.
-        Returns size in megabytes (so you can cap or scale priority sensibly).
+        Synchronously estimates artifact size for use in multi-threaded environments.
+
+        This method safely executes asynchronous I/O operations from a synchronous
+        context, handling variations in fsspec filesystem implementations.
         """
-        fs, _, paths = cls.fs.get_fs_token_paths(parquet_path)
-        files = fs.glob(f"{parquet_path}/*.parquet")
-        total_bytes = sum(fs.size(f) for f in files)
-        # convert to “units” (e.g. MB) so priorities stay in a reasonable range
+
+        async def _get_total_bytes_async():
+            """A helper async coroutine to perform the I/O."""
+            import asyncio
+
+            files = await self.fs._glob(f"{self.parquet_storage_path}/*.parquet")
+            if not files:
+                return 0
+
+            size_tasks = [self.fs._size(f) for f in files]
+            sizes = await asyncio.gather(*size_tasks)
+            return sum(s for s in sizes if s is not None)
+
+        try:
+            # Attempt the standard fsspec method first
+            total_bytes = self.fs.sync(_get_total_bytes_async())
+        except AttributeError:
+            # fallback for filesystems like s3fs that lack .sync()
+            total_bytes = self.fs.loop.run_until_complete(_get_total_bytes_async())
+
+        # Convert to megabytes, ensuring a minimum of 1
         return max(1, int(total_bytes / (1024 ** 2)))

     def update_parquet(self, period: str = 'today', **kwargs) -> None:
@@ -316,6 +302,8 @@ class ParquetArtifact(DfHelper):
             'parquet_filename': self.parquet_filename,
             'data_path': self.parquet_storage_path,
             'fs': self.fs,
+            'debug': self.debug,
+            'logger': self.logger,
             'class_params': self.class_params,
             'date_field': self.date_field,
             'load_params': self.load_params,
@@ -1,11 +1,10 @@
 import logging
-from typing import Optional
+from typing import Optional, ClassVar, Dict

 import dask.dataframe as dd
 import fsspec

 from sibi_dst.df_helper import DfHelper
-from sibi_dst.utils import Logger

 class ParquetReader(DfHelper):
     """
@@ -44,7 +43,7 @@ class ParquetReader(DfHelper):
     Parquet storage.
     :type fs: fsspec.AbstractFileSystem
     """
-    DEFAULT_CONFIG = {
+    DEFAULT_CONFIG: ClassVar[Dict[str, int]] = {
         'backend': 'parquet'
     }

@@ -54,9 +53,9 @@ class ParquetReader(DfHelper):
             **kwargs,
         }
         self.df: Optional[dd.DataFrame] = None
-        self.debug = self.config.setdefault('debug', False)
-        self.logger = self.config.setdefault('logger', Logger.default_logger(logger_name=self.__class__.__name__))
-        self.logger.set_level(logging.DEBUG if self.debug else logging.INFO)
+        #self.debug = self.config.setdefault('debug', False)
+        #self.logger = self.config.setdefault('logger', Logger.default_logger(logger_name=self.__class__.__name__))
+        #self.logger.set_level(logging.DEBUG if self.debug else logging.INFO)
         self.parquet_storage_path = self.config.setdefault('parquet_storage_path', None)
         if self.parquet_storage_path is None:
             raise ValueError('parquet_storage_path must be set')
@@ -69,12 +68,12 @@ class ParquetReader(DfHelper):
             raise ValueError('parquet_end_date must be set')

         # Filesystem setup
-        self.filesystem_type = filesystem_type
-        self.filesystem_options = filesystem_options or {}
-        self.fs = self.config.setdefault('fs', None)
-        if self.fs is None:
-            self.fs = fsspec.filesystem(self.filesystem_type, **self.filesystem_options)
-        self.config.setdefault('fs', self.fs)
+        #self.filesystem_type = filesystem_type
+        #self.filesystem_options = filesystem_options or {}
+        #self.fs = self.config.setdefault('fs', None)
+        #if self.fs is None:
+        #    self.fs = fsspec.filesystem(self.filesystem_type, **self.filesystem_options)
+        #self.config.setdefault('fs', self.fs)

         if not self.directory_exists():
             raise ValueError(f"{self.parquet_storage_path} does not exist")
@@ -92,7 +91,3 @@ class ParquetReader(DfHelper):
         except FileNotFoundError:
             return False

-    def __exit__(self, exc_type, exc_value, traceback):
-        # Ensure resources are cleaned up
-        if self.fs:
-            self.fs.close()
@@ -85,7 +85,8 @@ class ParquetConfig(BaseModel):
         if self.logger is None:
             self.logger = Logger.default_logger(logger_name=self.__class__.__name__)
         self.logger.set_level(Logger.DEBUG if self.debug else Logger.INFO)
-
+        if self.fs is None:
+            raise ValueError('Parquet Options: File system (fs) must be specified')

         if self.parquet_storage_path is None:
             raise ValueError('Parquet storage path must be specified')
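Because `ParquetConfig` now refuses to build without a filesystem, callers must pass `fs` explicitly. A minimal sketch, assuming `fs`, `parquet_storage_path`, and `debug` are the relevant fields and that no other required fields are omitted:

```python
import fsspec

config = ParquetConfig(
    fs=fsspec.filesystem("file"),          # now required
    parquet_storage_path="/data/parquet",  # placeholder path
    debug=False,
)
```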
@@ -1,5 +1,5 @@
 from __future__ import annotations
-
+import os
 import threading
 from contextlib import contextmanager
 from typing import Any, Optional, ClassVar, Generator, Type, Dict
@@ -49,10 +49,10 @@ class SqlAlchemyConnectionConfig(BaseModel):
     debug: bool = False

     # --- Pool Configuration ---
-    pool_size: int = 5
-    max_overflow: int = 10
-    pool_timeout: int = 30
-    pool_recycle: int = 1800
+    pool_size: int = int(os.environ.get("DB_POOL_SIZE", 5))
+    max_overflow: int = int(os.environ.get("DB_MAX_OVERFLOW",10))
+    pool_timeout: int = int(os.environ.get("DB_POOL_TIMEOUT", 30))
+    pool_recycle: int = int(os.environ.get("DB_POOL_RECYCLE", 1800))
     pool_pre_ping: bool = True
     poolclass: Type[QueuePool] = QueuePool

@@ -60,6 +60,7 @@
     model: Optional[Type[Any]] = None
     engine: Optional[Engine] = None
     logger: Optional[Logger] = None
+    _own_logger: bool = False # Indicates if this instance owns the logger.
     session_factory: Optional[sessionmaker] = None

     # --- Private State ---
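The pool settings introduced above are read from environment variables when the class body is evaluated, i.e. at import time, so they must be set before the backend module is imported. A sketch (the absolute import path is inferred from the relative imports earlier in this diff):

```python
import os

# Must run before the SQLAlchemy backend module is imported, because the
# class-level defaults are evaluated once at import time.
os.environ["DB_POOL_SIZE"] = "20"
os.environ["DB_MAX_OVERFLOW"] = "40"
os.environ["DB_POOL_TIMEOUT"] = "60"
os.environ["DB_POOL_RECYCLE"] = "3600"

from sibi_dst.df_helper.backends.sqlalchemy import SqlAlchemyConnectionConfig
```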
@@ -102,10 +103,13 @@ class SqlAlchemyConnectionConfig(BaseModel):

     def _init_logger(self) -> None:
         """Initializes the logger for this instance."""
+        # This is not a ManagedResource subclass, so we handle logger initialization directly.
+        # unless a logger is provided, we create our own.
         if self.logger is None:
+            self._own_logger = True
             self.logger = Logger.default_logger(logger_name=self.__class__.__name__)
-        log_level = Logger.DEBUG if self.debug else Logger.INFO
-        self.logger.set_level(log_level)
+            log_level = Logger.DEBUG if self.debug else Logger.INFO
+            self.logger.set_level(log_level)

     def _get_engine_key(self) -> tuple:
         """Generates a unique, normalized key for an engine configuration."""
@@ -146,6 +150,8 @@ class SqlAlchemyConnectionConfig(BaseModel):
             self.logger.error(f"Failed to create engine: {e}")
             raise SQLAlchemyError(f"Engine creation failed: {e}") from e

+        #self.logger.debug(f"Connections Active: {self.active_connections}")
+
     def close(self) -> None:
         """
         Decrements the engine's reference count and disposes of the engine
@@ -176,8 +182,6 @@ class SqlAlchemyConnectionConfig(BaseModel):
         # Mark this instance as closed to prevent subsequent calls.
         self._closed = True

-    # ... (the rest of your methods like _attach_events, _on_checkout, get_session, etc. remain unchanged)
-    # They are omitted here for brevity but should be included in your final file.

     def _attach_events(self) -> None:
         """Attaches checkout/checkin events to the engine for connection tracking."""
@@ -191,7 +195,7 @@ class SqlAlchemyConnectionConfig(BaseModel):
         wrapper = self._engine_registry.get(self._engine_key_instance)
         if wrapper:
             wrapper['active_connections'] += 1
-        self.logger.debug(f"Connection checked out. Active: {self.active_connections}")
+        # self.logger.debug(f"Connection checked out. Active: {self.active_connections}")

     def _on_checkin(self, *args) -> None:
         """Event listener for when a connection is returned to the pool."""
@@ -199,7 +203,7 @@ class SqlAlchemyConnectionConfig(BaseModel):
         wrapper = self._engine_registry.get(self._engine_key_instance)
         if wrapper:
             wrapper['active_connections'] = max(0, wrapper['active_connections'] - 1)
-        self.logger.debug(f"Connection checked in. Active: {self.active_connections}")
+        # self.logger.debug(f"Connection checked in. Active: {self.active_connections}")

     @property
     def active_connections(self) -> int:
@@ -1,5 +1,6 @@
+from __future__ import annotations

-from typing import Type
+from typing import Type, Any

 import dask
 import dask.dataframe as dd
@@ -13,11 +14,12 @@ from sqlalchemy.orm import declarative_base
 import time
 from sqlalchemy.exc import TimeoutError as SASQLTimeoutError, OperationalError
 import sqlalchemy as sa
+
+from sibi_dst.utils import ManagedResource
 from sibi_dst.df_helper.core import FilterHandler
-from sibi_dst.utils import Logger


-class SQLAlchemyDask:
+class SQLAlchemyDask(ManagedResource):
     """
     Loads data from a database into a Dask DataFrame using a memory-safe,
     non-parallel, paginated approach.
@@ -46,8 +48,7 @@ class SQLAlchemyDask:
         filters: dict,
         engine: Engine,
         chunk_size: int = 1000,
-        logger=None,
-        debug: bool = False,
+        **kwargs
     ):
         """
         Initializes the data loader.
@@ -60,14 +61,13 @@
             logger: A logger instance.
             debug: Whether to enable detailed logging.
         """
+        super().__init__(**kwargs)
         self.model = model
         self.filters = filters
         self.engine = engine
         self.chunk_size = chunk_size
-        self.debug = debug
-        self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
-        self.logger.set_level(Logger.DEBUG if debug else Logger.INFO)
         self.filter_handler_cls = FilterHandler
+        self.total_records = -1 # Initialize to -1 to indicate uncounted

     @classmethod
     def infer_meta_from_model(cls, model: Type[declarative_base()]) -> dict:
@@ -83,7 +83,7 @@
                 dtypes[column.name] = dtype
         return dtypes

-    def read_frame(self, fillna_value=None) -> dd.DataFrame:
+    def read_frame(self, fillna_value=None) -> tuple[int | Any, Any] | Any:
         """
         Builds and executes a query to load data into a Dask DataFrame.

@@ -105,7 +105,8 @@
             ).apply_filters(query, model=self.model, filters=self.filters)
         else:
             query = query.limit(self.chunk_size)
-        self.logger.debug(f"Base query for pagination: {query}")
+        if self.verbose:
+            self.logger.debug(f"Base query for pagination: {query}")

         # 2. Get metadata for the Dask DataFrame structure
         ordered_columns = [column.name for column in self.model.__table__.columns]
@@ -116,6 +117,7 @@

         retry_attempts = 3
         backoff_factor = 0.5 # start with a 0.5-second delay
+        total_records = 0

         for attempt in range(retry_attempts):
             try:
@@ -134,11 +136,12 @@
                     time.sleep(backoff_factor)
                     backoff_factor *= 2 # Double the backoff time for the next attempt
                 else:
+                    self.total_records = -1 # Indicate failure to count records
                     self.logger.error(
                         "Failed to get a connection from the pool after several retries.",
                         exc_info=True
                     )
-                    return dd.from_pandas(meta_df, npartitions=1)
+                    return self.total_records, dd.from_pandas(meta_df, npartitions=1)
             except OperationalError as oe:
                 # sometimes the DB driver wraps timeouts in OperationalError
                 if "timeout" in str(oe).lower():
@@ -147,15 +150,18 @@
                     backoff_factor *= 2
                     continue
                 else:
+                    self.total_records = -1 # Indicate failure to count records
                     self.logger.error("OperationalError", exc_info=True)
-                    return dd.from_pandas(meta_df, npartitions=1)
+                    return self.total_records, dd.from_pandas(meta_df, npartitions=1)
             except Exception as e:
+                self.total_records = -1 # Indicate failure to count records
                 self.logger.error(f"An unexpected error occurred: {e}", exc_info=True)
-                return dd.from_pandas(meta_df, npartitions=1)
+                return self.total_records, dd.from_pandas(meta_df, npartitions=1)

+        self.total_records = total_records
         if total_records == 0:
             self.logger.warning("Query returned 0 records.")
-            return dd.from_pandas(meta_df, npartitions=1)
+            return self.total_records, dd.from_pandas(meta_df, npartitions=1)

         self.logger.debug(f"Total records to fetch: {total_records}. Chunk size: {self.chunk_size}.")

@@ -179,8 +185,9 @@
         # 5. Construct the final lazy Dask DataFrame from the delayed chunks
         ddf = dd.from_delayed(delayed_chunks, meta=meta_df)
         self.logger.debug(f"Successfully created a lazy Dask DataFrame with {ddf.npartitions} partitions.")
-
-        return ddf
+        if not self._entered:
+            super().cleanup()
+        return self.total_records, ddf

     ## Dask-Only Solution to test in better hardware
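With `SQLAlchemyDask` now a `ManagedResource`, logger/debug/verbose settings travel through `**kwargs`, and `read_frame` returns a `(total_records, ddf)` pair. A hedged sketch of a caller; the model, engine URL, and filter spec are placeholders:

```python
from sqlalchemy import create_engine

engine = create_engine("sqlite:///example.db")   # placeholder engine

loader = SQLAlchemyDask(
    model=MyModel,                 # placeholder declarative model
    filters={"status": "active"},  # placeholder filter spec
    engine=engine,
    chunk_size=1000,
    debug=True,                    # forwarded to ManagedResource via **kwargs
)
total, ddf = loader.read_frame()
if total <= 0:
    print("query failed or returned no rows")
else:
    print(f"{total} records across {ddf.npartitions} partitions")
```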