sibi_dst-2025.1.4-py3-none-any.whl → sibi_dst-2025.1.5-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sibi_dst/__init__.py +4 -1
- sibi_dst/df_helper/__init__.py +2 -2
- sibi_dst/df_helper/_artifact_updater_multi_wrapper.py +355 -163
- sibi_dst/df_helper/_df_helper.py +47 -30
- sibi_dst/df_helper/_parquet_artifact.py +57 -47
- sibi_dst/df_helper/_parquet_reader.py +9 -13
- sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +15 -11
- sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +23 -16
- sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +17 -11
- sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +1 -103
- sibi_dst/utils/__init__.py +3 -2
- sibi_dst/utils/base.py +97 -0
- sibi_dst/utils/clickhouse_writer.py +5 -4
- sibi_dst/utils/data_wrapper.py +69 -84
- sibi_dst/utils/date_utils.py +2 -1
- sibi_dst/utils/log_utils.py +309 -77
- sibi_dst/utils/manifest_manager.py +94 -373
- sibi_dst/utils/parquet_saver.py +98 -173
- sibi_dst/utils/storage_config.py +6 -0
- sibi_dst/utils/storage_manager.py +2 -1
- sibi_dst/utils/update_planner.py +72 -22
- {sibi_dst-2025.1.4.dist-info → sibi_dst-2025.1.5.dist-info}/METADATA +2 -1
- {sibi_dst-2025.1.4.dist-info → sibi_dst-2025.1.5.dist-info}/RECORD +24 -27
- sibi_dst/v3/__init__.py +0 -0
- sibi_dst/v3/backends/__init__.py +0 -0
- sibi_dst/v3/df_helper/__init__.py +0 -0
- sibi_dst/v3/df_helper/_df_helper.py +0 -91
- {sibi_dst-2025.1.4.dist-info → sibi_dst-2025.1.5.dist-info}/WHEEL +0 -0
sibi_dst/df_helper/_df_helper.py
CHANGED
@@ -4,12 +4,12 @@ import warnings
 from typing import Any, Dict, Optional, Union, TypeVar
 
 import dask.dataframe as dd
-import fsspec
 import pandas as pd
 from pydantic import BaseModel
 
 from sibi_dst.df_helper.core import QueryConfig, ParamsConfig, FilterHandler
-from sibi_dst.utils import
+from sibi_dst.utils import ManagedResource
+from sibi_dst.utils import ParquetSaver, ClickHouseWriter
 from .backends.http import HttpConfig
 from .backends.parquet import ParquetConfig
 from .backends.sqlalchemy import SqlAlchemyConnectionConfig, SqlAlchemyLoadFromDb
@@ -27,18 +27,19 @@ class BaseBackend:
         self.helper = helper
         self.logger = helper.logger
         self.debug = helper.debug
+        self.total_records = helper.total_records  # no records loaded yet
 
-    def load(self, **options) -> dd.DataFrame | pd.DataFrame:
+    def load(self, **options) -> tuple[Any, Any] | Union[dd.DataFrame | pd.DataFrame]:
         """Synchronous data loading method. Must be implemented by sync backends."""
         raise NotImplementedError(f"Backend '{self.__class__.__name__}' does not support synchronous loading.")
 
-    async def aload(self, **options) -> dd.DataFrame | pd.DataFrame:
+    async def aload(self, **options) -> tuple[Any, Any] | Union[dd.DataFrame | pd.DataFrame]:
         """Asynchronous data loading method. By default, it calls the sync version."""
         return self.load(**options)
 
 
 class SqlAlchemyBackend(BaseBackend):
-    def load(self, **options) -> dd.DataFrame:
+    def load(self, **options) -> tuple[Any, Any] | Union[dd.DataFrame | pd.DataFrame]:
         try:
             # Process incoming filter options into the ParamsConfig object
             if options and hasattr(self.helper._backend_params, 'parse_params'):
@@ -49,45 +50,51 @@ class SqlAlchemyBackend(BaseBackend):
                 plugin_query=self.helper._backend_query,
                 plugin_params=self.helper._backend_params,
                 logger=self.logger,
-                debug=
+                debug=self.debug
             )
-
+            self.total_records, result = db_loader.build_and_load()
+            return self.total_records, result
         except Exception as e:
             self.logger.error(f"Failed to load data from sqlalchemy: {e}", exc_info=self.debug)
-            return dd.from_pandas(pd.DataFrame(), npartitions=1)
+            return -1, dd.from_pandas(pd.DataFrame(), npartitions=1)
 
 
 class ParquetBackend(BaseBackend):
     """This backend is also purely synchronous."""
 
-    def load(self, **options) -> dd.DataFrame | pd.DataFrame:
+    def load(self, **options) -> tuple[Any, Any] | Union[dd.DataFrame | pd.DataFrame]:
         try:
             df = self.helper.backend_parquet.load_files()
             if options and df is not None:
                 df = FilterHandler('dask', logger=self.logger, debug=False).apply_filters(df, filters=options)
-
+            self.total_records = len(df)
+            return self.total_records, df
         except Exception as e:
+            self.total_records = -1  # Reset total_records on failure
             self.logger.error(f"Failed to load data from parquet: {e}", exc_info=True)
-            return dd.from_pandas(pd.DataFrame(), npartitions=1)
+            return -1, dd.from_pandas(pd.DataFrame(), npartitions=1)
 
 
 class HttpBackend(BaseBackend):
     """This backend is purely asynchronous."""
 
-    def load(self, **options) -> dd.DataFrame | pd.DataFrame:
+    def load(self, **options) -> tuple[Any, Any] | Union[dd.DataFrame | pd.DataFrame]:
         # This will correctly fail by raising NotImplementedError from the base class.
         return self.helper.backend_http.fetch_data(**options)
 
-    async def aload(self, **options) -> Union[pd.DataFrame, dd.DataFrame]:
+    async def aload(self, **options) -> tuple[Any, Any] | Union[pd.DataFrame, dd.DataFrame]:
         if not self.helper.backend_http:
             self.logger.warning("HTTP plugin not configured properly.")
-
-
+            self.total_records = -1
+            return self.total_records, dd.from_pandas(pd.DataFrame(), npartitions=1)
+        result = await self.helper.backend_http.fetch_data(**options)
+        self.total_records = len(result)
+        return self.total_records, result
 
 
 # --- Main DfHelper Facade Class ---
 
-class DfHelper:
+class DfHelper(ManagedResource):
     """
     A reusable utility for loading data. It provides both sync (`load`) and
     async (`aload`) methods to accommodate different backends.
@@ -103,13 +110,14 @@ class DfHelper:
     def __init__(self, backend='sqlalchemy', **kwargs):
         self.default_config = self.default_config or {}
         kwargs = {**self.default_config.copy(), **kwargs}
+        super().__init__(**kwargs)
         self.backend = backend
-
-
-
-        self.fs = kwargs.get("fs", fsspec.filesystem('file'))
+
+        # Need to set default values for backend-specific configurations
+        kwargs.setdefault("debug", self.debug)
         kwargs.setdefault("fs", self.fs)
         kwargs.setdefault("logger", self.logger)
+        self.total_records = -1  # Initialize total_records to -1 to indicate no records loaded yet
         self._backend_query = self._get_config(QueryConfig, kwargs)
         self._backend_params = self._get_config(ParamsConfig, kwargs)
         self.backend_db_connection: Optional[SqlAlchemyConnectionConfig] = None
@@ -127,11 +135,9 @@ class DfHelper:
         if not strategy_class: raise ValueError(f"Unsupported backend: {self.backend}")
         self.backend_strategy = strategy_class(self)
 
-    def __enter__(self):
-        return self
-
     def __exit__(self, exc_type, exc_value, traceback):
         self._cleanup()
+        super().__exit__(exc_type, exc_value, traceback)
 
     def _cleanup(self):
         active_config = getattr(self, f"backend_{self.backend}", None)
@@ -147,14 +153,14 @@ class DfHelper:
     def load(self, as_pandas=False, **options) -> Union[pd.DataFrame, dd.DataFrame]:
         """Loads data synchronously. Fails if backend is async-only."""
         self.logger.debug(f"Loading data from {self.backend} backend with options: {options}")
-        df = self.backend_strategy.load(**options)
+        self.total_records, df = self.backend_strategy.load(**options)
         df = self._process_loaded_data(df)
         df = self._post_process_df(df)
         return df.compute() if as_pandas else df
 
     async def aload(self, as_pandas=False, **options) -> Union[pd.DataFrame, dd.DataFrame]:
         """Loads data asynchronously from any backend."""
-        df = await self.backend_strategy.aload(**options)
+        self.total_records, df = await self.backend_strategy.aload(**options)
         df = self._process_loaded_data(df)
         df = self._post_process_df(df)
         return df.compute() if as_pandas else df
@@ -195,15 +201,27 @@ class DfHelper:
             return
         fs = kwargs.pop('fs', self.fs)
         path = kwargs.pop('parquet_storage_path', self.backend_parquet.parquet_storage_path)
-
-
+        writer_config = {
+            'df_result': df,
+            'parquet_storage_path': path,
+            'fs': fs,
+            'debug': self.debug,
+            'logger': self.logger,
+            'verbose': self.verbose,
+        }
+        with ParquetSaver(**writer_config) as saver:
+            saver.save_to_parquet(parquet_filename)
+
+        self.logger.debug(f"Parquet saved to {parquet_filename} in {path}.")
 
     def save_to_clickhouse(self, df: dd.DataFrame, **credentials):
         if hasattr(df, 'npartitions') and df.npartitions == 1 and not len(df.head(1)):
             self.logger.warning("Cannot write to ClickHouse; DataFrame is empty.")
             return
-
-        self.logger.
+
+        with ClickHouseWriter(debug=self.debug, logger=self.logger, verbose=self.verbose, **credentials) as writer:
+            writer.save_to_clickhouse(df)
+            self.logger.debug("Save to ClickHouse completed.")
 
     def load_period(self, dt_field: str, start: str, end: str, **kwargs) -> Union[pd.DataFrame, dd.DataFrame]:
         """Synchronous convenience method for loading a date range."""
@@ -229,4 +247,3 @@
         kwargs[f"{mapped_field}__date__range"] = [start_date, end_date]
         self.logger.debug(f"Period load generated filters: {kwargs}")
         return kwargs
-
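The hunks above change the backend contract: `load()` and `aload()` now return a `(total_records, dataframe)` pair, with `-1` signalling a failed or uncounted load, and `DfHelper.load()`/`aload()` unpack that pair into `self.total_records` before post-processing. A standalone sketch of the convention (a toy backend, not the package's code):

```python
# Toy illustration of the (total_records, DataFrame) convention used above.
# Assumes only pandas and dask; the class name and sample data are made up.
import dask.dataframe as dd
import pandas as pd


class ToyBackend:
    def load(self, **options) -> tuple[int, dd.DataFrame]:
        try:
            pdf = pd.DataFrame({"id": [1, 2, 3]})
            df = dd.from_pandas(pdf, npartitions=1)
            return len(pdf), df  # success: record count plus lazy frame
        except Exception:
            # mirror the error path above: -1 plus an empty frame
            return -1, dd.from_pandas(pd.DataFrame(), npartitions=1)


total_records, df = ToyBackend().load()
assert total_records == 3
assert len(df.compute()) == 3
```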
sibi_dst/df_helper/_parquet_artifact.py
CHANGED
@@ -1,3 +1,4 @@
+from __future__ import annotations
 import datetime
 import logging
 import threading
@@ -7,7 +8,7 @@ import dask.dataframe as dd
 import fsspec
 
 from sibi_dst.df_helper import DfHelper
-from sibi_dst.utils import DataWrapper, DateUtils,
+from sibi_dst.utils import DataWrapper, DateUtils, UpdatePlanner
 from sibi_dst.utils import MissingManifestManager
 
 
@@ -58,14 +59,6 @@ class ParquetArtifact(DfHelper):
         'backend': 'parquet'
     }
 
-    # DEFAULT_UPDATE_PLANNER_CONFIG = {
-    # 'reverse_order': True,
-    # 'overwrite': False,
-    # 'ignore_missing': True,
-    # 'history_days_threshold': 30,
-    # 'max_age_minutes': 10,
-    # 'show_progress': False
-    # }
 
     def __init__(self, data_wrapper_class, **kwargs):
         """
@@ -97,7 +90,9 @@ class ParquetArtifact(DfHelper):
             **kwargs,
         }
         self.df: Optional[dd.DataFrame] = None
-        self.
+        super().__init__(**self.config)
+        #self._own_logger = False
+        #self._setup_logging()
         self.data_wrapper_class = data_wrapper_class
 
         self.date_field = self._validate_required('date_field')
@@ -107,35 +102,37 @@ class ParquetArtifact(DfHelper):
         self.parquet_end_date = self._validate_required('parquet_end_date')
 
         # Filesystem setup
-        self.filesystem_type = self.config.setdefault('filesystem_type', 'file')
-        self.filesystem_options = self.config.setdefault('filesystem_options', {})
-        self.fs = self.config.setdefault('fs', None)
-        self._own_fs = self.fs is None
-        if self.fs is None:
-
-
-        self.config.setdefault('fs', self.fs)
+        #self.filesystem_type = self.config.setdefault('filesystem_type', 'file')
+        #self.filesystem_options = self.config.setdefault('filesystem_options', {})
+        #self.fs = self.config.setdefault('fs', None)
+        #self._own_fs = self.fs is None
+        #if self.fs is None:
+        #    self.fs = fsspec.filesystem(self.filesystem_type, **self.filesystem_options)
+        #    self._own_fs = True
+        #self.config.setdefault('fs', self.fs)
         ## Populate to parameters to pass to data_wrapper_class
         self.class_params = self.config.pop('class_params', {
             'debug': self.debug,
             'logger': self.logger,
             'fs': self.fs,
+            'verbose': self.verbose,
         })
         # Populate parameters to pass to load method of DataWrapper class
         self.load_params = self.config.setdefault('load_params', {})
         # Ensure the directory exists
         self.ensure_directory_exists(self.parquet_storage_path)
-        super().__init__(**self.config)
+        #super().__init__(**self.config)
         self.update_planner_params = {}
         self.datawrapper_params = {}
 
-    def _setup_logging(self):
-
-
-
-
-
-
+    #def _setup_logging(self):
+    #    """Initialize logger and debug settings."""
+    #    self.debug = self.config.get('debug', False)
+    #    logger = self.config.get('logger', None)
+    #    self._own_logger = logger is None
+    #    self.logger = logger or Logger.default_logger(
+    #        logger_name=f'Parquet_Artifact_InstanceOf_{self.__class__.__name__}')
+    #    self.logger.set_level(Logger.DEBUG if self.debug else Logger.INFO)
 
     def _validate_required(self, key: str) -> Any:
         """Validate required configuration fields."""
@@ -150,7 +147,9 @@ class ParquetArtifact(DfHelper):
         self.mmanifest = MissingManifestManager(
             fs=self.fs,
             manifest_path=self.missing_manifest_path,
-            clear_existing=overwrite
+            clear_existing=overwrite,
+            debug= self.debug,
+            logger=self.logger
         )
 
         # Initialize skipped files
@@ -158,7 +157,7 @@ class ParquetArtifact(DfHelper):
         if not manifest_exists:
             self.logger.info(f"Creating new manifest at {self.missing_manifest_path}")
             self.mmanifest.save()
-            self.mmanifest.cleanup_temp_manifests()
+            #self.mmanifest.cleanup_temp_manifests()
         else:
             self.logger.info(f"Manifest already exists at {self.missing_manifest_path}")
 
@@ -172,7 +171,7 @@ class ParquetArtifact(DfHelper):
     def _setup_update_planner(self, **kwargs) -> None:
         self._prepare_update_params(**kwargs)
         self.update_planner = UpdatePlanner(**self.update_planner_params)
-        self.update_planner.generate_plan(self.start_date, self.end_date)
+        self.update_planner.generate_plan(start=self.start_date,end= self.end_date)
 
     def load(self, **kwargs):
         with self._lock:
@@ -197,36 +196,45 @@ class ParquetArtifact(DfHelper):
         with DataWrapper(self.data_wrapper_class, **params) as dw:
             dw.process()
 
-    def __enter__(self):
-        if getattr(self, "_entered", False):
-            return self
-        self._entered = True
-        return self
-
     def __exit__(self, exc_type, exc_value, traceback):
         try:
             if self.mmanifest and self.mmanifest._new_records:
                 self.mmanifest.save()
-                self.mmanifest.cleanup_temp_manifests()
-            if getattr(self, "_entered", False) and self.fs and self._own_fs:
-                self.fs.close()
         except Exception as e:
             self.logger.warning(f"Error closing filesystem: {e}")
         finally:
-
+            super().__exit__(exc_type, exc_value, traceback)
        # return False so exceptions aren’t suppressed
         return False
 
-
-    def get_size_estimate(cls, parquet_path: str, **kwargs) -> int:
+    def get_size_estimate(self, **kwargs) -> int:
         """
-
-
+        Synchronously estimates artifact size for use in multi-threaded environments.
+
+        This method uses the filesystem's own .sync() method to safely execute
+        asynchronous I/O operations from a synchronous context, preventing
+        event loop conflicts.
         """
-
-
-
-
+
+        async def _get_total_bytes_async():
+            """A helper async coroutine to perform the I/O."""
+            import asyncio
+
+            # Use the async versions of fsspec methods (e.g., _glob, _size)
+            files = await self.fs._glob(f"{self.parquet_storage_path}/*.parquet")
+            if not files:
+                return 0
+
+            # Concurrently gather the size of all files for performance
+            size_tasks = [self.fs._size(f) for f in files]
+            sizes = await asyncio.gather(*size_tasks)
+            return sum(s for s in sizes if s is not None)
+
+        # Use the filesystem's own built-in sync method. This is the most
+        # reliable way to bridge the sync/async gap for fsspec.
+        total_bytes = self.fs.sync(_get_total_bytes_async())
+
+        # Convert to megabytes, ensuring a minimum of 1
         return max(1, int(total_bytes / (1024 ** 2)))
 
     def update_parquet(self, period: str = 'today', **kwargs) -> None:
@@ -316,6 +324,8 @@ class ParquetArtifact(DfHelper):
             'parquet_filename': self.parquet_filename,
             'data_path': self.parquet_storage_path,
             'fs': self.fs,
+            'debug': self.debug,
+            'logger': self.logger,
             'class_params': self.class_params,
             'date_field': self.date_field,
             'load_params': self.load_params,
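`get_size_estimate()` now runs its I/O through an async helper and the filesystem's `sync()` bridge so it can be called from worker threads without fighting an event loop. For comparison, the same megabyte estimate can be computed with fsspec's plain blocking API; this is a hedged sketch (the local filesystem and glob pattern are assumptions), not the package's implementation:

```python
# Synchronous counterpart of the size estimate shown in the diff above.
# Uses only fsspec's blocking calls (fs.glob / fs.size), which is sufficient
# when no event loop is involved.
import fsspec


def estimate_size_mb(parquet_storage_path: str) -> int:
    fs = fsspec.filesystem("file")  # assumption: a local filesystem
    files = fs.glob(f"{parquet_storage_path}/*.parquet")
    total_bytes = sum(fs.size(f) for f in files) if files else 0
    # Same rounding rule as get_size_estimate: megabytes, minimum of 1
    return max(1, int(total_bytes / (1024 ** 2)))
```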
sibi_dst/df_helper/_parquet_reader.py
CHANGED
@@ -54,9 +54,9 @@ class ParquetReader(DfHelper):
             **kwargs,
         }
         self.df: Optional[dd.DataFrame] = None
-        self.debug = self.config.setdefault('debug', False)
-        self.logger = self.config.setdefault('logger', Logger.default_logger(logger_name=self.__class__.__name__))
-        self.logger.set_level(logging.DEBUG if self.debug else logging.INFO)
+        #self.debug = self.config.setdefault('debug', False)
+        #self.logger = self.config.setdefault('logger', Logger.default_logger(logger_name=self.__class__.__name__))
+        #self.logger.set_level(logging.DEBUG if self.debug else logging.INFO)
         self.parquet_storage_path = self.config.setdefault('parquet_storage_path', None)
         if self.parquet_storage_path is None:
             raise ValueError('parquet_storage_path must be set')
@@ -69,12 +69,12 @@ class ParquetReader(DfHelper):
             raise ValueError('parquet_end_date must be set')
 
         # Filesystem setup
-        self.filesystem_type = filesystem_type
-        self.filesystem_options = filesystem_options or {}
-        self.fs = self.config.setdefault('fs', None)
-        if self.fs is None:
-
-        self.config.setdefault('fs', self.fs)
+        #self.filesystem_type = filesystem_type
+        #self.filesystem_options = filesystem_options or {}
+        #self.fs = self.config.setdefault('fs', None)
+        #if self.fs is None:
+        #    self.fs = fsspec.filesystem(self.filesystem_type, **self.filesystem_options)
+        #self.config.setdefault('fs', self.fs)
 
         if not self.directory_exists():
             raise ValueError(f"{self.parquet_storage_path} does not exist")
@@ -92,7 +92,3 @@
         except FileNotFoundError:
             return False
 
-    def __exit__(self, exc_type, exc_value, traceback):
-        # Ensure resources are cleaned up
-        if self.fs:
-            self.fs.close()
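With its local `__exit__` gone, `ParquetReader` relies on the context-manager protocol inherited through `DfHelper`/`ManagedResource` for cleanup. A hedged usage sketch (the import path, constructor keywords, and `load()` call are inferred from the validation above and are assumptions, not documented API):

```python
# Assumed usage: the required settings mirror the ValueError checks in the diff.
from sibi_dst.df_helper import ParquetReader  # import path assumed

with ParquetReader(
    parquet_storage_path="file:///data/artifacts/products",  # hypothetical path
    parquet_start_date="2025-01-01",
    parquet_end_date="2025-01-31",
) as reader:
    df = reader.load()  # DfHelper.load() returns a Dask DataFrame
# on exit, the inherited ManagedResource cleanup releases any logger/filesystem it owns
```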
sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py
CHANGED
@@ -1,5 +1,5 @@
 from __future__ import annotations
-
+import os
 import threading
 from contextlib import contextmanager
 from typing import Any, Optional, ClassVar, Generator, Type, Dict
@@ -49,10 +49,10 @@ class SqlAlchemyConnectionConfig(BaseModel):
     debug: bool = False
 
     # --- Pool Configuration ---
-    pool_size: int = 5
-    max_overflow: int = 10
-    pool_timeout: int = 30
-    pool_recycle: int = 1800
+    pool_size: int = int(os.environ.get("DB_POOL_SIZE", 5))
+    max_overflow: int = int(os.environ.get("DB_MAX_OVERFLOW",10))
+    pool_timeout: int = int(os.environ.get("DB_POOL_TIMEOUT", 30))
+    pool_recycle: int = int(os.environ.get("DB_POOL_RECYCLE", 1800))
     pool_pre_ping: bool = True
     poolclass: Type[QueuePool] = QueuePool
 
@@ -60,6 +60,7 @@ class SqlAlchemyConnectionConfig(BaseModel):
     model: Optional[Type[Any]] = None
     engine: Optional[Engine] = None
     logger: Optional[Logger] = None
+    _own_logger: bool = False  # Indicates if this instance owns the logger.
     session_factory: Optional[sessionmaker] = None
 
     # --- Private State ---
@@ -102,10 +103,13 @@
 
     def _init_logger(self) -> None:
         """Initializes the logger for this instance."""
+        # This is not a ManagedResource subclass, so we handle logger initialization directly.
+        # unless a logger is provided, we create our own.
         if self.logger is None:
+            self._own_logger = True
             self.logger = Logger.default_logger(logger_name=self.__class__.__name__)
-
-
+        log_level = Logger.DEBUG if self.debug else Logger.INFO
+        self.logger.set_level(log_level)
 
     def _get_engine_key(self) -> tuple:
         """Generates a unique, normalized key for an engine configuration."""
@@ -146,6 +150,8 @@
             self.logger.error(f"Failed to create engine: {e}")
             raise SQLAlchemyError(f"Engine creation failed: {e}") from e
 
+        #self.logger.debug(f"Connections Active: {self.active_connections}")
+
     def close(self) -> None:
         """
         Decrements the engine's reference count and disposes of the engine
@@ -176,8 +182,6 @@
         # Mark this instance as closed to prevent subsequent calls.
         self._closed = True
 
-    # ... (the rest of your methods like _attach_events, _on_checkout, get_session, etc. remain unchanged)
-    # They are omitted here for brevity but should be included in your final file.
 
     def _attach_events(self) -> None:
         """Attaches checkout/checkin events to the engine for connection tracking."""
@@ -191,7 +195,7 @@
         wrapper = self._engine_registry.get(self._engine_key_instance)
         if wrapper:
             wrapper['active_connections'] += 1
-            self.logger.debug(f"Connection checked out. Active: {self.active_connections}")
+            # self.logger.debug(f"Connection checked out. Active: {self.active_connections}")
 
     def _on_checkin(self, *args) -> None:
         """Event listener for when a connection is returned to the pool."""
@@ -199,7 +203,7 @@
         wrapper = self._engine_registry.get(self._engine_key_instance)
         if wrapper:
             wrapper['active_connections'] = max(0, wrapper['active_connections'] - 1)
-            self.logger.debug(f"Connection checked in. Active: {self.active_connections}")
+            # self.logger.debug(f"Connection checked in. Active: {self.active_connections}")
 
     @property
     def active_connections(self) -> int:
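The pool settings are now read from `DB_POOL_SIZE`, `DB_MAX_OVERFLOW`, `DB_POOL_TIMEOUT` and `DB_POOL_RECYCLE`, with the previous literals as fallbacks. Because the defaults are evaluated when the class body runs, the variables must be set before the module is imported. A standalone sketch of the same fallback logic (not the package's code):

```python
# Environment-driven pool defaults, mirroring the class attributes in the diff.
import os


def pool_settings() -> dict[str, int]:
    return {
        "pool_size": int(os.environ.get("DB_POOL_SIZE", 5)),
        "max_overflow": int(os.environ.get("DB_MAX_OVERFLOW", 10)),
        "pool_timeout": int(os.environ.get("DB_POOL_TIMEOUT", 30)),
        "pool_recycle": int(os.environ.get("DB_POOL_RECYCLE", 1800)),
    }


os.environ["DB_POOL_SIZE"] = "20"
assert pool_settings()["pool_size"] == 20  # unset variables keep their fallbacks
```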
sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py
CHANGED
@@ -1,5 +1,6 @@
+from __future__ import annotations
 
-from typing import Type
+from typing import Type, Any
 
 import dask
 import dask.dataframe as dd
@@ -13,11 +14,12 @@ from sqlalchemy.orm import declarative_base
 import time
 from sqlalchemy.exc import TimeoutError as SASQLTimeoutError, OperationalError
 import sqlalchemy as sa
+
+from sibi_dst.utils import ManagedResource
 from sibi_dst.df_helper.core import FilterHandler
-from sibi_dst.utils import Logger
 
 
-class SQLAlchemyDask:
+class SQLAlchemyDask(ManagedResource):
     """
     Loads data from a database into a Dask DataFrame using a memory-safe,
     non-parallel, paginated approach.
@@ -46,8 +48,7 @@
             filters: dict,
             engine: Engine,
             chunk_size: int = 1000,
-
-            debug: bool = False,
+            **kwargs
     ):
         """
         Initializes the data loader.
@@ -60,14 +61,13 @@
             logger: A logger instance.
             debug: Whether to enable detailed logging.
         """
+        super().__init__(**kwargs)
         self.model = model
         self.filters = filters
         self.engine = engine
         self.chunk_size = chunk_size
-        self.debug = debug
-        self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
-        self.logger.set_level(Logger.DEBUG if debug else Logger.INFO)
         self.filter_handler_cls = FilterHandler
+        self.total_records = -1  # Initialize to -1 to indicate uncounted
 
     @classmethod
     def infer_meta_from_model(cls, model: Type[declarative_base()]) -> dict:
@@ -83,7 +83,7 @@
             dtypes[column.name] = dtype
         return dtypes
 
-    def read_frame(self, fillna_value=None) ->
+    def read_frame(self, fillna_value=None) -> tuple[int | Any, Any] | Any:
         """
         Builds and executes a query to load data into a Dask DataFrame.
 
@@ -105,7 +105,8 @@
             ).apply_filters(query, model=self.model, filters=self.filters)
         else:
             query = query.limit(self.chunk_size)
-        self.
+        if self.verbose:
+            self.logger.debug(f"Base query for pagination: {query}")
 
         # 2. Get metadata for the Dask DataFrame structure
         ordered_columns = [column.name for column in self.model.__table__.columns]
@@ -116,6 +117,7 @@
 
         retry_attempts = 3
         backoff_factor = 0.5  # start with a 0.5-second delay
+        total_records = 0
 
         for attempt in range(retry_attempts):
            try:
@@ -134,11 +136,12 @@
                     time.sleep(backoff_factor)
                     backoff_factor *= 2  # Double the backoff time for the next attempt
                 else:
+                    self.total_records = -1  # Indicate failure to count records
                     self.logger.error(
                         "Failed to get a connection from the pool after several retries.",
                         exc_info=True
                     )
-                    return dd.from_pandas(meta_df, npartitions=1)
+                    return self.total_records, dd.from_pandas(meta_df, npartitions=1)
             except OperationalError as oe:
                 # sometimes the DB driver wraps timeouts in OperationalError
                 if "timeout" in str(oe).lower():
@@ -147,15 +150,18 @@
                     backoff_factor *= 2
                     continue
                 else:
+                    self.total_records = -1  # Indicate failure to count records
                     self.logger.error("OperationalError", exc_info=True)
-                    return dd.from_pandas(meta_df, npartitions=1)
+                    return self.total_records, dd.from_pandas(meta_df, npartitions=1)
             except Exception as e:
+                self.total_records = -1  # Indicate failure to count records
                 self.logger.error(f"An unexpected error occurred: {e}", exc_info=True)
-                return dd.from_pandas(meta_df, npartitions=1)
+                return self.total_records, dd.from_pandas(meta_df, npartitions=1)
 
+        self.total_records = total_records
         if total_records == 0:
             self.logger.warning("Query returned 0 records.")
-            return dd.from_pandas(meta_df, npartitions=1)
+            return self.total_records, dd.from_pandas(meta_df, npartitions=1)
 
         self.logger.debug(f"Total records to fetch: {total_records}. Chunk size: {self.chunk_size}.")
 
@@ -179,8 +185,9 @@
         # 5. Construct the final lazy Dask DataFrame from the delayed chunks
         ddf = dd.from_delayed(delayed_chunks, meta=meta_df)
         self.logger.debug(f"Successfully created a lazy Dask DataFrame with {ddf.npartitions} partitions.")
-
-
+        if not self._entered:
+            super().cleanup()
+        return self.total_records, ddf
 
    ## Dask-Only Solution to test in better hardware
 
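`read_frame()` keeps its paginated design: it counts the records, builds one delayed task per chunk, and stitches the chunks together with `dd.from_delayed`, now returning the count alongside the lazy frame. A simplified, database-free sketch of that pattern (the `fetch_page` callable stands in for the SQL pagination and is an assumption):

```python
# Pagination pattern behind read_frame, reduced to plain pandas/dask.
import dask
import dask.dataframe as dd
import pandas as pd


def load_paginated(fetch_page, total_records: int, chunk_size: int,
                   meta_df: pd.DataFrame) -> tuple[int, dd.DataFrame]:
    if total_records <= 0:
        # same convention as the diff: return the count plus an empty frame
        return total_records, dd.from_pandas(meta_df, npartitions=1)
    offsets = range(0, total_records, chunk_size)
    delayed_chunks = [dask.delayed(fetch_page)(offset, chunk_size) for offset in offsets]
    return total_records, dd.from_delayed(delayed_chunks, meta=meta_df)


# toy usage: pages are slices of an in-memory frame
source = pd.DataFrame({"id": range(10)})
total, ddf = load_paginated(lambda offset, n: source.iloc[offset:offset + n],
                            len(source), 4, source.iloc[0:0])
assert total == 10 and len(ddf.compute()) == 10
```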