sibi-dst: sibi_dst-2025.1.4-py3-none-any.whl → sibi_dst-2025.1.6-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sibi_dst/__init__.py +4 -1
- sibi_dst/df_helper/__init__.py +2 -2
- sibi_dst/df_helper/_artifact_updater_multi_wrapper.py +355 -163
- sibi_dst/df_helper/_df_helper.py +47 -30
- sibi_dst/df_helper/_parquet_artifact.py +41 -53
- sibi_dst/df_helper/_parquet_reader.py +11 -16
- sibi_dst/df_helper/backends/parquet/_parquet_options.py +2 -1
- sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +15 -11
- sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +23 -16
- sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +17 -11
- sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +1 -103
- sibi_dst/utils/__init__.py +3 -2
- sibi_dst/utils/base.py +117 -0
- sibi_dst/utils/clickhouse_writer.py +7 -5
- sibi_dst/utils/data_wrapper.py +64 -89
- sibi_dst/utils/date_utils.py +2 -1
- sibi_dst/utils/log_utils.py +309 -77
- sibi_dst/utils/manifest_manager.py +94 -373
- sibi_dst/utils/parquet_saver.py +98 -173
- sibi_dst/utils/storage_config.py +6 -0
- sibi_dst/utils/storage_manager.py +2 -1
- sibi_dst/utils/update_planner.py +75 -25
- {sibi_dst-2025.1.4.dist-info → sibi_dst-2025.1.6.dist-info}/METADATA +4 -1
- {sibi_dst-2025.1.4.dist-info → sibi_dst-2025.1.6.dist-info}/RECORD +25 -28
- sibi_dst/v3/__init__.py +0 -0
- sibi_dst/v3/backends/__init__.py +0 -0
- sibi_dst/v3/df_helper/__init__.py +0 -0
- sibi_dst/v3/df_helper/_df_helper.py +0 -91
- {sibi_dst-2025.1.4.dist-info → sibi_dst-2025.1.6.dist-info}/WHEEL +0 -0
sibi_dst/df_helper/_df_helper.py
CHANGED
```diff
@@ -4,12 +4,12 @@ import warnings
 from typing import Any, Dict, Optional, Union, TypeVar
 
 import dask.dataframe as dd
-import fsspec
 import pandas as pd
 from pydantic import BaseModel
 
 from sibi_dst.df_helper.core import QueryConfig, ParamsConfig, FilterHandler
-from sibi_dst.utils import
+from sibi_dst.utils import ManagedResource
+from sibi_dst.utils import ParquetSaver, ClickHouseWriter
 from .backends.http import HttpConfig
 from .backends.parquet import ParquetConfig
 from .backends.sqlalchemy import SqlAlchemyConnectionConfig, SqlAlchemyLoadFromDb
@@ -27,18 +27,19 @@ class BaseBackend:
         self.helper = helper
         self.logger = helper.logger
         self.debug = helper.debug
+        self.total_records = helper.total_records  # no records loaded yet
 
-    def load(self, **options) -> dd.DataFrame | pd.DataFrame:
+    def load(self, **options) -> tuple[Any, Any] | Union[dd.DataFrame | pd.DataFrame]:
         """Synchronous data loading method. Must be implemented by sync backends."""
         raise NotImplementedError(f"Backend '{self.__class__.__name__}' does not support synchronous loading.")
 
-    async def aload(self, **options) -> dd.DataFrame | pd.DataFrame:
+    async def aload(self, **options) -> tuple[Any, Any] | Union[dd.DataFrame | pd.DataFrame]:
         """Asynchronous data loading method. By default, it calls the sync version."""
         return self.load(**options)
 
 
 class SqlAlchemyBackend(BaseBackend):
-    def load(self, **options) -> dd.DataFrame:
+    def load(self, **options) -> tuple[Any, Any] | Union[dd.DataFrame | pd.DataFrame]:
         try:
             # Process incoming filter options into the ParamsConfig object
             if options and hasattr(self.helper._backend_params, 'parse_params'):
@@ -49,45 +50,51 @@ class SqlAlchemyBackend(BaseBackend):
                 plugin_query=self.helper._backend_query,
                 plugin_params=self.helper._backend_params,
                 logger=self.logger,
-                debug=
+                debug=self.debug
             )
-
+            self.total_records, result = db_loader.build_and_load()
+            return self.total_records, result
         except Exception as e:
             self.logger.error(f"Failed to load data from sqlalchemy: {e}", exc_info=self.debug)
-            return dd.from_pandas(pd.DataFrame(), npartitions=1)
+            return -1, dd.from_pandas(pd.DataFrame(), npartitions=1)
 
 
 class ParquetBackend(BaseBackend):
     """This backend is also purely synchronous."""
 
-    def load(self, **options) -> dd.DataFrame | pd.DataFrame:
+    def load(self, **options) -> tuple[Any, Any] | Union[dd.DataFrame | pd.DataFrame]:
         try:
             df = self.helper.backend_parquet.load_files()
             if options and df is not None:
                 df = FilterHandler('dask', logger=self.logger, debug=False).apply_filters(df, filters=options)
-
+            self.total_records = len(df)
+            return self.total_records, df
         except Exception as e:
+            self.total_records = -1  # Reset total_records on failure
             self.logger.error(f"Failed to load data from parquet: {e}", exc_info=True)
-            return dd.from_pandas(pd.DataFrame(), npartitions=1)
+            return -1, dd.from_pandas(pd.DataFrame(), npartitions=1)
 
 
 class HttpBackend(BaseBackend):
     """This backend is purely asynchronous."""
 
-    def load(self, **options) -> dd.DataFrame | pd.DataFrame:
+    def load(self, **options) -> tuple[Any, Any] | Union[dd.DataFrame | pd.DataFrame]:
         # This will correctly fail by raising NotImplementedError from the base class.
         return self.helper.backend_http.fetch_data(**options)
 
-    async def aload(self, **options) -> Union[pd.DataFrame, dd.DataFrame]:
+    async def aload(self, **options) -> tuple[Any, Any] | Union[pd.DataFrame, dd.DataFrame]:
         if not self.helper.backend_http:
             self.logger.warning("HTTP plugin not configured properly.")
-
-
+            self.total_records = -1
+            return self.total_records, dd.from_pandas(pd.DataFrame(), npartitions=1)
+        result = await self.helper.backend_http.fetch_data(**options)
+        self.total_records = len(result)
+        return self.total_records, result
 
 
 # --- Main DfHelper Facade Class ---
 
-class DfHelper:
+class DfHelper(ManagedResource):
     """
     A reusable utility for loading data. It provides both sync (`load`) and
     async (`aload`) methods to accommodate different backends.
@@ -103,13 +110,14 @@ class DfHelper:
     def __init__(self, backend='sqlalchemy', **kwargs):
         self.default_config = self.default_config or {}
         kwargs = {**self.default_config.copy(), **kwargs}
+        super().__init__(**kwargs)
         self.backend = backend
-
-
-
-        self.fs = kwargs.get("fs", fsspec.filesystem('file'))
+
+        # Need to set default values for backend-specific configurations
+        kwargs.setdefault("debug", self.debug)
         kwargs.setdefault("fs", self.fs)
         kwargs.setdefault("logger", self.logger)
+        self.total_records = -1  # Initialize total_records to -1 to indicate no records loaded yet
         self._backend_query = self._get_config(QueryConfig, kwargs)
         self._backend_params = self._get_config(ParamsConfig, kwargs)
         self.backend_db_connection: Optional[SqlAlchemyConnectionConfig] = None
@@ -127,11 +135,9 @@ class DfHelper:
         if not strategy_class: raise ValueError(f"Unsupported backend: {self.backend}")
         self.backend_strategy = strategy_class(self)
 
-    def __enter__(self):
-        return self
-
     def __exit__(self, exc_type, exc_value, traceback):
         self._cleanup()
+        super().__exit__(exc_type, exc_value, traceback)
 
     def _cleanup(self):
         active_config = getattr(self, f"backend_{self.backend}", None)
@@ -147,14 +153,14 @@ class DfHelper:
     def load(self, as_pandas=False, **options) -> Union[pd.DataFrame, dd.DataFrame]:
         """Loads data synchronously. Fails if backend is async-only."""
         self.logger.debug(f"Loading data from {self.backend} backend with options: {options}")
-        df = self.backend_strategy.load(**options)
+        self.total_records, df = self.backend_strategy.load(**options)
         df = self._process_loaded_data(df)
         df = self._post_process_df(df)
         return df.compute() if as_pandas else df
 
     async def aload(self, as_pandas=False, **options) -> Union[pd.DataFrame, dd.DataFrame]:
         """Loads data asynchronously from any backend."""
-        df = await self.backend_strategy.aload(**options)
+        self.total_records, df = await self.backend_strategy.aload(**options)
         df = self._process_loaded_data(df)
         df = self._post_process_df(df)
         return df.compute() if as_pandas else df
@@ -195,15 +201,27 @@ class DfHelper:
             return
         fs = kwargs.pop('fs', self.fs)
         path = kwargs.pop('parquet_storage_path', self.backend_parquet.parquet_storage_path)
-
-
+        writer_config = {
+            'df_result': df,
+            'parquet_storage_path': path,
+            'fs': fs,
+            'debug': self.debug,
+            'logger': self.logger,
+            'verbose': self.verbose,
+        }
+        with ParquetSaver(**writer_config) as saver:
+            saver.save_to_parquet(parquet_filename)
+
+        self.logger.debug(f"Parquet saved to {parquet_filename} in {path}.")
 
     def save_to_clickhouse(self, df: dd.DataFrame, **credentials):
         if hasattr(df, 'npartitions') and df.npartitions == 1 and not len(df.head(1)):
             self.logger.warning("Cannot write to ClickHouse; DataFrame is empty.")
             return
-
-        self.logger.
+
+        with ClickHouseWriter(debug=self.debug, logger=self.logger, verbose=self.verbose, **credentials) as writer:
+            writer.save_to_clickhouse(df)
+        self.logger.debug("Save to ClickHouse completed.")
 
     def load_period(self, dt_field: str, start: str, end: str, **kwargs) -> Union[pd.DataFrame, dd.DataFrame]:
         """Synchronous convenience method for loading a date range."""
@@ -229,4 +247,3 @@ class DfHelper:
         kwargs[f"{mapped_field}__date__range"] = [start_date, end_date]
         self.logger.debug(f"Period load generated filters: {kwargs}")
         return kwargs
-
```
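The net effect of this file's changes: every backend strategy now returns a `(total_records, dataframe)` tuple, which the `DfHelper` facade unpacks into `self.total_records` before post-processing, and `DfHelper` now inherits its context-manager, logger, filesystem, and debug wiring from `ManagedResource`. A minimal usage sketch follows; the backend keyword names inside `backend_kwargs` are illustrative assumptions, not taken from this diff.

```python
# Hedged usage sketch of the updated DfHelper facade.
from sibi_dst.df_helper import DfHelper

backend_kwargs = {
    "connection_url": "sqlite:///example.db",  # hypothetical parameter name
    "table": "orders",                         # hypothetical parameter name
}

with DfHelper(backend="sqlalchemy", debug=True, **backend_kwargs) as helper:
    # The facade's load() still returns just a DataFrame; the record count
    # reported by the backend strategy is stored on the helper instead.
    df = helper.load(as_pandas=False)
    print(helper.total_records)  # -1 signals a failed or uncounted load
```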
sibi_dst/df_helper/_parquet_artifact.py
CHANGED
```diff
@@ -1,13 +1,13 @@
+from __future__ import annotations
 import datetime
-import logging
 import threading
-from typing import Optional, Any, Dict
+from typing import Optional, Any, Dict, ClassVar
 
 import dask.dataframe as dd
 import fsspec
 
 from sibi_dst.df_helper import DfHelper
-from sibi_dst.utils import DataWrapper, DateUtils,
+from sibi_dst.utils import DataWrapper, DateUtils, UpdatePlanner
 from sibi_dst.utils import MissingManifestManager
 
 
@@ -54,18 +54,10 @@ class ParquetArtifact(DfHelper):
     :ivar fs: Filesystem object used for storage operations.
     :type fs: fsspec.AbstractFileSystem
     """
-    DEFAULT_CONFIG = {
+    DEFAULT_CONFIG: ClassVar[Dict[str, str]] = {
         'backend': 'parquet'
     }
 
-    # DEFAULT_UPDATE_PLANNER_CONFIG = {
-    #     'reverse_order': True,
-    #     'overwrite': False,
-    #     'ignore_missing': True,
-    #     'history_days_threshold': 30,
-    #     'max_age_minutes': 10,
-    #     'show_progress': False
-    # }
 
     def __init__(self, data_wrapper_class, **kwargs):
         """
@@ -97,7 +89,7 @@ class ParquetArtifact(DfHelper):
             **kwargs,
         }
         self.df: Optional[dd.DataFrame] = None
-        self.
+        super().__init__(**self.config)
         self.data_wrapper_class = data_wrapper_class
 
         self.date_field = self._validate_required('date_field')
@@ -106,37 +98,20 @@ class ParquetArtifact(DfHelper):
         self.parquet_start_date = self._validate_required('parquet_start_date')
         self.parquet_end_date = self._validate_required('parquet_end_date')
 
-        # Filesystem setup
-        self.filesystem_type = self.config.setdefault('filesystem_type', 'file')
-        self.filesystem_options = self.config.setdefault('filesystem_options', {})
-        self.fs = self.config.setdefault('fs', None)
-        self._own_fs = self.fs is None
-        if self.fs is None:
-            self.fs = fsspec.filesystem(self.filesystem_type, **self.filesystem_options)
-            self._own_fs = True
-        self.config.setdefault('fs', self.fs)
-        ## Populate to parameters to pass to data_wrapper_class
         self.class_params = self.config.pop('class_params', {
             'debug': self.debug,
             'logger': self.logger,
             'fs': self.fs,
+            'verbose': self.verbose,
         })
         # Populate parameters to pass to load method of DataWrapper class
         self.load_params = self.config.setdefault('load_params', {})
         # Ensure the directory exists
         self.ensure_directory_exists(self.parquet_storage_path)
-        super().__init__(**self.config)
+        #super().__init__(**self.config)
         self.update_planner_params = {}
         self.datawrapper_params = {}
 
-    def _setup_logging(self):
-        """Initialize logger and debug settings."""
-        self.debug = self.config.get('debug', False)
-        self.logger = self.config.get('logger',
-                                      Logger.default_logger(
-                                          logger_name=f'Parquet_Artifact_InstanceOf_{self.__class__.__name__}'))
-        self.logger.set_level(Logger.DEBUG if self.debug else Logger.INFO)
-
     def _validate_required(self, key: str) -> Any:
         """Validate required configuration fields."""
         value = self.config.setdefault(key, None)
@@ -150,7 +125,9 @@ class ParquetArtifact(DfHelper):
         self.mmanifest = MissingManifestManager(
             fs=self.fs,
             manifest_path=self.missing_manifest_path,
-            clear_existing=overwrite
+            clear_existing=overwrite,
+            debug= self.debug,
+            logger=self.logger
         )
 
         # Initialize skipped files
@@ -158,7 +135,7 @@ class ParquetArtifact(DfHelper):
         if not manifest_exists:
             self.logger.info(f"Creating new manifest at {self.missing_manifest_path}")
             self.mmanifest.save()
-            self.mmanifest.cleanup_temp_manifests()
+            #self.mmanifest.cleanup_temp_manifests()
         else:
             self.logger.info(f"Manifest already exists at {self.missing_manifest_path}")
 
@@ -172,7 +149,7 @@ class ParquetArtifact(DfHelper):
     def _setup_update_planner(self, **kwargs) -> None:
         self._prepare_update_params(**kwargs)
         self.update_planner = UpdatePlanner(**self.update_planner_params)
-        self.update_planner.generate_plan(self.start_date, self.end_date)
+        self.update_planner.generate_plan(start=self.start_date, end=self.end_date)
 
     def load(self, **kwargs):
         with self._lock:
@@ -197,36 +174,45 @@ class ParquetArtifact(DfHelper):
             with DataWrapper(self.data_wrapper_class, **params) as dw:
                 dw.process()
 
-    def __enter__(self):
-        if getattr(self, "_entered", False):
-            return self
-        self._entered = True
-        return self
-
     def __exit__(self, exc_type, exc_value, traceback):
         try:
             if self.mmanifest and self.mmanifest._new_records:
                 self.mmanifest.save()
-                self.mmanifest.cleanup_temp_manifests()
-            if getattr(self, "_entered", False) and self.fs and self._own_fs:
-                self.fs.close()
         except Exception as e:
             self.logger.warning(f"Error closing filesystem: {e}")
         finally:
-
+            super().__exit__(exc_type, exc_value, traceback)
         # return False so exceptions aren’t suppressed
         return False
 
-
-    def get_size_estimate(cls, parquet_path: str, **kwargs) -> int:
+    def get_size_estimate(self, **kwargs) -> int:
         """
-
-
+        Synchronously estimates artifact size for use in multi-threaded environments.
+
+        This method safely executes asynchronous I/O operations from a synchronous
+        context, handling variations in fsspec filesystem implementations.
         """
-
-
-
-
+
+        async def _get_total_bytes_async():
+            """A helper async coroutine to perform the I/O."""
+            import asyncio
+
+            files = await self.fs._glob(f"{self.parquet_storage_path}/*.parquet")
+            if not files:
+                return 0
+
+            size_tasks = [self.fs._size(f) for f in files]
+            sizes = await asyncio.gather(*size_tasks)
+            return sum(s for s in sizes if s is not None)
+
+        try:
+            # Attempt the standard fsspec method first
+            total_bytes = self.fs.sync(_get_total_bytes_async())
+        except AttributeError:
+            # fallback for filesystems like s3fs that lack .sync()
+            total_bytes = self.fs.loop.run_until_complete(_get_total_bytes_async())
+
+        # Convert to megabytes, ensuring a minimum of 1
        return max(1, int(total_bytes / (1024 ** 2)))
 
     def update_parquet(self, period: str = 'today', **kwargs) -> None:
@@ -316,6 +302,8 @@ class ParquetArtifact(DfHelper):
             'parquet_filename': self.parquet_filename,
             'data_path': self.parquet_storage_path,
             'fs': self.fs,
+            'debug': self.debug,
+            'logger': self.logger,
             'class_params': self.class_params,
             'date_field': self.date_field,
             'load_params': self.load_params,
```
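The new `get_size_estimate` shows a general pattern for driving fsspec's async internals (`_glob`, `_size`) from synchronous code, with a fallback when the filesystem does not expose a `sync()` helper. Below is a standalone sketch of the same pattern; it mirrors what the diff does rather than asserting a canonical fsspec API, and the commented-out filesystem and path at the bottom are illustrative.

```python
# Standalone sketch of the sync-over-async fsspec pattern used by get_size_estimate().
import asyncio
import fsspec


def estimate_parquet_megabytes(fs, storage_path: str) -> int:
    async def _total_bytes() -> int:
        # Async-capable filesystems expose coroutine variants of glob/size.
        files = await fs._glob(f"{storage_path}/*.parquet")
        if not files:
            return 0
        sizes = await asyncio.gather(*(fs._size(f) for f in files))
        return sum(s for s in sizes if s is not None)

    try:
        # Try the fs.sync() helper the diff uses first ...
        total = fs.sync(_total_bytes())
    except AttributeError:
        # ... and fall back to driving the filesystem's event loop directly,
        # exactly as the diff's fallback branch does.
        total = fs.loop.run_until_complete(_total_bytes())
    return max(1, int(total / (1024 ** 2)))


# fs = fsspec.filesystem("s3")                                   # hypothetical setup
# print(estimate_parquet_megabytes(fs, "s3://my-bucket/artifacts"))  # illustrative path
```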
sibi_dst/df_helper/_parquet_reader.py
CHANGED
```diff
@@ -1,11 +1,10 @@
 import logging
-from typing import Optional
+from typing import Optional, ClassVar, Dict
 
 import dask.dataframe as dd
 import fsspec
 
 from sibi_dst.df_helper import DfHelper
-from sibi_dst.utils import Logger
 
 class ParquetReader(DfHelper):
     """
@@ -44,7 +43,7 @@ class ParquetReader(DfHelper):
         Parquet storage.
     :type fs: fsspec.AbstractFileSystem
     """
-    DEFAULT_CONFIG = {
+    DEFAULT_CONFIG: ClassVar[Dict[str, int]] = {
         'backend': 'parquet'
     }
 
@@ -54,9 +53,9 @@ class ParquetReader(DfHelper):
             **kwargs,
         }
         self.df: Optional[dd.DataFrame] = None
-        self.debug = self.config.setdefault('debug', False)
-        self.logger = self.config.setdefault('logger', Logger.default_logger(logger_name=self.__class__.__name__))
-        self.logger.set_level(logging.DEBUG if self.debug else logging.INFO)
+        #self.debug = self.config.setdefault('debug', False)
+        #self.logger = self.config.setdefault('logger', Logger.default_logger(logger_name=self.__class__.__name__))
+        #self.logger.set_level(logging.DEBUG if self.debug else logging.INFO)
         self.parquet_storage_path = self.config.setdefault('parquet_storage_path', None)
         if self.parquet_storage_path is None:
             raise ValueError('parquet_storage_path must be set')
@@ -69,12 +68,12 @@ class ParquetReader(DfHelper):
             raise ValueError('parquet_end_date must be set')
 
         # Filesystem setup
-        self.filesystem_type = filesystem_type
-        self.filesystem_options = filesystem_options or {}
-        self.fs = self.config.setdefault('fs', None)
-        if self.fs is None:
-
-        self.config.setdefault('fs', self.fs)
+        #self.filesystem_type = filesystem_type
+        #self.filesystem_options = filesystem_options or {}
+        #self.fs = self.config.setdefault('fs', None)
+        #if self.fs is None:
+        #    self.fs = fsspec.filesystem(self.filesystem_type, **self.filesystem_options)
+        #self.config.setdefault('fs', self.fs)
 
         if not self.directory_exists():
             raise ValueError(f"{self.parquet_storage_path} does not exist")
@@ -92,7 +91,3 @@ class ParquetReader(DfHelper):
         except FileNotFoundError:
             return False
 
-    def __exit__(self, exc_type, exc_value, traceback):
-        # Ensure resources are cleaned up
-        if self.fs:
-            self.fs.close()
```
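With its own logger and filesystem bootstrapping commented out, `ParquetReader` now relies on the shared `ManagedResource` plumbing inherited through `DfHelper`, so callers pass an fsspec filesystem explicitly. A hedged construction sketch, assuming `ParquetReader` is re-exported from `sibi_dst.df_helper` (the path, dates, and filesystem below are illustrative):

```python
# Hedged construction sketch for the slimmed-down ParquetReader.
import fsspec
from sibi_dst.df_helper import ParquetReader  # import path assumed from the package layout

reader = ParquetReader(
    parquet_storage_path="/data/artifacts/orders",  # illustrative path
    parquet_start_date="2025-01-01",                # illustrative dates
    parquet_end_date="2025-01-31",
    fs=fsspec.filesystem("file"),
    debug=True,
)
df = reader.load()  # load() is inherited from DfHelper
```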
sibi_dst/df_helper/backends/parquet/_parquet_options.py
CHANGED
```diff
@@ -85,7 +85,8 @@ class ParquetConfig(BaseModel):
         if self.logger is None:
             self.logger = Logger.default_logger(logger_name=self.__class__.__name__)
         self.logger.set_level(Logger.DEBUG if self.debug else Logger.INFO)
-
+        if self.fs is None:
+            raise ValueError('Parquet Options: File system (fs) must be specified')
 
         if self.parquet_storage_path is None:
             raise ValueError('Parquet storage path must be specified')
```
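`ParquetConfig` now fails fast when no filesystem is supplied instead of deferring the problem to the first read. A minimal sketch of the intended calling convention, assuming the check runs as part of model validation on construction (only the `fs` and `parquet_storage_path` fields are taken from this diff; the path is illustrative):

```python
# Hedged sketch: pass an explicit fsspec filesystem to ParquetConfig.
import fsspec
from sibi_dst.df_helper.backends.parquet import ParquetConfig

cfg = ParquetConfig(
    fs=fsspec.filesystem("file"),           # required now; omitting it should raise ValueError
    parquet_storage_path="/tmp/artifacts",  # illustrative path
)
```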
sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py
CHANGED
```diff
@@ -1,5 +1,5 @@
 from __future__ import annotations
-
+import os
 import threading
 from contextlib import contextmanager
 from typing import Any, Optional, ClassVar, Generator, Type, Dict
@@ -49,10 +49,10 @@ class SqlAlchemyConnectionConfig(BaseModel):
     debug: bool = False
 
     # --- Pool Configuration ---
-    pool_size: int = 5
-    max_overflow: int = 10
-    pool_timeout: int = 30
-    pool_recycle: int = 1800
+    pool_size: int = int(os.environ.get("DB_POOL_SIZE", 5))
+    max_overflow: int = int(os.environ.get("DB_MAX_OVERFLOW",10))
+    pool_timeout: int = int(os.environ.get("DB_POOL_TIMEOUT", 30))
+    pool_recycle: int = int(os.environ.get("DB_POOL_RECYCLE", 1800))
     pool_pre_ping: bool = True
     poolclass: Type[QueuePool] = QueuePool
 
@@ -60,6 +60,7 @@ class SqlAlchemyConnectionConfig(BaseModel):
     model: Optional[Type[Any]] = None
     engine: Optional[Engine] = None
     logger: Optional[Logger] = None
+    _own_logger: bool = False  # Indicates if this instance owns the logger.
     session_factory: Optional[sessionmaker] = None
 
     # --- Private State ---
@@ -102,10 +103,13 @@ class SqlAlchemyConnectionConfig(BaseModel):
 
     def _init_logger(self) -> None:
         """Initializes the logger for this instance."""
+        # This is not a ManagedResource subclass, so we handle logger initialization directly.
+        # unless a logger is provided, we create our own.
         if self.logger is None:
+            self._own_logger = True
             self.logger = Logger.default_logger(logger_name=self.__class__.__name__)
-
-
+        log_level = Logger.DEBUG if self.debug else Logger.INFO
+        self.logger.set_level(log_level)
 
     def _get_engine_key(self) -> tuple:
         """Generates a unique, normalized key for an engine configuration."""
@@ -146,6 +150,8 @@ class SqlAlchemyConnectionConfig(BaseModel):
             self.logger.error(f"Failed to create engine: {e}")
             raise SQLAlchemyError(f"Engine creation failed: {e}") from e
 
+        #self.logger.debug(f"Connections Active: {self.active_connections}")
+
     def close(self) -> None:
         """
         Decrements the engine's reference count and disposes of the engine
@@ -176,8 +182,6 @@ class SqlAlchemyConnectionConfig(BaseModel):
         # Mark this instance as closed to prevent subsequent calls.
         self._closed = True
 
-    # ... (the rest of your methods like _attach_events, _on_checkout, get_session, etc. remain unchanged)
-    # They are omitted here for brevity but should be included in your final file.
 
     def _attach_events(self) -> None:
         """Attaches checkout/checkin events to the engine for connection tracking."""
@@ -191,7 +195,7 @@ class SqlAlchemyConnectionConfig(BaseModel):
         wrapper = self._engine_registry.get(self._engine_key_instance)
         if wrapper:
             wrapper['active_connections'] += 1
-            self.logger.debug(f"Connection checked out. Active: {self.active_connections}")
+            # self.logger.debug(f"Connection checked out. Active: {self.active_connections}")
 
     def _on_checkin(self, *args) -> None:
         """Event listener for when a connection is returned to the pool."""
@@ -199,7 +203,7 @@ class SqlAlchemyConnectionConfig(BaseModel):
         wrapper = self._engine_registry.get(self._engine_key_instance)
         if wrapper:
             wrapper['active_connections'] = max(0, wrapper['active_connections'] - 1)
-            self.logger.debug(f"Connection checked in. Active: {self.active_connections}")
+            # self.logger.debug(f"Connection checked in. Active: {self.active_connections}")
 
     @property
     def active_connections(self) -> int:
```
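Because the pool defaults are read with `os.environ.get` in the class body, they are resolved once, when the module is first imported. The variables therefore have to be set in the process environment before `sibi_dst` is imported; a minimal sketch (values are illustrative):

```python
# Hedged sketch: export the pool variables before importing sibi_dst, since the
# defaults are evaluated at import time.
import os

os.environ.setdefault("DB_POOL_SIZE", "20")
os.environ.setdefault("DB_MAX_OVERFLOW", "40")
os.environ.setdefault("DB_POOL_TIMEOUT", "60")
os.environ.setdefault("DB_POOL_RECYCLE", "900")

# Any SqlAlchemyConnectionConfig created after this import picks up the new defaults.
from sibi_dst.df_helper.backends.sqlalchemy import SqlAlchemyConnectionConfig  # noqa: E402
```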
sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py
CHANGED
```diff
@@ -1,5 +1,6 @@
+from __future__ import annotations
 
-from typing import Type
+from typing import Type, Any
 
 import dask
 import dask.dataframe as dd
@@ -13,11 +14,12 @@ from sqlalchemy.orm import declarative_base
 import time
 from sqlalchemy.exc import TimeoutError as SASQLTimeoutError, OperationalError
 import sqlalchemy as sa
+
+from sibi_dst.utils import ManagedResource
 from sibi_dst.df_helper.core import FilterHandler
-from sibi_dst.utils import Logger
 
 
-class SQLAlchemyDask:
+class SQLAlchemyDask(ManagedResource):
     """
     Loads data from a database into a Dask DataFrame using a memory-safe,
     non-parallel, paginated approach.
@@ -46,8 +48,7 @@ class SQLAlchemyDask:
         filters: dict,
         engine: Engine,
         chunk_size: int = 1000,
-
-        debug: bool = False,
+        **kwargs
     ):
         """
         Initializes the data loader.
@@ -60,14 +61,13 @@
             logger: A logger instance.
             debug: Whether to enable detailed logging.
         """
+        super().__init__(**kwargs)
         self.model = model
         self.filters = filters
         self.engine = engine
         self.chunk_size = chunk_size
-        self.debug = debug
-        self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
-        self.logger.set_level(Logger.DEBUG if debug else Logger.INFO)
         self.filter_handler_cls = FilterHandler
+        self.total_records = -1  # Initialize to -1 to indicate uncounted
 
     @classmethod
     def infer_meta_from_model(cls, model: Type[declarative_base()]) -> dict:
@@ -83,7 +83,7 @@ class SQLAlchemyDask:
             dtypes[column.name] = dtype
         return dtypes
 
-    def read_frame(self, fillna_value=None) ->
+    def read_frame(self, fillna_value=None) -> tuple[int | Any, Any] | Any:
         """
         Builds and executes a query to load data into a Dask DataFrame.
 
@@ -105,7 +105,8 @@ class SQLAlchemyDask:
             ).apply_filters(query, model=self.model, filters=self.filters)
         else:
             query = query.limit(self.chunk_size)
-        self.
+        if self.verbose:
+            self.logger.debug(f"Base query for pagination: {query}")
 
         # 2. Get metadata for the Dask DataFrame structure
         ordered_columns = [column.name for column in self.model.__table__.columns]
@@ -116,6 +117,7 @@ class SQLAlchemyDask:
 
         retry_attempts = 3
         backoff_factor = 0.5  # start with a 0.5-second delay
+        total_records = 0
 
         for attempt in range(retry_attempts):
             try:
@@ -134,11 +136,12 @@ class SQLAlchemyDask:
                     time.sleep(backoff_factor)
                     backoff_factor *= 2  # Double the backoff time for the next attempt
                 else:
+                    self.total_records = -1  # Indicate failure to count records
                     self.logger.error(
                         "Failed to get a connection from the pool after several retries.",
                         exc_info=True
                     )
-                    return dd.from_pandas(meta_df, npartitions=1)
+                    return self.total_records, dd.from_pandas(meta_df, npartitions=1)
             except OperationalError as oe:
                 # sometimes the DB driver wraps timeouts in OperationalError
                 if "timeout" in str(oe).lower():
@@ -147,15 +150,18 @@ class SQLAlchemyDask:
                     backoff_factor *= 2
                     continue
                 else:
+                    self.total_records = -1  # Indicate failure to count records
                     self.logger.error("OperationalError", exc_info=True)
-                    return dd.from_pandas(meta_df, npartitions=1)
+                    return self.total_records, dd.from_pandas(meta_df, npartitions=1)
             except Exception as e:
+                self.total_records = -1  # Indicate failure to count records
                 self.logger.error(f"An unexpected error occurred: {e}", exc_info=True)
-                return dd.from_pandas(meta_df, npartitions=1)
+                return self.total_records, dd.from_pandas(meta_df, npartitions=1)
 
+        self.total_records = total_records
         if total_records == 0:
             self.logger.warning("Query returned 0 records.")
-            return dd.from_pandas(meta_df, npartitions=1)
+            return self.total_records, dd.from_pandas(meta_df, npartitions=1)
 
         self.logger.debug(f"Total records to fetch: {total_records}. Chunk size: {self.chunk_size}.")
 
@@ -179,8 +185,9 @@ class SQLAlchemyDask:
         # 5. Construct the final lazy Dask DataFrame from the delayed chunks
         ddf = dd.from_delayed(delayed_chunks, meta=meta_df)
         self.logger.debug(f"Successfully created a lazy Dask DataFrame with {ddf.npartitions} partitions.")
-
-
+        if not self._entered:
+            super().cleanup()
+        return self.total_records, ddf
 
     ## Dask-Only Solution to test in better hardware
 
```
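Alongside the tuple return, `read_frame` keeps its retry loop around the count query: three attempts with an exponential backoff that starts at 0.5 s and doubles between attempts, and a sentinel of -1 when counting ultimately fails. The sketch below restates that pattern in isolation; `do_count` is a stand-in for the real SQLAlchemy count query, and `TimeoutError` stands in for the pool timeout and operational errors the diff handles.

```python
# Standalone sketch of the retry-with-exponential-backoff pattern used in read_frame().
import time


def count_with_retries(do_count, retry_attempts: int = 3, backoff_factor: float = 0.5):
    """Call do_count(), retrying transient timeouts with a doubling delay."""
    for attempt in range(retry_attempts):
        try:
            return do_count()
        except TimeoutError:
            if attempt == retry_attempts - 1:
                return -1  # sentinel: counting failed, mirroring total_records = -1 in the diff
            time.sleep(backoff_factor)
            backoff_factor *= 2  # double the delay, as the diff does
```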