sibi-dst 0.3.63__py3-none-any.whl → 2025.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sibi_dst/df_helper/_df_helper.py +186 -591
- sibi_dst/df_helper/backends/sqlalchemy/__init__.py +0 -2
- sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +161 -115
- sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +291 -97
- sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +34 -105
- sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +175 -162
- sibi_dst/df_helper/core/__init__.py +0 -4
- sibi_dst/df_helper/core/_defaults.py +1 -50
- sibi_dst/df_helper/core/_query_config.py +2 -2
- sibi_dst/utils/__init__.py +0 -2
- sibi_dst/utils/data_wrapper.py +9 -12
- sibi_dst/utils/log_utils.py +15 -11
- sibi_dst/utils/update_planner.py +2 -0
- sibi_dst/v2/df_helper/backends/sqlalchemy/_db_connection.py +325 -50
- sibi_dst/v2/df_helper/backends/sqlalchemy/_io_dask.py +2 -2
- sibi_dst/v2/df_helper/backends/sqlmodel/_db_connection.py +330 -51
- sibi_dst/v3/__init__.py +0 -0
- sibi_dst/v3/backends/__init__.py +0 -0
- sibi_dst/v3/df_helper/__init__.py +0 -0
- sibi_dst/v3/df_helper/_df_helper.py +91 -0
- sibi_dst-2025.1.1.dist-info/METADATA +55 -0
- {sibi_dst-0.3.63.dist-info → sibi_dst-2025.1.1.dist-info}/RECORD +23 -26
- sibi_dst/df_helper/backends/django/__init__.py +0 -11
- sibi_dst/df_helper/backends/django/_db_connection.py +0 -88
- sibi_dst/df_helper/backends/django/_io_dask.py +0 -450
- sibi_dst/df_helper/backends/django/_load_from_db.py +0 -227
- sibi_dst/df_helper/backends/django/_sql_model_builder.py +0 -493
- sibi_dst/df_helper/backends/sqlalchemy/_filter_handler.py +0 -119
- sibi_dst/utils/airflow_manager.py +0 -212
- sibi_dst-0.3.63.dist-info/METADATA +0 -90
- {sibi_dst-0.3.63.dist-info → sibi_dst-2025.1.1.dist-info}/WHEEL +0 -0
sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py
@@ -1,135 +1,329 @@
-import itertools

+from typing import Type
+
+import dask
import dask.dataframe as dd
import pandas as pd
-from sqlalchemy import
-
-
+from sqlalchemy import (
+    inspect,
+    select
+)
+from sqlalchemy.engine import Engine
+from sqlalchemy.orm import declarative_base
+import time
+from sqlalchemy.exc import TimeoutError as SASQLTimeoutError, OperationalError
+import sqlalchemy as sa
from sibi_dst.df_helper.core import FilterHandler
from sibi_dst.utils import Logger


class SQLAlchemyDask:
-
+    """
+    Loads data from a database into a Dask DataFrame using a memory-safe,
+    non-parallel, paginated approach.
+
+    This class avoids using a numeric `index_col` for parallel loading.
+    """
+
+    _SQLALCHEMY_TO_DASK_DTYPE = {
+        "INTEGER": "Int64",
+        "SMALLINT": "Int64",
+        "BIGINT": "Int64",
+        "FLOAT": "float64",
+        "NUMERIC": "float64",
+        "BOOLEAN": "bool",
+        "VARCHAR": "object",
+        "TEXT": "object",
+        "DATE": "datetime64[ns]",
+        "DATETIME": "datetime64[ns]",
+        "TIME": "object",
+        "UUID": "object",
+    }
+
+    def __init__(
+        self,
+        model: Type[declarative_base()],
+        filters: dict,
+        engine: Engine,
+        chunk_size: int = 1000,
+        logger=None,
+        debug: bool = False,
+    ):
        """
-
-
-        :
-
-
-
-
+        Initializes the data loader.
+
+        Args:
+            model: The SQLAlchemy ORM model for the table.
+            filters: A dictionary of filters to apply to the query.
+            engine: An SQLAlchemy Engine instance.
+            chunk_size: The number of records to fetch in each database query.
+            logger: A logger instance.
+            debug: Whether to enable detailed logging.
        """
-        self.query = None
        self.model = model
        self.filters = filters
+        self.engine = engine
        self.chunk_size = chunk_size
        self.debug = debug
-        self.engine = create_engine(engine_url)
-        self.Session = sessionmaker(bind=self.engine)
        self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
-        self.logger.set_level(
+        self.logger.set_level(Logger.DEBUG if debug else Logger.INFO)
+        self.filter_handler_cls = FilterHandler

-    @
-    def
+    @classmethod
+    def infer_meta_from_model(cls, model: Type[declarative_base()]) -> dict:
        """
-
+        Infers a metadata dictionary for Dask based on the SQLAlchemy model.
+        This helps Dask understand the DataFrame structure without reading data.
        """
        mapper = inspect(model)
-        sqlalchemy_to_dask_dtype = {
-            'INTEGER': 'Int64',
-            'SMALLINT': 'Int64',
-            'BIGINT': 'Int64',
-            'FLOAT': 'float64',
-            'NUMERIC': 'float64',
-            'BOOLEAN': 'bool',
-            'VARCHAR': 'object',
-            'TEXT': 'object',
-            'DATE': 'datetime64[ns]',
-            'DATETIME': 'datetime64[ns]',
-            'TIME': 'object',
-            'UUID': 'object',
-        }
-
        dtypes = {}
        for column in mapper.columns:
-
+            dtype_str = str(column.type).upper().split("(")[0]
+            dtype = cls._SQLALCHEMY_TO_DASK_DTYPE.get(dtype_str, "object")
            dtypes[column.name] = dtype
-
        return dtypes

-    def read_frame(self, fillna_value=None):
+    def read_frame(self, fillna_value=None) -> dd.DataFrame:
        """
-
+        Builds and executes a query to load data into a Dask DataFrame.
+
+        This method works by first running a COUNT query to get the total
+        size, then creating a series of delayed tasks that each fetch a
+        chunk of data using LIMIT/OFFSET.

-        :
-
+        Args:
+            fillna_value: Value to replace NaN or NULL values with, if any.
+
+        Returns:
+            A lazy Dask DataFrame.
        """
-
+        # 1. Build the base query and apply filters
+        query = select(self.model)
+        if self.filters:
+            query = self.filter_handler_cls(
+                backend="sqlalchemy", logger=self.logger, debug=self.debug
+            ).apply_filters(query, model=self.model, filters=self.filters)
+        else:
+            query = query.limit(self.chunk_size)
+        self.logger.debug(f"Base query for pagination: {query}")
+
+        # 2. Get metadata for the Dask DataFrame structure
+        ordered_columns = [column.name for column in self.model.__table__.columns]
+        meta_dtypes = self.infer_meta_from_model(self.model)
+        meta_df = pd.DataFrame(columns=ordered_columns).astype(meta_dtypes)
+
+        # 3. Get the total record count to calculate the number of chunks
+
+        retry_attempts = 3
+        backoff_factor = 0.5  # start with a 0.5-second delay
+
+        for attempt in range(retry_attempts):
            try:
-
-
-
-
-
-
+                with self.engine.connect() as connection:
+                    count_query = sa.select(sa.func.count()).select_from(query.alias())
+                    total_records = connection.execute(count_query).scalar_one()
+
+                # If successful, break the loop
+                break
+
+            except SASQLTimeoutError:
+                if attempt < retry_attempts - 1:
+                    self.logger.warning(
+                        f"Connection pool limit reached. Retrying in {backoff_factor} seconds..."
+                    )
+                    time.sleep(backoff_factor)
+                    backoff_factor *= 2  # Double the backoff time for the next attempt
                else:
-
-
-
-                # Infer dtypes
-                dtypes = self.infer_dtypes_from_model(self.model)
-                # Get the column order from the SQLAlchemy model
-                ordered_columns = [column.name for column in self.model.__table__.columns]
-
-                # Execute query and fetch results in chunks
-                result_proxy = session.execute(self.query)
-                results = result_proxy.scalars().all()  # Fetch all rows
-                iterator = iter(results)
-
-                partitions = []
-
-                while True:
-                    chunk = list(itertools.islice(iterator, self.chunk_size))
-                    if not chunk:
-                        break
-
-                    # Convert chunk to Pandas DataFrame
-                    df = pd.DataFrame.from_records(
-                        [row._asdict() if hasattr(row, '_asdict') else row.__dict__ for row in chunk]
+                    self.logger.error(
+                        "Failed to get a connection from the pool after several retries.",
+                        exc_info=True
                    )
-
-
+                    return dd.from_pandas(meta_df, npartitions=1)
+            except OperationalError as oe:
+                # sometimes the DB driver wraps timeouts in OperationalError
+                if "timeout" in str(oe).lower():
+                    self.logger.warning("OperationalTimeout, retrying…", exc_info=True)
+                    time.sleep(backoff_factor)
+                    backoff_factor *= 2
+                    continue
+                else:
+                    self.logger.error("OperationalError", exc_info=True)
+                    return dd.from_pandas(meta_df, npartitions=1)
+            except Exception as e:
+                self.logger.error(f"An unexpected error occurred: {e}", exc_info=True)
+                return dd.from_pandas(meta_df, npartitions=1)

-
-
+        if total_records == 0:
+            self.logger.warning("Query returned 0 records.")
+            return dd.from_pandas(meta_df, npartitions=1)

-
-                    if fillna_value is not None:
-                        df = df.fillna(fillna_value)
+        self.logger.debug(f"Total records to fetch: {total_records}. Chunk size: {self.chunk_size}.")

-
-
-
-
+        # 4. Create a list of Dask Delayed objects, one for each chunk
+        @dask.delayed
+        def get_chunk(sql_query, chunk_offset):
+            """A Dask-delayed function to fetch one chunk of data."""
+            # LIMIT/OFFSET must be applied in the delayed function
+            paginated_query = sql_query.limit(self.chunk_size).offset(chunk_offset)
+            df = pd.read_sql(paginated_query, self.engine)

-
-
-                    # Create a Dask partition
-                    partitions.append(dd.from_pandas(df, npartitions=1))
+            if fillna_value is not None:
+                df = df.fillna(fillna_value)

-
-
-                dask_df = dd.concat(partitions, axis=0, ignore_index=True)
-            else:
-                dask_df = dd.from_pandas(pd.DataFrame(columns=ordered_columns), npartitions=1)
+            # Ensure column order and types match the meta
+            return df[ordered_columns].astype(meta_dtypes)

-
+        offsets = range(0, total_records, self.chunk_size)
+        delayed_chunks = [get_chunk(query, offset) for offset in offsets]

-
+        # 5. Construct the final lazy Dask DataFrame from the delayed chunks
+        ddf = dd.from_delayed(delayed_chunks, meta=meta_df)
+        self.logger.debug(f"Successfully created a lazy Dask DataFrame with {ddf.npartitions} partitions.")

-
-
-
-
+        return ddf
+
+## Dask-Only Solution to test in better hardware
+
+# from typing import Type, Dict, Any
+# import math
+# import time
+# import pandas as pd
+# import dask
+# import dask.dataframe as dd
+#
+# import sqlalchemy as sa
+# from sqlalchemy import select, func
+# from sqlalchemy.engine import Engine
+# from sqlalchemy.exc import TimeoutError as SASQLTimeoutError, OperationalError
+# from sqlalchemy.orm import declarative_base
+#
+# from sibi_dst.df_helper.core import FilterHandler
+# from sibi_dst.utils import Logger
+#
+#
+# class SQLAlchemyDask:
+#     """
+#     Loads data into a Dask DataFrame. If there’s exactly one integer PK,
+#     use dask.dataframe.read_sql_table; otherwise fall back to offset‐based
+#     pagination pushed into dask.delayed to keep memory use minimal.
+#     """
+#
+#     def __init__(
+#         self,
+#         model: Type[declarative_base()],
+#         filters: Dict[str, Any],
+#         engine: Engine,
+#         chunk_size: int = 1_000,
+#         logger=None,
+#         debug: bool = False,
+#     ):
+#         self.model = model
+#         self.filters = filters or {}
+#         self.engine = engine
+#         self.chunk_size = chunk_size
+#         self.logger = logger or Logger.default_logger(self.__class__.__name__)
+#         self.logger.set_level(Logger.DEBUG if debug else Logger.INFO)
+#         self.filter_handler_cls = FilterHandler
+#         self.debug = debug
+#
+#     def read_frame(self, fillna_value=None) -> dd.DataFrame:
+#         # 1) Build base query + filters
+#         base_q = select(self.model)
+#         if self.filters:
+#             base_q = self.filter_handler_cls(
+#                 backend="sqlalchemy",
+#                 logger=self.logger,
+#                 debug=self.debug,
+#             ).apply_filters(base_q, model=self.model, filters=self.filters)
+#
+#         # 2) Zero-row meta for dtype inference
+#         meta = pd.read_sql_query(base_q.limit(0), self.engine).iloc[:0]
+#         if meta.shape[1] == 0:
+#             self.logger.warning("No columns detected; returning empty DataFrame.")
+#             return dd.from_pandas(meta, npartitions=1)
+#
+#         # 3) Single‐PK parallel path?
+#         pk_cols = list(self.model.__table__.primary_key.columns)
+#         if (
+#             len(pk_cols) == 1
+#             and pd.api.types.is_integer_dtype(meta[pk_cols[0].name])
+#         ):
+#             try:
+#                 return self._ddf_via_read_sql_table(pk_cols[0], meta, fillna_value)
+#             except Exception:
+#                 self.logger.warning(
+#                     "read_sql_table path failed, falling back to offset pagination",
+#                     exc_info=True,
+#                 )
+#
+#         # 4) Composite PK or fallback → offset pagination in delayed tasks
+#         return self._offset_paginated_ddf(base_q, meta, fillna_value)
+#
+#     def _offset_paginated_ddf(self, base_q, meta, fillna):
+#         # 1) count total rows
+#         try:
+#             with self.engine.connect() as conn:
+#                 total = conn.execute(
+#                     select(func.count()).select_from(base_q.alias())
+#                 ).scalar_one()
+#         except Exception:
+#             self.logger.error("Failed to count records; returning empty DataFrame", exc_info=True)
+#             return dd.from_pandas(meta, npartitions=1)
+#
+#         if total == 0:
+#             self.logger.warning("Query returned 0 records.")
+#             return dd.from_pandas(meta, npartitions=1)
+#         self.logger.debug(f"Total records to fetch: {total}. Chunk size: {self.chunk_size}.")
+#         # 2) create delayed tasks per offset
+#         @dask.delayed
+#         def _fetch_chunk(offset: int) -> pd.DataFrame:
+#             q = base_q.limit(self.chunk_size).offset(offset)
+#             df = pd.read_sql_query(q, self.engine)
+#             if fillna is not None:
+#                 df = df.fillna(fillna)
+#             return df[meta.columns].astype(meta.dtypes.to_dict())
+#
+#         offsets = range(0, total, self.chunk_size)
+#         parts = [_fetch_chunk(off) for off in offsets]
+#
+#         ddf = dd.from_delayed(parts, meta=meta)
+#         self.logger.debug(f"Offset‐paginated read → {len(parts)} partitions")
+#         return ddf
+#
+#     def _ddf_via_read_sql_table(self, pk_col, meta, fillna) -> dd.DataFrame:
+#         # same as before: min/max + dd.read_sql_table
+#         backoff = 0.5
+#         for attempt in range(3):
+#             try:
+#                 with self.engine.connect() as conn:
+#                     min_id, max_id = conn.execute(
+#                         select(func.min(pk_col), func.max(pk_col))
+#                         .select_from(self.model.__table__)
+#                     ).one()
+#                 break
+#             except (SASQLTimeoutError, OperationalError) as e:
+#                 if "timeout" in str(e).lower() and attempt < 2:
+#                     self.logger.warning(f"Timeout fetching PK bounds; retrying in {backoff}s")
+#                     time.sleep(backoff)
+#                     backoff *= 2
+#                 else:
+#                     raise
+#
+#         if min_id is None or max_id is None:
+#             self.logger.warning("Table empty—no PK bounds.")
+#             return dd.from_pandas(meta, npartitions=1)
+#
+#         total = max_id - min_id + 1
+#         nparts = max(1, math.ceil(total / self.chunk_size))
+#         ddf = dd.read_sql_table(
+#             table=self.model.__table__.name,
+#             uri=str(self.engine.url),
+#             index_col=pk_col.name,
+#             limits=(min_id, max_id),
+#             npartitions=nparts,
+#             columns=list(meta.columns),
+#         )
+#         if fillna is not None:
+#             ddf = ddf.fillna(fillna)
+#         self.logger.debug(f"Parallel read via dask.read_sql_table → {nparts} partitions")
+#         return ddf
sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py
@@ -3,143 +3,72 @@ import pandas as pd

from sibi_dst.df_helper.core import ParamsConfig, QueryConfig
from sibi_dst.utils import Logger
-from ._io_dask import SQLAlchemyDask
from ._db_connection import SqlAlchemyConnectionConfig
+from ._io_dask import SQLAlchemyDask


class SqlAlchemyLoadFromDb:
    """
-
-
-    large datasets efficiently by utilizing the Dask framework for parallel
-    computations.
-
-    This class is initialized with a database connection configuration, query
-    configuration, optional parameters, and a logger. It can execute a query
-    using the specified configurations and read the results into a Dask
-    DataFrame. This is useful for processing and analyzing large-scale data.
-
-    :ivar df: Dask DataFrame to store the loaded data.
-    :type df: dd.DataFrame
-    :ivar db_connection: Database connection configuration object, containing details
-        such as the table, model, and engine to be used for the query.
-    :type db_connection: SqlAlchemyConnectionConfig
-    :ivar table_name: Name of the database table being queried.
-    :type table_name: str
-    :ivar model: SQLAlchemy model associated with the database connection.
-    :type model: sqlalchemy.ext.declarative.api.DeclarativeMeta
-    :ivar engine: SQLAlchemy engine used for executing queries.
-    :type engine: sqlalchemy.engine.base.Engine
-    :ivar logger: Logger instance for logging debug and error information.
-    :type logger: Logger
-    :ivar query_config: Query configuration, including query-related details such
-        as the SQL query or query settings.
-    :type query_config: QueryConfig
-    :ivar params_config: Parameters configuration, including filter parameters for
-        the query.
-    :type params_config: ParamsConfig
-    :ivar debug: Debug flag indicating whether debug mode is enabled.
-    :type debug: bool
-    :ivar chunk_size: Size of data chunks to process at a time.
-    :type chunk_size: int
+    Orchestrates loading data from a database using SQLAlchemy into a Dask
+    DataFrame by configuring and delegating to the SQLAlchemyDask loader.
    """
-    df: dd.DataFrame = None

    def __init__(
        self,
-        plugin_sqlalchemy: SqlAlchemyConnectionConfig,
+        plugin_sqlalchemy: SqlAlchemyConnectionConfig,
        plugin_query: QueryConfig = None,
        plugin_params: ParamsConfig = None,
        logger: Logger = None,
        **kwargs,
    ):
        """
-        Initializes
-
-
-
-
-
-
-
-            associated with the database operations.
-        :param plugin_query:
-            The query configuration object, used to define specific query
-            options or rules. Defaults to None.
-        :param plugin_params:
-            The parameters configuration object, used for any additional
-            parameterized settings or configurations. Defaults to None.
-        :param logger:
-            Optional logger instance for logging purposes. If not provided,
-            a default logger is instantiated using the standard logging system.
-        :param kwargs:
-            Optional additional keyword arguments for customization. Can
-            include optional settings like `debug` mode or `chunk_size`
-            for batch operations.
+        Initializes the loader with all necessary configurations.
+
+        Args:
+            plugin_sqlalchemy: The database connection configuration object.
+            plugin_query: The query configuration object.
+            plugin_params: The parameters and filters configuration object.
+            logger: An optional logger instance.
+            **kwargs: Must contain 'index_column' for Dask partitioning.
        """
        self.db_connection = plugin_sqlalchemy
-        self.table_name = self.db_connection.table
        self.model = self.db_connection.model
        self.engine = self.db_connection.engine
        self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
        self.query_config = plugin_query
        self.params_config = plugin_params
-        self.debug = kwargs.
-        self.chunk_size = kwargs.
+        self.debug = kwargs.get("debug", False)
+        self.chunk_size = kwargs.get("chunk_size", self.params_config.df_params.get("chunk_size", 1000))

    def build_and_load(self) -> dd.DataFrame:
        """
-        Builds and
-        build and load function. This method triggers the `_build_and_load`
-        function to process and prepare the data before returning it as
-        a dask dataframe.
+        Builds and loads a Dask DataFrame from a SQLAlchemy source.

-
+        This method is stateless and returns the DataFrame directly.

-        :
-
-
-        self._build_and_load()
-        return self.df
-
-    def _build_and_load(self) -> dd.DataFrame:
-        """
-        Builds and loads a Dask DataFrame from a SQLAlchemy-compatible source.
-
-        This method initializes a SQLAlchemyDask object with the provided model,
-        filters, engine URL, logger, chunk size, and debug configuration.
-        It attempts to load the data using the ``read_frame`` method of
-        SQLAlchemyDask. If the data cannot be loaded or the query returns
-        no rows, it creates and returns an empty Dask DataFrame.
-
-        :raises Exception: On failure to load data or to create a DataFrame.
-
-        :return: A Dask DataFrame object containing the queried data or an
-            empty DataFrame if the query returns no results or fails.
-        :rtype: dask.dataframe.DataFrame
+        Returns:
+            A Dask DataFrame containing the queried data or an empty,
+            correctly structured DataFrame if the query fails or returns no results.
        """
        try:
-
+            # Instantiate and use the low-level Dask loader
+            sqlalchemy_dask_loader=SQLAlchemyDask(
                model=self.model,
-                filters=self.params_config.filters,
-
-                logger=self.logger,
+                filters=self.params_config.filters if self.params_config else {},
+                engine=self.engine,
                chunk_size=self.chunk_size,
+                logger=self.logger,
                debug=self.debug
-            )
-
-
-
-            dask_df = dd.from_pandas(pd.DataFrame(), npartitions=1)
+            )
+            # Create the lazy DataFrame
+            dask_df = sqlalchemy_dask_loader.read_frame()
+            return dask_df

-            return dask_df

-            return self.df
-        except RuntimeError as e:
-            self.logger.info(f"Runtime Error {e}:Failed to load data into Dask DataFrame.")
-            dask_df = dd.from_pandas(pd.DataFrame(), npartitions=1)
-            return dask_df
        except Exception as e:
-            self.logger.
-
-
+            self.logger.error(f"Failed to build and load data: {e}", exc_info=True)
+            # Return an empty dataframe with the correct schema on failure
+            columns = [c.name for c in self.model.__table__.columns]
+            return dd.from_pandas(pd.DataFrame(columns=columns), npartitions=1)
+
+
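
In this hunk, `build_and_load` becomes stateless (no more `self.df` or `_build_and_load`), and its error path now returns an empty DataFrame that still carries the model's column schema instead of a bare `dd.from_pandas(pd.DataFrame(), npartitions=1)`. A small stand-alone sketch of that fallback pattern follows; the `Product` model is a hypothetical stand-in used only to show the shape of the returned frame.

```python
# Stand-alone sketch of the new schema-preserving fallback in build_and_load().
# The Product model is hypothetical and not part of the sibi_dst package.
import dask.dataframe as dd
import pandas as pd
from sqlalchemy import Column, Integer, String
from sqlalchemy.orm import declarative_base

Base = declarative_base()


class Product(Base):
    __tablename__ = "products"
    id = Column(Integer, primary_key=True)
    sku = Column(String(32))


# Mirrors the new except-branch: keep column names even when no data loads.
columns = [c.name for c in Product.__table__.columns]
empty_ddf = dd.from_pandas(pd.DataFrame(columns=columns), npartitions=1)
print(list(empty_ddf.columns))  # ['id', 'sku']
```

Keeping the column schema on failure lets downstream Dask operations (column selection, concatenation with other partitions) proceed without KeyErrors, which appears to be the motivation for the change.
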