sibi-dst 2025.1.12__py3-none-any.whl → 2025.8.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sibi_dst/__init__.py +7 -1
- sibi_dst/df_helper/_artifact_updater_multi_wrapper.py +235 -342
- sibi_dst/df_helper/_df_helper.py +417 -117
- sibi_dst/df_helper/_parquet_artifact.py +255 -283
- sibi_dst/df_helper/backends/parquet/_parquet_options.py +8 -4
- sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +68 -107
- sibi_dst/df_helper/backends/sqlalchemy/_db_gatekeeper.py +15 -0
- sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +105 -255
- sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +90 -42
- sibi_dst/df_helper/backends/sqlalchemy/_model_registry.py +192 -0
- sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +122 -72
- sibi_dst/osmnx_helper/__init__.py +1 -0
- sibi_dst/osmnx_helper/basemaps/route_map_plotter.py +203 -0
- sibi_dst/osmnx_helper/route_path_builder.py +97 -0
- sibi_dst/osmnx_helper/utils.py +2 -0
- sibi_dst/utils/base.py +302 -96
- sibi_dst/utils/clickhouse_writer.py +472 -206
- sibi_dst/utils/data_utils.py +139 -186
- sibi_dst/utils/data_wrapper.py +317 -73
- sibi_dst/utils/date_utils.py +1 -0
- sibi_dst/utils/df_utils.py +193 -213
- sibi_dst/utils/file_utils.py +3 -2
- sibi_dst/utils/filepath_generator.py +314 -152
- sibi_dst/utils/log_utils.py +581 -242
- sibi_dst/utils/manifest_manager.py +60 -76
- sibi_dst/utils/parquet_saver.py +33 -27
- sibi_dst/utils/phone_formatter.py +88 -95
- sibi_dst/utils/update_planner.py +180 -178
- sibi_dst/utils/webdav_client.py +116 -166
- {sibi_dst-2025.1.12.dist-info → sibi_dst-2025.8.1.dist-info}/METADATA +1 -1
- {sibi_dst-2025.1.12.dist-info → sibi_dst-2025.8.1.dist-info}/RECORD +32 -28
- {sibi_dst-2025.1.12.dist-info → sibi_dst-2025.8.1.dist-info}/WHEEL +0 -0
```diff
--- sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py (2025.1.12)
+++ sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py (2025.8.1)
@@ -1,33 +1,29 @@
 from __future__ import annotations
 
-
+import time
+from typing import Any, Dict, Tuple, Type
 
 import dask
 import dask.dataframe as dd
 import pandas as pd
-
-
-    select
-)
+import sqlalchemy as sa
+from sqlalchemy import select, inspect
 from sqlalchemy.engine import Engine
-from sqlalchemy.orm import declarative_base
-import time
 from sqlalchemy.exc import TimeoutError as SASQLTimeoutError, OperationalError
-
+from sqlalchemy.orm import declarative_base
 
 from sibi_dst.utils import ManagedResource
 from sibi_dst.df_helper.core import FilterHandler
+from ._db_gatekeeper import DBGatekeeper
 
 
 class SQLAlchemyDask(ManagedResource):
     """
     Loads data from a database into a Dask DataFrame using a memory-safe,
-    non-parallel, paginated approach.
-
-    This class avoids using a numeric `index_col` for parallel loading.
+    non-parallel, paginated approach (LIMIT/OFFSET).
     """
 
-    _SQLALCHEMY_TO_DASK_DTYPE = {
+    _SQLALCHEMY_TO_DASK_DTYPE: Dict[str, str] = {
         "INTEGER": "Int64",
         "SMALLINT": "Int64",
         "BIGINT": "Int64",
```
```diff
@@ -38,66 +34,85 @@ class SQLAlchemyDask(ManagedResource):
         "TEXT": "object",
         "DATE": "datetime64[ns]",
         "DATETIME": "datetime64[ns]",
+        "TIMESTAMP": "datetime64[ns]",
         "TIME": "object",
         "UUID": "object",
     }
 
     def __init__(
-
-
-
-
-
-
+        self,
+        model: Type[declarative_base()],
+        filters: Dict[str, Any],
+        engine: Engine,
+        chunk_size: int = 1000,
+        **kwargs: Any,
     ):
-        """
-        Initializes the data loader.
-
-        Args:
-            model: The SQLAlchemy ORM model for the table.
-            filters: A dictionary of filters to apply to the query.
-            engine: An SQLAlchemy Engine instance.
-            chunk_size: The number of records to fetch in each database query.
-            logger: A logger instance.
-            debug: Whether to enable detailed logging.
-        """
         super().__init__(**kwargs)
         self.model = model
-        self.filters = filters
+        self.filters = filters or {}
         self.engine = engine
-        self.chunk_size = chunk_size
+        self.chunk_size = int(chunk_size)
         self.filter_handler_cls = FilterHandler
-        self.total_records = -1
+        self.total_records: int = -1  # -1 indicates failure/unknown
+        self._sem = DBGatekeeper.get(str(engine.url), max_concurrency=self._safe_cap())
 
-
-    def infer_meta_from_model(cls, model: Type[declarative_base()]) -> dict:
+    def _safe_cap(self) -> int:
         """
-
-
+        Calculate a safe concurrency cap for DB work based on the engine's pool.
+
+        Returns: max(1, pool_size + max_overflow - 1)
+        - Works across SQLAlchemy 1.4/2.x
+        - Tolerates pools that expose size/max_overflow as methods or attrs
+        - Allows explicit override via self.db_gatekeeper_cap (if you pass it)
         """
+        # optional explicit override
+        explicit = getattr(self, "db_gatekeeper_cap", None)
+        if isinstance(explicit, int) and explicit > 0:
+            return explicit
+
+        pool = getattr(self.engine, "pool", None)
+
+        def _to_int(val, default):
+            if val is None:
+                return default
+            if callable(val):
+                try:
+                    return int(val())  # e.g., pool.size()
+                except Exception:
+                    return default
+            try:
+                return int(val)
+            except Exception:
+                return default
+
+        # size: QueuePool.size() -> int
+        size_candidate = getattr(pool, "size", None)  # method on QueuePool
+        pool_size = _to_int(size_candidate, 5)
+
+        # max_overflow: prefer attribute; fall back to private _max_overflow; avoid 'overflow()' (method)
+        max_overflow_attr = (
+            getattr(pool, "max_overflow", None) or  # SQLAlchemy 2.x QueuePool
+            getattr(pool, "_max_overflow", None)    # private fallback
+        )
+        max_overflow = _to_int(max_overflow_attr, 10)
+
+        cap = max(1, pool_size + max_overflow - 1)
+        self.logger.debug(f"Using a Cap of {cap} from pool size of {pool_size} and max overflow of {max_overflow}.")
+        return max(1, cap)
+
+    # ---------- meta ----------
+    @classmethod
+    def infer_meta_from_model(cls, model: Type[declarative_base()]) -> Dict[str, str]:
         mapper = inspect(model)
-        dtypes = {}
+        dtypes: Dict[str, str] = {}
         for column in mapper.columns:
             dtype_str = str(column.type).upper().split("(")[0]
             dtype = cls._SQLALCHEMY_TO_DASK_DTYPE.get(dtype_str, "object")
             dtypes[column.name] = dtype
         return dtypes
 
-    def read_frame(self, fillna_value=None) ->
-
-        Builds and executes a query to load data into a Dask DataFrame.
-
-        This method works by first running a COUNT query to get the total
-        size, then creating a series of delayed tasks that each fetch a
-        chunk of data using LIMIT/OFFSET.
-
-        Args:
-            fillna_value: Value to replace NaN or NULL values with, if any.
-
-        Returns:
-            A lazy Dask DataFrame.
-        """
-        # 1. Build the base query and apply filters
+    def read_frame(self, fillna_value=None) -> Tuple[int, dd.DataFrame]:
+        # Base selectable
         query = select(self.model)
         if self.filters:
             query = self.filter_handler_cls(
```
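The `_safe_cap` helper above reduces to one formula: cap = max(1, pool_size + max_overflow - 1), i.e. one less than the pool's total capacity, so one pooled connection always stays free. Below is a minimal standalone sketch of that idea; the `BoundedSemaphore` only stands in for `DBGatekeeper`, whose real implementation lives in the new `_db_gatekeeper.py` and is not shown in this diff.

```python
import threading


def safe_cap(pool_size: int = 5, max_overflow: int = 10) -> int:
    # Same arithmetic as SQLAlchemyDask._safe_cap, with SQLAlchemy's defaults.
    return max(1, pool_size + max_overflow - 1)


cap = safe_cap(5, 10)                 # -> 14
gate = threading.BoundedSemaphore(cap)  # hypothetical stand-in for DBGatekeeper

with gate:     # each DB operation acquires the gate, as `with self._sem:` does above
    pass       # run the query here
print(cap)
```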
```diff
@@ -105,232 +120,67 @@ class SQLAlchemyDask(ManagedResource):
             ).apply_filters(query, model=self.model, filters=self.filters)
         else:
             query = query.limit(self.chunk_size)
-        if self.verbose:
-            self.logger.debug(f"Base query for pagination: {query}")
 
-        #
-        ordered_columns = [
+        # Meta dataframe (stable column order & dtypes)
+        ordered_columns = [c.name for c in self.model.__table__.columns]
         meta_dtypes = self.infer_meta_from_model(self.model)
         meta_df = pd.DataFrame(columns=ordered_columns).astype(meta_dtypes)
 
-        #
-
+        # Count with retry/backoff
         retry_attempts = 3
-
-
+        backoff = 0.5
+        total = 0
 
         for attempt in range(retry_attempts):
             try:
-                with self.
-
-
-
-
-                    break
-
+                with self._sem:
+                    with self.engine.connect() as connection:
+                        count_q = sa.select(sa.func.count()).select_from(query.alias())
+                        total = connection.execute(count_q).scalar_one()
+                break
             except SASQLTimeoutError:
                 if attempt < retry_attempts - 1:
-                    self.logger.warning(
-
-
-                    time.sleep(backoff_factor)
-                    backoff_factor *= 2  # Double the backoff time for the next attempt
+                    self.logger.warning(f"Connection pool limit reached. Retrying in {backoff} seconds...")
+                    time.sleep(backoff)
+                    backoff *= 2
                 else:
-                    self.total_records = -1
-                    self.logger.error(
-                        "Failed to get a connection from the pool after several retries.",
-                        exc_info=True
-                    )
+                    self.total_records = -1
+                    self.logger.error("Failed to get a connection from the pool after retries.", exc_info=True)
                     return self.total_records, dd.from_pandas(meta_df, npartitions=1)
             except OperationalError as oe:
-
-
-
-
-                    backoff_factor *= 2
+                if "timeout" in str(oe).lower() and attempt < retry_attempts - 1:
+                    self.logger.warning("Operational timeout, retrying…", exc_info=self.debug)
+                    time.sleep(backoff)
+                    backoff *= 2
                     continue
-
-
-
-                return self.total_records, dd.from_pandas(meta_df, npartitions=1)
+                self.total_records = -1
+                self.logger.error("OperationalError during count.", exc_info=True)
+                return self.total_records, dd.from_pandas(meta_df, npartitions=1)
             except Exception as e:
-                self.total_records = -1
-                self.logger.error(f"
+                self.total_records = -1
+                self.logger.error(f"Unexpected error during count: {e}", exc_info=True)
                 return self.total_records, dd.from_pandas(meta_df, npartitions=1)
 
-        self.total_records =
-        if
+        self.total_records = int(total)
+        if total == 0:
             self.logger.warning("Query returned 0 records.")
+            super().close()
             return self.total_records, dd.from_pandas(meta_df, npartitions=1)
 
-        self.logger.debug(f"Total records to fetch: {
+        self.logger.debug(f"Total records to fetch: {total}. Chunk size: {self.chunk_size}.")
 
-        # 4. Create a list of Dask Delayed objects, one for each chunk
         @dask.delayed
         def get_chunk(sql_query, chunk_offset):
-
-
-
-
-
-
-
-
-
-            return df[ordered_columns].astype(meta_dtypes)
-
-        offsets = range(0, total_records, self.chunk_size)
-        delayed_chunks = [get_chunk(query, offset) for offset in offsets]
-
-        # 5. Construct the final lazy Dask DataFrame from the delayed chunks
+            with self._sem:  # <<< cap concurrent DB fetches
+                paginated = sql_query.limit(self.chunk_size).offset(chunk_offset)
+                df = pd.read_sql(paginated, self.engine)
+            if fillna_value is not None:
+                df = df.fillna(fillna_value)
+            return df[ordered_columns].astype(meta_dtypes)
+
+        offsets = range(0, total, self.chunk_size)
+        delayed_chunks = [get_chunk(query, off) for off in offsets]
         ddf = dd.from_delayed(delayed_chunks, meta=meta_df)
-        self.logger.debug(f"
-        if not self._entered:
-            super().cleanup()
+        self.logger.debug(f"Created Dask DataFrame with {ddf.npartitions} partitions.")
         return self.total_records, ddf
 
-## Dask-Only Solution to test in better hardware
-
-# from typing import Type, Dict, Any
-# import math
-# import time
-# import pandas as pd
-# import dask
-# import dask.dataframe as dd
-#
-# import sqlalchemy as sa
-# from sqlalchemy import select, func
-# from sqlalchemy.engine import Engine
-# from sqlalchemy.exc import TimeoutError as SASQLTimeoutError, OperationalError
-# from sqlalchemy.orm import declarative_base
-#
-# from sibi_dst.df_helper.core import FilterHandler
-# from sibi_dst.utils import Logger
-#
-#
-# class SQLAlchemyDask:
-#     """
-#     Loads data into a Dask DataFrame. If there’s exactly one integer PK,
-#     use dask.dataframe.read_sql_table; otherwise fall back to offset‐based
-#     pagination pushed into dask.delayed to keep memory use minimal.
-#     """
-#
-#     def __init__(
-#         self,
-#         model: Type[declarative_base()],
-#         filters: Dict[str, Any],
-#         engine: Engine,
-#         chunk_size: int = 1_000,
-#         logger=None,
-#         debug: bool = False,
-#     ):
-#         self.model = model
-#         self.filters = filters or {}
-#         self.engine = engine
-#         self.chunk_size = chunk_size
-#         self.logger = logger or Logger.default_logger(self.__class__.__name__)
-#         self.logger.set_level(Logger.DEBUG if debug else Logger.INFO)
-#         self.filter_handler_cls = FilterHandler
-#         self.debug = debug
-#
-#     def read_frame(self, fillna_value=None) -> dd.DataFrame:
-#         # 1) Build base query + filters
-#         base_q = select(self.model)
-#         if self.filters:
-#             base_q = self.filter_handler_cls(
-#                 backend="sqlalchemy",
-#                 logger=self.logger,
-#                 debug=self.debug,
-#             ).apply_filters(base_q, model=self.model, filters=self.filters)
-#
-#         # 2) Zero-row meta for dtype inference
-#         meta = pd.read_sql_query(base_q.limit(0), self.engine).iloc[:0]
-#         if meta.shape[1] == 0:
-#             self.logger.warning("No columns detected; returning empty DataFrame.")
-#             return dd.from_pandas(meta, npartitions=1)
-#
-#         # 3) Single‐PK parallel path?
-#         pk_cols = list(self.model.__table__.primary_key.columns)
-#         if (
-#             len(pk_cols) == 1
-#             and pd.api.types.is_integer_dtype(meta[pk_cols[0].name])
-#         ):
-#             try:
-#                 return self._ddf_via_read_sql_table(pk_cols[0], meta, fillna_value)
-#             except Exception:
-#                 self.logger.warning(
-#                     "read_sql_table path failed, falling back to offset pagination",
-#                     exc_info=True,
-#                 )
-#
-#         # 4) Composite PK or fallback → offset pagination in delayed tasks
-#         return self._offset_paginated_ddf(base_q, meta, fillna_value)
-#
-#     def _offset_paginated_ddf(self, base_q, meta, fillna):
-#         # 1) count total rows
-#         try:
-#             with self.engine.connect() as conn:
-#                 total = conn.execute(
-#                     select(func.count()).select_from(base_q.alias())
-#                 ).scalar_one()
-#         except Exception:
-#             self.logger.error("Failed to count records; returning empty DataFrame", exc_info=True)
-#             return dd.from_pandas(meta, npartitions=1)
-#
-#         if total == 0:
-#             self.logger.warning("Query returned 0 records.")
-#             return dd.from_pandas(meta, npartitions=1)
-#         self.logger.debug(f"Total records to fetch: {total}. Chunk size: {self.chunk_size}.")
-#         # 2) create delayed tasks per offset
-#         @dask.delayed
-#         def _fetch_chunk(offset: int) -> pd.DataFrame:
-#             q = base_q.limit(self.chunk_size).offset(offset)
-#             df = pd.read_sql_query(q, self.engine)
-#             if fillna is not None:
-#                 df = df.fillna(fillna)
-#             return df[meta.columns].astype(meta.dtypes.to_dict())
-#
-#         offsets = range(0, total, self.chunk_size)
-#         parts = [_fetch_chunk(off) for off in offsets]
-#
-#         ddf = dd.from_delayed(parts, meta=meta)
-#         self.logger.debug(f"Offset‐paginated read → {len(parts)} partitions")
-#         return ddf
-#
-#     def _ddf_via_read_sql_table(self, pk_col, meta, fillna) -> dd.DataFrame:
-#         # same as before: min/max + dd.read_sql_table
-#         backoff = 0.5
-#         for attempt in range(3):
-#             try:
-#                 with self.engine.connect() as conn:
-#                     min_id, max_id = conn.execute(
-#                         select(func.min(pk_col), func.max(pk_col))
-#                         .select_from(self.model.__table__)
-#                     ).one()
-#                 break
-#             except (SASQLTimeoutError, OperationalError) as e:
-#                 if "timeout" in str(e).lower() and attempt < 2:
-#                     self.logger.warning(f"Timeout fetching PK bounds; retrying in {backoff}s")
-#                     time.sleep(backoff)
-#                     backoff *= 2
-#                 else:
-#                     raise
-#
-#         if min_id is None or max_id is None:
-#             self.logger.warning("Table empty—no PK bounds.")
-#             return dd.from_pandas(meta, npartitions=1)
-#
-#         total = max_id - min_id + 1
-#         nparts = max(1, math.ceil(total / self.chunk_size))
-#         ddf = dd.read_sql_table(
-#             table=self.model.__table__.name,
-#             uri=str(self.engine.url),
-#             index_col=pk_col.name,
-#             limits=(min_id, max_id),
-#             npartitions=nparts,
-#             columns=list(meta.columns),
-#         )
-#         if fillna is not None:
-#             ddf = ddf.fillna(fillna)
-#         self.logger.debug(f"Parallel read via dask.read_sql_table → {nparts} partitions")
-#         return ddf
```
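The new `read_frame` path boils down to one capped COUNT query followed by one `dask.delayed` task per LIMIT/OFFSET window, stitched together with `dd.from_delayed` against a typed meta frame. Below is a self-contained sketch of that pattern against an in-memory SQLite table; the `users` table, the chunk size of 2, and the hand-built meta frame are illustrative only and not part of sibi_dst.

```python
import dask
import dask.dataframe as dd
import pandas as pd
import sqlalchemy as sa
from sqlalchemy.pool import StaticPool

# One shared in-memory SQLite connection so every task sees the same data.
engine = sa.create_engine(
    "sqlite://", poolclass=StaticPool, connect_args={"check_same_thread": False}
)
with engine.begin() as conn:
    conn.execute(sa.text("CREATE TABLE users (id INTEGER PRIMARY KEY, name TEXT)"))
    conn.execute(
        sa.text("INSERT INTO users (name) VALUES (:name)"),
        [{"name": f"user{i}"} for i in range(7)],
    )

users = sa.Table("users", sa.MetaData(), autoload_with=engine)
query = sa.select(users)
chunk_size = 2

# 1) Count the matching rows up front.
with engine.connect() as conn:
    total = conn.execute(
        sa.select(sa.func.count()).select_from(query.subquery())
    ).scalar_one()

# 2) Typed, empty meta frame (stands in for infer_meta_from_model).
meta = pd.DataFrame({"id": pd.Series(dtype="Int64"), "name": pd.Series(dtype="object")})

# 3) One delayed task per LIMIT/OFFSET window.
@dask.delayed
def get_chunk(offset: int) -> pd.DataFrame:
    df = pd.read_sql(query.limit(chunk_size).offset(offset), engine)
    return df[meta.columns].astype(meta.dtypes.to_dict())

ddf = dd.from_delayed(
    [get_chunk(off) for off in range(0, total, chunk_size)], meta=meta
)
print(total, ddf.npartitions)                      # 7 rows -> 4 partitions
print(ddf.compute(scheduler="synchronous").shape)  # (7, 2); synchronous keeps SQLite happy
```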
```diff
--- sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py (2025.1.12)
+++ sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py (2025.8.1)
@@ -1,6 +1,6 @@
 from __future__ import annotations
 
-from typing import Any
+from typing import Any, Tuple
 
 import dask.dataframe as dd
 import pandas as pd
```
```diff
@@ -13,68 +13,116 @@ from ._io_dask import SQLAlchemyDask
 
 class SqlAlchemyLoadFromDb(ManagedResource):
     """
-    Orchestrates loading data from a database using SQLAlchemy into a Dask
-    DataFrame by configuring and delegating to the SQLAlchemyDask loader.
+    Orchestrates loading data from a database using SQLAlchemy into a Dask DataFrame.
     """
 
     def __init__(
-
-
-
-
-
+        self,
+        plugin_sqlalchemy: SqlAlchemyConnectionConfig,
+        plugin_query: QueryConfig = None,
+        plugin_params: ParamsConfig = None,
+        **kwargs,
     ):
-        """
-        Initializes the loader with all necessary configurations.
-
-        Args:
-            plugin_sqlalchemy: The database connection configuration object.
-            plugin_query: The query configuration object.
-            plugin_params: The parameters and filters configuration object.
-            logger: An optional logger instance.
-            **kwargs: Must contain 'index_column' for Dask partitioning.
-        """
         super().__init__(**kwargs)
         self.db_connection = plugin_sqlalchemy
         self.model = self.db_connection.model
        self.engine = self.db_connection.engine
         self.query_config = plugin_query
         self.params_config = plugin_params
-        self.chunk_size = kwargs.get("chunk_size", self.params_config.df_params.get("chunk_size", 1000))
-        self.total_records = -1
-
-    def build_and_load(self) -> tuple[int | Any, Any] | dd.DataFrame:
-        """
-        Builds and loads a Dask DataFrame from a SQLAlchemy source.
-
-        This method is stateless and returns the DataFrame directly.
+        self.chunk_size = kwargs.get("chunk_size", self.params_config.df_params.get("chunk_size", 1000) if self.params_config else 1000)
+        self.total_records = -1
 
-
-            A Dask DataFrame containing the queried data or an empty,
-            correctly structured DataFrame if the query fails or returns no results.
-        """
+    def build_and_load(self) -> Tuple[int, dd.DataFrame]:
         try:
-
-            sqlalchemy_dask_loader=SQLAlchemyDask(
+            with SQLAlchemyDask(
                 model=self.model,
                 filters=self.params_config.filters if self.params_config else {},
                 engine=self.engine,
                 chunk_size=self.chunk_size,
                 logger=self.logger,
                 verbose=self.verbose,
-                debug=self.debug
-            )
-
-
-
-            return self.total_records, dask_df
-
-
+                debug=self.debug,
+            ) as loader:
+                self.logger.debug(f"SQLAlchemyDask loader initialized for model: {self.model.__name__}")
+                self.total_records, dask_df = loader.read_frame()
+            return self.total_records, dask_df
         except Exception as e:
             self.total_records = -1
             self.logger.error(f"{self.model.__name__} Failed to build and load data: {e}", exc_info=True)
-            #
+            # empty df with correct columns
             columns = [c.name for c in self.model.__table__.columns]
             return self.total_records, dd.from_pandas(pd.DataFrame(columns=columns), npartitions=1)
 
-
+# from __future__ import annotations
+#
+# from typing import Any
+#
+# import dask.dataframe as dd
+# import pandas as pd
+#
+# from sibi_dst.utils import ManagedResource
+# from sibi_dst.df_helper.core import ParamsConfig, QueryConfig
+# from ._db_connection import SqlAlchemyConnectionConfig
+# from ._io_dask import SQLAlchemyDask
+#
+# class SqlAlchemyLoadFromDb(ManagedResource):
+#     """
+#     Orchestrates loading data from a database using SQLAlchemy into a Dask
+#     DataFrame by configuring and delegating to the SQLAlchemyDask loader.
+#     """
+#
+#     def __init__(
+#         self,
+#         plugin_sqlalchemy: SqlAlchemyConnectionConfig,
+#         plugin_query: QueryConfig = None,
+#         plugin_params: ParamsConfig = None,
+#         **kwargs,
+#     ):
+#         """
+#         Initializes the loader with all necessary configurations.
+#
+#         Args:
+#             plugin_sqlalchemy: The database connection configuration object.
+#             plugin_query: The query configuration object.
+#             plugin_params: The parameters and filters configuration object.
+#             logger: An optional logger instance.
+#             **kwargs: Must contain 'index_column' for Dask partitioning.
+#         """
+#         super().__init__(**kwargs)
+#         self.db_connection = plugin_sqlalchemy
+#         self.model = self.db_connection.model
+#         self.engine = self.db_connection.engine
+#         self.query_config = plugin_query
+#         self.params_config = plugin_params
+#         self.chunk_size = kwargs.get("chunk_size", self.params_config.df_params.get("chunk_size", 1000))
+#         self.total_records = -1  # Initialize total_records to -1 to indicate no records loaded yet
+#
+#     def build_and_load(self) -> tuple[int | Any, Any] | dd.DataFrame:
+#         """
+#         Builds and loads a Dask DataFrame from a SQLAlchemy source.
+#
+#         This method is stateless and returns the DataFrame directly.
+#
+#         Returns:
+#             A Dask DataFrame containing the queried data or an empty,
+#             correctly structured DataFrame if the query fails or returns no results.
+#         """
+#         try:
+#             # Instantiate and use the low-level Dask loader
+#             with SQLAlchemyDask(model=self.model,filters=self.params_config.filters if self.params_config else {},
+#                                 engine=self.engine,
+#                                 chunk_size=self.chunk_size,
+#                                 logger=self.logger,
+#                                 verbose=self.verbose,
+#                                 debug=self.debug) as sqlalchemy_dask_loader:
+#                 self.logger.debug(f"SQLAlchemyDask loader initialized for model: {self.model.__name__}")
+#                 # Create the lazy DataFrame and read a record count
+#                 # if total_records less than 0, it means an error occurred during the loading process
+#                 self.total_records, dask_df = sqlalchemy_dask_loader.read_frame()
+#                 return self.total_records, dask_df
+#         except Exception as e:
+#             self.total_records = -1
+#             self.logger.error(f"{self.model.__name__} Failed to build and load data: {e}", exc_info=True)
+#             # Return an empty dataframe with the correct schema on failure
+#             columns = [c.name for c in self.model.__table__.columns]
+#             return self.total_records, dd.from_pandas(pd.DataFrame(columns=columns), npartitions=1)
```