sibi-dst 2025.1.13__py3-none-any.whl → 2025.8.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29)
  1. sibi_dst/__init__.py +7 -1
  2. sibi_dst/df_helper/_artifact_updater_multi_wrapper.py +235 -342
  3. sibi_dst/df_helper/_df_helper.py +417 -117
  4. sibi_dst/df_helper/_parquet_artifact.py +255 -283
  5. sibi_dst/df_helper/backends/parquet/_parquet_options.py +8 -4
  6. sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +68 -107
  7. sibi_dst/df_helper/backends/sqlalchemy/_db_gatekeeper.py +15 -0
  8. sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +105 -255
  9. sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +90 -42
  10. sibi_dst/df_helper/backends/sqlalchemy/_model_registry.py +192 -0
  11. sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +122 -72
  12. sibi_dst/osmnx_helper/route_path_builder.py +45 -46
  13. sibi_dst/utils/base.py +302 -96
  14. sibi_dst/utils/clickhouse_writer.py +472 -206
  15. sibi_dst/utils/data_utils.py +139 -186
  16. sibi_dst/utils/data_wrapper.py +317 -73
  17. sibi_dst/utils/date_utils.py +1 -0
  18. sibi_dst/utils/df_utils.py +193 -213
  19. sibi_dst/utils/file_utils.py +3 -2
  20. sibi_dst/utils/filepath_generator.py +314 -152
  21. sibi_dst/utils/log_utils.py +581 -242
  22. sibi_dst/utils/manifest_manager.py +60 -76
  23. sibi_dst/utils/parquet_saver.py +33 -27
  24. sibi_dst/utils/phone_formatter.py +88 -95
  25. sibi_dst/utils/update_planner.py +180 -178
  26. sibi_dst/utils/webdav_client.py +116 -166
  27. {sibi_dst-2025.1.13.dist-info → sibi_dst-2025.8.1.dist-info}/METADATA +1 -1
  28. {sibi_dst-2025.1.13.dist-info → sibi_dst-2025.8.1.dist-info}/RECORD +29 -27
  29. {sibi_dst-2025.1.13.dist-info → sibi_dst-2025.8.1.dist-info}/WHEEL +0 -0
sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py
@@ -1,33 +1,29 @@
  from __future__ import annotations

- from typing import Type, Any
+ import time
+ from typing import Any, Dict, Tuple, Type

  import dask
  import dask.dataframe as dd
  import pandas as pd
- from sqlalchemy import (
- inspect,
- select
- )
+ import sqlalchemy as sa
+ from sqlalchemy import select, inspect
  from sqlalchemy.engine import Engine
- from sqlalchemy.orm import declarative_base
- import time
  from sqlalchemy.exc import TimeoutError as SASQLTimeoutError, OperationalError
- import sqlalchemy as sa
+ from sqlalchemy.orm import declarative_base

  from sibi_dst.utils import ManagedResource
  from sibi_dst.df_helper.core import FilterHandler
+ from ._db_gatekeeper import DBGatekeeper


  class SQLAlchemyDask(ManagedResource):
  """
  Loads data from a database into a Dask DataFrame using a memory-safe,
- non-parallel, paginated approach.
-
- This class avoids using a numeric `index_col for parallel loading.
+ non-parallel, paginated approach (LIMIT/OFFSET).
  """

- _SQLALCHEMY_TO_DASK_DTYPE = {
+ _SQLALCHEMY_TO_DASK_DTYPE: Dict[str, str] = {
  "INTEGER": "Int64",
  "SMALLINT": "Int64",
  "BIGINT": "Int64",
@@ -38,66 +34,85 @@ class SQLAlchemyDask(ManagedResource):
  "TEXT": "object",
  "DATE": "datetime64[ns]",
  "DATETIME": "datetime64[ns]",
+ "TIMESTAMP": "datetime64[ns]",
  "TIME": "object",
  "UUID": "object",
  }

  def __init__(
- self,
- model: Type[declarative_base()],
- filters: dict,
- engine: Engine,
- chunk_size: int = 1000,
- **kwargs
+ self,
+ model: Type[declarative_base()],
+ filters: Dict[str, Any],
+ engine: Engine,
+ chunk_size: int = 1000,
+ **kwargs: Any,
  ):
- """
- Initializes the data loader.
-
- Args:
- model: The SQLAlchemy ORM model for the table.
- filters: A dictionary of filters to apply to the query.
- engine: An SQLAlchemy Engine instance.
- chunk_size: The number of records to fetch in each database query.
- logger: A logger instance.
- debug: Whether to enable detailed logging.
- """
  super().__init__(**kwargs)
  self.model = model
- self.filters = filters
+ self.filters = filters or {}
  self.engine = engine
- self.chunk_size = chunk_size
+ self.chunk_size = int(chunk_size)
  self.filter_handler_cls = FilterHandler
- self.total_records = -1 # Initialize to -1 to indicate uncounted
+ self.total_records: int = -1 # -1 indicates failure/unknown
+ self._sem = DBGatekeeper.get(str(engine.url), max_concurrency=self._safe_cap())

- @classmethod
- def infer_meta_from_model(cls, model: Type[declarative_base()]) -> dict:
+ def _safe_cap(self) -> int:
  """
- Infers a metadata dictionary for Dask based on the SQLAlchemy model.
- This helps Dask understand the DataFrame structure without reading data.
+ Calculate a safe concurrency cap for DB work based on the engine's pool.
+
+ Returns: max(1, pool_size + max_overflow - 1)
+ - Works across SQLAlchemy 1.4/2.x
+ - Tolerates pools that expose size/max_overflow as methods or attrs
+ - Allows explicit override via self.db_gatekeeper_cap (if you pass it)
  """
+ # optional explicit override
+ explicit = getattr(self, "db_gatekeeper_cap", None)
+ if isinstance(explicit, int) and explicit > 0:
+ return explicit
+
+ pool = getattr(self.engine, "pool", None)
+
+ def _to_int(val, default):
+ if val is None:
+ return default
+ if callable(val):
+ try:
+ return int(val()) # e.g., pool.size()
+ except Exception:
+ return default
+ try:
+ return int(val)
+ except Exception:
+ return default
+
+ # size: QueuePool.size() -> int
+ size_candidate = getattr(pool, "size", None) # method on QueuePool
+ pool_size = _to_int(size_candidate, 5)
+
+ # max_overflow: prefer attribute; fall back to private _max_overflow; avoid 'overflow()' (method)
+ max_overflow_attr = (
+ getattr(pool, "max_overflow", None) or # SQLAlchemy 2.x QueuePool
+ getattr(pool, "_max_overflow", None) # private fallback
+ )
+ max_overflow = _to_int(max_overflow_attr, 10)
+
+ cap = max(1, pool_size + max_overflow - 1)
+ self.logger.debug(f"Using a Cap of {cap} from pool size of {pool_size} and max overflow of {max_overflow}.")
+ return max(1, cap)
+
+ # ---------- meta ----------
+ @classmethod
+ def infer_meta_from_model(cls, model: Type[declarative_base()]) -> Dict[str, str]:
  mapper = inspect(model)
- dtypes = {}
+ dtypes: Dict[str, str] = {}
  for column in mapper.columns:
  dtype_str = str(column.type).upper().split("(")[0]
  dtype = cls._SQLALCHEMY_TO_DASK_DTYPE.get(dtype_str, "object")
  dtypes[column.name] = dtype
  return dtypes

- def read_frame(self, fillna_value=None) -> tuple[int | Any, Any] | Any:
- """
- Builds and executes a query to load data into a Dask DataFrame.
-
- This method works by first running a COUNT query to get the total
- size, then creating a series of delayed tasks that each fetch a
- chunk of data using LIMIT/OFFSET.
-
- Args:
- fillna_value: Value to replace NaN or NULL values with, if any.
-
- Returns:
- A lazy Dask DataFrame.
- """
- # 1. Build the base query and apply filters
+ def read_frame(self, fillna_value=None) -> Tuple[int, dd.DataFrame]:
+ # Base selectable
  query = select(self.model)
  if self.filters:
  query = self.filter_handler_cls(
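
In the hunk above, infer_meta_from_model maps SQLAlchemy column types to pandas dtypes so the loader can hand Dask an empty, correctly typed meta frame before any data is read. A small illustration of its output, using a toy model that is not part of sibi_dst (only SQLAlchemyDask.infer_meta_from_model and the dtype mapping come from the diff):

# Illustrative only: a toy model (not shipped with sibi_dst) showing what
# SQLAlchemyDask.infer_meta_from_model returns and how the meta frame is built.
import pandas as pd
from sqlalchemy import Column, DateTime, Integer, String
from sqlalchemy.orm import declarative_base

from sibi_dst.df_helper.backends.sqlalchemy._io_dask import SQLAlchemyDask

Base = declarative_base()

class Event(Base):  # hypothetical table used only for this example
    __tablename__ = "events"
    id = Column(Integer, primary_key=True)
    name = Column(String(64))
    created_at = Column(DateTime)

dtypes = SQLAlchemyDask.infer_meta_from_model(Event)
# -> {"id": "Int64", "name": "object", "created_at": "datetime64[ns]"}
# (types missing from the mapping fall back to "object" via .get(..., "object"))
meta_df = pd.DataFrame(columns=list(dtypes)).astype(dtypes)
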
@@ -105,232 +120,67 @@ class SQLAlchemyDask(ManagedResource):
  ).apply_filters(query, model=self.model, filters=self.filters)
  else:
  query = query.limit(self.chunk_size)
- if self.verbose:
- self.logger.debug(f"Base query for pagination: {query}")

- # 2. Get metadata for the Dask DataFrame structure
- ordered_columns = [column.name for column in self.model.__table__.columns]
+ # Meta dataframe (stable column order & dtypes)
+ ordered_columns = [c.name for c in self.model.__table__.columns]
  meta_dtypes = self.infer_meta_from_model(self.model)
  meta_df = pd.DataFrame(columns=ordered_columns).astype(meta_dtypes)

- # 3. Get the total record count to calculate the number of chunks
-
+ # Count with retry/backoff
  retry_attempts = 3
- backoff_factor = 0.5 # start with a 0.5-second delay
- total_records = 0
+ backoff = 0.5
+ total = 0

  for attempt in range(retry_attempts):
  try:
- with self.engine.connect() as connection:
- count_query = sa.select(sa.func.count()).select_from(query.alias())
- total_records = connection.execute(count_query).scalar_one()
-
- # If successful, break the loop
- break
-
+ with self._sem:
+ with self.engine.connect() as connection:
+ count_q = sa.select(sa.func.count()).select_from(query.alias())
+ total = connection.execute(count_q).scalar_one()
+ break
  except SASQLTimeoutError:
  if attempt < retry_attempts - 1:
- self.logger.warning(
- f"Connection pool limit reached. Retrying in {backoff_factor} seconds..."
- )
- time.sleep(backoff_factor)
- backoff_factor *= 2 # Double the backoff time for the next attempt
+ self.logger.warning(f"Connection pool limit reached. Retrying in {backoff} seconds...")
+ time.sleep(backoff)
+ backoff *= 2
  else:
- self.total_records = -1 # Indicate failure to count records
- self.logger.error(
- "Failed to get a connection from the pool after several retries.",
- exc_info=True
- )
+ self.total_records = -1
+ self.logger.error("Failed to get a connection from the pool after retries.", exc_info=True)
  return self.total_records, dd.from_pandas(meta_df, npartitions=1)
  except OperationalError as oe:
- # sometimes the DB driver wraps timeouts in OperationalError
- if "timeout" in str(oe).lower():
- self.logger.warning("OperationalTimeout, retrying…", exc_info=True)
- time.sleep(backoff_factor)
- backoff_factor *= 2
+ if "timeout" in str(oe).lower() and attempt < retry_attempts - 1:
+ self.logger.warning("Operational timeout, retrying…", exc_info=self.debug)
+ time.sleep(backoff)
+ backoff *= 2
  continue
- else:
- self.total_records = -1 # Indicate failure to count records
- self.logger.error("OperationalError", exc_info=True)
- return self.total_records, dd.from_pandas(meta_df, npartitions=1)
+ self.total_records = -1
+ self.logger.error("OperationalError during count.", exc_info=True)
+ return self.total_records, dd.from_pandas(meta_df, npartitions=1)
  except Exception as e:
- self.total_records = -1 # Indicate failure to count records
- self.logger.error(f"An unexpected error occurred: {e}", exc_info=True)
+ self.total_records = -1
+ self.logger.error(f"Unexpected error during count: {e}", exc_info=True)
  return self.total_records, dd.from_pandas(meta_df, npartitions=1)

- self.total_records = total_records
- if total_records == 0:
+ self.total_records = int(total)
+ if total == 0:
  self.logger.warning("Query returned 0 records.")
+ super().close()
  return self.total_records, dd.from_pandas(meta_df, npartitions=1)

- self.logger.debug(f"Total records to fetch: {total_records}. Chunk size: {self.chunk_size}.")
+ self.logger.debug(f"Total records to fetch: {total}. Chunk size: {self.chunk_size}.")

- # 4. Create a list of Dask Delayed objects, one for each chunk
  @dask.delayed
  def get_chunk(sql_query, chunk_offset):
- """A Dask-delayed function to fetch one chunk of data."""
- # LIMIT/OFFSET must be applied in the delayed function
- paginated_query = sql_query.limit(self.chunk_size).offset(chunk_offset)
- df = pd.read_sql(paginated_query, self.engine)
-
- if fillna_value is not None:
- df = df.fillna(fillna_value)
-
- # Ensure column order and types match the meta
- return df[ordered_columns].astype(meta_dtypes)
-
- offsets = range(0, total_records, self.chunk_size)
- delayed_chunks = [get_chunk(query, offset) for offset in offsets]
-
- # 5. Construct the final lazy Dask DataFrame from the delayed chunks
+ with self._sem: # <<< cap concurrent DB fetches
+ paginated = sql_query.limit(self.chunk_size).offset(chunk_offset)
+ df = pd.read_sql(paginated, self.engine)
+ if fillna_value is not None:
+ df = df.fillna(fillna_value)
+ return df[ordered_columns].astype(meta_dtypes)
+
+ offsets = range(0, total, self.chunk_size)
+ delayed_chunks = [get_chunk(query, off) for off in offsets]
  ddf = dd.from_delayed(delayed_chunks, meta=meta_df)
- self.logger.debug(f"Successfully created a lazy Dask DataFrame with {ddf.npartitions} partitions.")
- if not self._entered:
- super().cleanup()
+ self.logger.debug(f"Created Dask DataFrame with {ddf.npartitions} partitions.")
  return self.total_records, ddf

- ## Dask-Only Solution to test in better hardware
-
- # from typing import Type, Dict, Any
- # import math
- # import time
- # import pandas as pd
- # import dask
- # import dask.dataframe as dd
- #
- # import sqlalchemy as sa
- # from sqlalchemy import select, func
- # from sqlalchemy.engine import Engine
- # from sqlalchemy.exc import TimeoutError as SASQLTimeoutError, OperationalError
- # from sqlalchemy.orm import declarative_base
- #
- # from sibi_dst.df_helper.core import FilterHandler
- # from sibi_dst.utils import Logger
- #
- #
- # class SQLAlchemyDask:
- # """
- # Loads data into a Dask DataFrame. If there’s exactly one integer PK,
- # use dask.dataframe.read_sql_table; otherwise fall back to offset‐based
- # pagination pushed into dask.delayed to keep memory use minimal.
- # """
- #
- # def __init__(
- # self,
- # model: Type[declarative_base()],
- # filters: Dict[str, Any],
- # engine: Engine,
- # chunk_size: int = 1_000,
- # logger=None,
- # debug: bool = False,
- # ):
- # self.model = model
- # self.filters = filters or {}
- # self.engine = engine
- # self.chunk_size = chunk_size
- # self.logger = logger or Logger.default_logger(self.__class__.__name__)
- # self.logger.set_level(Logger.DEBUG if debug else Logger.INFO)
- # self.filter_handler_cls = FilterHandler
- # self.debug = debug
- #
- # def read_frame(self, fillna_value=None) -> dd.DataFrame:
- # # 1) Build base query + filters
- # base_q = select(self.model)
- # if self.filters:
- # base_q = self.filter_handler_cls(
- # backend="sqlalchemy",
- # logger=self.logger,
- # debug=self.debug,
- # ).apply_filters(base_q, model=self.model, filters=self.filters)
- #
- # # 2) Zero-row meta for dtype inference
- # meta = pd.read_sql_query(base_q.limit(0), self.engine).iloc[:0]
- # if meta.shape[1] == 0:
- # self.logger.warning("No columns detected; returning empty DataFrame.")
- # return dd.from_pandas(meta, npartitions=1)
- #
- # # 3) Single‐PK parallel path?
- # pk_cols = list(self.model.__table__.primary_key.columns)
- # if (
- # len(pk_cols) == 1
- # and pd.api.types.is_integer_dtype(meta[pk_cols[0].name])
- # ):
- # try:
- # return self._ddf_via_read_sql_table(pk_cols[0], meta, fillna_value)
- # except Exception:
- # self.logger.warning(
- # "read_sql_table path failed, falling back to offset pagination",
- # exc_info=True,
- # )
- #
- # # 4) Composite PK or fallback → offset pagination in delayed tasks
- # return self._offset_paginated_ddf(base_q, meta, fillna_value)
- #
- # def _offset_paginated_ddf(self, base_q, meta, fillna):
- # # 1) count total rows
- # try:
- # with self.engine.connect() as conn:
- # total = conn.execute(
- # select(func.count()).select_from(base_q.alias())
- # ).scalar_one()
- # except Exception:
- # self.logger.error("Failed to count records; returning empty DataFrame", exc_info=True)
- # return dd.from_pandas(meta, npartitions=1)
- #
- # if total == 0:
- # self.logger.warning("Query returned 0 records.")
- # return dd.from_pandas(meta, npartitions=1)
- # self.logger.debug(f"Total records to fetch: {total}. Chunk size: {self.chunk_size}.")
- # # 2) create delayed tasks per offset
- # @dask.delayed
- # def _fetch_chunk(offset: int) -> pd.DataFrame:
- # q = base_q.limit(self.chunk_size).offset(offset)
- # df = pd.read_sql_query(q, self.engine)
- # if fillna is not None:
- # df = df.fillna(fillna)
- # return df[meta.columns].astype(meta.dtypes.to_dict())
- #
- # offsets = range(0, total, self.chunk_size)
- # parts = [_fetch_chunk(off) for off in offsets]
- #
- # ddf = dd.from_delayed(parts, meta=meta)
- # self.logger.debug(f"Offset‐paginated read → {len(parts)} partitions")
- # return ddf
- #
- # def _ddf_via_read_sql_table(self, pk_col, meta, fillna) -> dd.DataFrame:
- # # same as before: min/max + dd.read_sql_table
- # backoff = 0.5
- # for attempt in range(3):
- # try:
- # with self.engine.connect() as conn:
- # min_id, max_id = conn.execute(
- # select(func.min(pk_col), func.max(pk_col))
- # .select_from(self.model.__table__)
- # ).one()
- # break
- # except (SASQLTimeoutError, OperationalError) as e:
- # if "timeout" in str(e).lower() and attempt < 2:
- # self.logger.warning(f"Timeout fetching PK bounds; retrying in {backoff}s")
- # time.sleep(backoff)
- # backoff *= 2
- # else:
- # raise
- #
- # if min_id is None or max_id is None:
- # self.logger.warning("Table empty—no PK bounds.")
- # return dd.from_pandas(meta, npartitions=1)
- #
- # total = max_id - min_id + 1
- # nparts = max(1, math.ceil(total / self.chunk_size))
- # ddf = dd.read_sql_table(
- # table=self.model.__table__.name,
- # uri=str(self.engine.url),
- # index_col=pk_col.name,
- # limits=(min_id, max_id),
- # npartitions=nparts,
- # columns=list(meta.columns),
- # )
- # if fillna is not None:
- # ddf = ddf.fillna(fillna)
- # self.logger.debug(f"Parallel read via dask.read_sql_table → {nparts} partitions")
- # return ddf
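
The new _db_gatekeeper.py module (+15 lines in the file list) is not shown in this diff. From its call sites above, DBGatekeeper.get(str(engine.url), max_concurrency=self._safe_cap()) returns an object used as a context manager (with self._sem:), so it behaves like a registry of per-database-URL semaphores that caps concurrent DB work. With the defaults _safe_cap falls back to (pool_size=5, max_overflow=10, which are also SQLAlchemy's QueuePool defaults), the cap works out to max(1, 5 + 10 - 1) = 14. A minimal sketch of that idea, assuming a plain threading.Semaphore; the shipped module may differ:

# Hypothetical sketch of _db_gatekeeper.py, inferred only from how _io_dask.py calls it.
import threading
from typing import Dict

class DBGatekeeper:
    """Per-database-URL semaphore registry (sketch, not the shipped code)."""
    _lock = threading.Lock()
    _registry: Dict[str, threading.Semaphore] = {}

    @classmethod
    def get(cls, key: str, max_concurrency: int) -> threading.Semaphore:
        # One shared semaphore per engine URL, so every loader hitting the
        # same database competes for the same max_concurrency slots.
        with cls._lock:
            if key not in cls._registry:
                cls._registry[key] = threading.Semaphore(max(1, int(max_concurrency)))
            return cls._registry[key]
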
sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py
@@ -1,6 +1,6 @@
  from __future__ import annotations

- from typing import Any
+ from typing import Any, Tuple

  import dask.dataframe as dd
  import pandas as pd
@@ -13,68 +13,116 @@ from ._io_dask import SQLAlchemyDask

  class SqlAlchemyLoadFromDb(ManagedResource):
  """
- Orchestrates loading data from a database using SQLAlchemy into a Dask
- DataFrame by configuring and delegating to the SQLAlchemyDask loader.
+ Orchestrates loading data from a database using SQLAlchemy into a Dask DataFrame.
  """

  def __init__(
- self,
- plugin_sqlalchemy: SqlAlchemyConnectionConfig,
- plugin_query: QueryConfig = None,
- plugin_params: ParamsConfig = None,
- **kwargs,
+ self,
+ plugin_sqlalchemy: SqlAlchemyConnectionConfig,
+ plugin_query: QueryConfig = None,
+ plugin_params: ParamsConfig = None,
+ **kwargs,
  ):
- """
- Initializes the loader with all necessary configurations.
-
- Args:
- plugin_sqlalchemy: The database connection configuration object.
- plugin_query: The query configuration object.
- plugin_params: The parameters and filters configuration object.
- logger: An optional logger instance.
- **kwargs: Must contain 'index_column' for Dask partitioning.
- """
  super().__init__(**kwargs)
  self.db_connection = plugin_sqlalchemy
  self.model = self.db_connection.model
  self.engine = self.db_connection.engine
  self.query_config = plugin_query
  self.params_config = plugin_params
- self.chunk_size = kwargs.get("chunk_size", self.params_config.df_params.get("chunk_size", 1000))
- self.total_records = -1 # Initialize total_records to -1 to indicate no records loaded yet
-
- def build_and_load(self) -> tuple[int | Any, Any] | dd.DataFrame:
- """
- Builds and loads a Dask DataFrame from a SQLAlchemy source.
-
- This method is stateless and returns the DataFrame directly.
+ self.chunk_size = kwargs.get("chunk_size", self.params_config.df_params.get("chunk_size", 1000) if self.params_config else 1000)
+ self.total_records = -1

- Returns:
- A Dask DataFrame containing the queried data or an empty,
- correctly structured DataFrame if the query fails or returns no results.
- """
+ def build_and_load(self) -> Tuple[int, dd.DataFrame]:
  try:
- # Instantiate and use the low-level Dask loader
- sqlalchemy_dask_loader=SQLAlchemyDask(
+ with SQLAlchemyDask(
  model=self.model,
  filters=self.params_config.filters if self.params_config else {},
  engine=self.engine,
  chunk_size=self.chunk_size,
  logger=self.logger,
  verbose=self.verbose,
- debug=self.debug
- )
- # Create the lazy DataFrame and read a record count
- # if total_records less than 0, it means an error occurred during the loading process
- self.total_records, dask_df = sqlalchemy_dask_loader.read_frame()
- return self.total_records, dask_df
-
-
+ debug=self.debug,
+ ) as loader:
+ self.logger.debug(f"SQLAlchemyDask loader initialized for model: {self.model.__name__}")
+ self.total_records, dask_df = loader.read_frame()
+ return self.total_records, dask_df
  except Exception as e:
  self.total_records = -1
  self.logger.error(f"{self.model.__name__} Failed to build and load data: {e}", exc_info=True)
- # Return an empty dataframe with the correct schema on failure
+ # empty df with correct columns
  columns = [c.name for c in self.model.__table__.columns]
  return self.total_records, dd.from_pandas(pd.DataFrame(columns=columns), npartitions=1)

-
+ # from __future__ import annotations
+ #
+ # from typing import Any
+ #
+ # import dask.dataframe as dd
+ # import pandas as pd
+ #
+ # from sibi_dst.utils import ManagedResource
+ # from sibi_dst.df_helper.core import ParamsConfig, QueryConfig
+ # from ._db_connection import SqlAlchemyConnectionConfig
+ # from ._io_dask import SQLAlchemyDask
+ #
+ # class SqlAlchemyLoadFromDb(ManagedResource):
+ # """
+ # Orchestrates loading data from a database using SQLAlchemy into a Dask
+ # DataFrame by configuring and delegating to the SQLAlchemyDask loader.
+ # """
+ #
+ # def __init__(
+ # self,
+ # plugin_sqlalchemy: SqlAlchemyConnectionConfig,
+ # plugin_query: QueryConfig = None,
+ # plugin_params: ParamsConfig = None,
+ # **kwargs,
+ # ):
+ # """
+ # Initializes the loader with all necessary configurations.
+ #
+ # Args:
+ # plugin_sqlalchemy: The database connection configuration object.
+ # plugin_query: The query configuration object.
+ # plugin_params: The parameters and filters configuration object.
+ # logger: An optional logger instance.
+ # **kwargs: Must contain 'index_column' for Dask partitioning.
+ # """
+ # super().__init__(**kwargs)
+ # self.db_connection = plugin_sqlalchemy
+ # self.model = self.db_connection.model
+ # self.engine = self.db_connection.engine
+ # self.query_config = plugin_query
+ # self.params_config = plugin_params
+ # self.chunk_size = kwargs.get("chunk_size", self.params_config.df_params.get("chunk_size", 1000))
+ # self.total_records = -1 # Initialize total_records to -1 to indicate no records loaded yet
+ #
+ # def build_and_load(self) -> tuple[int | Any, Any] | dd.DataFrame:
+ # """
+ # Builds and loads a Dask DataFrame from a SQLAlchemy source.
+ #
+ # This method is stateless and returns the DataFrame directly.
+ #
+ # Returns:
+ # A Dask DataFrame containing the queried data or an empty,
+ # correctly structured DataFrame if the query fails or returns no results.
+ # """
+ # try:
+ # # Instantiate and use the low-level Dask loader
+ # with SQLAlchemyDask(model=self.model,filters=self.params_config.filters if self.params_config else {},
+ # engine=self.engine,
+ # chunk_size=self.chunk_size,
+ # logger=self.logger,
+ # verbose=self.verbose,
+ # debug=self.debug) as sqlalchemy_dask_loader:
+ # self.logger.debug(f"SQLAlchemyDask loader initialized for model: {self.model.__name__}")
+ # # Create the lazy DataFrame and read a record count
+ # # if total_records less than 0, it means an error occurred during the loading process
+ # self.total_records, dask_df = sqlalchemy_dask_loader.read_frame()
+ # return self.total_records, dask_df
+ # except Exception as e:
+ # self.total_records = -1
+ # self.logger.error(f"{self.model.__name__} Failed to build and load data: {e}", exc_info=True)
+ # # Return an empty dataframe with the correct schema on failure
+ # columns = [c.name for c in self.model.__table__.columns]
+ # return self.total_records, dd.from_pandas(pd.DataFrame(columns=columns), npartitions=1)
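
Taken together, the rewritten build_and_load runs the SQLAlchemyDask loader as a context manager and always returns a (total_records, dask_df) pair: -1 signals that the count or load failed, 0 a legitimately empty result. A hedged usage sketch based only on the signatures visible in this diff; the config objects and the wrapper function are placeholders rather than documented sibi_dst API:

# Sketch only: how a caller might consume the (count, DataFrame) contract.
import dask.dataframe as dd

from sibi_dst.df_helper.backends.sqlalchemy._load_from_db import SqlAlchemyLoadFromDb

def load_as_dask(conn_cfg, params_cfg=None, query_cfg=None) -> dd.DataFrame:
    loader = SqlAlchemyLoadFromDb(
        plugin_sqlalchemy=conn_cfg,   # SqlAlchemyConnectionConfig: supplies .model and .engine
        plugin_query=query_cfg,       # optional QueryConfig
        plugin_params=params_cfg,     # optional ParamsConfig: filters / df_params["chunk_size"]
    )
    total, ddf = loader.build_and_load()
    if total < 0:
        # -1 means the count or load failed; ddf is an empty frame with the model's columns.
        raise RuntimeError("sibi_dst load failed; see logs for details")
    return ddf  # lazy; total == 0 yields a single empty partition
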