sibi-dst 0.3.63__py3-none-any.whl → 2025.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31)
  1. sibi_dst/df_helper/_df_helper.py +186 -591
  2. sibi_dst/df_helper/backends/sqlalchemy/__init__.py +0 -2
  3. sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +161 -115
  4. sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +291 -97
  5. sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +34 -105
  6. sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +175 -162
  7. sibi_dst/df_helper/core/__init__.py +0 -4
  8. sibi_dst/df_helper/core/_defaults.py +1 -50
  9. sibi_dst/df_helper/core/_query_config.py +2 -2
  10. sibi_dst/utils/__init__.py +0 -2
  11. sibi_dst/utils/data_wrapper.py +9 -12
  12. sibi_dst/utils/log_utils.py +15 -11
  13. sibi_dst/utils/update_planner.py +2 -0
  14. sibi_dst/v2/df_helper/backends/sqlalchemy/_db_connection.py +325 -50
  15. sibi_dst/v2/df_helper/backends/sqlalchemy/_io_dask.py +2 -2
  16. sibi_dst/v2/df_helper/backends/sqlmodel/_db_connection.py +330 -51
  17. sibi_dst/v3/__init__.py +0 -0
  18. sibi_dst/v3/backends/__init__.py +0 -0
  19. sibi_dst/v3/df_helper/__init__.py +0 -0
  20. sibi_dst/v3/df_helper/_df_helper.py +91 -0
  21. sibi_dst-2025.1.1.dist-info/METADATA +55 -0
  22. {sibi_dst-0.3.63.dist-info → sibi_dst-2025.1.1.dist-info}/RECORD +23 -26
  23. sibi_dst/df_helper/backends/django/__init__.py +0 -11
  24. sibi_dst/df_helper/backends/django/_db_connection.py +0 -88
  25. sibi_dst/df_helper/backends/django/_io_dask.py +0 -450
  26. sibi_dst/df_helper/backends/django/_load_from_db.py +0 -227
  27. sibi_dst/df_helper/backends/django/_sql_model_builder.py +0 -493
  28. sibi_dst/df_helper/backends/sqlalchemy/_filter_handler.py +0 -119
  29. sibi_dst/utils/airflow_manager.py +0 -212
  30. sibi_dst-0.3.63.dist-info/METADATA +0 -90
  31. {sibi_dst-0.3.63.dist-info → sibi_dst-2025.1.1.dist-info}/WHEEL +0 -0
sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py
@@ -1,135 +1,329 @@
- import itertools

+ from typing import Type
+
+ import dask
  import dask.dataframe as dd
  import pandas as pd
- from sqlalchemy import create_engine, inspect, select
- from sqlalchemy.orm import sessionmaker
-
+ from sqlalchemy import (
+     inspect,
+     select
+ )
+ from sqlalchemy.engine import Engine
+ from sqlalchemy.orm import declarative_base
+ import time
+ from sqlalchemy.exc import TimeoutError as SASQLTimeoutError, OperationalError
+ import sqlalchemy as sa
  from sibi_dst.df_helper.core import FilterHandler
  from sibi_dst.utils import Logger


  class SQLAlchemyDask:
-     def __init__(self, model, filters, engine_url, chunk_size=1000, logger=None, debug=False):
+     """
+     Loads data from a database into a Dask DataFrame using a memory-safe,
+     non-parallel, paginated approach.
+
+     This class avoids using a numeric `index_col for parallel loading.
+     """
+
+     _SQLALCHEMY_TO_DASK_DTYPE = {
+         "INTEGER": "Int64",
+         "SMALLINT": "Int64",
+         "BIGINT": "Int64",
+         "FLOAT": "float64",
+         "NUMERIC": "float64",
+         "BOOLEAN": "bool",
+         "VARCHAR": "object",
+         "TEXT": "object",
+         "DATE": "datetime64[ns]",
+         "DATETIME": "datetime64[ns]",
+         "TIME": "object",
+         "UUID": "object",
+     }
+
+     def __init__(
+         self,
+         model: Type[declarative_base()],
+         filters: dict,
+         engine: Engine,
+         chunk_size: int = 1000,
+         logger=None,
+         debug: bool = False,
+     ):
          """
-         Initialize with an SQLAlchemy query and database engine URL.
-
-         :param model: SQLAlchemy ORM model.
-         :param filters: Filters to apply on the query.
-         :param engine_url: Database connection string for SQLAlchemy engine.
-         :param chunk_size: Number of records per chunk for Dask partitions.
-         :param logger: Logger instance for logging.
-         :param debug: Whether to print detailed logs.
+         Initializes the data loader.
+
+         Args:
+             model: The SQLAlchemy ORM model for the table.
+             filters: A dictionary of filters to apply to the query.
+             engine: An SQLAlchemy Engine instance.
+             chunk_size: The number of records to fetch in each database query.
+             logger: A logger instance.
+             debug: Whether to enable detailed logging.
          """
-         self.query = None
          self.model = model
         self.filters = filters
+         self.engine = engine
         self.chunk_size = chunk_size
         self.debug = debug
-         self.engine = create_engine(engine_url)
-         self.Session = sessionmaker(bind=self.engine)
         self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
-         self.logger.set_level(logger.DEBUG if debug else logger.INFO)
+         self.logger.set_level(Logger.DEBUG if debug else Logger.INFO)
+         self.filter_handler_cls = FilterHandler

-     @staticmethod
-     def infer_dtypes_from_model(model):
+     @classmethod
+     def infer_meta_from_model(cls, model: Type[declarative_base()]) -> dict:
          """
-         Infer data types for Dask DataFrame based on SQLAlchemy ORM model columns.
+         Infers a metadata dictionary for Dask based on the SQLAlchemy model.
+         This helps Dask understand the DataFrame structure without reading data.
          """
         mapper = inspect(model)
-         sqlalchemy_to_dask_dtype = {
-             'INTEGER': 'Int64',
-             'SMALLINT': 'Int64',
-             'BIGINT': 'Int64',
-             'FLOAT': 'float64',
-             'NUMERIC': 'float64',
-             'BOOLEAN': 'bool',
-             'VARCHAR': 'object',
-             'TEXT': 'object',
-             'DATE': 'datetime64[ns]',
-             'DATETIME': 'datetime64[ns]',
-             'TIME': 'object',
-             'UUID': 'object',
-         }
-
         dtypes = {}
         for column in mapper.columns:
-             dtype = sqlalchemy_to_dask_dtype.get(str(column.type).upper(), 'object')
+             dtype_str = str(column.type).upper().split("(")[0]
+             dtype = cls._SQLALCHEMY_TO_DASK_DTYPE.get(dtype_str, "object")
             dtypes[column.name] = dtype
-
         return dtypes

-     def read_frame(self, fillna_value=None):
+     def read_frame(self, fillna_value=None) -> dd.DataFrame:
          """
-         Load data from an SQLAlchemy query into a Dask DataFrame.
+         Builds and executes a query to load data into a Dask DataFrame.
+
+         This method works by first running a COUNT query to get the total
+         size, then creating a series of delayed tasks that each fetch a
+         chunk of data using LIMIT/OFFSET.

-         :param fillna_value: Value to replace NaN or NULL values with, if any.
-         :return: Dask DataFrame.
+         Args:
+             fillna_value: Value to replace NaN or NULL values with, if any.
+
+         Returns:
+             A lazy Dask DataFrame.
          """
-         with self.Session() as session:
+         # 1. Build the base query and apply filters
+         query = select(self.model)
+         if self.filters:
+             query = self.filter_handler_cls(
+                 backend="sqlalchemy", logger=self.logger, debug=self.debug
+             ).apply_filters(query, model=self.model, filters=self.filters)
+         else:
+             query = query.limit(self.chunk_size)
+         self.logger.debug(f"Base query for pagination: {query}")
+
+         # 2. Get metadata for the Dask DataFrame structure
+         ordered_columns = [column.name for column in self.model.__table__.columns]
+         meta_dtypes = self.infer_meta_from_model(self.model)
+         meta_df = pd.DataFrame(columns=ordered_columns).astype(meta_dtypes)
+
+         # 3. Get the total record count to calculate the number of chunks
+
+         retry_attempts = 3
+         backoff_factor = 0.5  # start with a 0.5-second delay
+
+         for attempt in range(retry_attempts):
             try:
-                 # Build query
-                 self.query = select(self.model)
-                 if self.filters:
-                     self.query = FilterHandler(backend="sqlalchemy", logger=self.logger, debug=self.debug).apply_filters(self.query,
-                         model=self.model,
-                         filters=self.filters)
+                 with self.engine.connect() as connection:
+                     count_query = sa.select(sa.func.count()).select_from(query.alias())
+                     total_records = connection.execute(count_query).scalar_one()
+
+                 # If successful, break the loop
+                 break
+
+             except SASQLTimeoutError:
+                 if attempt < retry_attempts - 1:
+                     self.logger.warning(
+                         f"Connection pool limit reached. Retrying in {backoff_factor} seconds..."
+                     )
+                     time.sleep(backoff_factor)
+                     backoff_factor *= 2  # Double the backoff time for the next attempt
                 else:
-                     n_records = 100
-                     self.query = self.query.limit(n_records)
-                 self.logger.debug(f"query:{self.query}")
-                 # Infer dtypes
-                 dtypes = self.infer_dtypes_from_model(self.model)
-                 # Get the column order from the SQLAlchemy model
-                 ordered_columns = [column.name for column in self.model.__table__.columns]
-
-                 # Execute query and fetch results in chunks
-                 result_proxy = session.execute(self.query)
-                 results = result_proxy.scalars().all()  # Fetch all rows
-                 iterator = iter(results)
-
-                 partitions = []
-
-                 while True:
-                     chunk = list(itertools.islice(iterator, self.chunk_size))
-                     if not chunk:
-                         break
-
-                     # Convert chunk to Pandas DataFrame
-                     df = pd.DataFrame.from_records(
-                         [row._asdict() if hasattr(row, '_asdict') else row.__dict__ for row in chunk]
+                     self.logger.error(
+                         "Failed to get a connection from the pool after several retries.",
+                         exc_info=True
                     )
-                     # Drop internal SQLAlchemy state if it exists
-                     df = df.loc[:, ~df.columns.str.contains('_sa_instance_state')]
+                     return dd.from_pandas(meta_df, npartitions=1)
+             except OperationalError as oe:
+                 # sometimes the DB driver wraps timeouts in OperationalError
+                 if "timeout" in str(oe).lower():
+                     self.logger.warning("OperationalTimeout, retrying…", exc_info=True)
+                     time.sleep(backoff_factor)
+                     backoff_factor *= 2
+                     continue
+                 else:
+                     self.logger.error("OperationalError", exc_info=True)
+                     return dd.from_pandas(meta_df, npartitions=1)
+             except Exception as e:
+                 self.logger.error(f"An unexpected error occurred: {e}", exc_info=True)
+                 return dd.from_pandas(meta_df, npartitions=1)

-                     # Reorder columns to match the model's order
-                     df = df[ordered_columns]
+         if total_records == 0:
+             self.logger.warning("Query returned 0 records.")
+             return dd.from_pandas(meta_df, npartitions=1)

-                     # Fill NaN values
-                     if fillna_value is not None:
-                         df = df.fillna(fillna_value)
+         self.logger.debug(f"Total records to fetch: {total_records}. Chunk size: {self.chunk_size}.")

-                     # Convert timezone-aware columns to naive
-                     for col in df.columns:
-                         if isinstance(df[col].dtype, pd.DatetimeTZDtype):
-                             df[col] = df[col].dt.tz_localize(None)
+         # 4. Create a list of Dask Delayed objects, one for each chunk
+         @dask.delayed
+         def get_chunk(sql_query, chunk_offset):
+             """A Dask-delayed function to fetch one chunk of data."""
+             # LIMIT/OFFSET must be applied in the delayed function
+             paginated_query = sql_query.limit(self.chunk_size).offset(chunk_offset)
+             df = pd.read_sql(paginated_query, self.engine)

-                     # Apply inferred dtypes
-                     df = df.astype(dtypes)
-                     # Create a Dask partition
-                     partitions.append(dd.from_pandas(df, npartitions=1))
+             if fillna_value is not None:
+                 df = df.fillna(fillna_value)

-                 # Concatenate all partitions
-                 if partitions:
-                     dask_df = dd.concat(partitions, axis=0, ignore_index=True)
-                 else:
-                     dask_df = dd.from_pandas(pd.DataFrame(columns=ordered_columns), npartitions=1)
+             # Ensure column order and types match the meta
+             return df[ordered_columns].astype(meta_dtypes)

-                 self.logger.debug(f"Loaded {len(dask_df)} rows into Dask DataFrame.")
+         offsets = range(0, total_records, self.chunk_size)
+         delayed_chunks = [get_chunk(query, offset) for offset in offsets]

-                 return dask_df
+         # 5. Construct the final lazy Dask DataFrame from the delayed chunks
+         ddf = dd.from_delayed(delayed_chunks, meta=meta_df)
+         self.logger.debug(f"Successfully created a lazy Dask DataFrame with {ddf.npartitions} partitions.")

-             except Exception as e:
-                 self.logger.error(f"Error executing query: {str(e)}")
-                 self.logger.error(self.query)
-                 return dd.from_pandas(pd.DataFrame(columns=ordered_columns), npartitions=1)
+         return ddf
+
+ ## Dask-Only Solution to test in better hardware
+
+ # from typing import Type, Dict, Any
+ # import math
+ # import time
+ # import pandas as pd
+ # import dask
+ # import dask.dataframe as dd
+ #
+ # import sqlalchemy as sa
+ # from sqlalchemy import select, func
+ # from sqlalchemy.engine import Engine
+ # from sqlalchemy.exc import TimeoutError as SASQLTimeoutError, OperationalError
+ # from sqlalchemy.orm import declarative_base
+ #
+ # from sibi_dst.df_helper.core import FilterHandler
+ # from sibi_dst.utils import Logger
+ #
+ #
+ # class SQLAlchemyDask:
+ #     """
+ #     Loads data into a Dask DataFrame. If there’s exactly one integer PK,
+ #     use dask.dataframe.read_sql_table; otherwise fall back to offset‐based
+ #     pagination pushed into dask.delayed to keep memory use minimal.
+ #     """
+ #
+ #     def __init__(
+ #         self,
+ #         model: Type[declarative_base()],
+ #         filters: Dict[str, Any],
+ #         engine: Engine,
+ #         chunk_size: int = 1_000,
+ #         logger=None,
+ #         debug: bool = False,
+ #     ):
+ #         self.model = model
+ #         self.filters = filters or {}
+ #         self.engine = engine
+ #         self.chunk_size = chunk_size
+ #         self.logger = logger or Logger.default_logger(self.__class__.__name__)
+ #         self.logger.set_level(Logger.DEBUG if debug else Logger.INFO)
+ #         self.filter_handler_cls = FilterHandler
+ #         self.debug = debug
+ #
+ #     def read_frame(self, fillna_value=None) -> dd.DataFrame:
+ #         # 1) Build base query + filters
+ #         base_q = select(self.model)
+ #         if self.filters:
+ #             base_q = self.filter_handler_cls(
+ #                 backend="sqlalchemy",
+ #                 logger=self.logger,
+ #                 debug=self.debug,
+ #             ).apply_filters(base_q, model=self.model, filters=self.filters)
+ #
+ #         # 2) Zero-row meta for dtype inference
+ #         meta = pd.read_sql_query(base_q.limit(0), self.engine).iloc[:0]
+ #         if meta.shape[1] == 0:
+ #             self.logger.warning("No columns detected; returning empty DataFrame.")
+ #             return dd.from_pandas(meta, npartitions=1)
+ #
+ #         # 3) Single‐PK parallel path?
+ #         pk_cols = list(self.model.__table__.primary_key.columns)
+ #         if (
+ #             len(pk_cols) == 1
+ #             and pd.api.types.is_integer_dtype(meta[pk_cols[0].name])
+ #         ):
+ #             try:
+ #                 return self._ddf_via_read_sql_table(pk_cols[0], meta, fillna_value)
+ #             except Exception:
+ #                 self.logger.warning(
+ #                     "read_sql_table path failed, falling back to offset pagination",
+ #                     exc_info=True,
+ #                 )
+ #
+ #         # 4) Composite PK or fallback → offset pagination in delayed tasks
+ #         return self._offset_paginated_ddf(base_q, meta, fillna_value)
+ #
+ #     def _offset_paginated_ddf(self, base_q, meta, fillna):
+ #         # 1) count total rows
+ #         try:
+ #             with self.engine.connect() as conn:
+ #                 total = conn.execute(
+ #                     select(func.count()).select_from(base_q.alias())
+ #                 ).scalar_one()
+ #         except Exception:
+ #             self.logger.error("Failed to count records; returning empty DataFrame", exc_info=True)
+ #             return dd.from_pandas(meta, npartitions=1)
+ #
+ #         if total == 0:
+ #             self.logger.warning("Query returned 0 records.")
+ #             return dd.from_pandas(meta, npartitions=1)
+ #         self.logger.debug(f"Total records to fetch: {total}. Chunk size: {self.chunk_size}.")
+ #         # 2) create delayed tasks per offset
+ #         @dask.delayed
+ #         def _fetch_chunk(offset: int) -> pd.DataFrame:
+ #             q = base_q.limit(self.chunk_size).offset(offset)
+ #             df = pd.read_sql_query(q, self.engine)
+ #             if fillna is not None:
+ #                 df = df.fillna(fillna)
+ #             return df[meta.columns].astype(meta.dtypes.to_dict())
+ #
+ #         offsets = range(0, total, self.chunk_size)
+ #         parts = [_fetch_chunk(off) for off in offsets]
+ #
+ #         ddf = dd.from_delayed(parts, meta=meta)
+ #         self.logger.debug(f"Offset‐paginated read → {len(parts)} partitions")
+ #         return ddf
+ #
+ #     def _ddf_via_read_sql_table(self, pk_col, meta, fillna) -> dd.DataFrame:
+ #         # same as before: min/max + dd.read_sql_table
+ #         backoff = 0.5
+ #         for attempt in range(3):
+ #             try:
+ #                 with self.engine.connect() as conn:
+ #                     min_id, max_id = conn.execute(
+ #                         select(func.min(pk_col), func.max(pk_col))
+ #                         .select_from(self.model.__table__)
+ #                     ).one()
+ #                 break
+ #             except (SASQLTimeoutError, OperationalError) as e:
+ #                 if "timeout" in str(e).lower() and attempt < 2:
+ #                     self.logger.warning(f"Timeout fetching PK bounds; retrying in {backoff}s")
+ #                     time.sleep(backoff)
+ #                     backoff *= 2
+ #                 else:
+ #                     raise
+ #
+ #         if min_id is None or max_id is None:
+ #             self.logger.warning("Table empty—no PK bounds.")
+ #             return dd.from_pandas(meta, npartitions=1)
+ #
+ #         total = max_id - min_id + 1
+ #         nparts = max(1, math.ceil(total / self.chunk_size))
+ #         ddf = dd.read_sql_table(
+ #             table=self.model.__table__.name,
+ #             uri=str(self.engine.url),
+ #             index_col=pk_col.name,
+ #             limits=(min_id, max_id),
+ #             npartitions=nparts,
+ #             columns=list(meta.columns),
+ #         )
+ #         if fillna is not None:
+ #             ddf = ddf.fillna(fillna)
+ #         self.logger.debug(f"Parallel read via dask.read_sql_table → {nparts} partitions")
+ #         return ddf
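
For orientation, the sketch below shows how the new paginated loader added in this release might be called. It is a minimal, hedged example: the Order model, the SQLite DSN, and the filter key are illustrative assumptions, while SQLAlchemyDask, its constructor arguments, and read_frame() come from the diff above (import path taken from the file list at the top of this page).

# Minimal usage sketch; model, DSN, and filter key are assumptions for illustration.
import sqlalchemy as sa
from sqlalchemy.orm import declarative_base

from sibi_dst.df_helper.backends.sqlalchemy._io_dask import SQLAlchemyDask

Base = declarative_base()


class Order(Base):
    __tablename__ = "orders"
    id = sa.Column(sa.Integer, primary_key=True)
    status = sa.Column(sa.String(20))
    total = sa.Column(sa.Numeric)


engine = sa.create_engine("sqlite:///orders.db")  # assumed local database

loader = SQLAlchemyDask(
    model=Order,
    filters={"status": "active"},  # illustrative only; FilterHandler defines the accepted filter syntax
    engine=engine,
    chunk_size=5_000,
    debug=False,
)

ddf = loader.read_frame()  # lazy Dask DataFrame: one delayed LIMIT/OFFSET query per partition
print(ddf.npartitions)
df = ddf.compute()         # computing the graph runs the paginated reads

Because read_frame() first issues a COUNT query, the number of partitions is known up front and each partition stays bounded by chunk_size, which is the memory-safety property the new docstring describes.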
sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py
@@ -3,143 +3,72 @@ import pandas as pd

  from sibi_dst.df_helper.core import ParamsConfig, QueryConfig
  from sibi_dst.utils import Logger
- from ._io_dask import SQLAlchemyDask
  from ._db_connection import SqlAlchemyConnectionConfig
+ from ._io_dask import SQLAlchemyDask


  class SqlAlchemyLoadFromDb:
      """
-     The SqlAlchemyLoadFromDb class provides functionality to load data from a
-     database using SQLAlchemy into a Dask DataFrame. It is capable of handling
-     large datasets efficiently by utilizing the Dask framework for parallel
-     computations.
-
-     This class is initialized with a database connection configuration, query
-     configuration, optional parameters, and a logger. It can execute a query
-     using the specified configurations and read the results into a Dask
-     DataFrame. This is useful for processing and analyzing large-scale data.
-
-     :ivar df: Dask DataFrame to store the loaded data.
-     :type df: dd.DataFrame
-     :ivar db_connection: Database connection configuration object, containing details
-         such as the table, model, and engine to be used for the query.
-     :type db_connection: SqlAlchemyConnectionConfig
-     :ivar table_name: Name of the database table being queried.
-     :type table_name: str
-     :ivar model: SQLAlchemy model associated with the database connection.
-     :type model: sqlalchemy.ext.declarative.api.DeclarativeMeta
-     :ivar engine: SQLAlchemy engine used for executing queries.
-     :type engine: sqlalchemy.engine.base.Engine
-     :ivar logger: Logger instance for logging debug and error information.
-     :type logger: Logger
-     :ivar query_config: Query configuration, including query-related details such
-         as the SQL query or query settings.
-     :type query_config: QueryConfig
-     :ivar params_config: Parameters configuration, including filter parameters for
-         the query.
-     :type params_config: ParamsConfig
-     :ivar debug: Debug flag indicating whether debug mode is enabled.
-     :type debug: bool
-     :ivar chunk_size: Size of data chunks to process at a time.
-     :type chunk_size: int
+     Orchestrates loading data from a database using SQLAlchemy into a Dask
+     DataFrame by configuring and delegating to the SQLAlchemyDask loader.
      """
-     df: dd.DataFrame = None

      def __init__(
         self,
-         plugin_sqlalchemy: SqlAlchemyConnectionConfig,  # Expected to be an instance of SqlAlchemyConnection
+         plugin_sqlalchemy: SqlAlchemyConnectionConfig,
         plugin_query: QueryConfig = None,
         plugin_params: ParamsConfig = None,
         logger: Logger = None,
         **kwargs,
     ):
         """
-         Initializes an instance of the class, setting up a database connection,
-         query configuration, parameter configuration, and other optional settings
-         like debugging and logging. The class aims to manage the integration and
-         interaction with SQLAlchemy-based database operations.
-
-         :param plugin_sqlalchemy:
-             The SQLAlchemy connection configuration object, which provides
-             the connection details like engine, table name, and model
-             associated with the database operations.
-         :param plugin_query:
-             The query configuration object, used to define specific query
-             options or rules. Defaults to None.
-         :param plugin_params:
-             The parameters configuration object, used for any additional
-             parameterized settings or configurations. Defaults to None.
-         :param logger:
-             Optional logger instance for logging purposes. If not provided,
-             a default logger is instantiated using the standard logging system.
-         :param kwargs:
-             Optional additional keyword arguments for customization. Can
-             include optional settings like `debug` mode or `chunk_size`
-             for batch operations.
+         Initializes the loader with all necessary configurations.
+
+         Args:
+             plugin_sqlalchemy: The database connection configuration object.
+             plugin_query: The query configuration object.
+             plugin_params: The parameters and filters configuration object.
+             logger: An optional logger instance.
+             **kwargs: Must contain 'index_column' for Dask partitioning.
         """
         self.db_connection = plugin_sqlalchemy
-         self.table_name = self.db_connection.table
         self.model = self.db_connection.model
         self.engine = self.db_connection.engine
         self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
         self.query_config = plugin_query
         self.params_config = plugin_params
-         self.debug = kwargs.pop("debug", False)
-         self.chunk_size = kwargs.pop("chunk_size", 1000)
+         self.debug = kwargs.get("debug", False)
+         self.chunk_size = kwargs.get("chunk_size", self.params_config.df_params.get("chunk_size", 1000))

      def build_and_load(self) -> dd.DataFrame:
         """
-         Builds and returns the resulting dataframe after calling the internal
-         build and load function. This method triggers the `_build_and_load`
-         function to process and prepare the data before returning it as
-         a dask dataframe.
+         Builds and loads a Dask DataFrame from a SQLAlchemy source.

-         :raises RuntimeError: If any error occurs during the build or load process.
+         This method is stateless and returns the DataFrame directly.

-         :return: The processed data in a dask dataframe.
-         :rtype: dd.DataFrame
-         """
-         self._build_and_load()
-         return self.df
-
-     def _build_and_load(self) -> dd.DataFrame:
-         """
-         Builds and loads a Dask DataFrame from a SQLAlchemy-compatible source.
-
-         This method initializes a SQLAlchemyDask object with the provided model,
-         filters, engine URL, logger, chunk size, and debug configuration.
-         It attempts to load the data using the ``read_frame`` method of
-         SQLAlchemyDask. If the data cannot be loaded or the query returns
-         no rows, it creates and returns an empty Dask DataFrame.
-
-         :raises Exception: On failure to load data or to create a DataFrame.
-
-         :return: A Dask DataFrame object containing the queried data or an
-             empty DataFrame if the query returns no results or fails.
-         :rtype: dask.dataframe.DataFrame
+         Returns:
+             A Dask DataFrame containing the queried data or an empty,
+             correctly structured DataFrame if the query fails or returns no results.
         """
         try:
-             self.df = SQLAlchemyDask(
+             # Instantiate and use the low-level Dask loader
+             sqlalchemy_dask_loader=SQLAlchemyDask(
                 model=self.model,
-                 filters=self.params_config.filters,
-                 engine_url=self.engine.url,
-                 logger=self.logger,
+                 filters=self.params_config.filters if self.params_config else {},
+                 engine=self.engine,
                 chunk_size=self.chunk_size,
+                 logger=self.logger,
                 debug=self.debug
-             ).read_frame()
-
-             if self.df is None or len(self.df.head().index) == 0:
-                 self.logger.debug("Query returned no results.")
-                 dask_df = dd.from_pandas(pd.DataFrame(), npartitions=1)
+             )
+             # Create the lazy DataFrame
+             dask_df = sqlalchemy_dask_loader.read_frame()
+             return dask_df

-                 return dask_df

-             return self.df
-         except RuntimeError as e:
-             self.logger.info(f"Runtime Error {e}:Failed to load data into Dask DataFrame.")
-             dask_df = dd.from_pandas(pd.DataFrame(), npartitions=1)
-             return dask_df
         except Exception as e:
-             self.logger.info(f"Exception {e}:Failed to load data into Dask DataFrame.")
-             dask_df = dd.from_pandas(pd.DataFrame(), npartitions=1)
-             return dask_df
+             self.logger.error(f"Failed to build and load data: {e}", exc_info=True)
+             # Return an empty dataframe with the correct schema on failure
+             columns = [c.name for c in self.model.__table__.columns]
+             return dd.from_pandas(pd.DataFrame(columns=columns), npartitions=1)
+
+
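
One detail worth noting in the refactored error path above: instead of a bare empty DataFrame, build_and_load now returns an empty but schema-correct Dask DataFrame built from the model's columns. A small self-contained sketch of that shape, using a throwaway table definition that is an assumption standing in for self.model.__table__:

# Sketch of the "empty but correctly structured" fallback; the orders table
# below is an illustrative stand-in, not part of the package.
import dask.dataframe as dd
import pandas as pd
import sqlalchemy as sa

orders = sa.Table(
    "orders", sa.MetaData(),
    sa.Column("id", sa.Integer, primary_key=True),
    sa.Column("status", sa.String(20)),
)

columns = [c.name for c in orders.columns]                          # ['id', 'status']
fallback = dd.from_pandas(pd.DataFrame(columns=columns), npartitions=1)
print(fallback.columns.tolist())                                    # schema preserved even with zero rows

Downstream code that selects or joins on specific columns therefore keeps working when a query fails or returns nothing, which the old version (returning pd.DataFrame() with no columns) did not guarantee.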