sibi-dst 0.3.62-py3-none-any.whl → 0.3.64-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sibi_dst/df_helper/_df_helper.py +184 -591
- sibi_dst/df_helper/_parquet_artifact.py +2 -0
- sibi_dst/df_helper/backends/sqlalchemy/__init__.py +0 -2
- sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +161 -115
- sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +141 -97
- sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +34 -105
- sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +175 -162
- sibi_dst/df_helper/core/_query_config.py +2 -2
- sibi_dst/utils/data_wrapper.py +2 -2
- sibi_dst/utils/log_utils.py +15 -11
- sibi_dst/v2/df_helper/backends/sqlalchemy/_db_connection.py +325 -50
- sibi_dst/v2/df_helper/backends/sqlalchemy/_io_dask.py +2 -2
- sibi_dst/v2/df_helper/backends/sqlmodel/_db_connection.py +330 -51
- sibi_dst/v3/__init__.py +0 -0
- sibi_dst/v3/backends/__init__.py +0 -0
- sibi_dst/v3/df_helper/__init__.py +0 -0
- sibi_dst/v3/df_helper/_df_helper.py +91 -0
- {sibi_dst-0.3.62.dist-info → sibi_dst-0.3.64.dist-info}/METADATA +1 -1
- {sibi_dst-0.3.62.dist-info → sibi_dst-0.3.64.dist-info}/RECORD +20 -17
- sibi_dst/df_helper/backends/sqlalchemy/_filter_handler.py +0 -119
- {sibi_dst-0.3.62.dist-info → sibi_dst-0.3.64.dist-info}/WHEEL +0 -0
sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py

@@ -1,135 +1,179 @@
-import
+from typing import Type

+import dask
 import dask.dataframe as dd
 import pandas as pd
-from sqlalchemy import
-
-
+from sqlalchemy import (
+    inspect,
+    select,
+    func,
+)
+from sqlalchemy.engine import Engine
+from sqlalchemy.orm import declarative_base
+import time
+from sqlalchemy.exc import TimeoutError
+import sqlalchemy as sa
 from sibi_dst.df_helper.core import FilterHandler
 from sibi_dst.utils import Logger


 class SQLAlchemyDask:
-
+    """
+    Loads data from a database into a Dask DataFrame using a memory-safe,
+    non-parallel, paginated approach.
+
+    This class avoids using a numeric `index_col for parallel loading.
+    """
+
+    _SQLALCHEMY_TO_DASK_DTYPE = {
+        "INTEGER": "Int64",
+        "SMALLINT": "Int64",
+        "BIGINT": "Int64",
+        "FLOAT": "float64",
+        "NUMERIC": "float64",
+        "BOOLEAN": "bool",
+        "VARCHAR": "object",
+        "TEXT": "object",
+        "DATE": "datetime64[ns]",
+        "DATETIME": "datetime64[ns]",
+        "TIME": "object",
+        "UUID": "object",
+    }
+
+    def __init__(
+        self,
+        model: Type[declarative_base()],
+        filters: dict,
+        engine: Engine,
+        chunk_size: int = 1000,
+        logger=None,
+        debug: bool = False,
+    ):
         """
-
-
-        :
-
-
-
-
-
+        Initializes the data loader.
+
+        Args:
+            model: The SQLAlchemy ORM model for the table.
+            filters: A dictionary of filters to apply to the query.
+            engine: An SQLAlchemy Engine instance.
+            chunk_size: The number of records to fetch in each database query.
+            logger: A logger instance.
+            debug: Whether to enable detailed logging.
         """
-        self.query = None
         self.model = model
         self.filters = filters
+        self.engine = engine
         self.chunk_size = chunk_size
         self.debug = debug
-        self.engine = create_engine(engine_url)
-        self.Session = sessionmaker(bind=self.engine)
         self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
-        self.logger.set_level(
+        self.logger.set_level(Logger.DEBUG if debug else Logger.INFO)
+        self.filter_handler_cls = FilterHandler

-    @
-    def
+    @classmethod
+    def infer_meta_from_model(cls, model: Type[declarative_base()]) -> dict:
         """
-
+        Infers a metadata dictionary for Dask based on the SQLAlchemy model.
+        This helps Dask understand the DataFrame structure without reading data.
         """
         mapper = inspect(model)
-        sqlalchemy_to_dask_dtype = {
-            'INTEGER': 'Int64',
-            'SMALLINT': 'Int64',
-            'BIGINT': 'Int64',
-            'FLOAT': 'float64',
-            'NUMERIC': 'float64',
-            'BOOLEAN': 'bool',
-            'VARCHAR': 'object',
-            'TEXT': 'object',
-            'DATE': 'datetime64[ns]',
-            'DATETIME': 'datetime64[ns]',
-            'TIME': 'object',
-            'UUID': 'object',
-        }
-
         dtypes = {}
         for column in mapper.columns:
-
+            dtype_str = str(column.type).upper().split("(")[0]
+            dtype = cls._SQLALCHEMY_TO_DASK_DTYPE.get(dtype_str, "object")
             dtypes[column.name] = dtype
-
         return dtypes

-    def read_frame(self, fillna_value=None):
+    def read_frame(self, fillna_value=None) -> dd.DataFrame:
         """
-
+        Builds and executes a query to load data into a Dask DataFrame.

-
-
+        This method works by first running a COUNT query to get the total
+        size, then creating a series of delayed tasks that each fetch a
+        chunk of data using LIMIT/OFFSET.
+
+        Args:
+            fillna_value: Value to replace NaN or NULL values with, if any.
+
+        Returns:
+            A lazy Dask DataFrame.
         """
-
+        # 1. Build the base query and apply filters
+        query = select(self.model)
+        if self.filters:
+            query = self.filter_handler_cls(
+                backend="sqlalchemy", logger=self.logger, debug=self.debug
+            ).apply_filters(query, model=self.model, filters=self.filters)
+
+        self.logger.debug(f"Base query for pagination: {query}")
+
+        # 2. Get metadata for the Dask DataFrame structure
+        ordered_columns = [column.name for column in self.model.__table__.columns]
+        meta_dtypes = self.infer_meta_from_model(self.model)
+        meta_df = pd.DataFrame(columns=ordered_columns).astype(meta_dtypes)
+
+        # 3. Get the total record count to calculate the number of chunks
+        # try:
+        #     with self.engine.connect() as connection:
+        #         count_query = select(func.count()).select_from(query.alias())
+        #         total_records = connection.execute(count_query).scalar_one()
+        # except Exception as e:
+        #     self.logger.error(f"Failed to count records for pagination: {e}", exc_info=True)
+        #     return dd.from_pandas(meta_df, npartitions=1)
+        retry_attempts = 3
+        backoff_factor = 0.5  # start with a 0.5-second delay
+
+        for attempt in range(retry_attempts):
             try:
-
-
-
-
-
-
+                with self.engine.connect() as connection:
+                    count_query = sa.select(sa.func.count()).select_from(query.alias())
+                    total_records = connection.execute(count_query).scalar_one()
+
+                # If successful, break the loop
+                break
+
+            except TimeoutError:
+                if attempt < retry_attempts - 1:
+                    self.logger.warning(
+                        f"Connection pool limit reached. Retrying in {backoff_factor} seconds..."
+                    )
+                    time.sleep(backoff_factor)
+                    backoff_factor *= 2  # Double the backoff time for the next attempt
                 else:
-
-
-
-                    # Infer dtypes
-                    dtypes = self.infer_dtypes_from_model(self.model)
-                    # Get the column order from the SQLAlchemy model
-                    ordered_columns = [column.name for column in self.model.__table__.columns]
-
-                    # Execute query and fetch results in chunks
-                    result_proxy = session.execute(self.query)
-                    results = result_proxy.scalars().all()  # Fetch all rows
-                    iterator = iter(results)
-
-                    partitions = []
-
-                    while True:
-                        chunk = list(itertools.islice(iterator, self.chunk_size))
-                        if not chunk:
-                            break
-
-                        # Convert chunk to Pandas DataFrame
-                        df = pd.DataFrame.from_records(
-                            [row._asdict() if hasattr(row, '_asdict') else row.__dict__ for row in chunk]
+                    self.logger.error(
+                        "Failed to get a connection from the pool after several retries.",
+                        exc_info=True
                     )
-
-
+                    return dd.from_pandas(meta_df, npartitions=1)
+
+            except Exception as e:
+                self.logger.error(f"An unexpected error occurred: {e}", exc_info=True)
+                return dd.from_pandas(meta_df, npartitions=1)

-
-
+        if total_records == 0:
+            self.logger.warning("Query returned 0 records.")
+            return dd.from_pandas(meta_df, npartitions=1)

-
-                        if fillna_value is not None:
-                            df = df.fillna(fillna_value)
+        self.logger.debug(f"Total records to fetch: {total_records}. Chunk size: {self.chunk_size}.")

-
-
-
-
+        # 4. Create a list of Dask Delayed objects, one for each chunk
+        @dask.delayed
+        def get_chunk(sql_query, chunk_offset):
+            """A Dask-delayed function to fetch one chunk of data."""
+            # LIMIT/OFFSET must be applied in the delayed function
+            paginated_query = sql_query.limit(self.chunk_size).offset(chunk_offset)
+            df = pd.read_sql(paginated_query, self.engine)

-
-
-                        # Create a Dask partition
-                        partitions.append(dd.from_pandas(df, npartitions=1))
+            if fillna_value is not None:
+                df = df.fillna(fillna_value)

-
-
-                    dask_df = dd.concat(partitions, axis=0, ignore_index=True)
-                else:
-                    dask_df = dd.from_pandas(pd.DataFrame(columns=ordered_columns), npartitions=1)
+            # Ensure column order and types match the meta
+            return df[ordered_columns].astype(meta_dtypes)

-
+        offsets = range(0, total_records, self.chunk_size)
+        delayed_chunks = [get_chunk(query, offset) for offset in offsets]

-
+        # 5. Construct the final lazy Dask DataFrame from the delayed chunks
+        ddf = dd.from_delayed(delayed_chunks, meta=meta_df)
+        self.logger.debug(f"Successfully created a lazy Dask DataFrame with {ddf.npartitions} partitions.")

-
-            self.logger.error(f"Error executing query: {str(e)}")
-            self.logger.error(self.query)
-            return dd.from_pandas(pd.DataFrame(columns=ordered_columns), npartitions=1)
+        return ddf
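The rewritten read_frame above drops the old "fetch all rows, then slice with itertools" path in favour of one COUNT query followed by lazy LIMIT/OFFSET chunks built with dask.delayed. The following is a minimal, self-contained sketch of that pagination pattern against a throwaway in-memory SQLite table with made-up columns; it is not code taken from the package.

import dask
import dask.dataframe as dd
import pandas as pd
import sqlalchemy as sa
from sqlalchemy.pool import StaticPool

# Throwaway SQLite database shared across threads so the example runs as-is.
engine = sa.create_engine(
    "sqlite://", connect_args={"check_same_thread": False}, poolclass=StaticPool
)
with engine.begin() as conn:
    conn.execute(sa.text("CREATE TABLE items (id INTEGER, name TEXT)"))
    conn.execute(
        sa.text("INSERT INTO items (id, name) VALUES (:id, :name)"),
        [{"id": i, "name": f"item-{i}"} for i in range(25)],
    )

chunk_size = 10
query = sa.select(sa.table("items", sa.column("id"), sa.column("name")))

# 1. One COUNT query up front fixes the number of chunks (and partitions).
with engine.connect() as conn:
    total = conn.execute(
        sa.select(sa.func.count()).select_from(query.subquery())
    ).scalar_one()

# 2. Each chunk is a delayed LIMIT/OFFSET read; nothing touches the database yet.
@dask.delayed
def get_chunk(offset):
    return pd.read_sql(query.limit(chunk_size).offset(offset), engine)

meta = pd.DataFrame({"id": pd.Series(dtype="int64"), "name": pd.Series(dtype="object")})
ddf = dd.from_delayed(
    [get_chunk(offset) for offset in range(0, total, chunk_size)], meta=meta
)

print(ddf.npartitions)      # 3 partitions of up to 10 rows each
print(len(ddf.compute()))   # 25 -- the paginated queries run only here

Because each partition is bounded by chunk_size, memory use stays flat regardless of table size, at the cost of one extra COUNT round trip and OFFSET scans on large offsets.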
sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py

@@ -3,143 +3,72 @@ import pandas as pd

 from sibi_dst.df_helper.core import ParamsConfig, QueryConfig
 from sibi_dst.utils import Logger
-from ._io_dask import SQLAlchemyDask
 from ._db_connection import SqlAlchemyConnectionConfig
+from ._io_dask import SQLAlchemyDask


 class SqlAlchemyLoadFromDb:
     """
-
-
-    large datasets efficiently by utilizing the Dask framework for parallel
-    computations.
-
-    This class is initialized with a database connection configuration, query
-    configuration, optional parameters, and a logger. It can execute a query
-    using the specified configurations and read the results into a Dask
-    DataFrame. This is useful for processing and analyzing large-scale data.
-
-    :ivar df: Dask DataFrame to store the loaded data.
-    :type df: dd.DataFrame
-    :ivar db_connection: Database connection configuration object, containing details
-        such as the table, model, and engine to be used for the query.
-    :type db_connection: SqlAlchemyConnectionConfig
-    :ivar table_name: Name of the database table being queried.
-    :type table_name: str
-    :ivar model: SQLAlchemy model associated with the database connection.
-    :type model: sqlalchemy.ext.declarative.api.DeclarativeMeta
-    :ivar engine: SQLAlchemy engine used for executing queries.
-    :type engine: sqlalchemy.engine.base.Engine
-    :ivar logger: Logger instance for logging debug and error information.
-    :type logger: Logger
-    :ivar query_config: Query configuration, including query-related details such
-        as the SQL query or query settings.
-    :type query_config: QueryConfig
-    :ivar params_config: Parameters configuration, including filter parameters for
-        the query.
-    :type params_config: ParamsConfig
-    :ivar debug: Debug flag indicating whether debug mode is enabled.
-    :type debug: bool
-    :ivar chunk_size: Size of data chunks to process at a time.
-    :type chunk_size: int
+    Orchestrates loading data from a database using SQLAlchemy into a Dask
+    DataFrame by configuring and delegating to the SQLAlchemyDask loader.
     """
-    df: dd.DataFrame = None

     def __init__(
         self,
-        plugin_sqlalchemy: SqlAlchemyConnectionConfig,
+        plugin_sqlalchemy: SqlAlchemyConnectionConfig,
         plugin_query: QueryConfig = None,
         plugin_params: ParamsConfig = None,
         logger: Logger = None,
         **kwargs,
     ):
         """
-        Initializes
-
-
-
-
-
-
-
-            associated with the database operations.
-        :param plugin_query:
-            The query configuration object, used to define specific query
-            options or rules. Defaults to None.
-        :param plugin_params:
-            The parameters configuration object, used for any additional
-            parameterized settings or configurations. Defaults to None.
-        :param logger:
-            Optional logger instance for logging purposes. If not provided,
-            a default logger is instantiated using the standard logging system.
-        :param kwargs:
-            Optional additional keyword arguments for customization. Can
-            include optional settings like `debug` mode or `chunk_size`
-            for batch operations.
+        Initializes the loader with all necessary configurations.
+
+        Args:
+            plugin_sqlalchemy: The database connection configuration object.
+            plugin_query: The query configuration object.
+            plugin_params: The parameters and filters configuration object.
+            logger: An optional logger instance.
+            **kwargs: Must contain 'index_column' for Dask partitioning.
         """
         self.db_connection = plugin_sqlalchemy
-        self.table_name = self.db_connection.table
         self.model = self.db_connection.model
         self.engine = self.db_connection.engine
         self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
         self.query_config = plugin_query
         self.params_config = plugin_params
-        self.debug = kwargs.
-        self.chunk_size = kwargs.
+        self.debug = kwargs.get("debug", False)
+        self.chunk_size = kwargs.get("chunk_size", self.params_config.df_params.get("chunk_size", 1000))

     def build_and_load(self) -> dd.DataFrame:
         """
-        Builds and
-        build and load function. This method triggers the `_build_and_load`
-        function to process and prepare the data before returning it as
-        a dask dataframe.
+        Builds and loads a Dask DataFrame from a SQLAlchemy source.

-
+        This method is stateless and returns the DataFrame directly.

-        :
-
-
-        self._build_and_load()
-        return self.df
-
-    def _build_and_load(self) -> dd.DataFrame:
-        """
-        Builds and loads a Dask DataFrame from a SQLAlchemy-compatible source.
-
-        This method initializes a SQLAlchemyDask object with the provided model,
-        filters, engine URL, logger, chunk size, and debug configuration.
-        It attempts to load the data using the ``read_frame`` method of
-        SQLAlchemyDask. If the data cannot be loaded or the query returns
-        no rows, it creates and returns an empty Dask DataFrame.
-
-        :raises Exception: On failure to load data or to create a DataFrame.
-
-        :return: A Dask DataFrame object containing the queried data or an
-            empty DataFrame if the query returns no results or fails.
-        :rtype: dask.dataframe.DataFrame
+        Returns:
+            A Dask DataFrame containing the queried data or an empty,
+            correctly structured DataFrame if the query fails or returns no results.
         """
         try:
-
+            # Instantiate and use the low-level Dask loader
+            sqlalchemy_dask_loader=SQLAlchemyDask(
                 model=self.model,
-                filters=self.params_config.filters,
-
-                logger=self.logger,
+                filters=self.params_config.filters if self.params_config else {},
+                engine=self.engine,
                 chunk_size=self.chunk_size,
+                logger=self.logger,
                 debug=self.debug
-            )
-
-
-
-            dask_df = dd.from_pandas(pd.DataFrame(), npartitions=1)
+            )
+            # Create the lazy DataFrame
+            dask_df = sqlalchemy_dask_loader.read_frame()
+            return dask_df

-            return dask_df

-            return self.df
-        except RuntimeError as e:
-            self.logger.info(f"Runtime Error {e}:Failed to load data into Dask DataFrame.")
-            dask_df = dd.from_pandas(pd.DataFrame(), npartitions=1)
-            return dask_df
         except Exception as e:
-            self.logger.
-
-
+            self.logger.error(f"Failed to build and load data: {e}", exc_info=True)
+            # Return an empty dataframe with the correct schema on failure
+            columns = [c.name for c in self.model.__table__.columns]
+            return dd.from_pandas(pd.DataFrame(columns=columns), npartitions=1)
+
+
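The except branch in the new build_and_load returns an empty but correctly structured DataFrame instead of a bare pd.DataFrame(). A minimal sketch of that fallback, using a hypothetical declarative model (not one shipped by sibi_dst), shows why it matters: downstream code still sees the expected columns even when the load fails.

import dask.dataframe as dd
import pandas as pd
from sqlalchemy import Column, Integer, String
from sqlalchemy.orm import declarative_base

Base = declarative_base()

class Item(Base):  # hypothetical model, stands in for self.model
    __tablename__ = "items"
    id = Column(Integer, primary_key=True)
    name = Column(String)

# Mirror of the fallback in build_and_load: column names come from the model's table.
columns = [c.name for c in Item.__table__.columns]
empty_ddf = dd.from_pandas(pd.DataFrame(columns=columns), npartitions=1)

print(list(empty_ddf.columns))   # ['id', 'name']
print(len(empty_ddf))            # 0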