sibi-dst 0.3.63__py3-none-any.whl → 2025.1.1__py3-none-any.whl
This diff shows the changes between publicly released package versions as they appear in their respective registries. It is provided for informational purposes only.
- sibi_dst/df_helper/_df_helper.py +186 -591
- sibi_dst/df_helper/backends/sqlalchemy/__init__.py +0 -2
- sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +161 -115
- sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +291 -97
- sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +34 -105
- sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +175 -162
- sibi_dst/df_helper/core/__init__.py +0 -4
- sibi_dst/df_helper/core/_defaults.py +1 -50
- sibi_dst/df_helper/core/_query_config.py +2 -2
- sibi_dst/utils/__init__.py +0 -2
- sibi_dst/utils/data_wrapper.py +9 -12
- sibi_dst/utils/log_utils.py +15 -11
- sibi_dst/utils/update_planner.py +2 -0
- sibi_dst/v2/df_helper/backends/sqlalchemy/_db_connection.py +325 -50
- sibi_dst/v2/df_helper/backends/sqlalchemy/_io_dask.py +2 -2
- sibi_dst/v2/df_helper/backends/sqlmodel/_db_connection.py +330 -51
- sibi_dst/v3/__init__.py +0 -0
- sibi_dst/v3/backends/__init__.py +0 -0
- sibi_dst/v3/df_helper/__init__.py +0 -0
- sibi_dst/v3/df_helper/_df_helper.py +91 -0
- sibi_dst-2025.1.1.dist-info/METADATA +55 -0
- {sibi_dst-0.3.63.dist-info → sibi_dst-2025.1.1.dist-info}/RECORD +23 -26
- sibi_dst/df_helper/backends/django/__init__.py +0 -11
- sibi_dst/df_helper/backends/django/_db_connection.py +0 -88
- sibi_dst/df_helper/backends/django/_io_dask.py +0 -450
- sibi_dst/df_helper/backends/django/_load_from_db.py +0 -227
- sibi_dst/df_helper/backends/django/_sql_model_builder.py +0 -493
- sibi_dst/df_helper/backends/sqlalchemy/_filter_handler.py +0 -119
- sibi_dst/utils/airflow_manager.py +0 -212
- sibi_dst-0.3.63.dist-info/METADATA +0 -90
- {sibi_dst-0.3.63.dist-info → sibi_dst-2025.1.1.dist-info}/WHEEL +0 -0
sibi_dst/df_helper/backends/django/_load_from_db.py (removed)

```diff
@@ -1,227 +0,0 @@
-import warnings
-
-import dask.dataframe as dd
-import pandas as pd
-from django.db.models import Q
-
-from sibi_dst.df_helper.backends.django import ReadFrameDask
-from sibi_dst.df_helper.core import django_field_conversion_map_dask
-from sibi_dst.utils import Logger
-
-
-class DjangoLoadFromDb:
-    """
-    Handles loading data from a Django database into a Dask DataFrame, with support for filtering
-    and column type conversion.
-
-    This class is designed to interface with Django ORM models, allowing data querying and mapping
-    Django model fields to Dask DataFrame columns. It accommodates filtering logic provided via
-    parameters and ensures that excessive data is not accidentally loaded when no filters are applied.
-
-    :ivar connection_config: Configuration for the database connection, including the Django model
-        and connection details.
-    :type connection_config: Any
-    :ivar query_config: Configuration for the query, including the number of records to retrieve.
-    :type query_config: Any
-    :ivar params_config: Configuration for query parameters, including filters and DataFrame options.
-    :type params_config: Any
-    :ivar logger: Logger instance used for debugging and reporting runtime information.
-    :type logger: Logger
-    :ivar debug: Indicates whether debug mode is active for verbose logging.
-    :type debug: bool
-    :ivar df: Dask DataFrame to hold the loaded query results.
-    :type df: dd.DataFrame
-    """
-    df: dd.DataFrame
-
-    def __init__(self, db_connection, db_query, db_params, logger, **kwargs):
-        """
-        This class initializes and configures a database connection along with the
-        specified query and parameters. It ensures the required model is defined
-        and sets up logging. Additional configurations can be provided via keyword
-        arguments.
-
-        :param db_connection: The configuration object representing the database
-            connection details.
-        :type db_connection: Any
-        :param db_query: The configuration or object for defining the database
-            query.
-        :type db_query: Any
-        :param db_params: The configuration or object for defining parameters
-            to be passed to the query.
-        :type db_params: Any
-        :param logger: An instance of a logging class used to log debug or
-            error messages, defaults to the class's default logger if not
-            specified.
-        :type logger: Any, optional
-        :param kwargs: Additional keyword arguments for custom configurations
-            like `debug`. These can include optional parameters to be parsed by
-            `params_config`.
-        :type kwargs: dict
-        :raises ValueError: If no model is specified in the given database
-            connection configuration.
-        """
-        self.connection_config = db_connection
-        self.debug = kwargs.pop('debug', False)
-        self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
-        self.logger.set_level(Logger.DEBUG if self.debug else Logger.INFO)
-        if self.connection_config.model is None:
-            if self.debug:
-                self.logger.debug('Model must be specified')
-
-            raise ValueError('Model must be specified')
-
-        self.query_config = db_query
-        self.params_config = db_params
-        self.params_config.parse_params(kwargs)
-
-    def build_and_load(self):
-        """
-        Builds and loads data into a DataFrame by invoking the `_build_and_load` method.
-        This is a utility method designed to perform specific operations for constructing
-        and preparing the data. The loaded data will then be assigned to the instance
-        attribute `df`.
-
-        :param self: Reference to the current instance of the class.
-        :type self: object
-
-        :return: DataFrame containing the built and loaded data.
-        """
-        self.df = self._build_and_load()
-        # self.df = self._convert_columns(self.df)
-        return self.df
-
-    def _build_and_load(self) -> dd.DataFrame:
-        """
-        Builds and loads a Dask DataFrame based on the provided query and configuration. This method queries the data
-        model using the specified connection, applies filters if provided, and converts the query result into a
-        Dask DataFrame. If filters are not provided, only the first `n_records` entries are processed to avoid
-        unintentionally loading the entire table.
-
-        :raises Exception: If an error occurs while loading the query, it logs the error and initializes an
-            empty Dask DataFrame.
-
-        :return: A Dask DataFrame containing the queried data. If no filters or valid results are provided,
-            an empty Dask DataFrame is returned.
-        :rtype: dd.DataFrame
-        """
-        query = self.connection_config.model.objects.using(self.connection_config.connection_name)
-        if not self.params_config.filters:
-            # IMPORTANT: if no filters are provided show only the first n_records
-            # this is to prevent loading the entire table by mistake
-            n_records = self.query_config.n_records if self.query_config.n_records else 100
-            queryset = query.all()[:n_records]
-        else:
-            q_objects = self.__build_query_objects(self.params_config.filters, self.query_config.use_exclude)
-            queryset = query.filter(q_objects)
-        if queryset is not None:
-            try:
-                self.df = ReadFrameDask(queryset, **self.params_config.df_params).read_frame()
-            except Exception as e:
-                self.logger.debug(f'Error loading query: {str(queryset.query)}, error message: {e}')
-                self.df = dd.from_pandas(pd.DataFrame(), npartitions=1)
-        else:
-            self.df = dd.from_pandas(pd.DataFrame(), npartitions=1)
-
-        return self.df
-
-    @staticmethod
-    def __build_query_objects(filters: dict, use_exclude: bool):
-        """
-        Constructs and returns a composite Q object based on the provided `filters` dictionary.
-        The function determines whether to include or exclude the filter conditions in the final
-        query based on the `use_exclude` parameter. If `use_exclude` is False, the filters are
-        directly added to the composite Q object. If `use_exclude` is True, the negation of
-        the filters is added instead.
-
-        :param filters: A dictionary containing filter conditions where keys represent field names
-            and values represent the conditions to be applied.
-        :type filters: dict
-        :param use_exclude: A boolean flag determining whether to exclude (`True`) or include
-            (`False`) the provided filter conditions.
-        :type use_exclude: bool
-        :return: A composite Q object that aggregates the filters based on the given conditions.
-        :rtype: Q
-        """
-        q_objects = Q()
-        for key, value in filters.items():
-            if not use_exclude:
-                q_objects.add(Q(**{key: value}), Q.AND)
-            else:
-                q_objects.add(~Q(**{key: value}), Q.AND)
-        return q_objects
-
-    def _convert_columns(self, df: dd.DataFrame) -> dd.DataFrame:
-        """
-        [DEPRECATED] Convert the data types of columns in a Dask DataFrame based on the field type in the Django model.
-
-        This function is deprecated and will be removed in a future release. The method converts the data
-        types of columns in a Dask DataFrame to match their corresponding field types defined in a Django model.
-        It emits warnings and logs deprecation notes. The conversions are applied lazily and partition-wise
-        to support distributed computation.
-
-        :param df: Dask DataFrame whose columns' data types are to be converted.
-        :type df: dd.DataFrame
-        :return: Dask DataFrame with converted column data types.
-        :rtype: dd.DataFrame
-        """
-        """
-        [DEPRECATED] Convert the data types of columns in a Dask DataFrame based on the field type in the Django model.
-
-        :param df: Dask DataFrame whose columns' data types are to be converted.
-        :return: Dask DataFrame with converted column data types.
-        """
-        # Emit deprecation warning
-        warnings.warn(
-            "_convert_columns is deprecated and will be removed in a future release. "
-            "Consider using <new_method_name> instead.",
-            DeprecationWarning,
-            stacklevel=2,
-        )
-
-        # Log deprecation message if debug mode is enabled
-        if self.debug:
-            self.logger.warning(
-                "[DEPRECATION NOTICE] The `_convert_columns` method is deprecated and will be removed in a future release. "
-                "Consider using <new_method_name> instead."
-            )
-
-        self.logger.debug(f'Converting columns: {list(df.columns)}')
-
-        # Get field information from the Django model
-        model_fields = self.connection_config.model._meta.get_fields()
-        field_type_map = {field.name: type(field).__name__ for field in model_fields}
-        # Simplified loop to apply conversions partition-wise
-        for field_name, field_type in field_type_map.items():
-            if field_name not in df.columns:
-                self.logger.debug(f"Column '{field_name}' not found in DataFrame columns.")
-                continue
-
-            conversion_func = django_field_conversion_map_dask.get(field_type)
-            if not conversion_func:
-                message = f"Field type '{field_type}' not found in conversion_map."
-                self.logger.debug(message)
-                continue
-
-            def apply_conversion(partition):
-                """
-                Apply the conversion function to a single partition for the given column.
-                """
-                try:
-                    if field_name in partition.columns:
-                        partition[field_name] = conversion_func(partition[field_name])
-                except Exception as e:
-                    self.logger.debug(f"Error converting column '{field_name}' in partition: {str(e)}")
-                return partition
-
-            try:
-                # Apply conversion lazily to each partition
-                df = df.map_partitions(
-                    apply_conversion,
-                    meta=df,
-                )
-                self.logger.debug(f"Successfully queued conversion for column '{field_name}' to type '{field_type}'.")
-            except Exception as e:
-                self.logger.debug(f"Failed to queue conversion for column '{field_name}': {str(e)}")
-
-        return df
```