sibi-dst 0.3.63__py3-none-any.whl → 2025.1.1__py3-none-any.whl

This diff compares the contents of two publicly released versions of this package, as published to a supported registry. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
Files changed (31)
  1. sibi_dst/df_helper/_df_helper.py +186 -591
  2. sibi_dst/df_helper/backends/sqlalchemy/__init__.py +0 -2
  3. sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +161 -115
  4. sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +291 -97
  5. sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +34 -105
  6. sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +175 -162
  7. sibi_dst/df_helper/core/__init__.py +0 -4
  8. sibi_dst/df_helper/core/_defaults.py +1 -50
  9. sibi_dst/df_helper/core/_query_config.py +2 -2
  10. sibi_dst/utils/__init__.py +0 -2
  11. sibi_dst/utils/data_wrapper.py +9 -12
  12. sibi_dst/utils/log_utils.py +15 -11
  13. sibi_dst/utils/update_planner.py +2 -0
  14. sibi_dst/v2/df_helper/backends/sqlalchemy/_db_connection.py +325 -50
  15. sibi_dst/v2/df_helper/backends/sqlalchemy/_io_dask.py +2 -2
  16. sibi_dst/v2/df_helper/backends/sqlmodel/_db_connection.py +330 -51
  17. sibi_dst/v3/__init__.py +0 -0
  18. sibi_dst/v3/backends/__init__.py +0 -0
  19. sibi_dst/v3/df_helper/__init__.py +0 -0
  20. sibi_dst/v3/df_helper/_df_helper.py +91 -0
  21. sibi_dst-2025.1.1.dist-info/METADATA +55 -0
  22. {sibi_dst-0.3.63.dist-info → sibi_dst-2025.1.1.dist-info}/RECORD +23 -26
  23. sibi_dst/df_helper/backends/django/__init__.py +0 -11
  24. sibi_dst/df_helper/backends/django/_db_connection.py +0 -88
  25. sibi_dst/df_helper/backends/django/_io_dask.py +0 -450
  26. sibi_dst/df_helper/backends/django/_load_from_db.py +0 -227
  27. sibi_dst/df_helper/backends/django/_sql_model_builder.py +0 -493
  28. sibi_dst/df_helper/backends/sqlalchemy/_filter_handler.py +0 -119
  29. sibi_dst/utils/airflow_manager.py +0 -212
  30. sibi_dst-0.3.63.dist-info/METADATA +0 -90
  31. {sibi_dst-0.3.63.dist-info → sibi_dst-2025.1.1.dist-info}/WHEEL +0 -0
sibi_dst/df_helper/backends/django/_load_from_db.py (deleted)
@@ -1,227 +0,0 @@
- import warnings
-
- import dask.dataframe as dd
- import pandas as pd
- from django.db.models import Q
-
- from sibi_dst.df_helper.backends.django import ReadFrameDask
- from sibi_dst.df_helper.core import django_field_conversion_map_dask
- from sibi_dst.utils import Logger
-
-
- class DjangoLoadFromDb:
-     """
-     Handles loading data from a Django database into a Dask DataFrame, with support for filtering
-     and column type conversion.
-
-     This class is designed to interface with Django ORM models, allowing data querying and mapping
-     Django model fields to Dask DataFrame columns. It accommodates filtering logic provided via
-     parameters and ensures that excessive data is not accidentally loaded when no filters are applied.
-
-     :ivar connection_config: Configuration for the database connection, including the Django model
-         and connection details.
-     :type connection_config: Any
-     :ivar query_config: Configuration for the query, including the number of records to retrieve.
-     :type query_config: Any
-     :ivar params_config: Configuration for query parameters, including filters and DataFrame options.
-     :type params_config: Any
-     :ivar logger: Logger instance used for debugging and reporting runtime information.
-     :type logger: Logger
-     :ivar debug: Indicates whether debug mode is active for verbose logging.
-     :type debug: bool
-     :ivar df: Dask DataFrame to hold the loaded query results.
-     :type df: dd.DataFrame
-     """
-     df: dd.DataFrame
-
-     def __init__(self, db_connection, db_query, db_params, logger, **kwargs):
-         """
-         Initializes and configures the database connection along with the
-         specified query and parameters. It ensures the required model is defined
-         and sets up logging. Additional configurations can be provided via keyword
-         arguments.
-
-         :param db_connection: The configuration object representing the database
-             connection details.
-         :type db_connection: Any
-         :param db_query: The configuration or object for defining the database
-             query.
-         :type db_query: Any
-         :param db_params: The configuration or object for defining parameters
-             to be passed to the query.
-         :type db_params: Any
-         :param logger: An instance of a logging class used to log debug or
-             error messages; defaults to the class's default logger if not
-             specified.
-         :type logger: Any, optional
-         :param kwargs: Additional keyword arguments for custom configurations
-             like `debug`. These can include optional parameters to be parsed by
-             `params_config`.
-         :type kwargs: dict
-         :raises ValueError: If no model is specified in the given database
-             connection configuration.
-         """
-         self.connection_config = db_connection
-         self.debug = kwargs.pop('debug', False)
-         self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
-         self.logger.set_level(Logger.DEBUG if self.debug else Logger.INFO)
-         if self.connection_config.model is None:
-             if self.debug:
-                 self.logger.debug('Model must be specified')
-
-             raise ValueError('Model must be specified')
-
-         self.query_config = db_query
-         self.params_config = db_params
-         self.params_config.parse_params(kwargs)
-
-     def build_and_load(self):
-         """
-         Builds and loads data into a DataFrame by invoking the `_build_and_load` method.
-         The loaded data is assigned to the instance attribute `df` and returned.
-
-         :return: DataFrame containing the built and loaded data.
-         """
-         self.df = self._build_and_load()
-         # self.df = self._convert_columns(self.df)
-         return self.df
-
-     def _build_and_load(self) -> dd.DataFrame:
-         """
-         Builds and loads a Dask DataFrame based on the provided query and configuration. This method queries the
-         model using the specified connection, applies filters if provided, and converts the query result into a
-         Dask DataFrame. If no filters are provided, only the first `n_records` entries are processed to avoid
-         unintentionally loading the entire table.
-
-         :raises Exception: If an error occurs while loading the query, it logs the error and initializes an
-             empty Dask DataFrame.
-
-         :return: A Dask DataFrame containing the queried data. If no filters or valid results are provided,
-             an empty Dask DataFrame is returned.
-         :rtype: dd.DataFrame
-         """
-         query = self.connection_config.model.objects.using(self.connection_config.connection_name)
-         if not self.params_config.filters:
-             # IMPORTANT: if no filters are provided, show only the first n_records
-             # to prevent loading the entire table by mistake
-             n_records = self.query_config.n_records if self.query_config.n_records else 100
-             queryset = query.all()[:n_records]
-         else:
-             q_objects = self.__build_query_objects(self.params_config.filters, self.query_config.use_exclude)
-             queryset = query.filter(q_objects)
-         if queryset is not None:
-             try:
-                 self.df = ReadFrameDask(queryset, **self.params_config.df_params).read_frame()
-             except Exception as e:
-                 self.logger.debug(f'Error loading query: {str(queryset.query)}, error message: {e}')
-                 self.df = dd.from_pandas(pd.DataFrame(), npartitions=1)
-         else:
-             self.df = dd.from_pandas(pd.DataFrame(), npartitions=1)
-
-         return self.df
-
-     @staticmethod
-     def __build_query_objects(filters: dict, use_exclude: bool):
-         """
-         Constructs and returns a composite Q object based on the provided `filters` dictionary.
-         The function determines whether to include or exclude the filter conditions in the final
-         query based on the `use_exclude` parameter. If `use_exclude` is False, the filters are
-         directly added to the composite Q object. If `use_exclude` is True, the negation of
-         the filters is added instead.
-
-         :param filters: A dictionary containing filter conditions where keys represent field names
-             and values represent the conditions to be applied.
-         :type filters: dict
-         :param use_exclude: A boolean flag determining whether to exclude (`True`) or include
-             (`False`) the provided filter conditions.
-         :type use_exclude: bool
-         :return: A composite Q object that aggregates the filters based on the given conditions.
-         :rtype: Q
-         """
-         q_objects = Q()
-         for key, value in filters.items():
-             if not use_exclude:
-                 q_objects.add(Q(**{key: value}), Q.AND)
-             else:
-                 q_objects.add(~Q(**{key: value}), Q.AND)
-         return q_objects
-
-     def _convert_columns(self, df: dd.DataFrame) -> dd.DataFrame:
-         """
-         [DEPRECATED] Convert the data types of columns in a Dask DataFrame based on the field types in the Django model.
-
-         This method is deprecated and will be removed in a future release. It converts the data
-         types of columns in a Dask DataFrame to match the corresponding field types defined in a Django model.
-         It emits a DeprecationWarning and, in debug mode, logs a deprecation notice. The conversions
-         are applied lazily and partition-wise to support distributed computation.
-
-         :param df: Dask DataFrame whose columns' data types are to be converted.
-         :type df: dd.DataFrame
-         :return: Dask DataFrame with converted column data types.
-         :rtype: dd.DataFrame
-         """
-         # Emit deprecation warning
-         warnings.warn(
-             "_convert_columns is deprecated and will be removed in a future release. "
-             "Consider using <new_method_name> instead.",
-             DeprecationWarning,
-             stacklevel=2,
-         )
-
-         # Log deprecation message if debug mode is enabled
-         if self.debug:
-             self.logger.warning(
-                 "[DEPRECATION NOTICE] The `_convert_columns` method is deprecated and will be removed in a future release. "
-                 "Consider using <new_method_name> instead."
-             )
-
-         self.logger.debug(f'Converting columns: {list(df.columns)}')
-
-         # Get field information from the Django model
-         model_fields = self.connection_config.model._meta.get_fields()
-         field_type_map = {field.name: type(field).__name__ for field in model_fields}
-         # Simplified loop to apply conversions partition-wise
-         for field_name, field_type in field_type_map.items():
-             if field_name not in df.columns:
-                 self.logger.debug(f"Column '{field_name}' not found in DataFrame columns.")
-                 continue
-
-             conversion_func = django_field_conversion_map_dask.get(field_type)
-             if not conversion_func:
-                 self.logger.debug(f"Field type '{field_type}' not found in conversion_map.")
-                 continue
-
-             def apply_conversion(partition):
-                 """
-                 Apply the conversion function to a single partition for the given column.
-                 """
-                 try:
-                     if field_name in partition.columns:
-                         partition[field_name] = conversion_func(partition[field_name])
-                 except Exception as e:
-                     self.logger.debug(f"Error converting column '{field_name}' in partition: {str(e)}")
-                 return partition
-
-             try:
-                 # Apply conversion lazily to each partition
-                 df = df.map_partitions(
-                     apply_conversion,
-                     meta=df,
-                 )
-                 self.logger.debug(f"Successfully queued conversion for column '{field_name}' to type '{field_type}'.")
-             except Exception as e:
-                 self.logger.debug(f"Failed to queue conversion for column '{field_name}': {str(e)}")
-
-         return df
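
For reference, the core of the removed filter logic is easy to reproduce on its own. Below is a minimal standalone sketch (not part of the package) of the Q-object composition that `__build_query_objects` performed; it should need nothing beyond an installed Django, since Q trees can be built without a configured project, and the filter keys are purely illustrative.

from django.db.models import Q


def build_query_objects(filters: dict, use_exclude: bool) -> Q:
    # AND every condition together, negating each term when use_exclude is set,
    # mirroring the include/exclude switch in the removed helper.
    q_objects = Q()
    for key, value in filters.items():
        term = Q(**{key: value})
        q_objects.add(~term if use_exclude else term, Q.AND)
    return q_objects


print(build_query_objects({"status": "active", "created__gte": "2024-01-01"}, use_exclude=False))
# -> (AND: ('status', 'active'), ('created__gte', '2024-01-01'))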
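
The removed `_convert_columns` also illustrates a common Dask pattern: queueing per-column dtype conversions with `map_partitions`. One caveat in the deleted code is that `apply_conversion` closes over the loop variables, which resolve only when partitions execute, so every queued task can end up seeing the last column's converter. Below is a minimal sketch of the pattern with the loop variables pinned via default arguments; the columns and converters are made up for illustration.

import dask.dataframe as dd
import pandas as pd

pdf = pd.DataFrame({"amount": ["1.5", "2.0", "3.25"], "flag": ["1", "0", "1"]})
ddf = dd.from_pandas(pdf, npartitions=2)

# Hypothetical per-column converters keyed by column name.
converters = {
    "amount": lambda s: s.astype("float64"),
    "flag": lambda s: s.astype("int64").astype("bool"),
}

for col, func in converters.items():
    # Default arguments bind col/func now; a bare closure would see only
    # the values from the final loop iteration at compute time.
    def convert(partition, col=col, func=func):
        partition = partition.copy()
        partition[col] = func(partition[col])
        return partition

    ddf = ddf.map_partitions(convert)

print(ddf.compute().dtypes)  # amount -> float64, flag -> bool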