sibi-dst 0.3.64__py3-none-any.whl → 2025.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sibi_dst/df_helper/_df_helper.py +5 -3
- sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +163 -13
- sibi_dst/df_helper/core/__init__.py +0 -4
- sibi_dst/df_helper/core/_defaults.py +1 -50
- sibi_dst/utils/__init__.py +0 -2
- sibi_dst/utils/data_wrapper.py +9 -12
- sibi_dst/utils/update_planner.py +2 -0
- sibi_dst-2025.1.2.dist-info/METADATA +55 -0
- {sibi_dst-0.3.64.dist-info → sibi_dst-2025.1.2.dist-info}/RECORD +10 -16
- sibi_dst/df_helper/backends/django/__init__.py +0 -11
- sibi_dst/df_helper/backends/django/_db_connection.py +0 -88
- sibi_dst/df_helper/backends/django/_io_dask.py +0 -450
- sibi_dst/df_helper/backends/django/_load_from_db.py +0 -227
- sibi_dst/df_helper/backends/django/_sql_model_builder.py +0 -493
- sibi_dst/utils/airflow_manager.py +0 -212
- sibi_dst-0.3.64.dist-info/METADATA +0 -90
- {sibi_dst-0.3.64.dist-info → sibi_dst-2025.1.2.dist-info}/WHEEL +0 -0
@@ -1,450 +0,0 @@
|
|
1
|
-
import itertools
|
2
|
-
|
3
|
-
import dask.dataframe as dd
|
4
|
-
import django
|
5
|
-
import pandas as pd
|
6
|
-
from django.core.cache import cache
|
7
|
-
from django.core.exceptions import FieldDoesNotExist
|
8
|
-
from django.db import models
|
9
|
-
from django.db.models import Field
|
10
|
-
from django.utils.encoding import force_str as force_text
|
11
|
-
|
12
|
-
|
13
|
-
class ReadFrameDask:
    """
    Convert a Django ORM QuerySet into a Dask DataFrame.

    The QuerySet is consumed in chunks; every chunk becomes a pandas DataFrame
    that is cast to dtypes inferred from the Django model and wrapped as a
    single Dask partition. Optionally ("verbose" mode) columns backed by fields
    with ``choices`` or by ``ForeignKey`` fields are replaced with readable
    labels.

    :ivar qs: The Django QuerySet to be converted into a Dask DataFrame.
    :type qs: django.db.models.query.QuerySet
    :ivar coerce_float: Forwarded to ``pandas.DataFrame.from_records``.
    :type coerce_float: bool
    :ivar chunk_size: Number of records fetched and processed per chunk.
    :type chunk_size: int
    :ivar verbose: If True, replace choice / foreign-key columns with their
        readable representation.
    :type verbose: bool
    """

    # Kept as a class attribute so `to_fields` (and any external caller) can
    # keep catching `ReadFrameDask.FieldDoesNotExist`. The former version
    # switch selected the very same class in both branches.
    FieldDoesNotExist = django.core.exceptions.FieldDoesNotExist

    # Django internal field type -> pandas/Dask dtype. Integer types use the
    # nullable "Int64" so that NULL database values survive the cast.
    _DJANGO_TO_DASK_DTYPE = {
        'AutoField': 'Int64',
        'BigAutoField': 'Int64',
        'BigIntegerField': 'Int64',
        'BooleanField': 'bool',
        'CharField': 'object',
        'DateField': 'datetime64[ns]',
        'DateTimeField': 'datetime64[ns]',
        'DecimalField': 'float64',
        'FloatField': 'float64',
        'IntegerField': 'Int64',
        'PositiveIntegerField': 'Int64',
        'SmallIntegerField': 'Int64',
        'TextField': 'object',
        'TimeField': 'object',
        'UUIDField': 'object',
        'ForeignKey': 'Int64',  # fallback when the target field is unknown
    }

    def __init__(
            self,
            qs,
            **kwargs,
    ):
        """
        Store the QuerySet and the conversion options.

        :param qs: The QuerySet to convert.
        :param kwargs: Optional settings:
            - coerce_float: passed to ``DataFrame.from_records``. Default False.
            - chunk_size: records per chunk / partition. Default 1000.
            - verbose: replace choice and foreign-key columns with readable
              labels. Default True.
        """
        self.qs = qs
        self.coerce_float = kwargs.get("coerce_float", False)
        self.chunk_size = kwargs.get("chunk_size", 1000)
        self.verbose = kwargs.get("verbose", True)

    @staticmethod
    def replace_from_choices(choices):
        """
        Build a function that maps raw values to their choice labels.

        :param choices: Dictionary mapping stored values to display labels.
        :return: A function taking an iterable of values. Values missing from
            ``choices`` are left unchanged. A pandas Series input yields a
            Series with the same index and name (so the function can be used
            with ``map_partitions``); any other iterable yields a list.
        """

        def inner(values):
            replaced = [choices.get(v, v) for v in values]
            if isinstance(values, pd.Series):
                return pd.Series(replaced, index=values.index, name=values.name, dtype=object)
            return replaced

        return inner

    @staticmethod
    def get_model_name(model):
        """
        Return ``model._meta.model_name``.

        :param model: A Django model class or instance.
        :return: The name of the model as a string.
        :rtype: str
        """
        return model._meta.model_name

    @staticmethod
    def get_related_model(field):
        """
        Return the model a relation field points to.

        :param field: A field object that may expose ``related_model`` or the
            legacy ``rel`` attribute.
        :return: The related model, or None if the field exposes neither.
        """
        model = None
        if hasattr(field, "related_model") and field.related_model:
            model = field.related_model
        elif hasattr(field, "rel") and field.rel:
            # Legacy attribute of old Django releases.
            model = field.rel.to
        return model

    @classmethod
    def get_base_cache_key(cls, model):
        """
        Build the cache-key template used for rendered foreign-key labels.

        :param model: The Django model the cached labels belong to.
        :return: A ``%``-format template containing one ``%s`` placeholder for
            the primary key, namespaced by app label and model name.
        :rtype: str
        """
        return (
            f"dask_{model._meta.app_label}_{cls.get_model_name(model)}_%s_rendering"
        )

    @classmethod
    def replace_pk(cls, model):
        """
        Build a function that replaces primary keys with the string
        representation of the referenced ``model`` instances.

        Labels are looked up in the Django cache first; only the primary keys
        missing from the cache are fetched from the database, and the fetched
        labels are written back to the cache.

        :param model: The Django model class the primary keys refer to.
        :return: A function that takes a pandas Series of primary keys and
            returns an object Series (same index and name) of labels. Null
            keys and keys without a matching row become None.
        :rtype: callable
        """
        base_cache_key = cls.get_base_cache_key(model)

        def inner(pk_series):
            def as_series(values):
                return pd.Series(values, index=pk_series.index, name=pk_series.name, dtype=object)

            pks = [None if pd.isna(pk) else pk for pk in pk_series]
            # Compare against None explicitly: a primary key of 0 is valid and
            # must not be discarded by a truthiness test.
            key_by_pk = {pk: base_cache_key % str(pk) for pk in pks if pk is not None}
            if not key_by_pk:
                return as_series(pks)

            labels = dict(cache.get_many(list(key_by_pk.values())))
            missing_pks = [pk for pk, key in key_by_pk.items() if key not in labels]
            if missing_pks:
                fetched = {
                    base_cache_key % str(obj.pk): force_text(obj)
                    for obj in model.objects.filter(pk__in=missing_pks)
                }
                if fetched:
                    cache.set_many(fetched)
                    labels.update(fetched)

            return as_series(
                [None if pk is None else labels.get(key_by_pk[pk]) for pk in pks]
            )

        return inner

    @classmethod
    def build_update_functions(cls, fieldnames, fields):
        """
        Yield, per field, the function that turns raw column values into
        readable ones.

        :param fieldnames: Field names, paired positionally with ``fields``.
        :type fieldnames: list[str]
        :param fields: Field objects corresponding to ``fieldnames``.
        :type fields: list[Field]
        :return: A generator of ``(fieldname, function)`` tuples. ``function``
            is None for entries that are not Django ``Field`` instances.
            ``Field`` instances that neither define choices nor are a
            ``ForeignKey`` yield nothing.
        :rtype: generator[tuple[str, Callable | None]]
        """
        for fieldname, field in zip(fieldnames, fields):
            if not isinstance(field, Field):
                yield fieldname, None
            elif field.choices:
                choices = {k: force_text(v) for k, v in field.flatchoices}
                yield fieldname, cls.replace_from_choices(choices)
            elif field.get_internal_type() == "ForeignKey":
                yield fieldname, cls.replace_pk(cls.get_related_model(field))

    @classmethod
    def update_with_verbose(cls, df, fieldnames, fields):
        """
        Replace choice and foreign-key columns of a Dask DataFrame with their
        readable representation.

        The frame is updated in place and also returned.

        :param df: The Dask DataFrame to update.
        :param fieldnames: Column / field names, paired with ``fields``.
        :param fields: Django field objects corresponding to ``fieldnames``.
        :return: The updated dataframe.
        """
        for fieldname, function in cls.build_update_functions(fieldnames, fields):
            if function is not None:
                # Hand the function over directly. Wrapping it in a lambda that
                # closes over the loop variable makes every lazily evaluated
                # column use the function of the LAST field. An explicit meta
                # keeps Dask from probing the function (which may query the
                # database) with dummy data.
                df[fieldname] = df[fieldname].map_partitions(
                    function, meta=(fieldname, "object")
                )
        return df

    @classmethod
    def to_fields(cls, qs, fieldnames):
        """
        Resolve field names (optionally ``__``-separated paths) to field
        objects of the QuerySet's model.

        :param qs: QuerySet whose model is the starting point of the lookup.
        :type qs: QuerySet
        :param fieldnames: Field names; nested lookups use double underscores.
        :type fieldnames: List[str]
        :return: A generator yielding one entry per field name: the resolved
            field object, or the original name when it cannot be resolved.
        :rtype: Generator[Field, None, None]
        """
        for fieldname in fieldnames:
            model = qs.model
            field = fieldname  # fallback when nothing can be resolved
            for fieldname_part in fieldname.split("__"):
                if model is None:
                    # The previous part was not a relation, so the remainder
                    # of the path cannot be followed through model metadata.
                    field = fieldname
                    break
                try:
                    field = model._meta.get_field(fieldname_part)
                except cls.FieldDoesNotExist:
                    try:
                        # Legacy reverse-relation API of old Django releases.
                        rels = model._meta.get_all_related_objects_with_model()
                    except AttributeError:
                        field = fieldname
                    else:
                        for relobj, _ in rels:
                            if relobj.get_accessor_name() == fieldname_part:
                                field = relobj.field
                                model = field.model
                                break
                        else:
                            field = fieldname
                else:
                    model = cls.get_related_model(field)
            yield field

    @staticmethod
    def is_values_queryset(qs):
        """
        Tell whether ``qs`` is a ``values()`` queryset.

        :param qs: The queryset to be checked.
        :type qs: django.db.models.query.QuerySet
        :return: True if ``qs._iterable_class`` is Django's ``ValuesIterable``;
            False otherwise, including when the attribute is missing.
        :rtype: bool
        """
        try:
            return qs._iterable_class == django.db.models.query.ValuesIterable
        except AttributeError:
            return False

    @staticmethod
    def object_to_dict(obj, fields=None):
        """
        Convert an object to a dictionary of its instance attributes.

        :param obj: Object exposing ``__dict__``; may be None.
        :param fields: Attribute names to include. If None or empty, all
            attributes except ``_state`` are included.
        :return: A new dictionary (the object itself is never modified).
            Requested attributes that are absent map to None. An empty
            dictionary is returned when ``obj`` is None.
        :rtype: dict
        """
        if obj is None:
            return {}
        state = obj.__dict__
        if not fields:
            # Build a copy: removing "_state" from the live __dict__ would
            # damage the model instance for any later use.
            return {key: value for key, value in state.items() if key != "_state"}
        return {field: state.get(field) for field in fields if field is not None}

    @staticmethod
    def _resolve_internal_type(field):
        """Return the internal type of ``field``, following relation fields to the field they reference."""
        internal_type = field.get_internal_type()
        while internal_type in ("ForeignKey", "OneToOneField"):
            target = getattr(field, "target_field", None)
            if target is None or target is field:
                break
            field = target
            internal_type = field.get_internal_type()
        return internal_type

    @staticmethod
    def infer_dtypes_from_django(qs):
        """
        Infer dtypes from a Django QuerySet model and annotated fields.

        - Reverse relationships and non-concrete fields are ignored.
        - Relation fields take the dtype of the field they reference, because
          the column holds that field's value.
        - Unknown field types and untyped annotations default to ``object``.

        :param qs: Django QuerySet whose model is used to infer dtypes.
        :type qs: QuerySet
        :return: A mapping of field names to inferred dtypes.
        :rtype: dict
        """
        dtype_map = ReadFrameDask._DJANGO_TO_DASK_DTYPE

        dtypes = {}
        for field in qs.model._meta.get_fields():
            if not getattr(field, 'concrete', False):
                continue
            if isinstance(field, (models.AutoField, models.BigAutoField)):
                dtypes[field.name] = 'Int64'
            else:
                field_type = ReadFrameDask._resolve_internal_type(field)
                dtypes[field.name] = dtype_map.get(field_type, 'object')

        for annotation_name, annotation in qs.query.annotation_select.items():
            if hasattr(annotation, 'output_field'):
                field_type = annotation.output_field.get_internal_type()
                dtype = dtype_map.get(field_type, 'object')
            else:
                dtype = 'object'
            dtypes[annotation_name] = dtype

        return dtypes

    def _chunk_to_frame(self, chunk, attnames, fieldnames, dtypes, fillna_value):
        """Turn one chunk of model instances into a typed pandas DataFrame."""
        df = pd.DataFrame.from_records(
            [self.object_to_dict(obj, attnames) for obj in chunk],
            columns=attnames,
            coerce_float=self.coerce_float,
        )
        # Values were read by storage attribute; expose them by field name.
        df.columns = list(fieldnames)

        # Handle NaN values before casting, if specified
        if fillna_value is not None:
            df = df.fillna(fillna_value)

        # Convert timezone-aware columns to timezone-naive
        for col in df.columns:
            if isinstance(df[col].dtype, pd.DatetimeTZDtype):
                df[col] = df[col].dt.tz_localize(None)

        return df.astype(dtypes)

    def read_frame(self, fillna_value=None):
        """
        Read the QuerySet in chunks and return it as a Dask DataFrame.

        Columns are the model's fields followed by the QuerySet's annotations.
        Each chunk of ``chunk_size`` records becomes one partition, cast to the
        dtypes inferred by ``infer_dtypes_from_django``. An empty QuerySet
        yields an empty frame with the same columns.

        :param fillna_value: Value used to fill missing values before the dtype
            cast. If None, missing values are not filled.
        :type fillna_value: Any
        :return: A Dask DataFrame built from all chunks.
        :rtype: dask.dataframe.DataFrame
        """
        qs = self.qs
        chunk_size = self.chunk_size

        fields = qs.model._meta.fields
        annotation_names = list(qs.query.annotation_select.keys())
        fieldnames = tuple([f.name for f in fields] + annotation_names)
        # A model instance stores a foreign key under its attname
        # ("<name>_id"), not under the field name; reading by field name would
        # leave every foreign-key column empty.
        attnames = [f.attname for f in fields] + annotation_names

        wanted = set(fieldnames)
        dtypes = {
            name: dtype
            for name, dtype in self.infer_dtypes_from_django(qs).items()
            if name in wanted
        }

        # Create one Dask partition per chunk
        partitions = []
        iterator = iter(qs.iterator(chunk_size=chunk_size))
        while True:
            chunk = list(itertools.islice(iterator, chunk_size))
            if not chunk:
                break
            df = self._chunk_to_frame(chunk, attnames, fieldnames, dtypes, fillna_value)
            partitions.append(dd.from_pandas(df, npartitions=1))

        if partitions:
            dask_df = dd.concat(partitions, axis=0, ignore_index=True)
        else:
            # Nothing to concatenate: build an empty, correctly typed frame.
            empty = pd.DataFrame(
                {name: pd.Series(dtype=dtypes.get(name, 'object')) for name in fieldnames}
            )
            dask_df = dd.from_pandas(empty, npartitions=1)

        if self.verbose:
            self.update_with_verbose(dask_df, fieldnames, fields)

        return dask_df
|
@@ -1,227 +0,0 @@
|
|
1
|
-
import warnings
|
2
|
-
|
3
|
-
import dask.dataframe as dd
|
4
|
-
import pandas as pd
|
5
|
-
from django.db.models import Q
|
6
|
-
|
7
|
-
from sibi_dst.df_helper.backends.django import ReadFrameDask
|
8
|
-
from sibi_dst.df_helper.core import django_field_conversion_map_dask
|
9
|
-
from sibi_dst.utils import Logger
|
10
|
-
|
11
|
-
|
12
|
-
class DjangoLoadFromDb:
    """
    Load data from a Django model into a Dask DataFrame, with optional
    filtering.

    When no filters are supplied only the first ``n_records`` rows are read,
    so that a whole table is not loaded by mistake.

    :ivar connection_config: Connection configuration; provides the Django
        ``model`` and the ``connection_name`` to query.
    :type connection_config: Any
    :ivar query_config: Query configuration; provides ``n_records`` and
        ``use_exclude``.
    :type query_config: Any
    :ivar params_config: Parameter configuration; provides ``filters`` and
        ``df_params``.
    :type params_config: Any
    :ivar logger: Logger used for debugging and runtime reporting.
    :type logger: Logger
    :ivar debug: Whether debug logging is active.
    :type debug: bool
    :ivar df: Dask DataFrame holding the loaded query results.
    :type df: dd.DataFrame
    """
    df: dd.DataFrame

    def __init__(self, db_connection, db_query, db_params, logger, **kwargs):
        """
        Store the configuration objects, set up logging and parse the
        remaining keyword arguments as query parameters.

        :param db_connection: Connection configuration; its ``model`` must be
            set.
        :type db_connection: Any
        :param db_query: Query configuration.
        :type db_query: Any
        :param db_params: Parameter configuration; receives the remaining
            keyword arguments through ``parse_params``.
        :type db_params: Any
        :param logger: Logger instance; a default logger named after this
            class is created when a falsy value is passed.
        :type logger: Any, optional
        :param kwargs: ``debug`` (bool, default False) is consumed here; all
            other entries are handed to ``db_params.parse_params``.
        :type kwargs: dict
        :raises ValueError: If the connection configuration has no model.
        """
        self.connection_config = db_connection
        self.debug = kwargs.pop('debug', False)
        self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
        self.logger.set_level(Logger.DEBUG if self.debug else Logger.INFO)
        if self.connection_config.model is None:
            if self.debug:
                self.logger.debug('Model must be specified')

            raise ValueError('Model must be specified')

        self.query_config = db_query
        self.params_config = db_params
        self.params_config.parse_params(kwargs)

    def build_and_load(self):
        """
        Run the query and store the result in ``self.df``.

        :return: Dask DataFrame containing the loaded data.
        """
        self.df = self._build_and_load()
        return self.df

    def _build_and_load(self) -> dd.DataFrame:
        """
        Build the QuerySet and convert it into a Dask DataFrame.

        Without filters only the first ``n_records`` rows (100 when unset) are
        read. With filters, every filter is combined with AND, negated when
        ``use_exclude`` is set.

        Errors raised while reading the QuerySet are not propagated: they are
        logged as a warning and an empty Dask DataFrame is returned instead.

        :return: A Dask DataFrame with the queried data, or an empty one when
            loading failed.
        :rtype: dd.DataFrame
        """
        query = self.connection_config.model.objects.using(self.connection_config.connection_name)
        if not self.params_config.filters:
            # IMPORTANT: if no filters are provided show only the first n_records
            # this is to prevent loading the entire table by mistake
            n_records = self.query_config.n_records or 100
            queryset = query.all()[:n_records]
        else:
            q_objects = self.__build_query_objects(self.params_config.filters, self.query_config.use_exclude)
            queryset = query.filter(q_objects)

        try:
            self.df = ReadFrameDask(queryset, **self.params_config.df_params).read_frame()
        except Exception as e:
            # Deliberate best-effort fallback, but report it above debug level
            # so that a failed load is not silent in normal operation.
            self.logger.warning(f'Error loading query: {str(queryset.query)}, error message: {e}')
            self.df = dd.from_pandas(pd.DataFrame(), npartitions=1)

        return self.df

    @staticmethod
    def __build_query_objects(filters: dict, use_exclude: bool):
        """
        Combine ``filters`` into a single ``Q`` object using AND.

        :param filters: Mapping of lookup expressions to values.
        :type filters: dict
        :param use_exclude: If True every condition is negated before being
            combined; if False conditions are used as given.
        :type use_exclude: bool
        :return: The composite Q object.
        :rtype: Q
        """
        q_objects = Q()
        for key, value in filters.items():
            condition = Q(**{key: value})
            q_objects &= ~condition if use_exclude else condition
        return q_objects

    def _convert_columns(self, df: dd.DataFrame) -> dd.DataFrame:
        """
        [DEPRECATED] Convert the data types of columns in a Dask DataFrame based on the field type in the Django model.

        This function is deprecated and will be removed in a future release.
        For every model field present in ``df`` a conversion function is looked
        up in ``django_field_conversion_map_dask`` by the field's class name
        and queued partition-wise (lazily). Columns or field types without a
        match are skipped; conversion errors are logged at debug level and
        leave the partition unchanged.

        :param df: Dask DataFrame whose columns' data types are to be converted.
        :type df: dd.DataFrame
        :return: Dask DataFrame with converted column data types.
        :rtype: dd.DataFrame
        """
        # Emit deprecation warning
        warnings.warn(
            "_convert_columns is deprecated and will be removed in a future release. "
            "ReadFrameDask.read_frame already casts columns to their inferred dtypes.",
            DeprecationWarning,
            stacklevel=2,
        )

        # Log deprecation message if debug mode is enabled
        if self.debug:
            self.logger.warning(
                "[DEPRECATION NOTICE] The `_convert_columns` method is deprecated and will be removed in a future release. "
                "ReadFrameDask.read_frame already casts columns to their inferred dtypes."
            )

        self.logger.debug(f'Converting columns: {list(df.columns)}')

        # Get field information from the Django model
        model_fields = self.connection_config.model._meta.get_fields()
        field_type_map = {field.name: type(field).__name__ for field in model_fields}

        for field_name, field_type in field_type_map.items():
            if field_name not in df.columns:
                self.logger.debug(f"Column '{field_name}' not found in DataFrame columns.")
                continue

            conversion_func = django_field_conversion_map_dask.get(field_type)
            if not conversion_func:
                message = f"Field type '{field_type}' not found in conversion_map."
                self.logger.debug(message)
                continue

            # Bind the current column and converter as default arguments. The
            # partitions are evaluated lazily, after this loop has finished; a
            # plain closure would see only the values of the LAST iteration.
            def apply_conversion(partition, field_name=field_name, conversion_func=conversion_func):
                """
                Apply the conversion function to a single partition for the given column.
                """
                try:
                    if field_name in partition.columns:
                        # assign() returns a new frame instead of mutating the
                        # partition handed in by Dask.
                        partition = partition.assign(
                            **{field_name: conversion_func(partition[field_name])}
                        )
                except Exception as e:
                    self.logger.debug(f"Error converting column '{field_name}' in partition: {str(e)}")
                return partition

            try:
                # Apply conversion lazily to each partition
                df = df.map_partitions(
                    apply_conversion,
                    meta=df,
                )
                self.logger.debug(f"Successfully queued conversion for column '{field_name}' to type '{field_type}'.")
            except Exception as e:
                self.logger.debug(f"Failed to queue conversion for column '{field_name}': {str(e)}")

        return df
|