sibi-dst 0.3.32__py3-none-any.whl → 0.3.34__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sibi_dst/df_helper/_df_helper.py +108 -5
- sibi_dst/df_helper/_parquet_artifact.py +63 -0
- sibi_dst/df_helper/_parquet_reader.py +36 -0
- sibi_dst/df_helper/backends/django/_db_connection.py +41 -1
- sibi_dst/df_helper/backends/django/_io_dask.py +211 -3
- sibi_dst/df_helper/backends/django/_load_from_db.py +96 -1
- sibi_dst/df_helper/backends/django/_sql_model_builder.py +132 -6
- sibi_dst/df_helper/backends/http/_http_config.py +52 -1
- sibi_dst/df_helper/backends/parquet/_filter_handler.py +28 -0
- sibi_dst/df_helper/backends/parquet/_parquet_options.py +105 -1
- sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +17 -0
- sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +80 -2
- sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +90 -29
- sibi_dst/df_helper/core/_params_config.py +59 -0
- sibi_dst/geopy_helper/geo_location_service.py +14 -0
- sibi_dst/geopy_helper/utils.py +37 -3
- sibi_dst/osmnx_helper/base_osm_map.py +254 -0
- sibi_dst/osmnx_helper/utils.py +226 -4
- sibi_dst/utils/clickhouse_writer.py +27 -0
- sibi_dst/utils/data_utils.py +32 -1
- sibi_dst/utils/data_wrapper.py +94 -6
- sibi_dst/utils/date_utils.py +35 -0
- sibi_dst/utils/log_utils.py +19 -2
- sibi_dst/utils/parquet_saver.py +0 -106
- {sibi_dst-0.3.32.dist-info → sibi_dst-0.3.34.dist-info}/METADATA +1 -1
- {sibi_dst-0.3.32.dist-info → sibi_dst-0.3.34.dist-info}/RECORD +27 -27
- {sibi_dst-0.3.32.dist-info → sibi_dst-0.3.34.dist-info}/WHEEL +0 -0
sibi_dst/df_helper/_df_helper.py
CHANGED
@@ -91,7 +91,7 @@ class DfHelper:
|
|
91
91
|
self.filesystem_options = kwargs.pop('filesystem_options', {})
|
92
92
|
kwargs.setdefault("live", True)
|
93
93
|
kwargs.setdefault("logger", self.logger)
|
94
|
-
kwargs.setdefault("fs", fsspec.filesystem('file'))
|
94
|
+
self.fs =kwargs.setdefault("fs", fsspec.filesystem('file'))
|
95
95
|
self.__post_init(**kwargs)
|
96
96
|
|
97
97
|
def __str__(self):
|
@@ -208,6 +208,18 @@ class DfHelper:
|
|
208
208
|
return asyncio.run(self.__load_from_http(**options))
|
209
209
|
|
210
210
|
def __load_from_sqlalchemy(self, **options):
|
211
|
+
"""
|
212
|
+
Loads data from an SQLAlchemy database source into a dataframe. The method processes
|
213
|
+
the loaded data and applies post-processing to transform it into the desired structure.
|
214
|
+
If the operation fails, an empty pandas DataFrame is created as a fallback.
|
215
|
+
|
216
|
+
:param options: Additional keyword arguments to configure the data loading process.
|
217
|
+
These options can include configurations such as 'debug' and other parameters
|
218
|
+
required by the `SqlAlchemyLoadFromDb` class.
|
219
|
+
:type options: dict
|
220
|
+
:return: A dataframe containing the data loaded from the SQLAlchemy database.
|
221
|
+
:rtype: dask.dataframe.DataFrame
|
222
|
+
"""
|
211
223
|
try:
|
212
224
|
options.setdefault("debug", self.debug)
|
213
225
|
db_loader = SqlAlchemyLoadFromDb(
|
@@ -228,6 +240,17 @@ class DfHelper:
|
|
228
240
|
return self.df
|
229
241
|
|
230
242
|
def __load_from_db(self, **options) -> Union[pd.DataFrame, dd.DataFrame]:
|
243
|
+
"""
|
244
|
+
Loads data from a Django database using a specific backend query mechanism. Processes the loaded data
|
245
|
+
and applies further post-processing before returning the dataframe. If the operation fails, an
|
246
|
+
empty dataframe with a single partition is returned instead.
|
247
|
+
|
248
|
+
:param options: Additional settings for the database loading process, which include optional configurations
|
249
|
+
like debug mode, among others.
|
250
|
+
:type options: dict
|
251
|
+
:return: A dataframe containing the loaded data either as a Pandas or Dask dataframe.
|
252
|
+
:rtype: Union[pd.DataFrame, dd.DataFrame]
|
253
|
+
"""
|
231
254
|
try:
|
232
255
|
options.setdefault("debug", self.debug)
|
233
256
|
db_loader = DjangoLoadFromDb(
|
@@ -248,7 +271,18 @@ class DfHelper:
|
|
248
271
|
return self.df
|
249
272
|
|
250
273
|
async def __load_from_http(self, **options) -> Union[pd.DataFrame, dd.DataFrame]:
|
251
|
-
"""
|
274
|
+
"""
|
275
|
+
Loads data asynchronously from an HTTP source using the configured HTTP plugin.
|
276
|
+
If the HTTP plugin is not properly configured, this method logs a debug message and
|
277
|
+
returns an empty Dask DataFrame. If an exception occurs during data fetching, the error
|
278
|
+
is logged and an empty Dask DataFrame with one partition is returned.
|
279
|
+
|
280
|
+
:param options: Additional keyword arguments that are passed to the HTTP plugin for
|
281
|
+
fetching the data.
|
282
|
+
:return: A DataFrame object that can either be a pandas or a Dask DataFrame. When the
|
283
|
+
fetching operation fails, it defaults to returning an empty Dask DataFrame
|
284
|
+
with a single partition.
|
285
|
+
"""
|
252
286
|
if not self.backend_http:
|
253
287
|
self.logger.debug("HTTP plugin not configured properly.")
|
254
288
|
return dd.from_pandas(pd.DataFrame(), npartitions=1)
|
@@ -339,12 +373,45 @@ class DfHelper:
|
|
339
373
|
|
340
374
|
self.logger.debug("Processing of loaded data completed.")
|
341
375
|
|
342
|
-
def save_to_parquet(self, parquet_filename: Optional[str] = None):
|
343
|
-
|
376
|
+
def save_to_parquet(self, parquet_filename: Optional[str] = None, **kwargs):
|
377
|
+
"""
|
378
|
+
Save the dataframe result to a Parquet file using specified configurations.
|
379
|
+
|
380
|
+
This method leverages the ParquetSaver class to store the dataframe result
|
381
|
+
into a Parquet file. It also provides functionality for overriding the default
|
382
|
+
filesystem (`fs`) and storage path (`parquet_storage_path`). The method logs
|
383
|
+
details about the saving operation for debugging purposes.
|
384
|
+
|
385
|
+
:param parquet_filename: The name of the Parquet file to save the dataframe to.
|
386
|
+
If not provided, a default name will be used.
|
387
|
+
:param kwargs: Additional arguments to customize the saving process. These may
|
388
|
+
include:
|
389
|
+
- `fs`: Filesystem to be used for saving Parquet files. If not
|
390
|
+
provided, defaults to the instance's filesystem attribute.
|
391
|
+
- `parquet_storage_path`: The root path in the filesystem where
|
392
|
+
Parquet files should be saved. If not provided, defaults to
|
393
|
+
the instance's attribute for storage path.
|
394
|
+
:return: None
|
395
|
+
"""
|
396
|
+
fs = kwargs.pop('fs', self.fs)
|
397
|
+
parquet_storage_path = kwargs.pop('parquet_storage_path', self.parquet_storage_path)
|
398
|
+
ps = ParquetSaver(df_result=self.df, parquet_storage_path=parquet_storage_path, logger=self.logger, fs=fs)
|
344
399
|
ps.save_to_parquet(parquet_filename)
|
345
|
-
self.logger.debug(f"Parquet saved to {parquet_filename} in parquet storage: {
|
400
|
+
self.logger.debug(f"Parquet saved to {parquet_filename} in parquet storage: {parquet_storage_path}.")
|
346
401
|
|
347
402
|
def save_to_clickhouse(self, **credentials):
|
403
|
+
"""
|
404
|
+
Saves the current DataFrame to ClickHouse using the provided credentials. This
|
405
|
+
method first checks if the DataFrame is empty. If it is empty, the method logs
|
406
|
+
a debug message and does not proceed with saving. Otherwise, it initializes
|
407
|
+
a ClickHouseWriter instance and uses it to save the DataFrame to ClickHouse,
|
408
|
+
logging a debug message upon successful completion.
|
409
|
+
|
410
|
+
:param credentials: Credentials required to connect to ClickHouse as keyword
|
411
|
+
arguments.
|
412
|
+
:type credentials: dict
|
413
|
+
:return: None
|
414
|
+
"""
|
348
415
|
if self.df.map_partitions(len).compute().sum() == 0:
|
349
416
|
self.logger.debug("Cannot write to clickhouse since Dataframe is empty")
|
350
417
|
return
|
@@ -353,6 +420,21 @@ class DfHelper:
|
|
353
420
|
self.logger.debug("Save to ClickHouse completed.")
|
354
421
|
|
355
422
|
def __load_from_parquet(self, **options) -> Union[pd.DataFrame, dd.DataFrame]:
|
423
|
+
"""
|
424
|
+
Loads data from parquet files into a DataFrame, applies provided filters, and handles exceptions.
|
425
|
+
|
426
|
+
This method leverages a backend-specific implementation to load data from parquet files into a
|
427
|
+
DataFrame. If additional options are provided and the data is successfully loaded, filters are
|
428
|
+
applied to the DataFrame using a filter handler. Errors during this process are handled gracefully
|
429
|
+
by logging the issue and returning an empty Dask DataFrame.
|
430
|
+
|
431
|
+
:param options: A dictionary of filter options to be applied to the DataFrame.
|
432
|
+
:type options: dict
|
433
|
+
|
434
|
+
:return: A DataFrame containing the loaded and filtered data. If the operation fails, an empty
|
435
|
+
Dask DataFrame is returned.
|
436
|
+
:rtype: Union[pd.DataFrame, dd.DataFrame]
|
437
|
+
"""
|
356
438
|
try:
|
357
439
|
self.df = self.backend_parquet.load_files()
|
358
440
|
if options and self.df is not None:
|
@@ -368,6 +450,27 @@ class DfHelper:
|
|
368
450
|
return dd.from_pandas(pd.DataFrame(), npartitions=1)
|
369
451
|
|
370
452
|
def load_period(self, **kwargs):
|
453
|
+
"""
|
454
|
+
Loads a period with specified parameters.
|
455
|
+
|
456
|
+
This method acts as a wrapper around the private ``__load_period`` method. It
|
457
|
+
accepts arbitrary keyword arguments that are passed directly to the private
|
458
|
+
method for execution. The purpose of allowing keyword arguments is to permit
|
459
|
+
flexible configuration or parameterization for loading a specific period, based
|
460
|
+
on the internal implementation of the private ``__load_period`` method.
|
461
|
+
|
462
|
+
Note:
|
463
|
+
The arguments and return values are entirely determined by the private
|
464
|
+
method's behavior. This method is intentionally designed to mask details
|
465
|
+
of the internal logic behind the abstraction.
|
466
|
+
|
467
|
+
:param kwargs: Arbitrary keyword arguments to parameterize the internal logic
|
468
|
+
of loading a period. The specific keys and values expected by the
|
469
|
+
``__load_period`` method depend on its own internal implementation.
|
470
|
+
:return: The result of calling the private ``__load_period`` method with the
|
471
|
+
provided keyword arguments. The return type is dependent on the internal
|
472
|
+
implementation of ``__load_period``.
|
473
|
+
"""
|
371
474
|
return self.__load_period(**kwargs)
|
372
475
|
|
373
476
|
def __load_period(self, **kwargs):
|
@@ -9,11 +9,74 @@ from sibi_dst.utils import DateUtils
|
|
9
9
|
|
10
10
|
|
11
11
|
class ParquetArtifact(DfHelper):
|
12
|
+
"""
|
13
|
+
Class designed to manage Parquet data storage and retrieval using a specified
|
14
|
+
DataWrapper class for data processing. It provides functionality for loading,
|
15
|
+
updating, rebuilding, and generating Parquet files within a configurable
|
16
|
+
storage filesystem. The class ensures that all essential configurations and
|
17
|
+
filesystems are properly set up before operations.
|
18
|
+
|
19
|
+
Detailed functionality includes support for dynamically managing and generating
|
20
|
+
Parquet files based on time periods, with customizable options for paths,
|
21
|
+
filenames, date fields, and more. It is an abstraction for efficiently handling
|
22
|
+
storage tasks related to distributed or local file systems.
|
23
|
+
|
24
|
+
:ivar config: Configuration dictionary containing all configurable parameters
|
25
|
+
for managing Parquet data storage, such as paths, filenames,
|
26
|
+
and date ranges.
|
27
|
+
:type config: dict
|
28
|
+
:ivar df: Cached Dask DataFrame used to store and manipulate data loaded
|
29
|
+
from the Parquet file.
|
30
|
+
:type df: Optional[dask.dataframe.DataFrame]
|
31
|
+
:ivar data_wrapper_class: Class responsible for abstracting data processing
|
32
|
+
operations required for Parquet file generation.
|
33
|
+
:type data_wrapper_class: type
|
34
|
+
:ivar date_field: Name of the field used to identify and process data by date.
|
35
|
+
:type date_field: Optional[str]
|
36
|
+
:ivar parquet_storage_path: Filesystem path to store Parquet files.
|
37
|
+
:type parquet_storage_path: Optional[str]
|
38
|
+
:ivar parquet_filename: Name of the Parquet file to be generated and managed.
|
39
|
+
:type parquet_filename: Optional[str]
|
40
|
+
:ivar parquet_start_date: Date string specifying the start date for data range
|
41
|
+
processing.
|
42
|
+
:type parquet_start_date: Optional[str]
|
43
|
+
:ivar parquet_end_date: Date string specifying the end date for data range
|
44
|
+
processing.
|
45
|
+
:type parquet_end_date: Optional[str]
|
46
|
+
:ivar filesystem_type: Type of the filesystem used for managing storage
|
47
|
+
operations (e.g., `file`, `s3`, etc.).
|
48
|
+
:type filesystem_type: str
|
49
|
+
:ivar filesystem_options: Additional options for configuring the filesystem.
|
50
|
+
:type filesystem_options: dict
|
51
|
+
:ivar fs: Filesystem object used for storage operations.
|
52
|
+
:type fs: fsspec.AbstractFileSystem
|
53
|
+
"""
|
12
54
|
DEFAULT_CONFIG = {
|
13
55
|
'backend': 'parquet'
|
14
56
|
}
|
15
57
|
|
16
58
|
def __init__(self, data_wrapper_class, **kwargs):
|
59
|
+
"""
|
60
|
+
Initializes an instance of the class with given configuration and validates
|
61
|
+
required parameters. Sets up the filesystem to handle storage, ensuring
|
62
|
+
necessary directories exist. The configuration supports a variety of options
|
63
|
+
to manage parquet storage requirements, including paths, filenames, and date
|
64
|
+
ranges.
|
65
|
+
|
66
|
+
:param data_wrapper_class: The class responsible for wrapping data to be managed
|
67
|
+
by this instance.
|
68
|
+
:type data_wrapper_class: type
|
69
|
+
:param kwargs: Arbitrary keyword arguments to override default configuration.
|
70
|
+
Includes settings for `date_field`, `parquet_storage_path`,
|
71
|
+
`parquet_filename`, `parquet_start_date`, `parquet_end_date`,
|
72
|
+
`filesystem_type`, `filesystem_options`, and `fs`.
|
73
|
+
:type kwargs: dict
|
74
|
+
|
75
|
+
:raises ValueError: If any of the required configuration options
|
76
|
+
(`date_field`, `parquet_storage_path`,
|
77
|
+
`parquet_filename`, `parquet_start_date`,
|
78
|
+
or `parquet_end_date`) are missing or not set properly.
|
79
|
+
"""
|
17
80
|
self.config = {
|
18
81
|
**self.DEFAULT_CONFIG,
|
19
82
|
**kwargs,
|
@@ -7,6 +7,42 @@ from sibi_dst.df_helper import DfHelper
|
|
7
7
|
|
8
8
|
|
9
9
|
class ParquetReader(DfHelper):
|
10
|
+
"""
|
11
|
+
This class is a specialized helper for reading and managing Parquet files.
|
12
|
+
|
13
|
+
The `ParquetReader` class is designed to facilitate working with Parquet
|
14
|
+
datasets stored across different filesystems. It initializes the required
|
15
|
+
resources, ensures the existence of the specified Parquet directory,
|
16
|
+
and provides an abstraction to load the data into a Dask DataFrame.
|
17
|
+
|
18
|
+
The class requires configuration for the storage path and dates defining
|
19
|
+
a range of interest. It also supports various filesystem types through
|
20
|
+
`fsspec`.
|
21
|
+
|
22
|
+
:ivar config: Holds the final configuration for this instance, combining
|
23
|
+
`DEFAULT_CONFIG` with user-provided configuration.
|
24
|
+
:type config: dict
|
25
|
+
:ivar df: Stores the loaded Dask DataFrame after the `load()` method is
|
26
|
+
invoked. Initially set to None.
|
27
|
+
:type df: Optional[dd.DataFrame]
|
28
|
+
:ivar parquet_storage_path: The path to the Parquet storage directory.
|
29
|
+
:type parquet_storage_path: str
|
30
|
+
:ivar parquet_start_date: Start date for Parquet data selection. Must
|
31
|
+
be set in the configuration.
|
32
|
+
:type parquet_start_date: str
|
33
|
+
:ivar parquet_end_date: End date for Parquet data selection. Must be
|
34
|
+
set in the configuration.
|
35
|
+
:type parquet_end_date: str
|
36
|
+
:ivar filesystem_type: The type of filesystem the Parquet files are
|
37
|
+
stored on (e.g., "file", "s3").
|
38
|
+
:type filesystem_type: str
|
39
|
+
:ivar filesystem_options: Any additional options required for the
|
40
|
+
specified filesystem type.
|
41
|
+
:type filesystem_options: dict
|
42
|
+
:ivar fs: Instance of `fsspec` filesystem used to interact with the
|
43
|
+
Parquet storage.
|
44
|
+
:type fs: fsspec.AbstractFileSystem
|
45
|
+
"""
|
10
46
|
DEFAULT_CONFIG = {
|
11
47
|
'backend': 'parquet'
|
12
48
|
}
|
@@ -6,6 +6,27 @@ from ._sql_model_builder import DjangoSqlModelBuilder
|
|
6
6
|
|
7
7
|
|
8
8
|
class DjangoConnectionConfig(BaseModel):
|
9
|
+
"""
|
10
|
+
Represents a configuration for establishing a Django database connection.
|
11
|
+
|
12
|
+
This class is used for defining the configurations necessary to establish a Django
|
13
|
+
database connection. It supports dynamic model generation if the model is not
|
14
|
+
provided explicitly. It also validates the connection configuration to ensure it
|
15
|
+
is properly set up before being used.
|
16
|
+
|
17
|
+
:ivar live: Indicates whether the connection is live. Automatically set to False if
|
18
|
+
a table is provided without a pre-built model.
|
19
|
+
:type live: bool
|
20
|
+
:ivar connection_name: The name of the database connection to use. This is a mandatory
|
21
|
+
parameter and must be provided.
|
22
|
+
:type connection_name: str
|
23
|
+
:ivar table: The name of the database table to use. Required for dynamic model
|
24
|
+
generation when no model is provided.
|
25
|
+
:type table: str
|
26
|
+
:ivar model: The Django model that represents the database table. If not provided,
|
27
|
+
this can be generated dynamically by using the table name.
|
28
|
+
:type model: Any
|
29
|
+
"""
|
9
30
|
live: bool = False
|
10
31
|
connection_name: str = None
|
11
32
|
table: str = None
|
@@ -13,6 +34,18 @@ class DjangoConnectionConfig(BaseModel):
|
|
13
34
|
|
14
35
|
@model_validator(mode="after")
|
15
36
|
def check_model(self):
|
37
|
+
"""
|
38
|
+
Validates and modifies the instance based on the provided attributes and conditions.
|
39
|
+
This method ensures that all required parameters are populated and consistent, and it
|
40
|
+
dynamically builds a model if necessary. The method also ensures the connection is
|
41
|
+
validated after the model preparation process.
|
42
|
+
|
43
|
+
:raises ValueError: If `connection_name` is not provided.
|
44
|
+
:raises ValueError: If `table` name is not specified when building the model dynamically.
|
45
|
+
:raises ValueError: If there are errors during the dynamic model-building process.
|
46
|
+
:raises ValueError: If `validate_connection` fails due to invalid configuration.
|
47
|
+
:return: The validated and potentially mutated instance.
|
48
|
+
"""
|
16
49
|
# connection_name is mandatory
|
17
50
|
if self.connection_name is None:
|
18
51
|
raise ValueError("Connection name must be specified")
|
@@ -38,7 +71,14 @@ class DjangoConnectionConfig(BaseModel):
|
|
38
71
|
return self
|
39
72
|
|
40
73
|
def validate_connection(self):
|
41
|
-
"""
|
74
|
+
"""
|
75
|
+
Ensures the database connection is valid by performing a simple
|
76
|
+
query. Raises a ValueError if the connection is broken or if any
|
77
|
+
other exception occurs during the query.
|
78
|
+
|
79
|
+
:raises ValueError: If the connection to the database cannot be
|
80
|
+
established or if the query fails.
|
81
|
+
"""
|
42
82
|
try:
|
43
83
|
# Perform a simple query to test the connection
|
44
84
|
self.model.objects.using(self.connection_name).exists()
|
@@ -11,6 +11,28 @@ from django.utils.encoding import force_str as force_text
|
|
11
11
|
|
12
12
|
|
13
13
|
class ReadFrameDask:
|
14
|
+
"""
|
15
|
+
Handles Django ORM QuerySet to Dask DataFrame conversion with support for field
|
16
|
+
type inference, chunked data retrieval, and verbose updates.
|
17
|
+
|
18
|
+
This class provides methods to efficiently convert a Django QuerySet into a
|
19
|
+
Dask DataFrame while preserving field types and incorporating additional
|
20
|
+
capabilities such as replacing fields with verbose choices or related object
|
21
|
+
information. The class design leverages static and class methods to maintain
|
22
|
+
flexibility and reusability for handling Django model fields and their data
|
23
|
+
types.
|
24
|
+
|
25
|
+
:ivar qs: The Django QuerySet to be converted into a Dask DataFrame.
|
26
|
+
:type qs: django.db.models.query.QuerySet
|
27
|
+
:ivar coerce_float: Whether to attempt to coerce numeric values to floats.
|
28
|
+
:type coerce_float: bool
|
29
|
+
:ivar chunk_size: The number of records to fetch and process per chunk from
|
30
|
+
the QuerySet.
|
31
|
+
:type chunk_size: int
|
32
|
+
:ivar verbose: If True, provides verbose updates during DataFrame creation
|
33
|
+
by replacing fields with readable representations (e.g., verbose names).
|
34
|
+
:type verbose: bool
|
35
|
+
"""
|
14
36
|
FieldDoesNotExist = (
|
15
37
|
django.core.exceptions.FieldDoesNotExist
|
16
38
|
if django.VERSION < (1, 8)
|
@@ -22,6 +44,22 @@ class ReadFrameDask:
|
|
22
44
|
qs,
|
23
45
|
**kwargs,
|
24
46
|
):
|
47
|
+
"""
|
48
|
+
Initializes the instance and sets instance attributes based on provided
|
49
|
+
arguments or default values using the keyword arguments. The method allows
|
50
|
+
customization of behaviors like coercing data types, handling chunked operations,
|
51
|
+
and verbosity level during execution.
|
52
|
+
|
53
|
+
:param qs: A data source or query set for processing; its type is dependent
|
54
|
+
on the expected data being handled.
|
55
|
+
:param kwargs: Additional keyword arguments that may include:
|
56
|
+
- coerce_float: A boolean indicating whether floats should be coerced
|
57
|
+
during handling. Default is False.
|
58
|
+
- chunk_size: An integer value representing the size of chunks for
|
59
|
+
data processing. Default is 1000.
|
60
|
+
- verbose: A boolean to specify if verbose logging or output
|
61
|
+
should occur during execution. Default is True.
|
62
|
+
"""
|
25
63
|
self.qs = qs
|
26
64
|
self.coerce_float = kwargs.setdefault("coerce_float", False)
|
27
65
|
self.chunk_size = kwargs.setdefault("chunk_size", 1000)
|
@@ -29,6 +67,19 @@ class ReadFrameDask:
|
|
29
67
|
|
30
68
|
@staticmethod
|
31
69
|
def replace_from_choices(choices):
|
70
|
+
"""
|
71
|
+
Provides a method to replace elements in a list of values based on a mapping of choices.
|
72
|
+
|
73
|
+
This static method generates a closure function that replaces items in a list by
|
74
|
+
looking up their corresponding values in a provided dictionary of choices. If an
|
75
|
+
item cannot be found in the dictionary, it is left unchanged.
|
76
|
+
|
77
|
+
:param choices:
|
78
|
+
Dictionary where keys are original values and values are their replacements.
|
79
|
+
:return:
|
80
|
+
A function that takes a list of values and replaces elements using the
|
81
|
+
provided choices dictionary.
|
82
|
+
"""
|
32
83
|
def inner(values):
|
33
84
|
return [choices.get(v, v) for v in values]
|
34
85
|
|
@@ -36,10 +87,35 @@ class ReadFrameDask:
|
|
36
87
|
|
37
88
|
@staticmethod
|
38
89
|
def get_model_name(model):
|
90
|
+
"""
|
91
|
+
Retrieves the model name from a given Django model instance.
|
92
|
+
|
93
|
+
This method accesses the `_meta.model_name` attribute of the provided
|
94
|
+
model object to extract and return the model's name.
|
95
|
+
|
96
|
+
:param model: A Django model instance from which the model name is
|
97
|
+
derived.
|
98
|
+
:type model: object
|
99
|
+
:return: The name of the model as a string.
|
100
|
+
:rtype: str
|
101
|
+
"""
|
39
102
|
return model._meta.model_name
|
40
103
|
|
41
104
|
@staticmethod
|
42
105
|
def get_related_model(field):
|
106
|
+
"""
|
107
|
+
Retrieve the related model from the provided field.
|
108
|
+
|
109
|
+
This function determines the related model associated with the given field.
|
110
|
+
It checks various attributes commonly used to indicate relations in models and
|
111
|
+
retrieves the related model if present.
|
112
|
+
|
113
|
+
:param field: The field from which the related model is to be extracted.
|
114
|
+
It must be an object that potentially contains attributes like
|
115
|
+
`related_model` or `rel`.
|
116
|
+
:return: The related model associated with the provided field, or None if
|
117
|
+
no such model is found.
|
118
|
+
"""
|
43
119
|
model = None
|
44
120
|
if hasattr(field, "related_model") and field.related_model:
|
45
121
|
model = field.related_model
|
@@ -49,12 +125,43 @@ class ReadFrameDask:
|
|
49
125
|
|
50
126
|
@classmethod
|
51
127
|
def get_base_cache_key(cls, model):
|
128
|
+
"""
|
129
|
+
Generates a base cache key for caching purposes.
|
130
|
+
|
131
|
+
This method constructs a base cache key that can be used in conjunction with
|
132
|
+
Django models to uniquely identify cache entries. The key is formatted to
|
133
|
+
include the app label and model name, ensuring that cache entries are
|
134
|
+
namespaced accordingly.
|
135
|
+
|
136
|
+
:param model: A Django model instance for which the base cache key is generated.
|
137
|
+
:type model: Model
|
138
|
+
:return: The string template for the base cache key, where `%s` can be replaced
|
139
|
+
with specific identifiers to create unique keys.
|
140
|
+
:rtype: str
|
141
|
+
"""
|
52
142
|
return (
|
53
143
|
f"dask_{model._meta.app_label}_{cls.get_model_name(model)}_%s_rendering"
|
54
144
|
)
|
55
145
|
|
56
146
|
@classmethod
|
57
147
|
def replace_pk(cls, model):
|
148
|
+
"""
|
149
|
+
Generates a function that replaces primary keys in a pandas Series with their
|
150
|
+
corresponding cached values or database-retrieved representations.
|
151
|
+
|
152
|
+
The function uses a cache mechanism to retrieve pre-stored values for primary
|
153
|
+
keys in the series. If some primary keys are not found in the cache, it queries
|
154
|
+
the database for their representations, updates the cache, and replaces the
|
155
|
+
primary keys in the series accordingly.
|
156
|
+
|
157
|
+
:param model: The Django model class associated with the primary keys to be
|
158
|
+
processed.
|
159
|
+
:type model: Type[Model]
|
160
|
+
|
161
|
+
:return: A function that takes a pandas Series of primary keys as input and
|
162
|
+
returns a Series with replaced values based on cache or database retrieval.
|
163
|
+
:rtype: callable
|
164
|
+
"""
|
58
165
|
base_cache_key = cls.get_base_cache_key(model)
|
59
166
|
|
60
167
|
def get_cache_key_from_pk(pk):
|
@@ -84,6 +191,20 @@ class ReadFrameDask:
|
|
84
191
|
|
85
192
|
@classmethod
|
86
193
|
def build_update_functions(cls, fieldnames, fields):
|
194
|
+
"""
|
195
|
+
This method is responsible for building update functions based on the provided
|
196
|
+
fieldnames and fields. It performs validation for the field type, checks for
|
197
|
+
specific conditions such as `choices` or `ForeignKey` field types, and generates
|
198
|
+
a generator of update functions for the given fieldnames and fields.
|
199
|
+
|
200
|
+
:param fieldnames: A list of field names to be processed.
|
201
|
+
:type fieldnames: list[str]
|
202
|
+
:param fields: A list of field objects corresponding to the fieldnames.
|
203
|
+
:type fields: list[Field]
|
204
|
+
:return: A generator yielding tuples where the first element is a fieldname,
|
205
|
+
and the second element is the corresponding update function or None.
|
206
|
+
:rtype: generator[tuple[str, Callable | None]]
|
207
|
+
"""
|
87
208
|
for fieldname, field in zip(fieldnames, fields):
|
88
209
|
if not isinstance(field, Field):
|
89
210
|
yield fieldname, None
|
@@ -96,13 +217,38 @@ class ReadFrameDask:
|
|
96
217
|
|
97
218
|
@classmethod
|
98
219
|
def update_with_verbose(cls, df, fieldnames, fields):
|
220
|
+
"""
|
221
|
+
Updates the provided dataframe by applying transformation functions to specified fields.
|
222
|
+
The method iterates over the provided field names and their corresponding functions, applying
|
223
|
+
each transformation function to its related column in the dataframe.
|
224
|
+
|
225
|
+
:param df: The input dataframe to be updated.
|
226
|
+
:param fieldnames: A list of field names in the dataframe that need to be updated.
|
227
|
+
:param fields: A list of field objects corresponding to the field names; the transformation functions are derived from them.
|
228
|
+
:return: None. The columns of the given dataframe are reassigned in place.
|
229
|
+
"""
|
99
230
|
for fieldname, function in cls.build_update_functions(fieldnames, fields):
|
100
231
|
if function is not None:
|
101
232
|
df[fieldname] = df[fieldname].map_partitions(lambda x: function(x))
|
102
233
|
|
103
234
|
@classmethod
|
104
235
|
def to_fields(cls, qs, fieldnames):
|
105
|
-
"""
|
236
|
+
"""
|
237
|
+
Converts field names from a queryset into corresponding field objects, resolving relationships
|
238
|
+
and related objects if necessary. This method is typically used to yield fully-resolved field
|
239
|
+
objects for further interaction.
|
240
|
+
|
241
|
+
:param qs: A QuerySet object from which the fields are resolved. This object provides access
|
242
|
+
to the model and its metadata from which the fields are retrieved.
|
243
|
+
:type qs: QuerySet
|
244
|
+
|
245
|
+
:param fieldnames: A list of field name strings. These can include nested fields separated by
|
246
|
+
double underscores (__) to denote relationships or subfields.
|
247
|
+
:type fieldnames: List[str]
|
248
|
+
|
249
|
+
:return: A generator that yields resolved field objects corresponding to the provided field names.
|
250
|
+
:rtype: Generator[Field, None, None]
|
251
|
+
"""
|
106
252
|
for fieldname in fieldnames:
|
107
253
|
model = qs.model
|
108
254
|
for fieldname_part in fieldname.split("__"):
|
@@ -125,6 +271,18 @@ class ReadFrameDask:
|
|
125
271
|
|
126
272
|
@staticmethod
|
127
273
|
def is_values_queryset(qs):
|
274
|
+
"""
|
275
|
+
Determines whether the provided queryset is a values queryset.
|
276
|
+
|
277
|
+
This method checks if the `_iterable_class` attribute of the queryset corresponds
|
278
|
+
to `django.db.models.query.ValuesIterable`. If an exception occurs during the check,
|
279
|
+
the method returns `False`.
|
280
|
+
|
281
|
+
:param qs: The queryset to be checked.
|
282
|
+
:type qs: django.db.models.query.QuerySet
|
283
|
+
:return: A boolean indicating whether the queryset is a values queryset.
|
284
|
+
:rtype: bool
|
285
|
+
"""
|
128
286
|
try:
|
129
287
|
return qs._iterable_class == django.db.models.query.ValuesIterable
|
130
288
|
except:
|
@@ -132,7 +290,24 @@ class ReadFrameDask:
|
|
132
290
|
|
133
291
|
@staticmethod
|
134
292
|
def object_to_dict(obj, fields=None):
|
135
|
-
"""
|
293
|
+
"""
|
294
|
+
Converts an object to a dictionary representation.
|
295
|
+
|
296
|
+
This static method transforms an object's attributes into a dictionary.
|
297
|
+
If no specific fields are provided, all attribute key-value pairs are
|
298
|
+
included. The "_state" attribute, if present, is safely removed in this
|
299
|
+
case. When specific fields are supplied, only those fields are included
|
300
|
+
in the resulting dictionary.
|
301
|
+
|
302
|
+
:param obj: The object to be serialized into a dictionary. This object
|
303
|
+
must have the `__dict__` attribute available.
|
304
|
+
:param fields: A list of strings representing the attribute names to
|
305
|
+
include in the dictionary. If None or not provided, all attributes
|
306
|
+
are included except for "_state".
|
307
|
+
:return: A dictionary representation of the object's attributes. If the
|
308
|
+
provided object is None, an empty dictionary is returned.
|
309
|
+
:rtype: dict
|
310
|
+
"""
|
136
311
|
if obj is None:
|
137
312
|
return {} # Return an empty dictionary if obj is None
|
138
313
|
if not fields:
|
@@ -142,7 +317,25 @@ class ReadFrameDask:
|
|
142
317
|
|
143
318
|
@staticmethod
|
144
319
|
def infer_dtypes_from_django(qs):
|
145
|
-
"""
|
320
|
+
"""
|
321
|
+
Infer dtypes from a Django QuerySet model and annotated fields.
|
322
|
+
|
323
|
+
This method infers the appropriate data types (dtypes) for a given
|
324
|
+
Django QuerySet (`qs`) based on the fields defined in its model and
|
325
|
+
any annotated fields included in the QuerySet. The function maps
|
326
|
+
Django model field types to corresponding dtypes compatible with
|
327
|
+
Dask or Pandas dataframes.
|
328
|
+
|
329
|
+
- Fields in the model are identified through their metadata.
|
330
|
+
- Reverse relationships and non-concrete fields are ignored.
|
331
|
+
- Annotated fields are processed separately and default to object
|
332
|
+
dtype if their type cannot be determined.
|
333
|
+
|
334
|
+
:param qs: Django QuerySet whose model is used to infer dtypes.
|
335
|
+
:type qs: QuerySet
|
336
|
+
:return: A mapping of field names to inferred dtypes.
|
337
|
+
:rtype: dict
|
338
|
+
"""
|
146
339
|
django_to_dask_dtype = {
|
147
340
|
'AutoField': 'Int64', # Use nullable integer
|
148
341
|
'BigAutoField': 'Int64',
|
@@ -189,6 +382,21 @@ class ReadFrameDask:
|
|
189
382
|
return dtypes
|
190
383
|
|
191
384
|
def read_frame(self, fillna_value=None):
|
385
|
+
"""
|
386
|
+
Reads a Django QuerySet and returns a dask DataFrame by iterating over the QuerySet in chunks. It
|
387
|
+
handles data type inference, missing values, timezone awareness, and creates partitions to form a
|
388
|
+
single dask DataFrame efficiently.
|
389
|
+
|
390
|
+
This method includes functionality for managing missing values, inferring data types from Django fields,
|
391
|
+
and handling timezone-aware datetime objects. It processes data in chunks to optimize memory usage and
|
392
|
+
supports converting chunks into pandas DataFrames before combining them into a unified dask DataFrame.
|
393
|
+
|
394
|
+
:param fillna_value: The value to fill NaN values in the DataFrame. If None, NaNs are not filled.
|
395
|
+
:type fillna_value: Any
|
396
|
+
:return: A dask DataFrame constructed from the QuerySet after processing and combining all
|
397
|
+
its partitions.
|
398
|
+
:rtype: dask.dataframe.DataFrame
|
399
|
+
"""
|
192
400
|
qs = self.qs
|
193
401
|
coerce_float = self.coerce_float
|
194
402
|
verbose = self.verbose
|