sibi-dst 0.3.31__py3-none-any.whl → 0.3.33__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -9,11 +9,74 @@ from sibi_dst.utils import DateUtils
9
9
 
10
10
 
11
11
  class ParquetArtifact(DfHelper):
12
+ """
13
+ Class designed to manage Parquet data storage and retrieval using a specified
14
+ DataWrapper class for data processing. It provides functionality for loading,
15
+ updating, rebuilding, and generating Parquet files within a configurable
16
+ storage filesystem. The class ensures that all essential configurations and
17
+ filesystems are properly set up before operations.
18
+
19
+ Detailed functionality includes support for dynamically managing and generating
20
+ Parquet files based on time periods, with customizable options for paths,
21
+ filenames, date fields, and more. It is an abstraction for efficiently handling
22
+ storage tasks related to distributed or local file systems.
23
+
24
+ :ivar config: Configuration dictionary containing all configurable parameters
25
+ for managing Parquet data storage, such as paths, filenames,
26
+ and date ranges.
27
+ :type config: dict
28
+ :ivar df: Cached Dask DataFrame used to store and manipulate data loaded
29
+ from the Parquet file.
30
+ :type df: Optional[dask.dataframe.DataFrame]
31
+ :ivar data_wrapper_class: Class responsible for abstracting data processing
32
+ operations required for Parquet file generation.
33
+ :type data_wrapper_class: type
34
+ :ivar date_field: Name of the field used to identify and process data by date.
35
+ :type date_field: Optional[str]
36
+ :ivar parquet_storage_path: Filesystem path to store Parquet files.
37
+ :type parquet_storage_path: Optional[str]
38
+ :ivar parquet_filename: Name of the Parquet file to be generated and managed.
39
+ :type parquet_filename: Optional[str]
40
+ :ivar parquet_start_date: Date string specifying the start date for data range
41
+ processing.
42
+ :type parquet_start_date: Optional[str]
43
+ :ivar parquet_end_date: Date string specifying the end date for data range
44
+ processing.
45
+ :type parquet_end_date: Optional[str]
46
+ :ivar filesystem_type: Type of the filesystem used for managing storage
47
+ operations (e.g., `file`, `s3`, etc.).
48
+ :type filesystem_type: str
49
+ :ivar filesystem_options: Additional options for configuring the filesystem.
50
+ :type filesystem_options: dict
51
+ :ivar fs: Filesystem object used for storage operations.
52
+ :type fs: fsspec.AbstractFileSystem
53
+ """
12
54
  DEFAULT_CONFIG = {
13
55
  'backend': 'parquet'
14
56
  }
15
57
 
16
58
  def __init__(self, data_wrapper_class, **kwargs):
59
+ """
60
+ Initializes an instance of the class with given configuration and validates
61
+ required parameters. Sets up the filesystem to handle storage, ensuring
62
+ necessary directories exist. The configuration supports a variety of options
63
+ to manage parquet storage requirements, including paths, filenames, and date
64
+ ranges.
65
+
66
+ :param data_wrapper_class: The class responsible for wrapping data to be managed
67
+ by this instance.
68
+ :type data_wrapper_class: type
69
+ :param kwargs: Arbitrary keyword arguments to override default configuration.
70
+ Includes settings for `date_field`, `parquet_storage_path`,
71
+ `parquet_filename`, `parquet_start_date`, `parquet_end_date`,
72
+ `filesystem_type`, `filesystem_options`, and `fs`.
73
+ :type kwargs: dict
74
+
75
+ :raises ValueError: If any of the required configuration options
76
+ (`date_field`, `parquet_storage_path`,
77
+ `parquet_filename`, `parquet_start_date`,
78
+ or `parquet_end_date`) are missing or not set properly.
79
+ """
17
80
  self.config = {
18
81
  **self.DEFAULT_CONFIG,
19
82
  **kwargs,
@@ -61,6 +124,11 @@ class ParquetArtifact(DfHelper):
61
124
  dw = DataWrapper(self.data_wrapper_class, **params)
62
125
  dw.process()
63
126
 
127
+ def __exit__(self, exc_type, exc_value, traceback):
128
+ # Ensure resources are cleaned up
129
+ if self.fs:
130
+ self.fs.close()
131
+
64
132
  def update_parquet(self, period: str = 'today', **kwargs) -> None:
65
133
  """Update the Parquet file with data from a specific period."""
66
134
  kwargs.update(self.parse_parquet_period(period=period))
@@ -7,6 +7,42 @@ from sibi_dst.df_helper import DfHelper
7
7
 
8
8
 
9
9
  class ParquetReader(DfHelper):
10
+ """
11
+ This class is a specialized helper for reading and managing Parquet files.
12
+
13
+ The `ParquetReader` class is designed to facilitate working with Parquet
14
+ datasets stored across different filesystems. It initializes the required
15
+ resources, ensures the existence of the specified Parquet directory,
16
+ and provides an abstraction to load the data into a Dask DataFrame.
17
+
18
+ The class requires configuration for the storage path and dates defining
19
+ a range of interest. It also supports various filesystem types through
20
+ `fsspec`.
21
+
22
+ :ivar config: Holds the final configuration for this instance, combining
23
+ `DEFAULT_CONFIG` with user-provided configuration.
24
+ :type config: dict
25
+ :ivar df: Stores the loaded Dask DataFrame after the `load()` method is
26
+ invoked. Initially set to None.
27
+ :type df: Optional[dd.DataFrame]
28
+ :ivar parquet_storage_path: The path to the Parquet storage directory.
29
+ :type parquet_storage_path: str
30
+ :ivar parquet_start_date: Start date for Parquet data selection. Must
31
+ be set in the configuration.
32
+ :type parquet_start_date: str
33
+ :ivar parquet_end_date: End date for Parquet data selection. Must be
34
+ set in the configuration.
35
+ :type parquet_end_date: str
36
+ :ivar filesystem_type: The type of filesystem the Parquet files are
37
+ stored on (e.g., "file", "s3").
38
+ :type filesystem_type: str
39
+ :ivar filesystem_options: Any additional options required for the
40
+ specified filesystem type.
41
+ :type filesystem_options: dict
42
+ :ivar fs: Instance of `fsspec` filesystem used to interact with the
43
+ Parquet storage.
44
+ :type fs: fsspec.AbstractFileSystem
45
+ """
10
46
  DEFAULT_CONFIG = {
11
47
  'backend': 'parquet'
12
48
  }
@@ -31,7 +67,10 @@ class ParquetReader(DfHelper):
31
67
  # Filesystem setup
32
68
  self.filesystem_type = filesystem_type
33
69
  self.filesystem_options = filesystem_options or {}
34
- self.fs = fsspec.filesystem(self.filesystem_type, **self.filesystem_options)
70
+ self.fs = self.config.setdefault('fs', None)
71
+ if self.fs is None:
72
+ self.fs = fsspec.filesystem(self.filesystem_type, **self.filesystem_options)
73
+ self.config.setdefault('fs', self.fs)
35
74
 
36
75
  if not self.directory_exists():
37
76
  raise ValueError(f"{self.parquet_storage_path} does not exist")
@@ -48,3 +87,8 @@ class ParquetReader(DfHelper):
48
87
  return info['type'] == 'directory'
49
88
  except FileNotFoundError:
50
89
  return False
90
+
91
+ def __exit__(self, exc_type, exc_value, traceback):
92
+ # Ensure resources are cleaned up
93
+ if self.fs:
94
+ self.fs.close()
@@ -6,6 +6,27 @@ from ._sql_model_builder import DjangoSqlModelBuilder
6
6
 
7
7
 
8
8
  class DjangoConnectionConfig(BaseModel):
9
+ """
10
+ Represents a configuration for establishing a Django database connection.
11
+
12
+ This class is used for defining the configurations necessary to establish a Django
13
+ database connection. It supports dynamic model generation if the model is not
14
+ provided explicitly. It also validates the connection configuration to ensure it
15
+ is properly set up before being used.
16
+
17
+ :ivar live: Indicates whether the connection is live. Automatically set to False if
18
+ a table is provided without a pre-built model.
19
+ :type live: bool
20
+ :ivar connection_name: The name of the database connection to use. This is a mandatory
21
+ parameter and must be provided.
22
+ :type connection_name: str
23
+ :ivar table: The name of the database table to use. Required for dynamic model
24
+ generation when no model is provided.
25
+ :type table: str
26
+ :ivar model: The Django model that represents the database table. If not provided,
27
+ this can be generated dynamically by using the table name.
28
+ :type model: Any
29
+ """
9
30
  live: bool = False
10
31
  connection_name: str = None
11
32
  table: str = None
@@ -13,6 +34,18 @@ class DjangoConnectionConfig(BaseModel):
13
34
 
14
35
  @model_validator(mode="after")
15
36
  def check_model(self):
37
+ """
38
+ Validates and modifies the instance based on the provided attributes and conditions.
39
+ This method ensures that all required parameters are populated and consistent, and it
40
+ dynamically builds a model if necessary. The method also ensures the connection is
41
+ validated after the model preparation process.
42
+
43
+ :raises ValueError: If `connection_name` is not provided.
44
+ :raises ValueError: If `table` name is not specified when building the model dynamically.
45
+ :raises ValueError: If there are errors during the dynamic model-building process.
46
+ :raises ValueError: If `validate_connection` fails due to invalid configuration.
47
+ :return: The validated and potentially mutated instance.
48
+ """
16
49
  # connection_name is mandatory
17
50
  if self.connection_name is None:
18
51
  raise ValueError("Connection name must be specified")
@@ -38,7 +71,14 @@ class DjangoConnectionConfig(BaseModel):
38
71
  return self
39
72
 
40
73
  def validate_connection(self):
41
- """Test if the database connection is valid by executing a simple query."""
74
+ """
75
+ Ensures the database connection is valid by performing a simple
76
+ query. Raises a ValueError if the connection is broken or if any
77
+ other exception occurs during the query.
78
+
79
+ :raises ValueError: If the connection to the database cannot be
80
+ established or if the query fails.
81
+ """
42
82
  try:
43
83
  # Perform a simple query to test the connection
44
84
  self.model.objects.using(self.connection_name).exists()
@@ -11,6 +11,28 @@ from django.utils.encoding import force_str as force_text
11
11
 
12
12
 
13
13
  class ReadFrameDask:
14
+ """
15
+ Handles Django ORM QuerySet to Dask DataFrame conversion with support for field
16
+ type inference, chunked data retrieval, and verbose updates.
17
+
18
+ This class provides methods to efficiently convert a Django QuerySet into a
19
+ Dask DataFrame while preserving field types and incorporating additional
20
+ capabilities such as replacing fields with verbose choices or related object
21
+ information. The class design leverages static and class methods to maintain
22
+ flexibility and reusability for handling Django model fields and their data
23
+ types.
24
+
25
+ :ivar qs: The Django QuerySet to be converted into a Dask DataFrame.
26
+ :type qs: django.db.models.query.QuerySet
27
+ :ivar coerce_float: Whether to attempt to coerce numeric values to floats.
28
+ :type coerce_float: bool
29
+ :ivar chunk_size: The number of records to fetch and process per chunk from
30
+ the QuerySet.
31
+ :type chunk_size: int
32
+ :ivar verbose: If True, provides verbose updates during DataFrame creation
33
+ by replacing fields with readable representations (e.g., verbose names).
34
+ :type verbose: bool
35
+ """
14
36
  FieldDoesNotExist = (
15
37
  django.core.exceptions.FieldDoesNotExist
16
38
  if django.VERSION < (1, 8)
@@ -22,6 +44,22 @@ class ReadFrameDask:
22
44
  qs,
23
45
  **kwargs,
24
46
  ):
47
+ """
48
+ An initialization method that sets instance attributes based on provided
49
+ arguments or default values using the keyword arguments. The method allows
50
+ customization of behaviors like coercing data types, handling chunked operations,
51
+ and verbosity level during execution.
52
+
53
+ :param qs: A data source or query set for processing; its type is dependent
54
+ on the expected data being handled.
55
+ :param kwargs: Additional keyword arguments that may include:
56
+ - coerce_float: A boolean indicating whether floats should be coerced
57
+ during handling. Default is False.
58
+ - chunk_size: An integer value representing the size of chunks for
59
+ data processing. Default is 1000.
60
+ - verbose: A boolean to specify if verbose logging or output
61
+ should occur during execution. Default is True.
62
+ """
25
63
  self.qs = qs
26
64
  self.coerce_float = kwargs.setdefault("coerce_float", False)
27
65
  self.chunk_size = kwargs.setdefault("chunk_size", 1000)
@@ -29,6 +67,19 @@ class ReadFrameDask:
29
67
 
30
68
  @staticmethod
31
69
  def replace_from_choices(choices):
70
+ """
71
+ Provides a method to replace elements in a list of values based on a mapping of choices.
72
+
73
+ This static method generates a closure function that replaces items in a list by
74
+ looking up their corresponding values in a provided dictionary of choices. If an
75
+ item cannot be found in the dictionary, it is left unchanged.
76
+
77
+ :param choices:
78
+ Dictionary where keys are original values and values are their replacements.
79
+ :return:
80
+ A function that takes a list of values and replaces elements using the
81
+ provided choices dictionary.
82
+ """
32
83
  def inner(values):
33
84
  return [choices.get(v, v) for v in values]
34
85
 
@@ -36,10 +87,35 @@ class ReadFrameDask:
36
87
 
37
88
  @staticmethod
38
89
  def get_model_name(model):
90
+ """
91
+ Retrieves the model name from a given Django model instance.
92
+
93
+ This method accesses the `_meta.model_name` attribute of the provided
94
+ model object to extract and return the model's name.
95
+
96
+ :param model: A Django model instance from which the model name is
97
+ derived.
98
+ :type model: object
99
+ :return: The name of the model as a string.
100
+ :rtype: str
101
+ """
39
102
  return model._meta.model_name
40
103
 
41
104
  @staticmethod
42
105
  def get_related_model(field):
106
+ """
107
+ Retrieve the related model from the provided field.
108
+
109
+ This function determines the related model associated with the given field.
110
+ It checks various attributes commonly used to indicate relations in models and
111
+ retrieves the related model if present.
112
+
113
+ :param field: The field from which the related model is to be extracted.
114
+ It must be an object that potentially contains attributes like
115
+ `related_model` or `rel`.
116
+ :return: The related model associated with the provided field, or None if
117
+ no such model is found.
118
+ """
43
119
  model = None
44
120
  if hasattr(field, "related_model") and field.related_model:
45
121
  model = field.related_model
@@ -49,12 +125,43 @@ class ReadFrameDask:
49
125
 
50
126
  @classmethod
51
127
  def get_base_cache_key(cls, model):
128
+ """
129
+ Generates a base cache key for caching purposes.
130
+
131
+ This method constructs a base cache key that can be used in conjunction with
132
+ Django models to uniquely identify cache entries. The key is formatted to
133
+ include the app label and model name, ensuring that cache entries are
134
+ namespaced accordingly.
135
+
136
+ :param model: A Django model instance for which the base cache key is generated.
137
+ :type model: Model
138
+ :return: The string template for the base cache key, where `%s` can be replaced
139
+ with specific identifiers to create unique keys.
140
+ :rtype: str
141
+ """
52
142
  return (
53
143
  f"dask_{model._meta.app_label}_{cls.get_model_name(model)}_%s_rendering"
54
144
  )
55
145
 
56
146
  @classmethod
57
147
  def replace_pk(cls, model):
148
+ """
149
+ Generates a function that replaces primary keys in a pandas Series with their
150
+ corresponding cached values or database-retrieved representations.
151
+
152
+ The function uses a cache mechanism to retrieve pre-stored values for primary
153
+ keys in the series. If some primary keys are not found in the cache, it queries
154
+ the database for their representations, updates the cache, and replaces the
155
+ primary keys in the series accordingly.
156
+
157
+ :param model: The Django model class associated with the primary keys to be
158
+ processed.
159
+ :type model: Type[Model]
160
+
161
+ :return: A function that takes a pandas Series of primary keys as input and
162
+ returns a Series with replaced values based on cache or database retrieval.
163
+ :rtype: callable
164
+ """
58
165
  base_cache_key = cls.get_base_cache_key(model)
59
166
 
60
167
  def get_cache_key_from_pk(pk):
@@ -84,6 +191,20 @@ class ReadFrameDask:
84
191
 
85
192
  @classmethod
86
193
  def build_update_functions(cls, fieldnames, fields):
194
+ """
195
+ This method is responsible for building update functions based on the provided
196
+ fieldnames and fields. It performs validation for the field type, checks for
197
+ specific conditions such as `choices` or `ForeignKey` field types, and generates
198
+ a generator of update functions for the given fieldnames and fields.
199
+
200
+ :param fieldnames: A list of field names to be processed.
201
+ :type fieldnames: list[str]
202
+ :param fields: A list of field objects corresponding to the fieldnames.
203
+ :type fields: list[Field]
204
+ :return: A generator yielding tuples where the first element is a fieldname,
205
+ and the second element is the corresponding update function or None.
206
+ :rtype: generator[tuple[str, Callable | None]]
207
+ """
87
208
  for fieldname, field in zip(fieldnames, fields):
88
209
  if not isinstance(field, Field):
89
210
  yield fieldname, None
@@ -96,13 +217,38 @@ class ReadFrameDask:
96
217
 
97
218
  @classmethod
98
219
  def update_with_verbose(cls, df, fieldnames, fields):
220
+ """
221
+ Updates the provided dataframe by applying transformation functions to specified fields.
222
+ The method iterates over the provided field names and their corresponding functions, applying
223
+ each transformation function to its related column in the dataframe.
224
+
225
+ :param df: The input dataframe to be updated.
226
+ :param fieldnames: A list of field names in the dataframe that need to be updated.
227
+ :param fields: A list of Django field objects corresponding to the field names, from which the per-column update functions are built.
228
+ :return: None. The dataframe is modified in place.
229
+ """
99
230
  for fieldname, function in cls.build_update_functions(fieldnames, fields):
100
231
  if function is not None:
101
232
  df[fieldname] = df[fieldname].map_partitions(lambda x: function(x))
102
233
 
103
234
  @classmethod
104
235
  def to_fields(cls, qs, fieldnames):
105
- """Get fields from a queryset based on the given fieldnames."""
236
+ """
237
+ Converts field names from a queryset into corresponding field objects, resolving relationships
238
+ and related objects if necessary. This method is typically used to yield fully-resolved field
239
+ objects for further interaction.
240
+
241
+ :param qs: A QuerySet object from which the fields are resolved. This object provides access
242
+ to the model and its metadata from which the fields are retrieved.
243
+ :type qs: QuerySet
244
+
245
+ :param fieldnames: A list of field name strings. These can include nested fields separated by
246
+ double underscores (__) to denote relationships or subfields.
247
+ :type fieldnames: List[str]
248
+
249
+ :return: A generator that yields resolved field objects corresponding to the provided field names.
250
+ :rtype: Generator[Field, None, None]
251
+ """
106
252
  for fieldname in fieldnames:
107
253
  model = qs.model
108
254
  for fieldname_part in fieldname.split("__"):
@@ -125,6 +271,18 @@ class ReadFrameDask:
125
271
 
126
272
  @staticmethod
127
273
  def is_values_queryset(qs):
274
+ """
275
+ Determines whether the provided queryset is a values queryset.
276
+
277
+ This method checks if the `_iterable_class` attribute of the queryset corresponds
278
+ to `django.db.models.query.ValuesIterable`. If an exception occurs during the check,
279
+ the method returns `False`.
280
+
281
+ :param qs: The queryset to be checked.
282
+ :type qs: django.db.models.query.QuerySet
283
+ :return: A boolean indicating whether the queryset is a values queryset.
284
+ :rtype: bool
285
+ """
128
286
  try:
129
287
  return qs._iterable_class == django.db.models.query.ValuesIterable
130
288
  except:
@@ -132,7 +290,24 @@ class ReadFrameDask:
132
290
 
133
291
  @staticmethod
134
292
  def object_to_dict(obj, fields=None):
135
- """Convert a Django model instance to a dictionary based on specified fields."""
293
+ """
294
+ Converts an object to a dictionary representation.
295
+
296
+ This static method transforms an object's attributes into a dictionary.
297
+ If no specific fields are provided, all attribute key-value pairs are
298
+ included. The "_state" attribute, if present, is safely removed in this
299
+ case. When specific fields are supplied, only those fields are included
300
+ in the resulting dictionary.
301
+
302
+ :param obj: The object to be serialized into a dictionary. This object
303
+ must have the `__dict__` attribute available.
304
+ :param fields: A list of strings representing the attribute names to
305
+ include in the dictionary. If None or not provided, all attributes
306
+ are included except for "_state".
307
+ :return: A dictionary representation of the object's attributes. If the
308
+ provided object is None, an empty dictionary is returned.
309
+ :rtype: dict
310
+ """
136
311
  if obj is None:
137
312
  return {} # Return an empty dictionary if obj is None
138
313
  if not fields:
@@ -142,7 +317,25 @@ class ReadFrameDask:
142
317
 
143
318
  @staticmethod
144
319
  def infer_dtypes_from_django(qs):
145
- """Infers Dask data types based on Django queryset model fields, with support for nullable integers."""
320
+ """
321
+ Infer dtypes from a Django QuerySet model and annotated fields.
322
+
323
+ This method infers the appropriate data types (dtypes) for a given
324
+ Django QuerySet (`qs`) based on the fields defined in its model and
325
+ any annotated fields included in the QuerySet. The function maps
326
+ Django model field types to corresponding dtypes compatible with
327
+ Dask or Pandas dataframes.
328
+
329
+ - Fields in the model are identified through their metadata.
330
+ - Reverse relationships and non-concrete fields are ignored.
331
+ - Annotated fields are processed separately and default to object
332
+ dtype if their type cannot be determined.
333
+
334
+ :param qs: Django QuerySet whose model is used to infer dtypes.
335
+ :type qs: QuerySet
336
+ :return: A mapping of field names to inferred dtypes.
337
+ :rtype: dict
338
+ """
146
339
  django_to_dask_dtype = {
147
340
  'AutoField': 'Int64', # Use nullable integer
148
341
  'BigAutoField': 'Int64',
@@ -189,6 +382,21 @@ class ReadFrameDask:
189
382
  return dtypes
190
383
 
191
384
  def read_frame(self, fillna_value=None):
385
+ """
386
+ Reads a Django QuerySet and returns a dask DataFrame by iterating over the QuerySet in chunks. It
387
+ handles data type inference, missing values, timezone awareness, and creates partitions to form a
388
+ single dask DataFrame efficiently.
389
+
390
+ This method includes functionality for managing missing values, inferring data types from Django fields,
391
+ and handling timezone-aware datetime objects. It processes data in chunks to optimize memory usage and
392
+ supports converting chunks into pandas DataFrames before combining them into a unified dask DataFrame.
393
+
394
+ :param fillna_value: The value to fill NaN values in the DataFrame. If None, NaNs are not filled.
395
+ :type fillna_value: Any
396
+ :return: A dask DataFrame constructed from the QuerySet after processing and combining all
397
+ its partitions.
398
+ :rtype: dask.dataframe.DataFrame
399
+ """
192
400
  qs = self.qs
193
401
  coerce_float = self.coerce_float
194
402
  verbose = self.verbose
@@ -10,9 +10,57 @@ from sibi_dst.utils import Logger
10
10
 
11
11
 
12
12
  class DjangoLoadFromDb:
13
+ """
14
+ Handles loading data from a Django database into a Dask DataFrame, with support for filtering
15
+ and column type conversion.
16
+
17
+ This class is designed to interface with Django ORM models, allowing data querying and mapping
18
+ Django model fields to Dask DataFrame columns. It accommodates filtering logic provided via
19
+ parameters and ensures that excessive data is not accidentally loaded when no filters are applied.
20
+
21
+ :ivar connection_config: Configuration for the database connection, including the Django model
22
+ and connection details.
23
+ :type connection_config: Any
24
+ :ivar query_config: Configuration for the query, including the number of records to retrieve.
25
+ :type query_config: Any
26
+ :ivar params_config: Configuration for query parameters, including filters and DataFrame options.
27
+ :type params_config: Any
28
+ :ivar logger: Logger instance used for debugging and reporting runtime information.
29
+ :type logger: Logger
30
+ :ivar debug: Indicates whether debug mode is active for verbose logging.
31
+ :type debug: bool
32
+ :ivar df: Dask DataFrame to hold the loaded query results.
33
+ :type df: dd.DataFrame
34
+ """
13
35
  df: dd.DataFrame
14
36
 
15
37
  def __init__(self, db_connection, db_query, db_params, logger, **kwargs):
38
+ """
39
+ Initializes the loader with the database connection configuration along with the
40
+ specified query and parameters. It ensures the required model is defined
41
+ and sets up logging. Additional configurations can be provided via keyword
42
+ arguments.
43
+
44
+ :param db_connection: The configuration object representing the database
45
+ connection details.
46
+ :type db_connection: Any
47
+ :param db_query: The configuration or object for defining the database
48
+ query.
49
+ :type db_query: Any
50
+ :param db_params: The configuration or object for defining parameters
51
+ to be passed to the query.
52
+ :type db_params: Any
53
+ :param logger: An instance of a logging class used to log debug or
54
+ error messages. The argument is required, but if a falsy value (such as None) is
55
+ passed, a default logger named after this class is used.
56
+ :type logger: Any, optional
57
+ :param kwargs: Additional keyword arguments for custom configurations
58
+ like `debug`. These can include optional parameters to be parsed by
59
+ `params_config`.
60
+ :type kwargs: dict
61
+ :raises ValueError: If no model is specified in the given database
62
+ connection configuration.
63
+ """
16
64
  self.connection_config = db_connection
17
65
  self.debug = kwargs.pop('debug', False)
18
66
  self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
@@ -27,11 +75,35 @@ class DjangoLoadFromDb:
27
75
  self.params_config.parse_params(kwargs)
28
76
 
29
77
  def build_and_load(self):
78
+ """
79
+ Builds and loads data into a DataFrame by invoking the `_build_and_load` method.
80
+ This is a utility method designed to perform specific operations for constructing
81
+ and preparing the data. The loaded data will then be assigned to the instance
82
+ attribute `df`.
83
+
84
+ :param self: Reference to the current instance of the class.
85
+ :type self: object
86
+
87
+ :return: DataFrame containing the built and loaded data.
88
+ """
30
89
  self.df = self._build_and_load()
31
90
  # self.df = self._convert_columns(self.df)
32
91
  return self.df
33
92
 
34
93
  def _build_and_load(self) -> dd.DataFrame:
94
+ """
95
+ Builds and loads a Dask DataFrame based on the provided query and configuration. This method queries the data
96
+ model using the specified connection, applies filters if provided, and converts the query result into a
97
+ Dask DataFrame. If filters are not provided, only the first `n_records` entries are processed to avoid
98
+ unintentionally loading the entire table.
99
+
100
+ :raises Exception: If an error occurs while loading the query, it logs the error and initializes an
101
+ empty Dask DataFrame.
102
+
103
+ :return: A Dask DataFrame containing the queried data. If no filters or valid results are provided,
104
+ an empty Dask DataFrame is returned.
105
+ :rtype: dd.DataFrame
106
+ """
35
107
  query = self.connection_config.model.objects.using(self.connection_config.connection_name)
36
108
  if not self.params_config.filters:
37
109
  # IMPORTANT: if no filters are provided show only the first n_records
@@ -54,6 +126,22 @@ class DjangoLoadFromDb:
54
126
 
55
127
  @staticmethod
56
128
  def __build_query_objects(filters: dict, use_exclude: bool):
129
+ """
130
+ Constructs and returns a composite Q object based on the provided `filters` dictionary.
131
+ The function determines whether to include or exclude the filter conditions in the final
132
+ query based on the `use_exclude` parameter. If `use_exclude` is False, the filters are
133
+ directly added to the composite Q object. If `use_exclude` is True, the negation of
134
+ the filters is added instead.
135
+
136
+ :param filters: A dictionary containing filter conditions where keys represent field names
137
+ and values represent the conditions to be applied.
138
+ :type filters: dict
139
+ :param use_exclude: A boolean flag determining whether to exclude (`True`) or include
140
+ (`False`) the provided filter conditions.
141
+ :type use_exclude: bool
142
+ :return: A composite Q object that aggregates the filters based on the given conditions.
143
+ :rtype: Q
144
+ """
57
145
  q_objects = Q()
58
146
  for key, value in filters.items():
59
147
  if not use_exclude:
@@ -64,10 +152,17 @@ class DjangoLoadFromDb:
64
152
 
65
153
  def _convert_columns(self, df: dd.DataFrame) -> dd.DataFrame:
66
154
  """
67
- Convert the data types of columns in a Dask DataFrame based on the field type in the Django model.
155
+ [DEPRECATED] Convert the data types of columns in a Dask DataFrame based on the field type in the Django model.
156
+
157
+ This function is deprecated and will be removed in a future release. The method converts the data
158
+ types of columns in a Dask DataFrame to match their corresponding field types defined in a Django model.
159
+ It emits warnings and logs deprecation notes. The conversions are applied lazily and partition-wise
160
+ to support distributed computation.
68
161
 
69
162
  :param df: Dask DataFrame whose columns' data types are to be converted.
163
+ :type df: dd.DataFrame
70
164
  :return: Dask DataFrame with converted column data types.
165
+ :rtype: dd.DataFrame
71
166
  """
72
167
  """
73
168
  [DEPRECATED] Convert the data types of columns in a Dask DataFrame based on the field type in the Django model.