sibi-dst 0.3.32__py3-none-any.whl → 0.3.34__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- sibi_dst/df_helper/_df_helper.py +108 -5
- sibi_dst/df_helper/_parquet_artifact.py +63 -0
- sibi_dst/df_helper/_parquet_reader.py +36 -0
- sibi_dst/df_helper/backends/django/_db_connection.py +41 -1
- sibi_dst/df_helper/backends/django/_io_dask.py +211 -3
- sibi_dst/df_helper/backends/django/_load_from_db.py +96 -1
- sibi_dst/df_helper/backends/django/_sql_model_builder.py +132 -6
- sibi_dst/df_helper/backends/http/_http_config.py +52 -1
- sibi_dst/df_helper/backends/parquet/_filter_handler.py +28 -0
- sibi_dst/df_helper/backends/parquet/_parquet_options.py +105 -1
- sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +17 -0
- sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +80 -2
- sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +90 -29
- sibi_dst/df_helper/core/_params_config.py +59 -0
- sibi_dst/geopy_helper/geo_location_service.py +14 -0
- sibi_dst/geopy_helper/utils.py +37 -3
- sibi_dst/osmnx_helper/base_osm_map.py +254 -0
- sibi_dst/osmnx_helper/utils.py +226 -4
- sibi_dst/utils/clickhouse_writer.py +27 -0
- sibi_dst/utils/data_utils.py +32 -1
- sibi_dst/utils/data_wrapper.py +94 -6
- sibi_dst/utils/date_utils.py +35 -0
- sibi_dst/utils/log_utils.py +19 -2
- sibi_dst/utils/parquet_saver.py +0 -106
- {sibi_dst-0.3.32.dist-info → sibi_dst-0.3.34.dist-info}/METADATA +1 -1
- {sibi_dst-0.3.32.dist-info → sibi_dst-0.3.34.dist-info}/RECORD +27 -27
- {sibi_dst-0.3.32.dist-info → sibi_dst-0.3.34.dist-info}/WHEEL +0 -0
sibi_dst/df_helper/backends/django/_load_from_db.py

```diff
@@ -10,9 +10,57 @@ from sibi_dst.utils import Logger
 
 
 class DjangoLoadFromDb:
+    """
+    Handles loading data from a Django database into a Dask DataFrame, with support for filtering
+    and column type conversion.
+
+    This class is designed to interface with Django ORM models, allowing data querying and mapping
+    Django model fields to Dask DataFrame columns. It accommodates filtering logic provided via
+    parameters and ensures that excessive data is not accidentally loaded when no filters are applied.
+
+    :ivar connection_config: Configuration for the database connection, including the Django model
+        and connection details.
+    :type connection_config: Any
+    :ivar query_config: Configuration for the query, including the number of records to retrieve.
+    :type query_config: Any
+    :ivar params_config: Configuration for query parameters, including filters and DataFrame options.
+    :type params_config: Any
+    :ivar logger: Logger instance used for debugging and reporting runtime information.
+    :type logger: Logger
+    :ivar debug: Indicates whether debug mode is active for verbose logging.
+    :type debug: bool
+    :ivar df: Dask DataFrame to hold the loaded query results.
+    :type df: dd.DataFrame
+    """
     df: dd.DataFrame
 
     def __init__(self, db_connection, db_query, db_params, logger, **kwargs):
+        """
+        This class initializes and configures a database connection along with the
+        specified query and parameters. It ensures the required model is defined
+        and sets up logging. Additional configurations can be provided via keyword
+        arguments.
+
+        :param db_connection: The configuration object representing the database
+            connection details.
+        :type db_connection: Any
+        :param db_query: The configuration or object for defining the database
+            query.
+        :type db_query: Any
+        :param db_params: The configuration or object for defining parameters
+            to be passed to the query.
+        :type db_params: Any
+        :param logger: An instance of a logging class used to log debug or
+            error messages, defaults to the class's default logger if not
+            specified.
+        :type logger: Any, optional
+        :param kwargs: Additional keyword arguments for custom configurations
+            like `debug`. These can include optional parameters to be parsed by
+            `params_config`.
+        :type kwargs: dict
+        :raises ValueError: If no model is specified in the given database
+            connection configuration.
+        """
         self.connection_config = db_connection
         self.debug = kwargs.pop('debug', False)
         self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
@@ -27,11 +75,35 @@ class DjangoLoadFromDb:
         self.params_config.parse_params(kwargs)
 
     def build_and_load(self):
+        """
+        Builds and loads data into a DataFrame by invoking the `_build_and_load` method.
+        This is a utility method designed to perform specific operations for constructing
+        and preparing the data. The loaded data will then be assigned to the instance
+        attribute `df`.
+
+        :param self: Reference to the current instance of the class.
+        :type self: object
+
+        :return: DataFrame containing the built and loaded data.
+        """
         self.df = self._build_and_load()
         # self.df = self._convert_columns(self.df)
         return self.df
 
     def _build_and_load(self) -> dd.DataFrame:
+        """
+        Builds and loads a Dask DataFrame based on the provided query and configuration. This method queries the data
+        model using the specified connection, applies filters if provided, and converts the query result into a
+        Dask DataFrame. If filters are not provided, only the first `n_records` entries are processed to avoid
+        unintentionally loading the entire table.
+
+        :raises Exception: If an error occurs while loading the query, it logs the error and initializes an
+            empty Dask DataFrame.
+
+        :return: A Dask DataFrame containing the queried data. If no filters or valid results are provided,
+            an empty Dask DataFrame is returned.
+        :rtype: dd.DataFrame
+        """
         query = self.connection_config.model.objects.using(self.connection_config.connection_name)
         if not self.params_config.filters:
             # IMPORTANT: if no filters are provided show only the first n_records
```
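
The `_build_and_load` docstring above describes materializing a capped queryset into a Dask DataFrame when no filters are given. A minimal sketch of that general pattern, independent of this package's config objects (the helper name and the `npartitions` choice are illustrative, not sibi-dst's actual code):

```python
# Hedged sketch: materialize a capped Django queryset, then wrap it in Dask.
import dask.dataframe as dd
import pandas as pd

def queryset_to_dask(queryset, n_records=100, npartitions=1) -> dd.DataFrame:
    # Without filters, cap the rows read so a full table is never
    # pulled in by accident (mirrors the n_records safeguard above).
    rows = list(queryset.values()[:n_records])
    pdf = pd.DataFrame.from_records(rows)
    return dd.from_pandas(pdf, npartitions=npartitions)
```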
```diff
@@ -54,6 +126,22 @@ class DjangoLoadFromDb:
 
     @staticmethod
     def __build_query_objects(filters: dict, use_exclude: bool):
+        """
+        Constructs and returns a composite Q object based on the provided `filters` dictionary.
+        The function determines whether to include or exclude the filter conditions in the final
+        query based on the `use_exclude` parameter. If `use_exclude` is False, the filters are
+        directly added to the composite Q object. If `use_exclude` is True, the negation of
+        the filters is added instead.
+
+        :param filters: A dictionary containing filter conditions where keys represent field names
+            and values represent the conditions to be applied.
+        :type filters: dict
+        :param use_exclude: A boolean flag determining whether to exclude (`True`) or include
+            (`False`) the provided filter conditions.
+        :type use_exclude: bool
+        :return: A composite Q object that aggregates the filters based on the given conditions.
+        :rtype: Q
+        """
         q_objects = Q()
         for key, value in filters.items():
             if not use_exclude:
```
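
The include/exclude behavior described for `__build_query_objects` maps directly onto standard Django `Q` composition; a sketch under that assumption (the package's actual operator handling may differ in detail):

```python
# Hedged sketch: AND each condition into one composite Q, negating when excluding.
from django.db.models import Q

def build_query_objects(filters: dict, use_exclude: bool) -> Q:
    q_objects = Q()
    for key, value in filters.items():
        condition = Q(**{key: value})
        q_objects &= ~condition if use_exclude else condition
    return q_objects

# e.g. build_query_objects({"status": "open", "created__gte": "2024-01-01"}, use_exclude=False)
```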
```diff
@@ -64,10 +152,17 @@ class DjangoLoadFromDb:
 
     def _convert_columns(self, df: dd.DataFrame) -> dd.DataFrame:
         """
-        Convert the data types of columns in a Dask DataFrame based on the field type in the Django model.
+        [DEPRECATED] Convert the data types of columns in a Dask DataFrame based on the field type in the Django model.
+
+        This function is deprecated and will be removed in a future release. The method converts the data
+        types of columns in a Dask DataFrame to match their corresponding field types defined in a Django model.
+        It emits warnings and logs deprecation notes. The conversions are applied lazily and partition-wise
+        to support distributed computation.
 
         :param df: Dask DataFrame whose columns' data types are to be converted.
+        :type df: dd.DataFrame
         :return: Dask DataFrame with converted column data types.
+        :rtype: dd.DataFrame
         """
         """
         [DEPRECATED] Convert the data types of columns in a Dask DataFrame based on the field type in the Django model.
```
sibi_dst/df_helper/backends/django/_sql_model_builder.py

```diff
@@ -1,5 +1,4 @@
-
-#
+
 import keyword
 import re
 from functools import lru_cache
@@ -49,13 +48,57 @@ apps_label = "datacubes"
 
 
 class DjangoSqlModelBuilder:
+    """
+    Handles the dynamic creation of Django ORM models based on database table structures.
+
+    This class takes input parameters such as database connection and table name,
+    and dynamically maps the table's schema to a Django ORM model. The resultant model
+    can be used for various ORM operations like querying, saving, and deleting records.
+    The class utilizes Django's introspection features and allows customization
+    through its fields and methods.
+
+    :ivar connection_name: The name of the database connection being used.
+    :type connection_name: str
+    :ivar table: The name of the database table for which the model is being built.
+    :type table: str
+    :ivar model: The dynamically generated Django model or None if not created yet.
+    :type model: type | None
+    """
     def __init__(self, **kwargs):
+        """
+        Represents an initialization method for a class that handles the
+        assignment of attributes and processes the given keyword arguments
+        through an internal utility function. This method sets up the
+        necessary attributes for later use.
+
+        :param kwargs: A collection of keyword arguments used by the internal
+            parsing method to populate the attributes of the class. Specific
+            expected keys and their usage should be detailed in the internal
+            implementation.
+        """
         self.connection_name = None
         self.table = None
         self.model = None
         self.__parse_builder(**kwargs)
 
     def __parse_builder(self, **kwargs):
+        """
+        Parses and initializes the builder properties based on provided keyword
+        arguments. Validates that the required 'connection_name' and 'table'
+        values are present and sets the corresponding attributes. If validation
+        fails, raises appropriate errors. Returns the updated builder object
+        after initialization. This method is primarily intended for internal
+        use to configure the builder.
+
+        :param kwargs: Keyword arguments containing configuration values for
+            initializing the builder. Should include 'connection_name'
+            and 'table' keys.
+        :type kwargs: dict
+        :return: Returns the instance of the builder object after initialization.
+        :rtype: self
+        :raises ValueError: If 'connection_name' or 'table' is not provided in
+            the keyword arguments.
+        """
         self.connection_name = kwargs.get("connection_name", None)
         self.table = kwargs.get("table", None)
         self.model = None
@@ -67,6 +110,22 @@ class DjangoSqlModelBuilder:
 
     @lru_cache(maxsize=None)
     def build_model(self):
+        """
+        Builds and retrieves a model instance with dynamically defined fields.
+
+        This method attempts to retrieve a model instance by its name and, if it
+        does not exist, creates a new model with the specified table structure.
+        The model is either fetched or constructed using the provided data about
+        its fields. The result is cached for repeated calls to improve performance
+        and avoid redundant computations.
+
+        :raises LookupError: If the model cannot be fetched or created due to an
+            invalid lookup.
+
+        :return: A model instance dynamically constructed or retrieved for the
+            specified table and fields.
+        :rtype: Model
+        """
         model = None
         model_fields = self.get_model_fields()
         model_name = self.table2model(self.table)
```
```diff
@@ -78,6 +137,25 @@ class DjangoSqlModelBuilder:
         return model
 
     def create_model(self, name, fields) -> type:
+        """
+        Creates a Django model class dynamically.
+
+        This function takes in a model name and a dictionary of fields, dynamically
+        creates a Meta class where additional metadata for the model (like
+        `db_table`, `managed`, `app_label`) is defined, and then uses Python's
+        standard library `type()` function to generate and return the model class
+        on the fly.
+
+        :param name: The name of the model class to create.
+        :type name: str
+        :param fields: A dictionary mapping field names to their definitions in
+            Django's model field format. Each field definition should include
+            the field type and optional parameters.
+        :type fields: dict
+        :return: The dynamically created Django model class based on the provided
+            name and fields.
+        :rtype: type
+        """
         def parse_args(arg_string):
             arg_dict = {}
             # Match keyword arguments in the form key=value
```
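
The `create_model` docstring names the moving parts (`db_table`, `managed`, `app_label`, and `type()`); a free-standing sketch of that technique follows. The default app label reuses the `apps_label = "datacubes"` constant visible in this module; the `managed=False` choice and the example field are assumptions, not the builder's actual code:

```python
# Hedged sketch of dynamic Django model creation with type().
from django.db import models

def create_model_sketch(name: str, fields: dict, table: str,
                        app_label: str = "datacubes") -> type:
    # The inner Meta class carries the table mapping and tells Django
    # not to create or migrate the table itself (managed=False assumed).
    Meta = type("Meta", (), {"db_table": table, "managed": False,
                             "app_label": app_label})
    attrs = {"__module__": f"{app_label}.models", "Meta": Meta, **fields}
    return type(name, (models.Model,), attrs)

# e.g. create_model_sketch("SalesOrder",
#                          {"id": models.AutoField(primary_key=True)},
#                          table="sales_order")
```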
```diff
@@ -118,9 +196,32 @@ class DjangoSqlModelBuilder:
 
     @staticmethod
     def table2model(table_name):
+        """
+        Converts a database table name to a corresponding model name by transforming
+        it from snake_case to CamelCase. This method takes a string representing
+        a table name, splits it by underscores, capitalizes the first letter of
+        each part, and then joins them into a single string.
+
+        :param table_name: The name of the database table in snake_case format
+        :type table_name: str
+        :return: A string representing the equivalent model name in CamelCase format
+        :rtype: str
+        """
         return "".join([x.title() for x in table_name.split("_")])
 
     def get_model_fields(self):
+        """
+        Generates the data structure for model fields from a database table using
+        introspection. The method extracts information about columns, primary keys,
+        unique constraints, and additional metadata to define the fields of the model.
+
+        :raises ValueError: If the specified connection or table is not found.
+        :raises Exception: For any database or introspection-related errors.
+
+        :returns: Dictionary containing the model field definitions based on the
+            table's structure and metadata.
+        :rtype: dict
+        """
         connection = connections[self.connection_name]
         if connection is None:
             raise ValueError("Connection %s not found" % self.connection_name)
```
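
Since the hunk shows the one-line implementation of `table2model`, the mapping can be pinned down exactly:

```python
# table2model as implemented above: snake_case table -> CamelCase model name.
"".join(x.title() for x in "sales_order_line".split("_"))  # -> 'SalesOrderLine'
# Caveat: str.title() lowercases the rest of each part, so
# "api_keys" becomes "ApiKeys", never "APIKeys".
```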
```diff
@@ -265,7 +366,21 @@ class DjangoSqlModelBuilder:
     @staticmethod
     def normalize_col_name(col_name, used_column_names, is_relation):
         """
-
+        Normalizes a column name to conform to Python's variable naming conventions and addresses potential
+        name conflicts or issues with reserved words. Applies transformations to ensure the column name:
+        - Is lowercase.
+        - Replaces unsuitable characters with underscores.
+        - Avoids conflicts with Python keywords and digits at the start of the name.
+        - Resolves conflicts with previously used column names.
+
+        :param col_name: The original column name provided from the schema.
+        :param used_column_names: A list of previously used column names to avoid naming collisions.
+        :param is_relation: A boolean indicating if the column represents a relation (e.g., foreign key).
+        :return: A tuple containing:
+            - The normalized column name (str).
+            - A dictionary (`field_params`) with any relevant information for database configuration.
+              Includes the original column name if specific transformations were applied.
+            - A list (`field_notes`) containing strings explaining the applied transformations.
         """
         field_params = {}
         field_notes = []
```
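
The bullet list in that docstring condenses to a small standalone function; this is a hedged approximation of the listed rules only (the real method, in the style of Django's `inspectdb`, also returns `field_params` and `field_notes`):

```python
# Hedged sketch of the listed normalization rules.
import keyword
import re

def normalize_col_name_sketch(col_name: str, used_column_names: set) -> str:
    new_name = col_name.lower()
    new_name = re.sub(r"\W", "_", new_name)   # unsuitable chars -> underscores
    if new_name and new_name[0].isdigit():
        new_name = "number_" + new_name       # names cannot start with a digit
    if keyword.iskeyword(new_name):
        new_name += "_field"                  # avoid Python reserved words
    while new_name in used_column_names:
        new_name += "_"                       # resolve collisions
    return new_name
```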
```diff
@@ -326,9 +441,20 @@ class DjangoSqlModelBuilder:
     @staticmethod
     def get_field_type(connection, row):
         """
-
-
-
+        Determines the type of a database field based on its description and connection
+        introspection, and includes metadata such as parameters and additional notes.
+
+        This function extracts the field type from the database's introspection
+        interface and adds corresponding parameters (e.g., `max_length`, `decimal_places`)
+        and relevant notes if certain properties are inferred or guessed.
+
+        :param connection: The database connection object used for introspection.
+        :type connection: Any
+        :param row: An object containing field metadata, such as type code,
+            display size, collation, precision, and scale.
+        :type row: Any
+        :return: A tuple containing the field type, its parameters, and any notes.
+        :rtype: tuple[str, dict, list[str]]
         """
         field_params = {}
         field_notes = []
```
sibi_dst/df_helper/backends/http/_http_config.py

```diff
@@ -9,6 +9,26 @@ from sibi_dst.utils import Logger
 
 
 class HttpConfig(BaseModel):
+    """
+    Configuration for HTTP client operations, designed to manage and fetch data
+    from HTTP endpoints asynchronously. This class serves as a centralized configuration
+    and operation hub encapsulating settings such as base URL, query parameters, API keys,
+    and logger support. It employs `httpx` for HTTP interactions and leverages Dask for the
+    resulting data handling and transformation.
+
+    :ivar base_url: The base URL for HTTP communication.
+    :type base_url: HttpUrl
+    :ivar params: Optional dictionary containing query parameters to be used with GET requests.
+    :type params: Optional[Dict[str, Any]]
+    :ivar logger: The logger instance for logging operations. If not provided, a default logger
+        is initialized using the class name.
+    :type logger: Optional[Logger]
+    :ivar timeout: The timeout value in seconds for HTTP requests. Defaults to 300.
+    :type timeout: Optional[int]
+    :ivar api_key: The optional secret API key for authorization. If present, it will populate
+        the Authorization header in HTTP requests.
+    :type api_key: Optional[SecretStr]
+    """
     base_url: HttpUrl
     params: Optional[Dict[str, Any]] = Field(default_factory=dict)
     logger: Optional[Logger] = None
@@ -17,12 +37,43 @@ class HttpConfig(BaseModel):
     model_config = ConfigDict(arbitrary_types_allowed=True)
 
     def __init__(self, logger=None, **data):
+        """
+        Initializes the class with a logger and other data parameters.
+
+        This constructor allows the option to provide a custom logger. If no logger
+        is supplied during initialization, a default logger specific to the class
+        is created using the Logger utility. It also initializes the instance
+        with additional data passed as keyword arguments.
+
+        :param logger: Optional logger instance. If not provided, a default
+            logger is created using the class name as the logger name.
+        :type logger: logging.Logger, optional
+        :param data: Arbitrary keyword arguments containing data to initialize
+            the class.
+        :type data: dict
+        """
         super().__init__(**data)
         # Initialize the logger if not provided
         self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
 
     async def fetch_data(self, **options) -> dd.DataFrame:
-        """
+        """
+        Fetches data from a specified HTTP JSON source and returns it as a dask DataFrame.
+
+        This asynchronous method constructs a request URL based on the provided options
+        and sends an HTTP GET request. The fetched JSON data is normalized and
+        converted to a dask DataFrame for further use. It handles request errors and
+        JSON parsing errors effectively.
+
+        :param options: Arbitrary keyword arguments representing dynamic path segments
+            to be appended to the base URL.
+        :type options: dict
+        :return: A dask DataFrame containing the structured data retrieved
+            from the HTTP JSON source.
+        :rtype: dd.DataFrame
+        :raises httpx.RequestError: If there is an issue with the HTTP request.
+        :raises ValueError: If there is an error parsing JSON data.
+        """
         try:
             # Build URL with options as path segments
 
```
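
The `fetch_data` docstring spells out the pipeline: async GET via `httpx`, JSON normalized, then wrapped in a Dask DataFrame. A self-contained sketch of that flow; the Bearer scheme for the API key and the single-partition choice are assumptions, and none of sibi-dst's internals are used:

```python
# Hedged sketch of the httpx -> JSON -> pandas -> dask flow described above.
from typing import Optional

import dask.dataframe as dd
import httpx
import pandas as pd

async def fetch_json_as_dask(url: str, timeout: int = 300,
                             api_key: Optional[str] = None) -> dd.DataFrame:
    # Bearer scheme is an assumption; the source only says the key
    # populates the Authorization header.
    headers = {"Authorization": f"Bearer {api_key}"} if api_key else {}
    async with httpx.AsyncClient(timeout=timeout, headers=headers) as client:
        resp = await client.get(url)
        resp.raise_for_status()        # raises httpx.HTTPStatusError on 4xx/5xx
        data = resp.json()             # raises ValueError on malformed JSON
    pdf = pd.json_normalize(data)      # flatten nested JSON records
    return dd.from_pandas(pdf, npartitions=1)
```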
sibi_dst/df_helper/backends/parquet/_filter_handler.py

```diff
@@ -5,11 +5,39 @@ from sibi_dst.utils import Logger
 
 
 class ParquetFilterHandler(object):
+    """
+    Handles parquet filtering operations using dask dataframes.
+
+    This class is designed to apply complex filtering logic on dask dataframes
+    based on specified filter criteria. It includes support for operations such
+    as exact matches, ranges, string pattern matches, and null checks. Additionally,
+    it handles datetime-related field filtering including precise truncations and
+    specific date/time attributes.
+
+    :ivar logger: Logger object to handle logging within the class. Defaults to the class-level logger.
+    :type logger: Logger
+    """
     def __init__(self, logger=None):
         self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
 
     @staticmethod
     def apply_filters_dask(df, filters):
+        """
+        Applies a set of filters to a Dask DataFrame, enabling complex filtering operations
+        such as comparisons, ranges, string match operations, and more. Handles special
+        cases for datetime operations, including casting and extracting specific datetime
+        components for filtering.
+
+        :param df: Dask DataFrame to which the filters will be applied.
+        :type df: dask.dataframe.DataFrame
+        :param filters: Dictionary defining the filtering logic, where the keys specify
+            the column name and filter operation, and the values specify the corresponding
+            filter values to apply.
+        :type filters: dict
+        :return: A filtered Dask DataFrame based on the defined logic in the filters.
+        :rtype: dask.dataframe.DataFrame
+        :raises ValueError: If an unsupported operation is encountered in the filters.
+        """
        dt_operators = ['date', 'time']
         date_operators = ['year', 'month', 'day', 'hour', 'minute', 'second', 'week_day']
         comparison_operators = [
```
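
Given a Dask DataFrame `df`, a filters dict in the `column__operator` style implied by the operator lists above might look like this; the exact operator spellings are assumptions to be checked against the source:

```python
# Illustrative filters only; operator vocabulary is an assumption.
filters = {
    "created_at__date__gte": "2024-01-01",  # datetime truncated to date, then compared
    "created_at__year": 2024,               # date component extraction
    "status__in": ["open", "pending"],      # membership test
    "deleted_at__isnull": True,             # null check
}
filtered = ParquetFilterHandler().apply_filters_dask(df, filters)
```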
sibi_dst/df_helper/backends/parquet/_parquet_options.py

```diff
@@ -11,6 +11,45 @@ from sibi_dst.utils import Logger
 
 
 class ParquetConfig(BaseModel):
+    """
+    Represents configuration for managing and validating parquet file operations.
+
+    The `ParquetConfig` class provides attributes and methods necessary to handle operations
+    on parquet files in a file system. It includes functionalities for ensuring file paths
+    and extensions, validating storage paths and parameters, determining file recency,
+    and calculating the size of parquet files. This class is designed with flexibility to handle
+    different file systems through the integration with `fsspec` and allows storage path validations
+    with optional logging support.
+
+    :ivar load_parquet: Indicates whether parquet data should be loaded based on the
+        current configuration and validation.
+    :type load_parquet: bool
+    :ivar parquet_filename: The name of the parquet file, optional if folders are used.
+    :type parquet_filename: Optional[str]
+    :ivar parquet_storage_path: The base path for storing or retrieving parquet files.
+    :type parquet_storage_path: Optional[str]
+    :ivar parquet_full_path: The full path to a specific parquet file, derived from the
+        storage path and filename when applicable.
+    :type parquet_full_path: Optional[str]
+    :ivar parquet_folder_list: A list of folder paths to parquet data, derived from start
+        and end dates if specified.
+    :type parquet_folder_list: Optional[List[str]]
+    :ivar parquet_size_bytes: The total size of the parquet files, in bytes.
+    :type parquet_size_bytes: int
+    :ivar parquet_max_age_minutes: Maximum acceptable age of the most recent parquet file, in minutes.
+    :type parquet_max_age_minutes: int
+    :ivar parquet_is_recent: Indicates whether the parquet file is considered recent based
+        on the `parquet_max_age_minutes` condition.
+    :type parquet_is_recent: bool
+    :ivar parquet_start_date: The start date for parquet file validation or file path generation.
+    :type parquet_start_date: Optional[str]
+    :ivar parquet_end_date: The end date for parquet file validation or file path generation.
+    :type parquet_end_date: Optional[str]
+    :ivar fs: The file system object used for storage operations, compliant with `fsspec`.
+    :type fs: Optional[fsspec.spec.AbstractFileSystem]
+    :ivar logger: A logger for handling logging operations.
+    :type logger: Optional[Logger]
+    """
     load_parquet: bool = False
     parquet_filename: Optional[str] = None
     parquet_storage_path: Optional[str] = None
@@ -27,6 +66,20 @@ class ParquetConfig(BaseModel):
 
     @model_validator(mode='after')
     def check_parquet_params(self):
+        """
+        Validates and configures the parameters required for managing parquet files. This includes
+        configuring paths through `fsspec`, identifying file storage paths, checking the validity of
+        dates related to parquet files, ensuring proper parquet file extensions, and determining
+        whether existing parquet files are recent and loadable.
+
+        :return: The current instance with validated and migrated attributes configured for
+            handling parquet files.
+
+        :raises ValueError: If certain conditions are not met, such as missing or invalid
+            `parquet_storage_path`, providing only one of
+            `parquet_start_date` or `parquet_end_date`, or if the
+            `parquet_end_date` is earlier than the `parquet_start_date`.
+        """
         # Configure paths based on fsspec
         if self.logger is None:
             self.logger = Logger.default_logger(logger_name=self.__class__.__name__)
@@ -72,6 +125,23 @@ class ParquetConfig(BaseModel):
         return self
 
     def is_file_recent(self):
+        """
+        Determines whether the file at the specified parquet path is considered recent
+        based on its modification time and the maximum age limit defined.
+
+        The function first checks for the existence of the file at the specified
+        `parquet_full_path`. If the file does not exist, the function will return
+        False. If `parquet_max_age_minutes` is set to 0, it implies no maximum age
+        limit, and the function will return True. Otherwise, it retrieves the file's
+        last modified time and calculates the age of the file by comparing it with the
+        current time. The function returns True if the file's age does not exceed the
+        maximum age specified by `parquet_max_age_minutes`, otherwise it returns
+        False.
+
+        :return: Whether the file is considered recent based on its existence,
+            modification time, and maximum age limit.
+        :rtype: bool
+        """
         if not self.fs.exists(self.parquet_full_path):
             return False
         if self.parquet_max_age_minutes == 0:
```
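
The `is_file_recent` docstring pins the behavior down fully, so it can be restated as a standalone function; `fs` is any fsspec filesystem, and the tz handling is an assumption since fsspec backends differ on whether `modified()` returns an aware datetime:

```python
# Sketch of the age check: missing -> False; limit 0 -> always recent;
# otherwise compare modification age against the limit.
import datetime

def is_recent(fs, path: str, max_age_minutes: int) -> bool:
    if not fs.exists(path):
        return False
    if max_age_minutes == 0:
        return True
    file_time = fs.modified(path)  # naive or tz-aware depending on backend
    now = datetime.datetime.now(file_time.tzinfo)
    return now - file_time <= datetime.timedelta(minutes=max_age_minutes)
```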
```diff
@@ -80,6 +150,24 @@ class ParquetConfig(BaseModel):
         return (datetime.datetime.now() - file_time) <= datetime.timedelta(minutes=self.parquet_max_age_minutes)
 
     def get_parquet_size_bytes(self):
+        """
+        Calculate the total size, in bytes, of all Parquet files within the defined
+        folders specified by `parquet_folder_list`. The function iteratively goes
+        through each folder in the provided list, applying a recursive wildcard
+        search to include all levels of nested directories, and calculates the
+        cumulative size of all found Parquet files using the file system's size
+        retrieval method.
+
+        :raises AttributeError: If `fs` or `parquet_folder_list` attributes are not set
+            or improperly configured when the method is called.
+        :raises NotImplementedError: If the `fs.size` or `fs.glob` methods are
+            unimplemented in the provided file system object or it otherwise lacks
+            necessary support for these operations.
+
+        :return: The cumulative size of all Parquet files located in the folders
+            defined by `parquet_folder_list`, measured in bytes.
+        :rtype: int
+        """
         total_size = 0
         for folder in self.parquet_folder_list:
             # Use a double wildcard ** to match any level of nested directories
```
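
The hunk's comment already names the key trick (a `**` wildcard glob per folder); a minimal standalone version of the scan, with the `*.parquet` suffix filter as an assumption:

```python
# Hedged sketch: recursive glob per folder, summing sizes via the filesystem.
import fsspec

def parquet_size_bytes(fs: fsspec.AbstractFileSystem, folders: list) -> int:
    total = 0
    for folder in folders:
        # ** matches any level of nested directories, as noted above.
        for path in fs.glob(f"{folder.rstrip('/')}/**/*.parquet"):
            total += fs.size(path)
    return total
```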
```diff
@@ -88,7 +176,14 @@ class ParquetConfig(BaseModel):
         return total_size
 
     def load_files(self):
-
+        """
+        Loads parquet files into a Dask DataFrame based on the specified conditions. This
+        method checks if parquet file loading is enabled and loads either from a list of
+        parquet folder paths or a single specified parquet path.
+
+        :return: A Dask DataFrame containing loaded parquet file data.
+        :rtype: dask.dataframe.DataFrame
+        """
         if self.load_parquet:
             if self.parquet_folder_list:
                 return dd.read_parquet(self.parquet_folder_list, engine="pyarrow", filesystem=self.fs)
@@ -97,5 +192,14 @@ class ParquetConfig(BaseModel):
 
     @staticmethod
     def ensure_file_extension(filepath: str, extension: str) -> str:
+        """
+        Ensures that the specified file has the desired extension. If the file already has the
+        specified extension, it returns the filepath unchanged. Otherwise, it updates the file
+        extension to the given one and returns the modified filepath.
+
+        :param filepath: The path to the file as a string.
+        :param extension: The desired file extension, without the leading dot.
+        :return: The updated file path as a string, ensuring it has the specified extension.
+        """
         path = Path(filepath)
         return str(path.with_suffix(f".{extension}")) if path.suffix != f".{extension}" else filepath
```
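
For reference, the `with_suffix` expression shown above behaves like this:

```python
from pathlib import Path

str(Path("data/out.csv").with_suffix(".parquet"))      # -> 'data/out.parquet'
str(Path("data/out.parquet").with_suffix(".parquet"))  # unchanged
# Caveat: only the last suffix is replaced, so "archive.tar.gz"
# becomes "archive.tar.parquet".
```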
sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py

```diff
@@ -9,6 +9,23 @@ from ._sql_model_builder import SqlAlchemyModelBuilder
 
 
 class SqlAlchemyConnectionConfig(BaseModel):
+    """
+    Configuration class for managing an SQLAlchemy database connection.
+
+    This class provides configurations to establish a connection to a database,
+    validate the connection, and dynamically build a SQLAlchemy model for a specific
+    table if required. It initializes the database engine using the provided connection URL
+    and ensures that the connection and table information are properly validated.
+
+    :ivar connection_url: The URL used to connect to the database.
+    :type connection_url: str
+    :ivar table: The name of the database table for which a model will be constructed.
+    :type table: Optional[str]
+    :ivar model: The dynamically built SQLAlchemy model for the specified table.
+    :type model: Any
+    :ivar engine: The SQLAlchemy engine instance reused for database connections.
+    :type engine: Optional[Any]
+    """
     connection_url: str
     table: Optional[str] = None
     model: Any = None
```
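
What this config wires together (an engine built from `connection_url` plus a model for one table) can be approximated with plain SQLAlchemy. The package builds its model through its own `SqlAlchemyModelBuilder`, so automap here is only a stand-in, and the URL and table name are hypothetical:

```python
# Hedged sketch: engine + reflected model, approximating the config above.
from sqlalchemy import create_engine
from sqlalchemy.ext.automap import automap_base

engine = create_engine("sqlite:///example.db")  # stands in for connection_url
Base = automap_base()
Base.prepare(autoload_with=engine)              # reflect tables into classes
# Model = Base.classes.sales_order             # hypothetical table name
```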