sibi-dst 0.3.31__py3-none-any.whl → 0.3.33__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sibi_dst/df_helper/_parquet_artifact.py +68 -0
- sibi_dst/df_helper/_parquet_reader.py +45 -1
- sibi_dst/df_helper/backends/django/_db_connection.py +41 -1
- sibi_dst/df_helper/backends/django/_io_dask.py +211 -3
- sibi_dst/df_helper/backends/django/_load_from_db.py +96 -1
- sibi_dst/df_helper/backends/django/_sql_model_builder.py +132 -6
- sibi_dst/df_helper/backends/http/_http_config.py +52 -1
- sibi_dst/df_helper/backends/parquet/_filter_handler.py +28 -0
- sibi_dst/df_helper/backends/parquet/_parquet_options.py +105 -1
- sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +17 -0
- sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +80 -2
- sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +90 -29
- sibi_dst/df_helper/core/_params_config.py +59 -0
- sibi_dst/geopy_helper/geo_location_service.py +14 -0
- sibi_dst/geopy_helper/utils.py +37 -3
- sibi_dst/osmnx_helper/base_osm_map.py +254 -0
- sibi_dst/osmnx_helper/utils.py +226 -4
- sibi_dst/utils/clickhouse_writer.py +27 -0
- sibi_dst/utils/data_utils.py +32 -1
- sibi_dst/utils/data_wrapper.py +94 -6
- sibi_dst/utils/date_utils.py +35 -0
- sibi_dst/utils/log_utils.py +19 -2
- sibi_dst/utils/parquet_saver.py +1 -0
- sibi_dst/utils/storage_manager.py +4 -1
- {sibi_dst-0.3.31.dist-info → sibi_dst-0.3.33.dist-info}/METADATA +3 -1
- {sibi_dst-0.3.31.dist-info → sibi_dst-0.3.33.dist-info}/RECORD +27 -27
- {sibi_dst-0.3.31.dist-info → sibi_dst-0.3.33.dist-info}/WHEEL +0 -0
sibi_dst/df_helper/backends/django/_sql_model_builder.py

@@ -1,5 +1,4 @@
-
-#
+
 import keyword
 import re
 from functools import lru_cache
@@ -49,13 +48,57 @@ apps_label = "datacubes"


 class DjangoSqlModelBuilder:
+    """
+    Handles the dynamic creation of Django ORM models based on database table structures.
+
+    This class takes input parameters such as database connection and table name,
+    and dynamically maps the table's schema to a Django ORM model. The resultant model
+    can be used for various ORM operations like querying, saving, and deleting records.
+    The class utilizes Django's introspection features and allows customization
+    through its fields and methods.
+
+    :ivar connection_name: The name of the database connection being used.
+    :type connection_name: str
+    :ivar table: The name of the database table for which the model is being built.
+    :type table: str
+    :ivar model: The dynamically generated Django model or None if not created yet.
+    :type model: type | None
+    """
     def __init__(self, **kwargs):
+        """
+        Represents an initialization method for a class that handles the
+        assignment of attributes and processes the given keyword arguments
+        through an internal utility function. This method sets up the
+        necessary attributes for later use.
+
+        :param kwargs: A collection of keyword arguments used by the internal
+            parsing method to populate the attributes of the class. Specific
+            expected keys and their usage should be detailed in the internal
+            implementation.
+        """
         self.connection_name = None
         self.table = None
         self.model = None
         self.__parse_builder(**kwargs)

     def __parse_builder(self, **kwargs):
+        """
+        Parses and initializes the builder properties based on provided keyword
+        arguments. Validates that the required 'connection_name' and 'table'
+        values are present and sets the corresponding attributes. If validation
+        fails, raises appropriate errors. Returns the updated builder object
+        after initialization. This method is primarily intended for internal
+        use to configure the builder.
+
+        :param kwargs: Keyword arguments containing configuration values for
+            initializing the builder. Should include 'connection_name'
+            and 'table' keys.
+        :type kwargs: dict
+        :return: Returns the instance of the builder object after initialization.
+        :rtype: self
+        :raises ValueError: If 'connection_name' or 'table' is not provided in
+            the keyword arguments.
+        """
         self.connection_name = kwargs.get("connection_name", None)
         self.table = kwargs.get("table", None)
         self.model = None
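The docstrings above settle the intended call pattern: construct the builder with `connection_name` and `table`, then ask it for the model. A minimal usage sketch, assuming a configured Django project; the connection and table names here are hypothetical:

```python
# Hypothetical connection/table names; requires configured Django settings.
from sibi_dst.df_helper.backends.django._sql_model_builder import DjangoSqlModelBuilder

builder = DjangoSqlModelBuilder(connection_name="default", table="order_items")
OrderItems = builder.build_model()   # fetched from the app registry or created on the fly
print(OrderItems._meta.db_table)     # "order_items"
```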
@@ -67,6 +110,22 @@ class DjangoSqlModelBuilder:

     @lru_cache(maxsize=None)
     def build_model(self):
+        """
+        Builds and retrieves a model instance with dynamically defined fields.
+
+        This method attempts to retrieve a model instance by its name and, if it
+        does not exist, creates a new model with the specified table structure.
+        The model is either fetched or constructed using the provided data about
+        its fields. The result is cached for repeated calls to improve performance
+        and avoid redundant computations.
+
+        :raises LookupError: If the model cannot be fetched or created due to an
+            invalid lookup.
+
+        :return: A model instance dynamically constructed or retrieved for the
+            specified table and fields.
+        :rtype: Model
+        """
         model = None
         model_fields = self.get_model_fields()
         model_name = self.table2model(self.table)
@@ -78,6 +137,25 @@ class DjangoSqlModelBuilder:
         return model

     def create_model(self, name, fields) -> type:
+        """
+        Creates a Django model class dynamically.
+
+        This function takes in a model name and a dictionary of fields, dynamically
+        creates a Meta class where additional metadata for the model (like
+        `db_table`, `managed`, `app_label`) is defined, and then uses Python's
+        standard library `type()` function to generate and return the model class
+        on the fly.
+
+        :param name: The name of the model class to create.
+        :type name: str
+        :param fields: A dictionary mapping field names to their definitions in
+            Django's model field format. Each field definition should include
+            the field type and optional parameters.
+        :type fields: dict
+        :return: The dynamically created Django model class based on the provided
+            name and fields.
+        :rtype: type
+        """
         def parse_args(arg_string):
             arg_dict = {}
             # Match keyword arguments in the form key=value
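The `create_model` docstring names the standard recipe: an inner `Meta` class plus `type()`. A generic sketch of that technique under the hunk's `apps_label = "datacubes"` context; the `make_model` helper and field definitions are illustrative, not the package's code:

```python
# Generic dynamic-model sketch with type(); requires configured Django settings.
from django.db import models

def make_model(name: str, fields: dict, table: str, app_label: str = "datacubes") -> type:
    # The inner Meta class carries db_table, managed and app_label, as documented.
    meta = type("Meta", (), {"db_table": table, "managed": False, "app_label": app_label})
    attrs = {"__module__": f"{app_label}.models", "Meta": meta, **fields}
    return type(name, (models.Model,), attrs)

Invoice = make_model(
    "Invoice",
    {"id": models.AutoField(primary_key=True),
     "total": models.DecimalField(max_digits=10, decimal_places=2)},
    table="invoice",
)
```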
@@ -118,9 +196,32 @@ class DjangoSqlModelBuilder:

     @staticmethod
     def table2model(table_name):
+        """
+        Converts a database table name to a corresponding model name by transforming
+        it from snake_case to CamelCase. This method takes a string representing
+        a table name, splits it by underscores, capitalizes the first letter of
+        each part, and then joins them into a single string.
+
+        :param table_name: The name of the database table in snake_case format
+        :type table_name: str
+        :return: A string representing the equivalent model name in CamelCase format
+        :rtype: str
+        """
         return "".join([x.title() for x in table_name.split("_")])

     def get_model_fields(self):
+        """
+        Generates the data structure for model fields from a database table using
+        introspection. The method extracts information about columns, primary keys,
+        unique constraints, and additional metadata to define the fields of the model.
+
+        :raises ValueError: If the specified connection or table is not found.
+        :raises Exception: For any database or introspection-related errors.
+
+        :returns: Dictionary containing the model field definitions based on the
+            table's structure and metadata.
+        :rtype: dict
+        """
         connection = connections[self.connection_name]
         if connection is None:
             raise ValueError("Connection %s not found" % self.connection_name)
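The conversion `table2model` documents is exactly the one-liner in the context above; note that `str.title()` also lowercases the remainder of each underscore-separated part:

```python
>>> "".join(x.title() for x in "customer_order_items".split("_"))
'CustomerOrderItems'
>>> "".join(x.title() for x in "api_keys".split("_"))
'ApiKeys'
```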
@@ -265,7 +366,21 @@ class DjangoSqlModelBuilder:
     @staticmethod
     def normalize_col_name(col_name, used_column_names, is_relation):
         """
-
+        Normalizes a column name to conform to Python's variable naming conventions and addresses potential
+        name conflicts or issues with reserved words. Applies transformations to ensure the column name:
+        - Is lowercase.
+        - Replaces unsuitable characters with underscores.
+        - Avoids conflicts with Python keywords and digits at the start of the name.
+        - Resolves conflicts with previously used column names.
+
+        :param col_name: The original column name provided from the schema.
+        :param used_column_names: A list of previously used column names to avoid naming collisions.
+        :param is_relation: A boolean indicating if the column represents a relation (e.g., foreign key).
+        :return: A tuple containing:
+            - The normalized column name (str).
+            - A dictionary (`field_params`) with any relevant information for database configuration.
+              Includes the original column name if specific transformations were applied.
+            - A list (`field_notes`) containing strings explaining the applied transformations.
         """
         field_params = {}
         field_notes = []
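The listed rules can be sketched compactly. This approximates the documented behavior (it mirrors Django's `inspectdb` conventions); the package's actual implementation is not shown in this hunk:

```python
# Approximate sketch of the documented normalization rules.
import keyword
import re

def normalize_col_name(col_name, used_column_names, is_relation=False):
    field_params, field_notes = {}, []
    new_name = col_name.lower()
    if new_name != col_name:
        field_notes.append("Field name made lowercase.")
    if is_relation and new_name.endswith("_id"):
        new_name = new_name[:-3]                        # relations drop a trailing "_id"
    new_name, changed = re.subn(r"\W", "_", new_name)   # unsuitable chars -> "_"
    if changed:
        field_notes.append("Unsuitable characters replaced with underscores.")
    if keyword.iskeyword(new_name):
        new_name += "_field"                            # avoid Python keywords
    if new_name and new_name[0].isdigit():
        new_name = "number_" + new_name                 # names cannot start with a digit
    while new_name in used_column_names:
        new_name += "_"                                 # resolve collisions
    if new_name != col_name:
        field_params["db_column"] = col_name            # keep the original column name
    return new_name, field_params, field_notes
```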
@@ -326,9 +441,20 @@ class DjangoSqlModelBuilder:
     @staticmethod
     def get_field_type(connection, row):
         """
-
-
-
+        Determines the type of a database field based on its description and connection
+        introspection, and includes metadata such as parameters and additional notes.
+
+        This function extracts the field type from the database's introspection
+        interface and adds corresponding parameters (e.g., `max_length`, `decimal_places`)
+        and relevant notes if certain properties are inferred or guessed.
+
+        :param connection: The database connection object used for introspection.
+        :type connection: Any
+        :param row: An object containing field metadata, such as type code,
+            display size, collation, precision, and scale.
+        :type row: Any
+        :return: A tuple containing the field type, its parameters, and any notes.
+        :rtype: tuple[str, dict, list[str]]
         """
         field_params = {}
         field_notes = []
sibi_dst/df_helper/backends/http/_http_config.py

@@ -9,6 +9,26 @@ from sibi_dst.utils import Logger


 class HttpConfig(BaseModel):
+    """
+    Configuration for HTTP client operations, designed to manage and fetch data
+    from HTTP endpoints asynchronously. This class serves as a centralized configuration
+    and operation hub encapsulating settings such as base URL, query parameters, API keys,
+    and logger support. It employs `httpx` for HTTP interactions and leverages Dask for the
+    resulting data handling and transformation.
+
+    :ivar base_url: The base URL for HTTP communication.
+    :type base_url: HttpUrl
+    :ivar params: Optional dictionary containing query parameters to be used with GET requests.
+    :type params: Optional[Dict[str, Any]]
+    :ivar logger: The logger instance for logging operations. If not provided, a default logger
+        is initialized using the class name.
+    :type logger: Optional[Logger]
+    :ivar timeout: The timeout value in seconds for HTTP requests. Defaults to 300.
+    :type timeout: Optional[int]
+    :ivar api_key: The optional secret API key for authorization. If present, it will populate
+        the Authorization header in HTTP requests.
+    :type api_key: Optional[SecretStr]
+    """
     base_url: HttpUrl
     params: Optional[Dict[str, Any]] = Field(default_factory=dict)
     logger: Optional[Logger] = None
@@ -17,12 +37,43 @@ class HttpConfig(BaseModel):
     model_config = ConfigDict(arbitrary_types_allowed=True)

     def __init__(self, logger=None, **data):
+        """
+        Initializes the class with a logger and other data parameters.
+
+        This constructor allows the option to provide a custom logger. If no logger
+        is supplied during initialization, a default logger specific to the class
+        is created using the Logger utility. It also initializes the instance
+        with additional data passed as keyword arguments.
+
+        :param logger: Optional logger instance. If not provided, a default
+            logger is created using the class name as the logger name.
+        :type logger: logging.Logger, optional
+        :param data: Arbitrary keyword arguments containing data to initialize
+            the class.
+        :type data: dict
+        """
         super().__init__(**data)
         # Initialize the logger if not provided
         self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)

     async def fetch_data(self, **options) -> dd.DataFrame:
-        """
+        """
+        Fetches data from a specified HTTP JSON source and returns it as a dask DataFrame.
+
+        This asynchronous method constructs a request URL based on the provided options
+        and sends an HTTP GET request. The fetched JSON data is normalized and
+        converted to a dask DataFrame for further use. It handles request errors and
+        JSON parsing errors effectively.
+
+        :param options: Arbitrary keyword arguments representing dynamic path segments
+            to be appended to the base URL.
+        :type options: dict
+        :return: A dask DataFrame containing the structured data retrieved
+            from the HTTP JSON source.
+        :rtype: dd.DataFrame
+        :raises httpx.RequestError: If there is an issue with the HTTP request.
+        :raises ValueError: If there is an error parsing JSON data.
+        """
         try:
             # Build URL with options as path segments

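A usage sketch of the configuration plus the async fetch. The endpoint and key are hypothetical, and the docstring does not pin down how multiple keyword options are ordered into path segments, so treat that detail as an assumption:

```python
# Hypothetical URL, key and path segments; fetch_data() is a coroutine.
import asyncio
from sibi_dst.df_helper.backends.http._http_config import HttpConfig

async def main():
    cfg = HttpConfig(base_url="https://api.example.com/v1/", api_key="secret-token")
    ddf = await cfg.fetch_data(resource="orders", year="2024")  # appended as path segments
    print(ddf.head())

asyncio.run(main())
```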
sibi_dst/df_helper/backends/parquet/_filter_handler.py

@@ -5,11 +5,39 @@ from sibi_dst.utils import Logger


 class ParquetFilterHandler(object):
+    """
+    Handles parquet filtering operations using dask dataframes.
+
+    This class is designed to apply complex filtering logic on dask dataframes
+    based on specified filter criteria. It includes support for operations such
+    as exact matches, ranges, string pattern matches, and null checks. Additionally,
+    it handles datetime-related field filtering including precise truncations and
+    specific date/time attributes.
+
+    :ivar logger: Logger object to handle logging within the class. Defaults to the class-level logger.
+    :type logger: Logger
+    """
     def __init__(self, logger=None):
         self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)

     @staticmethod
     def apply_filters_dask(df, filters):
+        """
+        Applies a set of filters to a Dask DataFrame, enabling complex filtering operations
+        such as comparisons, ranges, string match operations, and more. Handles special
+        cases for datetime operations, including casting and extracting specific datetime
+        components for filtering.
+
+        :param df: Dask DataFrame to which the filters will be applied.
+        :type df: dask.dataframe.DataFrame
+        :param filters: Dictionary defining the filtering logic, where the keys specify
+            the column name and filter operation, and the values specify the corresponding
+            filter values to apply.
+        :type filters: dict
+        :return: A filtered Dask DataFrame based on the defined logic in the filters.
+        :rtype: dask.dataframe.DataFrame
+        :raises ValueError: If an unsupported operation is encountered in the filters.
+        """
         dt_operators = ['date', 'time']
         date_operators = ['year', 'month', 'day', 'hour', 'minute', 'second', 'week_day']
         comparison_operators = [
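A self-contained illustration of the masks such a handler must build. The `column__op` key grammar is inferred from the operator lists in the context lines and Django-style filter conventions, so it is an assumption rather than the class's documented API:

```python
# Standalone illustration; exact keys accepted by ParquetFilterHandler may differ.
import pandas as pd
import dask.dataframe as dd

pdf = pd.DataFrame({
    "amount": [5, 20, 50],
    "created": pd.to_datetime(["2023-12-31", "2024-02-01", "2024-03-01"]),
})
ddf = dd.from_pandas(pdf, npartitions=1)

# A filter spec like {"amount__gte": 10, "created__year": 2024} translates to:
mask = (ddf["amount"] >= 10) & (ddf["created"].dt.year == 2024)
print(ddf[mask].compute())   # keeps the 2024 rows with amount >= 10
```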
sibi_dst/df_helper/backends/parquet/_parquet_options.py

@@ -11,6 +11,45 @@ from sibi_dst.utils import Logger


 class ParquetConfig(BaseModel):
+    """
+    Represents configuration for managing and validating parquet file operations.
+
+    The `ParquetConfig` class provides attributes and methods necessary to handle operations
+    on parquet files in a file system. It includes functionalities for ensuring file paths
+    and extensions, validating storage paths and parameters, determining file recency,
+    and calculating the size of parquet files. This class is designed with flexibility to handle
+    different file systems through the integration with `fsspec` and allows storage path validations
+    with optional logging support.
+
+    :ivar load_parquet: Indicates whether parquet data should be loaded based on the
+        current configuration and validation.
+    :type load_parquet: bool
+    :ivar parquet_filename: The name of the parquet file, optional if folders are used.
+    :type parquet_filename: Optional[str]
+    :ivar parquet_storage_path: The base path for storing or retrieving parquet files.
+    :type parquet_storage_path: Optional[str]
+    :ivar parquet_full_path: The full path to a specific parquet file, derived from the
+        storage path and filename when applicable.
+    :type parquet_full_path: Optional[str]
+    :ivar parquet_folder_list: A list of folder paths to parquet data, derived from start
+        and end dates if specified.
+    :type parquet_folder_list: Optional[List[str]]
+    :ivar parquet_size_bytes: The total size of the parquet files, in bytes.
+    :type parquet_size_bytes: int
+    :ivar parquet_max_age_minutes: Maximum acceptable age of the most recent parquet file, in minutes.
+    :type parquet_max_age_minutes: int
+    :ivar parquet_is_recent: Indicates whether the parquet file is considered recent based
+        on the `parquet_max_age_minutes` condition.
+    :type parquet_is_recent: bool
+    :ivar parquet_start_date: The start date for parquet file validation or file path generation.
+    :type parquet_start_date: Optional[str]
+    :ivar parquet_end_date: The end date for parquet file validation or file path generation.
+    :type parquet_end_date: Optional[str]
+    :ivar fs: The file system object used for storage operations, compliant with `fsspec`.
+    :type fs: Optional[fsspec.spec.AbstractFileSystem]
+    :ivar logger: A logger for handling logging operations.
+    :type logger: Optional[Logger]
+    """
     load_parquet: bool = False
     parquet_filename: Optional[str] = None
     parquet_storage_path: Optional[str] = None
@@ -27,6 +66,20 @@ class ParquetConfig(BaseModel):

     @model_validator(mode='after')
     def check_parquet_params(self):
+        """
+        Validates and configures the parameters required for managing parquet files. This includes
+        configuring paths through `fsspec`, identifying file storage paths, checking the validity of
+        dates related to parquet files, ensuring proper parquet file extensions, and determining
+        whether existing parquet files are recent and loadable.
+
+        :return: The current instance with validated and migrated attributes configured for
+            handling parquet files.
+
+        :raises ValueError: If certain conditions are not met, such as missing or invalid
+            `parquet_storage_path`, providing only one of
+            `parquet_start_date` or `parquet_end_date`, or if the
+            `parquet_end_date` is earlier than the `parquet_start_date`.
+        """
         # Configure paths based on fsspec
         if self.logger is None:
             self.logger = Logger.default_logger(logger_name=self.__class__.__name__)
@@ -72,6 +125,23 @@ class ParquetConfig(BaseModel):
         return self

     def is_file_recent(self):
+        """
+        Determines whether the file at the specified parquet path is considered recent
+        based on its modification time and the maximum age limit defined.
+
+        The function first checks for the existence of the file at the specified
+        `parquet_full_path`. If the file does not exist, the function will return
+        False. If `parquet_max_age_minutes` is set to 0, it implies no maximum age
+        limit, and the function will return True. Otherwise, it retrieves the file's
+        last modified time and calculates the age of the file by comparing it with the
+        current time. The function returns True if the file's age does not exceed the
+        maximum age specified by `parquet_max_age_minutes`, otherwise it returns
+        False.
+
+        :return: Whether the file is considered recent based on its existence,
+            modification time, and maximum age limit.
+        :rtype: bool
+        """
         if not self.fs.exists(self.parquet_full_path):
             return False
         if self.parquet_max_age_minutes == 0:
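The documented rule reads cleanly as a standalone function; this local-filesystem sketch substitutes `os.path.getmtime` for the `fsspec` call the class makes:

```python
# Local-filesystem sketch of the documented recency rule.
import datetime
import os

def is_file_recent(path: str, max_age_minutes: int) -> bool:
    if not os.path.exists(path):
        return False
    if max_age_minutes == 0:
        return True   # 0 means "no age limit"
    file_time = datetime.datetime.fromtimestamp(os.path.getmtime(path))
    return (datetime.datetime.now() - file_time) <= datetime.timedelta(minutes=max_age_minutes)
```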
@@ -80,6 +150,24 @@ class ParquetConfig(BaseModel):
         return (datetime.datetime.now() - file_time) <= datetime.timedelta(minutes=self.parquet_max_age_minutes)

     def get_parquet_size_bytes(self):
+        """
+        Calculate the total size, in bytes, of all Parquet files within the defined
+        folders specified by `parquet_folder_list`. The function iteratively goes
+        through each folder in the provided list, applying a recursive wildcard
+        search to include all levels of nested directories, and calculates the
+        cumulative size of all found Parquet files using the file system's size
+        retrieval method.
+
+        :raises AttributeError: If `fs` or `parquet_folder_list` attributes are not set
+            or improperly configured when the method is called.
+        :raises NotImplementedError: If the `fs.size` or `fs.glob` methods are
+            unimplemented in the provided file system object or it otherwise lacks
+            necessary support for these operations.
+
+        :return: The cumulative size of all Parquet files located in the folders
+            defined by `parquet_folder_list`, measured in bytes.
+        :rtype: int
+        """
         total_size = 0
         for folder in self.parquet_folder_list:
             # Use a double wildcard ** to match any level of nested directories
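Per the docstring, the computation is a recursive glob plus a sum of `fs.size()` calls. A sketch against any `fsspec` filesystem; the `**/*.parquet` pattern is an assumption consistent with the double-wildcard comment in the context lines:

```python
# Sketch of the documented size computation over an fsspec filesystem.
import fsspec

def parquet_size_bytes(fs, folders):
    total_size = 0
    for folder in folders:
        # Double wildcard matches any level of nested directories.
        for path in fs.glob(f"{folder}/**/*.parquet"):
            total_size += fs.size(path)
    return total_size

local_fs = fsspec.filesystem("file")
print(parquet_size_bytes(local_fs, ["/tmp/warehouse"]))   # hypothetical folder
```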
@@ -88,7 +176,14 @@ class ParquetConfig(BaseModel):
         return total_size

     def load_files(self):
-
+        """
+        Loads parquet files into a Dask DataFrame based on the specified conditions. This
+        method checks if parquet file loading is enabled and loads either from a list of
+        parquet folder paths or a single specified parquet path.
+
+        :return: A Dask DataFrame containing loaded parquet file data.
+        :rtype: dask.dataframe.DataFrame
+        """
         if self.load_parquet:
             if self.parquet_folder_list:
                 return dd.read_parquet(self.parquet_folder_list, engine="pyarrow", filesystem=self.fs)
@@ -97,5 +192,14 @@ class ParquetConfig(BaseModel):

     @staticmethod
     def ensure_file_extension(filepath: str, extension: str) -> str:
+        """
+        Ensures that the specified file has the desired extension. If the file already has the
+        specified extension, it returns the filepath unchanged. Otherwise, it updates the file
+        extension to the given one and returns the modified filepath.
+
+        :param filepath: The path to the file as a string.
+        :param extension: The desired file extension, without the leading dot.
+        :return: The updated file path as a string, ensuring it has the specified extension.
+        """
         path = Path(filepath)
         return str(path.with_suffix(f".{extension}")) if path.suffix != f".{extension}" else filepath
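The static method's behavior follows directly from `pathlib`:

```python
>>> from pathlib import Path
>>> str(Path("data/output.csv").with_suffix(".parquet"))
'data/output.parquet'
>>> str(Path("data/output").with_suffix(".parquet"))
'data/output.parquet'
```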
sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py

@@ -9,6 +9,23 @@ from ._sql_model_builder import SqlAlchemyModelBuilder


 class SqlAlchemyConnectionConfig(BaseModel):
+    """
+    Configuration class for managing an SQLAlchemy database connection.
+
+    This class provides configurations to establish a connection to a database,
+    validate the connection, and dynamically build a SQLAlchemy model for a specific
+    table if required. It initializes the database engine using the provided connection URL
+    and ensures that the connection and table information are properly validated.
+
+    :ivar connection_url: The URL used to connect to the database.
+    :type connection_url: str
+    :ivar table: The name of the database table for which a model will be constructed.
+    :type table: Optional[str]
+    :ivar model: The dynamically built SQLAlchemy model for the specified table.
+    :type model: Any
+    :ivar engine: The SQLAlchemy engine instance reused for database connections.
+    :type engine: Optional[Any]
+    """
     connection_url: str
     table: Optional[str] = None
     model: Any = None
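A construction sketch with a hypothetical SQLite URL. Per the docstring, validation builds the engine from `connection_url` and, when `table` is given, a dynamically built model; whether both are populated eagerly at construction is inferred, not stated:

```python
# Hypothetical database URL and table name.
from sibi_dst.df_helper.backends.sqlalchemy._db_connection import SqlAlchemyConnectionConfig

cfg = SqlAlchemyConnectionConfig(
    connection_url="sqlite:///warehouse.db",
    table="orders",
)
print(cfg.engine)   # SQLAlchemy engine built from connection_url
print(cfg.model)    # dynamically built model for "orders"
```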
sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py

@@ -8,6 +8,41 @@ from ._db_connection import SqlAlchemyConnectionConfig


 class SqlAlchemyLoadFromDb:
+    """
+    The SqlAlchemyLoadFromDb class provides functionality to load data from a
+    database using SQLAlchemy into a Dask DataFrame. It is capable of handling
+    large datasets efficiently by utilizing the Dask framework for parallel
+    computations.
+
+    This class is initialized with a database connection configuration, query
+    configuration, optional parameters, and a logger. It can execute a query
+    using the specified configurations and read the results into a Dask
+    DataFrame. This is useful for processing and analyzing large-scale data.
+
+    :ivar df: Dask DataFrame to store the loaded data.
+    :type df: dd.DataFrame
+    :ivar db_connection: Database connection configuration object, containing details
+        such as the table, model, and engine to be used for the query.
+    :type db_connection: SqlAlchemyConnectionConfig
+    :ivar table_name: Name of the database table being queried.
+    :type table_name: str
+    :ivar model: SQLAlchemy model associated with the database connection.
+    :type model: sqlalchemy.ext.declarative.api.DeclarativeMeta
+    :ivar engine: SQLAlchemy engine used for executing queries.
+    :type engine: sqlalchemy.engine.base.Engine
+    :ivar logger: Logger instance for logging debug and error information.
+    :type logger: Logger
+    :ivar query_config: Query configuration, including query-related details such
+        as the SQL query or query settings.
+    :type query_config: QueryConfig
+    :ivar params_config: Parameters configuration, including filter parameters for
+        the query.
+    :type params_config: ParamsConfig
+    :ivar debug: Debug flag indicating whether debug mode is enabled.
+    :type debug: bool
+    :ivar chunk_size: Size of data chunks to process at a time.
+    :type chunk_size: int
+    """
     df: dd.DataFrame = None

     def __init__(
@@ -19,7 +54,28 @@ class SqlAlchemyLoadFromDb:
         **kwargs,
     ):
         """
-
+        Initializes an instance of the class, setting up a database connection,
+        query configuration, parameter configuration, and other optional settings
+        like debugging and logging. The class aims to manage the integration and
+        interaction with SQLAlchemy-based database operations.
+
+        :param plugin_sqlalchemy:
+            The SQLAlchemy connection configuration object, which provides
+            the connection details like engine, table name, and model
+            associated with the database operations.
+        :param plugin_query:
+            The query configuration object, used to define specific query
+            options or rules. Defaults to None.
+        :param plugin_params:
+            The parameters configuration object, used for any additional
+            parameterized settings or configurations. Defaults to None.
+        :param logger:
+            Optional logger instance for logging purposes. If not provided,
+            a default logger is instantiated using the standard logging system.
+        :param kwargs:
+            Optional additional keyword arguments for customization. Can
+            include optional settings like `debug` mode or `chunk_size`
+            for batch operations.
         """
         self.db_connection = plugin_sqlalchemy
         self.table_name = self.db_connection.table
@@ -33,13 +89,35 @@ class SqlAlchemyLoadFromDb:

     def build_and_load(self) -> dd.DataFrame:
         """
-
+        Builds and returns the resulting dataframe after calling the internal
+        build and load function. This method triggers the `_build_and_load`
+        function to process and prepare the data before returning it as
+        a dask dataframe.
+
+        :raises RuntimeError: If any error occurs during the build or load process.
+
+        :return: The processed data in a dask dataframe.
+        :rtype: dd.DataFrame
         """
         self._build_and_load()
         return self.df

     def _build_and_load(self) -> dd.DataFrame:
+        """
+        Builds and loads a Dask DataFrame from a SQLAlchemy-compatible source.
+
+        This method initializes a SQLAlchemyDask object with the provided model,
+        filters, engine URL, logger, chunk size, and debug configuration.
+        It attempts to load the data using the ``read_frame`` method of
+        SQLAlchemyDask. If the data cannot be loaded or the query returns
+        no rows, it creates and returns an empty Dask DataFrame.

+        :raises Exception: On failure to load data or to create a DataFrame.
+
+        :return: A Dask DataFrame object containing the queried data or an
+            empty DataFrame if the query returns no results or fails.
+        :rtype: dask.dataframe.DataFrame
+        """
         try:
             self.df = SQLAlchemyDask(
                 model=self.model,
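`SQLAlchemyDask` itself is not part of this diff, so the chunked read the docstrings describe can only be sketched generically; the `read_frame` helper below is illustrative, not the package's implementation:

```python
# Generic chunked "SQL query -> Dask DataFrame" pattern.
import dask.dataframe as dd
import pandas as pd
from sqlalchemy import create_engine

def read_frame(url: str, query: str, chunk_size: int = 1000) -> dd.DataFrame:
    engine = create_engine(url)
    chunks = list(pd.read_sql_query(query, engine, chunksize=chunk_size))
    if not chunks:
        # Mirror the documented fallback: empty Dask DataFrame when no rows match.
        return dd.from_pandas(pd.DataFrame(), npartitions=1)
    return dd.from_pandas(pd.concat(chunks, ignore_index=True), npartitions=len(chunks))

ddf = read_frame("sqlite:///warehouse.db", "SELECT * FROM orders")   # hypothetical
```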