sibi-dst 0.3.21__tar.gz → 0.3.23__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sibi_dst-0.3.21 → sibi_dst-0.3.23}/PKG-INFO +1 -1
- {sibi_dst-0.3.21 → sibi_dst-0.3.23}/pyproject.toml +1 -1
- {sibi_dst-0.3.21 → sibi_dst-0.3.23}/sibi_dst/utils/_data_wrapper.py +75 -12
- sibi_dst-0.3.21/sibi_dst/df_helper/backends/django/_io_dask_alt.py +0 -189
- sibi_dst-0.3.21/sibi_dst/df_helper/backends/sql_model/__init__.py +0 -9
- sibi_dst-0.3.21/sibi_dst/df_helper/backends/sql_model/_sqlmodel_db_connection.py +0 -134
- sibi_dst-0.3.21/sibi_dst/df_helper/backends/sql_model/_sqlmodel_load_from_db.py +0 -101
- {sibi_dst-0.3.21 → sibi_dst-0.3.23}/README.md +0 -0
- {sibi_dst-0.3.21 → sibi_dst-0.3.23}/sibi_dst/__init__.py +0 -0
- {sibi_dst-0.3.21 → sibi_dst-0.3.23}/sibi_dst/df_helper/__init__.py +0 -0
- {sibi_dst-0.3.21 → sibi_dst-0.3.23}/sibi_dst/df_helper/_df_helper.py +0 -0
- {sibi_dst-0.3.21 → sibi_dst-0.3.23}/sibi_dst/df_helper/_parquet_artifact.py +0 -0
- {sibi_dst-0.3.21 → sibi_dst-0.3.23}/sibi_dst/df_helper/_parquet_reader.py +0 -0
- {sibi_dst-0.3.21 → sibi_dst-0.3.23}/sibi_dst/df_helper/backends/__init__.py +0 -0
- {sibi_dst-0.3.21 → sibi_dst-0.3.23}/sibi_dst/df_helper/backends/django/__init__.py +1 -1
- {sibi_dst-0.3.21 → sibi_dst-0.3.23}/sibi_dst/df_helper/backends/django/_django_db_connection.py +0 -0
- {sibi_dst-0.3.21 → sibi_dst-0.3.23}/sibi_dst/df_helper/backends/django/_django_load_from_db.py +0 -0
- {sibi_dst-0.3.21 → sibi_dst-0.3.23}/sibi_dst/df_helper/backends/django/_django_sql_model_builder.py +0 -0
- {sibi_dst-0.3.21 → sibi_dst-0.3.23}/sibi_dst/df_helper/backends/django/_io_dask.py +0 -0
- {sibi_dst-0.3.21 → sibi_dst-0.3.23}/sibi_dst/df_helper/backends/http/__init__.py +0 -0
- {sibi_dst-0.3.21 → sibi_dst-0.3.23}/sibi_dst/df_helper/backends/http/_http_config.py +0 -0
- {sibi_dst-0.3.21 → sibi_dst-0.3.23}/sibi_dst/df_helper/backends/parquet/__init__.py +0 -0
- {sibi_dst-0.3.21 → sibi_dst-0.3.23}/sibi_dst/df_helper/backends/parquet/_parquet_filter_handler.py +0 -0
- {sibi_dst-0.3.21 → sibi_dst-0.3.23}/sibi_dst/df_helper/backends/parquet/_parquet_options.py +0 -0
- {sibi_dst-0.3.21 → sibi_dst-0.3.23}/sibi_dst/df_helper/backends/sql_alchemy/__init__.py +0 -0
- {sibi_dst-0.3.21 → sibi_dst-0.3.23}/sibi_dst/df_helper/backends/sql_alchemy/_io_sqlalchemy_dask.py +0 -0
- {sibi_dst-0.3.21 → sibi_dst-0.3.23}/sibi_dst/df_helper/backends/sql_alchemy/_sqlachemy_filter_handler.py +0 -0
- {sibi_dst-0.3.21 → sibi_dst-0.3.23}/sibi_dst/df_helper/backends/sql_alchemy/_sqlalchemy_db_connection.py +0 -0
- {sibi_dst-0.3.21 → sibi_dst-0.3.23}/sibi_dst/df_helper/backends/sql_alchemy/_sqlalchemy_load_from_db.py +0 -0
- {sibi_dst-0.3.21 → sibi_dst-0.3.23}/sibi_dst/df_helper/backends/sql_alchemy/_sqlalchemy_model_builder.py +0 -0
- {sibi_dst-0.3.21 → sibi_dst-0.3.23}/sibi_dst/df_helper/core/__init__.py +0 -0
- {sibi_dst-0.3.21 → sibi_dst-0.3.23}/sibi_dst/df_helper/core/_defaults.py +0 -0
- {sibi_dst-0.3.21 → sibi_dst-0.3.23}/sibi_dst/df_helper/core/_filter_handler.py +0 -0
- {sibi_dst-0.3.21 → sibi_dst-0.3.23}/sibi_dst/df_helper/core/_params_config.py +0 -0
- {sibi_dst-0.3.21 → sibi_dst-0.3.23}/sibi_dst/df_helper/core/_query_config.py +0 -0
- {sibi_dst-0.3.21 → sibi_dst-0.3.23}/sibi_dst/utils/__init__.py +9 -9
- {sibi_dst-0.3.21 → sibi_dst-0.3.23}/sibi_dst/utils/_airflow_manager.py +0 -0
- {sibi_dst-0.3.21 → sibi_dst-0.3.23}/sibi_dst/utils/_clickhouse_writer.py +0 -0
- {sibi_dst-0.3.21 → sibi_dst-0.3.23}/sibi_dst/utils/_credentials.py +0 -0
- {sibi_dst-0.3.21 → sibi_dst-0.3.23}/sibi_dst/utils/_data_utils.py +0 -0
- {sibi_dst-0.3.21 → sibi_dst-0.3.23}/sibi_dst/utils/_date_utils.py +0 -0
- {sibi_dst-0.3.21 → sibi_dst-0.3.23}/sibi_dst/utils/_df_utils.py +0 -0
- {sibi_dst-0.3.21 → sibi_dst-0.3.23}/sibi_dst/utils/_file_utils.py +0 -0
- {sibi_dst-0.3.21 → sibi_dst-0.3.23}/sibi_dst/utils/_filepath_generator.py +0 -0
- {sibi_dst-0.3.21 → sibi_dst-0.3.23}/sibi_dst/utils/_log_utils.py +0 -0
- {sibi_dst-0.3.21 → sibi_dst-0.3.23}/sibi_dst/utils/_parquet_saver.py +0 -0
- {sibi_dst-0.3.21 → sibi_dst-0.3.23}/sibi_dst/utils/_storage_manager.py +0 -0
{sibi_dst-0.3.21 → sibi_dst-0.3.23}/sibi_dst/utils/_data_wrapper.py

```diff
@@ -1,4 +1,5 @@
 import datetime
+from concurrent.futures import ThreadPoolExecutor
 from typing import Type, Any, Dict, Optional
 
 import fsspec
@@ -32,7 +33,8 @@ class DataWrapper:
                  logger: Optional[Logger] = None,
                  max_age_minutes: int = DEFAULT_MAX_AGE_MINUTES,
                  history_days_threshold: int = DEFAULT_HISTORY_DAYS_THRESHOLD,
-                 show_progress: bool = False):
+                 show_progress: bool = False,
+                 timeout: Optional[int] = 300):
        self.dataclass = dataclass
        self.date_field = date_field
        self.data_path = self.ensure_forward_slash(data_path)
@@ -50,6 +52,7 @@ class DataWrapper:
        self.max_age_minutes = max_age_minutes
        self.history_days_threshold = history_days_threshold
        self.show_progress = show_progress
+       self.timeout = timeout
 
        self.start_date = self.convert_to_date(start_date)
        self.end_date = self.convert_to_date(end_date)
@@ -76,31 +79,79 @@ class DataWrapper:
            yield date.date()
 
    def process(self):
-       """Execute the update plan following the specified hierarchy."""
+       """Execute the update plan using 'update_priority' to determine processing order."""
        update_plan_table = self.generate_update_plan_with_conditions()
 
-       # Display the update plan table to the user if show_progress is True
+       # Display the update plan table to the user if requested
        if self.show_progress:
            display(update_plan_table)
 
-       # Process files according to the hierarchy, considering only `update_required` dates
-       for category, description in [
-           ("overwrite", "Processing files due to overwrite=True"),
-           ("history_days", "Processing files within history_days_threshold"),
-           ("missing_files", "Processing missing files")
-       ]:
-           # Filter dates in the category where `update_required` is True
+       # Filter out rows that do not require updates (priority 0 means skip)
+       update_plan_table = update_plan_table[
+           (update_plan_table["update_required"] == True) & (update_plan_table["update_priority"] != 0)
+       ]
+
+       # Group by priority
+       priorities = sorted(update_plan_table["update_priority"].unique())
+
+       # We will process each priority level in its own thread.
+       # Each thread will handle all dates associated with that priority.
+       def process_priority(priority):
+           # Extract dates for the current priority
            dates_to_process = update_plan_table[
-               (update_plan_table["update_category"] == category) & (update_plan_table["update_required"])
+               update_plan_table["update_priority"] == priority
            ]["date"].tolist()
 
+           # If show_progress is True, wrap in a progress bar
            date_iterator = dates_to_process
            if self.show_progress:
-               date_iterator = tqdm(date_iterator, desc=f"{description}:{self.dataclass.__name__}", unit="date")
+               date_iterator = tqdm(date_iterator, desc=f"Processing priority {priority}:{self.dataclass.__name__}",
+                                    unit="date")
 
+           # Process each date for this priority
            for current_date in date_iterator:
                self.process_date(current_date)
 
+       # Launch a separate thread for each priority
+       with ThreadPoolExecutor(max_workers=len(priorities)) as executor:
+           futures = {executor.submit(process_priority, p): p for p in priorities}
+           for future in futures:
+               try:
+                   future.result(timeout=self.timeout)
+               except TimeoutError:
+                   self.logger.error(f"Thread for {self.dataclass.__name__} timed out. Thread cancelled.")
+                   future.cancel()
+                   priority = futures[future]
+                   new_future = executor.submit(process_priority, priority)
+                   futures[new_future] = priority
+                   self.logger.info(f"Resubmitted task for priority {priority} after timeout.")
+
+       # def process(self):
+       #     """Execute the update plan following the specified hierarchy."""
+       #     update_plan_table = self.generate_update_plan_with_conditions()
+       #
+       #     # Display the update plan table to the user if show_progress is True
+       #     if self.show_progress:
+       #         display(update_plan_table)
+       #
+       #     # Process files according to the hierarchy, considering only `update_required` dates
+       #     for category, description in [
+       #         ("overwrite", "Processing files due to overwrite=True"),
+       #         ("history_days", "Processing files within history_days_threshold"),
+       #         ("missing_files", "Processing missing files")
+       #     ]:
+       #         # Filter dates in the category where `update_required` is True
+       #         dates_to_process = update_plan_table[
+       #             (update_plan_table["update_category"] == category) & (update_plan_table["update_required"])
+       #         ]["date"].tolist()
+       #
+       #         date_iterator = dates_to_process
+       #         if self.show_progress:
+       #             date_iterator = tqdm(date_iterator, desc=f"{description}:{self.dataclass.__name__}", unit="date")
+       #
+       #         for current_date in date_iterator:
+       #             self.process_date(current_date)
+
    def is_file_older_than(self, file_path: str) -> bool:
        """
        Check if a file is older than the specified max_age_minutes.
```
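The headline change above is that `DataWrapper.process()` now runs each `update_priority` level in its own worker thread and bounds each one with the new `timeout` argument (default 300 seconds), resubmitting a priority level once if it does not finish in time. A minimal, self-contained sketch of that pattern is below; the `plan` dictionary, the `process_date` stub, and the timeout value are illustrative placeholders rather than the package's own API.

```python
import time
from concurrent.futures import ThreadPoolExecutor, TimeoutError as FuturesTimeoutError


def process_date(current_date):
    # Stand-in for the real per-date work (e.g. rebuilding one parquet partition).
    time.sleep(0.1)
    print(f"processed {current_date}")


def process_priority(priority, dates):
    # One worker handles every date that shares the same update priority.
    for current_date in dates:
        process_date(current_date)


# Hypothetical update plan: priority level -> dates that still need an update.
plan = {1: ["2025-01-01"], 2: ["2025-01-02", "2025-01-03"], 3: ["2025-01-04"]}
timeout = 300  # seconds, mirroring the new DataWrapper default

with ThreadPoolExecutor(max_workers=len(plan)) as executor:
    futures = {executor.submit(process_priority, p, dates): p for p, dates in plan.items()}
    for future, priority in list(futures.items()):
        try:
            future.result(timeout=timeout)
        except FuturesTimeoutError:
            # Resubmit that priority level once if it did not finish in time.
            futures[executor.submit(process_priority, priority, plan[priority])] = priority
```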
{sibi_dst-0.3.21 → sibi_dst-0.3.23}/sibi_dst/utils/_data_wrapper.py (continued)

```diff
@@ -181,12 +232,14 @@ class DataWrapper:
                    category = "history_days"
                    update_required = True
                else:
+                   category = "file age is recent"
                    update_required = False
            # Hierarchy 3: Missing files
            elif missing_file and current_date <= today:
                category = "missing_files"
                update_required = True
            else:
+               category = "No Update Required"
                update_required = False
 
            # Collect condition descriptions for the update plan table
@@ -199,6 +252,16 @@
                "update_category": category,
                "datawrapper class": self.dataclass.__name__
            })
+       priority_map = {
+           "overwrite": 1,
+           "history_days": 2,
+           "missing_files": 3
+       }
+
+       for row in rows:
+           category = row.get("update_category")
+           # Default to None if no category assigned (no update required)
+           row["update_priority"] = priority_map.get(category, 0)
 
        update_plan_table = pd.DataFrame(rows)
        return update_plan_table
```
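The second half of the change derives the `update_priority` column that `process()` consumes: each `update_category` maps to a fixed priority, and anything outside the map falls back to 0, which `process()` then filters out. A small standalone sketch with made-up plan rows (the dates and categories below are illustrative, not taken from the package):

```python
import pandas as pd

# Hypothetical plan rows mirroring the categories assigned above.
rows = [
    {"date": "2025-01-01", "update_category": "overwrite", "update_required": True},
    {"date": "2025-01-02", "update_category": "missing_files", "update_required": True},
    {"date": "2025-01-03", "update_category": "No Update Required", "update_required": False},
]

priority_map = {"overwrite": 1, "history_days": 2, "missing_files": 3}
for row in rows:
    # Categories outside the map (e.g. "No Update Required") fall back to priority 0.
    row["update_priority"] = priority_map.get(row["update_category"], 0)

update_plan_table = pd.DataFrame(rows)
# process() later keeps only rows that require an update and have a non-zero priority.
to_process = update_plan_table[
    (update_plan_table["update_required"]) & (update_plan_table["update_priority"] != 0)
]
print(to_process)
```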
sibi_dst-0.3.21/sibi_dst/df_helper/backends/django/_io_dask_alt.py (deleted in 0.3.23, 189 lines removed)

```python
import itertools

import dask.dataframe as dd
import django
import pandas as pd
from django.core.cache import cache
from django.core.exceptions import FieldDoesNotExist
from django.db import models
from django.db.models import Field
from django.utils.encoding import force_str as force_text


class ReadFrameDask:
    FieldDoesNotExist = (
        django.core.exceptions.FieldDoesNotExist
        if django.VERSION < (1, 8)
        else django.core.exceptions.FieldDoesNotExist
    )

    def __init__(
        self,
        qs,
        **kwargs,
    ):
        self.qs = qs
        self.coerce_float = kwargs.setdefault("coerce_float", False)
        self.chunk_size = kwargs.setdefault("chunk_size", 1000)
        self.verbose = kwargs.setdefault("verbose", True)

    @staticmethod
    def get_model_name(model):
        return model._meta.model_name

    @staticmethod
    def get_related_model(field):
        model = None
        if hasattr(field, "related_model") and field.related_model:
            model = field.related_model
        elif hasattr(field, "rel") and field.rel:
            model = field.rel.to
        return model

    @classmethod
    def get_base_cache_key(cls, model):
        return (
            f"dask_{model._meta.app_label}_{cls.get_model_name(model)}_%s_rendering"
        )

    @classmethod
    def replace_pk(cls, model):
        base_cache_key = cls.get_base_cache_key(model)

        def get_cache_key_from_pk(pk):
            return None if pk is None else base_cache_key % str(pk)

        def inner(pk_series):
            pk_series = pk_series.astype(object).where(pk_series.notnull(), None)
            cache_keys = pk_series.apply(get_cache_key_from_pk, convert_dtype=False)
            unique_cache_keys = list(filter(None, cache_keys.unique()))
            if not unique_cache_keys:
                return pk_series

            out_dict = cache.get_many(unique_cache_keys)
            if len(out_dict) < len(unique_cache_keys):
                out_dict = dict(
                    [
                        (base_cache_key % obj.pk, force_text(obj))
                        for obj in model.objects.filter(
                            pk__in=list(filter(None, pk_series.unique()))
                        )
                    ]
                )
                cache.set_many(out_dict)
            return list(map(out_dict.get, cache_keys))

        return inner

    @staticmethod
    def replace_from_choices(choices):
        def inner(values):
            return [choices.get(v, v) for v in values]

        return inner

    @classmethod
    def build_update_functions(cls, fieldnames, fields):
        for fieldname, field in zip(fieldnames, fields):
            if not isinstance(field, Field):
                yield fieldname, None
            else:
                if field.choices:
                    choices = dict([(k, force_text(v)) for k, v in field.flatchoices])
                    yield fieldname, cls.replace_from_choices(choices)
                elif field.get_internal_type() == "ForeignKey":
                    yield fieldname, cls.replace_pk(cls.get_related_model(field))

    @classmethod
    def update_with_verbose(cls, df, fieldnames, fields):
        for fieldname, function in cls.build_update_functions(fieldnames, fields):
            if function is not None:
                df[fieldname] = df[fieldname].map_partitions(lambda x: function(x))

    @staticmethod
    def infer_dtypes_from_django(qs):
        """Infers Dask data types based on Django queryset model fields, with support for nullable integers."""
        django_to_dask_dtype = {
            'AutoField': 'Int64',  # Use nullable integer
            'BigAutoField': 'Int64',
            'BigIntegerField': 'Int64',
            'BooleanField': 'bool',
            'CharField': 'object',
            'DateField': 'datetime64[ns]',
            'DateTimeField': 'datetime64[ns]',
            'DecimalField': 'float64',
            'FloatField': 'float64',
            'IntegerField': 'Int64',  # Use nullable integer
            'PositiveIntegerField': 'Int64',
            'SmallIntegerField': 'Int64',
            'TextField': 'object',
            'TimeField': 'object',
            'UUIDField': 'object',
            'ForeignKey': 'Int64',  # Use nullable integer for FK fields
        }

        dtypes = {}
        # Handle model fields
        for field in qs.model._meta.get_fields():
            # Skip reverse relationships and non-concrete fields
            if not getattr(field, 'concrete', False):
                continue

            # Check for AutoField or BigAutoField explicitly
            if isinstance(field, (models.AutoField, models.BigAutoField)):
                dtypes[field.name] = 'Int64'  # Nullable integer for autoincremented fields
            else:
                # Use field type to infer dtype
                field_type = field.get_internal_type()
                dtypes[field.name] = django_to_dask_dtype.get(field_type, 'object')

        # Handle annotated fields
        for annotation_name, annotation in qs.query.annotation_select.items():
            if hasattr(annotation, 'output_field'):
                field_type = annotation.output_field.get_internal_type()
                dtype = django_to_dask_dtype.get(field_type, 'object')
            else:
                dtype = 'object'  # Default to object for untyped annotations
            dtypes[annotation_name] = dtype

        return dtypes

    def read_frame(self, fillna_value=None):
        qs = self.qs
        fieldnames = tuple(qs.model._meta.get_fields())
        dtypes = self.infer_dtypes_from_django(qs)
        chunk_size = self.chunk_size
        verbose = self.verbose

        # Use values to directly fetch required fields
        qs = qs.values(*fieldnames)

        # Create partitions for Dask
        partitions = []
        iterator = qs.iterator(chunk_size=chunk_size)
        for chunk in itertools.islice(iterator, chunk_size):
            df = pd.DataFrame.from_records(chunk, columns=fieldnames)

            # Handle NaN values
            if fillna_value:
                df = df.fillna(fillna_value)

            # Optimize timezone conversions
            for col in df.columns:
                if isinstance(df[col].dtype, pd.DatetimeTZDtype):
                    df[col] = df[col].dt.tz_localize(None)

            # Optimize dtype conversion
            df = df.convert_dtypes()

            # Convert to Dask DataFrame
            partitions.append(dd.from_pandas(df, npartitions=1))

        # Combine all partitions
        dask_df = dd.concat(partitions, axis=0, ignore_index=True)

        # Apply verbose updates
        if verbose:
            self.update_with_verbose(dask_df, fieldnames, qs.model._meta.fields)

        return dask_df
```
sibi_dst-0.3.21/sibi_dst/df_helper/backends/sql_model/_sqlmodel_db_connection.py (deleted in 0.3.23, 134 lines removed)

```python
import datetime
from typing import Any, Optional, Dict, Type

from pydantic import BaseModel, model_validator
from sqlalchemy import inspect
from sqlalchemy.exc import OperationalError
from sqlalchemy.sql import text
from sqlalchemy.sql.sqltypes import (
    Integer,
    String,
    Float,
    Boolean,
    DateTime,
    Date,
    Time,
    Numeric,
)
from sqlmodel import SQLModel, Field, create_engine


class SQLModelConnectionConfig(BaseModel):
    live: bool = False
    connection_url: str
    table: Optional[str] = None
    model: Optional[Any] = None
    engine: Optional[Any] = None  # Save engine to reuse it

    class Config:
        arbitrary_types_allowed = True

    @model_validator(mode="after")
    def validate_and_initialize(self):
        """
        Validate connection parameters, initialize the engine, and build the dynamic model if necessary.
        """
        # Validate `connection_url`
        if not self.connection_url:
            raise ValueError("`connection_url` must be provided.")

        # Initialize the engine
        self.engine = create_engine(self.connection_url)

        # Validate the connection
        self.validate_connection()

        # If table is provided, set `live=False`
        if self.table:
            self.live = False

        # If model is not provided, build dynamically
        if not self.model:
            if not self.table:
                raise ValueError("`table_name` must be provided to build the model.")
            try:
                self.model = self.build_model()
            except Exception as e:
                raise ValueError(f"Failed to build model for table '{self.table}': {e}")
        else:
            self.live = True

        return self

    def validate_connection(self):
        """
        Test the database connection by executing a simple query.
        """
        try:
            with self.engine.connect() as connection:
                connection.execute(text("SELECT 1"))
        except OperationalError as e:
            raise ValueError(f"Failed to connect to the database: {e}")

    def build_model(self) -> Type[SQLModel]:
        """
        Dynamically build a SQLModel class based on the table schema.
        """
        inspector = inspect(self.engine)

        # Validate table existence
        if self.table not in inspector.get_table_names():
            raise ValueError(f"Table '{self.table}' does not exist in the database.")

        columns = inspector.get_columns(self.table)
        if not columns:
            raise ValueError(f"No columns found for table '{self.table}'.")

        type_mapping = {
            Integer: int,
            String: str,
            Float: float,
            Boolean: bool,
            DateTime: datetime.datetime,
            Date: datetime.date,
            Time: datetime.time,
            Numeric: float,
        }

        annotations: Dict[str, Type] = {}
        model_fields = {}

        for column in columns:
            name = column["name"]
            sa_type = column["type"]
            nullable = column["nullable"]
            default = column.get("default", None)
            primary_key = column.get("primary_key", False)

            py_type = None
            for sa_base_type, py_base_type in type_mapping.items():
                if isinstance(sa_type, sa_base_type):
                    py_type = py_base_type
                    break

            if py_type is None:
                raise ValueError(f"Unsupported SQLAlchemy type for column '{name}': {sa_type}")

            # Define field type and attributes
            annotations[name] = py_type
            model_fields[name] = Field(
                default=default,
                nullable=nullable,
                primary_key=primary_key,
                sa_column_args={"type_": sa_type},
            )

        model_fields["__annotations__"] = annotations
        model_fields["__table__"] = self.table
        model_name = self._table2model(self.table)
        return type(model_name, (SQLModel,), model_fields)

    @staticmethod
    def _table2model(table_name: str) -> str:
        """Convert table name to PascalCase model name."""
        return "".join(word.capitalize() for word in table_name.split("_"))
```
sibi_dst-0.3.21/sibi_dst/df_helper/backends/sql_model/_sqlmodel_load_from_db.py (deleted in 0.3.23, 101 lines removed)

```python
import logging
from typing import Any, Dict, Optional

import dask.dataframe as dd
import pandas as pd
from sqlmodel import Session, select, text


class SQLModelLoadFromDb:
    df: dd.DataFrame

    def __init__(
        self,
        db_connection,
        db_query: Optional[Dict[str, Any]] = None,
        db_params: Optional[Dict[str, Any]] = None,
        logger=None,
        **kwargs,
    ):
        """
        Initialize the loader with database connection, query, and parameters.
        """
        self.db_connection = db_connection
        self.table_name = self.db_connection.table
        self.model = self.db_connection.model
        self.engine = self.db_connection.engine
        self.logger = logger or self._default_logger()
        self.query_config = db_query or {}
        self.params_config = db_params or {}
        self.debug = kwargs.pop("debug", False)

    def _default_logger(self):
        """Create a default logger."""
        logging.basicConfig(level=logging.INFO)
        return logging.getLogger("SQLModelLoadFromDb")

    def build_and_load(self) -> dd.DataFrame:
        """
        Load data into a Dask DataFrame based on the query and parameters.
        """
        self.df = self._build_and_load()
        if not self.df.empty:
            self._process_loaded_data()
        return self.df

    def _build_and_load(self) -> dd.DataFrame:
        """
        Query the database and load results into a Dask DataFrame.
        """
        print(self.model.__name__)
        with Session(self.engine) as session:
            try:
                query = select(text(self.model.__table__))
                print("query:", query)

                # Apply filters if provided
                filters = self.params_config.df_params.get("filters")
                if filters:
                    # Apply ORM filters (simple equality conditions)
                    for column_name, value in filters.items():
                        column = getattr(self.model, column_name, None)
                        if column is not None:
                            query = query.filter(column == value)
                        else:
                            self.logger.warning(f"Filter column '{column_name}' not found in model.")

                # Apply limit if provided in query_config
                n_records = self.query_config.n_records
                if n_records:
                    query = query.limit(n_records)

                # Debug: Log the SQL query
                self.logger.debug(f"Executing query: {str(query)}")

                # Execute the query
                results = session.exec(query).fetchall()

                # Convert query results to a Dask DataFrame
                print("results:", results)
                if results:
                    df = dd.from_pandas(pd.DataFrame([r.dict() for r in results]), npartitions=1)
                else:
                    self.logger.debug("Query returned no results.")
                    df = dd.from_pandas(pd.DataFrame(), npartitions=1)

            except Exception as e:
                print(e)
                self.logger.error(f"Error loading data: {e}")
                df = dd.from_pandas(pd.DataFrame(), npartitions=1)

        return df

    def _process_loaded_data(self):
        """
        Process and clean the loaded data.
        """
        field_map = self.params_config.get("field_map", {})
        if field_map:
            rename_mapping = {k: v for k, v in field_map.items() if k in self.df.columns}
            if rename_mapping:
                self.df = self.df.rename(columns=rename_mapping, meta={v: "object" for v in rename_mapping.values()})
```
{sibi_dst-0.3.21 → sibi_dst-0.3.23}/sibi_dst/df_helper/backends/django/__init__.py

```diff
@@ -1,8 +1,8 @@
 from __future__ import annotations
 
+from ._io_dask import ReadFrameDask
 from ._django_db_connection import DjangoConnectionConfig
 from ._django_load_from_db import DjangoLoadFromDb
-from ._io_dask import ReadFrameDask
 
 __all__ = [
     "DjangoConnectionConfig",
```
{sibi_dst-0.3.21 → sibi_dst-0.3.23}/sibi_dst/utils/__init__.py

```diff
@@ -1,22 +1,22 @@
 from __future__ import annotations
 
-from ._airflow_manager import AirflowDAGManager
-from ._clickhouse_writer import ClickHouseWriter
-from ._credentials import *
-from ._data_utils import DataUtils
-from ._data_wrapper import DataWrapper
+from ._log_utils import Logger
 from ._date_utils import *
-from ._df_utils import DfUtils
+from ._data_utils import DataUtils
 from ._file_utils import FileUtils
 from ._filepath_generator import FilePathGenerator
-from ._log_utils import Logger
-from ._parquet_saver import ParquetSaver
+from ._df_utils import DfUtils
 from ._storage_manager import StorageManager
+from ._parquet_saver import ParquetSaver
+from ._clickhouse_writer import ClickHouseWriter
+from ._airflow_manager import AirflowDAGManager
+from ._credentials import *
+from ._data_wrapper import DataWrapper
 
 __all__ = [
+    "Logger",
     "ConfigManager",
     "ConfigLoader",
-    "Logger",
     "DateUtils",
     "BusinessDays",
     "FileUtils",
```
|