sibi-dst 0.3.22__tar.gz → 0.3.24__tar.gz
This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
- {sibi_dst-0.3.22 → sibi_dst-0.3.24}/PKG-INFO +3 -2
- {sibi_dst-0.3.22 → sibi_dst-0.3.24}/pyproject.toml +3 -2
- sibi_dst-0.3.22/sibi_dst/df_helper/backends/django/_io_dask_alt.py +0 -189
- sibi_dst-0.3.22/sibi_dst/df_helper/backends/sql_model/__init__.py +0 -9
- sibi_dst-0.3.22/sibi_dst/df_helper/backends/sql_model/_sqlmodel_db_connection.py +0 -134
- sibi_dst-0.3.22/sibi_dst/df_helper/backends/sql_model/_sqlmodel_load_from_db.py +0 -101
- {sibi_dst-0.3.22 → sibi_dst-0.3.24}/README.md +0 -0
- {sibi_dst-0.3.22 → sibi_dst-0.3.24}/sibi_dst/__init__.py +0 -0
- {sibi_dst-0.3.22 → sibi_dst-0.3.24}/sibi_dst/df_helper/__init__.py +0 -0
- {sibi_dst-0.3.22 → sibi_dst-0.3.24}/sibi_dst/df_helper/_df_helper.py +0 -0
- {sibi_dst-0.3.22 → sibi_dst-0.3.24}/sibi_dst/df_helper/_parquet_artifact.py +0 -0
- {sibi_dst-0.3.22 → sibi_dst-0.3.24}/sibi_dst/df_helper/_parquet_reader.py +0 -0
- {sibi_dst-0.3.22 → sibi_dst-0.3.24}/sibi_dst/df_helper/backends/__init__.py +0 -0
- {sibi_dst-0.3.22 → sibi_dst-0.3.24}/sibi_dst/df_helper/backends/django/__init__.py +0 -0
- {sibi_dst-0.3.22 → sibi_dst-0.3.24}/sibi_dst/df_helper/backends/django/_django_db_connection.py +0 -0
- {sibi_dst-0.3.22 → sibi_dst-0.3.24}/sibi_dst/df_helper/backends/django/_django_load_from_db.py +0 -0
- {sibi_dst-0.3.22 → sibi_dst-0.3.24}/sibi_dst/df_helper/backends/django/_django_sql_model_builder.py +0 -0
- {sibi_dst-0.3.22 → sibi_dst-0.3.24}/sibi_dst/df_helper/backends/django/_io_dask.py +0 -0
- {sibi_dst-0.3.22 → sibi_dst-0.3.24}/sibi_dst/df_helper/backends/http/__init__.py +0 -0
- {sibi_dst-0.3.22 → sibi_dst-0.3.24}/sibi_dst/df_helper/backends/http/_http_config.py +0 -0
- {sibi_dst-0.3.22 → sibi_dst-0.3.24}/sibi_dst/df_helper/backends/parquet/__init__.py +0 -0
- {sibi_dst-0.3.22 → sibi_dst-0.3.24}/sibi_dst/df_helper/backends/parquet/_parquet_filter_handler.py +0 -0
- {sibi_dst-0.3.22 → sibi_dst-0.3.24}/sibi_dst/df_helper/backends/parquet/_parquet_options.py +0 -0
- {sibi_dst-0.3.22 → sibi_dst-0.3.24}/sibi_dst/df_helper/backends/sql_alchemy/__init__.py +0 -0
- {sibi_dst-0.3.22 → sibi_dst-0.3.24}/sibi_dst/df_helper/backends/sql_alchemy/_io_sqlalchemy_dask.py +0 -0
- {sibi_dst-0.3.22 → sibi_dst-0.3.24}/sibi_dst/df_helper/backends/sql_alchemy/_sqlachemy_filter_handler.py +0 -0
- {sibi_dst-0.3.22 → sibi_dst-0.3.24}/sibi_dst/df_helper/backends/sql_alchemy/_sqlalchemy_db_connection.py +0 -0
- {sibi_dst-0.3.22 → sibi_dst-0.3.24}/sibi_dst/df_helper/backends/sql_alchemy/_sqlalchemy_load_from_db.py +0 -0
- {sibi_dst-0.3.22 → sibi_dst-0.3.24}/sibi_dst/df_helper/backends/sql_alchemy/_sqlalchemy_model_builder.py +0 -0
- {sibi_dst-0.3.22 → sibi_dst-0.3.24}/sibi_dst/df_helper/core/__init__.py +0 -0
- {sibi_dst-0.3.22 → sibi_dst-0.3.24}/sibi_dst/df_helper/core/_defaults.py +0 -0
- {sibi_dst-0.3.22 → sibi_dst-0.3.24}/sibi_dst/df_helper/core/_filter_handler.py +0 -0
- {sibi_dst-0.3.22 → sibi_dst-0.3.24}/sibi_dst/df_helper/core/_params_config.py +0 -0
- {sibi_dst-0.3.22 → sibi_dst-0.3.24}/sibi_dst/df_helper/core/_query_config.py +0 -0
- {sibi_dst-0.3.22 → sibi_dst-0.3.24}/sibi_dst/utils/__init__.py +0 -0
- {sibi_dst-0.3.22 → sibi_dst-0.3.24}/sibi_dst/utils/_airflow_manager.py +0 -0
- {sibi_dst-0.3.22 → sibi_dst-0.3.24}/sibi_dst/utils/_clickhouse_writer.py +0 -0
- {sibi_dst-0.3.22 → sibi_dst-0.3.24}/sibi_dst/utils/_credentials.py +0 -0
- {sibi_dst-0.3.22 → sibi_dst-0.3.24}/sibi_dst/utils/_data_utils.py +0 -0
- {sibi_dst-0.3.22 → sibi_dst-0.3.24}/sibi_dst/utils/_data_wrapper.py +0 -0
- {sibi_dst-0.3.22 → sibi_dst-0.3.24}/sibi_dst/utils/_date_utils.py +0 -0
- {sibi_dst-0.3.22 → sibi_dst-0.3.24}/sibi_dst/utils/_df_utils.py +0 -0
- {sibi_dst-0.3.22 → sibi_dst-0.3.24}/sibi_dst/utils/_file_utils.py +0 -0
- {sibi_dst-0.3.22 → sibi_dst-0.3.24}/sibi_dst/utils/_filepath_generator.py +0 -0
- {sibi_dst-0.3.22 → sibi_dst-0.3.24}/sibi_dst/utils/_log_utils.py +0 -0
- {sibi_dst-0.3.22 → sibi_dst-0.3.24}/sibi_dst/utils/_parquet_saver.py +0 -0
- {sibi_dst-0.3.22 → sibi_dst-0.3.24}/sibi_dst/utils/_storage_manager.py +0 -0
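To confirm which of the two versions is installed in a given environment, the standard-library `importlib.metadata` is enough; a minimal check (the expected output assumes the 0.3.24 upgrade has been applied):

```python
from importlib.metadata import version

# Reports the installed distribution version, e.g. "0.3.24" after upgrading.
print(version("sibi-dst"))
```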
{sibi_dst-0.3.22 → sibi_dst-0.3.24}/PKG-INFO

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sibi-dst
-Version: 0.3.22
+Version: 0.3.24
 Summary: Data Science Toolkit
 Author: Luis Valverde
 Author-email: lvalverdeb@gmail.com
@@ -25,6 +25,7 @@ Requires-Dist: openpyxl (>=3.1.5,<4.0.0)
 Requires-Dist: pandas (>=2.2.3,<3.0.0)
 Requires-Dist: paramiko (>=3.5.0,<4.0.0)
 Requires-Dist: psutil (>=6.1.0,<7.0.0)
+Requires-Dist: psycopg2 (>=2.9.10,<3.0.0)
 Requires-Dist: pyarrow (>=18.0.0,<19.0.0)
 Requires-Dist: pydantic (>=2.9.2,<3.0.0)
 Requires-Dist: pymysql (>=1.1.1,<2.0.0)
@@ -33,7 +34,7 @@ Requires-Dist: python-dotenv (>=1.0.1,<2.0.0)
 Requires-Dist: sqlalchemy (>=2.0.36,<3.0.0)
 Requires-Dist: tornado (>=6.4.1,<7.0.0)
 Requires-Dist: tqdm (>=4.67.0,<5.0.0)
-Requires-Dist: uvicorn (>=0.32.1,<0.33.0)
+Requires-Dist: uvicorn (>=0.34.0,<0.35.0)
 Description-Content-Type: text/markdown

 # sibi-dst
```
{sibi_dst-0.3.22 → sibi_dst-0.3.24}/pyproject.toml

```diff
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "sibi-dst"
-version = "0.3.22"
+version = "0.3.24"
 description = "Data Science Toolkit"
 authors = ["Luis Valverde <lvalverdeb@gmail.com>"]
 readme = "README.md"
@@ -30,10 +30,11 @@ clickhouse-driver = "^0.2.9"
 paramiko = "^3.5.0"
 chardet = "^5.2.0"
 charset-normalizer = "^3.4.0"
-uvicorn = "^0.32.1"
 sqlalchemy = "^2.0.36"
 djangorestframework = "^3.15.2"
 dask-expr = "^1.1.20"
+psycopg2 = "^2.9.10"
+uvicorn = "^0.34.0"


 [build-system]
```
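The PKG-INFO and pyproject.toml changes above are two views of the same bumps: Poetry expands a caret constraint such as `uvicorn = "^0.34.0"` into the `Requires-Dist` range published in PKG-INFO. A minimal sketch of that expansion, assuming the third-party `packaging` library:

```python
from packaging.specifiers import SpecifierSet
from packaging.version import Version

# Poetry's caret operator pins the leftmost non-zero segment, so for a
# 0.x release "^0.34.0" expands to ">=0.34.0,<0.35.0" -- the exact range
# published as "Requires-Dist: uvicorn (>=0.34.0,<0.35.0)".
caret_range = SpecifierSet(">=0.34.0,<0.35.0")

assert Version("0.34.7") in caret_range      # patch releases stay inside
assert Version("0.35.0") not in caret_range  # the next minor falls outside
```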
sibi_dst-0.3.22/sibi_dst/df_helper/backends/django/_io_dask_alt.py
DELETED

```diff
@@ -1,189 +0,0 @@
-import itertools
-
-import dask.dataframe as dd
-import django
-import pandas as pd
-from django.core.cache import cache
-from django.core.exceptions import FieldDoesNotExist
-from django.db import models
-from django.db.models import Field
-from django.utils.encoding import force_str as force_text
-
-
-class ReadFrameDask:
-    FieldDoesNotExist = (
-        django.core.exceptions.FieldDoesNotExist
-        if django.VERSION < (1, 8)
-        else django.core.exceptions.FieldDoesNotExist
-    )
-
-    def __init__(
-        self,
-        qs,
-        **kwargs,
-    ):
-        self.qs = qs
-        self.coerce_float = kwargs.setdefault("coerce_float", False)
-        self.chunk_size = kwargs.setdefault("chunk_size", 1000)
-        self.verbose = kwargs.setdefault("verbose", True)
-
-    @staticmethod
-    def get_model_name(model):
-        return model._meta.model_name
-
-    @staticmethod
-    def get_related_model(field):
-        model = None
-        if hasattr(field, "related_model") and field.related_model:
-            model = field.related_model
-        elif hasattr(field, "rel") and field.rel:
-            model = field.rel.to
-        return model
-
-    @classmethod
-    def get_base_cache_key(cls, model):
-        return (
-            f"dask_{model._meta.app_label}_{cls.get_model_name(model)}_%s_rendering"
-        )
-
-    @classmethod
-    def replace_pk(cls, model):
-        base_cache_key = cls.get_base_cache_key(model)
-
-        def get_cache_key_from_pk(pk):
-            return None if pk is None else base_cache_key % str(pk)
-
-        def inner(pk_series):
-            pk_series = pk_series.astype(object).where(pk_series.notnull(), None)
-            cache_keys = pk_series.apply(get_cache_key_from_pk, convert_dtype=False)
-            unique_cache_keys = list(filter(None, cache_keys.unique()))
-            if not unique_cache_keys:
-                return pk_series
-
-            out_dict = cache.get_many(unique_cache_keys)
-            if len(out_dict) < len(unique_cache_keys):
-                out_dict = dict(
-                    [
-                        (base_cache_key % obj.pk, force_text(obj))
-                        for obj in model.objects.filter(
-                            pk__in=list(filter(None, pk_series.unique()))
-                        )
-                    ]
-                )
-                cache.set_many(out_dict)
-            return list(map(out_dict.get, cache_keys))
-
-        return inner
-
-    @staticmethod
-    def replace_from_choices(choices):
-        def inner(values):
-            return [choices.get(v, v) for v in values]
-
-        return inner
-
-    @classmethod
-    def build_update_functions(cls, fieldnames, fields):
-        for fieldname, field in zip(fieldnames, fields):
-            if not isinstance(field, Field):
-                yield fieldname, None
-            else:
-                if field.choices:
-                    choices = dict([(k, force_text(v)) for k, v in field.flatchoices])
-                    yield fieldname, cls.replace_from_choices(choices)
-                elif field.get_internal_type() == "ForeignKey":
-                    yield fieldname, cls.replace_pk(cls.get_related_model(field))
-
-    @classmethod
-    def update_with_verbose(cls, df, fieldnames, fields):
-        for fieldname, function in cls.build_update_functions(fieldnames, fields):
-            if function is not None:
-                df[fieldname] = df[fieldname].map_partitions(lambda x: function(x))
-
-    @staticmethod
-    def infer_dtypes_from_django(qs):
-        """Infers Dask data types based on Django queryset model fields, with support for nullable integers."""
-        django_to_dask_dtype = {
-            'AutoField': 'Int64',  # Use nullable integer
-            'BigAutoField': 'Int64',
-            'BigIntegerField': 'Int64',
-            'BooleanField': 'bool',
-            'CharField': 'object',
-            'DateField': 'datetime64[ns]',
-            'DateTimeField': 'datetime64[ns]',
-            'DecimalField': 'float64',
-            'FloatField': 'float64',
-            'IntegerField': 'Int64',  # Use nullable integer
-            'PositiveIntegerField': 'Int64',
-            'SmallIntegerField': 'Int64',
-            'TextField': 'object',
-            'TimeField': 'object',
-            'UUIDField': 'object',
-            'ForeignKey': 'Int64',  # Use nullable integer for FK fields
-        }
-
-        dtypes = {}
-        # Handle model fields
-        for field in qs.model._meta.get_fields():
-            # Skip reverse relationships and non-concrete fields
-            if not getattr(field, 'concrete', False):
-                continue
-
-            # Check for AutoField or BigAutoField explicitly
-            if isinstance(field, (models.AutoField, models.BigAutoField)):
-                dtypes[field.name] = 'Int64'  # Nullable integer for autoincremented fields
-            else:
-                # Use field type to infer dtype
-                field_type = field.get_internal_type()
-                dtypes[field.name] = django_to_dask_dtype.get(field_type, 'object')
-
-        # Handle annotated fields
-        for annotation_name, annotation in qs.query.annotation_select.items():
-            if hasattr(annotation, 'output_field'):
-                field_type = annotation.output_field.get_internal_type()
-                dtype = django_to_dask_dtype.get(field_type, 'object')
-            else:
-                dtype = 'object'  # Default to object for untyped annotations
-            dtypes[annotation_name] = dtype
-
-        return dtypes
-
-    def read_frame(self, fillna_value=None):
-        qs = self.qs
-        fieldnames = tuple(qs.model._meta.get_fields())
-        dtypes = self.infer_dtypes_from_django(qs)
-        chunk_size = self.chunk_size
-        verbose = self.verbose
-
-        # Use values to directly fetch required fields
-        qs = qs.values(*fieldnames)
-
-        # Create partitions for Dask
-        partitions = []
-        iterator = qs.iterator(chunk_size=chunk_size)
-        for chunk in itertools.islice(iterator, chunk_size):
-            df = pd.DataFrame.from_records(chunk, columns=fieldnames)
-
-            # Handle NaN values
-            if fillna_value:
-                df = df.fillna(fillna_value)
-
-            # Optimize timezone conversions
-            for col in df.columns:
-                if isinstance(df[col].dtype, pd.DatetimeTZDtype):
-                    df[col] = df[col].dt.tz_localize(None)
-
-            # Optimize dtype conversion
-            df = df.convert_dtypes()
-
-            # Convert to Dask DataFrame
-            partitions.append(dd.from_pandas(df, npartitions=1))
-
-        # Combine all partitions
-        dask_df = dd.concat(partitions, axis=0, ignore_index=True)
-
-        # Apply verbose updates
-        if verbose:
-            self.update_with_verbose(dask_df, fieldnames, qs.model._meta.fields)
-
-        return dask_df
```
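For orientation, the removed `ReadFrameDask` streamed a Django queryset into a Dask DataFrame; the sibling `_io_dask.py` module (unchanged above) remains in the package. A hypothetical usage sketch, where `myapp.models.Order` is an assumed Django model, not part of sibi-dst:

```python
# Hypothetical usage of the removed helper; requires a configured Django
# project, and `Order` is an assumed model.
from myapp.models import Order

reader = ReadFrameDask(Order.objects.all(), chunk_size=500)
ddf = reader.read_frame(fillna_value=0)  # Dask DataFrame of the queryset
print(ddf.head())
```

Note that `read_frame` iterated `itertools.islice(iterator, chunk_size)`, so each `chunk` was a single row and at most `chunk_size` rows were ever read, which may explain why this alternate implementation was dropped.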
sibi_dst-0.3.22/sibi_dst/df_helper/backends/sql_model/_sqlmodel_db_connection.py
DELETED

```diff
@@ -1,134 +0,0 @@
-import datetime
-from typing import Any, Optional, Dict, Type
-
-from pydantic import BaseModel, model_validator
-from sqlalchemy import inspect
-from sqlalchemy.exc import OperationalError
-from sqlalchemy.sql import text
-from sqlalchemy.sql.sqltypes import (
-    Integer,
-    String,
-    Float,
-    Boolean,
-    DateTime,
-    Date,
-    Time,
-    Numeric,
-)
-from sqlmodel import SQLModel, Field, create_engine
-
-
-class SQLModelConnectionConfig(BaseModel):
-    live: bool = False
-    connection_url: str
-    table: Optional[str] = None
-    model: Optional[Any] = None
-    engine: Optional[Any] = None  # Save engine to reuse it
-
-    class Config:
-        arbitrary_types_allowed = True
-
-    @model_validator(mode="after")
-    def validate_and_initialize(self):
-        """
-        Validate connection parameters, initialize the engine, and build the dynamic model if necessary.
-        """
-        # Validate `connection_url`
-        if not self.connection_url:
-            raise ValueError("`connection_url` must be provided.")
-
-        # Initialize the engine
-        self.engine = create_engine(self.connection_url)
-
-        # Validate the connection
-        self.validate_connection()
-
-        # If table is provided, set `live=False`
-        if self.table:
-            self.live = False
-
-        # If model is not provided, build dynamically
-        if not self.model:
-            if not self.table:
-                raise ValueError("`table_name` must be provided to build the model.")
-            try:
-                self.model = self.build_model()
-            except Exception as e:
-                raise ValueError(f"Failed to build model for table '{self.table}': {e}")
-        else:
-            self.live = True
-
-        return self
-
-    def validate_connection(self):
-        """
-        Test the database connection by executing a simple query.
-        """
-        try:
-            with self.engine.connect() as connection:
-                connection.execute(text("SELECT 1"))
-        except OperationalError as e:
-            raise ValueError(f"Failed to connect to the database: {e}")
-
-    def build_model(self) -> Type[SQLModel]:
-        """
-        Dynamically build a SQLModel class based on the table schema.
-        """
-        inspector = inspect(self.engine)
-
-        # Validate table existence
-        if self.table not in inspector.get_table_names():
-            raise ValueError(f"Table '{self.table}' does not exist in the database.")
-
-        columns = inspector.get_columns(self.table)
-        if not columns:
-            raise ValueError(f"No columns found for table '{self.table}'.")
-
-        type_mapping = {
-            Integer: int,
-            String: str,
-            Float: float,
-            Boolean: bool,
-            DateTime: datetime.datetime,
-            Date: datetime.date,
-            Time: datetime.time,
-            Numeric: float,
-        }
-
-        annotations: Dict[str, Type] = {}
-        model_fields = {}
-
-        for column in columns:
-            name = column["name"]
-            sa_type = column["type"]
-            nullable = column["nullable"]
-            default = column.get("default", None)
-            primary_key = column.get("primary_key", False)
-
-            py_type = None
-            for sa_base_type, py_base_type in type_mapping.items():
-                if isinstance(sa_type, sa_base_type):
-                    py_type = py_base_type
-                    break
-
-            if py_type is None:
-                raise ValueError(f"Unsupported SQLAlchemy type for column '{name}': {sa_type}")
-
-            # Define field type and attributes
-            annotations[name] = py_type
-            model_fields[name] = Field(
-                default=default,
-                nullable=nullable,
-                primary_key=primary_key,
-                sa_column_args={"type_": sa_type},
-            )
-
-        model_fields["__annotations__"] = annotations
-        model_fields["__table__"] = self.table
-        model_name = self._table2model(self.table)
-        return type(model_name, (SQLModel,), model_fields)
-
-    @staticmethod
-    def _table2model(table_name: str) -> str:
-        """Convert table name to PascalCase model name."""
-        return "".join(word.capitalize() for word in table_name.split("_"))
```
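For orientation, the removed `SQLModelConnectionConfig` connected at validation time, probed the database with `SELECT 1`, and reflected the named table into a dynamically built SQLModel class. A hypothetical sketch; the SQLite URL and `order_items` table are assumptions:

```python
# Hypothetical usage of the removed config; URL and table are assumptions.
config = SQLModelConnectionConfig(
    connection_url="sqlite:///example.db",
    table="order_items",
)
print(config.model.__name__)  # dynamically built class, e.g. "OrderItems"
```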
sibi_dst-0.3.22/sibi_dst/df_helper/backends/sql_model/_sqlmodel_load_from_db.py
DELETED

```diff
@@ -1,101 +0,0 @@
-import logging
-from typing import Any, Dict, Optional
-
-import dask.dataframe as dd
-import pandas as pd
-from sqlmodel import Session, select, text
-
-
-class SQLModelLoadFromDb:
-    df: dd.DataFrame
-
-    def __init__(
-        self,
-        db_connection,
-        db_query: Optional[Dict[str, Any]] = None,
-        db_params: Optional[Dict[str, Any]] = None,
-        logger=None,
-        **kwargs,
-    ):
-        """
-        Initialize the loader with database connection, query, and parameters.
-        """
-        self.db_connection = db_connection
-        self.table_name = self.db_connection.table
-        self.model = self.db_connection.model
-        self.engine = self.db_connection.engine
-        self.logger = logger or self._default_logger()
-        self.query_config = db_query or {}
-        self.params_config = db_params or {}
-        self.debug = kwargs.pop("debug", False)
-
-    def _default_logger(self):
-        """Create a default logger."""
-        logging.basicConfig(level=logging.INFO)
-        return logging.getLogger("SQLModelLoadFromDb")
-
-    def build_and_load(self) -> dd.DataFrame:
-        """
-        Load data into a Dask DataFrame based on the query and parameters.
-        """
-        self.df = self._build_and_load()
-        if not self.df.empty:
-            self._process_loaded_data()
-        return self.df
-
-    def _build_and_load(self) -> dd.DataFrame:
-        """
-        Query the database and load results into a Dask DataFrame.
-        """
-        print(self.model.__name__)
-        with Session(self.engine) as session:
-            try:
-                query = select(text(self.model.__table__))
-                print("query:", query)
-
-                # Apply filters if provided
-                filters = self.params_config.df_params.get("filters")
-                if filters:
-                    # Apply ORM filters (simple equality conditions)
-                    for column_name, value in filters.items():
-                        column = getattr(self.model, column_name, None)
-                        if column is not None:
-                            query = query.filter(column == value)
-                        else:
-                            self.logger.warning(f"Filter column '{column_name}' not found in model.")
-
-                # Apply limit if provided in query_config
-                n_records = self.query_config.n_records
-                if n_records:
-                    query = query.limit(n_records)
-
-                # Debug: Log the SQL query
-                self.logger.debug(f"Executing query: {str(query)}")
-
-                # Execute the query
-                results = session.exec(query).fetchall()
-
-                # Convert query results to a Dask DataFrame
-                print("results:", results)
-                if results:
-                    df = dd.from_pandas(pd.DataFrame([r.dict() for r in results]), npartitions=1)
-                else:
-                    self.logger.debug("Query returned no results.")
-                    df = dd.from_pandas(pd.DataFrame(), npartitions=1)
-
-            except Exception as e:
-                print(e)
-                self.logger.error(f"Error loading data: {e}")
-                df = dd.from_pandas(pd.DataFrame(), npartitions=1)
-
-        return df
-
-    def _process_loaded_data(self):
-        """
-        Process and clean the loaded data.
-        """
-        field_map = self.params_config.get("field_map", {})
-        if field_map:
-            rename_mapping = {k: v for k, v in field_map.items() if k in self.df.columns}
-            if rename_mapping:
-                self.df = self.df.rename(columns=rename_mapping, meta={v: "object" for v in rename_mapping.values()})
```
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{sibi_dst-0.3.22 → sibi_dst-0.3.24}/sibi_dst/df_helper/backends/django/_django_db_connection.py
RENAMED
File without changes
|
{sibi_dst-0.3.22 → sibi_dst-0.3.24}/sibi_dst/df_helper/backends/django/_django_load_from_db.py
RENAMED
File without changes
|
{sibi_dst-0.3.22 → sibi_dst-0.3.24}/sibi_dst/df_helper/backends/django/_django_sql_model_builder.py
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{sibi_dst-0.3.22 → sibi_dst-0.3.24}/sibi_dst/df_helper/backends/parquet/_parquet_filter_handler.py
RENAMED
File without changes
|
File without changes
|
File without changes
|
{sibi_dst-0.3.22 → sibi_dst-0.3.24}/sibi_dst/df_helper/backends/sql_alchemy/_io_sqlalchemy_dask.py
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|