sibi-dst 0.3.10__py3-none-any.whl → 0.3.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sibi_dst/df_helper/_df_helper.py +4 -0
- sibi_dst/df_helper/core/__init__.py +9 -2
- sibi_dst/df_helper/core/_defaults.py +99 -3
- sibi_dst/df_helper/plugins/django/_django_load_from_db.py +60 -9
- sibi_dst/df_helper/plugins/sql_alchemy/_io_sqlalchemy_dask.py +118 -128
- sibi_dst/df_helper/plugins/sql_alchemy/_sqlachemy_filter_handler.py +2 -0
- sibi_dst/df_helper/plugins/sql_alchemy/_sqlalchemy_load_from_db.py +28 -52
- sibi_dst/utils/_data_utils.py +27 -28
- {sibi_dst-0.3.10.dist-info → sibi_dst-0.3.11.dist-info}/METADATA +2 -2
- {sibi_dst-0.3.10.dist-info → sibi_dst-0.3.11.dist-info}/RECORD +11 -11
- {sibi_dst-0.3.10.dist-info → sibi_dst-0.3.11.dist-info}/WHEEL +0 -0
sibi_dst/df_helper/_df_helper.py
CHANGED
@@ -101,6 +101,8 @@ class DfHelper:
 
     def _load_from_sqlalchemy(self, **options):
         try:
+            options.setdefault("debug", self.debug)
+            options.setdefault("verbose_debug", self.verbose_debug)
             db_loader = SqlAlchemyLoadFromDb(
                 self.plugin_sqlalchemy,
                 self.plugin_query,
@@ -120,6 +122,8 @@ class DfHelper:
 
     def _load_from_db(self, **options) -> Union[pd.DataFrame, dd.DataFrame]:
         try:
+            options.setdefault("debug", self.debug)
+            options.setdefault("verbose_debug", self.verbose_debug)
             db_loader = DjangoLoadFromDb(
                 self.plugin_django_connection,
                 self.plugin_query,
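The effect of the two new setdefault calls: options passed explicitly by the caller win, and the helper's own debug flags only fill the gaps. A minimal standalone sketch of that dict semantics (names and values invented for illustration):

    # Sketch of the option-forwarding pattern added above (hypothetical values;
    # only dict.setdefault semantics are demonstrated).
    def load(**options):
        return options

    instance_debug = True
    options = {"verbose_debug": False}           # caller already set this flag
    options.setdefault("debug", instance_debug)  # filled in: caller did not set it
    options.setdefault("verbose_debug", True)    # ignored: caller's value wins
    print(load(**options))  # {'verbose_debug': False, 'debug': True}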
sibi_dst/df_helper/core/__init__.py
CHANGED
@@ -2,10 +2,17 @@ from __future__ import annotations
 
 from ._params_config import ParamsConfig
 from ._query_config import QueryConfig
-from ._defaults import
+from ._defaults import (
+    django_field_conversion_map_pandas,
+    django_field_conversion_map_dask,
+    sqlalchemy_field_conversion_map_dask,
+    normalize_sqlalchemy_type)
 
 __all__ = [
     "ParamsConfig",
     "QueryConfig",
-    "
+    "django_field_conversion_map_pandas",
+    "django_field_conversion_map_dask",
+    "sqlalchemy_field_conversion_map_dask",
+    "normalize_sqlalchemy_type"
 ]
sibi_dst/df_helper/core/_defaults.py
CHANGED
@@ -1,10 +1,12 @@
 # Copyright (c) 2023. ISTMO Center S.A. All Rights Reserved
 #
 import json
-from
-from typing import Dict, Union, Optional
+from typing import Dict
 
 import pandas as pd
+from sqlalchemy import String, Text, Integer, BigInteger, SmallInteger, Float, Boolean, DateTime, Date, Time, JSON, \
+    Numeric, UUID
+from sqlalchemy.dialects.mysql import TINYINT, MEDIUMTEXT
 
 # This is the defaults configuration file for the df_helper module.
 
@@ -13,11 +15,12 @@ import pandas as pd
 # conversion. This mapping is used to convert the values in a pandas DataFrame to the appropriate data types based on
 # the Django field type.
 
-django_field_conversion_map: Dict[str, callable] = {
+django_field_conversion_map_pandas: Dict[str, callable] = {
     "CharField": lambda x: x.astype(str),
     "TextField": lambda x: x.astype(str),
     "IntegerField": lambda x: pd.to_numeric(x, errors="coerce"),
     "AutoField": lambda x: pd.to_numeric(x, errors="coerce"),
+    "BigAutoField": lambda x: pd.to_numeric(x, errors="coerce"),
     "BigIntegerField": lambda x: pd.to_numeric(x, errors="coerce"),
     "SmallIntegerField": lambda x: pd.to_numeric(x, errors="coerce"),
     "PositiveIntegerField": lambda x: pd.to_numeric(x, errors="coerce"),
@@ -35,3 +38,96 @@ django_field_conversion_map: Dict[str, callable] = {
     "ArrayField": lambda x: x.apply(eval),
     "UUIDField": lambda x: x.astype(str),
 }
+
+django_field_conversion_map_dask: Dict[str, callable] = {
+    "CharField": lambda x: x.astype(str),
+    "TextField": lambda x: x.astype(str),
+    "IntegerField": lambda x: pd.to_numeric(x, errors="coerce"),
+    "AutoField": lambda x: pd.to_numeric(x, errors="coerce"),
+    "BigAutoField": lambda x: pd.to_numeric(x, errors="coerce"),
+    "BigIntegerField": lambda x: pd.to_numeric(x, errors="coerce"),
+    "SmallIntegerField": lambda x: pd.to_numeric(x, errors="coerce"),
+    "PositiveIntegerField": lambda x: pd.to_numeric(x, errors="coerce"),
+    "PositiveSmallIntegerField": lambda x: pd.to_numeric(x, errors="coerce"),
+    "FloatField": lambda x: pd.to_numeric(x, errors="coerce"),
+    "DecimalField": lambda x: pd.to_numeric(x, errors="coerce"),
+    "BooleanField": lambda x: x.astype(bool),
+    "NullBooleanField": lambda x: x.astype(bool),
+    "DateTimeField": lambda x: pd.to_datetime(x, errors="coerce"),
+    "DateField": lambda x: pd.to_datetime(x, errors="coerce").map_partitions(lambda x: x.dt.date, meta=("date", "object")),
+    "TimeField": lambda x: pd.to_datetime(x, errors="coerce").map_partitions(lambda x: x.dt.time, meta=("time", "object")),
+    "DurationField": lambda x: pd.to_timedelta(x, errors="coerce"),
+    "JSONField": lambda x: x.map_partitions(lambda s: s.apply(json.loads), meta=("json", "object")),
+    "ArrayField": lambda x: x.map_partitions(lambda s: s.apply(eval), meta=("array", "object")),
+    "UUIDField": lambda x: x.astype(str),
+}
+
+sqlalchemy_field_conversion_map_dask: Dict[str, callable] = {
+    String.__name__: lambda x: x.astype(str).fillna(""),
+    Text.__name__: lambda x: x.fillna('').astype(str),
+    Integer.__name__: lambda x: x.fillna(0).astype(int),
+    BigInteger.__name__: lambda x: pd.to_numeric(x, errors="coerce"),
+    SmallInteger.__name__: lambda x: pd.to_numeric(x, errors="coerce"),
+    Float.__name__: lambda x: pd.to_numeric(x, errors="coerce"),
+    Numeric.__name__: lambda x: pd.to_numeric(x, errors="coerce"),
+    Boolean.__name__: lambda x: x.astype(bool),
+    DateTime.__name__: lambda x: pd.to_datetime(x, errors="coerce"),
+    Date.__name__: lambda x: pd.to_datetime(x, errors="coerce").map_partitions(lambda x: x.dt.date, meta=("date", "object")),
+    Time.__name__: lambda x: pd.to_datetime(x, errors="coerce").map_partitions(lambda x: x.dt.time, meta=("time", "object")),
+    JSON.__name__: lambda x: x.map_partitions(lambda s: s.apply(json.loads), meta=("json", "object")),
+    UUID.__name__: lambda x: x.astype(str),
+}
+
+# Conversion map with normalized SQLAlchemy field types
+# sqlalchemy_field_conversion_map_dask: Dict[str, callable] = {
+#     "String": lambda x: x.map_partitions(lambda s: s.astype(str), meta=("string", "string")),
+#     "Text": lambda x: x.map_partitions(lambda s: s.astype(str), meta=("text", "string")),
+#     "Integer": lambda x: pd.to_numeric(x, errors="coerce"),
+#     "SmallInteger": lambda x: pd.to_numeric(x, errors="coerce"),
+#     "BigInteger": lambda x: pd.to_numeric(x, errors="coerce"),
+#     "Float": lambda x: pd.to_numeric(x, errors="coerce"),
+#     "Numeric": lambda x: pd.to_numeric(x, errors="coerce"),
+#     "Boolean": lambda x: x.map_partitions(lambda s: s.fillna(False).astype(bool), meta=("boolean", "bool")),
+#     "DateTime": lambda x: pd.to_datetime(x, errors="coerce"),
+#     "Date": lambda x: pd.to_datetime(x, errors="coerce").map_partitions(lambda s: s.dt.date, meta=("date", "object")),
+#     "Time": lambda x: pd.to_datetime(x, errors="coerce").map_partitions(lambda s: s.dt.time, meta=("time", "object")),
+#     "JSON": lambda x: x.map_partitions(lambda s: s.apply(json.loads), meta=("json", "object")),
+# }
+
+
+def normalize_sqlalchemy_type(field_type):
+    """
+    Normalize SQLAlchemy field types to generic type names.
+    Handles dialect-specific types (e.g., MySQL).
+    """
+    # Map of generic SQLAlchemy types
+    type_mapping = {
+        String: "String",
+        Text: "Text",
+        Integer: "Integer",
+        SmallInteger: "SmallInteger",
+        BigInteger: "BigInteger",
+        Float: "Float",
+        Numeric: "Numeric",
+        Boolean: "Boolean",
+        DateTime: "DateTime",
+        Date: "Date",
+        Time: "Time",
+        JSON: "JSON",
+    }
+
+    # Dialect-specific types
+    dialect_mapping = {
+        TINYINT: "SmallInteger",
+        MEDIUMTEXT: "Text",
+    }
+
+    # Check if the field matches a generic or dialect-specific type
+    for sql_type, name in {**type_mapping, **dialect_mapping}.items():
+        if isinstance(field_type, sql_type):
+            return name
+
+    # Fallback to raw class name
+    return field_type.__class__.__name__
+
+
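A short sketch of how the two new _defaults helpers compose, using the re-exports added to core/__init__.py above (the sample series is invented; note that the String converter runs astype(str) before fillna, so missing values arrive as the string 'None'):

    # Sketch: map a SQLAlchemy column type to a generic name, then look up
    # the matching Dask converter (both helpers are from the diff above).
    import dask.dataframe as dd
    import pandas as pd
    from sqlalchemy import String

    from sibi_dst.df_helper.core import (
        normalize_sqlalchemy_type,
        sqlalchemy_field_conversion_map_dask,
    )

    name = normalize_sqlalchemy_type(String(length=50))  # -> "String"
    converter = sqlalchemy_field_conversion_map_dask[name]

    s = dd.from_pandas(pd.Series(["a", None, "b"]), npartitions=1)
    print(converter(s).compute().tolist())  # ['a', 'None', 'b']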
sibi_dst/df_helper/plugins/django/_django_load_from_db.py
CHANGED
@@ -4,6 +4,7 @@ from django.db.models import Q
 
 from sibi_dst.df_helper.plugins.django import ReadFrameDask
 from sibi_dst.utils import Logger
+from sibi_dst.df_helper.core import django_field_conversion_map_dask
 
 class DjangoLoadFromDb:
     df: dd.DataFrame
@@ -26,10 +27,10 @@ class DjangoLoadFromDb:
 
     def build_and_load(self):
         self.df = self._build_and_load()
-
-        self._process_loaded_data()
+        #self.df = self._convert_columns(self.df)
         return self.df
 
+
     def _build_and_load(self) -> dd.DataFrame:
         query = self.connection_config.model.objects.using(self.connection_config.connection_name)
         if not self.params_config.filters:
@@ -61,10 +62,60 @@
             q_objects.add(~Q(**{key: value}), Q.AND)
         return q_objects
 
-    def
-
-
-
-
-
+    def _convert_columns(self, df: dd.DataFrame) -> dd.DataFrame:
+        """
+        Convert the data types of columns in a Dask DataFrame based on the field type in the Django model.
+
+        :param df: Dask DataFrame whose columns' data types are to be converted.
+        :return: Dask DataFrame with converted column data types.
+        """
+
+        def log_debug(message: str, is_verbose: bool = False):
+            """Helper to handle debug and verbose debug logging."""
+            if self.debug:
+                self.logger.debug(message)
+            if is_verbose and self.verbose_debug:
+                print(message)
+
+        if self.debug:
+            self.logger.info(f'Converting columns: {list(df.columns)}')
+
+        # Get field information from the Django model
+        model_fields = self.connection_config.model._meta.get_fields()
+        field_type_map = {field.name: type(field).__name__ for field in model_fields}
+        # Simplified loop to apply conversions partition-wise
+        for field_name, field_type in field_type_map.items():
+            if field_name not in df.columns:
+
+                log_debug(f"Column '{field_name}' not found in DataFrame columns.")
+                continue
+
+            conversion_func = django_field_conversion_map_dask.get(field_type)
+            if not conversion_func:
+                message=f"Field type '{field_type}' not found in conversion_map."
+                log_debug(message, is_verbose=True)
+                continue
+
+            def apply_conversion(partition):
+                """
+                Apply the conversion function to a single partition for the given column.
+                """
+                try:
+                    if field_name in partition.columns:
+                        partition[field_name] = conversion_func(partition[field_name])
+                except Exception as e:
+                    self.logger.error(f"Error converting column '{field_name}' in partition: {str(e)}")
+                return partition
+
+            try:
+                # Apply conversion lazily to each partition
+                df = df.map_partitions(
+                    apply_conversion,
+                    meta=df,
+                )
+                log_debug(f"Successfully queued conversion for column '{field_name}' to type '{field_type}'.",
+                          is_verbose=True)
+            except Exception as e:
+                log_debug(f"Failed to queue conversion for column '{field_name}': {str(e)}", is_verbose=True)
+
+        return df
sibi_dst/df_helper/plugins/sql_alchemy/_io_sqlalchemy_dask.py
CHANGED
@@ -1,143 +1,133 @@
 import itertools
+
 import dask.dataframe as dd
 import pandas as pd
-from sqlalchemy
-from sqlalchemy.
-
-
-
-
-
-
-        fieldnames=None,
-        index_col=None,
-        coerce_float=False,
-        verbose=True,
-        datetime_index=False,
-        column_names=None,
-        chunk_size=1000,
-    ):
+from sqlalchemy import create_engine, inspect, select
+from sqlalchemy.orm import sessionmaker
+
+from sibi_dst.df_helper.plugins.sql_alchemy._sqlachemy_filter_handler import SqlAlchemyFilterHandler
+from sibi_dst.utils import Logger
+
+
+class SQLAlchemyDask:
+    def __init__(self, model, filters, engine_url, chunk_size=1000, logger=None, verbose=True):
         """
-        Initialize
-
-
-
-
-
-
-            verbose: Whether to include verbose processing like handling choices.
-            datetime_index: Whether to convert the index to a datetime index.
-            column_names: Optional mapping of fieldnames to custom column names.
-            chunk_size: Number of records to fetch in each chunk.
+        Initialize with an SQLAlchemy query and database engine URL.
+
+        :param model: SQLAlchemy ORM model.
+        :param filters: Filters to apply on the query.
+        :param engine_url: Database connection string for SQLAlchemy engine.
+        :param chunk_size: Number of records per chunk for Dask partitions.
+        :param logger: Logger instance for logging.
+        :param verbose: Whether to print detailed logs.
         """
-        self.query =
-        self.
-        self.
-        self.index_col = index_col
-        self.coerce_float = coerce_float
-        self.verbose = verbose
-        self.datetime_index = datetime_index
-        self.column_names = column_names
+        self.query = None
+        self.model = model
+        self.filters = filters
         self.chunk_size = chunk_size
+        self.verbose = verbose
+        self.engine = create_engine(engine_url)
+        self.Session = sessionmaker(bind=self.engine)
+        self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
 
     @staticmethod
-    def
+    def infer_dtypes_from_model(model):
         """
-
-
-        Args:
-            row: SQLAlchemy ORM object, Row object, or tuple.
-            fields: List of fields to extract.
-
-        Returns:
-            A dictionary representation of the row.
+        Infer data types for Dask DataFrame based on SQLAlchemy ORM model columns.
         """
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        mapper = inspect(model)
+        sqlalchemy_to_dask_dtype = {
+            #'INTEGER': pd.to_numeric(x, errors="coerce"),
+            'INTEGER': 'Int64',
+            'SMALLINT': 'Int64',
+            'BIGINT': 'Int64',
+            'FLOAT': 'float64',
+            'NUMERIC': 'float64',
+            'BOOLEAN': 'bool',
+            'VARCHAR': 'object',
+            'TEXT': 'object',
+            'DATE': 'datetime64[ns]',
+            'DATETIME': 'datetime64[ns]',
+            'TIME': 'object',
+            'UUID': 'object',
+        }
+
+        dtypes = {}
+        for column in mapper.columns:
+            dtype = sqlalchemy_to_dask_dtype.get(str(column.type).upper(), 'object')
+            dtypes[column.name] = dtype
+
+        return dtypes
 
     def read_frame(self, fillna_value=None):
         """
-
-
-        Args:
-            fillna_value: Value to use for filling missing values.
+        Load data from an SQLAlchemy query into a Dask DataFrame.
 
-
-
+        :param fillna_value: Value to replace NaN or NULL values with, if any.
+        :return: Dask DataFrame.
         """
-
-
-
-        self.
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        with self.Session() as session:
+            try:
+                # Build query
+                self.query = select(self.model)
+                if self.filters:
+                    self.query = SqlAlchemyFilterHandler.apply_filters_sqlalchemy(self.query, self.model, self.filters)
+                else:
+                    n_records = 100
+                    self.query = self.query.limit(n_records)
+
+                # Infer dtypes
+                dtypes = self.infer_dtypes_from_model(self.model)
+                # Get the column order from the SQLAlchemy model
+                ordered_columns = [column.name for column in self.model.__table__.columns]
+
+                # Execute query and fetch results in chunks
+                result_proxy = session.execute(self.query)
+                results = result_proxy.scalars().all()  # Fetch all rows
+                iterator = iter(results)
+
+                partitions = []
+
+                while True:
+                    chunk = list(itertools.islice(iterator, self.chunk_size))
+                    if not chunk:
+                        break
+
+                    # Convert chunk to Pandas DataFrame
+                    df = pd.DataFrame.from_records(
+                        [row._asdict() if hasattr(row, '_asdict') else row.__dict__ for row in chunk]
+                    )
+                    # Drop internal SQLAlchemy state if it exists
+                    df = df.loc[:, ~df.columns.str.contains('_sa_instance_state')]
+
+                    # Reorder columns to match the model's order
+                    df = df[ordered_columns]
+
+                    # Fill NaN values
+                    if fillna_value is not None:
+                        df = df.fillna(fillna_value)
+
+                    # Convert timezone-aware columns to naive
+                    for col in df.columns:
+                        if isinstance(df[col].dtype, pd.DatetimeTZDtype):
+                            df[col] = df[col].dt.tz_localize(None)
+
+                    # Apply inferred dtypes
+                    df = df.astype(dtypes)
+                    # Create a Dask partition
+                    partitions.append(dd.from_pandas(df, npartitions=1))
+
+                # Concatenate all partitions
+                # print(partitions)
+                if partitions:
+                    dask_df = dd.concat(partitions, axis=0, ignore_index=True)
+                else:
+                    dask_df = dd.from_pandas(pd.DataFrame(columns=ordered_columns), npartitions=1)
+
+                if self.verbose:
+                    self.logger.info(f"Loaded {len(dask_df)} rows into Dask DataFrame.")
+                return dask_df
+
+            except Exception as e:
+                self.logger.error(f"Error executing query: {str(e)}")
+                return dd.from_pandas(pd.DataFrame(columns=ordered_columns), npartitions=1)
sibi_dst/df_helper/plugins/sql_alchemy/_sqlachemy_filter_handler.py
CHANGED
@@ -50,6 +50,7 @@ class SqlAlchemyFilterHandler:
                 return [datetime.date.fromisoformat(v) for v in value]
             return value
 
+
         def handle_date_operator(column, date_op):
             """
             Handle filtering on specific datetime parts (e.g., year, month).
@@ -90,6 +91,7 @@ class SqlAlchemyFilterHandler:
 
         # Get the column from the model
         column = getattr(model, field_name, None)
+        #column = model.__table__.columns.get(field_name)
        if not column:
            raise AttributeError(f"Field '{field_name}' not found in model '{model.__name__}'")
 
sibi_dst/df_helper/plugins/sql_alchemy/_sqlalchemy_load_from_db.py
CHANGED
@@ -1,23 +1,30 @@
+from typing import Dict
+
 import dask.dataframe as dd
-from sqlmodel import Session, select
-from typing import Any, Dict, Optional
-import logging
 import pandas as pd
+from sqlalchemy.inspection import inspect
+from sqlalchemy.orm import sessionmaker
+from sqlalchemy import select
+#from sqlmodel import Session, select
+
+from sibi_dst.df_helper.core import ParamsConfig, QueryConfig, sqlalchemy_field_conversion_map_dask, \
+    normalize_sqlalchemy_type
 from sibi_dst.utils import Logger
-from .
-from sibi_dst.df_helper.core import ParamsConfig, QueryConfig
+from ._io_sqlalchemy_dask import SQLAlchemyDask
 from ._sqlachemy_filter_handler import SqlAlchemyFilterHandler
+from ._sqlalchemy_db_connection import SqlAlchemyConnectionConfig
+
 
 class SqlAlchemyLoadFromDb:
     df: dd.DataFrame
 
     def __init__(
-
-
-
-
-
-
+            self,
+            plugin_sqlalchemy: SqlAlchemyConnectionConfig,  # Expected to be an instance of SqlAlchemyConnection
+            plugin_query: QueryConfig = None,
+            plugin_params: ParamsConfig = None,
+            logger: Logger = None,
+            **kwargs,
     ):
         """
         Initialize the loader with database connection, query, and parameters.
@@ -27,7 +34,7 @@ class SqlAlchemyLoadFromDb:
         self.model = self.db_connection.model
         self.engine = self.db_connection.engine
         self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
-        self.query_config =
+        self.query_config = plugin_query
         self.params_config = plugin_params
         self.debug = kwargs.pop("debug", False)
         self.verbose_debug = kwargs.pop("verbose_debug", False)
@@ -40,44 +47,13 @@ class SqlAlchemyLoadFromDb:
         return self.df
 
     def _build_and_load(self) -> dd.DataFrame:
-
-
-
-
-
-
-        filters = self.params_config.filters
-        if filters:
-            n_records = 0
-            query = SqlAlchemyFilterHandler.apply_filters_sqlalchemy(query, self.model,self.params_config.filters)
-        else:
-            n_records = self.query_config.n_records or 100
-
-        if n_records:
-            query = query.limit(n_records)
-
-        # Debug: Log the SQL query
-        if self.debug:
-            self.logger.info(f"Executing query: {str(query)}")
-
-        # Execute the query
-        try:
-            results = session.exec(query).fetchall()
-            if results:
-                records = [
-                    {key: getattr(result, key) for key in result.__table__.columns.keys()}
-                    for result in results
-                ]
-                df = dd.from_pandas(pd.DataFrame(records), npartitions=1)
-        except Exception as e:
-            self.logger.info(results)
-            self.logger.warning("Query returned no results.")
-            df = dd.from_pandas(pd.DataFrame(), npartitions=1)
-
-        except Exception as e:
-            print(query)
-            self.logger.error(f"Error loading data: {e}")
-            df = dd.from_pandas(pd.DataFrame(), npartitions=1)
-
+        try:
+            reader = SQLAlchemyDask(model=self.model, filters=self.params_config.filters, engine_url=self.engine.url, logger=self.logger, chunk_size=1000, verbose=self.debug)
+            df = reader.read_frame()
+            if df is None or len(df.index) == 0:
+                self.logger.warning("Query returned no results.")
+                return dd.from_pandas(pd.DataFrame(), npartitions=1)
             return df
-
+        except Exception as e:
+            self.logger.error(f"Failed to load data into Dask DataFrame.{e}")
+            return dd.from_pandas(pd.DataFrame(), npartitions=1)
sibi_dst/utils/_data_utils.py
CHANGED
@@ -8,7 +8,7 @@ class DataUtils:
         self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
 
     @staticmethod
-    def transform_numeric_columns(df, fill_value=0, transform_func=None):
+    def transform_numeric_columns(df, columns=None, fill_value=0, transform_func=None):
         """
         Transform numeric columns in a DataFrame (Pandas or Dask), handling missing values and applying optional transformations.
 
@@ -21,10 +21,11 @@ class DataUtils:
         Returns:
         - pandas.DataFrame or dask.dataframe.DataFrame: Updated DataFrame with transformed numeric columns.
         """
-
-
+        if columns is None:
+            # Detect numeric columns
+            columns = df.select_dtypes(include=['number']).columns.tolist()
 
-        if not
+        if not columns:
             return df
 
         # Default transformation function (identity) if none is provided
@@ -32,7 +33,7 @@ class DataUtils:
             transform_func = lambda x: x
 
         # Apply transformations
-        for col in
+        for col in columns:
            dtype = df[col].dtype
            if pd.api.types.is_integer_dtype(dtype):
                meta_type = 'int64'
@@ -51,38 +52,36 @@ class DataUtils:
            return df
 
     @staticmethod
-    def transform_boolean_columns(df,
+    def transform_boolean_columns(df, columns=None, sample_size=100):
        """
-
+        Detect if the provided columns in a DataFrame (Pandas or Dask) contain only 0 and 1
+        and convert them to boolean. Detection is performed using a sample.
 
        Parameters:
        - df (pandas.DataFrame or dask.dataframe.DataFrame): The DataFrame.
-        -
+        - columns (list of str): List of columns to check and transform.
+        - sample_size (int): Number of rows to sample for detection. Ignored for Pandas DataFrames.
 
        Returns:
        - pandas.DataFrame or dask.dataframe.DataFrame: Updated DataFrame with transformed boolean columns.
        """
+        # Apply transformation to each specified column
+        for col in columns:
+            if col in df.columns:
+                if isinstance(df, dd.DataFrame):
+                    # Replace NaN with 0, then convert to boolean
+                    df[col] = df[col].map_partitions(
+                        lambda s: pd.to_numeric(s, errors='coerce')  # Convert to numeric, invalid to NaN
+                        .fillna(0)  # Replace NaN with 0
+                        .astype(int)  # Ensure integer type
+                        .astype(bool),  # Convert to boolean
+                        meta=(col, 'bool')
+                    )
+                else:
+                    # For Pandas DataFrame, handle mixed types and invalid values
+                    df[col] = pd.to_numeric(df[col], errors='coerce')  # Convert to numeric, invalid to NaN
+                    df[col] = df[col].fillna(0).astype(int).astype(bool)
 
-        def is_boolean_like(col):
-            """
-            Check if a column is boolean-like (contains only two unique values).
-            """
-            unique_values = col.dropna().unique()
-            if isinstance(col, dd.Series):
-                unique_values = unique_values.compute()
-            return len(unique_values) <= 2 and set(unique_values).issubset({0, 1, True, False})
-
-        # Detect boolean-like columns
-        boolean_columns = [col for col in df.columns if is_boolean_like(df[col])]
-
-        # Apply transformation to each detected column
-        for col in boolean_columns:
-            if isinstance(df, dd.DataFrame):
-                df[col] = df[col].map_partitions(
-                    lambda s: s == threshold, meta=(col, 'bool')
-                )
-            else:
-                df[col] = df[col] == threshold
        return df
 
     def merge_lookup_data(self, classname, df, **kwargs):
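A quick sketch of the reworked boolean helper on a plain pandas frame (values invented; assumes DataUtils is exported from sibi_dst.utils alongside Logger):

    # Sketch: explicit column list; mixed/invalid values are coerced, then cast.
    import pandas as pd

    from sibi_dst.utils import DataUtils  # assumed export

    df = pd.DataFrame({"active": ["1", "0", None, "x"]})
    df = DataUtils.transform_boolean_columns(df, columns=["active"])
    print(df["active"].tolist())  # [True, False, False, False]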
{sibi_dst-0.3.10.dist-info → sibi_dst-0.3.11.dist-info}/RECORD
CHANGED
@@ -1,15 +1,15 @@
 sibi_dst/__init__.py,sha256=1KaC0LYTHxjpENq-NXI325WcEYZ8GCBrHGkLoFxEcu0,251
 sibi_dst/df_helper/__init__.py,sha256=JXJBY47G6wOYhzNI646OBl3pSGWIy4282-3qPGYHU7w,167
-sibi_dst/df_helper/_df_helper.py,sha256=
+sibi_dst/df_helper/_df_helper.py,sha256=e-ptCEDYt5dx8byNiA0ca8Eejl1DG1V5pioZUzabEnY,12747
 sibi_dst/df_helper/_parquet_artifact.py,sha256=f5oHwXtsNW6-ONSFsRB0AniVefA0THzP92J-nugp9vo,4973
-sibi_dst/df_helper/core/__init__.py,sha256=
-sibi_dst/df_helper/core/_defaults.py,sha256=
+sibi_dst/df_helper/core/__init__.py,sha256=NSYY_evzq6XEkO06Nz6xLH5KznzRGI44cLbrnN3zHXQ,503
+sibi_dst/df_helper/core/_defaults.py,sha256=pJU-lX7w4nrt0Anx35j08mVr_0oMGn1bTA_iCl_p1qI,6700
 sibi_dst/df_helper/core/_params_config.py,sha256=hO-PddoaGjFebqJFgtn76WwVHcCjzPW3z5i3NyK6mDw,3475
 sibi_dst/df_helper/core/_query_config.py,sha256=HEiyR_fBJjIMum-PSQroY3KaefQ2SpW1w1SQS8oT-NU,489
 sibi_dst/df_helper/plugins/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sibi_dst/df_helper/plugins/django/__init__.py,sha256=VkvYql-PUyCKOfoKx5aGdUAki8X-kULfhnCvgSQzHDQ,311
 sibi_dst/df_helper/plugins/django/_django_db_connection.py,sha256=9fGvXXgqPk_TC7BvaSljxTgNDWoXi_LZxKabEgExznM,1654
-sibi_dst/df_helper/plugins/django/_django_load_from_db.py,sha256=
+sibi_dst/df_helper/plugins/django/_django_load_from_db.py,sha256=NrkBb5LM1A_vo3wAotqj2sVVYIuTfFGrQqIXk3xOoDs,5162
 sibi_dst/df_helper/plugins/django/_django_sql_model_builder.py,sha256=GprCh2c6PFlRBUCir4hh8pmf4Eqb-4OIK6Vz_xXQjMw,14864
 sibi_dst/df_helper/plugins/django/_io_dask.py,sha256=jryDojeA62rB3seRaWWMjsAmekKacK5xctwCQGVklPQ,9063
 sibi_dst/df_helper/plugins/django/_io_dask_alt.py,sha256=zDjLyYxBeL0ffn3yfE_7vqMLMpeEEk2o-zMr66sKkDw,6827
@@ -19,10 +19,10 @@ sibi_dst/df_helper/plugins/parquet/__init__.py,sha256=ClkyIsIh_ovEwqm0dTrkXImbPj
 sibi_dst/df_helper/plugins/parquet/_parquet_filter_handler.py,sha256=45mHID1azAg5PmaYWbuRlghoRd3H2aTLj1XcycfLJo0,3497
 sibi_dst/df_helper/plugins/parquet/_parquet_options.py,sha256=cKyRj0UCby9-iYPPFnlel1H03x8MnAoEv8k1tp7kHXw,4277
 sibi_dst/df_helper/plugins/sql_alchemy/__init__.py,sha256=FHorj40SbHc0OBzQ_ieG6MG-HLbf0tw6I_5eoIjJkOI,369
-sibi_dst/df_helper/plugins/sql_alchemy/_io_sqlalchemy_dask.py,sha256=
-sibi_dst/df_helper/plugins/sql_alchemy/_sqlachemy_filter_handler.py,sha256=
+sibi_dst/df_helper/plugins/sql_alchemy/_io_sqlalchemy_dask.py,sha256=6IjQEREXqTAzSJE95FKfXjRkTlEjRMS4hJ_yMpyKDTg,5223
+sibi_dst/df_helper/plugins/sql_alchemy/_sqlachemy_filter_handler.py,sha256=H8ypUjLKzYYl9BerfJjX_Uv9qBVkBR-wZiQlh3uRQXg,4669
 sibi_dst/df_helper/plugins/sql_alchemy/_sqlalchemy_db_connection.py,sha256=HtMsfH5com4dLVJxh3wdMUpQI3mz0cKDJz0CmFS2S8U,1648
-sibi_dst/df_helper/plugins/sql_alchemy/_sqlalchemy_load_from_db.py,sha256=
+sibi_dst/df_helper/plugins/sql_alchemy/_sqlalchemy_load_from_db.py,sha256=myrtEzK12DvA73x7QFaqXFb_TxOPMrsVj-mxYHJD2dg,2371
 sibi_dst/df_helper/plugins/sql_alchemy/_sqlalchemy_model_builder.py,sha256=vrTTeFcrf8sFpzqLlQfspjduKuR7Wgn7mDPUASsQs6s,4355
 sibi_dst/df_helper/plugins/sql_model/__init__.py,sha256=MXd4OOdTqR4cENSV733SGodPO6eQMCexANs-3w0qL5U,226
 sibi_dst/df_helper/plugins/sql_model/_sqlmodel_db_connection.py,sha256=6jmMjKIv5Btysj3kZMaXQ98IqKQkhnOC-JWtb1B8rus,4265
@@ -31,7 +31,7 @@ sibi_dst/utils/__init__.py,sha256=jiXJSnmsaGZTRhUThtIo6cssWXBWXNij8ffYmv77QK4,79
 sibi_dst/utils/_airflow_manager.py,sha256=rlt3eolR5QvtxWhAtBTCpHXvxftnKM-ibPMv3fVwNZk,7524
 sibi_dst/utils/_clickhouse_writer.py,sha256=mdgszbyVluhGvDmvsHY4XDTZrp42L3xtdmiyn3z2bYM,8534
 sibi_dst/utils/_credentials.py,sha256=8i6z7y3y5S-6mSk4xrT2AwhzCA32mTn1n1iYX9IVyHk,1724
-sibi_dst/utils/_data_utils.py,sha256=
+sibi_dst/utils/_data_utils.py,sha256=3hBMg852ANpS5bOtlU-F4H-Q91WIGga5LrKWWyDvnAA,7354
 sibi_dst/utils/_data_wrapper.py,sha256=pZnylBFTvsLGfYGv2tTyQHzyb6IbIahfaXR-PxHdivk,24099
 sibi_dst/utils/_date_utils.py,sha256=6HCrcTiuYLNsbgrNB3eAVAAgXbfx7Ce1qNc3OJla9nM,5621
 sibi_dst/utils/_df_utils.py,sha256=o2bK5-xMGKqIG4i9xfavYRxIkiHLA0nz5TQTN78998k,7350
@@ -40,6 +40,6 @@ sibi_dst/utils/_filepath_generator.py,sha256=ytPSZ9GYOnnSP25zwA-0NjFHupPRZyXwixW
 sibi_dst/utils/_log_utils.py,sha256=AAenyubYUjk77WqiaNkjgkxws3dnAMIdaGl2Ryz_cA4,2245
 sibi_dst/utils/_parquet_saver.py,sha256=-A0o_vucyYe7wlwiby_0_yS-ZfT2GHwImyQHrCIBNwk,9051
 sibi_dst/utils/_storage_manager.py,sha256=KP2HBXnLUMMquqcO30ecfuoU7g1z8RtaV3Dv0TvEXoY,3856
-sibi_dst-0.3.
-sibi_dst-0.3.
-sibi_dst-0.3.
+sibi_dst-0.3.11.dist-info/METADATA,sha256=gwl565etE5wLVGk0rqQ7umOyBRtEXpQ_IdCXyEkv2s8,1897
+sibi_dst-0.3.11.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+sibi_dst-0.3.11.dist-info/RECORD,,
{sibi_dst-0.3.10.dist-info → sibi_dst-0.3.11.dist-info}/WHEEL
File without changes