sibi-dst 0.3.10__py3-none-any.whl → 0.3.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sibi_dst/df_helper/_df_helper.py +9 -2
- sibi_dst/df_helper/core/__init__.py +9 -2
- sibi_dst/df_helper/core/_defaults.py +99 -3
- sibi_dst/df_helper/plugins/django/_django_load_from_db.py +60 -9
- sibi_dst/df_helper/plugins/http/_http_config.py +2 -3
- sibi_dst/df_helper/plugins/parquet/_parquet_filter_handler.py +4 -0
- sibi_dst/df_helper/plugins/parquet/_parquet_options.py +2 -0
- sibi_dst/df_helper/plugins/sql_alchemy/_io_sqlalchemy_dask.py +118 -128
- sibi_dst/df_helper/plugins/sql_alchemy/_sqlachemy_filter_handler.py +2 -0
- sibi_dst/df_helper/plugins/sql_alchemy/_sqlalchemy_load_from_db.py +28 -52
- sibi_dst/utils/__init__.py +2 -1
- sibi_dst/utils/_data_utils.py +116 -88
- sibi_dst/utils/_data_wrapper.py +2 -320
- sibi_dst/utils/_date_utils.py +130 -0
- sibi_dst/utils/_df_utils.py +91 -0
- {sibi_dst-0.3.10.dist-info → sibi_dst-0.3.12.dist-info}/METADATA +5 -2
- {sibi_dst-0.3.10.dist-info → sibi_dst-0.3.12.dist-info}/RECORD +18 -18
- {sibi_dst-0.3.10.dist-info → sibi_dst-0.3.12.dist-info}/WHEEL +0 -0
sibi_dst/df_helper/_df_helper.py
CHANGED
@@ -42,6 +42,7 @@ class DfHelper:
         self.dt_field=kwargs.setdefault("dt_field", None)
         self.as_pandas = kwargs.setdefault("as_pandas", False)
         kwargs.setdefault("live", True)
+        kwargs.setdefault("logger", self.logger)
         self.post_init(**kwargs)


@@ -101,6 +102,8 @@ class DfHelper:

     def _load_from_sqlalchemy(self, **options):
         try:
+            options.setdefault("debug", self.debug)
+            options.setdefault("verbose_debug", self.verbose_debug)
             db_loader = SqlAlchemyLoadFromDb(
                 self.plugin_sqlalchemy,
                 self.plugin_query,
@@ -120,6 +123,8 @@ class DfHelper:

     def _load_from_db(self, **options) -> Union[pd.DataFrame, dd.DataFrame]:
         try:
+            options.setdefault("debug", self.debug)
+            options.setdefault("verbose_debug", self.verbose_debug)
             db_loader = DjangoLoadFromDb(
                 self.plugin_django_connection,
                 self.plugin_query,
@@ -207,6 +212,7 @@ class DfHelper:
     def save_to_parquet(self, parquet_filename: Optional[str] = None):
         ps = ParquetSaver(self.df, self.parquet_storage_path, self.logger)
         ps.save_to_parquet(parquet_filename)
+        self.logger.info(f"Parquet saved to {parquet_filename} in parquet storage: {self.parquet_storage_path}.")

     def save_to_clickhouse(self, database, table, order_by=None, **credentials):
         click_config ={
@@ -215,13 +221,14 @@ class DfHelper:
             'order_by': order_by or 'id',
         }
         credentials = {**credentials, **click_config}
-        cs=ClickHouseWriter(**credentials)
+        cs=ClickHouseWriter(logger=self.logger, **credentials)
         cs.save_to_clickhouse(self.df)
+        self.logger.info("Save to ClickHouse completed.")

     def _load_from_parquet(self, **options) -> Union[pd.DataFrame, dd.DataFrame]:
         self.df = self.plugin_parquet.load_files()
         if options:
-            self.df = ParquetFilterHandler().apply_filters_dask(self.df, options)
+            self.df = ParquetFilterHandler(logger=self.logger).apply_filters_dask(self.df, options)
         return self.df

     def load_period(self, **kwargs):
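The common thread in these hunks is that DfHelper now injects its own logger and debug flags into every backend it drives: kwargs.setdefault("logger", self.logger) at init time, options.setdefault("debug", ...) before each loader, and explicit logger=self.logger arguments to ClickHouseWriter and ParquetFilterHandler. A minimal sketch of that setdefault-based injection pattern, using the standard library logging module and a hypothetical loader rather than the real sibi_dst classes:

import logging

class HelperSketch:
    # Illustrative stand-in for DfHelper's wiring; not the package's actual class.
    def __init__(self, **kwargs):
        self.debug = kwargs.setdefault("debug", False)
        self.logger = kwargs.setdefault("logger", logging.getLogger("HelperSketch"))

    def _load(self, **options):
        # Caller-supplied values win; otherwise the helper's own logger/flags are used.
        options.setdefault("logger", self.logger)
        options.setdefault("debug", self.debug)
        options.setdefault("verbose_debug", False)  # assumed default for the sketch
        return options

helper = HelperSketch(debug=True)
print(helper._load()["debug"])  # True: the loader inherits the helper's debug flag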
sibi_dst/df_helper/core/__init__.py
CHANGED
@@ -2,10 +2,17 @@ from __future__ import annotations

 from ._params_config import ParamsConfig
 from ._query_config import QueryConfig
-from ._defaults import
+from ._defaults import (
+    django_field_conversion_map_pandas,
+    django_field_conversion_map_dask,
+    sqlalchemy_field_conversion_map_dask,
+    normalize_sqlalchemy_type)

 __all__ = [
     "ParamsConfig",
     "QueryConfig",
-    "
+    "django_field_conversion_map_pandas",
+    "django_field_conversion_map_dask",
+    "sqlalchemy_field_conversion_map_dask",
+    "normalize_sqlalchemy_type"
 ]
sibi_dst/df_helper/core/_defaults.py
CHANGED
@@ -1,10 +1,12 @@
 # Copyright (c) 2023. ISTMO Center S.A. All Rights Reserved
 #
 import json
-from
-from typing import Dict, Union, Optional
+from typing import Dict

 import pandas as pd
+from sqlalchemy import String, Text, Integer, BigInteger, SmallInteger, Float, Boolean, DateTime, Date, Time, JSON, \
+    Numeric, UUID
+from sqlalchemy.dialects.mysql import TINYINT, MEDIUMTEXT

 # This is the defaults configuration file for the df_helper module.

@@ -13,11 +15,12 @@ import pandas as pd
 # conversion. This mapping is used to convert the values in a pandas DataFrame to the appropriate data types based on
 # the Django field type.

-
+django_field_conversion_map_pandas: Dict[str, callable] = {
     "CharField": lambda x: x.astype(str),
     "TextField": lambda x: x.astype(str),
     "IntegerField": lambda x: pd.to_numeric(x, errors="coerce"),
     "AutoField": lambda x: pd.to_numeric(x, errors="coerce"),
+    "BigAutoField": lambda x: pd.to_numeric(x, errors="coerce"),
     "BigIntegerField": lambda x: pd.to_numeric(x, errors="coerce"),
     "SmallIntegerField": lambda x: pd.to_numeric(x, errors="coerce"),
     "PositiveIntegerField": lambda x: pd.to_numeric(x, errors="coerce"),
@@ -35,3 +38,96 @@ django_field_conversion_map: Dict[str, callable] = {
     "ArrayField": lambda x: x.apply(eval),
     "UUIDField": lambda x: x.astype(str),
 }
+
+django_field_conversion_map_dask: Dict[str, callable] = {
+    "CharField": lambda x: x.astype(str),
+    "TextField": lambda x: x.astype(str),
+    "IntegerField": lambda x: pd.to_numeric(x, errors="coerce"),
+    "AutoField": lambda x: pd.to_numeric(x, errors="coerce"),
+    "BigAutoField": lambda x: pd.to_numeric(x, errors="coerce"),
+    "BigIntegerField": lambda x: pd.to_numeric(x, errors="coerce"),
+    "SmallIntegerField": lambda x: pd.to_numeric(x, errors="coerce"),
+    "PositiveIntegerField": lambda x: pd.to_numeric(x, errors="coerce"),
+    "PositiveSmallIntegerField": lambda x: pd.to_numeric(x, errors="coerce"),
+    "FloatField": lambda x: pd.to_numeric(x, errors="coerce"),
+    "DecimalField": lambda x: pd.to_numeric(x, errors="coerce"),
+    "BooleanField": lambda x: x.astype(bool),
+    "NullBooleanField": lambda x: x.astype(bool),
+    "DateTimeField": lambda x: pd.to_datetime(x, errors="coerce"),
+    "DateField": lambda x: pd.to_datetime(x, errors="coerce").map_partitions(lambda x: x.dt.date, meta=("date", "object")),
+    "TimeField": lambda x: pd.to_datetime(x, errors="coerce").map_partitions(lambda x: x.dt.time, meta=("time", "object")),
+    "DurationField": lambda x: pd.to_timedelta(x, errors="coerce"),
+    "JSONField": lambda x: x.map_partitions(lambda s: s.apply(json.loads), meta=("json", "object")),
+    "ArrayField": lambda x: x.map_partitions(lambda s: s.apply(eval), meta=("array", "object")),
+    "UUIDField": lambda x: x.astype(str),
+}
+
+sqlalchemy_field_conversion_map_dask: Dict[str, callable] = {
+    String.__name__: lambda x: x.astype(str).fillna(""),
+    Text.__name__: lambda x: x.fillna('').astype(str),
+    Integer.__name__: lambda x: x.fillna(0).astype(int),
+    BigInteger.__name__: lambda x: pd.to_numeric(x, errors="coerce"),
+    SmallInteger.__name__: lambda x: pd.to_numeric(x, errors="coerce"),
+    Float.__name__: lambda x: pd.to_numeric(x, errors="coerce"),
+    Numeric.__name__: lambda x: pd.to_numeric(x, errors="coerce"),
+    Boolean.__name__: lambda x: x.astype(bool),
+    DateTime.__name__: lambda x: pd.to_datetime(x, errors="coerce"),
+    Date.__name__: lambda x: pd.to_datetime(x, errors="coerce").map_partitions(lambda x: x.dt.date, meta=("date", "object")),
+    Time.__name__: lambda x: pd.to_datetime(x, errors="coerce").map_partitions(lambda x: x.dt.time, meta=("time", "object")),
+    JSON.__name__: lambda x: x.map_partitions(lambda s: s.apply(json.loads), meta=("json", "object")),
+    UUID.__name__: lambda x: x.astype(str),
+}
+
+# Conversion map with normalized SQLAlchemy field types
+# sqlalchemy_field_conversion_map_dask: Dict[str, callable] = {
+#     "String": lambda x: x.map_partitions(lambda s: s.astype(str), meta=("string", "string")),
+#     "Text": lambda x: x.map_partitions(lambda s: s.astype(str), meta=("text", "string")),
+#     "Integer": lambda x: pd.to_numeric(x, errors="coerce"),
+#     "SmallInteger": lambda x: pd.to_numeric(x, errors="coerce"),
+#     "BigInteger": lambda x: pd.to_numeric(x, errors="coerce"),
+#     "Float": lambda x: pd.to_numeric(x, errors="coerce"),
+#     "Numeric": lambda x: pd.to_numeric(x, errors="coerce"),
+#     "Boolean": lambda x: x.map_partitions(lambda s: s.fillna(False).astype(bool), meta=("boolean", "bool")),
+#     "DateTime": lambda x: pd.to_datetime(x, errors="coerce"),
+#     "Date": lambda x: pd.to_datetime(x, errors="coerce").map_partitions(lambda s: s.dt.date, meta=("date", "object")),
+#     "Time": lambda x: pd.to_datetime(x, errors="coerce").map_partitions(lambda s: s.dt.time, meta=("time", "object")),
+#     "JSON": lambda x: x.map_partitions(lambda s: s.apply(json.loads), meta=("json", "object")),
+# }
+
+
+def normalize_sqlalchemy_type(field_type):
+    """
+    Normalize SQLAlchemy field types to generic type names.
+    Handles dialect-specific types (e.g., MySQL).
+    """
+    # Map of generic SQLAlchemy types
+    type_mapping = {
+        String: "String",
+        Text: "Text",
+        Integer: "Integer",
+        SmallInteger: "SmallInteger",
+        BigInteger: "BigInteger",
+        Float: "Float",
+        Numeric: "Numeric",
+        Boolean: "Boolean",
+        DateTime: "DateTime",
+        Date: "Date",
+        Time: "Time",
+        JSON: "JSON",
+    }
+
+    # Dialect-specific types
+    dialect_mapping = {
+        TINYINT: "SmallInteger",
+        MEDIUMTEXT: "Text",
+    }
+
+    # Check if the field matches a generic or dialect-specific type
+    for sql_type, name in {**type_mapping, **dialect_mapping}.items():
+        if isinstance(field_type, sql_type):
+            return name
+
+    # Fallback to raw class name
+    return field_type.__class__.__name__
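The new normalize_sqlalchemy_type folds dialect-specific column types into the generic names used as keys in sqlalchemy_field_conversion_map_dask, so a loader can look up a converter per column. A hedged sketch of how the two pieces could be combined; the Example model below is illustrative and not part of sibi_dst:

from sqlalchemy import Column, DateTime, Integer, String
from sqlalchemy.orm import declarative_base

from sibi_dst.df_helper.core import (
    normalize_sqlalchemy_type,
    sqlalchemy_field_conversion_map_dask,
)

Base = declarative_base()

class Example(Base):  # illustrative model, not part of sibi_dst
    __tablename__ = "example"
    id = Column(Integer, primary_key=True)
    name = Column(String(50))
    created_at = Column(DateTime)

for column in Example.__table__.columns:
    generic_name = normalize_sqlalchemy_type(column.type)   # e.g. "Integer", "String", "DateTime"
    converter = sqlalchemy_field_conversion_map_dask.get(generic_name)
    print(column.name, generic_name, converter is not None)

Unknown or exotic types fall back to the raw class name, in which case the dictionary lookup simply returns None.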
sibi_dst/df_helper/plugins/django/_django_load_from_db.py
CHANGED
@@ -4,6 +4,7 @@ from django.db.models import Q

 from sibi_dst.df_helper.plugins.django import ReadFrameDask
 from sibi_dst.utils import Logger
+from sibi_dst.df_helper.core import django_field_conversion_map_dask

 class DjangoLoadFromDb:
     df: dd.DataFrame
@@ -26,10 +27,10 @@ class DjangoLoadFromDb:

     def build_and_load(self):
         self.df = self._build_and_load()
-
-        self._process_loaded_data()
+        #self.df = self._convert_columns(self.df)
         return self.df

+
     def _build_and_load(self) -> dd.DataFrame:
         query = self.connection_config.model.objects.using(self.connection_config.connection_name)
         if not self.params_config.filters:
@@ -61,10 +62,60 @@ class DjangoLoadFromDb:
                 q_objects.add(~Q(**{key: value}), Q.AND)
         return q_objects

-    def
+    def _convert_columns(self, df: dd.DataFrame) -> dd.DataFrame:
+        """
+        Convert the data types of columns in a Dask DataFrame based on the field type in the Django model.
+
+        :param df: Dask DataFrame whose columns' data types are to be converted.
+        :return: Dask DataFrame with converted column data types.
+        """
+
+        def log_debug(message: str, is_verbose: bool = False):
+            """Helper to handle debug and verbose debug logging."""
+            if self.debug:
+                self.logger.debug(message)
+            if is_verbose and self.verbose_debug:
+                print(message)
+
+        if self.debug:
+            self.logger.info(f'Converting columns: {list(df.columns)}')
+
+        # Get field information from the Django model
+        model_fields = self.connection_config.model._meta.get_fields()
+        field_type_map = {field.name: type(field).__name__ for field in model_fields}
+        # Simplified loop to apply conversions partition-wise
+        for field_name, field_type in field_type_map.items():
+            if field_name not in df.columns:
+
+                log_debug(f"Column '{field_name}' not found in DataFrame columns.")
+                continue
+
+            conversion_func = django_field_conversion_map_dask.get(field_type)
+            if not conversion_func:
+                message=f"Field type '{field_type}' not found in conversion_map."
+                log_debug(message, is_verbose=True)
+                continue
+
+            def apply_conversion(partition):
+                """
+                Apply the conversion function to a single partition for the given column.
+                """
+                try:
+                    if field_name in partition.columns:
+                        partition[field_name] = conversion_func(partition[field_name])
+                except Exception as e:
+                    self.logger.error(f"Error converting column '{field_name}' in partition: {str(e)}")
+                return partition
+
+            try:
+                # Apply conversion lazily to each partition
+                df = df.map_partitions(
+                    apply_conversion,
+                    meta=df,
+                )
+                log_debug(f"Successfully queued conversion for column '{field_name}' to type '{field_type}'.",
+                          is_verbose=True)
+            except Exception as e:
+                log_debug(f"Failed to queue conversion for column '{field_name}': {str(e)}", is_verbose=True)
+
+        return df
sibi_dst/df_helper/plugins/http/_http_config.py
CHANGED
@@ -13,11 +13,10 @@ class HttpConfig(BaseModel):
     api_key: Optional[SecretStr] = None
     model_config = ConfigDict(arbitrary_types_allowed=True)

-    def __init__(self, **data):
+    def __init__(self, logger=None, **data):
         super().__init__(**data)
         # Initialize the logger if not provided
-
-        self.logger = Logger(log_dir='./logs/', logger_name="HttpDataSource", log_file='http_data_source.log')
+        self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)

     async def fetch_data(self, **options) -> dd.DataFrame:
         """Asynchronously fetch JSON data from HTTP endpoint, substituting options into the URL path."""
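The constructor change repeats the logger-injection convention introduced across the plugins in this release: accept an optional logger and fall back to a default named after the class. A generic sketch of that pattern, using the standard library logging in place of sibi_dst.utils.Logger (whose API beyond default_logger(logger_name=...) is not shown in this diff):

import logging

class PluginSketch:
    # Illustrative only; mirrors the `logger or default` fallback used by HttpConfig and friends.
    def __init__(self, logger=None, **config):
        self.logger = logger or logging.getLogger(self.__class__.__name__)
        self.config = config

shared = logging.getLogger("DfHelper")
plugin = PluginSketch(logger=shared, source_url="https://example.com")  # hypothetical option
plugin.logger.info("plugin and helper now log through the same logger")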
sibi_dst/df_helper/plugins/parquet/_parquet_filter_handler.py
CHANGED
@@ -1,7 +1,11 @@
 import pandas as pd
 import dask.dataframe as dd
+from sibi_dst.utils import Logger

 class ParquetFilterHandler(object):
+    def __init__(self, logger=None):
+        self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
+
     @staticmethod
     def apply_filters_dask(df, filters):
         dt_operators = ['date', 'time']
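apply_filters_dask itself is unchanged and remains a static method; only the optional logger is new. For orientation, the handler appears to accept Django-style field__operator filter keys (the dt_operators = ['date', 'time'] line hints at date/time lookups). The snippet below illustrates that general idea on a Dask DataFrame; the filter syntax shown is an assumption for illustration, not a transcription of the handler:

import dask.dataframe as dd
import pandas as pd

pdf = pd.DataFrame({"created_at": pd.to_datetime(["2024-01-01", "2024-06-15"]),
                    "value": [1, 2]})
ddf = dd.from_pandas(pdf, npartitions=1)

filters = {"created_at__gte": "2024-02-01"}  # assumed Django-style lookup

for key, value in filters.items():
    field, _, op = key.partition("__")
    if op == "gte":
        # Comparison builds a lazy boolean mask; nothing runs until .compute().
        ddf = ddf[ddf[field] >= pd.Timestamp(value)]

print(ddf.compute())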
sibi_dst/df_helper/plugins/parquet/_parquet_options.py
CHANGED
@@ -25,6 +25,8 @@ class ParquetConfig(BaseModel):
     @model_validator(mode='after')
     def check_parquet_params(self):
         # Configure paths based on fsspec
+        if self.logger is None:
+            self.logger = Logger.default_logger(logger_name=self.__class__.__name__)
         self.fs = fsspec.filesystem("file") if "://" not in str(self.parquet_storage_path) else fsspec.filesystem(str(self.parquet_storage_path).split("://")[0])

         # Validation for parquet path
sibi_dst/df_helper/plugins/sql_alchemy/_io_sqlalchemy_dask.py
CHANGED
@@ -1,143 +1,133 @@ (removed lines whose content was not preserved by the diff viewer are omitted below)
 import itertools
+
 import dask.dataframe as dd
 import pandas as pd
-from sqlalchemy
-from sqlalchemy.
-            fieldnames=None,
-            index_col=None,
-            coerce_float=False,
-            verbose=True,
-            datetime_index=False,
-            column_names=None,
-            chunk_size=1000,
-    ):
+from sqlalchemy import create_engine, inspect, select
+from sqlalchemy.orm import sessionmaker
+
+from sibi_dst.df_helper.plugins.sql_alchemy._sqlachemy_filter_handler import SqlAlchemyFilterHandler
+from sibi_dst.utils import Logger
+
+
+class SQLAlchemyDask:
+    def __init__(self, model, filters, engine_url, chunk_size=1000, logger=None, verbose=True):
         """
-        Initialize
-            verbose: Whether to include verbose processing like handling choices.
-            datetime_index: Whether to convert the index to a datetime index.
-            column_names: Optional mapping of fieldnames to custom column names.
-            chunk_size: Number of records to fetch in each chunk.
+        Initialize with an SQLAlchemy query and database engine URL.
+
+        :param model: SQLAlchemy ORM model.
+        :param filters: Filters to apply on the query.
+        :param engine_url: Database connection string for SQLAlchemy engine.
+        :param chunk_size: Number of records per chunk for Dask partitions.
+        :param logger: Logger instance for logging.
+        :param verbose: Whether to print detailed logs.
         """
-        self.query =
-        self.
-        self.
-        self.index_col = index_col
-        self.coerce_float = coerce_float
-        self.verbose = verbose
-        self.datetime_index = datetime_index
-        self.column_names = column_names
+        self.query = None
+        self.model = model
+        self.filters = filters
         self.chunk_size = chunk_size
+        self.verbose = verbose
+        self.engine = create_engine(engine_url)
+        self.Session = sessionmaker(bind=self.engine)
+        self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)

     @staticmethod
-    def
+    def infer_dtypes_from_model(model):
         """
-        Args:
-            row: SQLAlchemy ORM object, Row object, or tuple.
-            fields: List of fields to extract.
-        Returns:
-            A dictionary representation of the row.
+        Infer data types for Dask DataFrame based on SQLAlchemy ORM model columns.
         """
+        mapper = inspect(model)
+        sqlalchemy_to_dask_dtype = {
+            #'INTEGER': pd.to_numeric(x, errors="coerce"),
+            'INTEGER': 'Int64',
+            'SMALLINT': 'Int64',
+            'BIGINT': 'Int64',
+            'FLOAT': 'float64',
+            'NUMERIC': 'float64',
+            'BOOLEAN': 'bool',
+            'VARCHAR': 'object',
+            'TEXT': 'object',
+            'DATE': 'datetime64[ns]',
+            'DATETIME': 'datetime64[ns]',
+            'TIME': 'object',
+            'UUID': 'object',
+        }
+
+        dtypes = {}
+        for column in mapper.columns:
+            dtype = sqlalchemy_to_dask_dtype.get(str(column.type).upper(), 'object')
+            dtypes[column.name] = dtype
+
+        return dtypes

     def read_frame(self, fillna_value=None):
         """
-        Args:
-            fillna_value: Value to use for filling missing values.
+        Load data from an SQLAlchemy query into a Dask DataFrame.

+        :param fillna_value: Value to replace NaN or NULL values with, if any.
+        :return: Dask DataFrame.
         """
-        self.
+        with self.Session() as session:
+            try:
+                # Build query
+                self.query = select(self.model)
+                if self.filters:
+                    self.query = SqlAlchemyFilterHandler.apply_filters_sqlalchemy(self.query, self.model, self.filters)
+                else:
+                    n_records = 100
+                    self.query = self.query.limit(n_records)
+
+                # Infer dtypes
+                dtypes = self.infer_dtypes_from_model(self.model)
+                # Get the column order from the SQLAlchemy model
+                ordered_columns = [column.name for column in self.model.__table__.columns]
+
+                # Execute query and fetch results in chunks
+                result_proxy = session.execute(self.query)
+                results = result_proxy.scalars().all()  # Fetch all rows
+                iterator = iter(results)
+
+                partitions = []
+
+                while True:
+                    chunk = list(itertools.islice(iterator, self.chunk_size))
+                    if not chunk:
+                        break
+
+                    # Convert chunk to Pandas DataFrame
+                    df = pd.DataFrame.from_records(
+                        [row._asdict() if hasattr(row, '_asdict') else row.__dict__ for row in chunk]
+                    )
+                    # Drop internal SQLAlchemy state if it exists
+                    df = df.loc[:, ~df.columns.str.contains('_sa_instance_state')]
+
+                    # Reorder columns to match the model's order
+                    df = df[ordered_columns]
+
+                    # Fill NaN values
+                    if fillna_value is not None:
+                        df = df.fillna(fillna_value)
+
+                    # Convert timezone-aware columns to naive
+                    for col in df.columns:
+                        if isinstance(df[col].dtype, pd.DatetimeTZDtype):
+                            df[col] = df[col].dt.tz_localize(None)
+
+                    # Apply inferred dtypes
+                    df = df.astype(dtypes)
+                    # Create a Dask partition
+                    partitions.append(dd.from_pandas(df, npartitions=1))
+
+                # Concatenate all partitions
+                # print(partitions)
+                if partitions:
+                    dask_df = dd.concat(partitions, axis=0, ignore_index=True)
+                else:
+                    dask_df = dd.from_pandas(pd.DataFrame(columns=ordered_columns), npartitions=1)
+
+                if self.verbose:
+                    self.logger.info(f"Loaded {len(dask_df)} rows into Dask DataFrame.")
+                return dask_df
+
+            except Exception as e:
+                self.logger.error(f"Error executing query: {str(e)}")
+                return dd.from_pandas(pd.DataFrame(columns=ordered_columns), npartitions=1)
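The rewritten read_frame fetches the ORM results, slices them into fixed-size chunks with itertools.islice, turns each chunk into a single-partition pandas frame, and stitches the pieces together with dd.concat; note that scalars().all() materialises the full result set first, so the chunking governs partition layout rather than memory use during the fetch. A stripped-down sketch of the chunk-to-partition pattern, with plain dictionaries standing in for ORM rows:

import itertools

import dask.dataframe as dd
import pandas as pd

def chunks_to_dask(rows, chunk_size=1000):
    """Turn an iterable of dict-like rows into a Dask DataFrame, one partition per chunk."""
    iterator = iter(rows)
    partitions = []
    while True:
        chunk = list(itertools.islice(iterator, chunk_size))
        if not chunk:
            break
        partitions.append(dd.from_pandas(pd.DataFrame.from_records(chunk), npartitions=1))
    if not partitions:
        return dd.from_pandas(pd.DataFrame(), npartitions=1)
    return dd.concat(partitions, axis=0, ignore_index=True)

ddf = chunks_to_dask(({"id": i, "value": i * 2} for i in range(2500)), chunk_size=1000)
print(ddf.npartitions, len(ddf))  # 2,500 rows at chunk_size=1000 -> 3 partitions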
sibi_dst/df_helper/plugins/sql_alchemy/_sqlachemy_filter_handler.py
CHANGED
@@ -50,6 +50,7 @@ class SqlAlchemyFilterHandler:
                 return [datetime.date.fromisoformat(v) for v in value]
             return value

+
         def handle_date_operator(column, date_op):
             """
             Handle filtering on specific datetime parts (e.g., year, month).
@@ -90,6 +91,7 @@ class SqlAlchemyFilterHandler:

         # Get the column from the model
         column = getattr(model, field_name, None)
+        #column = model.__table__.columns.get(field_name)
         if not column:
             raise AttributeError(f"Field '{field_name}' not found in model '{model.__name__}'")

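The active line resolves each filter field via getattr(model, field_name, None), which returns the ORM class attribute; the newly added comment records the alternative of pulling the Column from model.__table__.columns. Either object supports the comparison operators used to build a WHERE clause. A minimal sketch under an illustrative Example model (not part of sibi_dst):

from sqlalchemy import Column, Integer, String, select
from sqlalchemy.orm import declarative_base

Base = declarative_base()

class Example(Base):  # illustrative model
    __tablename__ = "example"
    id = Column(Integer, primary_key=True)
    name = Column(String(50))

field_name = "name"
column = getattr(Example, field_name, None)            # InstrumentedAttribute on the ORM class
# column = Example.__table__.columns.get(field_name)   # Column object, the commented-out variant
query = select(Example).where(column == "abc")
print(query)  # renders the parameterized SELECT ... WHERE example.name = :name_1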