sibi-dst 0.3.10__py3-none-any.whl → 0.3.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -42,6 +42,7 @@ class DfHelper:
         self.dt_field=kwargs.setdefault("dt_field", None)
         self.as_pandas = kwargs.setdefault("as_pandas", False)
         kwargs.setdefault("live", True)
+        kwargs.setdefault("logger", self.logger)
         self.post_init(**kwargs)


@@ -101,6 +102,8 @@ class DfHelper:

     def _load_from_sqlalchemy(self, **options):
         try:
+            options.setdefault("debug", self.debug)
+            options.setdefault("verbose_debug", self.verbose_debug)
             db_loader = SqlAlchemyLoadFromDb(
                 self.plugin_sqlalchemy,
                 self.plugin_query,
@@ -120,6 +123,8 @@ class DfHelper:

     def _load_from_db(self, **options) -> Union[pd.DataFrame, dd.DataFrame]:
         try:
+            options.setdefault("debug", self.debug)
+            options.setdefault("verbose_debug", self.verbose_debug)
             db_loader = DjangoLoadFromDb(
                 self.plugin_django_connection,
                 self.plugin_query,
@@ -207,6 +212,7 @@ class DfHelper:
     def save_to_parquet(self, parquet_filename: Optional[str] = None):
         ps = ParquetSaver(self.df, self.parquet_storage_path, self.logger)
         ps.save_to_parquet(parquet_filename)
+        self.logger.info(f"Parquet saved to {parquet_filename} in parquet storage: {self.parquet_storage_path}.")

     def save_to_clickhouse(self, database, table, order_by=None, **credentials):
         click_config ={
@@ -215,13 +221,14 @@ class DfHelper:
             'order_by': order_by or 'id',
         }
         credentials = {**credentials, **click_config}
-        cs=ClickHouseWriter(**credentials)
+        cs=ClickHouseWriter(logger=self.logger, **credentials)
         cs.save_to_clickhouse(self.df)
+        self.logger.info("Save to ClickHouse completed.")

     def _load_from_parquet(self, **options) -> Union[pd.DataFrame, dd.DataFrame]:
         self.df = self.plugin_parquet.load_files()
         if options:
-            self.df = ParquetFilterHandler().apply_filters_dask(self.df, options)
+            self.df = ParquetFilterHandler(logger=self.logger).apply_filters_dask(self.df, options)
         return self.df

     def load_period(self, **kwargs):
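The DfHelper changes above all follow one pattern: the helper now threads its own logger and debug flags into its collaborators (the DB loaders, ClickHouseWriter, ParquetFilterHandler) via dict.setdefault, so anything the caller passes explicitly still wins. A minimal sketch of that pattern, with hypothetical names rather than the real DfHelper internals:

# Sketch only: HelperSketch is hypothetical; it just illustrates setdefault-based propagation.
class HelperSketch:
    def __init__(self, logger, debug=False):
        self.logger = logger
        self.debug = debug

    def _load(self, **options):
        options.setdefault("debug", self.debug)    # filled in only when the caller omitted it
        options.setdefault("logger", self.logger)
        return options

# HelperSketch(logger=log, debug=True)._load(debug=False)["debug"] is False: the caller's value wins.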
@@ -2,10 +2,17 @@ from __future__ import annotations

 from ._params_config import ParamsConfig
 from ._query_config import QueryConfig
-from ._defaults import django_field_conversion_map
+from ._defaults import (
+    django_field_conversion_map_pandas,
+    django_field_conversion_map_dask,
+    sqlalchemy_field_conversion_map_dask,
+    normalize_sqlalchemy_type)

 __all__ = [
     "ParamsConfig",
     "QueryConfig",
-    "django_field_conversion_map"
+    "django_field_conversion_map_pandas",
+    "django_field_conversion_map_dask",
+    "sqlalchemy_field_conversion_map_dask",
+    "normalize_sqlalchemy_type"
 ]
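Because the old django_field_conversion_map is dropped from both the import and __all__, downstream code importing it from sibi_dst.df_helper.core (the path used by DjangoLoadFromDb later in this diff) has to move to the new names. A hedged sketch of the updated import:

# Sketch only: shows the renamed/new exports; the old name no longer resolves.
# from sibi_dst.df_helper.core import django_field_conversion_map          # 0.3.10, removed
from sibi_dst.df_helper.core import (                                      # 0.3.12
    django_field_conversion_map_pandas,
    django_field_conversion_map_dask,
    sqlalchemy_field_conversion_map_dask,
    normalize_sqlalchemy_type,
)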
@@ -1,10 +1,12 @@
 # Copyright (c) 2023. ISTMO Center S.A. All Rights Reserved
 #
 import json
-from dataclasses import dataclass
-from typing import Dict, Union, Optional
+from typing import Dict

 import pandas as pd
+from sqlalchemy import String, Text, Integer, BigInteger, SmallInteger, Float, Boolean, DateTime, Date, Time, JSON, \
+    Numeric, UUID
+from sqlalchemy.dialects.mysql import TINYINT, MEDIUMTEXT

 # This is the defaults configuration file for the df_helper module.

@@ -13,11 +15,12 @@ import pandas as pd
 # conversion. This mapping is used to convert the values in a pandas DataFrame to the appropriate data types based on
 # the Django field type.

-django_field_conversion_map: Dict[str, callable] = {
+django_field_conversion_map_pandas: Dict[str, callable] = {
     "CharField": lambda x: x.astype(str),
     "TextField": lambda x: x.astype(str),
     "IntegerField": lambda x: pd.to_numeric(x, errors="coerce"),
     "AutoField": lambda x: pd.to_numeric(x, errors="coerce"),
+    "BigAutoField": lambda x: pd.to_numeric(x, errors="coerce"),
     "BigIntegerField": lambda x: pd.to_numeric(x, errors="coerce"),
     "SmallIntegerField": lambda x: pd.to_numeric(x, errors="coerce"),
     "PositiveIntegerField": lambda x: pd.to_numeric(x, errors="coerce"),
@@ -35,3 +38,96 @@ django_field_conversion_map: Dict[str, callable] = {
     "ArrayField": lambda x: x.apply(eval),
     "UUIDField": lambda x: x.astype(str),
 }
+
+django_field_conversion_map_dask: Dict[str, callable] = {
+    "CharField": lambda x: x.astype(str),
+    "TextField": lambda x: x.astype(str),
+    "IntegerField": lambda x: pd.to_numeric(x, errors="coerce"),
+    "AutoField": lambda x: pd.to_numeric(x, errors="coerce"),
+    "BigAutoField": lambda x: pd.to_numeric(x, errors="coerce"),
+    "BigIntegerField": lambda x: pd.to_numeric(x, errors="coerce"),
+    "SmallIntegerField": lambda x: pd.to_numeric(x, errors="coerce"),
+    "PositiveIntegerField": lambda x: pd.to_numeric(x, errors="coerce"),
+    "PositiveSmallIntegerField": lambda x: pd.to_numeric(x, errors="coerce"),
+    "FloatField": lambda x: pd.to_numeric(x, errors="coerce"),
+    "DecimalField": lambda x: pd.to_numeric(x, errors="coerce"),
+    "BooleanField": lambda x: x.astype(bool),
+    "NullBooleanField": lambda x: x.astype(bool),
+    "DateTimeField": lambda x: pd.to_datetime(x, errors="coerce"),
+    "DateField": lambda x: pd.to_datetime(x, errors="coerce").map_partitions(lambda x: x.dt.date, meta=("date", "object")),
+    "TimeField": lambda x: pd.to_datetime(x, errors="coerce").map_partitions(lambda x: x.dt.time, meta=("time", "object")),
+    "DurationField": lambda x: pd.to_timedelta(x, errors="coerce"),
+    "JSONField": lambda x: x.map_partitions(lambda s: s.apply(json.loads), meta=("json", "object")),
+    "ArrayField": lambda x: x.map_partitions(lambda s: s.apply(eval), meta=("array", "object")),
+    "UUIDField": lambda x: x.astype(str),
+}
+
+sqlalchemy_field_conversion_map_dask: Dict[str, callable] = {
+    String.__name__: lambda x: x.astype(str).fillna(""),
+    Text.__name__: lambda x: x.fillna('').astype(str),
+    Integer.__name__: lambda x: x.fillna(0).astype(int),
+    BigInteger.__name__: lambda x: pd.to_numeric(x, errors="coerce"),
+    SmallInteger.__name__: lambda x: pd.to_numeric(x, errors="coerce"),
+    Float.__name__: lambda x: pd.to_numeric(x, errors="coerce"),
+    Numeric.__name__: lambda x: pd.to_numeric(x, errors="coerce"),
+    Boolean.__name__: lambda x: x.astype(bool),
+    DateTime.__name__: lambda x: pd.to_datetime(x, errors="coerce"),
+    Date.__name__: lambda x: pd.to_datetime(x, errors="coerce").map_partitions(lambda x: x.dt.date, meta=("date", "object")),
+    Time.__name__: lambda x: pd.to_datetime(x, errors="coerce").map_partitions(lambda x: x.dt.time, meta=("time", "object")),
+    JSON.__name__: lambda x: x.map_partitions(lambda s: s.apply(json.loads), meta=("json", "object")),
+    UUID.__name__: lambda x: x.astype(str),
+}
+
+# Conversion map with normalized SQLAlchemy field types
+# sqlalchemy_field_conversion_map_dask: Dict[str, callable] = {
+#     "String": lambda x: x.map_partitions(lambda s: s.astype(str), meta=("string", "string")),
+#     "Text": lambda x: x.map_partitions(lambda s: s.astype(str), meta=("text", "string")),
+#     "Integer": lambda x: pd.to_numeric(x, errors="coerce"),
+#     "SmallInteger": lambda x: pd.to_numeric(x, errors="coerce"),
+#     "BigInteger": lambda x: pd.to_numeric(x, errors="coerce"),
+#     "Float": lambda x: pd.to_numeric(x, errors="coerce"),
+#     "Numeric": lambda x: pd.to_numeric(x, errors="coerce"),
+#     "Boolean": lambda x: x.map_partitions(lambda s: s.fillna(False).astype(bool), meta=("boolean", "bool")),
+#     "DateTime": lambda x: pd.to_datetime(x, errors="coerce"),
+#     "Date": lambda x: pd.to_datetime(x, errors="coerce").map_partitions(lambda s: s.dt.date, meta=("date", "object")),
+#     "Time": lambda x: pd.to_datetime(x, errors="coerce").map_partitions(lambda s: s.dt.time, meta=("time", "object")),
+#     "JSON": lambda x: x.map_partitions(lambda s: s.apply(json.loads), meta=("json", "object")),
+# }
+
+
+def normalize_sqlalchemy_type(field_type):
+    """
+    Normalize SQLAlchemy field types to generic type names.
+    Handles dialect-specific types (e.g., MySQL).
+    """
+    # Map of generic SQLAlchemy types
+    type_mapping = {
+        String: "String",
+        Text: "Text",
+        Integer: "Integer",
+        SmallInteger: "SmallInteger",
+        BigInteger: "BigInteger",
+        Float: "Float",
+        Numeric: "Numeric",
+        Boolean: "Boolean",
+        DateTime: "DateTime",
+        Date: "Date",
+        Time: "Time",
+        JSON: "JSON",
+    }
+
+    # Dialect-specific types
+    dialect_mapping = {
+        TINYINT: "SmallInteger",
+        MEDIUMTEXT: "Text",
+    }
+
+    # Check if the field matches a generic or dialect-specific type
+    for sql_type, name in {**type_mapping, **dialect_mapping}.items():
+        if isinstance(field_type, sql_type):
+            return name
+
+    # Fallback to raw class name
+    return field_type.__class__.__name__
+
+
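normalize_sqlalchemy_type returns generic names such as "String" or "SmallInteger", and those are exactly the keys of sqlalchemy_field_conversion_map_dask (the keys are built from String.__name__ and friends). A minimal sketch, not part of the package, of how the two could be combined; the helper name, model and DataFrame are hypothetical:

# Sketch only: convert_dask_columns is a hypothetical helper, not shipped in sibi_dst.
from sqlalchemy import inspect

def convert_dask_columns(ddf, model):
    """Apply the mapped converter to every model column present in the Dask DataFrame."""
    for column in inspect(model).columns:                        # Mapper.columns, as in infer_dtypes_from_model below
        type_name = normalize_sqlalchemy_type(column.type)       # e.g. VARCHAR -> "String", TINYINT -> "SmallInteger"
        converter = sqlalchemy_field_conversion_map_dask.get(type_name)
        if converter is not None and column.name in ddf.columns:
            ddf[column.name] = converter(ddf[column.name])       # lazy on the Dask Series
    return ddf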
@@ -4,6 +4,7 @@ from django.db.models import Q

 from sibi_dst.df_helper.plugins.django import ReadFrameDask
 from sibi_dst.utils import Logger
+from sibi_dst.df_helper.core import django_field_conversion_map_dask

 class DjangoLoadFromDb:
     df: dd.DataFrame
@@ -26,10 +27,10 @@ class DjangoLoadFromDb:

     def build_and_load(self):
         self.df = self._build_and_load()
-        if self.df is not None:
-            self._process_loaded_data()
+        #self.df = self._convert_columns(self.df)
         return self.df

+
     def _build_and_load(self) -> dd.DataFrame:
         query = self.connection_config.model.objects.using(self.connection_config.connection_name)
         if not self.params_config.filters:
@@ -61,10 +62,60 @@ class DjangoLoadFromDb:
             q_objects.add(~Q(**{key: value}), Q.AND)
         return q_objects

-    def _process_loaded_data(self):
-        field_map = self.params_config.field_map
-        if field_map is not None:
-            rename_mapping = {k: v for k, v in field_map.items() if k in self.df.columns}
-            if rename_mapping:
-                # Apply renaming
-                self.df = self.df.rename(columns=rename_mapping)
+    def _convert_columns(self, df: dd.DataFrame) -> dd.DataFrame:
+        """
+        Convert the data types of columns in a Dask DataFrame based on the field type in the Django model.
+
+        :param df: Dask DataFrame whose columns' data types are to be converted.
+        :return: Dask DataFrame with converted column data types.
+        """
+
+        def log_debug(message: str, is_verbose: bool = False):
+            """Helper to handle debug and verbose debug logging."""
+            if self.debug:
+                self.logger.debug(message)
+            if is_verbose and self.verbose_debug:
+                print(message)
+
+        if self.debug:
+            self.logger.info(f'Converting columns: {list(df.columns)}')
+
+        # Get field information from the Django model
+        model_fields = self.connection_config.model._meta.get_fields()
+        field_type_map = {field.name: type(field).__name__ for field in model_fields}
+        # Simplified loop to apply conversions partition-wise
+        for field_name, field_type in field_type_map.items():
+            if field_name not in df.columns:
+
+                log_debug(f"Column '{field_name}' not found in DataFrame columns.")
+                continue
+
+            conversion_func = django_field_conversion_map_dask.get(field_type)
+            if not conversion_func:
+                message=f"Field type '{field_type}' not found in conversion_map."
+                log_debug(message, is_verbose=True)
+                continue
+
+            def apply_conversion(partition):
+                """
+                Apply the conversion function to a single partition for the given column.
+                """
+                try:
+                    if field_name in partition.columns:
+                        partition[field_name] = conversion_func(partition[field_name])
+                except Exception as e:
+                    self.logger.error(f"Error converting column '{field_name}' in partition: {str(e)}")
+                return partition
+
+            try:
+                # Apply conversion lazily to each partition
+                df = df.map_partitions(
+                    apply_conversion,
+                    meta=df,
+                )
+                log_debug(f"Successfully queued conversion for column '{field_name}' to type '{field_type}'.",
+                          is_verbose=True)
+            except Exception as e:
+                log_debug(f"Failed to queue conversion for column '{field_name}': {str(e)}", is_verbose=True)
+
+        return df
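_convert_columns is wired up but still commented out in build_and_load; the lookup it relies on can nevertheless be exercised directly against django_field_conversion_map_dask. A small sketch with made-up data:

# Sketch only: the DataFrame and column are made up; only the map lookup mirrors the code above.
import dask.dataframe as dd
import pandas as pd
from sibi_dst.df_helper.core import django_field_conversion_map_dask

ddf = dd.from_pandas(pd.DataFrame({"code": [1, 2, 3]}), npartitions=1)
convert = django_field_conversion_map_dask["CharField"]   # lambda x: x.astype(str)
ddf["code"] = convert(ddf["code"])                         # lazy; nothing runs until compute()
print(ddf.compute()["code"].tolist())                      # ['1', '2', '3']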
@@ -13,11 +13,10 @@ class HttpConfig(BaseModel):
     api_key: Optional[SecretStr] = None
     model_config = ConfigDict(arbitrary_types_allowed=True)

-    def __init__(self, **data):
+    def __init__(self, logger=None, **data):
         super().__init__(**data)
         # Initialize the logger if not provided
-        if not self.logger:
-            self.logger = Logger(log_dir='./logs/', logger_name="HttpDataSource", log_file='http_data_source.log')
+        self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)

     async def fetch_data(self, **options) -> dd.DataFrame:
         """Asynchronously fetch JSON data from HTTP endpoint, substituting options into the URL path."""
@@ -1,7 +1,11 @@
 import pandas as pd
 import dask.dataframe as dd
+from sibi_dst.utils import Logger

 class ParquetFilterHandler(object):
+    def __init__(self, logger=None):
+        self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
+
     @staticmethod
     def apply_filters_dask(df, filters):
         dt_operators = ['date', 'time']
@@ -25,6 +25,8 @@ class ParquetConfig(BaseModel):
     @model_validator(mode='after')
     def check_parquet_params(self):
         # Configure paths based on fsspec
+        if self.logger is None:
+            self.logger = Logger.default_logger(logger_name=self.__class__.__name__)
         self.fs = fsspec.filesystem("file") if "://" not in str(self.parquet_storage_path) else fsspec.filesystem(str(self.parquet_storage_path).split("://")[0])

         # Validation for parquet path
@@ -1,143 +1,133 @@
 import itertools
+
 import dask.dataframe as dd
 import pandas as pd
-from sqlalchemy.orm import Query
-from sqlalchemy.inspection import inspect
-
-
-class ReadFrameSqlAlchemy:
-    def __init__(
-        self,
-        query,
-        session,
-        fieldnames=None,
-        index_col=None,
-        coerce_float=False,
-        verbose=True,
-        datetime_index=False,
-        column_names=None,
-        chunk_size=1000,
-    ):
+from sqlalchemy import create_engine, inspect, select
+from sqlalchemy.orm import sessionmaker
+
+from sibi_dst.df_helper.plugins.sql_alchemy._sqlachemy_filter_handler import SqlAlchemyFilterHandler
+from sibi_dst.utils import Logger
+
+
+class SQLAlchemyDask:
+    def __init__(self, model, filters, engine_url, chunk_size=1000, logger=None, verbose=True):
         """
-        Initialize the loader for SQLAlchemy queries.
-
-        Args:
-            query: SQLAlchemy query (ORM or Select).
-            session: SQLAlchemy session for executing the query.
-            fieldnames: Optional list of field names to include in the result.
-            index_col: Column to use as the index of the DataFrame.
-            coerce_float: Attempt to coerce values to float where applicable.
-            verbose: Whether to include verbose processing like handling choices.
-            datetime_index: Whether to convert the index to a datetime index.
-            column_names: Optional mapping of fieldnames to custom column names.
-            chunk_size: Number of records to fetch in each chunk.
+        Initialize with an SQLAlchemy query and database engine URL.
+
+        :param model: SQLAlchemy ORM model.
+        :param filters: Filters to apply on the query.
+        :param engine_url: Database connection string for SQLAlchemy engine.
+        :param chunk_size: Number of records per chunk for Dask partitions.
+        :param logger: Logger instance for logging.
+        :param verbose: Whether to print detailed logs.
         """
-        self.query = query
-        self.session = session
-        self.fieldnames = fieldnames
-        self.index_col = index_col
-        self.coerce_float = coerce_float
-        self.verbose = verbose
-        self.datetime_index = datetime_index
-        self.column_names = column_names
+        self.query = None
+        self.model = model
+        self.filters = filters
         self.chunk_size = chunk_size
+        self.verbose = verbose
+        self.engine = create_engine(engine_url)
+        self.Session = sessionmaker(bind=self.engine)
+        self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)

     @staticmethod
-    def row_to_dict(row, fields=None):
+    def infer_dtypes_from_model(model):
         """
-        Convert a SQLAlchemy result row to a dictionary.
-
-        Args:
-            row: SQLAlchemy ORM object, Row object, or tuple.
-            fields: List of fields to extract.
-
-        Returns:
-            A dictionary representation of the row.
+        Infer data types for Dask DataFrame based on SQLAlchemy ORM model columns.
         """
-        # Handle ORM instances
-        if hasattr(row, "__dict__"):  # For ORM instances
-            data = row.__dict__.copy()
-            data.pop("_sa_instance_state", None)  # Remove SQLAlchemy internal state
-        # Handle SQLAlchemy Row objects
-        elif hasattr(row, "_mapping"):  # For SQLAlchemy result rows
-            data = dict(row._mapping)
-        # Handle tuples (e.g., raw query results)
-        elif isinstance(row, tuple):
-            if fields:
-                data = dict(zip(fields, row))
-            else:
-                raise ValueError("Cannot map tuple row without field names.")
-        else:
-            raise ValueError(f"Unsupported row type: {type(row)}. Expected ORM instance, dict-like object, or tuple.")
-
-        # Filter by specified fields
-        if fields:
-            return {field: data.get(field, None) for field in fields}
-        else:
-            return data
+        mapper = inspect(model)
+        sqlalchemy_to_dask_dtype = {
+            #'INTEGER': pd.to_numeric(x, errors="coerce"),
+            'INTEGER': 'Int64',
+            'SMALLINT': 'Int64',
+            'BIGINT': 'Int64',
+            'FLOAT': 'float64',
+            'NUMERIC': 'float64',
+            'BOOLEAN': 'bool',
+            'VARCHAR': 'object',
+            'TEXT': 'object',
+            'DATE': 'datetime64[ns]',
+            'DATETIME': 'datetime64[ns]',
+            'TIME': 'object',
+            'UUID': 'object',
+        }
+
+        dtypes = {}
+        for column in mapper.columns:
+            dtype = sqlalchemy_to_dask_dtype.get(str(column.type).upper(), 'object')
+            dtypes[column.name] = dtype
+
+        return dtypes

     def read_frame(self, fillna_value=None):
         """
-        Convert the query results to a Dask DataFrame.
-
-        Args:
-            fillna_value: Value to use for filling missing values.
+        Load data from an SQLAlchemy query into a Dask DataFrame.

-        Returns:
-            A Dask DataFrame.
+        :param fillna_value: Value to replace NaN or NULL values with, if any.
+        :return: Dask DataFrame.
         """
-        # Infer fieldnames if not provided
-        if not self.fieldnames:
-            if hasattr(self.query, "selected_columns"):
-                self.fieldnames = [col.key for col in self.query.selected_columns]
-            else:
-                self.fieldnames = [col.name for col in inspect(self.query._entity_zero().class_).columns]
-
-        partitions = []
-        results = self.session.execute(self.query)  # Execute the query
-
-        # Debugging raw results
-        print("Results fetched:", results)
-
-        # Chunk processing
-        iterator = iter(results)
-        while True:
-            chunk = list(itertools.islice(iterator, self.chunk_size))
-            if not chunk:
-                break
-
-            # Convert chunk to DataFrame
-            df = pd.DataFrame.from_records(
-                [self.row_to_dict(row, self.fieldnames) for row in chunk],
-                columns=self.fieldnames,
-                coerce_float=self.coerce_float,
-            )
-
-            # Handle missing values
-            if fillna_value is not None:
-                df = df.fillna(fillna_value)
-
-            # Convert datetime columns to timezone-naive
-            for col in df.columns:
-                if isinstance(df[col].dtype, pd.DatetimeTZDtype):
-                    df[col] = df[col].dt.tz_localize(None)
-
-            partitions.append(dd.from_pandas(df, npartitions=1))
-
-        # Concatenate partitions
-        dask_df = dd.concat(partitions, axis=0, ignore_index=True)
-
-        # Handle index column
-        if self.index_col and self.index_col in dask_df.columns:
-            dask_df = dask_df.set_index(self.index_col)
-
-        # Convert index to datetime if required
-        if self.datetime_index and self.index_col in dask_df.columns:
-            dask_df = dask_df.map_partitions(lambda df: df.set_index(pd.to_datetime(df.index)))
-
-        # Handle column renaming
-        if self.column_names:
-            rename_mapping = dict(zip(self.fieldnames, self.column_names))
-            dask_df = dask_df.rename(columns=rename_mapping)
-
-        return dask_df
+        with self.Session() as session:
+            try:
+                # Build query
+                self.query = select(self.model)
+                if self.filters:
+                    self.query = SqlAlchemyFilterHandler.apply_filters_sqlalchemy(self.query, self.model, self.filters)
+                else:
+                    n_records = 100
+                    self.query = self.query.limit(n_records)
+
+                # Infer dtypes
+                dtypes = self.infer_dtypes_from_model(self.model)
+                # Get the column order from the SQLAlchemy model
+                ordered_columns = [column.name for column in self.model.__table__.columns]
+
+                # Execute query and fetch results in chunks
+                result_proxy = session.execute(self.query)
+                results = result_proxy.scalars().all()  # Fetch all rows
+                iterator = iter(results)
+
+                partitions = []
+
+                while True:
+                    chunk = list(itertools.islice(iterator, self.chunk_size))
+                    if not chunk:
+                        break
+
+                    # Convert chunk to Pandas DataFrame
+                    df = pd.DataFrame.from_records(
+                        [row._asdict() if hasattr(row, '_asdict') else row.__dict__ for row in chunk]
+                    )
+                    # Drop internal SQLAlchemy state if it exists
+                    df = df.loc[:, ~df.columns.str.contains('_sa_instance_state')]
+
+                    # Reorder columns to match the model's order
+                    df = df[ordered_columns]
+
+                    # Fill NaN values
+                    if fillna_value is not None:
+                        df = df.fillna(fillna_value)
+
+                    # Convert timezone-aware columns to naive
+                    for col in df.columns:
+                        if isinstance(df[col].dtype, pd.DatetimeTZDtype):
+                            df[col] = df[col].dt.tz_localize(None)
+
+                    # Apply inferred dtypes
+                    df = df.astype(dtypes)
+                    # Create a Dask partition
+                    partitions.append(dd.from_pandas(df, npartitions=1))
+
+                # Concatenate all partitions
+                # print(partitions)
+                if partitions:
+                    dask_df = dd.concat(partitions, axis=0, ignore_index=True)
+                else:
+                    dask_df = dd.from_pandas(pd.DataFrame(columns=ordered_columns), npartitions=1)
+
+                if self.verbose:
+                    self.logger.info(f"Loaded {len(dask_df)} rows into Dask DataFrame.")
+                return dask_df
+
+            except Exception as e:
+                self.logger.error(f"Error executing query: {str(e)}")
+                return dd.from_pandas(pd.DataFrame(columns=ordered_columns), npartitions=1)
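ReadFrameSqlAlchemy (a query plus an externally managed session) is replaced by SQLAlchemyDask, which builds its own engine and session from a connection URL and applies filters through SqlAlchemyFilterHandler. A hedged usage sketch; the model, connection string and filter spec are placeholders, and the exact import location of SQLAlchemyDask inside the sql_alchemy plugin package is assumed:

# Sketch only: MyModel, the URL and the filter keys are placeholders.
loader = SQLAlchemyDask(
    model=MyModel,                                  # a SQLAlchemy declarative model
    filters={"status": "active"},                   # passed to SqlAlchemyFilterHandler.apply_filters_sqlalchemy
    engine_url="postgresql://user:pass@host/db",    # any SQLAlchemy engine URL
    chunk_size=1000,                                # rows per Dask partition
    logger=None,                                    # falls back to Logger.default_logger(...)
    verbose=True,
)
ddf = loader.read_frame(fillna_value="")            # Dask DataFrame; empty frame with model columns on error

Note that when no filters are given, read_frame caps the query at 100 rows (n_records = 100).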
@@ -50,6 +50,7 @@ class SqlAlchemyFilterHandler:
                 return [datetime.date.fromisoformat(v) for v in value]
             return value

+
         def handle_date_operator(column, date_op):
             """
             Handle filtering on specific datetime parts (e.g., year, month).
@@ -90,6 +91,7 @@ class SqlAlchemyFilterHandler:

             # Get the column from the model
             column = getattr(model, field_name, None)
+            #column = model.__table__.columns.get(field_name)
             if not column:
                 raise AttributeError(f"Field '{field_name}' not found in model '{model.__name__}'")