sibi-dst 0.3.10__py3-none-any.whl → 0.3.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sibi_dst/df_helper/_df_helper.py
@@ -101,6 +101,8 @@ class DfHelper:
 
     def _load_from_sqlalchemy(self, **options):
        try:
+            options.setdefault("debug", self.debug)
+            options.setdefault("verbose_debug", self.verbose_debug)
            db_loader = SqlAlchemyLoadFromDb(
                self.plugin_sqlalchemy,
                self.plugin_query,
@@ -120,6 +122,8 @@ class DfHelper:
 
    def _load_from_db(self, **options) -> Union[pd.DataFrame, dd.DataFrame]:
        try:
+            options.setdefault("debug", self.debug)
+            options.setdefault("verbose_debug", self.verbose_debug)
            db_loader = DjangoLoadFromDb(
                self.plugin_django_connection,
                self.plugin_query,
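Note: both loaders now seed their options the same way: the helper's instance-level flags become defaults, while explicit call-site values still win. A minimal, self-contained sketch of that behavior (the Helper class below is illustrative, not the actual DfHelper API):

class Helper:
    def __init__(self, debug=False, verbose_debug=False):
        self.debug = debug
        self.verbose_debug = verbose_debug

    def _load(self, **options):
        # setdefault only fills keys the caller did not supply.
        options.setdefault("debug", self.debug)
        options.setdefault("verbose_debug", self.verbose_debug)
        return options

helper = Helper(debug=True)
print(helper._load())             # {'debug': True, 'verbose_debug': False}
print(helper._load(debug=False))  # caller's value wins: {'debug': False, 'verbose_debug': False}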
sibi_dst/df_helper/core/__init__.py
@@ -2,10 +2,17 @@ from __future__ import annotations
 
 from ._params_config import ParamsConfig
 from ._query_config import QueryConfig
-from ._defaults import django_field_conversion_map
+from ._defaults import (
+    django_field_conversion_map_pandas,
+    django_field_conversion_map_dask,
+    sqlalchemy_field_conversion_map_dask,
+    normalize_sqlalchemy_type)
 
 __all__ = [
     "ParamsConfig",
     "QueryConfig",
-    "django_field_conversion_map"
+    "django_field_conversion_map_pandas",
+    "django_field_conversion_map_dask",
+    "sqlalchemy_field_conversion_map_dask",
+    "normalize_sqlalchemy_type"
 ]
sibi_dst/df_helper/core/_defaults.py
@@ -1,10 +1,12 @@
 # Copyright (c) 2023. ISTMO Center S.A. All Rights Reserved
 #
 import json
-from dataclasses import dataclass
-from typing import Dict, Union, Optional
+from typing import Dict
 
 import pandas as pd
+from sqlalchemy import String, Text, Integer, BigInteger, SmallInteger, Float, Boolean, DateTime, Date, Time, JSON, \
+    Numeric, UUID
+from sqlalchemy.dialects.mysql import TINYINT, MEDIUMTEXT
 
 # This is the defaults configuration file for the df_helper module.
 
@@ -13,11 +15,12 @@ import pandas as pd
 # conversion. This mapping is used to convert the values in a pandas DataFrame to the appropriate data types based on
 # the Django field type.
 
-django_field_conversion_map: Dict[str, callable] = {
+django_field_conversion_map_pandas: Dict[str, callable] = {
     "CharField": lambda x: x.astype(str),
     "TextField": lambda x: x.astype(str),
     "IntegerField": lambda x: pd.to_numeric(x, errors="coerce"),
     "AutoField": lambda x: pd.to_numeric(x, errors="coerce"),
+    "BigAutoField": lambda x: pd.to_numeric(x, errors="coerce"),
     "BigIntegerField": lambda x: pd.to_numeric(x, errors="coerce"),
     "SmallIntegerField": lambda x: pd.to_numeric(x, errors="coerce"),
     "PositiveIntegerField": lambda x: pd.to_numeric(x, errors="coerce"),
@@ -35,3 +38,96 @@ django_field_conversion_map: Dict[str, callable] = {
     "ArrayField": lambda x: x.apply(eval),
     "UUIDField": lambda x: x.astype(str),
 }
+
+django_field_conversion_map_dask: Dict[str, callable] = {
+    "CharField": lambda x: x.astype(str),
+    "TextField": lambda x: x.astype(str),
+    "IntegerField": lambda x: pd.to_numeric(x, errors="coerce"),
+    "AutoField": lambda x: pd.to_numeric(x, errors="coerce"),
+    "BigAutoField": lambda x: pd.to_numeric(x, errors="coerce"),
+    "BigIntegerField": lambda x: pd.to_numeric(x, errors="coerce"),
+    "SmallIntegerField": lambda x: pd.to_numeric(x, errors="coerce"),
+    "PositiveIntegerField": lambda x: pd.to_numeric(x, errors="coerce"),
+    "PositiveSmallIntegerField": lambda x: pd.to_numeric(x, errors="coerce"),
+    "FloatField": lambda x: pd.to_numeric(x, errors="coerce"),
+    "DecimalField": lambda x: pd.to_numeric(x, errors="coerce"),
+    "BooleanField": lambda x: x.astype(bool),
+    "NullBooleanField": lambda x: x.astype(bool),
+    "DateTimeField": lambda x: pd.to_datetime(x, errors="coerce"),
+    "DateField": lambda x: pd.to_datetime(x, errors="coerce").map_partitions(lambda x: x.dt.date, meta=("date", "object")),
+    "TimeField": lambda x: pd.to_datetime(x, errors="coerce").map_partitions(lambda x: x.dt.time, meta=("time", "object")),
+    "DurationField": lambda x: pd.to_timedelta(x, errors="coerce"),
+    "JSONField": lambda x: x.map_partitions(lambda s: s.apply(json.loads), meta=("json", "object")),
+    "ArrayField": lambda x: x.map_partitions(lambda s: s.apply(eval), meta=("array", "object")),
+    "UUIDField": lambda x: x.astype(str),
+}
+
+sqlalchemy_field_conversion_map_dask: Dict[str, callable] = {
+    String.__name__: lambda x: x.astype(str).fillna(""),
+    Text.__name__: lambda x: x.fillna('').astype(str),
+    Integer.__name__: lambda x: x.fillna(0).astype(int),
+    BigInteger.__name__: lambda x: pd.to_numeric(x, errors="coerce"),
+    SmallInteger.__name__: lambda x: pd.to_numeric(x, errors="coerce"),
+    Float.__name__: lambda x: pd.to_numeric(x, errors="coerce"),
+    Numeric.__name__: lambda x: pd.to_numeric(x, errors="coerce"),
+    Boolean.__name__: lambda x: x.astype(bool),
+    DateTime.__name__: lambda x: pd.to_datetime(x, errors="coerce"),
+    Date.__name__: lambda x: pd.to_datetime(x, errors="coerce").map_partitions(lambda x: x.dt.date, meta=("date", "object")),
+    Time.__name__: lambda x: pd.to_datetime(x, errors="coerce").map_partitions(lambda x: x.dt.time, meta=("time", "object")),
+    JSON.__name__: lambda x: x.map_partitions(lambda s: s.apply(json.loads), meta=("json", "object")),
+    UUID.__name__: lambda x: x.astype(str),
+}
+
+# Conversion map with normalized SQLAlchemy field types
+# sqlalchemy_field_conversion_map_dask: Dict[str, callable] = {
+#     "String": lambda x: x.map_partitions(lambda s: s.astype(str), meta=("string", "string")),
+#     "Text": lambda x: x.map_partitions(lambda s: s.astype(str), meta=("text", "string")),
+#     "Integer": lambda x: pd.to_numeric(x, errors="coerce"),
+#     "SmallInteger": lambda x: pd.to_numeric(x, errors="coerce"),
+#     "BigInteger": lambda x: pd.to_numeric(x, errors="coerce"),
+#     "Float": lambda x: pd.to_numeric(x, errors="coerce"),
+#     "Numeric": lambda x: pd.to_numeric(x, errors="coerce"),
+#     "Boolean": lambda x: x.map_partitions(lambda s: s.fillna(False).astype(bool), meta=("boolean", "bool")),
+#     "DateTime": lambda x: pd.to_datetime(x, errors="coerce"),
+#     "Date": lambda x: pd.to_datetime(x, errors="coerce").map_partitions(lambda s: s.dt.date, meta=("date", "object")),
+#     "Time": lambda x: pd.to_datetime(x, errors="coerce").map_partitions(lambda s: s.dt.time, meta=("time", "object")),
+#     "JSON": lambda x: x.map_partitions(lambda s: s.apply(json.loads), meta=("json", "object")),
+# }
+
+
+def normalize_sqlalchemy_type(field_type):
+    """
+    Normalize SQLAlchemy field types to generic type names.
+    Handles dialect-specific types (e.g., MySQL).
+    """
+    # Map of generic SQLAlchemy types
+    type_mapping = {
+        String: "String",
+        Text: "Text",
+        Integer: "Integer",
+        SmallInteger: "SmallInteger",
+        BigInteger: "BigInteger",
+        Float: "Float",
+        Numeric: "Numeric",
+        Boolean: "Boolean",
+        DateTime: "DateTime",
+        Date: "Date",
+        Time: "Time",
+        JSON: "JSON",
+    }
+
+    # Dialect-specific types
+    dialect_mapping = {
+        TINYINT: "SmallInteger",
+        MEDIUMTEXT: "Text",
+    }
+
+    # Check if the field matches a generic or dialect-specific type
+    for sql_type, name in {**type_mapping, **dialect_mapping}.items():
+        if isinstance(field_type, sql_type):
+            return name
+
+    # Fallback to raw class name
+    return field_type.__class__.__name__
+
+
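Note: a hedged sketch of how these new pieces fit together. normalize_sqlalchemy_type maps a column's type object to the generic name that keys sqlalchemy_field_conversion_map_dask; the type instances below are constructed purely for illustration:

from sqlalchemy import Integer, String

from sibi_dst.df_helper.core import normalize_sqlalchemy_type, sqlalchemy_field_conversion_map_dask

# Generic types resolve to their own names.
print(normalize_sqlalchemy_type(Integer()))   # "Integer"
print(normalize_sqlalchemy_type(String(50)))  # "String"

# Types absent from both mappings fall back to the raw class name.
class CustomType: ...
print(normalize_sqlalchemy_type(CustomType()))  # "CustomType"

# The normalized name then keys the Dask conversion map:
func = sqlalchemy_field_conversion_map_dask[normalize_sqlalchemy_type(String(50))]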
sibi_dst/df_helper/plugins/django/_django_load_from_db.py
@@ -4,6 +4,7 @@ from django.db.models import Q
 
 from sibi_dst.df_helper.plugins.django import ReadFrameDask
 from sibi_dst.utils import Logger
+from sibi_dst.df_helper.core import django_field_conversion_map_dask
 
 class DjangoLoadFromDb:
     df: dd.DataFrame
@@ -26,10 +27,10 @@ class DjangoLoadFromDb:
 
     def build_and_load(self):
         self.df = self._build_and_load()
-        if self.df is not None:
-            self._process_loaded_data()
+        #self.df = self._convert_columns(self.df)
         return self.df
 
+
     def _build_and_load(self) -> dd.DataFrame:
         query = self.connection_config.model.objects.using(self.connection_config.connection_name)
         if not self.params_config.filters:
@@ -61,10 +62,60 @@ class DjangoLoadFromDb:
             q_objects.add(~Q(**{key: value}), Q.AND)
         return q_objects
 
-    def _process_loaded_data(self):
-        field_map = self.params_config.field_map
-        if field_map is not None:
-            rename_mapping = {k: v for k, v in field_map.items() if k in self.df.columns}
-            if rename_mapping:
-                # Apply renaming
-                self.df = self.df.rename(columns=rename_mapping)
+    def _convert_columns(self, df: dd.DataFrame) -> dd.DataFrame:
+        """
+        Convert the data types of columns in a Dask DataFrame based on the field type in the Django model.
+
+        :param df: Dask DataFrame whose columns' data types are to be converted.
+        :return: Dask DataFrame with converted column data types.
+        """
+
+        def log_debug(message: str, is_verbose: bool = False):
+            """Helper to handle debug and verbose debug logging."""
+            if self.debug:
+                self.logger.debug(message)
+            if is_verbose and self.verbose_debug:
+                print(message)
+
+        if self.debug:
+            self.logger.info(f'Converting columns: {list(df.columns)}')
+
+        # Get field information from the Django model
+        model_fields = self.connection_config.model._meta.get_fields()
+        field_type_map = {field.name: type(field).__name__ for field in model_fields}
+        # Simplified loop to apply conversions partition-wise
+        for field_name, field_type in field_type_map.items():
+            if field_name not in df.columns:
+                log_debug(f"Column '{field_name}' not found in DataFrame columns.")
+                continue
+
+            conversion_func = django_field_conversion_map_dask.get(field_type)
+            if not conversion_func:
+                message = f"Field type '{field_type}' not found in conversion_map."
+                log_debug(message, is_verbose=True)
+                continue
+
+            def apply_conversion(partition):
+                """
+                Apply the conversion function to a single partition for the given column.
+                """
+                try:
+                    if field_name in partition.columns:
+                        partition[field_name] = conversion_func(partition[field_name])
+                except Exception as e:
+                    self.logger.error(f"Error converting column '{field_name}' in partition: {str(e)}")
+                return partition
+
+            try:
+                # Apply conversion lazily to each partition
+                df = df.map_partitions(
+                    apply_conversion,
+                    meta=df,
+                )
+                log_debug(f"Successfully queued conversion for column '{field_name}' to type '{field_type}'.",
+                          is_verbose=True)
+            except Exception as e:
+                log_debug(f"Failed to queue conversion for column '{field_name}': {str(e)}", is_verbose=True)
+
+        return df
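Note: the entries in django_field_conversion_map_dask are plain callables over a pandas Series; _convert_columns runs them inside each partition via map_partitions, so conversions stay lazy. A minimal sketch of that per-partition pattern (the sample frame is invented):

import dask.dataframe as dd
import pandas as pd

from sibi_dst.df_helper.core import django_field_conversion_map_dask

pdf = pd.DataFrame({"count": ["3", "x", None]})
ddf = dd.from_pandas(pdf, npartitions=1)

# "IntegerField" maps to pd.to_numeric(..., errors="coerce"); the callable
# receives the pandas Series of each partition, as in _convert_columns.
convert = django_field_conversion_map_dask["IntegerField"]
ddf["count"] = ddf["count"].map_partitions(convert, meta=("count", "f8"))

print(ddf.compute())  # 3.0, NaN, NaN -- invalid values coerced lazily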
sibi_dst/df_helper/plugins/sql_alchemy/_io_sqlalchemy_dask.py
@@ -1,143 +1,133 @@
 import itertools
+
 import dask.dataframe as dd
 import pandas as pd
-from sqlalchemy.orm import Query
-from sqlalchemy.inspection import inspect
-
-
-class ReadFrameSqlAlchemy:
-    def __init__(
-        self,
-        query,
-        session,
-        fieldnames=None,
-        index_col=None,
-        coerce_float=False,
-        verbose=True,
-        datetime_index=False,
-        column_names=None,
-        chunk_size=1000,
-    ):
+from sqlalchemy import create_engine, inspect, select
+from sqlalchemy.orm import sessionmaker
+
+from sibi_dst.df_helper.plugins.sql_alchemy._sqlachemy_filter_handler import SqlAlchemyFilterHandler
+from sibi_dst.utils import Logger
+
+
+class SQLAlchemyDask:
+    def __init__(self, model, filters, engine_url, chunk_size=1000, logger=None, verbose=True):
         """
-        Initialize the loader for SQLAlchemy queries.
-
-        Args:
-            query: SQLAlchemy query (ORM or Select).
-            session: SQLAlchemy session for executing the query.
-            fieldnames: Optional list of field names to include in the result.
-            index_col: Column to use as the index of the DataFrame.
-            coerce_float: Attempt to coerce values to float where applicable.
-            verbose: Whether to include verbose processing like handling choices.
-            datetime_index: Whether to convert the index to a datetime index.
-            column_names: Optional mapping of fieldnames to custom column names.
-            chunk_size: Number of records to fetch in each chunk.
+        Initialize with an SQLAlchemy query and database engine URL.
+
+        :param model: SQLAlchemy ORM model.
+        :param filters: Filters to apply on the query.
+        :param engine_url: Database connection string for SQLAlchemy engine.
+        :param chunk_size: Number of records per chunk for Dask partitions.
+        :param logger: Logger instance for logging.
+        :param verbose: Whether to print detailed logs.
         """
-        self.query = query
-        self.session = session
-        self.fieldnames = fieldnames
-        self.index_col = index_col
-        self.coerce_float = coerce_float
-        self.verbose = verbose
-        self.datetime_index = datetime_index
-        self.column_names = column_names
+        self.query = None
+        self.model = model
+        self.filters = filters
         self.chunk_size = chunk_size
+        self.verbose = verbose
+        self.engine = create_engine(engine_url)
+        self.Session = sessionmaker(bind=self.engine)
+        self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
 
     @staticmethod
-    def row_to_dict(row, fields=None):
+    def infer_dtypes_from_model(model):
         """
-        Convert a SQLAlchemy result row to a dictionary.
-
-        Args:
-            row: SQLAlchemy ORM object, Row object, or tuple.
-            fields: List of fields to extract.
-
-        Returns:
-            A dictionary representation of the row.
+        Infer data types for Dask DataFrame based on SQLAlchemy ORM model columns.
         """
-        # Handle ORM instances
-        if hasattr(row, "__dict__"):  # For ORM instances
-            data = row.__dict__.copy()
-            data.pop("_sa_instance_state", None)  # Remove SQLAlchemy internal state
-        # Handle SQLAlchemy Row objects
-        elif hasattr(row, "_mapping"):  # For SQLAlchemy result rows
-            data = dict(row._mapping)
-        # Handle tuples (e.g., raw query results)
-        elif isinstance(row, tuple):
-            if fields:
-                data = dict(zip(fields, row))
-            else:
-                raise ValueError("Cannot map tuple row without field names.")
-        else:
-            raise ValueError(f"Unsupported row type: {type(row)}. Expected ORM instance, dict-like object, or tuple.")
-
-        # Filter by specified fields
-        if fields:
-            return {field: data.get(field, None) for field in fields}
-        else:
-            return data
+        mapper = inspect(model)
+        sqlalchemy_to_dask_dtype = {
+            #'INTEGER': pd.to_numeric(x, errors="coerce"),
+            'INTEGER': 'Int64',
+            'SMALLINT': 'Int64',
+            'BIGINT': 'Int64',
+            'FLOAT': 'float64',
+            'NUMERIC': 'float64',
+            'BOOLEAN': 'bool',
+            'VARCHAR': 'object',
+            'TEXT': 'object',
+            'DATE': 'datetime64[ns]',
+            'DATETIME': 'datetime64[ns]',
+            'TIME': 'object',
+            'UUID': 'object',
+        }
+
+        dtypes = {}
+        for column in mapper.columns:
+            dtype = sqlalchemy_to_dask_dtype.get(str(column.type).upper(), 'object')
+            dtypes[column.name] = dtype
+
+        return dtypes
 
     def read_frame(self, fillna_value=None):
         """
-        Convert the query results to a Dask DataFrame.
-
-        Args:
-            fillna_value: Value to use for filling missing values.
+        Load data from an SQLAlchemy query into a Dask DataFrame.
 
-        Returns:
-            A Dask DataFrame.
+        :param fillna_value: Value to replace NaN or NULL values with, if any.
+        :return: Dask DataFrame.
         """
-        # Infer fieldnames if not provided
-        if not self.fieldnames:
-            if hasattr(self.query, "selected_columns"):
-                self.fieldnames = [col.key for col in self.query.selected_columns]
-            else:
-                self.fieldnames = [col.name for col in inspect(self.query._entity_zero().class_).columns]
-
-        partitions = []
-        results = self.session.execute(self.query)  # Execute the query
-
-        # Debugging raw results
-        print("Results fetched:", results)
-
-        # Chunk processing
-        iterator = iter(results)
-        while True:
-            chunk = list(itertools.islice(iterator, self.chunk_size))
-            if not chunk:
-                break
-
-            # Convert chunk to DataFrame
-            df = pd.DataFrame.from_records(
-                [self.row_to_dict(row, self.fieldnames) for row in chunk],
-                columns=self.fieldnames,
-                coerce_float=self.coerce_float,
-            )
-
-            # Handle missing values
-            if fillna_value is not None:
-                df = df.fillna(fillna_value)
-
-            # Convert datetime columns to timezone-naive
-            for col in df.columns:
-                if isinstance(df[col].dtype, pd.DatetimeTZDtype):
-                    df[col] = df[col].dt.tz_localize(None)
-
-            partitions.append(dd.from_pandas(df, npartitions=1))
-
-        # Concatenate partitions
-        dask_df = dd.concat(partitions, axis=0, ignore_index=True)
-
-        # Handle index column
-        if self.index_col and self.index_col in dask_df.columns:
-            dask_df = dask_df.set_index(self.index_col)
-
-        # Convert index to datetime if required
-        if self.datetime_index and self.index_col in dask_df.columns:
-            dask_df = dask_df.map_partitions(lambda df: df.set_index(pd.to_datetime(df.index)))
-
-        # Handle column renaming
-        if self.column_names:
-            rename_mapping = dict(zip(self.fieldnames, self.column_names))
-            dask_df = dask_df.rename(columns=rename_mapping)
-
-        return dask_df
+        with self.Session() as session:
+            try:
+                # Build query
+                self.query = select(self.model)
+                if self.filters:
+                    self.query = SqlAlchemyFilterHandler.apply_filters_sqlalchemy(self.query, self.model, self.filters)
+                else:
+                    n_records = 100
+                    self.query = self.query.limit(n_records)
+
+                # Infer dtypes
+                dtypes = self.infer_dtypes_from_model(self.model)
+                # Get the column order from the SQLAlchemy model
+                ordered_columns = [column.name for column in self.model.__table__.columns]
+
+                # Execute query and fetch results in chunks
+                result_proxy = session.execute(self.query)
+                results = result_proxy.scalars().all()  # Fetch all rows
+                iterator = iter(results)
+
+                partitions = []
+
+                while True:
+                    chunk = list(itertools.islice(iterator, self.chunk_size))
+                    if not chunk:
+                        break
+
+                    # Convert chunk to Pandas DataFrame
+                    df = pd.DataFrame.from_records(
+                        [row._asdict() if hasattr(row, '_asdict') else row.__dict__ for row in chunk]
+                    )
+                    # Drop internal SQLAlchemy state if it exists
+                    df = df.loc[:, ~df.columns.str.contains('_sa_instance_state')]
+
+                    # Reorder columns to match the model's order
+                    df = df[ordered_columns]
+
+                    # Fill NaN values
+                    if fillna_value is not None:
+                        df = df.fillna(fillna_value)
+
+                    # Convert timezone-aware columns to naive
+                    for col in df.columns:
+                        if isinstance(df[col].dtype, pd.DatetimeTZDtype):
+                            df[col] = df[col].dt.tz_localize(None)
+
+                    # Apply inferred dtypes
+                    df = df.astype(dtypes)
+                    # Create a Dask partition
+                    partitions.append(dd.from_pandas(df, npartitions=1))
+
+                # Concatenate all partitions
+                # print(partitions)
+                if partitions:
+                    dask_df = dd.concat(partitions, axis=0, ignore_index=True)
+                else:
+                    dask_df = dd.from_pandas(pd.DataFrame(columns=ordered_columns), npartitions=1)
+
+                if self.verbose:
+                    self.logger.info(f"Loaded {len(dask_df)} rows into Dask DataFrame.")
+                return dask_df
+
+            except Exception as e:
+                self.logger.error(f"Error executing query: {str(e)}")
+                return dd.from_pandas(pd.DataFrame(columns=ordered_columns), npartitions=1)
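Note: for context, a hedged usage sketch of the new loader. The Customer model and SQLite URL are placeholders; the constructor arguments follow the signature shown above:

from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column

from sibi_dst.df_helper.plugins.sql_alchemy._io_sqlalchemy_dask import SQLAlchemyDask

class Base(DeclarativeBase):
    pass

class Customer(Base):  # hypothetical model, only to show the call shape
    __tablename__ = "customer"
    id: Mapped[int] = mapped_column(primary_key=True)
    name: Mapped[str]

reader = SQLAlchemyDask(
    model=Customer,
    filters={},                        # falsy filters -> the query is capped at 100 rows
    engine_url="sqlite:///example.db",
    chunk_size=1000,                   # rows per Dask partition
    verbose=True,
)
ddf = reader.read_frame(fillna_value=0)  # returns a Dask DataFrame
print(ddf.head())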
sibi_dst/df_helper/plugins/sql_alchemy/_sqlachemy_filter_handler.py
@@ -50,6 +50,7 @@ class SqlAlchemyFilterHandler:
             return [datetime.date.fromisoformat(v) for v in value]
         return value
 
+
     def handle_date_operator(column, date_op):
         """
         Handle filtering on specific datetime parts (e.g., year, month).
@@ -90,6 +91,7 @@ class SqlAlchemyFilterHandler:
 
         # Get the column from the model
         column = getattr(model, field_name, None)
+        #column = model.__table__.columns.get(field_name)
         if not column:
             raise AttributeError(f"Field '{field_name}' not found in model '{model.__name__}'")
 
sibi_dst/df_helper/plugins/sql_alchemy/_sqlalchemy_load_from_db.py
@@ -1,23 +1,30 @@
+from typing import Dict
+
 import dask.dataframe as dd
-from sqlmodel import Session, select
-from typing import Any, Dict, Optional
-import logging
 import pandas as pd
+from sqlalchemy.inspection import inspect
+from sqlalchemy.orm import sessionmaker
+from sqlalchemy import select
+#from sqlmodel import Session, select
+
+from sibi_dst.df_helper.core import ParamsConfig, QueryConfig, sqlalchemy_field_conversion_map_dask, \
+    normalize_sqlalchemy_type
 from sibi_dst.utils import Logger
-from ._sqlalchemy_db_connection import SqlAlchemyConnectionConfig
-from sibi_dst.df_helper.core import ParamsConfig, QueryConfig
+from ._io_sqlalchemy_dask import SQLAlchemyDask
 from ._sqlachemy_filter_handler import SqlAlchemyFilterHandler
+from ._sqlalchemy_db_connection import SqlAlchemyConnectionConfig
+
 
 class SqlAlchemyLoadFromDb:
     df: dd.DataFrame
 
     def __init__(
-        self,
-        plugin_sqlalchemy: SqlAlchemyConnectionConfig,  # Expected to be an instance of SqlAlchemyConnection
-        plugin_query: QueryConfig = None,
-        plugin_params: ParamsConfig = None,
-        logger: Logger = None,
-        **kwargs,
+            self,
+            plugin_sqlalchemy: SqlAlchemyConnectionConfig,  # Expected to be an instance of SqlAlchemyConnection
+            plugin_query: QueryConfig = None,
+            plugin_params: ParamsConfig = None,
+            logger: Logger = None,
+            **kwargs,
     ):
         """
         Initialize the loader with database connection, query, and parameters.
@@ -27,7 +34,7 @@ class SqlAlchemyLoadFromDb:
         self.model = self.db_connection.model
         self.engine = self.db_connection.engine
         self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
-        self.query_config = plugin_query 
+        self.query_config = plugin_query
         self.params_config = plugin_params
         self.debug = kwargs.pop("debug", False)
         self.verbose_debug = kwargs.pop("verbose_debug", False)
@@ -40,44 +47,13 @@ class SqlAlchemyLoadFromDb:
         return self.df
 
     def _build_and_load(self) -> dd.DataFrame:
-        """
-        Query the database and load results into a Dask DataFrame.
-        """
-        with Session(self.engine) as session:
-            try:
-                query = select(self.model)
-                filters = self.params_config.filters
-                if filters:
-                    n_records = 0
-                    query = SqlAlchemyFilterHandler.apply_filters_sqlalchemy(query, self.model, self.params_config.filters)
-                else:
-                    n_records = self.query_config.n_records or 100
-
-                if n_records:
-                    query = query.limit(n_records)
-
-                # Debug: Log the SQL query
-                if self.debug:
-                    self.logger.info(f"Executing query: {str(query)}")
-
-                # Execute the query
-                try:
-                    results = session.exec(query).fetchall()
-                    if results:
-                        records = [
-                            {key: getattr(result, key) for key in result.__table__.columns.keys()}
-                            for result in results
-                        ]
-                        df = dd.from_pandas(pd.DataFrame(records), npartitions=1)
-                except Exception as e:
-                    self.logger.info(results)
-                    self.logger.warning("Query returned no results.")
-                    df = dd.from_pandas(pd.DataFrame(), npartitions=1)
-
-            except Exception as e:
-                print(query)
-                self.logger.error(f"Error loading data: {e}")
-                df = dd.from_pandas(pd.DataFrame(), npartitions=1)
-
+        try:
+            reader = SQLAlchemyDask(model=self.model, filters=self.params_config.filters, engine_url=self.engine.url, logger=self.logger, chunk_size=1000, verbose=self.debug)
+            df = reader.read_frame()
+            if df is None or len(df.index) == 0:
+                self.logger.warning("Query returned no results.")
+                return dd.from_pandas(pd.DataFrame(), npartitions=1)
             return df
-
+        except Exception as e:
+            self.logger.error(f"Failed to load data into Dask DataFrame.{e}")
+            return dd.from_pandas(pd.DataFrame(), npartitions=1)
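Note: the net effect of this refactor is that _build_and_load no longer executes SQL itself; it delegates to SQLAlchemyDask and normalizes any failure or empty result to an empty Dask DataFrame. A rough call-shape sketch (the three plugin objects are placeholders for configured instances):

loader = SqlAlchemyLoadFromDb(
    plugin_sqlalchemy=connection_config,  # a configured SqlAlchemyConnectionConfig
    plugin_query=query_config,            # a QueryConfig
    plugin_params=params_config,          # a ParamsConfig carrying .filters
    debug=True,                           # consumed via kwargs.pop("debug", False)
)
df = loader.build_and_load()  # always a Dask DataFrame, empty on error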
sibi_dst/utils/_data_utils.py
@@ -8,7 +8,7 @@ class DataUtils:
         self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
 
     @staticmethod
-    def transform_numeric_columns(df, fill_value=0, transform_func=None):
+    def transform_numeric_columns(df, columns=None, fill_value=0, transform_func=None):
         """
         Transform numeric columns in a DataFrame (Pandas or Dask), handling missing values and applying optional transformations.
 
@@ -21,10 +21,11 @@ class DataUtils:
         Returns:
         - pandas.DataFrame or dask.dataframe.DataFrame: Updated DataFrame with transformed numeric columns.
         """
-        # Detect numeric columns
-        numeric_columns = df.select_dtypes(include=['number']).columns.tolist()
+        if columns is None:
+            # Detect numeric columns
+            columns = df.select_dtypes(include=['number']).columns.tolist()
 
-        if not numeric_columns:
+        if not columns:
             return df
 
         # Default transformation function (identity) if none is provided
@@ -32,7 +33,7 @@ class DataUtils:
             transform_func = lambda x: x
 
         # Apply transformations
-        for col in numeric_columns:
+        for col in columns:
             dtype = df[col].dtype
             if pd.api.types.is_integer_dtype(dtype):
                 meta_type = 'int64'
@@ -51,38 +52,36 @@ class DataUtils:
         return df
 
     @staticmethod
-    def transform_boolean_columns(df, threshold=1):
+    def transform_boolean_columns(df, columns=None, sample_size=100):
         """
-        Transform boolean-like columns in a DataFrame (Pandas or Dask) to actual booleans.
+        Detect if the provided columns in a DataFrame (Pandas or Dask) contain only 0 and 1
+        and convert them to boolean. Detection is performed using a sample.
 
         Parameters:
         - df (pandas.DataFrame or dask.dataframe.DataFrame): The DataFrame.
-        - threshold (int or float): The value to evaluate as `True`.
+        - columns (list of str): List of columns to check and transform.
+        - sample_size (int): Number of rows to sample for detection. Ignored for Pandas DataFrames.
 
         Returns:
         - pandas.DataFrame or dask.dataframe.DataFrame: Updated DataFrame with transformed boolean columns.
         """
+        # Apply transformation to each specified column
+        for col in columns:
+            if col in df.columns:
+                if isinstance(df, dd.DataFrame):
+                    # Replace NaN with 0, then convert to boolean
+                    df[col] = df[col].map_partitions(
+                        lambda s: pd.to_numeric(s, errors='coerce')  # Convert to numeric, invalid to NaN
+                        .fillna(0)  # Replace NaN with 0
+                        .astype(int)  # Ensure integer type
+                        .astype(bool),  # Convert to boolean
+                        meta=(col, 'bool')
+                    )
+                else:
+                    # For Pandas DataFrame, handle mixed types and invalid values
+                    df[col] = pd.to_numeric(df[col], errors='coerce')  # Convert to numeric, invalid to NaN
+                    df[col] = df[col].fillna(0).astype(int).astype(bool)
 
-        def is_boolean_like(col):
-            """
-            Check if a column is boolean-like (contains only two unique values).
-            """
-            unique_values = col.dropna().unique()
-            if isinstance(col, dd.Series):
-                unique_values = unique_values.compute()
-            return len(unique_values) <= 2 and set(unique_values).issubset({0, 1, True, False})
-
-        # Detect boolean-like columns
-        boolean_columns = [col for col in df.columns if is_boolean_like(df[col])]
-
-        # Apply transformation to each detected column
-        for col in boolean_columns:
-            if isinstance(df, dd.DataFrame):
-                df[col] = df[col].map_partitions(
-                    lambda s: s == threshold, meta=(col, 'bool')
-                )
-            else:
-                df[col] = df[col] == threshold
 
         return df
 
     def merge_lookup_data(self, classname, df, **kwargs):
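Note: callers of the reworked helper now name the columns to coerce instead of relying on automatic detection, which previously computed unique values across the whole Dask frame. A brief usage sketch (sample data invented; the import path follows the RECORD listing below):

import dask.dataframe as dd
import pandas as pd

from sibi_dst.utils._data_utils import DataUtils  # module path per the RECORD entries

pdf = pd.DataFrame({"active": [1, 0, None, "1"]})
ddf = dd.from_pandas(pdf, npartitions=2)

# Explicit column list; NaN and the string "1" are coerced via to_numeric -> int -> bool.
ddf = DataUtils.transform_boolean_columns(ddf, columns=["active"])
print(ddf.compute())  # active: True, False, False, True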
sibi_dst-0.3.10.dist-info/METADATA → sibi_dst-0.3.11.dist-info/METADATA
@@ -1,7 +1,7 @@
 Metadata-Version: 2.1
 Name: sibi-dst
-Version: 0.3.10
-Summary: 
+Version: 0.3.11
+Summary: Data Science Toolkit
 Author: Luis Valverde
 Author-email: lvalverdeb@gmail.com
 Requires-Python: >=3.11,<4.0
sibi_dst-0.3.10.dist-info/RECORD → sibi_dst-0.3.11.dist-info/RECORD
@@ -1,15 +1,15 @@
 sibi_dst/__init__.py,sha256=1KaC0LYTHxjpENq-NXI325WcEYZ8GCBrHGkLoFxEcu0,251
 sibi_dst/df_helper/__init__.py,sha256=JXJBY47G6wOYhzNI646OBl3pSGWIy4282-3qPGYHU7w,167
-sibi_dst/df_helper/_df_helper.py,sha256=Pvu1kByZhUCAY9LGKFrcyasTq1MeeIBeMoeCgScStPM,12507
+sibi_dst/df_helper/_df_helper.py,sha256=e-ptCEDYt5dx8byNiA0ca8Eejl1DG1V5pioZUzabEnY,12747
 sibi_dst/df_helper/_parquet_artifact.py,sha256=f5oHwXtsNW6-ONSFsRB0AniVefA0THzP92J-nugp9vo,4973
-sibi_dst/df_helper/core/__init__.py,sha256=UXGUGGSjjrcJRrs25zPV-xgJoyYy1WjVQAExcJDWgV0,254
-sibi_dst/df_helper/core/_defaults.py,sha256=AVNT_Vk8K7dLKOnPX_-Cygi-Nuku65CIn0baE0Wn6dI,1877
+sibi_dst/df_helper/core/__init__.py,sha256=NSYY_evzq6XEkO06Nz6xLH5KznzRGI44cLbrnN3zHXQ,503
+sibi_dst/df_helper/core/_defaults.py,sha256=pJU-lX7w4nrt0Anx35j08mVr_0oMGn1bTA_iCl_p1qI,6700
 sibi_dst/df_helper/core/_params_config.py,sha256=hO-PddoaGjFebqJFgtn76WwVHcCjzPW3z5i3NyK6mDw,3475
 sibi_dst/df_helper/core/_query_config.py,sha256=HEiyR_fBJjIMum-PSQroY3KaefQ2SpW1w1SQS8oT-NU,489
 sibi_dst/df_helper/plugins/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sibi_dst/df_helper/plugins/django/__init__.py,sha256=VkvYql-PUyCKOfoKx5aGdUAki8X-kULfhnCvgSQzHDQ,311
 sibi_dst/df_helper/plugins/django/_django_db_connection.py,sha256=9fGvXXgqPk_TC7BvaSljxTgNDWoXi_LZxKabEgExznM,1654
-sibi_dst/df_helper/plugins/django/_django_load_from_db.py,sha256=ZWVwJCJY7xmlZwDCZG3vNlEMyTGKJ8CoEtwgKYX0ofQ,2918
+sibi_dst/df_helper/plugins/django/_django_load_from_db.py,sha256=NrkBb5LM1A_vo3wAotqj2sVVYIuTfFGrQqIXk3xOoDs,5162
 sibi_dst/df_helper/plugins/django/_django_sql_model_builder.py,sha256=GprCh2c6PFlRBUCir4hh8pmf4Eqb-4OIK6Vz_xXQjMw,14864
 sibi_dst/df_helper/plugins/django/_io_dask.py,sha256=jryDojeA62rB3seRaWWMjsAmekKacK5xctwCQGVklPQ,9063
 sibi_dst/df_helper/plugins/django/_io_dask_alt.py,sha256=zDjLyYxBeL0ffn3yfE_7vqMLMpeEEk2o-zMr66sKkDw,6827
@@ -19,10 +19,10 @@ sibi_dst/df_helper/plugins/parquet/__init__.py,sha256=ClkyIsIh_ovEwqm0dTrkXImbPj
 sibi_dst/df_helper/plugins/parquet/_parquet_filter_handler.py,sha256=45mHID1azAg5PmaYWbuRlghoRd3H2aTLj1XcycfLJo0,3497
 sibi_dst/df_helper/plugins/parquet/_parquet_options.py,sha256=cKyRj0UCby9-iYPPFnlel1H03x8MnAoEv8k1tp7kHXw,4277
 sibi_dst/df_helper/plugins/sql_alchemy/__init__.py,sha256=FHorj40SbHc0OBzQ_ieG6MG-HLbf0tw6I_5eoIjJkOI,369
-sibi_dst/df_helper/plugins/sql_alchemy/_io_sqlalchemy_dask.py,sha256=ET9cih0Frc5izMOsdvNlLhjJWtUQbwZhRtsdo5dRckQ,5059
-sibi_dst/df_helper/plugins/sql_alchemy/_sqlachemy_filter_handler.py,sha256=UXcZ1shS7shcjkSqIIduAnb1Lhzc6pZ6NEcbkcnwgWk,4606
+sibi_dst/df_helper/plugins/sql_alchemy/_io_sqlalchemy_dask.py,sha256=6IjQEREXqTAzSJE95FKfXjRkTlEjRMS4hJ_yMpyKDTg,5223
+sibi_dst/df_helper/plugins/sql_alchemy/_sqlachemy_filter_handler.py,sha256=H8ypUjLKzYYl9BerfJjX_Uv9qBVkBR-wZiQlh3uRQXg,4669
 sibi_dst/df_helper/plugins/sql_alchemy/_sqlalchemy_db_connection.py,sha256=HtMsfH5com4dLVJxh3wdMUpQI3mz0cKDJz0CmFS2S8U,1648
-sibi_dst/df_helper/plugins/sql_alchemy/_sqlalchemy_load_from_db.py,sha256=f1zqBISVn8OjZJs8hu6IvRZSwMX7_DIZMIbhxV6uV80,3179
+sibi_dst/df_helper/plugins/sql_alchemy/_sqlalchemy_load_from_db.py,sha256=myrtEzK12DvA73x7QFaqXFb_TxOPMrsVj-mxYHJD2dg,2371
 sibi_dst/df_helper/plugins/sql_alchemy/_sqlalchemy_model_builder.py,sha256=vrTTeFcrf8sFpzqLlQfspjduKuR7Wgn7mDPUASsQs6s,4355
 sibi_dst/df_helper/plugins/sql_model/__init__.py,sha256=MXd4OOdTqR4cENSV733SGodPO6eQMCexANs-3w0qL5U,226
 sibi_dst/df_helper/plugins/sql_model/_sqlmodel_db_connection.py,sha256=6jmMjKIv5Btysj3kZMaXQ98IqKQkhnOC-JWtb1B8rus,4265
@@ -31,7 +31,7 @@ sibi_dst/utils/__init__.py,sha256=jiXJSnmsaGZTRhUThtIo6cssWXBWXNij8ffYmv77QK4,79
 sibi_dst/utils/_airflow_manager.py,sha256=rlt3eolR5QvtxWhAtBTCpHXvxftnKM-ibPMv3fVwNZk,7524
 sibi_dst/utils/_clickhouse_writer.py,sha256=mdgszbyVluhGvDmvsHY4XDTZrp42L3xtdmiyn3z2bYM,8534
 sibi_dst/utils/_credentials.py,sha256=8i6z7y3y5S-6mSk4xrT2AwhzCA32mTn1n1iYX9IVyHk,1724
-sibi_dst/utils/_data_utils.py,sha256=XB0xjW2LbCmoZjgDbNQQpWaf4upmSoTXeJZ3QMVqbsQ,7056
+sibi_dst/utils/_data_utils.py,sha256=3hBMg852ANpS5bOtlU-F4H-Q91WIGga5LrKWWyDvnAA,7354
 sibi_dst/utils/_data_wrapper.py,sha256=pZnylBFTvsLGfYGv2tTyQHzyb6IbIahfaXR-PxHdivk,24099
 sibi_dst/utils/_date_utils.py,sha256=6HCrcTiuYLNsbgrNB3eAVAAgXbfx7Ce1qNc3OJla9nM,5621
 sibi_dst/utils/_df_utils.py,sha256=o2bK5-xMGKqIG4i9xfavYRxIkiHLA0nz5TQTN78998k,7350
@@ -40,6 +40,6 @@ sibi_dst/utils/_filepath_generator.py,sha256=ytPSZ9GYOnnSP25zwA-0NjFHupPRZyXwixW
 sibi_dst/utils/_log_utils.py,sha256=AAenyubYUjk77WqiaNkjgkxws3dnAMIdaGl2Ryz_cA4,2245
 sibi_dst/utils/_parquet_saver.py,sha256=-A0o_vucyYe7wlwiby_0_yS-ZfT2GHwImyQHrCIBNwk,9051
 sibi_dst/utils/_storage_manager.py,sha256=KP2HBXnLUMMquqcO30ecfuoU7g1z8RtaV3Dv0TvEXoY,3856
-sibi_dst-0.3.10.dist-info/METADATA,sha256=lrVYU1PPBuHQrEDl_-SURTkE0ip_0xWsJc58AiihHZs,1877
-sibi_dst-0.3.10.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
-sibi_dst-0.3.10.dist-info/RECORD,,
+sibi_dst-0.3.11.dist-info/METADATA,sha256=gwl565etE5wLVGk0rqQ7umOyBRtEXpQ_IdCXyEkv2s8,1897
+sibi_dst-0.3.11.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+sibi_dst-0.3.11.dist-info/RECORD,,