sibi-dst 0.3.44__py3-none-any.whl → 0.3.46__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84)
  1. sibi_dst/__init__.py +38 -0
  2. sibi_dst/{df_helper → v1/df_helper}/_artifact_updater_multi_wrapper.py +1 -1
  3. sibi_dst/{df_helper → v1/df_helper}/_df_helper.py +3 -3
  4. sibi_dst/{df_helper → v1/df_helper}/_parquet_artifact.py +3 -3
  5. sibi_dst/{df_helper → v1/df_helper}/_parquet_reader.py +2 -2
  6. sibi_dst/{df_helper → v1/df_helper}/backends/django/_load_from_db.py +3 -3
  7. sibi_dst/{df_helper → v1/df_helper}/backends/http/_http_config.py +1 -1
  8. sibi_dst/{df_helper → v1/df_helper}/backends/parquet/_filter_handler.py +1 -1
  9. sibi_dst/{df_helper → v1/df_helper}/backends/parquet/_parquet_options.py +2 -2
  10. sibi_dst/{df_helper → v1/df_helper}/backends/sqlalchemy/_io_dask.py +2 -2
  11. sibi_dst/{df_helper → v1/df_helper}/backends/sqlalchemy/_load_from_db.py +2 -2
  12. sibi_dst/{df_helper → v1/df_helper}/backends/sqlalchemy/_sql_model_builder.py +2 -1
  13. sibi_dst/{df_helper → v1/df_helper}/core/_filter_handler.py +1 -1
  14. sibi_dst/v1/osmnx_helper/__init__.py +6 -0
  15. sibi_dst/{tests → v1/tests}/test_data_wrapper_class.py +11 -10
  16. sibi_dst/{utils → v1/utils}/__init__.py +2 -0
  17. sibi_dst/{utils → v1/utils}/clickhouse_writer.py +1 -1
  18. sibi_dst/v1/utils/data_from_http_source.py +49 -0
  19. sibi_dst/{utils → v1/utils}/data_utils.py +5 -3
  20. sibi_dst/{utils → v1/utils}/data_wrapper.py +3 -1
  21. sibi_dst/{utils → v1/utils}/date_utils.py +1 -1
  22. sibi_dst/{utils → v1/utils}/file_utils.py +1 -1
  23. sibi_dst/{utils → v1/utils}/filepath_generator.py +1 -1
  24. sibi_dst/{utils → v1/utils}/parquet_saver.py +1 -1
  25. sibi_dst/v1/utils/storage_config.py +28 -0
  26. sibi_dst/v2/df_helper/__init__.py +7 -0
  27. sibi_dst/v2/df_helper/_df_helper.py +214 -0
  28. sibi_dst/v2/df_helper/backends/sqlalchemy/__init__.py +10 -0
  29. sibi_dst/v2/df_helper/backends/sqlalchemy/_db_connection.py +82 -0
  30. sibi_dst/v2/df_helper/backends/sqlalchemy/_io_dask.py +135 -0
  31. sibi_dst/v2/df_helper/backends/sqlalchemy/_load_from_db.py +142 -0
  32. sibi_dst/v2/df_helper/backends/sqlalchemy/_model_builder.py +297 -0
  33. sibi_dst/v2/df_helper/backends/sqlmodel/__init__.py +9 -0
  34. sibi_dst/v2/df_helper/backends/sqlmodel/_db_connection.py +78 -0
  35. sibi_dst/v2/df_helper/backends/sqlmodel/_io_dask.py +122 -0
  36. sibi_dst/v2/df_helper/backends/sqlmodel/_load_from_db.py +142 -0
  37. sibi_dst/v2/df_helper/backends/sqlmodel/_model_builder.py +283 -0
  38. sibi_dst/v2/df_helper/core/__init__.py +9 -0
  39. sibi_dst/v2/df_helper/core/_filter_handler.py +236 -0
  40. sibi_dst/v2/df_helper/core/_params_config.py +139 -0
  41. sibi_dst/v2/df_helper/core/_query_config.py +17 -0
  42. sibi_dst/v2/utils/__init__.py +5 -0
  43. sibi_dst/v2/utils/log_utils.py +120 -0
  44. {sibi_dst-0.3.44.dist-info → sibi_dst-0.3.46.dist-info}/METADATA +3 -2
  45. sibi_dst-0.3.46.dist-info/RECORD +80 -0
  46. sibi_dst/osmnx_helper/__init__.py +0 -9
  47. sibi_dst/osmnx_helper/v2/base_osm_map.py +0 -153
  48. sibi_dst/osmnx_helper/v2/basemaps/utils.py +0 -0
  49. sibi_dst-0.3.44.dist-info/RECORD +0 -62
  50. /sibi_dst/{df_helper/backends → v1}/__init__.py +0 -0
  51. /sibi_dst/{df_helper → v1/df_helper}/__init__.py +0 -0
  52. /sibi_dst/{osmnx_helper/v1 → v1/df_helper/backends}/__init__.py +0 -0
  53. /sibi_dst/{df_helper → v1/df_helper}/backends/django/__init__.py +0 -0
  54. /sibi_dst/{df_helper → v1/df_helper}/backends/django/_db_connection.py +0 -0
  55. /sibi_dst/{df_helper → v1/df_helper}/backends/django/_io_dask.py +0 -0
  56. /sibi_dst/{df_helper → v1/df_helper}/backends/django/_sql_model_builder.py +0 -0
  57. /sibi_dst/{df_helper → v1/df_helper}/backends/http/__init__.py +0 -0
  58. /sibi_dst/{df_helper → v1/df_helper}/backends/parquet/__init__.py +0 -0
  59. /sibi_dst/{df_helper → v1/df_helper}/backends/sqlalchemy/__init__.py +0 -0
  60. /sibi_dst/{df_helper → v1/df_helper}/backends/sqlalchemy/_db_connection.py +0 -0
  61. /sibi_dst/{df_helper → v1/df_helper}/backends/sqlalchemy/_filter_handler.py +0 -0
  62. /sibi_dst/{df_helper → v1/df_helper}/core/__init__.py +0 -0
  63. /sibi_dst/{df_helper → v1/df_helper}/core/_defaults.py +0 -0
  64. /sibi_dst/{df_helper → v1/df_helper}/core/_params_config.py +0 -0
  65. /sibi_dst/{df_helper → v1/df_helper}/core/_query_config.py +0 -0
  66. /sibi_dst/{df_helper → v1/df_helper}/data_cleaner.py +0 -0
  67. /sibi_dst/{geopy_helper → v1/geopy_helper}/__init__.py +0 -0
  68. /sibi_dst/{geopy_helper → v1/geopy_helper}/geo_location_service.py +0 -0
  69. /sibi_dst/{geopy_helper → v1/geopy_helper}/utils.py +0 -0
  70. /sibi_dst/{osmnx_helper/v1 → v1/osmnx_helper}/base_osm_map.py +0 -0
  71. /sibi_dst/{osmnx_helper/v1 → v1/osmnx_helper}/basemaps/__init__.py +0 -0
  72. /sibi_dst/{osmnx_helper/v1 → v1/osmnx_helper}/basemaps/calendar_html.py +0 -0
  73. /sibi_dst/{osmnx_helper/v1 → v1/osmnx_helper}/basemaps/router_plotter.py +0 -0
  74. /sibi_dst/{osmnx_helper/v1 → v1/osmnx_helper}/utils.py +0 -0
  75. /sibi_dst/{osmnx_helper/v2 → v1/tests}/__init__.py +0 -0
  76. /sibi_dst/{utils → v1/utils}/airflow_manager.py +0 -0
  77. /sibi_dst/{utils → v1/utils}/credentials.py +0 -0
  78. /sibi_dst/{utils → v1/utils}/df_utils.py +0 -0
  79. /sibi_dst/{utils → v1/utils}/log_utils.py +0 -0
  80. /sibi_dst/{utils → v1/utils}/phone_formatter.py +0 -0
  81. /sibi_dst/{utils → v1/utils}/storage_manager.py +0 -0
  82. /sibi_dst/{osmnx_helper/v2/basemaps → v2}/__init__.py +0 -0
  83. /sibi_dst/{tests → v2/df_helper/backends}/__init__.py +0 -0
  84. {sibi_dst-0.3.44.dist-info → sibi_dst-0.3.46.dist-info}/WHEEL +0 -0
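
Taken together, the file list above is mostly a namespace reorganization: the existing helpers move under `sibi_dst.v1`, while a new `sibi_dst.v2` package introduces a rewritten `DfHelper` with SQLAlchemy and SQLModel backends. The sketch below shows how import paths shift under that layout; it assumes the new top-level `sibi_dst/__init__.py` (+38 lines, not shown in this diff) does not re-export the old paths, so check the released wheel before relying on it.

```python
# Hypothetical import-migration sketch based on the file moves listed above.
# Whether sibi_dst/__init__.py re-exports the old names is not visible in this diff.

# 0.3.44 layout (old paths):
#   from sibi_dst.df_helper import DfHelper
#   from sibi_dst.utils import Logger

# 0.3.46 layout: the same modules now live under the v1 namespace ...
from sibi_dst.v1.df_helper import DfHelper
from sibi_dst.v1.utils import Logger

# ... and a parallel v2 implementation is added alongside it.
from sibi_dst.v2.df_helper import DfHelper as DfHelperV2
```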
sibi_dst/v2/df_helper/_df_helper.py (new file)
@@ -0,0 +1,214 @@
+ import warnings
+ from typing import Any, Dict, Type, TypeVar, Union
+
+ import dask.dataframe as dd
+ import fsspec
+ import pandas as pd
+ from pydantic import BaseModel
+
+ from sibi_dst.v2.utils import Logger
+ from sibi_dst.v2.df_helper.core import QueryConfig, ParamsConfig, FilterHandler
+ from sibi_dst.v2.df_helper.backends.sqlalchemy import SqlAlchemyConnectionConfig, SqlAlchemyLoadFromDb
+ from sibi_dst.v2.df_helper.backends.sqlmodel import SQLModelConnectionConfig, SQLModelLoadFromDb
+
+ # Define a generic type variable for BaseModel subclasses
+ T = TypeVar("T", bound=BaseModel)
+
+ # Suppress warnings about protected member access
+ warnings.filterwarnings(
+     "ignore",
+     message="Access to a protected member _meta",
+     category=UserWarning,
+ )
+
+
+ class DfHelper:
+     df: Union[dd.DataFrame, pd.DataFrame] = None
+     default_config = {
+         'parquet_storage_path': None,
+         'dt_field': None,
+         'as_pandas': False,
+         'filesystem': 'file',
+         'filesystem_options': {},
+         'fs': fsspec.filesystem('file')
+     }
+
+     def __init__(self, **kwargs: Any) -> None:
+         # Merge default configuration with any provided kwargs
+         config = {**self.default_config.copy(), **kwargs}
+         self.backend = config.setdefault('backend', 'sqlalchemy')
+         self.debug = config.setdefault('debug', False)
+         self.as_pandas = config.setdefault('as_pandas', False)
+         self.logger = config.setdefault(
+             'logger',
+             Logger.default_logger(logger_name=self.__class__.__name__, debug=self.debug)
+         )
+         self.logger.debug("Logger initialized in DEBUG mode.")
+
+         # Propagate logger and debug settings to all components
+         config.setdefault('logger', self.logger)
+         config.setdefault('debug', self.debug)
+
+         self._initialize_backend_config(**config)
+
+     def __str__(self) -> str:
+         return self.__class__.__name__
+
+     def _extract_config_vars(self, model: Type[T], kwargs: Dict[str, Any]) -> T:
+         """
+         Extracts and initializes a Pydantic model using only the keys that the model accepts.
+         The recognized keys are removed from kwargs.
+         """
+         recognized_keys = set(model.__annotations__.keys())
+         self.logger.debug(f"Recognized keys for {model.__name__}: {recognized_keys}")
+         model_kwargs = {k: kwargs.pop(k) for k in list(kwargs.keys()) if k in recognized_keys}
+         self.logger.debug(f"Initializing {model.__name__} with: {model_kwargs}")
+         return model(**model_kwargs)
+
+     def _initialize_backend_config(self, **kwargs: Any) -> None:
+         """
+         Initializes the backend configurations by extracting the settings required for queries,
+         parameters, and SQLAlchemy connections.
+         """
+         self.logger.debug("Initializing backend configuration.")
+         self._backend_query = self._extract_config_vars(QueryConfig, kwargs)
+         self._backend_params = self._extract_config_vars(ParamsConfig, kwargs)
+         if self.backend == "sqlalchemy":
+             self.backend_connection_config = self._extract_config_vars(SqlAlchemyConnectionConfig, kwargs)
+         elif self.backend == "sqlmodel":
+             self.backend_connection_config = self._extract_config_vars(SQLModelConnectionConfig, kwargs)
+         else:
+             raise ValueError(f"Unsupported backend: {self.backend}")
+
+     def load(self, **options: Any) -> Union[dd.DataFrame, pd.DataFrame]:
+         """
+         Loads the data using the underlying SQLAlchemy loader. Returns a pandas DataFrame
+         if 'as_pandas' is True; otherwise returns a dask DataFrame.
+         """
+         df = self._load(**options)
+         return df.compute() if self.as_pandas else df
+
+     def _load(self, **options: Any) -> Union[dd.DataFrame, pd.DataFrame]:
+         self._backend_params.parse_params(options)
+         if self.backend == "sqlalchemy":
+             return self._load_from_sqlalchemy(**options)
+         elif self.backend == "sqlmodel":
+             return self._load_from_sqlmodel(**options)
+         else:
+             raise ValueError(f"Unsupported backend: {self.backend}")
+
+     def _load_from_sqlalchemy(self, **options: Any) -> Union[dd.DataFrame, pd.DataFrame]:
+         """
+         Loads data from a SQLAlchemy source. On failure, logs the error and returns an empty
+         DataFrame wrapped as a dask DataFrame.
+         """
+         try:
+             db_loader = SqlAlchemyLoadFromDb(
+                 self.backend_connection_config,
+                 self._backend_query,
+                 self._backend_params,
+                 self.debug,
+                 self.logger,
+                 **options
+             )
+             self.df = db_loader.build_and_load()
+             self._process_loaded_data()
+             self._post_process_df()
+             self.logger.debug("Data successfully loaded from SQLAlchemy database.")
+         except Exception as e:
+             self.logger.error(f"Failed to load data from SQLAlchemy database: {e}. Options: {options}")
+             # Optionally re-raise the exception if in debug mode
+             if self.debug:
+                 raise
+             self.df = dd.from_pandas(pd.DataFrame(), npartitions=1)
+         return self.df
+
+     def _load_from_sqlmodel(self, **options: Any) -> Union[dd.DataFrame, pd.DataFrame]:
+         try:
+             db_loader = SQLModelLoadFromDb(
+                 self.backend_connection_config,
+                 self._backend_query,
+                 self._backend_params,
+                 self.debug,
+                 self.logger,
+                 **options
+             )
+             self.df = db_loader.build_and_load()
+             self._process_loaded_data()
+             self._post_process_df()
+             self.logger.debug("Data successfully loaded from SQLModel database.")
+         except Exception as e:
+             self.logger.error(f"Failed to load data from SQLModel database: {e}. Options: {options}")
+             if self.debug:
+                 raise
+             self.df = dd.from_pandas(pd.DataFrame(), npartitions=1)
+         return self.df
+
+     def _post_process_df(self) -> None:
+         """
+         Post-processes the DataFrame by filtering columns, renaming them, setting the index,
+         and converting the index to datetime if requested.
+         """
+         df_params = self._backend_params.df_params
+         fieldnames = df_params.get("fieldnames")
+         index_col = df_params.get("index_col")
+         datetime_index = df_params.get("datetime_index", False)
+         column_names = df_params.get("column_names")
+
+         # Filter columns based on fieldnames
+         if fieldnames:
+             valid_fieldnames = [col for col in fieldnames if col in self.df.columns]
+             self.df = self.df[valid_fieldnames]
+
+         # Rename columns if column_names are provided
+         if column_names is not None:
+             if not fieldnames or len(fieldnames) != len(column_names):
+                 raise ValueError(
+                     f"Length mismatch: fieldnames ({len(fieldnames) if fieldnames else 0}) and "
+                     f"column_names ({len(column_names)}) must match."
+                 )
+             rename_mapping = dict(zip(fieldnames, column_names))
+             self.df = self.df.map_partitions(self._rename_columns, mapping=rename_mapping)
+
+         # Set the index column if specified
+         if index_col is not None:
+             if index_col in self.df.columns:
+                 self.df = self.df.set_index(index_col)
+             else:
+                 raise ValueError(f"Index column '{index_col}' not found in DataFrame.")
+
+         # Convert the index to datetime if required
+         if datetime_index and self.df.index.dtype != 'datetime64[ns]':
+             self.df = self.df.map_partitions(self._convert_index_to_datetime)
+
+         self.logger.debug("Post-processing of DataFrame completed.")
+
+     def _process_loaded_data(self) -> None:
+         """
+         Applies renaming logic based on the field map configuration.
+         Logs a warning for any missing columns, and only renames existing columns.
+         """
+         self.logger.debug(f"Processing loaded data; DataFrame type: {type(self.df)}")
+         if self.df.map_partitions(len).compute().sum() > 0:
+             field_map = self._backend_params.field_map or {}
+             if isinstance(field_map, dict):
+                 rename_mapping = {k: v for k, v in field_map.items() if k in self.df.columns}
+                 missing_columns = [k for k in field_map if k not in self.df.columns]
+                 if missing_columns:
+                     self.logger.warning(
+                         f"The following columns in field_map are not in the DataFrame: {missing_columns}"
+                     )
+                 if rename_mapping:
+                     self.df = self.df.map_partitions(self._rename_columns, mapping=rename_mapping)
+         self.logger.debug("Processing of loaded data completed.")
+
+     @staticmethod
+     def _rename_columns(df: pd.DataFrame, mapping: Dict[str, str]) -> pd.DataFrame:
+         """Helper function to rename columns in a DataFrame."""
+         return df.rename(columns=mapping)
+
+     @staticmethod
+     def _convert_index_to_datetime(df: pd.DataFrame) -> pd.DataFrame:
+         """Helper function to convert the DataFrame index to datetime."""
+         df.index = pd.to_datetime(df.index, errors='coerce')
+         return df
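
For orientation, here is a minimal usage sketch of the v2 `DfHelper` defined above. The connection URL and table name are placeholders, and the assumption that `DfHelper` is re-exported from `sibi_dst.v2.df_helper` comes from that package's `__init__.py` (+7 lines), which is not shown in this diff.

```python
# Illustrative sketch only; connection details are placeholders.
from sibi_dst.v2.df_helper import DfHelper  # assumed re-export, see __init__.py (+7)

helper = DfHelper(
    backend="sqlalchemy",                   # or "sqlmodel"
    connection_url="sqlite:///example.db",  # consumed by SqlAlchemyConnectionConfig
    table="orders",                         # table whose ORM model is built dynamically
    as_pandas=True,                         # compute() the Dask result into pandas
    debug=True,
)

# Remaining keyword options are parsed by ParamsConfig and passed to the backend loader.
df = helper.load()
```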
sibi_dst/v2/df_helper/backends/sqlalchemy/__init__.py (new file)
@@ -0,0 +1,10 @@
+ from ._db_connection import SqlAlchemyConnectionConfig
+ from ._model_builder import SqlAlchemyModelBuilder
+ from ._load_from_db import SqlAlchemyLoadFromDb
+
+ __all__ = [
+     'SqlAlchemyConnectionConfig',
+     'SqlAlchemyModelBuilder',
+     'SqlAlchemyLoadFromDb',
+ ]
+
sibi_dst/v2/df_helper/backends/sqlalchemy/_db_connection.py (new file)
@@ -0,0 +1,82 @@
+ from typing import Any, Optional
+
+ from pydantic import BaseModel, model_validator, ConfigDict
+ from sqlalchemy import create_engine, text
+ from sqlalchemy.engine import Engine
+ from sqlalchemy.exc import OperationalError
+
+ from sibi_dst.v2.utils import Logger
+ from ._model_builder import SqlAlchemyModelBuilder
+
+
+ class SqlAlchemyConnectionConfig(BaseModel):
+     """
+     Configuration for establishing an SQLAlchemy database connection and dynamically building
+     an ORM model for a specific table.
+
+     Attributes:
+         connection_url (str): The URL used to connect to the database.
+         table (Optional[str]): The name of the table for which the model will be built.
+         model (Any): The dynamically built SQLAlchemy model.
+         engine (Optional[Engine]): The SQLAlchemy engine instance.
+     """
+
+     model_config = ConfigDict(arbitrary_types_allowed=True)
+     connection_url: str
+     table: Optional[str] = None
+     model: Any = None
+     engine: Optional[Engine] = None
+     debug: bool = False
+     logger: Optional[Logger] = None
+     add_relationships: bool = False
+     export_models: bool = False
+     export_file_name: str = 'models.py'
+
+     @model_validator(mode="after")
+     def validate_and_initialize(self) -> "SqlAlchemyConnectionConfig":
+         """
+         Validate the configuration, initialize the engine, test the connection, and build the model.
+
+         Raises:
+             ValueError: If `connection_url` or `table` is missing, or if the connection or model
+                 building fails.
+         """
+         self.logger = self.logger or Logger.default_logger(logger_name="sqlalchemy_connection", debug=self.debug)
+         self.logger.debug("Validating and initializing SQLAlchemy connection configuration.")
+         if not self.connection_url:
+             raise ValueError("`connection_url` must be provided.")
+
+         # Initialize the engine.
+         self.engine = create_engine(self.connection_url)
+         self.logger.debug("Engine created for the provided connection URL.")
+
+         # Validate the connection.
+         self.validate_connection()
+
+         if not self.table:
+             raise ValueError("`table` must be provided to build the model.")
+
+         try:
+             builder = SqlAlchemyModelBuilder(self.engine, self.table, self.add_relationships, self.debug, self.logger)
+             self.model = builder.build_model()
+             if self.export_models:
+                 builder.export_models_to_file(self.export_file_name)
+             self.logger.debug(f"Successfully built model for table: {self.table}")
+         except Exception as e:
+             raise ValueError(f"Failed to build model for table {self.table}: {e}")
+
+         return self
+
+     def validate_connection(self) -> None:
+         """
+         Test the database connection by executing a simple query.
+
+         Raises:
+             ValueError: If the connection cannot be established.
+         """
+         try:
+             with self.engine.connect() as connection:
+                 connection.execute(text("SELECT 1"))
+             self.logger.debug("Database connection validated.")
+         except OperationalError as e:
+             raise ValueError(f"Failed to connect to the database: {e}")
sibi_dst/v2/df_helper/backends/sqlalchemy/_io_dask.py (new file)
@@ -0,0 +1,135 @@
+ import itertools
+
+ import dask.dataframe as dd
+ import pandas as pd
+ from sqlalchemy import create_engine, inspect, select
+ from sqlalchemy.orm import sessionmaker
+
+ from sibi_dst.v2.df_helper.core import FilterHandler
+ from sibi_dst.v2.utils import Logger
+
+
+ class SQLAlchemyDask:
+     def __init__(self, model, filters, engine_url, chunk_size=1000, logger=None, debug=False):
+         """
+         Initialize with an SQLAlchemy query and database engine URL.
+
+         :param model: SQLAlchemy ORM model.
+         :param filters: Filters to apply on the query.
+         :param engine_url: Database connection string for SQLAlchemy engine.
+         :param chunk_size: Number of records per chunk for Dask partitions.
+         :param logger: Logger instance for logging.
+         :param debug: Whether to print detailed logs.
+         """
+         self.query = None
+         self.model = model
+         self.filters = filters
+         self.chunk_size = chunk_size
+         self.debug = debug
+         self.engine = create_engine(engine_url)
+         self.Session = sessionmaker(bind=self.engine)
+         self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
+         self.logger.set_level(logger.DEBUG if debug else logger.INFO)
+
+     @staticmethod
+     def infer_dtypes_from_model(model):
+         """
+         Infer data types for Dask DataFrame based on SQLAlchemy ORM model columns.
+         """
+         mapper = inspect(model)
+         sqlalchemy_to_dask_dtype = {
+             'INTEGER': 'Int64',
+             'SMALLINT': 'Int64',
+             'BIGINT': 'Int64',
+             'FLOAT': 'float64',
+             'NUMERIC': 'float64',
+             'BOOLEAN': 'bool',
+             'VARCHAR': 'object',
+             'TEXT': 'object',
+             'DATE': 'datetime64[ns]',
+             'DATETIME': 'datetime64[ns]',
+             'TIME': 'object',
+             'UUID': 'object',
+         }
+
+         dtypes = {}
+         for column in mapper.columns:
+             dtype = sqlalchemy_to_dask_dtype.get(str(column.type).upper(), 'object')
+             dtypes[column.name] = dtype
+
+         return dtypes
+
+     def read_frame(self, fillna_value=None):
+         """
+         Load data from an SQLAlchemy query into a Dask DataFrame.
+
+         :param fillna_value: Value to replace NaN or NULL values with, if any.
+         :return: Dask DataFrame.
+         """
+         with self.Session() as session:
+             try:
+                 # Build query
+                 self.query = select(self.model)
+                 if self.filters:
+                     self.query = FilterHandler(backend="sqlalchemy", logger=self.logger, debug=self.debug).apply_filters(self.query,
+                         model=self.model,
+                         filters=self.filters)
+                 else:
+                     n_records = 100
+                     self.query = self.query.limit(n_records)
+                 self.logger.debug(f"query:{self.query}")
+                 # Infer dtypes
+                 dtypes = self.infer_dtypes_from_model(self.model)
+                 # Get the column order from the SQLAlchemy model
+                 ordered_columns = [column.name for column in self.model.__table__.columns]
+
+                 # Execute query and fetch results in chunks
+                 result_proxy = session.execute(self.query)
+                 results = result_proxy.scalars().all()  # Fetch all rows
+                 iterator = iter(results)
+
+                 partitions = []
+
+                 while True:
+                     chunk = list(itertools.islice(iterator, self.chunk_size))
+                     if not chunk:
+                         break
+
+                     # Convert chunk to Pandas DataFrame
+                     df = pd.DataFrame.from_records(
+                         [row._asdict() if hasattr(row, '_asdict') else row.__dict__ for row in chunk]
+                     )
+                     # Drop internal SQLAlchemy state if it exists
+                     df = df.loc[:, ~df.columns.str.contains('_sa_instance_state')]
+
+                     # Reorder columns to match the model's order
+                     df = df[ordered_columns]
+
+                     # Fill NaN values
+                     if fillna_value is not None:
+                         df = df.fillna(fillna_value)
+
+                     # Convert timezone-aware columns to naive
+                     for col in df.columns:
+                         if isinstance(df[col].dtype, pd.DatetimeTZDtype):
+                             df[col] = df[col].dt.tz_localize(None)
+
+                     # Apply inferred dtypes
+                     df = df.astype(dtypes)
+                     # Create a Dask partition
+                     partitions.append(dd.from_pandas(df, npartitions=1))
+
+                 # Concatenate all partitions
+                 if partitions:
+                     dask_df = dd.concat(partitions, axis=0, ignore_index=True)
+                 else:
+                     dask_df = dd.from_pandas(pd.DataFrame(columns=ordered_columns), npartitions=1)
+
+                 self.logger.debug(f"Loaded {len(dask_df)} rows into Dask DataFrame.")
+
+                 return dask_df
+
+             except Exception as e:
+                 self.logger.error(f"Error executing query: {str(e)}")
+                 self.logger.error(self.query)
+                 return dd.from_pandas(pd.DataFrame(columns=ordered_columns), npartitions=1)
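
Worth noting: `read_frame` materializes the entire result set with `result_proxy.scalars().all()` before slicing it into `chunk_size` partitions, so peak memory is bounded by the full query result rather than by the chunk size, and an unfiltered call is implicitly limited to 100 records. A direct-use sketch, with placeholder values:

```python
# Illustrative sketch; OrdersModel and the filter key are placeholders, and the
# filter syntax is whatever FilterHandler accepts (not shown in this diff).
loader = SQLAlchemyDask(
    model=OrdersModel,                   # ORM class, e.g. from SqlAlchemyConnectionConfig
    filters={"status": "open"},          # placeholder filter spec
    engine_url="sqlite:///example.db",   # placeholder
    chunk_size=5000,
    debug=False,
)
ddf = loader.read_frame(fillna_value=0)  # Dask DataFrame with dtypes inferred from the model
```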
sibi_dst/v2/df_helper/backends/sqlalchemy/_load_from_db.py (new file)
@@ -0,0 +1,142 @@
+ import dask.dataframe as dd
+ import pandas as pd
+
+ from sibi_dst.v2.df_helper.core import ParamsConfig, QueryConfig
+ from sibi_dst.v2.utils import Logger
+ from ._io_dask import SQLAlchemyDask
+ from ._db_connection import SqlAlchemyConnectionConfig
+
+
+ class SqlAlchemyLoadFromDb:
+     """
+     The SqlAlchemyLoadFromDb class provides functionality to load data from a
+     database using SQLAlchemy into a Dask DataFrame. It is capable of handling
+     large datasets efficiently by utilizing the Dask framework for parallel
+     computations.
+
+     This class is initialized with a database connection configuration, query
+     configuration, optional parameters, and a logger. It can execute a query
+     using the specified configurations and read the results into a Dask
+     DataFrame. This is useful for processing and analyzing large-scale data.
+
+     :ivar df: Dask DataFrame to store the loaded data.
+     :type df: dd.DataFrame
+     :ivar db_connection: Database connection configuration object, containing details
+         such as the table, model, and engine to be used for the query.
+     :type db_connection: SqlAlchemyConnectionConfig
+     :ivar table_name: Name of the database table being queried.
+     :type table_name: str
+     :ivar model: SQLAlchemy model associated with the database connection.
+     :type model: sqlalchemy.ext.declarative.api.DeclarativeMeta
+     :ivar engine: SQLAlchemy engine used for executing queries.
+     :type engine: sqlalchemy.engine.base.Engine
+     :ivar logger: Logger instance for logging debug and error information.
+     :type logger: Logger
+     :ivar query_config: Query configuration, including query-related details such
+         as the SQL query or query settings.
+     :type query_config: QueryConfig
+     :ivar params_config: Parameters configuration, including filter parameters for
+         the query.
+     :type params_config: ParamsConfig
+     :ivar debug: Debug flag indicating whether debug mode is enabled.
+     :type debug: bool
+     :ivar chunk_size: Size of data chunks to process at a time.
+     :type chunk_size: int
+     """
+     df: dd.DataFrame = None
+
+     def __init__(
+         self,
+         plugin_sqlalchemy: SqlAlchemyConnectionConfig,  # Expected to be an instance of SqlAlchemyConnection
+         plugin_query: QueryConfig = None,
+         plugin_params: ParamsConfig = None,
+         debug: bool = False,
+         logger: Logger = None,
+         **kwargs,
+     ):
+         """
+         Initializes an instance of the class, setting up a database connection,
+         query configuration, parameter configuration, and other optional settings
+         like debugging and logging. The class aims to manage the integration and
+         interaction with SQLAlchemy-based database operations.
+
+         :param plugin_sqlalchemy:
+             The SQLAlchemy connection configuration object, which provides
+             the connection details like engine, table name, and model
+             associated with the database operations.
+         :param plugin_query:
+             The query configuration object, used to define specific query
+             options or rules. Defaults to None.
+         :param plugin_params:
+             The parameters configuration object, used for any additional
+             parameterized settings or configurations. Defaults to None.
+         :param logger:
+             Optional logger instance for logging purposes. If not provided,
+             a default logger is instantiated using the standard logging system.
+         :param kwargs:
+             Optional additional keyword arguments for customization. Can
+             include optional settings like `debug` mode or `chunk_size`
+             for batch operations.
+         """
+         self.db_connection = plugin_sqlalchemy
+         self.table_name = self.db_connection.table
+         self.model = self.db_connection.model
+         self.engine = self.db_connection.engine
+         self.debug = debug
+         self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__, debug=self.debug)
+         self.query_config = plugin_query
+         self.params_config = plugin_params
+         self.chunk_size = kwargs.pop("chunk_size", 1000)
+
+     def build_and_load(self) -> dd.DataFrame:
+         """
+         Builds and returns the resulting dataframe after calling the internal
+         build and load function. This method triggers the `_build_and_load`
+         function to process and prepare the data before returning it as
+         a dask dataframe.
+
+         :raises RuntimeError: If any error occurs during the build or load process.
+
+         :return: The processed data in a dask dataframe.
+         :rtype: dd.DataFrame
+         """
+         self._build_and_load()
+         return self.df
+
+     def _build_and_load(self) -> dd.DataFrame:
+         """
+         Builds and loads a Dask DataFrame from a SQLAlchemy-compatible source.
+
+         This method initializes a SQLAlchemyDask object with the provided model,
+         filters, engine URL, logger, chunk size, and debug configuration.
+         It attempts to load the data using the ``read_frame`` method of
+         SQLAlchemyDask. If the data cannot be loaded or the query returns
+         no rows, it creates and returns an empty Dask DataFrame.
+
+         :raises Exception: On failure to load data or to create a DataFrame.
+
+         :return: A Dask DataFrame object containing the queried data or an
+             empty DataFrame if the query returns no results or fails.
+         :rtype: dask.dataframe.DataFrame
+         """
+         try:
+             self.df = SQLAlchemyDask(
+                 model=self.model,
+                 filters=self.params_config.filters,
+                 engine_url=self.engine.url,
+                 logger=self.logger,
+                 chunk_size=self.chunk_size,
+                 debug=self.debug
+             ).read_frame()
+
+             if self.df is None or len(self.df.head().index) == 0:
+                 self.logger.debug("Query returned no results.")
+                 dask_df = dd.from_pandas(pd.DataFrame(), npartitions=1)
+
+                 return dask_df
+             return self.df
+         except Exception as e:
+             self.logger.debug(f"Failed to load data into Dask DataFrame.{e}")
+             dask_df = dd.from_pandas(pd.DataFrame(), npartitions=1)
+
+             return dask_df
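
Finally, a sketch of wiring the loader by hand, which is what `DfHelper._load_from_sqlalchemy` does internally; the no-argument `QueryConfig()`/`ParamsConfig()` construction is an assumption, since those classes (+17 and +139 lines) are not shown in this diff.

```python
# Hypothetical manual wiring; normally DfHelper builds these objects from kwargs.
from sibi_dst.v2.df_helper.backends.sqlalchemy import (
    SqlAlchemyConnectionConfig,
    SqlAlchemyLoadFromDb,
)
from sibi_dst.v2.df_helper.core import ParamsConfig, QueryConfig

conn = SqlAlchemyConnectionConfig(connection_url="sqlite:///example.db", table="orders")
loader = SqlAlchemyLoadFromDb(
    conn,
    QueryConfig(),    # assumed to construct with defaults
    ParamsConfig(),   # assumed to construct with defaults and expose .filters
    False,            # debug
    None,             # logger -> a default Logger is created
    chunk_size=2000,
)
ddf = loader.build_and_load()  # returns an empty Dask DataFrame on failure or no results
```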