sibi-dst 0.3.63__tar.gz → 0.3.64__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93)
  1. {sibi_dst-0.3.63 → sibi_dst-0.3.64}/PKG-INFO +1 -1
  2. {sibi_dst-0.3.63 → sibi_dst-0.3.64}/pyproject.toml +1 -1
  3. sibi_dst-0.3.64/sibi_dst/df_helper/_df_helper.py +230 -0
  4. {sibi_dst-0.3.63 → sibi_dst-0.3.64}/sibi_dst/df_helper/backends/sqlalchemy/__init__.py +0 -2
  5. sibi_dst-0.3.64/sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +248 -0
  6. sibi_dst-0.3.64/sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +179 -0
  7. sibi_dst-0.3.64/sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +74 -0
  8. sibi_dst-0.3.64/sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +206 -0
  9. {sibi_dst-0.3.63/sibi_dst/v2 → sibi_dst-0.3.64/sibi_dst}/df_helper/core/_query_config.py +2 -2
  10. {sibi_dst-0.3.63 → sibi_dst-0.3.64}/sibi_dst/utils/log_utils.py +15 -11
  11. sibi_dst-0.3.64/sibi_dst/v2/df_helper/backends/sqlalchemy/_db_connection.py +357 -0
  12. {sibi_dst-0.3.63 → sibi_dst-0.3.64}/sibi_dst/v2/df_helper/backends/sqlalchemy/_io_dask.py +2 -2
  13. sibi_dst-0.3.64/sibi_dst/v2/df_helper/backends/sqlmodel/_db_connection.py +357 -0
  14. sibi_dst-0.3.64/sibi_dst/v3/__init__.py +0 -0
  15. sibi_dst-0.3.64/sibi_dst/v3/backends/__init__.py +0 -0
  16. sibi_dst-0.3.64/sibi_dst/v3/df_helper/__init__.py +0 -0
  17. sibi_dst-0.3.64/sibi_dst/v3/df_helper/_df_helper.py +91 -0
  18. sibi_dst-0.3.63/sibi_dst/df_helper/_df_helper.py +0 -637
  19. sibi_dst-0.3.63/sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +0 -202
  20. sibi_dst-0.3.63/sibi_dst/df_helper/backends/sqlalchemy/_filter_handler.py +0 -119
  21. sibi_dst-0.3.63/sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +0 -135
  22. sibi_dst-0.3.63/sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +0 -145
  23. sibi_dst-0.3.63/sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +0 -193
  24. sibi_dst-0.3.63/sibi_dst/v2/df_helper/backends/sqlalchemy/_db_connection.py +0 -82
  25. sibi_dst-0.3.63/sibi_dst/v2/df_helper/backends/sqlmodel/_db_connection.py +0 -78
  26. {sibi_dst-0.3.63 → sibi_dst-0.3.64}/README.md +0 -0
  27. {sibi_dst-0.3.63 → sibi_dst-0.3.64}/sibi_dst/__init__.py +0 -0
  28. {sibi_dst-0.3.63 → sibi_dst-0.3.64}/sibi_dst/df_helper/__init__.py +0 -0
  29. {sibi_dst-0.3.63 → sibi_dst-0.3.64}/sibi_dst/df_helper/_artifact_updater_multi_wrapper.py +0 -0
  30. {sibi_dst-0.3.63 → sibi_dst-0.3.64}/sibi_dst/df_helper/_parquet_artifact.py +0 -0
  31. {sibi_dst-0.3.63 → sibi_dst-0.3.64}/sibi_dst/df_helper/_parquet_reader.py +0 -0
  32. {sibi_dst-0.3.63 → sibi_dst-0.3.64}/sibi_dst/df_helper/backends/__init__.py +0 -0
  33. {sibi_dst-0.3.63 → sibi_dst-0.3.64}/sibi_dst/df_helper/backends/django/__init__.py +0 -0
  34. {sibi_dst-0.3.63 → sibi_dst-0.3.64}/sibi_dst/df_helper/backends/django/_db_connection.py +0 -0
  35. {sibi_dst-0.3.63 → sibi_dst-0.3.64}/sibi_dst/df_helper/backends/django/_io_dask.py +0 -0
  36. {sibi_dst-0.3.63 → sibi_dst-0.3.64}/sibi_dst/df_helper/backends/django/_load_from_db.py +0 -0
  37. {sibi_dst-0.3.63 → sibi_dst-0.3.64}/sibi_dst/df_helper/backends/django/_sql_model_builder.py +0 -0
  38. {sibi_dst-0.3.63 → sibi_dst-0.3.64}/sibi_dst/df_helper/backends/http/__init__.py +0 -0
  39. {sibi_dst-0.3.63 → sibi_dst-0.3.64}/sibi_dst/df_helper/backends/http/_http_config.py +0 -0
  40. {sibi_dst-0.3.63 → sibi_dst-0.3.64}/sibi_dst/df_helper/backends/parquet/__init__.py +0 -0
  41. {sibi_dst-0.3.63 → sibi_dst-0.3.64}/sibi_dst/df_helper/backends/parquet/_filter_handler.py +0 -0
  42. {sibi_dst-0.3.63 → sibi_dst-0.3.64}/sibi_dst/df_helper/backends/parquet/_parquet_options.py +0 -0
  43. {sibi_dst-0.3.63 → sibi_dst-0.3.64}/sibi_dst/df_helper/core/__init__.py +0 -0
  44. {sibi_dst-0.3.63 → sibi_dst-0.3.64}/sibi_dst/df_helper/core/_defaults.py +0 -0
  45. {sibi_dst-0.3.63 → sibi_dst-0.3.64}/sibi_dst/df_helper/core/_filter_handler.py +0 -0
  46. {sibi_dst-0.3.63 → sibi_dst-0.3.64}/sibi_dst/df_helper/core/_params_config.py +0 -0
  47. {sibi_dst-0.3.63 → sibi_dst-0.3.64}/sibi_dst/df_helper/data_cleaner.py +0 -0
  48. {sibi_dst-0.3.63 → sibi_dst-0.3.64}/sibi_dst/geopy_helper/__init__.py +0 -0
  49. {sibi_dst-0.3.63 → sibi_dst-0.3.64}/sibi_dst/geopy_helper/geo_location_service.py +0 -0
  50. {sibi_dst-0.3.63 → sibi_dst-0.3.64}/sibi_dst/geopy_helper/utils.py +0 -0
  51. {sibi_dst-0.3.63 → sibi_dst-0.3.64}/sibi_dst/osmnx_helper/__init__.py +0 -0
  52. {sibi_dst-0.3.63 → sibi_dst-0.3.64}/sibi_dst/osmnx_helper/base_osm_map.py +0 -0
  53. {sibi_dst-0.3.63 → sibi_dst-0.3.64}/sibi_dst/osmnx_helper/basemaps/__init__.py +0 -0
  54. {sibi_dst-0.3.63 → sibi_dst-0.3.64}/sibi_dst/osmnx_helper/basemaps/calendar_html.py +0 -0
  55. {sibi_dst-0.3.63 → sibi_dst-0.3.64}/sibi_dst/osmnx_helper/basemaps/router_plotter.py +0 -0
  56. {sibi_dst-0.3.63 → sibi_dst-0.3.64}/sibi_dst/osmnx_helper/utils.py +0 -0
  57. {sibi_dst-0.3.63 → sibi_dst-0.3.64}/sibi_dst/tests/__init__.py +0 -0
  58. {sibi_dst-0.3.63 → sibi_dst-0.3.64}/sibi_dst/tests/test_data_wrapper_class.py +0 -0
  59. {sibi_dst-0.3.63 → sibi_dst-0.3.64}/sibi_dst/utils/__init__.py +0 -0
  60. {sibi_dst-0.3.63 → sibi_dst-0.3.64}/sibi_dst/utils/airflow_manager.py +0 -0
  61. {sibi_dst-0.3.63 → sibi_dst-0.3.64}/sibi_dst/utils/clickhouse_writer.py +0 -0
  62. {sibi_dst-0.3.63 → sibi_dst-0.3.64}/sibi_dst/utils/credentials.py +0 -0
  63. {sibi_dst-0.3.63 → sibi_dst-0.3.64}/sibi_dst/utils/data_from_http_source.py +0 -0
  64. {sibi_dst-0.3.63 → sibi_dst-0.3.64}/sibi_dst/utils/data_utils.py +0 -0
  65. {sibi_dst-0.3.63 → sibi_dst-0.3.64}/sibi_dst/utils/data_wrapper.py +0 -0
  66. {sibi_dst-0.3.63 → sibi_dst-0.3.64}/sibi_dst/utils/date_utils.py +0 -0
  67. {sibi_dst-0.3.63 → sibi_dst-0.3.64}/sibi_dst/utils/df_utils.py +0 -0
  68. {sibi_dst-0.3.63 → sibi_dst-0.3.64}/sibi_dst/utils/file_utils.py +0 -0
  69. {sibi_dst-0.3.63 → sibi_dst-0.3.64}/sibi_dst/utils/filepath_generator.py +0 -0
  70. {sibi_dst-0.3.63 → sibi_dst-0.3.64}/sibi_dst/utils/manifest_manager.py +0 -0
  71. {sibi_dst-0.3.63 → sibi_dst-0.3.64}/sibi_dst/utils/parquet_saver.py +0 -0
  72. {sibi_dst-0.3.63 → sibi_dst-0.3.64}/sibi_dst/utils/phone_formatter.py +0 -0
  73. {sibi_dst-0.3.63 → sibi_dst-0.3.64}/sibi_dst/utils/storage_config.py +0 -0
  74. {sibi_dst-0.3.63 → sibi_dst-0.3.64}/sibi_dst/utils/storage_manager.py +0 -0
  75. {sibi_dst-0.3.63 → sibi_dst-0.3.64}/sibi_dst/utils/update_planner.py +0 -0
  76. {sibi_dst-0.3.63 → sibi_dst-0.3.64}/sibi_dst/utils/webdav_client.py +0 -0
  77. {sibi_dst-0.3.63 → sibi_dst-0.3.64}/sibi_dst/v2/__init__.py +0 -0
  78. {sibi_dst-0.3.63 → sibi_dst-0.3.64}/sibi_dst/v2/df_helper/__init__.py +0 -0
  79. {sibi_dst-0.3.63 → sibi_dst-0.3.64}/sibi_dst/v2/df_helper/_df_helper.py +0 -0
  80. {sibi_dst-0.3.63 → sibi_dst-0.3.64}/sibi_dst/v2/df_helper/backends/__init__.py +0 -0
  81. {sibi_dst-0.3.63 → sibi_dst-0.3.64}/sibi_dst/v2/df_helper/backends/sqlalchemy/__init__.py +0 -0
  82. {sibi_dst-0.3.63 → sibi_dst-0.3.64}/sibi_dst/v2/df_helper/backends/sqlalchemy/_load_from_db.py +0 -0
  83. {sibi_dst-0.3.63 → sibi_dst-0.3.64}/sibi_dst/v2/df_helper/backends/sqlalchemy/_model_builder.py +0 -0
  84. {sibi_dst-0.3.63 → sibi_dst-0.3.64}/sibi_dst/v2/df_helper/backends/sqlmodel/__init__.py +0 -0
  85. {sibi_dst-0.3.63 → sibi_dst-0.3.64}/sibi_dst/v2/df_helper/backends/sqlmodel/_io_dask.py +0 -0
  86. {sibi_dst-0.3.63 → sibi_dst-0.3.64}/sibi_dst/v2/df_helper/backends/sqlmodel/_load_from_db.py +0 -0
  87. {sibi_dst-0.3.63 → sibi_dst-0.3.64}/sibi_dst/v2/df_helper/backends/sqlmodel/_model_builder.py +0 -0
  88. {sibi_dst-0.3.63 → sibi_dst-0.3.64}/sibi_dst/v2/df_helper/core/__init__.py +0 -0
  89. {sibi_dst-0.3.63 → sibi_dst-0.3.64}/sibi_dst/v2/df_helper/core/_filter_handler.py +0 -0
  90. {sibi_dst-0.3.63 → sibi_dst-0.3.64}/sibi_dst/v2/df_helper/core/_params_config.py +0 -0
  91. {sibi_dst-0.3.63/sibi_dst → sibi_dst-0.3.64/sibi_dst/v2}/df_helper/core/_query_config.py +0 -0
  92. {sibi_dst-0.3.63 → sibi_dst-0.3.64}/sibi_dst/v2/utils/__init__.py +0 -0
  93. {sibi_dst-0.3.63 → sibi_dst-0.3.64}/sibi_dst/v2/utils/log_utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: sibi-dst
3
- Version: 0.3.63
3
+ Version: 0.3.64
4
4
  Summary: Data Science Toolkit
5
5
  Author: Luis Valverde
6
6
  Author-email: lvalverdeb@gmail.com
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "sibi-dst"
3
- version = "0.3.63"
3
+ version = "0.3.64"
4
4
  description = "Data Science Toolkit"
5
5
  authors = ["Luis Valverde <lvalverdeb@gmail.com>"]
6
6
  readme = "README.md"
@@ -0,0 +1,230 @@
1
+ from __future__ import annotations
2
+
3
+ import warnings
4
+ from typing import Any, Dict, Optional, Union, TypeVar
5
+
6
+ import dask.dataframe as dd
7
+ import fsspec
8
+ import pandas as pd
9
+ from pydantic import BaseModel
10
+
11
+ from sibi_dst.df_helper.core import QueryConfig, ParamsConfig, FilterHandler
12
+ from sibi_dst.utils import Logger, ParquetSaver, ClickHouseWriter
13
+ from .backends.http import HttpConfig
14
+ from .backends.parquet import ParquetConfig
15
+ from .backends.sqlalchemy import SqlAlchemyConnectionConfig, SqlAlchemyLoadFromDb
16
+
17
+ warnings.filterwarnings("ignore")
18
+ T = TypeVar("T", bound=BaseModel)
19
+
20
+
21
+ # --- Backend Strategy Pattern Implementation ---
22
+
23
class BaseBackend:
    """Base strategy defining the sync (`load`) and async (`aload`) loading interface.

    Concrete backends override `load`, `aload`, or both. The default `aload`
    simply delegates to `load`, so a sync-only backend is usable from async code.
    """

    def __init__(self, helper: DfHelper):
        """Bind the strategy to the helper that owns it.

        Args:
            helper: The owning helper; its `logger` (and `debug` flag, when
                present) are mirrored onto the strategy.
        """
        self.helper = helper
        self.logger = helper.logger
        # Subclasses pass `exc_info=self.debug` when logging failures, so the
        # flag has to exist on the strategy itself.
        self.debug = getattr(helper, "debug", False)

    def load(self, **options) -> dd.DataFrame | pd.DataFrame:
        """Load data synchronously.

        Raises:
            NotImplementedError: Always, unless overridden by a subclass.
        """
        raise NotImplementedError(f"Backend '{self.__class__.__name__}' does not support synchronous loading.")

    async def aload(self, **options) -> dd.DataFrame | pd.DataFrame:
        """Load data asynchronously; by default this calls the sync `load`."""
        return self.load(**options)
37
+
38
+
39
class SqlAlchemyBackend(BaseBackend):
    """Synchronous backend that loads data through `SqlAlchemyLoadFromDb`."""

    def load(self, **options) -> dd.DataFrame:
        """Build and run the database load, returning an empty frame on failure.

        Args:
            **options: Filter options; when non-empty they are handed to the
                helper's params config via `parse_params` (if it has one).

        Returns:
            Whatever `SqlAlchemyLoadFromDb.build_and_load()` returns, or an
            empty single-partition Dask DataFrame if anything raises.
        """
        try:
            params = self.helper._backend_params
            # Fold incoming filter options into the ParamsConfig object.
            if options and hasattr(params, 'parse_params'):
                params.parse_params(options)

            db_loader = SqlAlchemyLoadFromDb(
                plugin_sqlalchemy=self.helper.backend_db_connection,
                plugin_query=self.helper._backend_query,
                plugin_params=params,
                logger=self.logger,
            )
            return db_loader.build_and_load()
        except Exception as e:
            # Best-effort by design: log and fall back to an empty frame.
            # The debug flag is read from the helper because the strategy
            # object is not guaranteed to carry its own `debug` attribute.
            show_traceback = getattr(self.helper, "debug", False)
            self.logger.error(f"Failed to load data from sqlalchemy: {e}", exc_info=show_traceback)
            return dd.from_pandas(pd.DataFrame(), npartitions=1)
56
+
57
+
58
class ParquetBackend(BaseBackend):
    """Synchronous backend that reads parquet files via the helper's parquet config."""

    def load(self, **options) -> dd.DataFrame | pd.DataFrame:
        """Load parquet files and apply any filter options.

        Args:
            **options: Filters passed to `FilterHandler('dask', ...).apply_filters`.

        Returns:
            The loaded (and optionally filtered) frame, or an empty
            single-partition Dask DataFrame when loading fails or yields nothing.
        """
        try:
            df = self.helper.backend_parquet.load_files()
            if df is None:
                # Keep the declared return type: callers go on to rename and
                # compute on the result, which would break on None.
                return dd.from_pandas(pd.DataFrame(), npartitions=1)
            if options:
                df = FilterHandler('dask', self.logger).apply_filters(df, filters=options)
            return df
        except Exception as e:
            # Best-effort by design: log and fall back to an empty frame.
            # The debug flag is read from the helper because the strategy
            # object is not guaranteed to carry its own `debug` attribute.
            show_traceback = getattr(self.helper, "debug", False)
            self.logger.error(f"Failed to load data from parquet: {e}", exc_info=show_traceback)
            return dd.from_pandas(pd.DataFrame(), npartitions=1)
70
+
71
+
72
class HttpBackend(BaseBackend):
    """Asynchronous-only backend that fetches data through the helper's HTTP config."""

    def load(self, **options) -> dd.DataFrame | pd.DataFrame:
        """Reject synchronous loading.

        `fetch_data` is awaited in `aload`, so calling it here would hand the
        caller an un-awaited coroutine instead of a DataFrame. Defer to the
        base class, which raises.

        Raises:
            NotImplementedError: Always.
        """
        return super().load(**options)

    async def aload(self, **options) -> Union[pd.DataFrame, dd.DataFrame]:
        """Fetch data over HTTP.

        Returns:
            The awaited result of `backend_http.fetch_data(**options)`, or an
            empty single-partition Dask DataFrame when no HTTP config is set.
        """
        http_config = self.helper.backend_http
        if not http_config:
            self.logger.warning("HTTP plugin not configured properly.")
            return dd.from_pandas(pd.DataFrame(), npartitions=1)
        return await http_config.fetch_data(**options)
84
+
85
+
86
+ # --- Main DfHelper Facade Class ---
87
+
88
class DfHelper:
    """
    A reusable facade for loading data through a pluggable backend strategy.

    It exposes both sync (`load`) and async (`aload`) entry points, plus
    helpers for saving results and for loading a date range. It can be used
    as a context manager; leaving the block closes the active backend config
    if that config has a `close()` method.
    """
    _BACKEND_STRATEGIES = {
        'sqlalchemy': SqlAlchemyBackend,
        'parquet': ParquetBackend,
        'http': HttpBackend,
    }

    # Backend name -> attribute holding its config object. The SQLAlchemy
    # config lives in `backend_db_connection`, so the name cannot be derived
    # from the backend key alone.
    _BACKEND_CONFIG_ATTRS = {
        'sqlalchemy': 'backend_db_connection',
        'parquet': 'backend_parquet',
        'http': 'backend_http',
    }

    # Subclasses may set this to provide default keyword arguments.
    default_config: Optional[Dict] = None

    def __init__(self, backend='sqlalchemy', **kwargs):
        """Build the query/params configs and the config for the chosen backend.

        Args:
            backend: One of the keys of `_BACKEND_STRATEGIES`.
            **kwargs: Overrides for `default_config`. `debug`, `logger` and
                `fs` are read here; every config model receives only the keys
                that match its own fields.

        Raises:
            ValueError: If `backend` is not a supported backend name.
        """
        self.default_config = self.default_config or {}
        kwargs = {**self.default_config, **kwargs}
        self.backend = backend
        self.debug = kwargs.get("debug", False)

        # Create the fallbacks only when needed, and tolerate an explicit None.
        logger = kwargs.get("logger")
        if logger is None:
            logger = Logger.default_logger(logger_name=self.__class__.__name__)
        self.logger = logger
        self.logger.set_level(Logger.DEBUG if self.debug else Logger.INFO)

        fs = kwargs.get("fs")
        if fs is None:
            fs = fsspec.filesystem('file')
        self.fs = fs

        # Make the resolved objects visible to the config models built below.
        kwargs["fs"] = self.fs
        kwargs["logger"] = self.logger

        self._backend_query = self._get_config(QueryConfig, kwargs)
        self._backend_params = self._get_config(ParamsConfig, kwargs)
        self.backend_db_connection: Optional[SqlAlchemyConnectionConfig] = None
        self.backend_parquet: Optional[ParquetConfig] = None
        self.backend_http: Optional[HttpConfig] = None

        if self.backend == 'sqlalchemy':
            self.backend_db_connection = self._get_config(SqlAlchemyConnectionConfig, kwargs)
        elif self.backend == 'parquet':
            self.backend_parquet = self._get_config(ParquetConfig, kwargs)
        elif self.backend == 'http':
            self.backend_http = self._get_config(HttpConfig, kwargs)

        strategy_class = self._BACKEND_STRATEGIES.get(self.backend)
        if not strategy_class:
            raise ValueError(f"Unsupported backend: {self.backend}")
        self.backend_strategy = strategy_class(self)

    def __enter__(self):
        """Enter the runtime context, returning self."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Release backend resources; exceptions are not suppressed."""
        self._cleanup()

    def _cleanup(self):
        """Close the active backend's config object if it has a `close()` method."""
        attr_name = self._BACKEND_CONFIG_ATTRS.get(self.backend, f"backend_{self.backend}")
        active_config = getattr(self, attr_name, None)
        if active_config is not None and hasattr(active_config, "close"):
            self.logger.debug(f"Closing resources for '{self.backend}' backend.")
            active_config.close()

    def _get_config(self, model: type[T], kwargs: Dict[str, Any]) -> T:
        """Instantiate `model` from the subset of `kwargs` matching its declared fields."""
        recognized_keys = set(model.model_fields.keys())
        model_kwargs = {k: kwargs[k] for k in recognized_keys if k in kwargs}
        return model(**model_kwargs)

    @staticmethod
    def _is_empty_dask(df) -> bool:
        """Return True for a single-partition Dask frame whose first row is missing.

        Objects without `npartitions` (e.g. pandas frames) are never reported
        as empty by this check.
        """
        return hasattr(df, 'npartitions') and df.npartitions == 1 and not len(df.head(1))

    def _finalize(self, df, as_pandas: bool) -> Union[pd.DataFrame, dd.DataFrame]:
        """Apply the field map and post-processing, then optionally materialize."""
        df = self._process_loaded_data(df)
        df = self._post_process_df(df)
        # A backend may already return a pandas frame, which has no compute().
        if as_pandas and hasattr(df, "compute"):
            return df.compute()
        return df

    def load(self, as_pandas=False, **options) -> Union[pd.DataFrame, dd.DataFrame]:
        """Loads data synchronously. Fails if backend is async-only."""
        self.logger.debug(f"Loading data from {self.backend} backend with options: {options}")
        df = self.backend_strategy.load(**options)
        return self._finalize(df, as_pandas)

    async def aload(self, as_pandas=False, **options) -> Union[pd.DataFrame, dd.DataFrame]:
        """Loads data asynchronously from any backend."""
        df = await self.backend_strategy.aload(**options)
        return self._finalize(df, as_pandas)

    def _post_process_df(self, df: dd.DataFrame) -> dd.DataFrame:
        """Apply the `fieldnames`, `column_names` and `index_col` entries of `df_params`.

        Raises:
            ValueError: If `column_names` does not match the number of columns,
                or if `index_col` is not a column of the frame.
        """
        df_params = self._backend_params.df_params
        if not df_params:
            return df
        fieldnames = df_params.get("fieldnames")
        column_names = df_params.get("column_names")
        index_col = df_params.get("index_col")
        if not any([fieldnames, column_names, index_col]):
            return df

        self.logger.debug("Post-processing DataFrame.")
        if fieldnames:
            # Keep only requested columns that exist; warn about the rest.
            valid_fieldnames = [f for f in fieldnames if f in df.columns]
            if len(valid_fieldnames) < len(fieldnames):
                self.logger.warning(
                    f"Missing columns for filtering: {set(fieldnames) - set(valid_fieldnames)}")
            df = df[valid_fieldnames]
        if column_names:
            if len(df.columns) != len(column_names):
                raise ValueError(
                    f"Length mismatch: DataFrame has {len(df.columns)} columns, but {len(column_names)} names were provided.")
            df = df.rename(columns=dict(zip(df.columns, column_names)))
        if index_col:
            if index_col not in df.columns:
                raise ValueError(f"Index column '{index_col}' not found in DataFrame.")
            df = df.set_index(index_col)
        return df

    def _process_loaded_data(self, df: dd.DataFrame) -> dd.DataFrame:
        """Rename columns according to the params config's `field_map`, if any."""
        field_map = self._backend_params.field_map or {}
        if not isinstance(field_map, dict) or not field_map:
            return df
        if self._is_empty_dask(df):
            return df
        self.logger.debug("Processing loaded data...")
        rename_mapping = {k: v for k, v in field_map.items() if k in df.columns}
        if rename_mapping:
            df = df.rename(columns=rename_mapping)
        return df

    def save_to_parquet(self, df: dd.DataFrame, parquet_filename: str, **kwargs):
        """Write `df` to parquet through `ParquetSaver`; empty frames are skipped.

        Args:
            df: Frame to save.
            parquet_filename: Target file name passed to the saver.
            **kwargs: Optional `fs` and `parquet_storage_path` overrides.

        Raises:
            ValueError: If no storage path is given and no parquet config exists.
        """
        if self._is_empty_dask(df):
            self.logger.warning("Cannot save to parquet; DataFrame is empty.")
            return
        fs = kwargs.pop('fs', self.fs)
        path = kwargs.pop('parquet_storage_path', None)
        if path is None:
            # Only consult the parquet config when the caller gave no path;
            # that config is None for the other backends.
            if self.backend_parquet is None:
                raise ValueError(
                    "'parquet_storage_path' must be provided when the parquet backend is not configured.")
            path = self.backend_parquet.parquet_storage_path
        ParquetSaver(df, path, self.logger, fs).save_to_parquet(parquet_filename)
        self.logger.debug(f"Parquet saved to {parquet_filename} in path: {path}.")

    def save_to_clickhouse(self, df: dd.DataFrame, **credentials):
        """Write `df` to ClickHouse through `ClickHouseWriter`; empty frames are skipped."""
        if self._is_empty_dask(df):
            self.logger.warning("Cannot write to ClickHouse; DataFrame is empty.")
            return
        ClickHouseWriter(self.logger, **credentials).save_to_clickhouse(df)
        self.logger.debug("Save to ClickHouse completed.")

    def load_period(self, dt_field: str, start: str, end: str, **kwargs) -> Union[pd.DataFrame, dd.DataFrame]:
        """Synchronous convenience method for loading a date range."""
        final_kwargs = self._prepare_period_filters(dt_field, start, end, **kwargs)
        return self.load(**final_kwargs)

    async def aload_period(self, dt_field: str, start: str, end: str, **kwargs) -> Union[pd.DataFrame, dd.DataFrame]:
        """Asynchronous convenience method for loading a date range."""
        final_kwargs = self._prepare_period_filters(dt_field, start, end, **kwargs)
        return await self.aload(**final_kwargs)

    def _prepare_period_filters(self, dt_field: str, start: str, end: str, **kwargs) -> dict:
        """Translate a date range into `<field>__date` / `<field>__date__range` filters.

        `dt_field` is translated back through `field_map` (value -> key) when a
        mapping exists, so the filter is keyed by the source field name.

        Raises:
            ValueError: If `start` is later than `end`.
        """
        start_date = pd.to_datetime(start).date()
        end_date = pd.to_datetime(end).date()
        if start_date > end_date:
            raise ValueError("'start' date cannot be later than 'end' date.")

        field_map = self._backend_params.field_map or {}
        reverse_map = {v: k for k, v in field_map.items()} if field_map else {}
        if len(reverse_map) != len(field_map):
            self.logger.warning(
                "field_map values are not unique; reverse mapping may be unreliable.")
        mapped_field = reverse_map.get(dt_field, dt_field)

        if start_date == end_date:
            kwargs[f"{mapped_field}__date"] = start_date
        else:
            kwargs[f"{mapped_field}__date__range"] = [start_date, end_date]
        self.logger.debug(f"Period load generated filters: {kwargs}")
        return kwargs
230
+
@@ -1,4 +1,3 @@
1
- from ._filter_handler import SqlAlchemyFilterHandler
2
1
  from ._db_connection import SqlAlchemyConnectionConfig
3
2
  from ._load_from_db import SqlAlchemyLoadFromDb
4
3
  from ._sql_model_builder import SqlAlchemyModelBuilder
@@ -7,5 +6,4 @@ __all__ = [
7
6
  'SqlAlchemyConnectionConfig',
8
7
  'SqlAlchemyModelBuilder',
9
8
  'SqlAlchemyLoadFromDb',
10
- 'SqlAlchemyFilterHandler'
11
9
  ]
@@ -0,0 +1,248 @@
1
+ from __future__ import annotations
2
+
3
+ import threading
4
+ from contextlib import contextmanager
5
+ from typing import Any, Optional, ClassVar, Generator, Type, Dict
6
+
7
+ from pydantic import (
8
+ BaseModel,
9
+ field_validator,
10
+ model_validator,
11
+ ConfigDict,
12
+ )
13
+ from sqlalchemy import create_engine, event, text
14
+ from sqlalchemy.engine import url as sqlalchemy_url
15
+ from sqlalchemy.engine import Engine
16
+ from sqlalchemy.exc import OperationalError, SQLAlchemyError
17
+ from sqlalchemy.orm import sessionmaker, Session
18
+ from sqlalchemy.pool import QueuePool, NullPool, StaticPool
19
+
20
+ # Assuming these are your project's internal modules
21
+ from sibi_dst.utils import Logger
22
+ from ._sql_model_builder import SqlAlchemyModelBuilder
23
+
24
+
25
class SqlAlchemyConnectionConfig(BaseModel):
    """
    A registry-backed SQLAlchemy connection manager.

    This class holds database connection configuration and shares one
    SQLAlchemy Engine between all instances whose engine key (normalized URL
    plus pool settings) is identical. It is designed to be used as a context
    manager so that the shared engine reference is always released.

    Recommended Usage is via the `with` statement.
        with SqlAlchemyConnectionConfig(...) as config:
            session = config.get_session()
            # ... do work ...
        # config.close() is called automatically upon exiting the block.

    Key Features:
    - Context Manager Support: `__exit__` calls `close()`.
    - Shared Engine & Pool: instances with the same engine key reuse one Engine.
    - Reference Counting: the shared engine is disposed only when the last
      instance using it has been closed.
    """
    # --- Public Configuration ---
    connection_url: str
    table: Optional[str] = None
    debug: bool = False

    # --- Pool Configuration ---
    pool_size: int = 5
    max_overflow: int = 10
    pool_timeout: int = 30
    pool_recycle: int = 1800
    pool_pre_ping: bool = True
    poolclass: Type[QueuePool] = QueuePool

    # --- Internal & Runtime State ---
    model: Optional[Type[Any]] = None
    engine: Optional[Engine] = None
    logger: Optional[Logger] = None
    session_factory: Optional[sessionmaker] = None

    # --- Private State ---
    _engine_key_instance: tuple = ()
    _closed: bool = False  # Flag to prevent double-closing.

    # --- Class-level Shared Resources ---
    # key -> {'engine': Engine, 'ref_count': int, 'active_connections': int}
    _engine_registry: ClassVar[Dict[tuple, Dict[str, Any]]] = {}
    # Plain (non-reentrant) lock: never call code that re-acquires it while held.
    _registry_lock: ClassVar[threading.Lock] = threading.Lock()

    model_config = ConfigDict(arbitrary_types_allowed=True)

    def __enter__(self) -> SqlAlchemyConnectionConfig:
        """Enter the runtime context, returning self."""
        return self

    def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
        """Exit the runtime context, ensuring that close() is called."""
        self.close()

    @field_validator("pool_size", "max_overflow", "pool_timeout", "pool_recycle")
    @classmethod
    def _validate_pool_params(cls, v: int) -> int:
        """Reject negative pool settings."""
        if v < 0:
            raise ValueError("Pool parameters must be non-negative")
        return v

    @model_validator(mode="after")
    def _init_all(self) -> SqlAlchemyConnectionConfig:
        """Orchestrates the initialization process after Pydantic validation."""
        self._init_logger()
        self._engine_key_instance = self._get_engine_key()
        self._init_engine()
        try:
            self._validate_conn()
            self._build_model()
        except Exception:
            # _init_engine already took a reference on the shared engine. The
            # caller never receives this instance when construction fails, so
            # release the reference here or the engine is never disposed.
            self.close()
            raise
        if self.engine:
            self.session_factory = sessionmaker(bind=self.engine, expire_on_commit=False)
        return self

    def _init_logger(self) -> None:
        """Initializes the logger for this instance and sets its level from `debug`."""
        if self.logger is None:
            self.logger = Logger.default_logger(logger_name=self.__class__.__name__)
        log_level = Logger.DEBUG if self.debug else Logger.INFO
        self.logger.set_level(log_level)

    def _safe_url(self) -> str:
        """Return the connection URL with the password masked, for log output."""
        return sqlalchemy_url.make_url(self.connection_url).render_as_string(hide_password=True)

    def _get_engine_key(self) -> tuple:
        """Generates a normalized registry key for this engine configuration.

        Query parameters starting with `pool_` are dropped from the URL, and
        the pool settings are appended unless the pool class is NullPool or
        StaticPool.
        """
        parsed = sqlalchemy_url.make_url(self.connection_url)
        query = {k: v for k, v in parsed.query.items() if not k.startswith("pool_")}
        normalized_url = parsed.set(query=query)
        # Render explicitly rather than via str(): whether str(URL) masks the
        # password depends on the SQLAlchemy version, and a masked key would
        # let configs with different credentials share one engine. Because the
        # key therefore contains the password, it must never be logged.
        key_parts = [normalized_url.render_as_string(hide_password=False)]
        if self.poolclass not in (NullPool, StaticPool):
            key_parts += [
                self.pool_size, self.max_overflow, self.pool_timeout,
                self.pool_recycle, self.pool_pre_ping
            ]
        return tuple(key_parts)

    def _init_engine(self) -> None:
        """Initializes or reuses a shared SQLAlchemy Engine.

        Raises:
            SQLAlchemyError: If the engine cannot be created.
        """
        with self._registry_lock:
            engine_wrapper = self._engine_registry.get(self._engine_key_instance)
            if engine_wrapper:
                self.engine = engine_wrapper['engine']
                engine_wrapper['ref_count'] += 1
                self.logger.debug(f"Reusing engine. Ref count: {engine_wrapper['ref_count']}.")
                return

            # Log the masked URL, not the registry key (the key holds the password).
            self.logger.debug(f"Creating new engine for: {self._safe_url()}")
            try:
                new_engine = create_engine(
                    self.connection_url, pool_size=self.pool_size,
                    max_overflow=self.max_overflow, pool_timeout=self.pool_timeout,
                    pool_recycle=self.pool_recycle, pool_pre_ping=self.pool_pre_ping,
                    poolclass=self.poolclass,
                )
                self.engine = new_engine
                self._attach_events()
                self._engine_registry[self._engine_key_instance] = {
                    'engine': new_engine, 'ref_count': 1, 'active_connections': 0
                }
            except Exception as e:
                self.logger.error(f"Failed to create engine: {e}")
                raise SQLAlchemyError(f"Engine creation failed: {e}") from e

    def close(self) -> None:
        """
        Decrements the engine's reference count and disposes of the engine
        if the count reaches zero. Safe to call more than once; only the
        first call has an effect. Called automatically when exiting a
        `with` block.
        """
        # Prevent the method from running more than once per instance.
        if self._closed:
            self.logger.debug("Attempted to close an already-closed config instance.")
            return

        with self._registry_lock:
            key = self._engine_key_instance
            engine_wrapper = self._engine_registry.get(key)

            if not engine_wrapper:
                self.logger.warning("Attempted to close a config whose engine is not in the registry.")
            else:
                engine_wrapper['ref_count'] -= 1
                self.logger.debug(f"Closing config. Ref count is now {engine_wrapper['ref_count']}.")

                if engine_wrapper['ref_count'] <= 0:
                    self.logger.debug(f"Disposing engine as reference count is zero. URL: {self._safe_url()}")
                    engine_wrapper['engine'].dispose()
                    del self._engine_registry[key]

        # Mark this instance as closed on every path so repeated calls are no-ops.
        self._closed = True

    def _attach_events(self) -> None:
        """Attaches checkout/checkin events to the engine for connection tracking.

        Only the instance that creates the engine registers listeners; they
        update the registry entry shared by every instance with the same key.
        """
        if self.engine:
            event.listen(self.engine, "checkout", self._on_checkout)
            event.listen(self.engine, "checkin", self._on_checkin)

    def _on_checkout(self, *args) -> None:
        """Event listener for when a connection is checked out from the pool."""
        with self._registry_lock:
            wrapper = self._engine_registry.get(self._engine_key_instance)
            if not wrapper:
                return
            wrapper['active_connections'] += 1
            active = wrapper['active_connections']
        # Log after releasing the lock: reading `self.active_connections` here
        # would re-acquire the non-reentrant lock.
        self.logger.debug(f"Connection checked out. Active: {active}")

    def _on_checkin(self, *args) -> None:
        """Event listener for when a connection is returned to the pool."""
        with self._registry_lock:
            wrapper = self._engine_registry.get(self._engine_key_instance)
            if not wrapper:
                return
            # Clamp at zero so an unmatched checkin cannot drive the count negative.
            wrapper['active_connections'] = max(0, wrapper['active_connections'] - 1)
            active = wrapper['active_connections']
        self.logger.debug(f"Connection checked in. Active: {active}")

    @property
    def active_connections(self) -> int:
        """Returns the number of active connections for this instance's engine."""
        with self._registry_lock:
            wrapper = self._engine_registry.get(self._engine_key_instance)
            return wrapper['active_connections'] if wrapper else 0

    def _validate_conn(self) -> None:
        """Tests the database connection by executing `SELECT 1`.

        Raises:
            ValueError: If the connection attempt raises `OperationalError`.
        """
        try:
            with self.managed_connection() as conn:
                conn.execute(text("SELECT 1"))
            self.logger.debug("Database connection validated successfully.")
        except OperationalError as e:
            self.logger.error(f"Database connection failed: {e}")
            raise ValueError(f"DB connection failed: {e}") from e

    @contextmanager
    def managed_connection(self) -> Generator[Any, None, None]:
        """Provides a single database connection from the engine pool.

        The connection is closed when the `with` block exits.

        Raises:
            RuntimeError: If the engine has not been initialized.
        """
        if not self.engine:
            raise RuntimeError("Engine not initialized. Cannot get a connection.")
        conn = self.engine.connect()
        try:
            yield conn
        finally:
            conn.close()

    def get_session(self) -> Session:
        """Returns a new SQLAlchemy Session from the session factory.

        Raises:
            RuntimeError: If the session factory has not been initialized.
        """
        if not self.session_factory:
            raise RuntimeError("Session factory not initialized. Cannot get a session.")
        return self.session_factory()

    def _build_model(self) -> None:
        """Dynamically builds an ORM model if `self.table` is set.

        Raises:
            ValueError: If `SqlAlchemyModelBuilder` fails for the table.
        """
        if not self.table or not self.engine:
            return
        try:
            builder = SqlAlchemyModelBuilder(self.engine, self.table)
            self.model = builder.build_model()
            self.logger.debug(f"Successfully built ORM model for table: {self.table}")
        except Exception as e:
            self.logger.error(f"Failed to build ORM model for table '{self.table}': {e}")
            raise ValueError(f"Model construction failed for table '{self.table}': {e}") from e
@@ -0,0 +1,179 @@
1
+ from typing import Type
2
+
3
+ import dask
4
+ import dask.dataframe as dd
5
+ import pandas as pd
6
+ from sqlalchemy import (
7
+ inspect,
8
+ select,
9
+ func,
10
+ )
11
+ from sqlalchemy.engine import Engine
12
+ from sqlalchemy.orm import declarative_base
13
+ import time
14
+ from sqlalchemy.exc import TimeoutError
15
+ import sqlalchemy as sa
16
+ from sibi_dst.df_helper.core import FilterHandler
17
+ from sibi_dst.utils import Logger
18
+
19
+
20
class SQLAlchemyDask:
    """
    Loads data from a database into a Dask DataFrame using a memory-safe,
    non-parallel, paginated approach.

    This class avoids using a numeric `index_col` for parallel loading.
    Instead, the result set is split into LIMIT/OFFSET pages, each fetched by
    its own Dask-delayed task.
    """

    # Maps the upper-cased SQLAlchemy type name (with any "(...)" arguments
    # stripped) to the dtype used for the Dask meta. Unlisted types fall back
    # to "object".
    _SQLALCHEMY_TO_DASK_DTYPE = {
        "INTEGER": "Int64",
        "SMALLINT": "Int64",
        "BIGINT": "Int64",
        "FLOAT": "float64",
        "NUMERIC": "float64",
        "BOOLEAN": "bool",
        "VARCHAR": "object",
        "TEXT": "object",
        "DATE": "datetime64[ns]",
        "DATETIME": "datetime64[ns]",
        "TIME": "object",
        "UUID": "object",
    }

    def __init__(
            self,
            model: Type[declarative_base()],
            filters: dict,
            engine: Engine,
            chunk_size: int = 1000,
            logger=None,
            debug: bool = False,
    ):
        """
        Initializes the data loader.

        Args:
            model: The SQLAlchemy ORM model for the table.
            filters: A dictionary of filters to apply to the query.
            engine: An SQLAlchemy Engine instance.
            chunk_size: The number of records to fetch in each database query.
            logger: A logger instance. When omitted, a default logger named
                after this class is created.
            debug: Whether to enable detailed logging.
        """
        self.model = model
        self.filters = filters
        self.engine = engine
        self.chunk_size = chunk_size
        self.debug = debug
        self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
        self.logger.set_level(Logger.DEBUG if debug else Logger.INFO)
        self.filter_handler_cls = FilterHandler

    @classmethod
    def infer_meta_from_model(cls, model: Type[declarative_base()]) -> dict:
        """
        Infers a metadata dictionary for Dask based on the SQLAlchemy model.
        This helps Dask understand the DataFrame structure without reading data.

        Args:
            model: The SQLAlchemy ORM model to inspect.

        Returns:
            A dict mapping each column name to a dtype string. Column types
            missing from `_SQLALCHEMY_TO_DASK_DTYPE` map to "object".
        """
        mapper = inspect(model)
        dtypes = {}
        for column in mapper.columns:
            # "VARCHAR(255)" -> "VARCHAR": drop length/precision arguments.
            dtype_str = str(column.type).upper().split("(")[0]
            dtypes[column.name] = cls._SQLALCHEMY_TO_DASK_DTYPE.get(dtype_str, "object")
        return dtypes

    def _count_records(self, query):
        """
        Counts the rows matched by `query`, retrying on pool timeouts.

        A pool `TimeoutError` is retried up to three times with a doubling
        delay (0.5s, then 1s). Any other exception aborts immediately.

        Args:
            query: The filtered, un-paginated SELECT statement.

        Returns:
            The row count, or None if the count could not be obtained (the
            failure has already been logged).
        """
        retry_attempts = 3
        backoff_factor = 0.5  # start with a 0.5-second delay

        for attempt in range(retry_attempts):
            try:
                with self.engine.connect() as connection:
                    count_query = sa.select(sa.func.count()).select_from(query.alias())
                    return connection.execute(count_query).scalar_one()

            except TimeoutError:
                if attempt < retry_attempts - 1:
                    self.logger.warning(
                        f"Connection pool limit reached. Retrying in {backoff_factor} seconds..."
                    )
                    time.sleep(backoff_factor)
                    backoff_factor *= 2  # Double the backoff time for the next attempt
                else:
                    self.logger.error(
                        "Failed to get a connection from the pool after several retries.",
                        exc_info=True
                    )
                    return None

            except Exception as e:
                self.logger.error(f"An unexpected error occurred: {e}", exc_info=True)
                return None

        return None

    def _with_stable_order(self, query):
        """
        Returns `query` ordered by the model's primary key.

        LIMIT/OFFSET pages are only guaranteed to be disjoint and complete
        when the statement has a total ordering; without one the database may
        return rows in a different order for each page, duplicating or
        skipping rows. Any ordering already on `query` is kept and the primary
        key is appended as a tie-breaker.
        """
        pk_columns = list(inspect(self.model).primary_key)
        if not pk_columns:
            return query
        return query.order_by(*pk_columns)

    def read_frame(self, fillna_value=None) -> dd.DataFrame:
        """
        Builds and executes a query to load data into a Dask DataFrame.

        This method works by first running a COUNT query to get the total
        size, then creating a series of delayed tasks that each fetch a
        chunk of data using LIMIT/OFFSET over a primary-key-ordered query.

        Args:
            fillna_value: Value to replace NaN or NULL values with, if any.

        Returns:
            A lazy Dask DataFrame. If the count fails or the query matches no
            rows, an empty single-partition DataFrame with the expected
            columns and dtypes is returned instead.
        """
        # 1. Build the base query and apply filters
        query = select(self.model)
        if self.filters:
            query = self.filter_handler_cls(
                backend="sqlalchemy", logger=self.logger, debug=self.debug
            ).apply_filters(query, model=self.model, filters=self.filters)

        self.logger.debug(f"Base query for pagination: {query}")

        # 2. Get metadata for the Dask DataFrame structure
        ordered_columns = [column.name for column in self.model.__table__.columns]
        meta_dtypes = self.infer_meta_from_model(self.model)
        meta_df = pd.DataFrame(columns=ordered_columns).astype(meta_dtypes)

        # 3. Get the total record count to calculate the number of chunks.
        #    The count uses the unordered query: an ORDER BY inside a counted
        #    subquery is pointless and rejected by some databases.
        total_records = self._count_records(query)
        if total_records is None:
            return dd.from_pandas(meta_df, npartitions=1)

        if total_records == 0:
            self.logger.warning("Query returned 0 records.")
            return dd.from_pandas(meta_df, npartitions=1)

        self.logger.debug(f"Total records to fetch: {total_records}. Chunk size: {self.chunk_size}.")

        # 4. Create a list of Dask Delayed objects, one for each chunk
        paginated_base = self._with_stable_order(query)

        @dask.delayed
        def get_chunk(sql_query, chunk_offset):
            """A Dask-delayed function to fetch one chunk of data."""
            # LIMIT/OFFSET must be applied in the delayed function
            paginated_query = sql_query.limit(self.chunk_size).offset(chunk_offset)
            df = pd.read_sql(paginated_query, self.engine)

            if fillna_value is not None:
                df = df.fillna(fillna_value)

            # Ensure column order and types match the meta
            return df[ordered_columns].astype(meta_dtypes)

        offsets = range(0, total_records, self.chunk_size)
        delayed_chunks = [get_chunk(paginated_base, offset) for offset in offsets]

        # 5. Construct the final lazy Dask DataFrame from the delayed chunks
        ddf = dd.from_delayed(delayed_chunks, meta=meta_df)
        self.logger.debug(f"Successfully created a lazy Dask DataFrame with {ddf.npartitions} partitions.")

        return ddf