sibi-dst 0.3.64.tar.gz → 2025.1.2.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88)
  1. sibi_dst-2025.1.2/PKG-INFO +55 -0
  2. {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/README.md +3 -9
  3. sibi_dst-2025.1.2/pyproject.toml +49 -0
  4. {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/df_helper/_df_helper.py +5 -3
  5. sibi_dst-2025.1.2/sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +329 -0
  6. {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/df_helper/core/__init__.py +0 -4
  7. {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/df_helper/core/_defaults.py +1 -50
  8. {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/utils/__init__.py +0 -2
  9. {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/utils/data_wrapper.py +9 -12
  10. {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/utils/update_planner.py +2 -0
  11. sibi_dst-0.3.64/PKG-INFO +0 -90
  12. sibi_dst-0.3.64/pyproject.toml +0 -55
  13. sibi_dst-0.3.64/sibi_dst/df_helper/backends/django/__init__.py +0 -11
  14. sibi_dst-0.3.64/sibi_dst/df_helper/backends/django/_db_connection.py +0 -88
  15. sibi_dst-0.3.64/sibi_dst/df_helper/backends/django/_io_dask.py +0 -450
  16. sibi_dst-0.3.64/sibi_dst/df_helper/backends/django/_load_from_db.py +0 -227
  17. sibi_dst-0.3.64/sibi_dst/df_helper/backends/django/_sql_model_builder.py +0 -493
  18. sibi_dst-0.3.64/sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +0 -179
  19. sibi_dst-0.3.64/sibi_dst/utils/airflow_manager.py +0 -212
  20. {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/__init__.py +0 -0
  21. {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/df_helper/__init__.py +0 -0
  22. {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/df_helper/_artifact_updater_multi_wrapper.py +0 -0
  23. {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/df_helper/_parquet_artifact.py +0 -0
  24. {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/df_helper/_parquet_reader.py +0 -0
  25. {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/df_helper/backends/__init__.py +0 -0
  26. {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/df_helper/backends/http/__init__.py +0 -0
  27. {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/df_helper/backends/http/_http_config.py +0 -0
  28. {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/df_helper/backends/parquet/__init__.py +0 -0
  29. {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/df_helper/backends/parquet/_filter_handler.py +0 -0
  30. {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/df_helper/backends/parquet/_parquet_options.py +0 -0
  31. {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/df_helper/backends/sqlalchemy/__init__.py +0 -0
  32. {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +0 -0
  33. {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +0 -0
  34. {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +0 -0
  35. {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/df_helper/core/_filter_handler.py +0 -0
  36. {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/df_helper/core/_params_config.py +0 -0
  37. {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/df_helper/core/_query_config.py +0 -0
  38. {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/df_helper/data_cleaner.py +0 -0
  39. {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/geopy_helper/__init__.py +0 -0
  40. {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/geopy_helper/geo_location_service.py +0 -0
  41. {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/geopy_helper/utils.py +0 -0
  42. {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/osmnx_helper/__init__.py +0 -0
  43. {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/osmnx_helper/base_osm_map.py +0 -0
  44. {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/osmnx_helper/basemaps/__init__.py +0 -0
  45. {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/osmnx_helper/basemaps/calendar_html.py +0 -0
  46. {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/osmnx_helper/basemaps/router_plotter.py +0 -0
  47. {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/osmnx_helper/utils.py +0 -0
  48. {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/tests/__init__.py +0 -0
  49. {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/tests/test_data_wrapper_class.py +0 -0
  50. {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/utils/clickhouse_writer.py +0 -0
  51. {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/utils/credentials.py +0 -0
  52. {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/utils/data_from_http_source.py +0 -0
  53. {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/utils/data_utils.py +0 -0
  54. {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/utils/date_utils.py +0 -0
  55. {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/utils/df_utils.py +0 -0
  56. {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/utils/file_utils.py +0 -0
  57. {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/utils/filepath_generator.py +0 -0
  58. {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/utils/log_utils.py +0 -0
  59. {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/utils/manifest_manager.py +0 -0
  60. {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/utils/parquet_saver.py +0 -0
  61. {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/utils/phone_formatter.py +0 -0
  62. {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/utils/storage_config.py +0 -0
  63. {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/utils/storage_manager.py +0 -0
  64. {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/utils/webdav_client.py +0 -0
  65. {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/v2/__init__.py +0 -0
  66. {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/v2/df_helper/__init__.py +0 -0
  67. {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/v2/df_helper/_df_helper.py +0 -0
  68. {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/v2/df_helper/backends/__init__.py +0 -0
  69. {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/v2/df_helper/backends/sqlalchemy/__init__.py +0 -0
  70. {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/v2/df_helper/backends/sqlalchemy/_db_connection.py +0 -0
  71. {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/v2/df_helper/backends/sqlalchemy/_io_dask.py +0 -0
  72. {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/v2/df_helper/backends/sqlalchemy/_load_from_db.py +0 -0
  73. {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/v2/df_helper/backends/sqlalchemy/_model_builder.py +0 -0
  74. {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/v2/df_helper/backends/sqlmodel/__init__.py +0 -0
  75. {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/v2/df_helper/backends/sqlmodel/_db_connection.py +0 -0
  76. {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/v2/df_helper/backends/sqlmodel/_io_dask.py +0 -0
  77. {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/v2/df_helper/backends/sqlmodel/_load_from_db.py +0 -0
  78. {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/v2/df_helper/backends/sqlmodel/_model_builder.py +0 -0
  79. {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/v2/df_helper/core/__init__.py +0 -0
  80. {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/v2/df_helper/core/_filter_handler.py +0 -0
  81. {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/v2/df_helper/core/_params_config.py +0 -0
  82. {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/v2/df_helper/core/_query_config.py +0 -0
  83. {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/v2/utils/__init__.py +0 -0
  84. {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/v2/utils/log_utils.py +0 -0
  85. {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/v3/__init__.py +0 -0
  86. {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/v3/backends/__init__.py +0 -0
  87. {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/v3/df_helper/__init__.py +0 -0
  88. {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/v3/df_helper/_df_helper.py +0 -0
sibi_dst-2025.1.2/PKG-INFO
@@ -0,0 +1,55 @@
+ Metadata-Version: 2.1
+ Name: sibi-dst
+ Version: 2025.1.2
+ Summary: Data Science Toolkit
+ Author: Luis Valverde
+ Author-email: lvalverdeb@gmail.com
+ Requires-Python: >=3.12,<4.0
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.12
+ Classifier: Programming Language :: Python :: 3.13
+ Requires-Dist: clickhouse-connect (>=0.8.18,<0.9.0)
+ Requires-Dist: clickhouse-driver (>=0.2.9,<0.3.0)
+ Requires-Dist: dask[complete] (>=2025.5.1,<2026.0.0)
+ Requires-Dist: mysqlclient (>=2.2.7,<3.0.0)
+ Requires-Dist: pandas (>=2.3.1,<3.0.0)
+ Requires-Dist: psycopg2 (>=2.9.10,<3.0.0)
+ Requires-Dist: pydantic (>=2.11.7,<3.0.0)
+ Requires-Dist: pymysql (>=1.1.1,<2.0.0)
+ Requires-Dist: s3fs (>=2025.5.1,<2026.0.0)
+ Requires-Dist: sqlalchemy (>=2.0.41,<3.0.0)
+ Requires-Dist: tqdm (>=4.67.1,<5.0.0)
+ Requires-Dist: webdav4 (>=0.10.0,<0.11.0)
+ Description-Content-Type: text/markdown
+
+ ### SIBI-DST
+
+ Data Science Toolkit built with Python, Pandas, Dask, OpenStreetMaps, NetworkX, SQLAlchemy, GeoPandas, and Folium.
+
+ ## Example Use Cases
+
+ 1. **Build DataCubes, DataSets, and DataObjects** from diverse data sources, including **relational databases, Parquet files, Excel (`.xlsx`), delimited tables (`.csv`, `.tsv`), JSON, and RESTful APIs**.
+ 2. **Comprehensive DataFrame Management** utilities for efficient data handling, transformation, and optimization using **Pandas** and **Dask**.
+ 3. **Flexible Data Sharing** with client applications by writing to **Data Warehouses in Clickhouse, local filesystems, and cloud storage platforms** such as **S3**.
+ 4. **Microservices for Data Access** – Build scalable **API-driven services** using **RESTful APIs (`Django REST Framework`, `FastAPI`)** for high-performance data exchange.
+ 5. **Geospatial Analysis** – Utilize **OpenStreetMaps** and **GeoPandas** for advanced geospatial data processing and visualization.
+
+ ## Supported Technologies
+
+ - **Data Processing**: Pandas, Dask
+ - **Databases & Storage**: SQLAlchemy, Parquet, S3, Clickhouse
+ - **Mapping & Geospatial Analysis**: OpenStreetMaps, OSMnx, Geopy
+ - **API Development**: Django REST Framework, FastAPI
+
+ ## Installation
+
+ ```bash
+ # with pip
+
+ pip install sibi-dst # Install only the main package
+ pip install sibi-dst[geospatial] # Install with geospatial dependencies
+ pip install sibi-dst[dev,test,geospatial] # Install all optional dependencies
+
+
+ ```
+
{sibi_dst-0.3.64 → sibi_dst-2025.1.2}/README.md
@@ -22,15 +22,9 @@ Data Science Toolkit built with Python, Pandas, Dask, OpenStreetMaps, NetworkX,
  ```bash
  # with pip

- pip install sibi-dst[complete] # Install all dependencies
- pip install sibi-dst[df_helper] # Install only df_helper dependencies
- pip install sibi-dst[geospatial] # Install only geospatial dependencies
-
- # with poetry
-
- poetry add "sibi-dst[complete]" # Install all dependencies
- poetry add "sibi-dst[df_helper]" # Install only df_helper dependencies
- poetry add "sibi-dst[geospatial]" # Install only geospatial dependencies
+ pip install sibi-dst # Install only the main package
+ pip install sibi-dst[geospatial] # Install with geospatial dependencies
+ pip install sibi-dst[dev,test,geospatial] # Install all optional dependencies


  ```
sibi_dst-2025.1.2/pyproject.toml
@@ -0,0 +1,49 @@
+ [tool.poetry]
+ name = "sibi-dst"
+ version = "2025.1.2"
+ description = "Data Science Toolkit"
+ authors = ["Luis Valverde <lvalverdeb@gmail.com>"]
+ readme = "README.md"
+ packages = [{ include = "sibi_dst" }]
+
+ [tool.poetry.dependencies]
+ python = ">=3.12,<4.0"
+ pandas = "^2.3.1"
+ dask = {extras = ["complete"], version = "^2025.5.1"}
+ psycopg2 = "^2.9.10"
+ mysqlclient = "^2.2.7"
+ webdav4 = "^0.10.0"
+ clickhouse-connect = "^0.8.18"
+ clickhouse-driver = "^0.2.9"
+ tqdm = "^4.67.1"
+ s3fs = "^2025.5.1"
+ pydantic = "^2.11.7"
+ sqlalchemy = "^2.0.41"
+ pymysql = "^1.1.1"
+
+ [tool.poetry.group.dev]
+ optional = true
+
+ [tool.poetry.group.dev.dependencies]
+ jupyter = "^1.1.1"
+ python-dotenv = "^1.1.1"
+ black = "^25.1.0"
+
+ [tool.poetry.group.test]
+ optional = true
+
+ [tool.poetry.group.test.dependencies]
+ pytest = "^8.4.1"
+ pytest-cov = "^6.2.1"
+
+ [tool.poetry.group.geospatial]
+ optional = true
+
+ [tool.poetry.group.geospatial.dependencies]
+ osmnx = "^2.0.5"
+ geopy = "^2.4.1"
+ folium = "^0.20.0"
+
+ [build-system]
+ requires = ["poetry-core"]
+ build-backend = "poetry.core.masonry.api"
{sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/df_helper/_df_helper.py
@@ -26,6 +26,7 @@ class BaseBackend:
  def __init__(self, helper: DfHelper):
  self.helper = helper
  self.logger = helper.logger
+ self.debug = helper.debug

  def load(self, **options) -> dd.DataFrame | pd.DataFrame:
  """Synchronous data loading method. Must be implemented by sync backends."""
@@ -47,7 +48,8 @@ class SqlAlchemyBackend(BaseBackend):
  plugin_sqlalchemy=self.helper.backend_db_connection,
  plugin_query=self.helper._backend_query,
  plugin_params=self.helper._backend_params,
- logger=self.logger
+ logger=self.logger,
+ debug= self.debug
  )
  return db_loader.build_and_load()
  except Exception as e:
@@ -62,10 +64,10 @@ class ParquetBackend(BaseBackend):
  try:
  df = self.helper.backend_parquet.load_files()
  if options and df is not None:
- df = FilterHandler('dask', self.logger).apply_filters(df, filters=options)
+ df = FilterHandler('dask', logger=self.logger, debug=False).apply_filters(df, filters=options)
  return df
  except Exception as e:
- self.logger.error(f"Failed to load data from parquet: {e}", exc_info=self.debug)
+ self.logger.error(f"Failed to load data from parquet: {e}", exc_info=True)
  return dd.from_pandas(pd.DataFrame(), npartitions=1)


sibi_dst-2025.1.2/sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py
@@ -0,0 +1,329 @@
+
+ from typing import Type
+
+ import dask
+ import dask.dataframe as dd
+ import pandas as pd
+ from sqlalchemy import (
+ inspect,
+ select
+ )
+ from sqlalchemy.engine import Engine
+ from sqlalchemy.orm import declarative_base
+ import time
+ from sqlalchemy.exc import TimeoutError as SASQLTimeoutError, OperationalError
+ import sqlalchemy as sa
+ from sibi_dst.df_helper.core import FilterHandler
+ from sibi_dst.utils import Logger
+
+
+ class SQLAlchemyDask:
+ """
+ Loads data from a database into a Dask DataFrame using a memory-safe,
+ non-parallel, paginated approach.
+
+ This class avoids using a numeric `index_col for parallel loading.
+ """
+
+ _SQLALCHEMY_TO_DASK_DTYPE = {
+ "INTEGER": "Int64",
+ "SMALLINT": "Int64",
+ "BIGINT": "Int64",
+ "FLOAT": "float64",
+ "NUMERIC": "float64",
+ "BOOLEAN": "bool",
+ "VARCHAR": "object",
+ "TEXT": "object",
+ "DATE": "datetime64[ns]",
+ "DATETIME": "datetime64[ns]",
+ "TIME": "object",
+ "UUID": "object",
+ }
+
+ def __init__(
+ self,
+ model: Type[declarative_base()],
+ filters: dict,
+ engine: Engine,
+ chunk_size: int = 1000,
+ logger=None,
+ debug: bool = False,
+ ):
+ """
+ Initializes the data loader.
+
+ Args:
+ model: The SQLAlchemy ORM model for the table.
+ filters: A dictionary of filters to apply to the query.
+ engine: An SQLAlchemy Engine instance.
+ chunk_size: The number of records to fetch in each database query.
+ logger: A logger instance.
+ debug: Whether to enable detailed logging.
+ """
+ self.model = model
+ self.filters = filters
+ self.engine = engine
+ self.chunk_size = chunk_size
+ self.debug = debug
+ self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
+ self.logger.set_level(Logger.DEBUG if debug else Logger.INFO)
+ self.filter_handler_cls = FilterHandler
+
+ @classmethod
+ def infer_meta_from_model(cls, model: Type[declarative_base()]) -> dict:
+ """
+ Infers a metadata dictionary for Dask based on the SQLAlchemy model.
+ This helps Dask understand the DataFrame structure without reading data.
+ """
+ mapper = inspect(model)
+ dtypes = {}
+ for column in mapper.columns:
+ dtype_str = str(column.type).upper().split("(")[0]
+ dtype = cls._SQLALCHEMY_TO_DASK_DTYPE.get(dtype_str, "object")
+ dtypes[column.name] = dtype
+ return dtypes
+
+ def read_frame(self, fillna_value=None) -> dd.DataFrame:
+ """
+ Builds and executes a query to load data into a Dask DataFrame.
+
+ This method works by first running a COUNT query to get the total
+ size, then creating a series of delayed tasks that each fetch a
+ chunk of data using LIMIT/OFFSET.
+
+ Args:
+ fillna_value: Value to replace NaN or NULL values with, if any.
+
+ Returns:
+ A lazy Dask DataFrame.
+ """
+ # 1. Build the base query and apply filters
+ query = select(self.model)
+ if self.filters:
+ query = self.filter_handler_cls(
+ backend="sqlalchemy", logger=self.logger, debug=self.debug
+ ).apply_filters(query, model=self.model, filters=self.filters)
+ else:
+ query = query.limit(self.chunk_size)
+ self.logger.debug(f"Base query for pagination: {query}")
+
+ # 2. Get metadata for the Dask DataFrame structure
+ ordered_columns = [column.name for column in self.model.__table__.columns]
+ meta_dtypes = self.infer_meta_from_model(self.model)
+ meta_df = pd.DataFrame(columns=ordered_columns).astype(meta_dtypes)
+
+ # 3. Get the total record count to calculate the number of chunks
+
+ retry_attempts = 3
+ backoff_factor = 0.5 # start with a 0.5-second delay
+
+ for attempt in range(retry_attempts):
+ try:
+ with self.engine.connect() as connection:
+ count_query = sa.select(sa.func.count()).select_from(query.alias())
+ total_records = connection.execute(count_query).scalar_one()
+
+ # If successful, break the loop
+ break
+
+ except SASQLTimeoutError:
+ if attempt < retry_attempts - 1:
+ self.logger.warning(
+ f"Connection pool limit reached. Retrying in {backoff_factor} seconds..."
+ )
+ time.sleep(backoff_factor)
+ backoff_factor *= 2 # Double the backoff time for the next attempt
+ else:
+ self.logger.error(
+ "Failed to get a connection from the pool after several retries.",
+ exc_info=True
+ )
+ return dd.from_pandas(meta_df, npartitions=1)
+ except OperationalError as oe:
+ # sometimes the DB driver wraps timeouts in OperationalError
+ if "timeout" in str(oe).lower():
+ self.logger.warning("OperationalTimeout, retrying…", exc_info=True)
+ time.sleep(backoff_factor)
+ backoff_factor *= 2
+ continue
+ else:
+ self.logger.error("OperationalError", exc_info=True)
+ return dd.from_pandas(meta_df, npartitions=1)
+ except Exception as e:
+ self.logger.error(f"An unexpected error occurred: {e}", exc_info=True)
+ return dd.from_pandas(meta_df, npartitions=1)
+
+ if total_records == 0:
+ self.logger.warning("Query returned 0 records.")
+ return dd.from_pandas(meta_df, npartitions=1)
+
+ self.logger.debug(f"Total records to fetch: {total_records}. Chunk size: {self.chunk_size}.")
+
+ # 4. Create a list of Dask Delayed objects, one for each chunk
+ @dask.delayed
+ def get_chunk(sql_query, chunk_offset):
+ """A Dask-delayed function to fetch one chunk of data."""
+ # LIMIT/OFFSET must be applied in the delayed function
+ paginated_query = sql_query.limit(self.chunk_size).offset(chunk_offset)
+ df = pd.read_sql(paginated_query, self.engine)
+
+ if fillna_value is not None:
+ df = df.fillna(fillna_value)
+
+ # Ensure column order and types match the meta
+ return df[ordered_columns].astype(meta_dtypes)
+
+ offsets = range(0, total_records, self.chunk_size)
+ delayed_chunks = [get_chunk(query, offset) for offset in offsets]
+
+ # 5. Construct the final lazy Dask DataFrame from the delayed chunks
+ ddf = dd.from_delayed(delayed_chunks, meta=meta_df)
+ self.logger.debug(f"Successfully created a lazy Dask DataFrame with {ddf.npartitions} partitions.")
+
+ return ddf
+
+ ## Dask-Only Solution to test in better hardware
+
+ # from typing import Type, Dict, Any
+ # import math
+ # import time
+ # import pandas as pd
+ # import dask
+ # import dask.dataframe as dd
+ #
+ # import sqlalchemy as sa
+ # from sqlalchemy import select, func
+ # from sqlalchemy.engine import Engine
+ # from sqlalchemy.exc import TimeoutError as SASQLTimeoutError, OperationalError
+ # from sqlalchemy.orm import declarative_base
+ #
+ # from sibi_dst.df_helper.core import FilterHandler
+ # from sibi_dst.utils import Logger
+ #
+ #
+ # class SQLAlchemyDask:
+ # """
+ # Loads data into a Dask DataFrame. If there’s exactly one integer PK,
+ # use dask.dataframe.read_sql_table; otherwise fall back to offset‐based
+ # pagination pushed into dask.delayed to keep memory use minimal.
+ # """
+ #
+ # def __init__(
+ # self,
+ # model: Type[declarative_base()],
+ # filters: Dict[str, Any],
+ # engine: Engine,
+ # chunk_size: int = 1_000,
+ # logger=None,
+ # debug: bool = False,
+ # ):
+ # self.model = model
+ # self.filters = filters or {}
+ # self.engine = engine
+ # self.chunk_size = chunk_size
+ # self.logger = logger or Logger.default_logger(self.__class__.__name__)
+ # self.logger.set_level(Logger.DEBUG if debug else Logger.INFO)
+ # self.filter_handler_cls = FilterHandler
+ # self.debug = debug
+ #
+ # def read_frame(self, fillna_value=None) -> dd.DataFrame:
+ # # 1) Build base query + filters
+ # base_q = select(self.model)
+ # if self.filters:
+ # base_q = self.filter_handler_cls(
+ # backend="sqlalchemy",
+ # logger=self.logger,
+ # debug=self.debug,
+ # ).apply_filters(base_q, model=self.model, filters=self.filters)
+ #
+ # # 2) Zero-row meta for dtype inference
+ # meta = pd.read_sql_query(base_q.limit(0), self.engine).iloc[:0]
+ # if meta.shape[1] == 0:
+ # self.logger.warning("No columns detected; returning empty DataFrame.")
+ # return dd.from_pandas(meta, npartitions=1)
+ #
+ # # 3) Single‐PK parallel path?
+ # pk_cols = list(self.model.__table__.primary_key.columns)
+ # if (
+ # len(pk_cols) == 1
+ # and pd.api.types.is_integer_dtype(meta[pk_cols[0].name])
+ # ):
+ # try:
+ # return self._ddf_via_read_sql_table(pk_cols[0], meta, fillna_value)
+ # except Exception:
+ # self.logger.warning(
+ # "read_sql_table path failed, falling back to offset pagination",
+ # exc_info=True,
+ # )
+ #
+ # # 4) Composite PK or fallback → offset pagination in delayed tasks
+ # return self._offset_paginated_ddf(base_q, meta, fillna_value)
+ #
+ # def _offset_paginated_ddf(self, base_q, meta, fillna):
+ # # 1) count total rows
+ # try:
+ # with self.engine.connect() as conn:
+ # total = conn.execute(
+ # select(func.count()).select_from(base_q.alias())
+ # ).scalar_one()
+ # except Exception:
+ # self.logger.error("Failed to count records; returning empty DataFrame", exc_info=True)
+ # return dd.from_pandas(meta, npartitions=1)
+ #
+ # if total == 0:
+ # self.logger.warning("Query returned 0 records.")
+ # return dd.from_pandas(meta, npartitions=1)
+ # self.logger.debug(f"Total records to fetch: {total}. Chunk size: {self.chunk_size}.")
+ # # 2) create delayed tasks per offset
+ # @dask.delayed
+ # def _fetch_chunk(offset: int) -> pd.DataFrame:
+ # q = base_q.limit(self.chunk_size).offset(offset)
+ # df = pd.read_sql_query(q, self.engine)
+ # if fillna is not None:
+ # df = df.fillna(fillna)
+ # return df[meta.columns].astype(meta.dtypes.to_dict())
+ #
+ # offsets = range(0, total, self.chunk_size)
+ # parts = [_fetch_chunk(off) for off in offsets]
+ #
+ # ddf = dd.from_delayed(parts, meta=meta)
+ # self.logger.debug(f"Offset‐paginated read → {len(parts)} partitions")
+ # return ddf
+ #
+ # def _ddf_via_read_sql_table(self, pk_col, meta, fillna) -> dd.DataFrame:
+ # # same as before: min/max + dd.read_sql_table
+ # backoff = 0.5
+ # for attempt in range(3):
+ # try:
+ # with self.engine.connect() as conn:
+ # min_id, max_id = conn.execute(
+ # select(func.min(pk_col), func.max(pk_col))
+ # .select_from(self.model.__table__)
+ # ).one()
+ # break
+ # except (SASQLTimeoutError, OperationalError) as e:
+ # if "timeout" in str(e).lower() and attempt < 2:
+ # self.logger.warning(f"Timeout fetching PK bounds; retrying in {backoff}s")
+ # time.sleep(backoff)
+ # backoff *= 2
+ # else:
+ # raise
+ #
+ # if min_id is None or max_id is None:
+ # self.logger.warning("Table empty—no PK bounds.")
+ # return dd.from_pandas(meta, npartitions=1)
+ #
+ # total = max_id - min_id + 1
+ # nparts = max(1, math.ceil(total / self.chunk_size))
+ # ddf = dd.read_sql_table(
+ # table=self.model.__table__.name,
+ # uri=str(self.engine.url),
+ # index_col=pk_col.name,
+ # limits=(min_id, max_id),
+ # npartitions=nparts,
+ # columns=list(meta.columns),
+ # )
+ # if fillna is not None:
+ # ddf = ddf.fillna(fillna)
+ # self.logger.debug(f"Parallel read via dask.read_sql_table → {nparts} partitions")
+ # return ddf
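
For orientation, here is a minimal usage sketch of the new paginated loader. The ORM model, engine URL, and filter key below are illustrative assumptions for this example only (they are not taken from the package), and the import path simply mirrors the module location shown above; the filter dictionary is interpreted by FilterHandler.

```python
from sqlalchemy import Column, Integer, String, create_engine
from sqlalchemy.orm import declarative_base

from sibi_dst.df_helper.backends.sqlalchemy._io_dask import SQLAlchemyDask

Base = declarative_base()


class Customer(Base):
    """Illustrative ORM model; any declarative model works."""
    __tablename__ = "customer"
    id = Column(Integer, primary_key=True)
    name = Column(String(100))
    status = Column(String(20))


# Any SQLAlchemy engine works; SQLite keeps the example self-contained.
engine = create_engine("sqlite:///example.db")
Base.metadata.create_all(engine)

loader = SQLAlchemyDask(
    model=Customer,
    filters={"status": "active"},  # illustrative filter spec, applied via FilterHandler
    engine=engine,
    chunk_size=5_000,              # rows fetched per LIMIT/OFFSET page
    debug=True,
)

ddf = loader.read_frame(fillna_value="")  # lazy Dask DataFrame, one partition per chunk
print(ddf.npartitions)
df = ddf.compute()                        # materialize only when needed
```

Because each partition issues its own LIMIT/OFFSET query, `chunk_size` trades the number of database round trips against per-partition memory.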
{sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/df_helper/core/__init__.py
@@ -1,8 +1,6 @@
  from __future__ import annotations

  from ._defaults import (
- django_field_conversion_map_pandas,
- django_field_conversion_map_dask,
  sqlalchemy_field_conversion_map_dask,
  normalize_sqlalchemy_type)
  from ._filter_handler import FilterHandler
@@ -12,8 +10,6 @@ from ._query_config import QueryConfig
  __all__ = [
  "ParamsConfig",
  "QueryConfig",
- "django_field_conversion_map_pandas",
- "django_field_conversion_map_dask",
  "sqlalchemy_field_conversion_map_dask",
  "normalize_sqlalchemy_type",
  "FilterHandler",
{sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/df_helper/core/_defaults.py
@@ -13,56 +13,7 @@ from sqlalchemy.dialects.mysql import TINYINT, MEDIUMTEXT
  # conversion_map is a dictionary that maps the field types to their corresponding data type conversion functions.
  # Each entry in the dictionary is a pair of a field type (as a string) and a callable function that performs the
  # conversion. This mapping is used to convert the values in a pandas DataFrame to the appropriate data types based on
- # the Django field type.
-
- django_field_conversion_map_pandas: Dict[str, callable] = {
- "CharField": lambda x: x.astype(str),
- "TextField": lambda x: x.astype(str),
- "IntegerField": lambda x: pd.to_numeric(x, errors="coerce"),
- "AutoField": lambda x: pd.to_numeric(x, errors="coerce"),
- "BigAutoField": lambda x: pd.to_numeric(x, errors="coerce"),
- "BigIntegerField": lambda x: pd.to_numeric(x, errors="coerce"),
- "SmallIntegerField": lambda x: pd.to_numeric(x, errors="coerce"),
- "PositiveIntegerField": lambda x: pd.to_numeric(x, errors="coerce"),
- "PositiveSmallIntegerField": lambda x: pd.to_numeric(x, errors="coerce"),
- "FloatField": lambda x: pd.to_numeric(x, errors="coerce"),
- "DecimalField": lambda x: pd.to_numeric(x, errors="coerce"),
- "BooleanField": lambda x: x.astype(bool),
- "NullBooleanField": lambda x: x.astype(bool),
- "DateTimeField": lambda x: pd.to_datetime(x, errors="coerce"),
- "DateField": lambda x: pd.to_datetime(x, errors="coerce").dt.date,
- "TimeField": lambda x: pd.to_datetime(x, errors="coerce").dt.time,
- "DurationField": lambda x: pd.to_timedelta(x, errors="coerce"),
- # for JSONField, assuming JSON objects are represented as string in df
- "JSONField": lambda x: x.apply(json.loads),
- "ArrayField": lambda x: x.apply(eval),
- "UUIDField": lambda x: x.astype(str),
- }
-
- django_field_conversion_map_dask: Dict[str, callable] = {
- "CharField": lambda x: x.astype(str),
- "TextField": lambda x: x.astype(str),
- "IntegerField": lambda x: pd.to_numeric(x, errors="coerce"),
- "AutoField": lambda x: pd.to_numeric(x, errors="coerce"),
- "BigAutoField": lambda x: pd.to_numeric(x, errors="coerce"),
- "BigIntegerField": lambda x: pd.to_numeric(x, errors="coerce"),
- "SmallIntegerField": lambda x: pd.to_numeric(x, errors="coerce"),
- "PositiveIntegerField": lambda x: pd.to_numeric(x, errors="coerce"),
- "PositiveSmallIntegerField": lambda x: pd.to_numeric(x, errors="coerce"),
- "FloatField": lambda x: pd.to_numeric(x, errors="coerce"),
- "DecimalField": lambda x: pd.to_numeric(x, errors="coerce"),
- "BooleanField": lambda x: x.astype(bool),
- "NullBooleanField": lambda x: x.astype(bool),
- "DateTimeField": lambda x: pd.to_datetime(x, errors="coerce"),
- "DateField": lambda x: pd.to_datetime(x, errors="coerce").map_partitions(lambda x: x.dt.date,
- meta=("date", "object")),
- "TimeField": lambda x: pd.to_datetime(x, errors="coerce").map_partitions(lambda x: x.dt.time,
- meta=("time", "object")),
- "DurationField": lambda x: pd.to_timedelta(x, errors="coerce"),
- "JSONField": lambda x: x.map_partitions(lambda s: s.apply(json.loads), meta=("json", "object")),
- "ArrayField": lambda x: x.map_partitions(lambda s: s.apply(eval), meta=("array", "object")),
- "UUIDField": lambda x: x.astype(str),
- }
+ # the db field type.

  sqlalchemy_field_conversion_map_dask: Dict[str, callable] = {
  String.__name__: lambda x: x.astype(str).fillna(""),
{sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/utils/__init__.py
@@ -10,7 +10,6 @@ from .df_utils import DfUtils
  from .storage_manager import StorageManager
  from .parquet_saver import ParquetSaver
  from .clickhouse_writer import ClickHouseWriter
- from .airflow_manager import AirflowDAGManager
  from .credentials import *
  from .update_planner import UpdatePlanner
  from .data_wrapper import DataWrapper
@@ -35,7 +34,6 @@ __all__ = [
  "StorageManager",
  "DfUtils",
  "ClickHouseWriter",
- "AirflowDAGManager",
  "StorageConfig",
  "FsRegistry",
  "DataFromHttpSource",
{sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/utils/data_wrapper.py
@@ -38,7 +38,7 @@ class DataWrapper:
  logger: Logger = None,
  show_progress: bool = False,
  timeout: float = 30,
- max_threads: int = 1,
+ max_threads: int = 3,
  **kwargs: Any,
  ):
  self.dataclass = dataclass
@@ -66,6 +66,7 @@ class DataWrapper:
  self.benchmarks: Dict[datetime.date, Dict[str, float]] = {}
  self.mmanifest = kwargs.get("mmanifest", None)
  self.update_planner=kwargs.get("update_planner", None)
+ self.datacls = self.dataclass(**self.class_params)

  def __enter__(self):
  """Context manager entry"""
@@ -164,28 +165,24 @@ class DataWrapper:
  def _process_single_date(self, date: datetime.date):
  """Core date processing logic with load/save timing and thread reporting"""
  path = f"{self.data_path}{date.year}/{date.month:02d}/{date.day:02d}/"
- self.logger.info(f"Processing date {date.isoformat()} for {path}")
- # self.logger.info(f"Path {path} in {self.skipped}: {path in self.skipped}")
+ self.logger.debug(f"Processing date {date.isoformat()} for {path}")
  if path in self.update_planner.skipped and self.update_planner.ignore_missing:
  self.logger.info(f"Skipping {date} as it exists in the skipped list")
  return
  full_path = f"{path}{self.parquet_filename}"

  thread_name = threading.current_thread().name
- self.logger.info(f"[{thread_name}] Executing date: {date} -> saving to: {full_path}")
+ self.logger.debug(f"[{thread_name}] Executing date: {date} -> saving to: {full_path}")

  overall_start = time.perf_counter()
  try:
  load_start = time.perf_counter()
- with self.dataclass(**self.class_params) as data:
- df = data.load_period(
- dt_field=self.date_field,
- start=date,
- end=date,
- **self.load_params
- )
+ date_filter = {f"{self.date_field}__date": {date.isoformat()}}
+ self.logger.debug(f"Loading data for {date} with filter: {date_filter}")
+ # Load data using the dataclass with the provided date filter
+ self.load_params.update(date_filter)
+ df = self.datacls.load(**self.load_params)
  load_time = time.perf_counter() - load_start
-
  if df.head(1, compute=True).empty:
  if self.mmanifest:
  schema = df._meta.dtypes.astype(str).to_dict()
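
The `_process_single_date` rewrite above drops the per-date context manager and `load_period` call in favor of a single long-lived artifact instance (`self.datacls`) plus a Django-style `__date` lookup merged into `load_params`. The sketch below illustrates that flow with a stand-in artifact class; the field name and parameters are assumptions, and note that the diff itself wraps `date.isoformat()` in a set literal while a plain string is used here.

```python
import datetime


class DemoArtifact:
    """Stand-in for the dataclass DataWrapper manages; not part of sibi_dst."""

    def __init__(self, **class_params):
        self.class_params = class_params

    def load(self, **filters):
        # The real artifact would translate lookups such as
        # {"created_at__date": "2025-01-02"} into a backend query.
        print(f"loading with filters: {filters}")
        return []


date_field = "created_at"            # assumed date field name
date = datetime.date(2025, 1, 2)

# Old path (removed): a fresh instance per date, loading a one-day period
#   with dataclass(**class_params) as data:
#       df = data.load_period(dt_field=date_field, start=date, end=date)

# New path: one persistent instance plus a per-date filter
artifact = DemoArtifact(connection="example")     # built once, as self.datacls is
load_params = {"fields": ["id", "created_at"]}    # whatever load_params already held
load_params.update({f"{date_field}__date": date.isoformat()})
artifact.load(**load_params)
```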
{sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/utils/update_planner.py
@@ -73,6 +73,8 @@ class UpdatePlanner:
  self.show_progress = show_progress
  self.logger = logger or Logger.default_logger(logger_name="update_planner")
  self.logger.set_level(Logger.DEBUG if debug else Logger.INFO)
+ self.debug = debug
+ self.verbose = verbose

  # Filesystem and age helper
  self.fs = fs or fsspec.filesystem(filesystem_type, **(filesystem_options or {}))