sibi-dst 0.3.64__tar.gz → 2025.1.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sibi_dst-2025.1.2/PKG-INFO +55 -0
- {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/README.md +3 -9
- sibi_dst-2025.1.2/pyproject.toml +49 -0
- {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/df_helper/_df_helper.py +5 -3
- sibi_dst-2025.1.2/sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +329 -0
- {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/df_helper/core/__init__.py +0 -4
- {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/df_helper/core/_defaults.py +1 -50
- {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/utils/__init__.py +0 -2
- {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/utils/data_wrapper.py +9 -12
- {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/utils/update_planner.py +2 -0
- sibi_dst-0.3.64/PKG-INFO +0 -90
- sibi_dst-0.3.64/pyproject.toml +0 -55
- sibi_dst-0.3.64/sibi_dst/df_helper/backends/django/__init__.py +0 -11
- sibi_dst-0.3.64/sibi_dst/df_helper/backends/django/_db_connection.py +0 -88
- sibi_dst-0.3.64/sibi_dst/df_helper/backends/django/_io_dask.py +0 -450
- sibi_dst-0.3.64/sibi_dst/df_helper/backends/django/_load_from_db.py +0 -227
- sibi_dst-0.3.64/sibi_dst/df_helper/backends/django/_sql_model_builder.py +0 -493
- sibi_dst-0.3.64/sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +0 -179
- sibi_dst-0.3.64/sibi_dst/utils/airflow_manager.py +0 -212
- {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/__init__.py +0 -0
- {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/df_helper/__init__.py +0 -0
- {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/df_helper/_artifact_updater_multi_wrapper.py +0 -0
- {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/df_helper/_parquet_artifact.py +0 -0
- {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/df_helper/_parquet_reader.py +0 -0
- {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/df_helper/backends/__init__.py +0 -0
- {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/df_helper/backends/http/__init__.py +0 -0
- {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/df_helper/backends/http/_http_config.py +0 -0
- {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/df_helper/backends/parquet/__init__.py +0 -0
- {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/df_helper/backends/parquet/_filter_handler.py +0 -0
- {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/df_helper/backends/parquet/_parquet_options.py +0 -0
- {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/df_helper/backends/sqlalchemy/__init__.py +0 -0
- {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +0 -0
- {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +0 -0
- {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +0 -0
- {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/df_helper/core/_filter_handler.py +0 -0
- {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/df_helper/core/_params_config.py +0 -0
- {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/df_helper/core/_query_config.py +0 -0
- {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/df_helper/data_cleaner.py +0 -0
- {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/geopy_helper/__init__.py +0 -0
- {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/geopy_helper/geo_location_service.py +0 -0
- {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/geopy_helper/utils.py +0 -0
- {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/osmnx_helper/__init__.py +0 -0
- {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/osmnx_helper/base_osm_map.py +0 -0
- {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/osmnx_helper/basemaps/__init__.py +0 -0
- {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/osmnx_helper/basemaps/calendar_html.py +0 -0
- {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/osmnx_helper/basemaps/router_plotter.py +0 -0
- {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/osmnx_helper/utils.py +0 -0
- {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/tests/__init__.py +0 -0
- {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/tests/test_data_wrapper_class.py +0 -0
- {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/utils/clickhouse_writer.py +0 -0
- {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/utils/credentials.py +0 -0
- {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/utils/data_from_http_source.py +0 -0
- {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/utils/data_utils.py +0 -0
- {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/utils/date_utils.py +0 -0
- {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/utils/df_utils.py +0 -0
- {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/utils/file_utils.py +0 -0
- {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/utils/filepath_generator.py +0 -0
- {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/utils/log_utils.py +0 -0
- {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/utils/manifest_manager.py +0 -0
- {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/utils/parquet_saver.py +0 -0
- {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/utils/phone_formatter.py +0 -0
- {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/utils/storage_config.py +0 -0
- {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/utils/storage_manager.py +0 -0
- {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/utils/webdav_client.py +0 -0
- {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/v2/__init__.py +0 -0
- {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/v2/df_helper/__init__.py +0 -0
- {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/v2/df_helper/_df_helper.py +0 -0
- {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/v2/df_helper/backends/__init__.py +0 -0
- {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/v2/df_helper/backends/sqlalchemy/__init__.py +0 -0
- {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/v2/df_helper/backends/sqlalchemy/_db_connection.py +0 -0
- {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/v2/df_helper/backends/sqlalchemy/_io_dask.py +0 -0
- {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/v2/df_helper/backends/sqlalchemy/_load_from_db.py +0 -0
- {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/v2/df_helper/backends/sqlalchemy/_model_builder.py +0 -0
- {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/v2/df_helper/backends/sqlmodel/__init__.py +0 -0
- {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/v2/df_helper/backends/sqlmodel/_db_connection.py +0 -0
- {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/v2/df_helper/backends/sqlmodel/_io_dask.py +0 -0
- {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/v2/df_helper/backends/sqlmodel/_load_from_db.py +0 -0
- {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/v2/df_helper/backends/sqlmodel/_model_builder.py +0 -0
- {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/v2/df_helper/core/__init__.py +0 -0
- {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/v2/df_helper/core/_filter_handler.py +0 -0
- {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/v2/df_helper/core/_params_config.py +0 -0
- {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/v2/df_helper/core/_query_config.py +0 -0
- {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/v2/utils/__init__.py +0 -0
- {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/v2/utils/log_utils.py +0 -0
- {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/v3/__init__.py +0 -0
- {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/v3/backends/__init__.py +0 -0
- {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/v3/df_helper/__init__.py +0 -0
- {sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/v3/df_helper/_df_helper.py +0 -0
sibi_dst-2025.1.2/PKG-INFO
@@ -0,0 +1,55 @@
+Metadata-Version: 2.1
+Name: sibi-dst
+Version: 2025.1.2
+Summary: Data Science Toolkit
+Author: Luis Valverde
+Author-email: lvalverdeb@gmail.com
+Requires-Python: >=3.12,<4.0
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
+Requires-Dist: clickhouse-connect (>=0.8.18,<0.9.0)
+Requires-Dist: clickhouse-driver (>=0.2.9,<0.3.0)
+Requires-Dist: dask[complete] (>=2025.5.1,<2026.0.0)
+Requires-Dist: mysqlclient (>=2.2.7,<3.0.0)
+Requires-Dist: pandas (>=2.3.1,<3.0.0)
+Requires-Dist: psycopg2 (>=2.9.10,<3.0.0)
+Requires-Dist: pydantic (>=2.11.7,<3.0.0)
+Requires-Dist: pymysql (>=1.1.1,<2.0.0)
+Requires-Dist: s3fs (>=2025.5.1,<2026.0.0)
+Requires-Dist: sqlalchemy (>=2.0.41,<3.0.0)
+Requires-Dist: tqdm (>=4.67.1,<5.0.0)
+Requires-Dist: webdav4 (>=0.10.0,<0.11.0)
+Description-Content-Type: text/markdown
+
+### SIBI-DST
+
+Data Science Toolkit built with Python, Pandas, Dask, OpenStreetMaps, NetworkX, SQLAlchemy, GeoPandas, and Folium.
+
+## Example Use Cases
+
+1. **Build DataCubes, DataSets, and DataObjects** from diverse data sources, including **relational databases, Parquet files, Excel (`.xlsx`), delimited tables (`.csv`, `.tsv`), JSON, and RESTful APIs**.
+2. **Comprehensive DataFrame Management** utilities for efficient data handling, transformation, and optimization using **Pandas** and **Dask**.
+3. **Flexible Data Sharing** with client applications by writing to **Data Warehouses in Clickhouse, local filesystems, and cloud storage platforms** such as **S3**.
+4. **Microservices for Data Access** – Build scalable **API-driven services** using **RESTful APIs (`Django REST Framework`, `FastAPI`)** for high-performance data exchange.
+5. **Geospatial Analysis** – Utilize **OpenStreetMaps** and **GeoPandas** for advanced geospatial data processing and visualization.
+
+## Supported Technologies
+
+- **Data Processing**: Pandas, Dask
+- **Databases & Storage**: SQLAlchemy, Parquet, S3, Clickhouse
+- **Mapping & Geospatial Analysis**: OpenStreetMaps, OSMnx, Geopy
+- **API Development**: Django REST Framework, FastAPI
+
+## Installation
+
+```bash
+# with pip
+
+pip install sibi-dst # Install only the main package
+pip install sibi-dst[geospatial] # Install with geospatial dependencies
+pip install sibi-dst[dev,test,geospatial] # Install all optional dependencies
+
+
+```
+
{sibi_dst-0.3.64 → sibi_dst-2025.1.2}/README.md
@@ -22,15 +22,9 @@ Data Science Toolkit built with Python, Pandas, Dask, OpenStreetMaps, NetworkX,
 ```bash
 # with pip

-pip install sibi-dst
-pip install sibi-dst[
-pip install sibi-dst[geospatial] # Install
-
-# with poetry
-
-poetry add "sibi-dst[complete]" # Install all dependencies
-poetry add "sibi-dst[df_helper]" # Install only df_helper dependencies
-poetry add "sibi-dst[geospatial]" # Install only geospatial dependencies
+pip install sibi-dst # Install only the main package
+pip install sibi-dst[geospatial] # Install with geospatial dependencies
+pip install sibi-dst[dev,test,geospatial] # Install all optional dependencies


 ```
sibi_dst-2025.1.2/pyproject.toml
@@ -0,0 +1,49 @@
+[tool.poetry]
+name = "sibi-dst"
+version = "2025.1.2"
+description = "Data Science Toolkit"
+authors = ["Luis Valverde <lvalverdeb@gmail.com>"]
+readme = "README.md"
+packages = [{ include = "sibi_dst" }]
+
+[tool.poetry.dependencies]
+python = ">=3.12,<4.0"
+pandas = "^2.3.1"
+dask = {extras = ["complete"], version = "^2025.5.1"}
+psycopg2 = "^2.9.10"
+mysqlclient = "^2.2.7"
+webdav4 = "^0.10.0"
+clickhouse-connect = "^0.8.18"
+clickhouse-driver = "^0.2.9"
+tqdm = "^4.67.1"
+s3fs = "^2025.5.1"
+pydantic = "^2.11.7"
+sqlalchemy = "^2.0.41"
+pymysql = "^1.1.1"
+
+[tool.poetry.group.dev]
+optional = true
+
+[tool.poetry.group.dev.dependencies]
+jupyter = "^1.1.1"
+python-dotenv = "^1.1.1"
+black = "^25.1.0"
+
+[tool.poetry.group.test]
+optional = true
+
+[tool.poetry.group.test.dependencies]
+pytest = "^8.4.1"
+pytest-cov = "^6.2.1"
+
+[tool.poetry.group.geospatial]
+optional = true
+
+[tool.poetry.group.geospatial.dependencies]
+osmnx = "^2.0.5"
+geopy = "^2.4.1"
+folium = "^0.20.0"
+
+[build-system]
+requires = ["poetry-core"]
+build-backend = "poetry.core.masonry.api"
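The `dev`, `test`, and `geospatial` sections above are declared as optional Poetry dependency groups. As a minimal sketch of how such groups are usually enabled from a checkout of the project (assuming Poetry 1.2 or newer; the group names come from the pyproject.toml shown above):

```bash
# base dependencies only
poetry install

# include the optional groups declared in pyproject.toml
poetry install --with dev,test,geospatial
```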
{sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/df_helper/_df_helper.py
@@ -26,6 +26,7 @@ class BaseBackend:
     def __init__(self, helper: DfHelper):
         self.helper = helper
         self.logger = helper.logger
+        self.debug = helper.debug

     def load(self, **options) -> dd.DataFrame | pd.DataFrame:
         """Synchronous data loading method. Must be implemented by sync backends."""
@@ -47,7 +48,8 @@ class SqlAlchemyBackend(BaseBackend):
                 plugin_sqlalchemy=self.helper.backend_db_connection,
                 plugin_query=self.helper._backend_query,
                 plugin_params=self.helper._backend_params,
-                logger=self.logger
+                logger=self.logger,
+                debug=self.debug
             )
             return db_loader.build_and_load()
         except Exception as e:
@@ -62,10 +64,10 @@ class ParquetBackend(BaseBackend):
         try:
             df = self.helper.backend_parquet.load_files()
             if options and df is not None:
-                df = FilterHandler('dask', self.logger).apply_filters(df, filters=options)
+                df = FilterHandler('dask', logger=self.logger, debug=False).apply_filters(df, filters=options)
             return df
         except Exception as e:
-            self.logger.error(f"Failed to load data from parquet: {e}", exc_info=
+            self.logger.error(f"Failed to load data from parquet: {e}", exc_info=True)
             return dd.from_pandas(pd.DataFrame(), npartitions=1)


sibi_dst-2025.1.2/sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py
@@ -0,0 +1,329 @@
+
+from typing import Type
+
+import dask
+import dask.dataframe as dd
+import pandas as pd
+from sqlalchemy import (
+    inspect,
+    select
+)
+from sqlalchemy.engine import Engine
+from sqlalchemy.orm import declarative_base
+import time
+from sqlalchemy.exc import TimeoutError as SASQLTimeoutError, OperationalError
+import sqlalchemy as sa
+from sibi_dst.df_helper.core import FilterHandler
+from sibi_dst.utils import Logger
+
+
+class SQLAlchemyDask:
+    """
+    Loads data from a database into a Dask DataFrame using a memory-safe,
+    non-parallel, paginated approach.
+
+    This class avoids using a numeric `index_col for parallel loading.
+    """
+
+    _SQLALCHEMY_TO_DASK_DTYPE = {
+        "INTEGER": "Int64",
+        "SMALLINT": "Int64",
+        "BIGINT": "Int64",
+        "FLOAT": "float64",
+        "NUMERIC": "float64",
+        "BOOLEAN": "bool",
+        "VARCHAR": "object",
+        "TEXT": "object",
+        "DATE": "datetime64[ns]",
+        "DATETIME": "datetime64[ns]",
+        "TIME": "object",
+        "UUID": "object",
+    }
+
+    def __init__(
+        self,
+        model: Type[declarative_base()],
+        filters: dict,
+        engine: Engine,
+        chunk_size: int = 1000,
+        logger=None,
+        debug: bool = False,
+    ):
+        """
+        Initializes the data loader.
+
+        Args:
+            model: The SQLAlchemy ORM model for the table.
+            filters: A dictionary of filters to apply to the query.
+            engine: An SQLAlchemy Engine instance.
+            chunk_size: The number of records to fetch in each database query.
+            logger: A logger instance.
+            debug: Whether to enable detailed logging.
+        """
+        self.model = model
+        self.filters = filters
+        self.engine = engine
+        self.chunk_size = chunk_size
+        self.debug = debug
+        self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
+        self.logger.set_level(Logger.DEBUG if debug else Logger.INFO)
+        self.filter_handler_cls = FilterHandler
+
+    @classmethod
+    def infer_meta_from_model(cls, model: Type[declarative_base()]) -> dict:
+        """
+        Infers a metadata dictionary for Dask based on the SQLAlchemy model.
+        This helps Dask understand the DataFrame structure without reading data.
+        """
+        mapper = inspect(model)
+        dtypes = {}
+        for column in mapper.columns:
+            dtype_str = str(column.type).upper().split("(")[0]
+            dtype = cls._SQLALCHEMY_TO_DASK_DTYPE.get(dtype_str, "object")
+            dtypes[column.name] = dtype
+        return dtypes
+
+    def read_frame(self, fillna_value=None) -> dd.DataFrame:
+        """
+        Builds and executes a query to load data into a Dask DataFrame.
+
+        This method works by first running a COUNT query to get the total
+        size, then creating a series of delayed tasks that each fetch a
+        chunk of data using LIMIT/OFFSET.
+
+        Args:
+            fillna_value: Value to replace NaN or NULL values with, if any.
+
+        Returns:
+            A lazy Dask DataFrame.
+        """
+        # 1. Build the base query and apply filters
+        query = select(self.model)
+        if self.filters:
+            query = self.filter_handler_cls(
+                backend="sqlalchemy", logger=self.logger, debug=self.debug
+            ).apply_filters(query, model=self.model, filters=self.filters)
+        else:
+            query = query.limit(self.chunk_size)
+        self.logger.debug(f"Base query for pagination: {query}")
+
+        # 2. Get metadata for the Dask DataFrame structure
+        ordered_columns = [column.name for column in self.model.__table__.columns]
+        meta_dtypes = self.infer_meta_from_model(self.model)
+        meta_df = pd.DataFrame(columns=ordered_columns).astype(meta_dtypes)
+
+        # 3. Get the total record count to calculate the number of chunks
+
+        retry_attempts = 3
+        backoff_factor = 0.5  # start with a 0.5-second delay
+
+        for attempt in range(retry_attempts):
+            try:
+                with self.engine.connect() as connection:
+                    count_query = sa.select(sa.func.count()).select_from(query.alias())
+                    total_records = connection.execute(count_query).scalar_one()
+
+                # If successful, break the loop
+                break
+
+            except SASQLTimeoutError:
+                if attempt < retry_attempts - 1:
+                    self.logger.warning(
+                        f"Connection pool limit reached. Retrying in {backoff_factor} seconds..."
+                    )
+                    time.sleep(backoff_factor)
+                    backoff_factor *= 2  # Double the backoff time for the next attempt
+                else:
+                    self.logger.error(
+                        "Failed to get a connection from the pool after several retries.",
+                        exc_info=True
+                    )
+                    return dd.from_pandas(meta_df, npartitions=1)
+            except OperationalError as oe:
+                # sometimes the DB driver wraps timeouts in OperationalError
+                if "timeout" in str(oe).lower():
+                    self.logger.warning("OperationalTimeout, retrying…", exc_info=True)
+                    time.sleep(backoff_factor)
+                    backoff_factor *= 2
+                    continue
+                else:
+                    self.logger.error("OperationalError", exc_info=True)
+                    return dd.from_pandas(meta_df, npartitions=1)
+            except Exception as e:
+                self.logger.error(f"An unexpected error occurred: {e}", exc_info=True)
+                return dd.from_pandas(meta_df, npartitions=1)
+
+        if total_records == 0:
+            self.logger.warning("Query returned 0 records.")
+            return dd.from_pandas(meta_df, npartitions=1)
+
+        self.logger.debug(f"Total records to fetch: {total_records}. Chunk size: {self.chunk_size}.")
+
+        # 4. Create a list of Dask Delayed objects, one for each chunk
+        @dask.delayed
+        def get_chunk(sql_query, chunk_offset):
+            """A Dask-delayed function to fetch one chunk of data."""
+            # LIMIT/OFFSET must be applied in the delayed function
+            paginated_query = sql_query.limit(self.chunk_size).offset(chunk_offset)
+            df = pd.read_sql(paginated_query, self.engine)
+
+            if fillna_value is not None:
+                df = df.fillna(fillna_value)
+
+            # Ensure column order and types match the meta
+            return df[ordered_columns].astype(meta_dtypes)
+
+        offsets = range(0, total_records, self.chunk_size)
+        delayed_chunks = [get_chunk(query, offset) for offset in offsets]
+
+        # 5. Construct the final lazy Dask DataFrame from the delayed chunks
+        ddf = dd.from_delayed(delayed_chunks, meta=meta_df)
+        self.logger.debug(f"Successfully created a lazy Dask DataFrame with {ddf.npartitions} partitions.")
+
+        return ddf
+
+## Dask-Only Solution to test in better hardware
+
+# from typing import Type, Dict, Any
+# import math
+# import time
+# import pandas as pd
+# import dask
+# import dask.dataframe as dd
+#
+# import sqlalchemy as sa
+# from sqlalchemy import select, func
+# from sqlalchemy.engine import Engine
+# from sqlalchemy.exc import TimeoutError as SASQLTimeoutError, OperationalError
+# from sqlalchemy.orm import declarative_base
+#
+# from sibi_dst.df_helper.core import FilterHandler
+# from sibi_dst.utils import Logger
+#
+#
+# class SQLAlchemyDask:
+#     """
+#     Loads data into a Dask DataFrame. If there’s exactly one integer PK,
+#     use dask.dataframe.read_sql_table; otherwise fall back to offset‐based
+#     pagination pushed into dask.delayed to keep memory use minimal.
+#     """
+#
+#     def __init__(
+#         self,
+#         model: Type[declarative_base()],
+#         filters: Dict[str, Any],
+#         engine: Engine,
+#         chunk_size: int = 1_000,
+#         logger=None,
+#         debug: bool = False,
+#     ):
+#         self.model = model
+#         self.filters = filters or {}
+#         self.engine = engine
+#         self.chunk_size = chunk_size
+#         self.logger = logger or Logger.default_logger(self.__class__.__name__)
+#         self.logger.set_level(Logger.DEBUG if debug else Logger.INFO)
+#         self.filter_handler_cls = FilterHandler
+#         self.debug = debug
+#
+#     def read_frame(self, fillna_value=None) -> dd.DataFrame:
+#         # 1) Build base query + filters
+#         base_q = select(self.model)
+#         if self.filters:
+#             base_q = self.filter_handler_cls(
+#                 backend="sqlalchemy",
+#                 logger=self.logger,
+#                 debug=self.debug,
+#             ).apply_filters(base_q, model=self.model, filters=self.filters)
+#
+#         # 2) Zero-row meta for dtype inference
+#         meta = pd.read_sql_query(base_q.limit(0), self.engine).iloc[:0]
+#         if meta.shape[1] == 0:
+#             self.logger.warning("No columns detected; returning empty DataFrame.")
+#             return dd.from_pandas(meta, npartitions=1)
+#
+#         # 3) Single‐PK parallel path?
+#         pk_cols = list(self.model.__table__.primary_key.columns)
+#         if (
+#             len(pk_cols) == 1
+#             and pd.api.types.is_integer_dtype(meta[pk_cols[0].name])
+#         ):
+#             try:
+#                 return self._ddf_via_read_sql_table(pk_cols[0], meta, fillna_value)
+#             except Exception:
+#                 self.logger.warning(
+#                     "read_sql_table path failed, falling back to offset pagination",
+#                     exc_info=True,
+#                 )
+#
+#         # 4) Composite PK or fallback → offset pagination in delayed tasks
+#         return self._offset_paginated_ddf(base_q, meta, fillna_value)
+#
+#     def _offset_paginated_ddf(self, base_q, meta, fillna):
+#         # 1) count total rows
+#         try:
+#             with self.engine.connect() as conn:
+#                 total = conn.execute(
+#                     select(func.count()).select_from(base_q.alias())
+#                 ).scalar_one()
+#         except Exception:
+#             self.logger.error("Failed to count records; returning empty DataFrame", exc_info=True)
+#             return dd.from_pandas(meta, npartitions=1)
+#
+#         if total == 0:
+#             self.logger.warning("Query returned 0 records.")
+#             return dd.from_pandas(meta, npartitions=1)
+#         self.logger.debug(f"Total records to fetch: {total}. Chunk size: {self.chunk_size}.")
+#         # 2) create delayed tasks per offset
+#         @dask.delayed
+#         def _fetch_chunk(offset: int) -> pd.DataFrame:
+#             q = base_q.limit(self.chunk_size).offset(offset)
+#             df = pd.read_sql_query(q, self.engine)
+#             if fillna is not None:
+#                 df = df.fillna(fillna)
+#             return df[meta.columns].astype(meta.dtypes.to_dict())
+#
+#         offsets = range(0, total, self.chunk_size)
+#         parts = [_fetch_chunk(off) for off in offsets]
+#
+#         ddf = dd.from_delayed(parts, meta=meta)
+#         self.logger.debug(f"Offset‐paginated read → {len(parts)} partitions")
+#         return ddf
+#
+#     def _ddf_via_read_sql_table(self, pk_col, meta, fillna) -> dd.DataFrame:
+#         # same as before: min/max + dd.read_sql_table
+#         backoff = 0.5
+#         for attempt in range(3):
+#             try:
+#                 with self.engine.connect() as conn:
+#                     min_id, max_id = conn.execute(
+#                         select(func.min(pk_col), func.max(pk_col))
+#                         .select_from(self.model.__table__)
+#                     ).one()
+#                 break
+#             except (SASQLTimeoutError, OperationalError) as e:
+#                 if "timeout" in str(e).lower() and attempt < 2:
+#                     self.logger.warning(f"Timeout fetching PK bounds; retrying in {backoff}s")
+#                     time.sleep(backoff)
+#                     backoff *= 2
+#                 else:
+#                     raise
+#
+#         if min_id is None or max_id is None:
+#             self.logger.warning("Table empty—no PK bounds.")
+#             return dd.from_pandas(meta, npartitions=1)
+#
+#         total = max_id - min_id + 1
+#         nparts = max(1, math.ceil(total / self.chunk_size))
+#         ddf = dd.read_sql_table(
+#             table=self.model.__table__.name,
+#             uri=str(self.engine.url),
+#             index_col=pk_col.name,
+#             limits=(min_id, max_id),
+#             npartitions=nparts,
+#             columns=list(meta.columns),
+#         )
+#         if fillna is not None:
+#             ddf = ddf.fillna(fillna)
+#         self.logger.debug(f"Parallel read via dask.read_sql_table → {nparts} partitions")
+#         return ddf
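For orientation, here is a minimal, hypothetical usage sketch of the new `SQLAlchemyDask` loader. The `Orders` model, the connection URL, and the filter key are illustrative placeholders; only the constructor arguments and `read_frame()` come from the hunk above.

```python
# Hypothetical usage sketch of SQLAlchemyDask; the model, URL and filter are placeholders.
import sqlalchemy as sa
from sqlalchemy.orm import DeclarativeBase

from sibi_dst.df_helper.backends.sqlalchemy._io_dask import SQLAlchemyDask


class Base(DeclarativeBase):
    pass


class Orders(Base):  # illustrative ORM model, not part of sibi-dst
    __tablename__ = "orders"
    id = sa.Column(sa.Integer, primary_key=True)
    status = sa.Column(sa.String(32))
    created_at = sa.Column(sa.DateTime)


engine = sa.create_engine("mysql+pymysql://user:pass@localhost/shop")

loader = SQLAlchemyDask(
    model=Orders,
    filters={"status": "open"},  # interpreted by FilterHandler; key format is assumed
    engine=engine,
    chunk_size=1000,
    debug=False,
)
ddf = loader.read_frame(fillna_value="")  # lazy Dask DataFrame
print(ddf.npartitions)
```

Each partition corresponds to one LIMIT/OFFSET page of `chunk_size` rows, so nothing is materialized until the Dask graph is computed.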
{sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/df_helper/core/__init__.py
@@ -1,8 +1,6 @@
 from __future__ import annotations

 from ._defaults import (
-    django_field_conversion_map_pandas,
-    django_field_conversion_map_dask,
     sqlalchemy_field_conversion_map_dask,
     normalize_sqlalchemy_type)
 from ._filter_handler import FilterHandler
@@ -12,8 +10,6 @@ from ._query_config import QueryConfig
 __all__ = [
     "ParamsConfig",
     "QueryConfig",
-    "django_field_conversion_map_pandas",
-    "django_field_conversion_map_dask",
     "sqlalchemy_field_conversion_map_dask",
     "normalize_sqlalchemy_type",
     "FilterHandler",
{sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/df_helper/core/_defaults.py
@@ -13,56 +13,7 @@ from sqlalchemy.dialects.mysql import TINYINT, MEDIUMTEXT
 # conversion_map is a dictionary that maps the field types to their corresponding data type conversion functions.
 # Each entry in the dictionary is a pair of a field type (as a string) and a callable function that performs the
 # conversion. This mapping is used to convert the values in a pandas DataFrame to the appropriate data types based on
-# the
-
-django_field_conversion_map_pandas: Dict[str, callable] = {
-    "CharField": lambda x: x.astype(str),
-    "TextField": lambda x: x.astype(str),
-    "IntegerField": lambda x: pd.to_numeric(x, errors="coerce"),
-    "AutoField": lambda x: pd.to_numeric(x, errors="coerce"),
-    "BigAutoField": lambda x: pd.to_numeric(x, errors="coerce"),
-    "BigIntegerField": lambda x: pd.to_numeric(x, errors="coerce"),
-    "SmallIntegerField": lambda x: pd.to_numeric(x, errors="coerce"),
-    "PositiveIntegerField": lambda x: pd.to_numeric(x, errors="coerce"),
-    "PositiveSmallIntegerField": lambda x: pd.to_numeric(x, errors="coerce"),
-    "FloatField": lambda x: pd.to_numeric(x, errors="coerce"),
-    "DecimalField": lambda x: pd.to_numeric(x, errors="coerce"),
-    "BooleanField": lambda x: x.astype(bool),
-    "NullBooleanField": lambda x: x.astype(bool),
-    "DateTimeField": lambda x: pd.to_datetime(x, errors="coerce"),
-    "DateField": lambda x: pd.to_datetime(x, errors="coerce").dt.date,
-    "TimeField": lambda x: pd.to_datetime(x, errors="coerce").dt.time,
-    "DurationField": lambda x: pd.to_timedelta(x, errors="coerce"),
-    # for JSONField, assuming JSON objects are represented as string in df
-    "JSONField": lambda x: x.apply(json.loads),
-    "ArrayField": lambda x: x.apply(eval),
-    "UUIDField": lambda x: x.astype(str),
-}
-
-django_field_conversion_map_dask: Dict[str, callable] = {
-    "CharField": lambda x: x.astype(str),
-    "TextField": lambda x: x.astype(str),
-    "IntegerField": lambda x: pd.to_numeric(x, errors="coerce"),
-    "AutoField": lambda x: pd.to_numeric(x, errors="coerce"),
-    "BigAutoField": lambda x: pd.to_numeric(x, errors="coerce"),
-    "BigIntegerField": lambda x: pd.to_numeric(x, errors="coerce"),
-    "SmallIntegerField": lambda x: pd.to_numeric(x, errors="coerce"),
-    "PositiveIntegerField": lambda x: pd.to_numeric(x, errors="coerce"),
-    "PositiveSmallIntegerField": lambda x: pd.to_numeric(x, errors="coerce"),
-    "FloatField": lambda x: pd.to_numeric(x, errors="coerce"),
-    "DecimalField": lambda x: pd.to_numeric(x, errors="coerce"),
-    "BooleanField": lambda x: x.astype(bool),
-    "NullBooleanField": lambda x: x.astype(bool),
-    "DateTimeField": lambda x: pd.to_datetime(x, errors="coerce"),
-    "DateField": lambda x: pd.to_datetime(x, errors="coerce").map_partitions(lambda x: x.dt.date,
-                                                                             meta=("date", "object")),
-    "TimeField": lambda x: pd.to_datetime(x, errors="coerce").map_partitions(lambda x: x.dt.time,
-                                                                             meta=("time", "object")),
-    "DurationField": lambda x: pd.to_timedelta(x, errors="coerce"),
-    "JSONField": lambda x: x.map_partitions(lambda s: s.apply(json.loads), meta=("json", "object")),
-    "ArrayField": lambda x: x.map_partitions(lambda s: s.apply(eval), meta=("array", "object")),
-    "UUIDField": lambda x: x.astype(str),
-}
+# the db field type.

 sqlalchemy_field_conversion_map_dask: Dict[str, callable] = {
     String.__name__: lambda x: x.astype(str).fillna(""),
{sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/utils/__init__.py
@@ -10,7 +10,6 @@ from .df_utils import DfUtils
 from .storage_manager import StorageManager
 from .parquet_saver import ParquetSaver
 from .clickhouse_writer import ClickHouseWriter
-from .airflow_manager import AirflowDAGManager
 from .credentials import *
 from .update_planner import UpdatePlanner
 from .data_wrapper import DataWrapper
@@ -35,7 +34,6 @@ __all__ = [
     "StorageManager",
     "DfUtils",
     "ClickHouseWriter",
-    "AirflowDAGManager",
     "StorageConfig",
     "FsRegistry",
     "DataFromHttpSource",
{sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/utils/data_wrapper.py
@@ -38,7 +38,7 @@ class DataWrapper:
         logger: Logger = None,
         show_progress: bool = False,
         timeout: float = 30,
-        max_threads: int =
+        max_threads: int = 3,
         **kwargs: Any,
     ):
         self.dataclass = dataclass
@@ -66,6 +66,7 @@ class DataWrapper:
         self.benchmarks: Dict[datetime.date, Dict[str, float]] = {}
         self.mmanifest = kwargs.get("mmanifest", None)
         self.update_planner=kwargs.get("update_planner", None)
+        self.datacls = self.dataclass(**self.class_params)

     def __enter__(self):
         """Context manager entry"""
@@ -164,28 +165,24 @@ class DataWrapper:
     def _process_single_date(self, date: datetime.date):
         """Core date processing logic with load/save timing and thread reporting"""
         path = f"{self.data_path}{date.year}/{date.month:02d}/{date.day:02d}/"
-        self.logger.
-        # self.logger.info(f"Path {path} in {self.skipped}: {path in self.skipped}")
+        self.logger.debug(f"Processing date {date.isoformat()} for {path}")
         if path in self.update_planner.skipped and self.update_planner.ignore_missing:
             self.logger.info(f"Skipping {date} as it exists in the skipped list")
             return
         full_path = f"{path}{self.parquet_filename}"

         thread_name = threading.current_thread().name
-        self.logger.
+        self.logger.debug(f"[{thread_name}] Executing date: {date} -> saving to: {full_path}")

         overall_start = time.perf_counter()
         try:
             load_start = time.perf_counter()
-
-
-
-
-
-            **self.load_params
-            )
+            date_filter = {f"{self.date_field}__date": {date.isoformat()}}
+            self.logger.debug(f"Loading data for {date} with filter: {date_filter}")
+            # Load data using the dataclass with the provided date filter
+            self.load_params.update(date_filter)
+            df = self.datacls.load(**self.load_params)
             load_time = time.perf_counter() - load_start
-
             if df.head(1, compute=True).empty:
                 if self.mmanifest:
                     schema = df._meta.dtypes.astype(str).to_dict()
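To make the new load path concrete: the wrapper now instantiates the dataclass once (`self.datacls`) and, for each date, merges a `<date_field>__date` filter into `load_params` before calling `load()`. A small sketch of the resulting filter, assuming a hypothetical `created_at` date field:

```python
# Sketch of the per-date filter built in _process_single_date above.
# "created_at" is an assumed date_field name, not taken from the package.
import datetime

date_field = "created_at"
date = datetime.date(2025, 1, 2)

date_filter = {f"{date_field}__date": {date.isoformat()}}
print(date_filter)  # {'created_at__date': {'2025-01-02'}} -- note the value is a one-element set

# The wrapper then does, in effect:
# self.load_params.update(date_filter)
# df = self.datacls.load(**self.load_params)
```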
{sibi_dst-0.3.64 → sibi_dst-2025.1.2}/sibi_dst/utils/update_planner.py
@@ -73,6 +73,8 @@ class UpdatePlanner:
         self.show_progress = show_progress
         self.logger = logger or Logger.default_logger(logger_name="update_planner")
         self.logger.set_level(Logger.DEBUG if debug else Logger.INFO)
+        self.debug = debug
+        self.verbose = verbose

         # Filesystem and age helper
         self.fs = fs or fsspec.filesystem(filesystem_type, **(filesystem_options or {}))
|