sibi-dst 0.3.52__tar.gz → 0.3.54__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82) hide show
  1. sibi_dst-0.3.54/PKG-INFO +97 -0
  2. sibi_dst-0.3.54/README.md +43 -0
  3. {sibi_dst-0.3.52 → sibi_dst-0.3.54}/pyproject.toml +2 -3
  4. {sibi_dst-0.3.52 → sibi_dst-0.3.54}/sibi_dst/df_helper/_df_helper.py +2 -1
  5. {sibi_dst-0.3.52 → sibi_dst-0.3.54}/sibi_dst/df_helper/_parquet_artifact.py +13 -1
  6. {sibi_dst-0.3.52 → sibi_dst-0.3.54}/sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +8 -2
  7. {sibi_dst-0.3.52 → sibi_dst-0.3.54}/sibi_dst/utils/data_wrapper.py +10 -5
  8. {sibi_dst-0.3.52 → sibi_dst-0.3.54}/sibi_dst/utils/parquet_saver.py +2 -7
  9. sibi_dst-0.3.52/PKG-INFO +0 -200
  10. sibi_dst-0.3.52/README.md +0 -145
  11. {sibi_dst-0.3.52 → sibi_dst-0.3.54}/sibi_dst/__init__.py +0 -0
  12. {sibi_dst-0.3.52 → sibi_dst-0.3.54}/sibi_dst/df_helper/__init__.py +0 -0
  13. {sibi_dst-0.3.52 → sibi_dst-0.3.54}/sibi_dst/df_helper/_artifact_updater_multi_wrapper.py +0 -0
  14. {sibi_dst-0.3.52 → sibi_dst-0.3.54}/sibi_dst/df_helper/_parquet_reader.py +0 -0
  15. {sibi_dst-0.3.52 → sibi_dst-0.3.54}/sibi_dst/df_helper/backends/__init__.py +0 -0
  16. {sibi_dst-0.3.52 → sibi_dst-0.3.54}/sibi_dst/df_helper/backends/django/__init__.py +0 -0
  17. {sibi_dst-0.3.52 → sibi_dst-0.3.54}/sibi_dst/df_helper/backends/django/_db_connection.py +0 -0
  18. {sibi_dst-0.3.52 → sibi_dst-0.3.54}/sibi_dst/df_helper/backends/django/_io_dask.py +0 -0
  19. {sibi_dst-0.3.52 → sibi_dst-0.3.54}/sibi_dst/df_helper/backends/django/_load_from_db.py +0 -0
  20. {sibi_dst-0.3.52 → sibi_dst-0.3.54}/sibi_dst/df_helper/backends/django/_sql_model_builder.py +0 -0
  21. {sibi_dst-0.3.52 → sibi_dst-0.3.54}/sibi_dst/df_helper/backends/http/__init__.py +0 -0
  22. {sibi_dst-0.3.52 → sibi_dst-0.3.54}/sibi_dst/df_helper/backends/http/_http_config.py +0 -0
  23. {sibi_dst-0.3.52 → sibi_dst-0.3.54}/sibi_dst/df_helper/backends/parquet/__init__.py +0 -0
  24. {sibi_dst-0.3.52 → sibi_dst-0.3.54}/sibi_dst/df_helper/backends/parquet/_filter_handler.py +0 -0
  25. {sibi_dst-0.3.52 → sibi_dst-0.3.54}/sibi_dst/df_helper/backends/parquet/_parquet_options.py +0 -0
  26. {sibi_dst-0.3.52 → sibi_dst-0.3.54}/sibi_dst/df_helper/backends/sqlalchemy/__init__.py +0 -0
  27. {sibi_dst-0.3.52 → sibi_dst-0.3.54}/sibi_dst/df_helper/backends/sqlalchemy/_filter_handler.py +0 -0
  28. {sibi_dst-0.3.52 → sibi_dst-0.3.54}/sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +0 -0
  29. {sibi_dst-0.3.52 → sibi_dst-0.3.54}/sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +0 -0
  30. {sibi_dst-0.3.52 → sibi_dst-0.3.54}/sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +0 -0
  31. {sibi_dst-0.3.52 → sibi_dst-0.3.54}/sibi_dst/df_helper/core/__init__.py +0 -0
  32. {sibi_dst-0.3.52 → sibi_dst-0.3.54}/sibi_dst/df_helper/core/_defaults.py +0 -0
  33. {sibi_dst-0.3.52 → sibi_dst-0.3.54}/sibi_dst/df_helper/core/_filter_handler.py +0 -0
  34. {sibi_dst-0.3.52 → sibi_dst-0.3.54}/sibi_dst/df_helper/core/_params_config.py +0 -0
  35. {sibi_dst-0.3.52 → sibi_dst-0.3.54}/sibi_dst/df_helper/core/_query_config.py +0 -0
  36. {sibi_dst-0.3.52 → sibi_dst-0.3.54}/sibi_dst/df_helper/data_cleaner.py +0 -0
  37. {sibi_dst-0.3.52 → sibi_dst-0.3.54}/sibi_dst/geopy_helper/__init__.py +0 -0
  38. {sibi_dst-0.3.52 → sibi_dst-0.3.54}/sibi_dst/geopy_helper/geo_location_service.py +0 -0
  39. {sibi_dst-0.3.52 → sibi_dst-0.3.54}/sibi_dst/geopy_helper/utils.py +0 -0
  40. {sibi_dst-0.3.52 → sibi_dst-0.3.54}/sibi_dst/osmnx_helper/__init__.py +0 -0
  41. {sibi_dst-0.3.52 → sibi_dst-0.3.54}/sibi_dst/osmnx_helper/base_osm_map.py +0 -0
  42. {sibi_dst-0.3.52 → sibi_dst-0.3.54}/sibi_dst/osmnx_helper/basemaps/__init__.py +0 -0
  43. {sibi_dst-0.3.52 → sibi_dst-0.3.54}/sibi_dst/osmnx_helper/basemaps/calendar_html.py +0 -0
  44. {sibi_dst-0.3.52 → sibi_dst-0.3.54}/sibi_dst/osmnx_helper/basemaps/router_plotter.py +0 -0
  45. {sibi_dst-0.3.52 → sibi_dst-0.3.54}/sibi_dst/osmnx_helper/utils.py +0 -0
  46. {sibi_dst-0.3.52 → sibi_dst-0.3.54}/sibi_dst/tests/__init__.py +0 -0
  47. {sibi_dst-0.3.52 → sibi_dst-0.3.54}/sibi_dst/tests/test_data_wrapper_class.py +0 -0
  48. {sibi_dst-0.3.52 → sibi_dst-0.3.54}/sibi_dst/utils/__init__.py +0 -0
  49. {sibi_dst-0.3.52 → sibi_dst-0.3.54}/sibi_dst/utils/airflow_manager.py +0 -0
  50. {sibi_dst-0.3.52 → sibi_dst-0.3.54}/sibi_dst/utils/clickhouse_writer.py +0 -0
  51. {sibi_dst-0.3.52 → sibi_dst-0.3.54}/sibi_dst/utils/credentials.py +0 -0
  52. {sibi_dst-0.3.52 → sibi_dst-0.3.54}/sibi_dst/utils/data_from_http_source.py +0 -0
  53. {sibi_dst-0.3.52 → sibi_dst-0.3.54}/sibi_dst/utils/data_utils.py +0 -0
  54. {sibi_dst-0.3.52 → sibi_dst-0.3.54}/sibi_dst/utils/date_utils.py +0 -0
  55. {sibi_dst-0.3.52 → sibi_dst-0.3.54}/sibi_dst/utils/df_utils.py +0 -0
  56. {sibi_dst-0.3.52 → sibi_dst-0.3.54}/sibi_dst/utils/file_utils.py +0 -0
  57. {sibi_dst-0.3.52 → sibi_dst-0.3.54}/sibi_dst/utils/filepath_generator.py +0 -0
  58. {sibi_dst-0.3.52 → sibi_dst-0.3.54}/sibi_dst/utils/log_utils.py +0 -0
  59. {sibi_dst-0.3.52 → sibi_dst-0.3.54}/sibi_dst/utils/phone_formatter.py +0 -0
  60. {sibi_dst-0.3.52 → sibi_dst-0.3.54}/sibi_dst/utils/storage_config.py +0 -0
  61. {sibi_dst-0.3.52 → sibi_dst-0.3.54}/sibi_dst/utils/storage_manager.py +0 -0
  62. {sibi_dst-0.3.52 → sibi_dst-0.3.54}/sibi_dst/utils/webdav_client.py +0 -0
  63. {sibi_dst-0.3.52 → sibi_dst-0.3.54}/sibi_dst/v2/__init__.py +0 -0
  64. {sibi_dst-0.3.52 → sibi_dst-0.3.54}/sibi_dst/v2/df_helper/__init__.py +0 -0
  65. {sibi_dst-0.3.52 → sibi_dst-0.3.54}/sibi_dst/v2/df_helper/_df_helper.py +0 -0
  66. {sibi_dst-0.3.52 → sibi_dst-0.3.54}/sibi_dst/v2/df_helper/backends/__init__.py +0 -0
  67. {sibi_dst-0.3.52 → sibi_dst-0.3.54}/sibi_dst/v2/df_helper/backends/sqlalchemy/__init__.py +0 -0
  68. {sibi_dst-0.3.52 → sibi_dst-0.3.54}/sibi_dst/v2/df_helper/backends/sqlalchemy/_db_connection.py +0 -0
  69. {sibi_dst-0.3.52 → sibi_dst-0.3.54}/sibi_dst/v2/df_helper/backends/sqlalchemy/_io_dask.py +0 -0
  70. {sibi_dst-0.3.52 → sibi_dst-0.3.54}/sibi_dst/v2/df_helper/backends/sqlalchemy/_load_from_db.py +0 -0
  71. {sibi_dst-0.3.52 → sibi_dst-0.3.54}/sibi_dst/v2/df_helper/backends/sqlalchemy/_model_builder.py +0 -0
  72. {sibi_dst-0.3.52 → sibi_dst-0.3.54}/sibi_dst/v2/df_helper/backends/sqlmodel/__init__.py +0 -0
  73. {sibi_dst-0.3.52 → sibi_dst-0.3.54}/sibi_dst/v2/df_helper/backends/sqlmodel/_db_connection.py +0 -0
  74. {sibi_dst-0.3.52 → sibi_dst-0.3.54}/sibi_dst/v2/df_helper/backends/sqlmodel/_io_dask.py +0 -0
  75. {sibi_dst-0.3.52 → sibi_dst-0.3.54}/sibi_dst/v2/df_helper/backends/sqlmodel/_load_from_db.py +0 -0
  76. {sibi_dst-0.3.52 → sibi_dst-0.3.54}/sibi_dst/v2/df_helper/backends/sqlmodel/_model_builder.py +0 -0
  77. {sibi_dst-0.3.52 → sibi_dst-0.3.54}/sibi_dst/v2/df_helper/core/__init__.py +0 -0
  78. {sibi_dst-0.3.52 → sibi_dst-0.3.54}/sibi_dst/v2/df_helper/core/_filter_handler.py +0 -0
  79. {sibi_dst-0.3.52 → sibi_dst-0.3.54}/sibi_dst/v2/df_helper/core/_params_config.py +0 -0
  80. {sibi_dst-0.3.52 → sibi_dst-0.3.54}/sibi_dst/v2/df_helper/core/_query_config.py +0 -0
  81. {sibi_dst-0.3.52 → sibi_dst-0.3.54}/sibi_dst/v2/utils/__init__.py +0 -0
  82. {sibi_dst-0.3.52 → sibi_dst-0.3.54}/sibi_dst/v2/utils/log_utils.py +0 -0
@@ -0,0 +1,97 @@
1
+ Metadata-Version: 2.1
2
+ Name: sibi-dst
3
+ Version: 0.3.54
4
+ Summary: Data Science Toolkit
5
+ Author: Luis Valverde
6
+ Author-email: lvalverdeb@gmail.com
7
+ Requires-Python: >=3.11,<4.0
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: Programming Language :: Python :: 3.11
10
+ Classifier: Programming Language :: Python :: 3.12
11
+ Classifier: Programming Language :: Python :: 3.13
12
+ Provides-Extra: complete
13
+ Provides-Extra: df-helper
14
+ Provides-Extra: geospatial
15
+ Requires-Dist: apache-airflow-client (>=2.10.0,<3.0.0)
16
+ Requires-Dist: chardet (>=5.2.0,<6.0.0)
17
+ Requires-Dist: charset-normalizer (>=3.4.0,<4.0.0)
18
+ Requires-Dist: clickhouse-connect (>=0.8.7,<0.9.0)
19
+ Requires-Dist: clickhouse-driver (>=0.2.9,<0.3.0)
20
+ Requires-Dist: dask[complete] (>=2025.3.0,<2026.0.0)
21
+ Requires-Dist: django (>=5.1.4,<6.0.0) ; extra == "df-helper" or extra == "complete"
22
+ Requires-Dist: djangorestframework (>=3.15.2,<4.0.0) ; extra == "df-helper" or extra == "complete"
23
+ Requires-Dist: folium (>=0.19.4,<0.20.0) ; extra == "geospatial" or extra == "complete"
24
+ Requires-Dist: geopandas (>=1.0.1,<2.0.0) ; extra == "geospatial" or extra == "complete"
25
+ Requires-Dist: geopy (>=2.4.1,<3.0.0) ; extra == "geospatial" or extra == "complete"
26
+ Requires-Dist: gunicorn (>=23.0.0,<24.0.0)
27
+ Requires-Dist: httpx (>=0.27.2,<0.28.0)
28
+ Requires-Dist: ipython (>=8.29.0,<9.0.0)
29
+ Requires-Dist: jinja2 (>=3.1.4,<4.0.0)
30
+ Requires-Dist: mysqlclient (>=2.2.6,<3.0.0) ; extra == "df-helper" or extra == "complete"
31
+ Requires-Dist: nltk (>=3.9.1,<4.0.0)
32
+ Requires-Dist: openpyxl (>=3.1.5,<4.0.0)
33
+ Requires-Dist: osmnx (>=2.0.1,<3.0.0) ; extra == "geospatial" or extra == "complete"
34
+ Requires-Dist: pandas (>=2.2.3,<3.0.0)
35
+ Requires-Dist: paramiko (>=3.5.0,<4.0.0)
36
+ Requires-Dist: psutil (>=6.1.0,<7.0.0)
37
+ Requires-Dist: psycopg2 (>=2.9.10,<3.0.0) ; extra == "df-helper" or extra == "complete"
38
+ Requires-Dist: pyarrow (>=18.0.0,<19.0.0)
39
+ Requires-Dist: pydantic (>=2.9.2,<3.0.0)
40
+ Requires-Dist: pymysql (>=1.1.1,<2.0.0) ; extra == "df-helper" or extra == "complete"
41
+ Requires-Dist: pytest (>=8.3.3,<9.0.0)
42
+ Requires-Dist: pytest-mock (>=3.14.0,<4.0.0)
43
+ Requires-Dist: python-dotenv (>=1.0.1,<2.0.0)
44
+ Requires-Dist: s3fs (>=2024.12.0,<2025.0.0)
45
+ Requires-Dist: sqlalchemy (>=2.0.36,<3.0.0) ; extra == "df-helper" or extra == "complete"
46
+ Requires-Dist: sqlmodel (>=0.0.22,<0.0.23) ; extra == "df-helper" or extra == "complete"
47
+ Requires-Dist: tornado (>=6.4.1,<7.0.0)
48
+ Requires-Dist: tqdm (>=4.67.0,<5.0.0)
49
+ Requires-Dist: uvicorn (>=0.34.0,<0.35.0)
50
+ Requires-Dist: uvicorn-worker (>=0.3.0,<0.4.0)
51
+ Requires-Dist: webdav4[fsspec] (>=0.10.0,<0.11.0)
52
+ Description-Content-Type: text/markdown
53
+
54
+ ---
55
+ title: "Sibi Data Science Toolkit"
56
+ description: "Data Science Toolkit built with Python, Pandas, Dask, GeoPandas, OpenStreetMaps, SQLAlchemy"
57
+ author: "Luis Valverde"
58
+ current_maintainer: "Luis Valverde"
59
+ ---
60
+
61
+ ### SIBI-DST
62
+
63
+ Data Science Toolkit built with Python, Pandas, Dask, OpenStreetMaps, NetworkX, SQLAlchemy, GeoPandas, and Folium.
64
+
65
+ ## Example Use Cases
66
+
67
+ 1. **Build DataCubes, DataSets, and DataObjects** from diverse data sources, including **relational databases, Parquet files, Excel (`.xlsx`), delimited tables (`.csv`, `.tsv`), JSON, and RESTful APIs**.
68
+ 2. **Comprehensive DataFrame Management** utilities for efficient data handling, transformation, and optimization using **Pandas** and **Dask**.
69
+ 3. **Flexible Data Sharing** with client applications by writing to **Data Warehouses in Clickhouse, local filesystems, and cloud storage platforms** such as **S3**.
70
+ 4. **Microservices for Data Access** – Build scalable **API-driven services** using **RESTful APIs (`Django REST Framework`, `FastAPI`)** for high-performance data exchange.
71
+ 5. **Geospatial Analysis** – Utilize **OpenStreetMaps** and **GeoPandas** for advanced geospatial data processing and visualization.
72
+
73
+ ## Supported Technologies
74
+
75
+ - **Data Processing**: Pandas, Dask
76
+ - **Databases & Storage**: SQLAlchemy, Parquet, S3, Clickhouse
77
+ - **Mapping & Geospatial Analysis**: OpenStreetMaps, OSMnx, Geopy
78
+ - **API Development**: Django REST Framework, FastAPI
79
+
80
+ ## Installation
81
+
82
+ ```bash
83
+ # with pip
84
+
85
+ pip install "sibi-dst[complete]" # Install all dependencies
86
+ pip install "sibi-dst[df_helper]" # Install only df_helper dependencies
87
+ pip install "sibi-dst[geospatial]" # Install only geospatial dependencies
88
+
89
+ # with poetry
90
+
91
+ poetry add "sibi-dst[complete]" # Install all dependencies
92
+ poetry add "sibi-dst[df_helper]" # Install only df_helper dependencies
93
+ poetry add "sibi-dst[geospatial]" # Install only geospatial dependencies
94
+
95
+
96
+ ```
97
+
@@ -0,0 +1,43 @@
1
+ ---
2
+ title: "Sibi Data Science Toolkit"
3
+ description: "Data Science Toolkit built with Python, Pandas, Dask, GeoPandas, OpenStreetMaps, SQLAlchemy"
4
+ author: "Luis Valverde"
5
+ current_maintainer: "Luis Valverde"
6
+ ---
7
+
8
+ ### SIBI-DST
9
+
10
+ Data Science Toolkit built with Python, Pandas, Dask, OpenStreetMaps, NetworkX, SQLAlchemy, GeoPandas, and Folium.
11
+
12
+ ## Example Use Cases
13
+
14
+ 1. **Build DataCubes, DataSets, and DataObjects** from diverse data sources, including **relational databases, Parquet files, Excel (`.xlsx`), delimited tables (`.csv`, `.tsv`), JSON, and RESTful APIs**.
15
+ 2. **Comprehensive DataFrame Management** utilities for efficient data handling, transformation, and optimization using **Pandas** and **Dask**.
16
+ 3. **Flexible Data Sharing** with client applications by writing to **Data Warehouses in Clickhouse, local filesystems, and cloud storage platforms** such as **S3**.
17
+ 4. **Microservices for Data Access** – Build scalable **API-driven services** using **RESTful APIs (`Django REST Framework`, `FastAPI`)** for high-performance data exchange.
18
+ 5. **Geospatial Analysis** – Utilize **OpenStreetMaps** and **GeoPandas** for advanced geospatial data processing and visualization.
19
+
20
+ ## Supported Technologies
21
+
22
+ - **Data Processing**: Pandas, Dask
23
+ - **Databases & Storage**: SQLAlchemy, Parquet, S3, Clickhouse
24
+ - **Mapping & Geospatial Analysis**: OpenStreetMaps, OSMnx, Geopy
25
+ - **API Development**: Django REST Framework, FastAPI
26
+
27
+ ## Installation
28
+
29
+ ```bash
30
+ # with pip
31
+
32
+ pip install "sibi-dst[complete]" # Install all dependencies
33
+ pip install "sibi-dst[df_helper]" # Install only df_helper dependencies
34
+ pip install "sibi-dst[geospatial]" # Install only geospatial dependencies
35
+
36
+ # with poetry
37
+
38
+ poetry add "sibi-dst[complete]" # Install all dependencies
39
+ poetry add "sibi-dst[df_helper]" # Install only df_helper dependencies
40
+ poetry add "sibi-dst[geospatial]" # Install only geospatial dependencies
41
+
42
+
43
+ ```
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "sibi-dst"
3
- version = "0.3.52"
3
+ version = "0.3.54"
4
4
  description = "Data Science Toolkit"
5
5
  authors = ["Luis Valverde <lvalverdeb@gmail.com>"]
6
6
  readme = "README.md"
@@ -48,8 +48,7 @@ webdav4 = { extras = ["fsspec"], version = "^0.10.0" }
48
48
 
49
49
  [tool.poetry.extras]
50
50
  df_helper = ["django", "sqlalchemy", "sqlmodel", "djangorestframework", "pymysql", "psycopg2", "mysqlclient"]
51
- geopy_helper = ["geopy"]
52
- osmnx_helper = ["folium", "geopandas", "osmnx"]
51
+ geospatial = ["geopy","folium", "geopandas", "osmnx"]
53
52
  complete = ["django", "sqlalchemy", "sqlmodel", "djangorestframework", "pymysql", "psycopg2", "mysqlclient", "geopy", "folium", "geopandas", "osmnx"]
54
53
  [build-system]
55
54
  requires = ["poetry-core"]
@@ -74,7 +74,7 @@ class DfHelper:
74
74
  logger: Logger
75
75
  default_config: Dict = None
76
76
 
77
- def __init__(self, backend='django_db', **kwargs):
77
+ def __init__(self, backend='sqlalchemy', **kwargs):
78
78
  # Ensure default_config is not shared across instances
79
79
  self.default_config = self.default_config or {}
80
80
  kwargs = {**self.default_config.copy(), **kwargs}
@@ -209,6 +209,7 @@ class DfHelper:
209
209
  else:
210
210
  self.logger.debug("Regular asyncio run...")
211
211
  return asyncio.run(self.__load_from_http(**options))
212
+ return dd.from_pandas(pd.DataFrame(), npartitions=1)
212
213
 
213
214
  def __load_from_sqlalchemy(self, **options):
214
215
  """
@@ -151,9 +151,21 @@ class ParquetArtifact(DfHelper):
151
151
  'parquet_end_date': datetime.date.today().strftime('%Y-%m-%d')
152
152
  }
153
153
 
154
+ def custom_config():
155
+ try:
156
+ start_date = kwargs.pop('start_on')
157
+ end_date = kwargs.pop('end_on')
158
+ except KeyError:
159
+ raise ValueError("For period 'custom', you must provide 'start_on' in kwargs.")
160
+ return {
161
+ 'parquet_start_date': start_date,
162
+ 'parquet_end_date': end_date
163
+ }
164
+
154
165
  config_map = {
155
166
  'itd': itd_config,
156
- 'ytd': ytd_config
167
+ 'ytd': ytd_config,
168
+ 'custom': custom_config,
157
169
  }
158
170
 
159
171
  if period in config_map:
@@ -29,7 +29,10 @@ class SqlAlchemyConnectionConfig(BaseModel):
29
29
  connection_url: str
30
30
  table: Optional[str] = None
31
31
  model: Any = None
32
- engine: Optional[Any] = None # Save engine to reuse it
32
+ engine: Optional[Any] = None
33
+ pool_size: int = 10
34
+ max_overflow: int = 5
35
+ pool_timeout: int = 30
33
36
 
34
37
  @model_validator(mode="after")
35
38
  def validate_and_initialize(self):
@@ -41,7 +44,10 @@ class SqlAlchemyConnectionConfig(BaseModel):
41
44
  raise ValueError("`connection_url` must be provided.")
42
45
 
43
46
  # Initialize the engine
44
- self.engine = create_engine(self.connection_url)
47
+ self.engine = create_engine(self.connection_url,
48
+ pool_size=self.pool_size,
49
+ max_overflow=self.max_overflow,
50
+ pool_timeout=self.pool_timeout)
45
51
 
46
52
  # Validate the connection
47
53
  self.validate_connection()
@@ -47,7 +47,8 @@ class DataWrapper:
47
47
  show_progress: bool = False,
48
48
  timeout: float = 60,
49
49
  reference_date: datetime.date = None,
50
- custom_priority_map: Dict[str, int] = None):
50
+ custom_priority_map: Dict[str, int] = None,
51
+ max_threads: int = 3):
51
52
  self.dataclass = dataclass
52
53
  self.date_field = date_field
53
54
  self.data_path = self._ensure_forward_slash(data_path)
@@ -70,12 +71,14 @@ class DataWrapper:
70
71
  self.timeout = timeout
71
72
  self.reference_date = reference_date or datetime.date.today()
72
73
  self.priority_map = custom_priority_map or self.DEFAULT_PRIORITY_MAP
74
+ self.max_threads = max_threads
73
75
 
74
76
  self.start_date = self._convert_to_date(start_date)
75
77
  self.end_date = self._convert_to_date(end_date)
76
78
  self._lock = Lock()
77
79
  self.processed_dates = []
78
80
  self.age_checker = FileAgeChecker(logger=self.logger)
81
+ self.data_class_instance = self.dataclass(**self.class_params) or None
79
82
 
80
83
  def _init_filesystem(self) -> fsspec.AbstractFileSystem:
81
84
  with self._lock:
@@ -138,8 +141,9 @@ class DataWrapper:
138
141
 
139
142
  desc = f"Processing {self.dataclass.__name__}, task: {self._priority_label(priority)}"
140
143
  self.logger.info(f"Starting {desc.lower()}")
141
-
142
- with ThreadPoolExecutor() as executor:
144
+ max_threads = min(len(dates), self.max_threads)
145
+ self.logger.info(f"DataWrapper Max threads set at: {max_threads}")
146
+ with ThreadPoolExecutor(max_workers=max_threads) as executor:
143
147
  futures = {
144
148
  executor.submit(self._process_date_with_retry, date, max_retries): date
145
149
  for date in dates
@@ -262,8 +266,9 @@ class DataWrapper:
262
266
  try:
263
267
  self.logger.debug(f"Class Params: {self.class_params}")
264
268
  self.logger.debug(f"Load Params: {self.load_params}")
265
- data = self.dataclass(**self.class_params)
266
- df = data.load_period(
269
+
270
+ #data = self.dataclass(**self.class_params)
271
+ df = self.data_class_instance.load_period(
267
272
  dt_field=self.date_field,
268
273
  start=date,
269
274
  end=date,
@@ -1,15 +1,10 @@
1
1
  import base64
2
+ import hashlib
2
3
  import logging
3
- from pathlib import Path
4
+ import warnings
4
5
  from typing import Optional
5
6
 
6
7
  import pyarrow as pa
7
- import fsspec
8
- import warnings
9
- import hashlib
10
- from s3fs import S3FileSystem
11
-
12
- from fsspec import filesystem
13
8
 
14
9
  # Suppress the specific UserWarning message
15
10
  warnings.filterwarnings("ignore")
sibi_dst-0.3.52/PKG-INFO DELETED
@@ -1,200 +0,0 @@
1
- Metadata-Version: 2.1
2
- Name: sibi-dst
3
- Version: 0.3.52
4
- Summary: Data Science Toolkit
5
- Author: Luis Valverde
6
- Author-email: lvalverdeb@gmail.com
7
- Requires-Python: >=3.11,<4.0
8
- Classifier: Programming Language :: Python :: 3
9
- Classifier: Programming Language :: Python :: 3.11
10
- Classifier: Programming Language :: Python :: 3.12
11
- Classifier: Programming Language :: Python :: 3.13
12
- Provides-Extra: complete
13
- Provides-Extra: df-helper
14
- Provides-Extra: geopy-helper
15
- Provides-Extra: osmnx-helper
16
- Requires-Dist: apache-airflow-client (>=2.10.0,<3.0.0)
17
- Requires-Dist: chardet (>=5.2.0,<6.0.0)
18
- Requires-Dist: charset-normalizer (>=3.4.0,<4.0.0)
19
- Requires-Dist: clickhouse-connect (>=0.8.7,<0.9.0)
20
- Requires-Dist: clickhouse-driver (>=0.2.9,<0.3.0)
21
- Requires-Dist: dask[complete] (>=2025.3.0,<2026.0.0)
22
- Requires-Dist: django (>=5.1.4,<6.0.0) ; extra == "df-helper" or extra == "complete"
23
- Requires-Dist: djangorestframework (>=3.15.2,<4.0.0) ; extra == "df-helper" or extra == "complete"
24
- Requires-Dist: folium (>=0.19.4,<0.20.0) ; extra == "osmnx-helper" or extra == "complete"
25
- Requires-Dist: geopandas (>=1.0.1,<2.0.0) ; extra == "osmnx-helper" or extra == "complete"
26
- Requires-Dist: geopy (>=2.4.1,<3.0.0) ; extra == "geopy-helper" or extra == "complete"
27
- Requires-Dist: gunicorn (>=23.0.0,<24.0.0)
28
- Requires-Dist: httpx (>=0.27.2,<0.28.0)
29
- Requires-Dist: ipython (>=8.29.0,<9.0.0)
30
- Requires-Dist: jinja2 (>=3.1.4,<4.0.0)
31
- Requires-Dist: mysqlclient (>=2.2.6,<3.0.0) ; extra == "df-helper" or extra == "complete"
32
- Requires-Dist: nltk (>=3.9.1,<4.0.0)
33
- Requires-Dist: openpyxl (>=3.1.5,<4.0.0)
34
- Requires-Dist: osmnx (>=2.0.1,<3.0.0) ; extra == "osmnx-helper" or extra == "complete"
35
- Requires-Dist: pandas (>=2.2.3,<3.0.0)
36
- Requires-Dist: paramiko (>=3.5.0,<4.0.0)
37
- Requires-Dist: psutil (>=6.1.0,<7.0.0)
38
- Requires-Dist: psycopg2 (>=2.9.10,<3.0.0) ; extra == "df-helper" or extra == "complete"
39
- Requires-Dist: pyarrow (>=18.0.0,<19.0.0)
40
- Requires-Dist: pydantic (>=2.9.2,<3.0.0)
41
- Requires-Dist: pymysql (>=1.1.1,<2.0.0) ; extra == "df-helper" or extra == "complete"
42
- Requires-Dist: pytest (>=8.3.3,<9.0.0)
43
- Requires-Dist: pytest-mock (>=3.14.0,<4.0.0)
44
- Requires-Dist: python-dotenv (>=1.0.1,<2.0.0)
45
- Requires-Dist: s3fs (>=2024.12.0,<2025.0.0)
46
- Requires-Dist: sqlalchemy (>=2.0.36,<3.0.0) ; extra == "df-helper" or extra == "complete"
47
- Requires-Dist: sqlmodel (>=0.0.22,<0.0.23) ; extra == "df-helper" or extra == "complete"
48
- Requires-Dist: tornado (>=6.4.1,<7.0.0)
49
- Requires-Dist: tqdm (>=4.67.0,<5.0.0)
50
- Requires-Dist: uvicorn (>=0.34.0,<0.35.0)
51
- Requires-Dist: uvicorn-worker (>=0.3.0,<0.4.0)
52
- Requires-Dist: webdav4[fsspec] (>=0.10.0,<0.11.0)
53
- Description-Content-Type: text/markdown
54
-
55
- # sibi-dst
56
-
57
- Data Science Toolkit
58
- ---------------------
59
- Data Science Toolkit built with Python, Pandas, Dask, OpenStreetMaps, Scikit-Learn, XGBOOST, Django ORM, SQLAlchemy, DjangoRestFramework, FastAPI
60
-
61
- Major Functionality
62
- --------------------
63
- 1) **Build DataCubes, DataSets, and DataObjects** from diverse data sources, including **relational databases, Parquet files, Excel (`.xlsx`), delimited tables (`.csv`, `.tsv`), JSON, and RESTful APIs (`JSON API REST`)**.
64
- 2) **Comprehensive DataFrame Management** utilities for efficient data handling, transformation, and optimization using **Pandas** and **Dask**.
65
- 3) **Flexible Data Sharing** with client applications by writing to **Data Warehouses, local filesystems, and cloud storage platforms** such as **Amazon S3, Google Cloud Storage (GCS), and Azure Blob Storage**.
66
- 4) **Microservices for Data Access** – Build scalable **API-driven services** using **RESTful APIs (`Django REST Framework`, `FastAPI`) and gRPC** for high-performance data exchange.
67
-
68
- Supported Technologies
69
- --------------------
70
- - **Data Processing**: Pandas, Dask
71
- - **Machine Learning**: Scikit-Learn, XGBoost
72
- - **Databases & Storage**: SQLAlchemy, Django ORM, Parquet, Amazon S3, GCS, Azure Blob Storage
73
- - **Mapping & Geospatial Analysis**: OpenStreetMaps, OSMnx, Geopy
74
- - **API Development**: Django REST Framework, gRPC, FastAPI
75
-
76
- Installation
77
- ---------------------
78
- ```bash
79
- pip install sibi-dst
80
- ```
81
-
82
- Usage
83
- ---------------------
84
- ### Loading Data from SQLAlchemy
85
- ```python
86
- from sibi_dst.df_helper import DfHelper
87
- from conf.transforms.fields.crm import customer_fields
88
- from conf.credentials import replica_db_conf
89
- from conf.storage import get_fs_instance
90
-
91
- config = {
92
- 'backend': 'sqlalchemy',
93
- 'connection_url': replica_db_conf.get('db_url'),
94
- 'table': 'crm_clientes_archivo',
95
- 'field_map': customer_fields,
96
- 'legacy_filters': True,
97
- 'fs': get_fs_instance()
98
- }
99
-
100
- df_helper = DfHelper(**config)
101
- result = df_helper.load(id__gte=1)
102
- ```
103
-
104
- ### Saving Data to ClickHouse
105
- ```python
106
- clk_creds = {
107
- 'host': '192.168.3.171',
108
- 'port': 18123,
109
- 'user': 'username',
110
- 'database': 'xxxxxxx',
111
- 'table': 'customer_file',
112
- 'order_by': 'id'
113
- }
114
-
115
- df_helper.save_to_clickhouse(**clk_creds)
116
- ```
117
-
118
- ### Saving Data to Parquet
119
- ```python
120
- df_helper.save_to_parquet(
121
- parquet_filename='filename.parquet',
122
- parquet_storage_path='/path/to/my/files/'
123
- )
124
- ```
125
-
126
- Backends Supported
127
- ---------------------
128
- | Backend | Description |
129
- |--------------|-------------|
130
- | `sqlalchemy` | Load data from SQL databases using SQLAlchemy. |
131
- | `django_db` | Load data from Django ORM models. |
132
- | `parquet` | Load and save data from Parquet files. |
133
- | `http` | Fetch data from HTTP endpoints. |
134
- | `osmnx` | Geospatial mapping and routing using OpenStreetMap. |
135
- | `geopy` | Geolocation services for address lookup and reverse geocoding. |
136
-
137
- Geospatial Utilities
138
- ---------------------
139
- ### **OSMnx Helper (`sibi_dst.osmnx_helper`)
140
- **
141
- Provides **OpenStreetMap-based mapping utilities** using `osmnx` and `folium`.
142
-
143
- #### 🔹 Key Features
144
- - **BaseOsmMap**: Manages interactive Folium-based maps.
145
- - **PBFHandler**: Loads `.pbf` (Protocolbuffer Binary Format) files for network graphs.
146
-
147
- #### Example: Generating an OSM Map
148
- ```python
149
- from sibi_dst.osmnx_helper import BaseOsmMap
150
- osm_map = BaseOsmMap(osmnx_graph=my_graph, df=my_dataframe)
151
- osm_map.generate_map()
152
- ```
153
-
154
- ### **Geopy Helper (`sibi_dst.geopy_helper`)
155
- **
156
- Provides **geolocation services** using `Geopy` for forward and reverse geocoding.
157
-
158
- #### 🔹 Key Features
159
- - **GeolocationService**: Interfaces with `Nominatim` API for geocoding.
160
- - **Error Handling**: Manages `GeocoderTimedOut` and `GeocoderServiceError` gracefully.
161
- - **Singleton Geolocator**: Efficiently reuses a global geolocator instance.
162
-
163
- #### Example: Reverse Geocoding
164
- ```python
165
- from sibi_dst.geopy_helper import GeolocationService
166
- gs = GeolocationService()
167
- location = gs.reverse((9.935,-84.091))
168
- print(location)
169
- ```
170
-
171
- Advanced Features
172
- ---------------------
173
- ### Querying with Custom Filters
174
- Filters can be applied dynamically using Django-style syntax:
175
- ```python
176
- result = df_helper.load(date__gte='2023-01-01', status='active')
177
- ```
178
-
179
- ### Parallel Processing
180
- Leverage Dask for parallel execution:
181
- ```python
182
- result = df_helper.load_parallel(status='active')
183
- ```
184
-
185
- Testing
186
- ---------------------
187
- To run unit tests, use:
188
- ```bash
189
- pytest tests/
190
- ```
191
-
192
- Contributing
193
- ---------------------
194
- Contributions are welcome! Please submit pull requests or open issues for discussions.
195
-
196
- License
197
- ---------------------
198
- sibi-dst is licensed under the MIT License.
199
-
200
-
sibi_dst-0.3.52/README.md DELETED
@@ -1,145 +0,0 @@
1
- # sibi-dst
2
-
3
- Data Science Toolkit
4
- ---------------------
5
- Data Science Toolkit built with Python, Pandas, Dask, OpenStreetMaps, Scikit-Learn, XGBOOST, Django ORM, SQLAlchemy, DjangoRestFramework, FastAPI
6
-
7
- Major Functionality
8
- --------------------
9
- 1) **Build DataCubes, DataSets, and DataObjects** from diverse data sources, including **relational databases, Parquet files, Excel (`.xlsx`), delimited tables (`.csv`, `.tsv`), JSON, and RESTful APIs (`JSON API REST`)**.
10
- 2) **Comprehensive DataFrame Management** utilities for efficient data handling, transformation, and optimization using **Pandas** and **Dask**.
11
- 3) **Flexible Data Sharing** with client applications by writing to **Data Warehouses, local filesystems, and cloud storage platforms** such as **Amazon S3, Google Cloud Storage (GCS), and Azure Blob Storage**.
12
- 4) **Microservices for Data Access** – Build scalable **API-driven services** using **RESTful APIs (`Django REST Framework`, `FastAPI`) and gRPC** for high-performance data exchange.
13
-
14
- Supported Technologies
15
- --------------------
16
- - **Data Processing**: Pandas, Dask
17
- - **Machine Learning**: Scikit-Learn, XGBoost
18
- - **Databases & Storage**: SQLAlchemy, Django ORM, Parquet, Amazon S3, GCS, Azure Blob Storage
19
- - **Mapping & Geospatial Analysis**: OpenStreetMaps, OSMnx, Geopy
20
- - **API Development**: Django REST Framework, gRPC, FastAPI
21
-
22
- Installation
23
- ---------------------
24
- ```bash
25
- pip install sibi-dst
26
- ```
27
-
28
- Usage
29
- ---------------------
30
- ### Loading Data from SQLAlchemy
31
- ```python
32
- from sibi_dst.df_helper import DfHelper
33
- from conf.transforms.fields.crm import customer_fields
34
- from conf.credentials import replica_db_conf
35
- from conf.storage import get_fs_instance
36
-
37
- config = {
38
- 'backend': 'sqlalchemy',
39
- 'connection_url': replica_db_conf.get('db_url'),
40
- 'table': 'crm_clientes_archivo',
41
- 'field_map': customer_fields,
42
- 'legacy_filters': True,
43
- 'fs': get_fs_instance()
44
- }
45
-
46
- df_helper = DfHelper(**config)
47
- result = df_helper.load(id__gte=1)
48
- ```
49
-
50
- ### Saving Data to ClickHouse
51
- ```python
52
- clk_creds = {
53
- 'host': '192.168.3.171',
54
- 'port': 18123,
55
- 'user': 'username',
56
- 'database': 'xxxxxxx',
57
- 'table': 'customer_file',
58
- 'order_by': 'id'
59
- }
60
-
61
- df_helper.save_to_clickhouse(**clk_creds)
62
- ```
63
-
64
- ### Saving Data to Parquet
65
- ```python
66
- df_helper.save_to_parquet(
67
- parquet_filename='filename.parquet',
68
- parquet_storage_path='/path/to/my/files/'
69
- )
70
- ```
71
-
72
- Backends Supported
73
- ---------------------
74
- | Backend | Description |
75
- |--------------|-------------|
76
- | `sqlalchemy` | Load data from SQL databases using SQLAlchemy. |
77
- | `django_db` | Load data from Django ORM models. |
78
- | `parquet` | Load and save data from Parquet files. |
79
- | `http` | Fetch data from HTTP endpoints. |
80
- | `osmnx` | Geospatial mapping and routing using OpenStreetMap. |
81
- | `geopy` | Geolocation services for address lookup and reverse geocoding. |
82
-
83
- Geospatial Utilities
84
- ---------------------
85
- ### **OSMnx Helper (`sibi_dst.osmnx_helper`)
86
- **
87
- Provides **OpenStreetMap-based mapping utilities** using `osmnx` and `folium`.
88
-
89
- #### 🔹 Key Features
90
- - **BaseOsmMap**: Manages interactive Folium-based maps.
91
- - **PBFHandler**: Loads `.pbf` (Protocolbuffer Binary Format) files for network graphs.
92
-
93
- #### Example: Generating an OSM Map
94
- ```python
95
- from sibi_dst.osmnx_helper import BaseOsmMap
96
- osm_map = BaseOsmMap(osmnx_graph=my_graph, df=my_dataframe)
97
- osm_map.generate_map()
98
- ```
99
-
100
- ### **Geopy Helper (`sibi_dst.geopy_helper`)
101
- **
102
- Provides **geolocation services** using `Geopy` for forward and reverse geocoding.
103
-
104
- #### 🔹 Key Features
105
- - **GeolocationService**: Interfaces with `Nominatim` API for geocoding.
106
- - **Error Handling**: Manages `GeocoderTimedOut` and `GeocoderServiceError` gracefully.
107
- - **Singleton Geolocator**: Efficiently reuses a global geolocator instance.
108
-
109
- #### Example: Reverse Geocoding
110
- ```python
111
- from sibi_dst.geopy_helper import GeolocationService
112
- gs = GeolocationService()
113
- location = gs.reverse((9.935,-84.091))
114
- print(location)
115
- ```
116
-
117
- Advanced Features
118
- ---------------------
119
- ### Querying with Custom Filters
120
- Filters can be applied dynamically using Django-style syntax:
121
- ```python
122
- result = df_helper.load(date__gte='2023-01-01', status='active')
123
- ```
124
-
125
- ### Parallel Processing
126
- Leverage Dask for parallel execution:
127
- ```python
128
- result = df_helper.load_parallel(status='active')
129
- ```
130
-
131
- Testing
132
- ---------------------
133
- To run unit tests, use:
134
- ```bash
135
- pytest tests/
136
- ```
137
-
138
- Contributing
139
- ---------------------
140
- Contributions are welcome! Please submit pull requests or open issues for discussions.
141
-
142
- License
143
- ---------------------
144
- sibi-dst is licensed under the MIT License.
145
-