sibi-dst 0.3.53__py3-none-any.whl → 0.3.54__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -29,7 +29,10 @@ class SqlAlchemyConnectionConfig(BaseModel):
29
29
  connection_url: str
30
30
  table: Optional[str] = None
31
31
  model: Any = None
32
- engine: Optional[Any] = None # Save engine to reuse it
32
+ engine: Optional[Any] = None
33
+ pool_size: int = 10
34
+ max_overflow: int = 5
35
+ pool_timeout: int = 30
33
36
 
34
37
  @model_validator(mode="after")
35
38
  def validate_and_initialize(self):
@@ -41,7 +44,10 @@ class SqlAlchemyConnectionConfig(BaseModel):
41
44
  raise ValueError("`connection_url` must be provided.")
42
45
 
43
46
  # Initialize the engine
44
- self.engine = create_engine(self.connection_url)
47
+ self.engine = create_engine(self.connection_url,
48
+ pool_size=self.pool_size,
49
+ max_overflow=self.max_overflow,
50
+ pool_timeout=self.pool_timeout)
45
51
 
46
52
  # Validate the connection
47
53
  self.validate_connection()
@@ -78,6 +78,7 @@ class DataWrapper:
78
78
  self._lock = Lock()
79
79
  self.processed_dates = []
80
80
  self.age_checker = FileAgeChecker(logger=self.logger)
81
+ self.data_class_instance = self.dataclass(**self.class_params) or None
81
82
 
82
83
  def _init_filesystem(self) -> fsspec.AbstractFileSystem:
83
84
  with self._lock:
@@ -265,8 +266,9 @@ class DataWrapper:
265
266
  try:
266
267
  self.logger.debug(f"Class Params: {self.class_params}")
267
268
  self.logger.debug(f"Load Params: {self.load_params}")
268
- data = self.dataclass(**self.class_params)
269
- df = data.load_period(
269
+
270
+ #data = self.dataclass(**self.class_params)
271
+ df = self.data_class_instance.load_period(
270
272
  dt_field=self.date_field,
271
273
  start=date,
272
274
  end=date,
@@ -0,0 +1,97 @@
1
+ Metadata-Version: 2.1
2
+ Name: sibi-dst
3
+ Version: 0.3.54
4
+ Summary: Data Science Toolkit
5
+ Author: Luis Valverde
6
+ Author-email: lvalverdeb@gmail.com
7
+ Requires-Python: >=3.11,<4.0
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: Programming Language :: Python :: 3.11
10
+ Classifier: Programming Language :: Python :: 3.12
11
+ Classifier: Programming Language :: Python :: 3.13
12
+ Provides-Extra: complete
13
+ Provides-Extra: df-helper
14
+ Provides-Extra: geospatial
15
+ Requires-Dist: apache-airflow-client (>=2.10.0,<3.0.0)
16
+ Requires-Dist: chardet (>=5.2.0,<6.0.0)
17
+ Requires-Dist: charset-normalizer (>=3.4.0,<4.0.0)
18
+ Requires-Dist: clickhouse-connect (>=0.8.7,<0.9.0)
19
+ Requires-Dist: clickhouse-driver (>=0.2.9,<0.3.0)
20
+ Requires-Dist: dask[complete] (>=2025.3.0,<2026.0.0)
21
+ Requires-Dist: django (>=5.1.4,<6.0.0) ; extra == "df-helper" or extra == "complete"
22
+ Requires-Dist: djangorestframework (>=3.15.2,<4.0.0) ; extra == "df-helper" or extra == "complete"
23
+ Requires-Dist: folium (>=0.19.4,<0.20.0) ; extra == "geospatial" or extra == "complete"
24
+ Requires-Dist: geopandas (>=1.0.1,<2.0.0) ; extra == "geospatial" or extra == "complete"
25
+ Requires-Dist: geopy (>=2.4.1,<3.0.0) ; extra == "geospatial" or extra == "complete"
26
+ Requires-Dist: gunicorn (>=23.0.0,<24.0.0)
27
+ Requires-Dist: httpx (>=0.27.2,<0.28.0)
28
+ Requires-Dist: ipython (>=8.29.0,<9.0.0)
29
+ Requires-Dist: jinja2 (>=3.1.4,<4.0.0)
30
+ Requires-Dist: mysqlclient (>=2.2.6,<3.0.0) ; extra == "df-helper" or extra == "complete"
31
+ Requires-Dist: nltk (>=3.9.1,<4.0.0)
32
+ Requires-Dist: openpyxl (>=3.1.5,<4.0.0)
33
+ Requires-Dist: osmnx (>=2.0.1,<3.0.0) ; extra == "geospatial" or extra == "complete"
34
+ Requires-Dist: pandas (>=2.2.3,<3.0.0)
35
+ Requires-Dist: paramiko (>=3.5.0,<4.0.0)
36
+ Requires-Dist: psutil (>=6.1.0,<7.0.0)
37
+ Requires-Dist: psycopg2 (>=2.9.10,<3.0.0) ; extra == "df-helper" or extra == "complete"
38
+ Requires-Dist: pyarrow (>=18.0.0,<19.0.0)
39
+ Requires-Dist: pydantic (>=2.9.2,<3.0.0)
40
+ Requires-Dist: pymysql (>=1.1.1,<2.0.0) ; extra == "df-helper" or extra == "complete"
41
+ Requires-Dist: pytest (>=8.3.3,<9.0.0)
42
+ Requires-Dist: pytest-mock (>=3.14.0,<4.0.0)
43
+ Requires-Dist: python-dotenv (>=1.0.1,<2.0.0)
44
+ Requires-Dist: s3fs (>=2024.12.0,<2025.0.0)
45
+ Requires-Dist: sqlalchemy (>=2.0.36,<3.0.0) ; extra == "df-helper" or extra == "complete"
46
+ Requires-Dist: sqlmodel (>=0.0.22,<0.0.23) ; extra == "df-helper" or extra == "complete"
47
+ Requires-Dist: tornado (>=6.4.1,<7.0.0)
48
+ Requires-Dist: tqdm (>=4.67.0,<5.0.0)
49
+ Requires-Dist: uvicorn (>=0.34.0,<0.35.0)
50
+ Requires-Dist: uvicorn-worker (>=0.3.0,<0.4.0)
51
+ Requires-Dist: webdav4[fsspec] (>=0.10.0,<0.11.0)
52
+ Description-Content-Type: text/markdown
53
+
54
+ ---
55
+ title: "Sibi Data Science Toolkit"
56
+ description: "Data Science Toolkit built with Python, Pandas, Dask, GeoPandas, OpenStreetMap, SQLAlchemy"
57
+ author: "Luis Valverde"
58
+ current_maintainer: "Luis Valverde"
59
+ ---
60
+
61
+ ### SIBI-DST
62
+
63
+ Data Science Toolkit built with Python, Pandas, Dask, OpenStreetMap, NetworkX, SQLAlchemy, GeoPandas, and Folium.
64
+
65
+ ## Example Use Cases
66
+
67
+ 1. **Build DataCubes, DataSets, and DataObjects** from diverse data sources, including **relational databases, Parquet files, Excel (`.xlsx`), delimited tables (`.csv`, `.tsv`), JSON, and RESTful APIs**.
68
+ 2. **Comprehensive DataFrame Management** utilities for efficient data handling, transformation, and optimization using **Pandas** and **Dask**.
69
+ 3. **Flexible Data Sharing** with client applications by writing to **Data Warehouses in ClickHouse, local filesystems, and cloud storage platforms** such as **S3**.
70
+ 4. **Microservices for Data Access** – Build scalable **API-driven services** using **RESTful APIs (`Django REST Framework`, `FastAPI`)** for high-performance data exchange.
71
+ 5. **Geospatial Analysis** – Utilize **OpenStreetMap** and **GeoPandas** for advanced geospatial data processing and visualization.
72
+
73
+ ## Supported Technologies
74
+
75
+ - **Data Processing**: Pandas, Dask
76
+ - **Databases & Storage**: SQLAlchemy, Parquet, S3, ClickHouse
77
+ - **Mapping & Geospatial Analysis**: OpenStreetMap, OSMnx, Geopy
78
+ - **API Development**: Django REST Framework, FastAPI
79
+
80
+ ## Installation
81
+
82
+ ```bash
83
+ # with pip
84
+
85
+ pip install sibi-dst[complete] # Install all dependencies
86
+ pip install sibi-dst[df-helper] # Install only df_helper dependencies
87
+ pip install sibi-dst[geospatial] # Install only geospatial dependencies
88
+
89
+ # with poetry
90
+
91
+ poetry add "sibi-dst[complete]" # Install all dependencies
92
+ poetry add "sibi-dst[df-helper]" # Install only df_helper dependencies
93
+ poetry add "sibi-dst[geospatial]" # Install only geospatial dependencies
94
+
95
+
96
+ ```
97
+
@@ -16,7 +16,7 @@ sibi_dst/df_helper/backends/parquet/__init__.py,sha256=esWJ9aSuYC26d-T01z9dPrJ1u
16
16
  sibi_dst/df_helper/backends/parquet/_filter_handler.py,sha256=TvDf0RXta7mwJv11GNQttYJsXgFf2XDj4oLIjt4xTzA,5219
17
17
  sibi_dst/df_helper/backends/parquet/_parquet_options.py,sha256=TaU5_wG1Y3lQC8DVCItVvMnc6ZJmECLu3avssVEMbaM,10591
18
18
  sibi_dst/df_helper/backends/sqlalchemy/__init__.py,sha256=TuVp8Ce49dCIIxtyrtFGRblarQUl8QGcS-TDZd515IE,348
19
- sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py,sha256=Kli83IEg5SFVqkhsK4w45cV6PbZnfdGanfsyiW6Xw00,2502
19
+ sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py,sha256=bQxQAA2GNSmNsRWTZbsLmpvK6pnkbVzECBXQ9H2iqRc,2750
20
20
  sibi_dst/df_helper/backends/sqlalchemy/_filter_handler.py,sha256=58RCda1Hg_nsuJw-2V36IstsT8O84IQFgsdE7FnqvMk,4655
21
21
  sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py,sha256=BtiRSYA4kFIM-mBCdrwE20vzByfq8_Biv_jPLUCDv58,5466
22
22
  sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py,sha256=I2Us3RrxHci561yyZYBuUCrLVOhB0F3KBnae78m_ARw,6259
@@ -44,7 +44,7 @@ sibi_dst/utils/clickhouse_writer.py,sha256=iAUe4_Kn2WR1xZjpLW2FOWCWfOTw6fCGMTUcW
44
44
  sibi_dst/utils/credentials.py,sha256=cHJPPsmVyijqbUQIq7WWPe-lIallA-mI5RAy3YUuRME,1724
45
45
  sibi_dst/utils/data_from_http_source.py,sha256=AcpKNsqTgN2ClNwuhgUpuNCx62r5_DdsAiKY8vcHEBA,1867
46
46
  sibi_dst/utils/data_utils.py,sha256=MqbwXk33BuANWeKKmsabHouhb8GZswSmbM-VetWWE-M,10357
47
- sibi_dst/utils/data_wrapper.py,sha256=OMmdjnUCjxDsC1T0K1JbzxKXLTiDOtxPp1ujV66bMAY,12345
47
+ sibi_dst/utils/data_wrapper.py,sha256=-RqK_sU3uuc6U9dFfnICFZkaIetk3JOvJain6_lTWlo,12446
48
48
  sibi_dst/utils/date_utils.py,sha256=OCJqkWl5e8fE7z11Ufz4206DUeuLMd_Gf_JGZu914Pg,18539
49
49
  sibi_dst/utils/df_utils.py,sha256=TzIAUCLbgOn3bvCFvzkc1S9YU-OlZTImdCj-88dtg8g,11401
50
50
  sibi_dst/utils/file_utils.py,sha256=Z99CZ_4nPDIaZqbCfzzUDfAYJjSudWDj-mwEO8grhbc,1253
@@ -75,6 +75,6 @@ sibi_dst/v2/df_helper/core/_params_config.py,sha256=DYx2drDz3uF-lSPzizPkchhy-kxR
75
75
  sibi_dst/v2/df_helper/core/_query_config.py,sha256=Y8LVSyaKuVkrPluRDkQoOwuXHQxner1pFWG3HPfnDHM,441
76
76
  sibi_dst/v2/utils/__init__.py,sha256=6H4cvhqTiFufnFPETBF0f8beVVMpfJfvUs6Ne0TQZNY,58
77
77
  sibi_dst/v2/utils/log_utils.py,sha256=rfk5VsLAt-FKpv6aPTC1FToIPiyrnHAFFBAkHme24po,4123
78
- sibi_dst-0.3.53.dist-info/METADATA,sha256=rNy87i3acBUu27BdqZ-J73e9oy6pd8YEKCX-HrE48ig,7182
79
- sibi_dst-0.3.53.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
80
- sibi_dst-0.3.53.dist-info/RECORD,,
78
+ sibi_dst-0.3.54.dist-info/METADATA,sha256=i2guOpX-oxuWb7-Jk2GNLQrSY83VjaAulNA_9AyuVzk,4502
79
+ sibi_dst-0.3.54.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
80
+ sibi_dst-0.3.54.dist-info/RECORD,,
@@ -1,199 +0,0 @@
1
- Metadata-Version: 2.1
2
- Name: sibi-dst
3
- Version: 0.3.53
4
- Summary: Data Science Toolkit
5
- Author: Luis Valverde
6
- Author-email: lvalverdeb@gmail.com
7
- Requires-Python: >=3.11,<4.0
8
- Classifier: Programming Language :: Python :: 3
9
- Classifier: Programming Language :: Python :: 3.11
10
- Classifier: Programming Language :: Python :: 3.12
11
- Classifier: Programming Language :: Python :: 3.13
12
- Provides-Extra: complete
13
- Provides-Extra: df-helper
14
- Provides-Extra: geospatial
15
- Requires-Dist: apache-airflow-client (>=2.10.0,<3.0.0)
16
- Requires-Dist: chardet (>=5.2.0,<6.0.0)
17
- Requires-Dist: charset-normalizer (>=3.4.0,<4.0.0)
18
- Requires-Dist: clickhouse-connect (>=0.8.7,<0.9.0)
19
- Requires-Dist: clickhouse-driver (>=0.2.9,<0.3.0)
20
- Requires-Dist: dask[complete] (>=2025.3.0,<2026.0.0)
21
- Requires-Dist: django (>=5.1.4,<6.0.0) ; extra == "df-helper" or extra == "complete"
22
- Requires-Dist: djangorestframework (>=3.15.2,<4.0.0) ; extra == "df-helper" or extra == "complete"
23
- Requires-Dist: folium (>=0.19.4,<0.20.0) ; extra == "geospatial" or extra == "complete"
24
- Requires-Dist: geopandas (>=1.0.1,<2.0.0) ; extra == "geospatial" or extra == "complete"
25
- Requires-Dist: geopy (>=2.4.1,<3.0.0) ; extra == "geospatial" or extra == "complete"
26
- Requires-Dist: gunicorn (>=23.0.0,<24.0.0)
27
- Requires-Dist: httpx (>=0.27.2,<0.28.0)
28
- Requires-Dist: ipython (>=8.29.0,<9.0.0)
29
- Requires-Dist: jinja2 (>=3.1.4,<4.0.0)
30
- Requires-Dist: mysqlclient (>=2.2.6,<3.0.0) ; extra == "df-helper" or extra == "complete"
31
- Requires-Dist: nltk (>=3.9.1,<4.0.0)
32
- Requires-Dist: openpyxl (>=3.1.5,<4.0.0)
33
- Requires-Dist: osmnx (>=2.0.1,<3.0.0) ; extra == "geospatial" or extra == "complete"
34
- Requires-Dist: pandas (>=2.2.3,<3.0.0)
35
- Requires-Dist: paramiko (>=3.5.0,<4.0.0)
36
- Requires-Dist: psutil (>=6.1.0,<7.0.0)
37
- Requires-Dist: psycopg2 (>=2.9.10,<3.0.0) ; extra == "df-helper" or extra == "complete"
38
- Requires-Dist: pyarrow (>=18.0.0,<19.0.0)
39
- Requires-Dist: pydantic (>=2.9.2,<3.0.0)
40
- Requires-Dist: pymysql (>=1.1.1,<2.0.0) ; extra == "df-helper" or extra == "complete"
41
- Requires-Dist: pytest (>=8.3.3,<9.0.0)
42
- Requires-Dist: pytest-mock (>=3.14.0,<4.0.0)
43
- Requires-Dist: python-dotenv (>=1.0.1,<2.0.0)
44
- Requires-Dist: s3fs (>=2024.12.0,<2025.0.0)
45
- Requires-Dist: sqlalchemy (>=2.0.36,<3.0.0) ; extra == "df-helper" or extra == "complete"
46
- Requires-Dist: sqlmodel (>=0.0.22,<0.0.23) ; extra == "df-helper" or extra == "complete"
47
- Requires-Dist: tornado (>=6.4.1,<7.0.0)
48
- Requires-Dist: tqdm (>=4.67.0,<5.0.0)
49
- Requires-Dist: uvicorn (>=0.34.0,<0.35.0)
50
- Requires-Dist: uvicorn-worker (>=0.3.0,<0.4.0)
51
- Requires-Dist: webdav4[fsspec] (>=0.10.0,<0.11.0)
52
- Description-Content-Type: text/markdown
53
-
54
- # sibi-dst
55
-
56
- Data Science Toolkit
57
- ---------------------
58
- Data Science Toolkit built with Python, Pandas, Dask, OpenStreetMaps, Scikit-Learn, XGBOOST, Django ORM, SQLAlchemy, DjangoRestFramework, FastAPI
59
-
60
- Major Functionality
61
- --------------------
62
- 1) **Build DataCubes, DataSets, and DataObjects** from diverse data sources, including **relational databases, Parquet files, Excel (`.xlsx`), delimited tables (`.csv`, `.tsv`), JSON, and RESTful APIs (`JSON API REST`)**.
63
- 2) **Comprehensive DataFrame Management** utilities for efficient data handling, transformation, and optimization using **Pandas** and **Dask**.
64
- 3) **Flexible Data Sharing** with client applications by writing to **Data Warehouses, local filesystems, and cloud storage platforms** such as **Amazon S3, Google Cloud Storage (GCS), and Azure Blob Storage**.
65
- 4) **Microservices for Data Access** – Build scalable **API-driven services** using **RESTful APIs (`Django REST Framework`, `FastAPI`) and gRPC** for high-performance data exchange.
66
-
67
- Supported Technologies
68
- --------------------
69
- - **Data Processing**: Pandas, Dask
70
- - **Machine Learning**: Scikit-Learn, XGBoost
71
- - **Databases & Storage**: SQLAlchemy, Django ORM, Parquet, Amazon S3, GCS, Azure Blob Storage
72
- - **Mapping & Geospatial Analysis**: OpenStreetMaps, OSMnx, Geopy
73
- - **API Development**: Django REST Framework, gRPC, FastAPI
74
-
75
- Installation
76
- ---------------------
77
- ```bash
78
- pip install sibi-dst
79
- ```
80
-
81
- Usage
82
- ---------------------
83
- ### Loading Data from SQLAlchemy
84
- ```python
85
- from sibi_dst.df_helper import DfHelper
86
- from conf.transforms.fields.crm import customer_fields
87
- from conf.credentials import replica_db_conf
88
- from conf.storage import get_fs_instance
89
-
90
- config = {
91
- 'backend': 'sqlalchemy',
92
- 'connection_url': replica_db_conf.get('db_url'),
93
- 'table': 'crm_clientes_archivo',
94
- 'field_map': customer_fields,
95
- 'legacy_filters': True,
96
- 'fs': get_fs_instance()
97
- }
98
-
99
- df_helper = DfHelper(**config)
100
- result = df_helper.load(id__gte=1)
101
- ```
102
-
103
- ### Saving Data to ClickHouse
104
- ```python
105
- clk_creds = {
106
- 'host': '192.168.3.171',
107
- 'port': 18123,
108
- 'user': 'username',
109
- 'database': 'xxxxxxx',
110
- 'table': 'customer_file',
111
- 'order_by': 'id'
112
- }
113
-
114
- df_helper.save_to_clickhouse(**clk_creds)
115
- ```
116
-
117
- ### Saving Data to Parquet
118
- ```python
119
- df_helper.save_to_parquet(
120
- parquet_filename='filename.parquet',
121
- parquet_storage_path='/path/to/my/files/'
122
- )
123
- ```
124
-
125
- Backends Supported
126
- ---------------------
127
- | Backend | Description |
128
- |--------------|-------------|
129
- | `sqlalchemy` | Load data from SQL databases using SQLAlchemy. |
130
- | `django_db` | Load data from Django ORM models. |
131
- | `parquet` | Load and save data from Parquet files. |
132
- | `http` | Fetch data from HTTP endpoints. |
133
- | `osmnx` | Geospatial mapping and routing using OpenStreetMap. |
134
- | `geopy` | Geolocation services for address lookup and reverse geocoding. |
135
-
136
- Geospatial Utilities
137
- ---------------------
138
- ### **OSMnx Helper (`sibi_dst.osmnx_helper`)
139
- **
140
- Provides **OpenStreetMap-based mapping utilities** using `osmnx` and `folium`.
141
-
142
- #### 🔹 Key Features
143
- - **BaseOsmMap**: Manages interactive Folium-based maps.
144
- - **PBFHandler**: Loads `.pbf` (Protocolbuffer Binary Format) files for network graphs.
145
-
146
- #### Example: Generating an OSM Map
147
- ```python
148
- from sibi_dst.osmnx_helper import BaseOsmMap
149
- osm_map = BaseOsmMap(osmnx_graph=my_graph, df=my_dataframe)
150
- osm_map.generate_map()
151
- ```
152
-
153
- ### **Geopy Helper (`sibi_dst.geopy_helper`)
154
- **
155
- Provides **geolocation services** using `Geopy` for forward and reverse geocoding.
156
-
157
- #### 🔹 Key Features
158
- - **GeolocationService**: Interfaces with `Nominatim` API for geocoding.
159
- - **Error Handling**: Manages `GeocoderTimedOut` and `GeocoderServiceError` gracefully.
160
- - **Singleton Geolocator**: Efficiently reuses a global geolocator instance.
161
-
162
- #### Example: Reverse Geocoding
163
- ```python
164
- from sibi_dst.geopy_helper import GeolocationService
165
- gs = GeolocationService()
166
- location = gs.reverse((9.935,-84.091))
167
- print(location)
168
- ```
169
-
170
- Advanced Features
171
- ---------------------
172
- ### Querying with Custom Filters
173
- Filters can be applied dynamically using Django-style syntax:
174
- ```python
175
- result = df_helper.load(date__gte='2023-01-01', status='active')
176
- ```
177
-
178
- ### Parallel Processing
179
- Leverage Dask for parallel execution:
180
- ```python
181
- result = df_helper.load_parallel(status='active')
182
- ```
183
-
184
- Testing
185
- ---------------------
186
- To run unit tests, use:
187
- ```bash
188
- pytest tests/
189
- ```
190
-
191
- Contributing
192
- ---------------------
193
- Contributions are welcome! Please submit pull requests or open issues for discussions.
194
-
195
- License
196
- ---------------------
197
- sibi-dst is licensed under the MIT License.
198
-
199
-