sibi-dst 0.3.42__tar.gz → 0.3.44__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sibi_dst-0.3.44/PKG-INFO +194 -0
- sibi_dst-0.3.44/README.md +145 -0
- {sibi_dst-0.3.42 → sibi_dst-0.3.44}/pyproject.toml +1 -1
- {sibi_dst-0.3.42 → sibi_dst-0.3.44}/sibi_dst/df_helper/_artifact_updater_multi_wrapper.py +7 -2
- {sibi_dst-0.3.42 → sibi_dst-0.3.44}/sibi_dst/df_helper/_df_helper.py +5 -2
- {sibi_dst-0.3.42 → sibi_dst-0.3.44}/sibi_dst/df_helper/_parquet_artifact.py +33 -3
- {sibi_dst-0.3.42 → sibi_dst-0.3.44}/sibi_dst/df_helper/_parquet_reader.py +5 -1
- {sibi_dst-0.3.42 → sibi_dst-0.3.44}/sibi_dst/df_helper/backends/django/_load_from_db.py +1 -0
- {sibi_dst-0.3.42 → sibi_dst-0.3.44}/sibi_dst/df_helper/backends/parquet/_filter_handler.py +2 -1
- {sibi_dst-0.3.42 → sibi_dst-0.3.44}/sibi_dst/df_helper/backends/parquet/_parquet_options.py +2 -3
- {sibi_dst-0.3.42 → sibi_dst-0.3.44}/sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +1 -0
- {sibi_dst-0.3.42 → sibi_dst-0.3.44}/sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +2 -5
- {sibi_dst-0.3.42 → sibi_dst-0.3.44}/sibi_dst/df_helper/core/_filter_handler.py +2 -1
- {sibi_dst-0.3.42 → sibi_dst-0.3.44}/sibi_dst/osmnx_helper/__init__.py +2 -2
- {sibi_dst-0.3.42/sibi_dst/osmnx_helper → sibi_dst-0.3.44/sibi_dst/osmnx_helper/v1}/basemaps/router_plotter.py +85 -30
- sibi_dst-0.3.44/sibi_dst/osmnx_helper/v2/__init__.py +0 -0
- sibi_dst-0.3.44/sibi_dst/osmnx_helper/v2/base_osm_map.py +153 -0
- sibi_dst-0.3.44/sibi_dst/osmnx_helper/v2/basemaps/__init__.py +0 -0
- sibi_dst-0.3.44/sibi_dst/osmnx_helper/v2/basemaps/utils.py +0 -0
- sibi_dst-0.3.44/sibi_dst/tests/__init__.py +0 -0
- sibi_dst-0.3.44/sibi_dst/utils/data_wrapper.py +301 -0
- {sibi_dst-0.3.42 → sibi_dst-0.3.44}/sibi_dst/utils/df_utils.py +7 -0
- {sibi_dst-0.3.42 → sibi_dst-0.3.44}/sibi_dst/utils/log_utils.py +6 -0
- {sibi_dst-0.3.42 → sibi_dst-0.3.44}/sibi_dst/utils/parquet_saver.py +4 -2
- {sibi_dst-0.3.42 → sibi_dst-0.3.44}/sibi_dst/utils/storage_manager.py +14 -7
- sibi_dst-0.3.42/PKG-INFO +0 -62
- sibi_dst-0.3.42/README.md +0 -13
- sibi_dst-0.3.42/sibi_dst/utils/data_wrapper.py +0 -665
- {sibi_dst-0.3.42 → sibi_dst-0.3.44}/sibi_dst/__init__.py +0 -0
- {sibi_dst-0.3.42 → sibi_dst-0.3.44}/sibi_dst/df_helper/__init__.py +0 -0
- {sibi_dst-0.3.42 → sibi_dst-0.3.44}/sibi_dst/df_helper/backends/__init__.py +0 -0
- {sibi_dst-0.3.42 → sibi_dst-0.3.44}/sibi_dst/df_helper/backends/django/__init__.py +0 -0
- {sibi_dst-0.3.42 → sibi_dst-0.3.44}/sibi_dst/df_helper/backends/django/_db_connection.py +0 -0
- {sibi_dst-0.3.42 → sibi_dst-0.3.44}/sibi_dst/df_helper/backends/django/_io_dask.py +0 -0
- {sibi_dst-0.3.42 → sibi_dst-0.3.44}/sibi_dst/df_helper/backends/django/_sql_model_builder.py +0 -0
- {sibi_dst-0.3.42 → sibi_dst-0.3.44}/sibi_dst/df_helper/backends/http/__init__.py +0 -0
- {sibi_dst-0.3.42 → sibi_dst-0.3.44}/sibi_dst/df_helper/backends/http/_http_config.py +0 -0
- {sibi_dst-0.3.42 → sibi_dst-0.3.44}/sibi_dst/df_helper/backends/parquet/__init__.py +0 -0
- {sibi_dst-0.3.42 → sibi_dst-0.3.44}/sibi_dst/df_helper/backends/sqlalchemy/__init__.py +0 -0
- {sibi_dst-0.3.42 → sibi_dst-0.3.44}/sibi_dst/df_helper/backends/sqlalchemy/_filter_handler.py +0 -0
- {sibi_dst-0.3.42 → sibi_dst-0.3.44}/sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +0 -0
- {sibi_dst-0.3.42 → sibi_dst-0.3.44}/sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +0 -0
- {sibi_dst-0.3.42 → sibi_dst-0.3.44}/sibi_dst/df_helper/core/__init__.py +0 -0
- {sibi_dst-0.3.42 → sibi_dst-0.3.44}/sibi_dst/df_helper/core/_defaults.py +0 -0
- {sibi_dst-0.3.42 → sibi_dst-0.3.44}/sibi_dst/df_helper/core/_params_config.py +0 -0
- {sibi_dst-0.3.42 → sibi_dst-0.3.44}/sibi_dst/df_helper/core/_query_config.py +0 -0
- {sibi_dst-0.3.42 → sibi_dst-0.3.44}/sibi_dst/df_helper/data_cleaner.py +0 -0
- {sibi_dst-0.3.42 → sibi_dst-0.3.44}/sibi_dst/geopy_helper/__init__.py +0 -0
- {sibi_dst-0.3.42 → sibi_dst-0.3.44}/sibi_dst/geopy_helper/geo_location_service.py +0 -0
- {sibi_dst-0.3.42 → sibi_dst-0.3.44}/sibi_dst/geopy_helper/utils.py +0 -0
- {sibi_dst-0.3.42/sibi_dst/osmnx_helper/basemaps → sibi_dst-0.3.44/sibi_dst/osmnx_helper/v1}/__init__.py +0 -0
- {sibi_dst-0.3.42/sibi_dst/osmnx_helper → sibi_dst-0.3.44/sibi_dst/osmnx_helper/v1}/base_osm_map.py +0 -0
- {sibi_dst-0.3.42/sibi_dst/tests → sibi_dst-0.3.44/sibi_dst/osmnx_helper/v1/basemaps}/__init__.py +0 -0
- {sibi_dst-0.3.42/sibi_dst/osmnx_helper → sibi_dst-0.3.44/sibi_dst/osmnx_helper/v1}/basemaps/calendar_html.py +0 -0
- {sibi_dst-0.3.42/sibi_dst/osmnx_helper → sibi_dst-0.3.44/sibi_dst/osmnx_helper/v1}/utils.py +0 -0
- {sibi_dst-0.3.42 → sibi_dst-0.3.44}/sibi_dst/tests/test_data_wrapper_class.py +0 -0
- {sibi_dst-0.3.42 → sibi_dst-0.3.44}/sibi_dst/utils/__init__.py +0 -0
- {sibi_dst-0.3.42 → sibi_dst-0.3.44}/sibi_dst/utils/airflow_manager.py +0 -0
- {sibi_dst-0.3.42 → sibi_dst-0.3.44}/sibi_dst/utils/clickhouse_writer.py +0 -0
- {sibi_dst-0.3.42 → sibi_dst-0.3.44}/sibi_dst/utils/credentials.py +0 -0
- {sibi_dst-0.3.42 → sibi_dst-0.3.44}/sibi_dst/utils/data_utils.py +0 -0
- {sibi_dst-0.3.42 → sibi_dst-0.3.44}/sibi_dst/utils/date_utils.py +0 -0
- {sibi_dst-0.3.42 → sibi_dst-0.3.44}/sibi_dst/utils/file_utils.py +0 -0
- {sibi_dst-0.3.42 → sibi_dst-0.3.44}/sibi_dst/utils/filepath_generator.py +0 -0
- {sibi_dst-0.3.42 → sibi_dst-0.3.44}/sibi_dst/utils/phone_formatter.py +0 -0
sibi_dst-0.3.44/PKG-INFO
ADDED
@@ -0,0 +1,194 @@
|
|
1
|
+
Metadata-Version: 2.1
|
2
|
+
Name: sibi-dst
|
3
|
+
Version: 0.3.44
|
4
|
+
Summary: Data Science Toolkit
|
5
|
+
Author: Luis Valverde
|
6
|
+
Author-email: lvalverdeb@gmail.com
|
7
|
+
Requires-Python: >=3.11,<4.0
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
9
|
+
Classifier: Programming Language :: Python :: 3.11
|
10
|
+
Classifier: Programming Language :: Python :: 3.12
|
11
|
+
Classifier: Programming Language :: Python :: 3.13
|
12
|
+
Requires-Dist: apache-airflow-client (>=2.10.0,<3.0.0)
|
13
|
+
Requires-Dist: chardet (>=5.2.0,<6.0.0)
|
14
|
+
Requires-Dist: charset-normalizer (>=3.4.0,<4.0.0)
|
15
|
+
Requires-Dist: clickhouse-connect (>=0.8.7,<0.9.0)
|
16
|
+
Requires-Dist: clickhouse-driver (>=0.2.9,<0.3.0)
|
17
|
+
Requires-Dist: dask-expr (>=1.1.20,<2.0.0)
|
18
|
+
Requires-Dist: dask[complete] (>=2024.11.1,<2025.0.0)
|
19
|
+
Requires-Dist: django (>=5.1.4,<6.0.0)
|
20
|
+
Requires-Dist: djangorestframework (>=3.15.2,<4.0.0)
|
21
|
+
Requires-Dist: folium (>=0.19.4,<0.20.0)
|
22
|
+
Requires-Dist: geopandas (>=1.0.1,<2.0.0)
|
23
|
+
Requires-Dist: gunicorn (>=23.0.0,<24.0.0)
|
24
|
+
Requires-Dist: httpx (>=0.27.2,<0.28.0)
|
25
|
+
Requires-Dist: ipython (>=8.29.0,<9.0.0)
|
26
|
+
Requires-Dist: jinja2 (>=3.1.4,<4.0.0)
|
27
|
+
Requires-Dist: mysqlclient (>=2.2.6,<3.0.0)
|
28
|
+
Requires-Dist: nltk (>=3.9.1,<4.0.0)
|
29
|
+
Requires-Dist: openpyxl (>=3.1.5,<4.0.0)
|
30
|
+
Requires-Dist: osmnx (>=2.0.1,<3.0.0)
|
31
|
+
Requires-Dist: pandas (>=2.2.3,<3.0.0)
|
32
|
+
Requires-Dist: paramiko (>=3.5.0,<4.0.0)
|
33
|
+
Requires-Dist: psutil (>=6.1.0,<7.0.0)
|
34
|
+
Requires-Dist: psycopg2 (>=2.9.10,<3.0.0)
|
35
|
+
Requires-Dist: pyarrow (>=18.0.0,<19.0.0)
|
36
|
+
Requires-Dist: pydantic (>=2.9.2,<3.0.0)
|
37
|
+
Requires-Dist: pymysql (>=1.1.1,<2.0.0)
|
38
|
+
Requires-Dist: pytest (>=8.3.3,<9.0.0)
|
39
|
+
Requires-Dist: pytest-mock (>=3.14.0,<4.0.0)
|
40
|
+
Requires-Dist: python-dotenv (>=1.0.1,<2.0.0)
|
41
|
+
Requires-Dist: s3fs (>=2024.12.0,<2025.0.0)
|
42
|
+
Requires-Dist: sqlalchemy (>=2.0.36,<3.0.0)
|
43
|
+
Requires-Dist: tornado (>=6.4.1,<7.0.0)
|
44
|
+
Requires-Dist: tqdm (>=4.67.0,<5.0.0)
|
45
|
+
Requires-Dist: uvicorn (>=0.34.0,<0.35.0)
|
46
|
+
Requires-Dist: uvicorn-worker (>=0.3.0,<0.4.0)
|
47
|
+
Description-Content-Type: text/markdown
|
48
|
+
|
49
|
+
# sibi-dst
|
50
|
+
|
51
|
+
Data Science Toolkit
|
52
|
+
---------------------
|
53
|
+
Data Science Toolkit built with Python, Pandas, Dask, OpenStreetMap, Scikit-Learn, XGBoost, Django ORM, SQLAlchemy, Django REST Framework, and FastAPI.
|
54
|
+
|
55
|
+
Major Functionality
|
56
|
+
--------------------
|
57
|
+
1) **Build DataCubes, DataSets, and DataObjects** from diverse data sources, including **relational databases, Parquet files, Excel (`.xlsx`), delimited tables (`.csv`, `.tsv`), JSON, and RESTful APIs (`JSON API REST`)**.
|
58
|
+
2) **Comprehensive DataFrame Management** utilities for efficient data handling, transformation, and optimization using **Pandas** and **Dask**.
|
59
|
+
3) **Flexible Data Sharing** with client applications by writing to **Data Warehouses, local filesystems, and cloud storage platforms** such as **Amazon S3, Google Cloud Storage (GCS), and Azure Blob Storage**.
|
60
|
+
4) **Microservices for Data Access** – Build scalable **API-driven services** using **RESTful APIs (`Django REST Framework`, `FastAPI`) and gRPC** for high-performance data exchange.
|
61
|
+
|
62
|
+
Supported Technologies
|
63
|
+
--------------------
|
64
|
+
- **Data Processing**: Pandas, Dask
|
65
|
+
- **Machine Learning**: Scikit-Learn, XGBoost
|
66
|
+
- **Databases & Storage**: SQLAlchemy, Django ORM, Parquet, Amazon S3, GCS, Azure Blob Storage
|
67
|
+
- **Mapping & Geospatial Analysis**: OpenStreetMap, OSMnx, Geopy
|
68
|
+
- **API Development**: Django REST Framework, gRPC
|
69
|
+
|
70
|
+
Installation
|
71
|
+
---------------------
|
72
|
+
```bash
|
73
|
+
pip install sibi-dst
|
74
|
+
```
|
75
|
+
|
76
|
+
Usage
|
77
|
+
---------------------
|
78
|
+
### Loading Data from SQLAlchemy
|
79
|
+
```python
|
80
|
+
from sibi_dst.df_helper import DfHelper
|
81
|
+
from conf.transforms.fields.crm import customer_fields
|
82
|
+
from conf.credentials import replica_db_conf
|
83
|
+
from conf.storage import get_fs_instance
|
84
|
+
|
85
|
+
config = {
|
86
|
+
'backend': 'sqlalchemy',
|
87
|
+
'connection_url': replica_db_conf.get('db_url'),
|
88
|
+
'table': 'crm_clientes_archivo',
|
89
|
+
'field_map': customer_fields,
|
90
|
+
'legacy_filters': True,
|
91
|
+
'fs': get_fs_instance()
|
92
|
+
}
|
93
|
+
|
94
|
+
df_helper = DfHelper(**config)
|
95
|
+
result = df_helper.load(id__gte=1)
|
96
|
+
```
|
97
|
+
|
98
|
+
### Saving Data to ClickHouse
|
99
|
+
```python
|
100
|
+
clk_creds = {
|
101
|
+
'host': '192.168.3.171',
|
102
|
+
'port': 18123,
|
103
|
+
'user': 'username',
|
104
|
+
'database': 'xxxxxxx',
|
105
|
+
'table': 'customer_file',
|
106
|
+
'order_by': 'id'
|
107
|
+
}
|
108
|
+
|
109
|
+
df_helper.save_to_clickhouse(**clk_creds)
|
110
|
+
```
|
111
|
+
|
112
|
+
### Saving Data to Parquet
|
113
|
+
```python
|
114
|
+
df_helper.save_to_parquet(
|
115
|
+
parquet_filename='filename.parquet',
|
116
|
+
parquet_storage_path='/path/to/my/files/'
|
117
|
+
)
|
118
|
+
```
|
119
|
+
|
120
|
+
Backends Supported
|
121
|
+
---------------------
|
122
|
+
| Backend | Description |
|
123
|
+
|--------------|-------------|
|
124
|
+
| `sqlalchemy` | Load data from SQL databases using SQLAlchemy. |
|
125
|
+
| `django_db` | Load data from Django ORM models. |
|
126
|
+
| `parquet` | Load data from and save data to Parquet files. |
|
127
|
+
| `http` | Fetch data from HTTP endpoints. |
|
128
|
+
| `osmnx` | Geospatial mapping and routing using OpenStreetMap. |
|
129
|
+
| `geopy` | Geolocation services for address lookup and reverse geocoding. |
|
130
|
+
|
131
|
+
Geospatial Utilities
|
132
|
+
---------------------
|
133
|
+
### **OSMnx Helper (`sibi_dst.osmnx_helper`)**
|
134
|
+
|
135
|
+
Provides **OpenStreetMap-based mapping utilities** using `osmnx` and `folium`.
|
136
|
+
|
137
|
+
#### 🔹 Key Features
|
138
|
+
- **BaseOsmMap**: Manages interactive Folium-based maps.
|
139
|
+
- **PBFHandler**: Loads `.pbf` (Protocolbuffer Binary Format) files for network graphs.
|
140
|
+
|
141
|
+
#### Example: Generating an OSM Map
|
142
|
+
```python
|
143
|
+
from sibi_dst.osmnx_helper import BaseOsmMap
|
144
|
+
osm_map = BaseOsmMap(osmnx_graph=my_graph, df=my_dataframe)
|
145
|
+
osm_map.generate_map()
|
146
|
+
```
|
147
|
+
|
148
|
+
### **Geopy Helper (`sibi_dst.geopy_helper`)**
|
149
|
+
|
150
|
+
Provides **geolocation services** using `Geopy` for forward and reverse geocoding.
|
151
|
+
|
152
|
+
#### 🔹 Key Features
|
153
|
+
- **GeolocationService**: Interfaces with `Nominatim` API for geocoding.
|
154
|
+
- **Error Handling**: Manages `GeocoderTimedOut` and `GeocoderServiceError` gracefully.
|
155
|
+
- **Singleton Geolocator**: Efficiently reuses a global geolocator instance.
|
156
|
+
|
157
|
+
#### Example: Reverse Geocoding
|
158
|
+
```python
|
159
|
+
from sibi_dst.geopy_helper import GeolocationService
|
160
|
+
gs = GeolocationService()
|
161
|
+
location = gs.reverse_geocode(lat=9.935, lon=-84.091)
|
162
|
+
print(location)
|
163
|
+
```
|
164
|
+
|
165
|
+
Advanced Features
|
166
|
+
---------------------
|
167
|
+
### Querying with Custom Filters
|
168
|
+
Filters can be applied dynamically using Django-style syntax:
|
169
|
+
```python
|
170
|
+
result = df_helper.load(date__gte='2023-01-01', status='active')
|
171
|
+
```
|
172
|
+
|
173
|
+
### Parallel Processing
|
174
|
+
Leverage Dask for parallel execution:
|
175
|
+
```python
|
176
|
+
results = df_helper.load_parallel(status='active')
|
177
|
+
```
|
178
|
+
|
179
|
+
Testing
|
180
|
+
---------------------
|
181
|
+
To run unit tests, use:
|
182
|
+
```bash
|
183
|
+
pytest tests/
|
184
|
+
```
|
185
|
+
|
186
|
+
Contributing
|
187
|
+
---------------------
|
188
|
+
Contributions are welcome! Please submit pull requests or open issues for discussions.
|
189
|
+
|
190
|
+
License
|
191
|
+
---------------------
|
192
|
+
sibi-dst is licensed under the MIT License.
|
193
|
+
|
194
|
+
|
@@ -0,0 +1,145 @@
|
|
1
|
+
# sibi-dst
|
2
|
+
|
3
|
+
Data Science Toolkit
|
4
|
+
---------------------
|
5
|
+
Data Science Toolkit built with Python, Pandas, Dask, OpenStreetMap, Scikit-Learn, XGBoost, Django ORM, SQLAlchemy, Django REST Framework, and FastAPI.
|
6
|
+
|
7
|
+
Major Functionality
|
8
|
+
--------------------
|
9
|
+
1) **Build DataCubes, DataSets, and DataObjects** from diverse data sources, including **relational databases, Parquet files, Excel (`.xlsx`), delimited tables (`.csv`, `.tsv`), JSON, and RESTful APIs (`JSON API REST`)**.
|
10
|
+
2) **Comprehensive DataFrame Management** utilities for efficient data handling, transformation, and optimization using **Pandas** and **Dask**.
|
11
|
+
3) **Flexible Data Sharing** with client applications by writing to **Data Warehouses, local filesystems, and cloud storage platforms** such as **Amazon S3, Google Cloud Storage (GCS), and Azure Blob Storage**.
|
12
|
+
4) **Microservices for Data Access** – Build scalable **API-driven services** using **RESTful APIs (`Django REST Framework`, `FastAPI`) and gRPC** for high-performance data exchange.
|
13
|
+
|
14
|
+
Supported Technologies
|
15
|
+
--------------------
|
16
|
+
- **Data Processing**: Pandas, Dask
|
17
|
+
- **Machine Learning**: Scikit-Learn, XGBoost
|
18
|
+
- **Databases & Storage**: SQLAlchemy, Django ORM, Parquet, Amazon S3, GCS, Azure Blob Storage
|
19
|
+
- **Mapping & Geospatial Analysis**: OpenStreetMap, OSMnx, Geopy
|
20
|
+
- **API Development**: Django REST Framework, gRPC
|
21
|
+
|
22
|
+
Installation
|
23
|
+
---------------------
|
24
|
+
```bash
|
25
|
+
pip install sibi-dst
|
26
|
+
```
|
27
|
+
|
28
|
+
Usage
|
29
|
+
---------------------
|
30
|
+
### Loading Data from SQLAlchemy
|
31
|
+
```python
|
32
|
+
from sibi_dst.df_helper import DfHelper
|
33
|
+
from conf.transforms.fields.crm import customer_fields
|
34
|
+
from conf.credentials import replica_db_conf
|
35
|
+
from conf.storage import get_fs_instance
|
36
|
+
|
37
|
+
config = {
|
38
|
+
'backend': 'sqlalchemy',
|
39
|
+
'connection_url': replica_db_conf.get('db_url'),
|
40
|
+
'table': 'crm_clientes_archivo',
|
41
|
+
'field_map': customer_fields,
|
42
|
+
'legacy_filters': True,
|
43
|
+
'fs': get_fs_instance()
|
44
|
+
}
|
45
|
+
|
46
|
+
df_helper = DfHelper(**config)
|
47
|
+
result = df_helper.load(id__gte=1)
|
48
|
+
```
|
49
|
+
|
50
|
+
### Saving Data to ClickHouse
|
51
|
+
```python
|
52
|
+
clk_creds = {
|
53
|
+
'host': '192.168.3.171',
|
54
|
+
'port': 18123,
|
55
|
+
'user': 'username',
|
56
|
+
'database': 'xxxxxxx',
|
57
|
+
'table': 'customer_file',
|
58
|
+
'order_by': 'id'
|
59
|
+
}
|
60
|
+
|
61
|
+
df_helper.save_to_clickhouse(**clk_creds)
|
62
|
+
```
|
63
|
+
|
64
|
+
### Saving Data to Parquet
|
65
|
+
```python
|
66
|
+
df_helper.save_to_parquet(
|
67
|
+
parquet_filename='filename.parquet',
|
68
|
+
parquet_storage_path='/path/to/my/files/'
|
69
|
+
)
|
70
|
+
```
|
71
|
+
|
72
|
+
Backends Supported
|
73
|
+
---------------------
|
74
|
+
| Backend | Description |
|
75
|
+
|--------------|-------------|
|
76
|
+
| `sqlalchemy` | Load data from SQL databases using SQLAlchemy. |
|
77
|
+
| `django_db` | Load data from Django ORM models. |
|
78
|
+
| `parquet` | Load data from and save data to Parquet files. |
|
79
|
+
| `http` | Fetch data from HTTP endpoints. |
|
80
|
+
| `osmnx` | Geospatial mapping and routing using OpenStreetMap. |
|
81
|
+
| `geopy` | Geolocation services for address lookup and reverse geocoding. |
|
82
|
+
|
83
|
+
Geospatial Utilities
|
84
|
+
---------------------
|
85
|
+
### **OSMnx Helper (`sibi_dst.osmnx_helper`)**
|
86
|
+
|
87
|
+
Provides **OpenStreetMap-based mapping utilities** using `osmnx` and `folium`.
|
88
|
+
|
89
|
+
#### 🔹 Key Features
|
90
|
+
- **BaseOsmMap**: Manages interactive Folium-based maps.
|
91
|
+
- **PBFHandler**: Loads `.pbf` (Protocolbuffer Binary Format) files for network graphs.
|
92
|
+
|
93
|
+
#### Example: Generating an OSM Map
|
94
|
+
```python
|
95
|
+
from sibi_dst.osmnx_helper import BaseOsmMap
|
96
|
+
osm_map = BaseOsmMap(osmnx_graph=my_graph, df=my_dataframe)
|
97
|
+
osm_map.generate_map()
|
98
|
+
```
|
99
|
+
|
100
|
+
### **Geopy Helper (`sibi_dst.geopy_helper`)**
|
101
|
+
|
102
|
+
Provides **geolocation services** using `Geopy` for forward and reverse geocoding.
|
103
|
+
|
104
|
+
#### 🔹 Key Features
|
105
|
+
- **GeolocationService**: Interfaces with `Nominatim` API for geocoding.
|
106
|
+
- **Error Handling**: Manages `GeocoderTimedOut` and `GeocoderServiceError` gracefully.
|
107
|
+
- **Singleton Geolocator**: Efficiently reuses a global geolocator instance.
|
108
|
+
|
109
|
+
#### Example: Reverse Geocoding
|
110
|
+
```python
|
111
|
+
from sibi_dst.geopy_helper import GeolocationService
|
112
|
+
gs = GeolocationService()
|
113
|
+
location = gs.reverse_geocode(lat=9.935, lon=-84.091)
|
114
|
+
print(location)
|
115
|
+
```
|
116
|
+
|
117
|
+
Advanced Features
|
118
|
+
---------------------
|
119
|
+
### Querying with Custom Filters
|
120
|
+
Filters can be applied dynamically using Django-style syntax:
|
121
|
+
```python
|
122
|
+
result = df_helper.load(date__gte='2023-01-01', status='active')
|
123
|
+
```
|
124
|
+
|
125
|
+
### Parallel Processing
|
126
|
+
Leverage Dask for parallel execution:
|
127
|
+
```python
|
128
|
+
results = df_helper.load_parallel(status='active')
|
129
|
+
```
|
130
|
+
|
131
|
+
Testing
|
132
|
+
---------------------
|
133
|
+
To run unit tests, use:
|
134
|
+
```bash
|
135
|
+
pytest tests/
|
136
|
+
```
|
137
|
+
|
138
|
+
Contributing
|
139
|
+
---------------------
|
140
|
+
Contributions are welcome! Please submit pull requests or open issues for discussions.
|
141
|
+
|
142
|
+
License
|
143
|
+
---------------------
|
144
|
+
sibi-dst is licensed under the MIT License.
|
145
|
+
|
@@ -25,7 +25,7 @@ class ArtifactUpdaterMultiWrapper:
|
|
25
25
|
def __init__(self, wrapped_classes=None, debug=False, **kwargs):
|
26
26
|
self.wrapped_classes = wrapped_classes or {}
|
27
27
|
self.debug = debug
|
28
|
-
self.logger = Logger.default_logger(logger_name=self.__class__.__name__)
|
28
|
+
self.logger = kwargs.setdefault('logger',Logger.default_logger(logger_name=self.__class__.__name__))
|
29
29
|
self.logger.set_level(logging.DEBUG if debug else logging.INFO)
|
30
30
|
|
31
31
|
today = datetime.datetime.today()
|
@@ -73,7 +73,12 @@ class ArtifactUpdaterMultiWrapper:
|
|
73
73
|
raise ValueError(f"Unsupported data type: {data_type}")
|
74
74
|
|
75
75
|
return [
|
76
|
-
artifact_class(
|
76
|
+
artifact_class(
|
77
|
+
parquet_start_date=self.parquet_start_date,
|
78
|
+
parquet_end_date=self.parquet_end_date,
|
79
|
+
logger=self.logger,
|
80
|
+
debug=self.debug
|
81
|
+
)
|
77
82
|
for artifact_class in self.wrapped_classes[data_type]
|
78
83
|
]
|
79
84
|
|
@@ -112,6 +112,7 @@ class DfHelper:
|
|
112
112
|
:return: None
|
113
113
|
"""
|
114
114
|
self.logger.debug(f"backend used: {self.backend}")
|
115
|
+
self.logger.debug(f"kwargs passed to backend plugins: {kwargs}")
|
115
116
|
self._backend_query = self.__get_config(QueryConfig, kwargs)
|
116
117
|
self._backend_params = self.__get_config(ParamsConfig, kwargs)
|
117
118
|
if self.backend == 'django_db':
|
@@ -124,8 +125,8 @@ class DfHelper:
|
|
124
125
|
elif self.backend == 'sqlalchemy':
|
125
126
|
self.backend_sqlalchemy = self.__get_config(SqlAlchemyConnectionConfig, kwargs)
|
126
127
|
|
127
|
-
|
128
|
-
def __get_config(model: [T], kwargs: Dict[str, Any]) -> Union[T]:
|
128
|
+
|
129
|
+
def __get_config(self, model: [T], kwargs: Dict[str, Any]) -> Union[T]:
|
129
130
|
"""
|
130
131
|
Initializes a Pydantic model with the keys it recognizes from the kwargs,
|
131
132
|
and removes those keys from the kwargs dictionary.
|
@@ -135,7 +136,9 @@ class DfHelper:
|
|
135
136
|
"""
|
136
137
|
# Extract keys that the model can accept
|
137
138
|
recognized_keys = set(model.__annotations__.keys())
|
139
|
+
self.logger.debug(f"recognized keys: {recognized_keys}")
|
138
140
|
model_kwargs = {k: kwargs.pop(k) for k in list(kwargs.keys()) if k in recognized_keys}
|
141
|
+
self.logger.debug(f"model_kwargs: {model_kwargs}")
|
139
142
|
return model(**model_kwargs)
|
140
143
|
|
141
144
|
def load_parallel(self, **options):
|
@@ -1,11 +1,12 @@
|
|
1
|
+
import datetime
|
2
|
+
import logging
|
1
3
|
from typing import Optional, Any, Dict
|
2
4
|
|
3
5
|
import dask.dataframe as dd
|
4
6
|
import fsspec
|
5
7
|
|
6
8
|
from sibi_dst.df_helper import DfHelper
|
7
|
-
from sibi_dst.utils import DataWrapper
|
8
|
-
from sibi_dst.utils import DateUtils
|
9
|
+
from sibi_dst.utils import DataWrapper, DateUtils, Logger
|
9
10
|
|
10
11
|
|
11
12
|
class ParquetArtifact(DfHelper):
|
@@ -82,7 +83,12 @@ class ParquetArtifact(DfHelper):
|
|
82
83
|
**kwargs,
|
83
84
|
}
|
84
85
|
self.df: Optional[dd.DataFrame] = None
|
86
|
+
self.debug = self.config.setdefault('debug', False)
|
87
|
+
self.logger = self.config.setdefault('logger',Logger.default_logger(logger_name=f'parquet_artifact_{__class__.__name__}'))
|
88
|
+
self.logger.set_level(logging.DEBUG if self.debug else logging.INFO)
|
85
89
|
self.data_wrapper_class = data_wrapper_class
|
90
|
+
self.class_params = self.config.setdefault('class_params', None)
|
91
|
+
self.load_params = self.config.setdefault('load_params', None)
|
86
92
|
self.date_field = self.config.setdefault('date_field', None)
|
87
93
|
if self.date_field is None:
|
88
94
|
raise ValueError('date_field must be set')
|
@@ -131,7 +137,30 @@ class ParquetArtifact(DfHelper):
|
|
131
137
|
|
132
138
|
def update_parquet(self, period: str = 'today', **kwargs) -> None:
|
133
139
|
"""Update the Parquet file with data from a specific period."""
|
134
|
-
|
140
|
+
|
141
|
+
def itd_config():
|
142
|
+
try:
|
143
|
+
start_date = kwargs.pop('history_begins_on')
|
144
|
+
except KeyError:
|
145
|
+
raise ValueError("For period 'itd', you must provide 'history_begins_on' in kwargs.")
|
146
|
+
return {'parquet_start_date': start_date, 'parquet_end_date': datetime.date.today().strftime('%Y-%m-%d')}
|
147
|
+
|
148
|
+
def ytd_config():
|
149
|
+
return {
|
150
|
+
'parquet_start_date': datetime.date(datetime.date.today().year, 1, 1).strftime('%Y-%m-%d'),
|
151
|
+
'parquet_end_date': datetime.date.today().strftime('%Y-%m-%d')
|
152
|
+
}
|
153
|
+
|
154
|
+
config_map = {
|
155
|
+
'itd': itd_config,
|
156
|
+
'ytd': ytd_config
|
157
|
+
}
|
158
|
+
|
159
|
+
if period in config_map:
|
160
|
+
kwargs.update(config_map[period]())
|
161
|
+
else:
|
162
|
+
kwargs.update(self.parse_parquet_period(period=period))
|
163
|
+
print(kwargs)
|
135
164
|
self.generate_parquet(**kwargs)
|
136
165
|
|
137
166
|
def rebuild_parquet(self, **kwargs) -> None:
|
@@ -150,6 +179,7 @@ class ParquetArtifact(DfHelper):
|
|
150
179
|
|
151
180
|
def _prepare_params(self, kwargs: Dict[str, Any]) -> Dict[str, Any]:
|
152
181
|
"""Prepare the parameters for generating the Parquet file."""
|
182
|
+
kwargs = {**self.config, **kwargs}
|
153
183
|
return {
|
154
184
|
'class_params': kwargs.pop('class_params', None),
|
155
185
|
'date_field': kwargs.pop('date_field', self.date_field),
|
@@ -1,10 +1,11 @@
|
|
1
|
+
import logging
|
1
2
|
from typing import Optional
|
2
3
|
|
3
4
|
import dask.dataframe as dd
|
4
5
|
import fsspec
|
5
6
|
|
6
7
|
from sibi_dst.df_helper import DfHelper
|
7
|
-
|
8
|
+
from sibi_dst.utils import Logger
|
8
9
|
|
9
10
|
class ParquetReader(DfHelper):
|
10
11
|
"""
|
@@ -53,6 +54,9 @@ class ParquetReader(DfHelper):
|
|
53
54
|
**kwargs,
|
54
55
|
}
|
55
56
|
self.df: Optional[dd.DataFrame] = None
|
57
|
+
self.debug = self.config.setdefault('debug', False)
|
58
|
+
self.logger = self.config.setdefault('logger', Logger.default_logger(logger_name=self.__class__.__name__))
|
59
|
+
self.logger.set_level(logging.DEBUG if self.debug else logging.INFO)
|
56
60
|
self.parquet_storage_path = self.config.setdefault('parquet_storage_path', None)
|
57
61
|
if self.parquet_storage_path is None:
|
58
62
|
raise ValueError('parquet_storage_path must be set')
|
@@ -64,6 +64,7 @@ class DjangoLoadFromDb:
|
|
64
64
|
self.connection_config = db_connection
|
65
65
|
self.debug = kwargs.pop('debug', False)
|
66
66
|
self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
|
67
|
+
self.logger.set_level(Logger.DEBUG if self.debug else Logger.INFO)
|
67
68
|
if self.connection_config.model is None:
|
68
69
|
if self.debug:
|
69
70
|
self.logger.debug('Model must be specified')
|
@@ -17,8 +17,9 @@ class ParquetFilterHandler(object):
|
|
17
17
|
:ivar logger: Logger object to handle logging within the class. Defaults to the class-level logger.
|
18
18
|
:type logger: Logger
|
19
19
|
"""
|
20
|
-
def __init__(self, logger=None):
|
20
|
+
def __init__(self, logger=None, debug=False):
|
21
21
|
self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
|
22
|
+
self.logger.set_level(Logger.DEBUG if debug else Logger.INFO)
|
22
23
|
|
23
24
|
@staticmethod
|
24
25
|
def apply_filters_dask(df, filters):
|
@@ -62,6 +62,7 @@ class ParquetConfig(BaseModel):
|
|
62
62
|
parquet_end_date: Optional[str] = None
|
63
63
|
fs: Optional[fsspec.spec.AbstractFileSystem] = None # Your fsspec filesystem object
|
64
64
|
logger: Optional[Logger] = None
|
65
|
+
debug: bool = False
|
65
66
|
model_config = ConfigDict(arbitrary_types_allowed=True)
|
66
67
|
|
67
68
|
@model_validator(mode='after')
|
@@ -83,9 +84,7 @@ class ParquetConfig(BaseModel):
|
|
83
84
|
# Configure paths based on fsspec
|
84
85
|
if self.logger is None:
|
85
86
|
self.logger = Logger.default_logger(logger_name=self.__class__.__name__)
|
86
|
-
|
87
|
-
# str(self.parquet_storage_path).split("://")[0])
|
88
|
-
# Validation for parquet path
|
87
|
+
self.logger.set_level(Logger.DEBUG if self.debug else Logger.INFO)
|
89
88
|
|
90
89
|
|
91
90
|
if self.parquet_storage_path is None:
|
@@ -29,6 +29,7 @@ class SQLAlchemyDask:
|
|
29
29
|
self.engine = create_engine(engine_url)
|
30
30
|
self.Session = sessionmaker(bind=self.engine)
|
31
31
|
self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
|
32
|
+
self.logger.set_level(logger.DEBUG if debug else logger.INFO)
|
32
33
|
|
33
34
|
@staticmethod
|
34
35
|
def infer_dtypes_from_model(model):
|
@@ -70,11 +71,7 @@ class SQLAlchemyDask:
|
|
70
71
|
# Build query
|
71
72
|
self.query = select(self.model)
|
72
73
|
if self.filters:
|
73
|
-
""
|
74
|
-
deprecated specific filter handling to a generic one
|
75
|
-
#self.query = SqlAlchemyFilterHandler.apply_filters_sqlalchemy(self.query, self.model, self.filters)
|
76
|
-
"""
|
77
|
-
self.query = FilterHandler(backend="sqlalchemy", logger=self.logger).apply_filters(self.query,
|
74
|
+
self.query = FilterHandler(backend="sqlalchemy", logger=self.logger, debug=self.debug).apply_filters(self.query,
|
78
75
|
model=self.model,
|
79
76
|
filters=self.filters)
|
80
77
|
else:
|
@@ -25,7 +25,7 @@ class FilterHandler:
|
|
25
25
|
:ivar backend_methods: A dictionary mapping backend-specific methods for column retrieval and operation application.
|
26
26
|
:type backend_methods: dict
|
27
27
|
"""
|
28
|
-
def __init__(self, backend, logger=None):
|
28
|
+
def __init__(self, backend, logger=None, debug=False):
|
29
29
|
"""
|
30
30
|
Initialize the FilterHandler.
|
31
31
|
|
@@ -36,6 +36,7 @@ class FilterHandler:
|
|
36
36
|
self.backend = backend
|
37
37
|
self.logger = logger or Logger.default_logger(
|
38
38
|
logger_name=self.__class__.__name__) # No-op logger if none provided
|
39
|
+
self.logger.set_level(Logger.DEBUG if debug else Logger.INFO)
|
39
40
|
self.backend_methods = self._get_backend_methods(backend)
|
40
41
|
|
41
42
|
def apply_filters(self, query_or_df, model=None, filters=None):
|