sibi-dst 0.3.42.tar.gz → 0.3.44.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65)
  1. sibi_dst-0.3.44/PKG-INFO +194 -0
  2. sibi_dst-0.3.44/README.md +145 -0
  3. {sibi_dst-0.3.42 → sibi_dst-0.3.44}/pyproject.toml +1 -1
  4. {sibi_dst-0.3.42 → sibi_dst-0.3.44}/sibi_dst/df_helper/_artifact_updater_multi_wrapper.py +7 -2
  5. {sibi_dst-0.3.42 → sibi_dst-0.3.44}/sibi_dst/df_helper/_df_helper.py +5 -2
  6. {sibi_dst-0.3.42 → sibi_dst-0.3.44}/sibi_dst/df_helper/_parquet_artifact.py +33 -3
  7. {sibi_dst-0.3.42 → sibi_dst-0.3.44}/sibi_dst/df_helper/_parquet_reader.py +5 -1
  8. {sibi_dst-0.3.42 → sibi_dst-0.3.44}/sibi_dst/df_helper/backends/django/_load_from_db.py +1 -0
  9. {sibi_dst-0.3.42 → sibi_dst-0.3.44}/sibi_dst/df_helper/backends/parquet/_filter_handler.py +2 -1
  10. {sibi_dst-0.3.42 → sibi_dst-0.3.44}/sibi_dst/df_helper/backends/parquet/_parquet_options.py +2 -3
  11. {sibi_dst-0.3.42 → sibi_dst-0.3.44}/sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +1 -0
  12. {sibi_dst-0.3.42 → sibi_dst-0.3.44}/sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +2 -5
  13. {sibi_dst-0.3.42 → sibi_dst-0.3.44}/sibi_dst/df_helper/core/_filter_handler.py +2 -1
  14. {sibi_dst-0.3.42 → sibi_dst-0.3.44}/sibi_dst/osmnx_helper/__init__.py +2 -2
  15. {sibi_dst-0.3.42/sibi_dst/osmnx_helper → sibi_dst-0.3.44/sibi_dst/osmnx_helper/v1}/basemaps/router_plotter.py +85 -30
  16. sibi_dst-0.3.44/sibi_dst/osmnx_helper/v2/__init__.py +0 -0
  17. sibi_dst-0.3.44/sibi_dst/osmnx_helper/v2/base_osm_map.py +153 -0
  18. sibi_dst-0.3.44/sibi_dst/osmnx_helper/v2/basemaps/__init__.py +0 -0
  19. sibi_dst-0.3.44/sibi_dst/osmnx_helper/v2/basemaps/utils.py +0 -0
  20. sibi_dst-0.3.44/sibi_dst/tests/__init__.py +0 -0
  21. sibi_dst-0.3.44/sibi_dst/utils/data_wrapper.py +301 -0
  22. {sibi_dst-0.3.42 → sibi_dst-0.3.44}/sibi_dst/utils/df_utils.py +7 -0
  23. {sibi_dst-0.3.42 → sibi_dst-0.3.44}/sibi_dst/utils/log_utils.py +6 -0
  24. {sibi_dst-0.3.42 → sibi_dst-0.3.44}/sibi_dst/utils/parquet_saver.py +4 -2
  25. {sibi_dst-0.3.42 → sibi_dst-0.3.44}/sibi_dst/utils/storage_manager.py +14 -7
  26. sibi_dst-0.3.42/PKG-INFO +0 -62
  27. sibi_dst-0.3.42/README.md +0 -13
  28. sibi_dst-0.3.42/sibi_dst/utils/data_wrapper.py +0 -665
  29. {sibi_dst-0.3.42 → sibi_dst-0.3.44}/sibi_dst/__init__.py +0 -0
  30. {sibi_dst-0.3.42 → sibi_dst-0.3.44}/sibi_dst/df_helper/__init__.py +0 -0
  31. {sibi_dst-0.3.42 → sibi_dst-0.3.44}/sibi_dst/df_helper/backends/__init__.py +0 -0
  32. {sibi_dst-0.3.42 → sibi_dst-0.3.44}/sibi_dst/df_helper/backends/django/__init__.py +0 -0
  33. {sibi_dst-0.3.42 → sibi_dst-0.3.44}/sibi_dst/df_helper/backends/django/_db_connection.py +0 -0
  34. {sibi_dst-0.3.42 → sibi_dst-0.3.44}/sibi_dst/df_helper/backends/django/_io_dask.py +0 -0
  35. {sibi_dst-0.3.42 → sibi_dst-0.3.44}/sibi_dst/df_helper/backends/django/_sql_model_builder.py +0 -0
  36. {sibi_dst-0.3.42 → sibi_dst-0.3.44}/sibi_dst/df_helper/backends/http/__init__.py +0 -0
  37. {sibi_dst-0.3.42 → sibi_dst-0.3.44}/sibi_dst/df_helper/backends/http/_http_config.py +0 -0
  38. {sibi_dst-0.3.42 → sibi_dst-0.3.44}/sibi_dst/df_helper/backends/parquet/__init__.py +0 -0
  39. {sibi_dst-0.3.42 → sibi_dst-0.3.44}/sibi_dst/df_helper/backends/sqlalchemy/__init__.py +0 -0
  40. {sibi_dst-0.3.42 → sibi_dst-0.3.44}/sibi_dst/df_helper/backends/sqlalchemy/_filter_handler.py +0 -0
  41. {sibi_dst-0.3.42 → sibi_dst-0.3.44}/sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +0 -0
  42. {sibi_dst-0.3.42 → sibi_dst-0.3.44}/sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +0 -0
  43. {sibi_dst-0.3.42 → sibi_dst-0.3.44}/sibi_dst/df_helper/core/__init__.py +0 -0
  44. {sibi_dst-0.3.42 → sibi_dst-0.3.44}/sibi_dst/df_helper/core/_defaults.py +0 -0
  45. {sibi_dst-0.3.42 → sibi_dst-0.3.44}/sibi_dst/df_helper/core/_params_config.py +0 -0
  46. {sibi_dst-0.3.42 → sibi_dst-0.3.44}/sibi_dst/df_helper/core/_query_config.py +0 -0
  47. {sibi_dst-0.3.42 → sibi_dst-0.3.44}/sibi_dst/df_helper/data_cleaner.py +0 -0
  48. {sibi_dst-0.3.42 → sibi_dst-0.3.44}/sibi_dst/geopy_helper/__init__.py +0 -0
  49. {sibi_dst-0.3.42 → sibi_dst-0.3.44}/sibi_dst/geopy_helper/geo_location_service.py +0 -0
  50. {sibi_dst-0.3.42 → sibi_dst-0.3.44}/sibi_dst/geopy_helper/utils.py +0 -0
  51. {sibi_dst-0.3.42/sibi_dst/osmnx_helper/basemaps → sibi_dst-0.3.44/sibi_dst/osmnx_helper/v1}/__init__.py +0 -0
  52. {sibi_dst-0.3.42/sibi_dst/osmnx_helper → sibi_dst-0.3.44/sibi_dst/osmnx_helper/v1}/base_osm_map.py +0 -0
  53. {sibi_dst-0.3.42/sibi_dst/tests → sibi_dst-0.3.44/sibi_dst/osmnx_helper/v1/basemaps}/__init__.py +0 -0
  54. {sibi_dst-0.3.42/sibi_dst/osmnx_helper → sibi_dst-0.3.44/sibi_dst/osmnx_helper/v1}/basemaps/calendar_html.py +0 -0
  55. {sibi_dst-0.3.42/sibi_dst/osmnx_helper → sibi_dst-0.3.44/sibi_dst/osmnx_helper/v1}/utils.py +0 -0
  56. {sibi_dst-0.3.42 → sibi_dst-0.3.44}/sibi_dst/tests/test_data_wrapper_class.py +0 -0
  57. {sibi_dst-0.3.42 → sibi_dst-0.3.44}/sibi_dst/utils/__init__.py +0 -0
  58. {sibi_dst-0.3.42 → sibi_dst-0.3.44}/sibi_dst/utils/airflow_manager.py +0 -0
  59. {sibi_dst-0.3.42 → sibi_dst-0.3.44}/sibi_dst/utils/clickhouse_writer.py +0 -0
  60. {sibi_dst-0.3.42 → sibi_dst-0.3.44}/sibi_dst/utils/credentials.py +0 -0
  61. {sibi_dst-0.3.42 → sibi_dst-0.3.44}/sibi_dst/utils/data_utils.py +0 -0
  62. {sibi_dst-0.3.42 → sibi_dst-0.3.44}/sibi_dst/utils/date_utils.py +0 -0
  63. {sibi_dst-0.3.42 → sibi_dst-0.3.44}/sibi_dst/utils/file_utils.py +0 -0
  64. {sibi_dst-0.3.42 → sibi_dst-0.3.44}/sibi_dst/utils/filepath_generator.py +0 -0
  65. {sibi_dst-0.3.42 → sibi_dst-0.3.44}/sibi_dst/utils/phone_formatter.py +0 -0

sibi_dst-0.3.44/PKG-INFO
@@ -0,0 +1,194 @@
+Metadata-Version: 2.1
+Name: sibi-dst
+Version: 0.3.44
+Summary: Data Science Toolkit
+Author: Luis Valverde
+Author-email: lvalverdeb@gmail.com
+Requires-Python: >=3.11,<4.0
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
+Requires-Dist: apache-airflow-client (>=2.10.0,<3.0.0)
+Requires-Dist: chardet (>=5.2.0,<6.0.0)
+Requires-Dist: charset-normalizer (>=3.4.0,<4.0.0)
+Requires-Dist: clickhouse-connect (>=0.8.7,<0.9.0)
+Requires-Dist: clickhouse-driver (>=0.2.9,<0.3.0)
+Requires-Dist: dask-expr (>=1.1.20,<2.0.0)
+Requires-Dist: dask[complete] (>=2024.11.1,<2025.0.0)
+Requires-Dist: django (>=5.1.4,<6.0.0)
+Requires-Dist: djangorestframework (>=3.15.2,<4.0.0)
+Requires-Dist: folium (>=0.19.4,<0.20.0)
+Requires-Dist: geopandas (>=1.0.1,<2.0.0)
+Requires-Dist: gunicorn (>=23.0.0,<24.0.0)
+Requires-Dist: httpx (>=0.27.2,<0.28.0)
+Requires-Dist: ipython (>=8.29.0,<9.0.0)
+Requires-Dist: jinja2 (>=3.1.4,<4.0.0)
+Requires-Dist: mysqlclient (>=2.2.6,<3.0.0)
+Requires-Dist: nltk (>=3.9.1,<4.0.0)
+Requires-Dist: openpyxl (>=3.1.5,<4.0.0)
+Requires-Dist: osmnx (>=2.0.1,<3.0.0)
+Requires-Dist: pandas (>=2.2.3,<3.0.0)
+Requires-Dist: paramiko (>=3.5.0,<4.0.0)
+Requires-Dist: psutil (>=6.1.0,<7.0.0)
+Requires-Dist: psycopg2 (>=2.9.10,<3.0.0)
+Requires-Dist: pyarrow (>=18.0.0,<19.0.0)
+Requires-Dist: pydantic (>=2.9.2,<3.0.0)
+Requires-Dist: pymysql (>=1.1.1,<2.0.0)
+Requires-Dist: pytest (>=8.3.3,<9.0.0)
+Requires-Dist: pytest-mock (>=3.14.0,<4.0.0)
+Requires-Dist: python-dotenv (>=1.0.1,<2.0.0)
+Requires-Dist: s3fs (>=2024.12.0,<2025.0.0)
+Requires-Dist: sqlalchemy (>=2.0.36,<3.0.0)
+Requires-Dist: tornado (>=6.4.1,<7.0.0)
+Requires-Dist: tqdm (>=4.67.0,<5.0.0)
+Requires-Dist: uvicorn (>=0.34.0,<0.35.0)
+Requires-Dist: uvicorn-worker (>=0.3.0,<0.4.0)
+Description-Content-Type: text/markdown
+
+# sibi-dst
+
+Data Science Toolkit
+---------------------
+Data Science Toolkit built with Python, Pandas, Dask, OpenStreetMap, Scikit-Learn, XGBoost, Django ORM, SQLAlchemy, Django REST Framework, and FastAPI.
+
+Major Functionality
+--------------------
+1) **Build DataCubes, DataSets, and DataObjects** from diverse data sources, including **relational databases, Parquet files, Excel (`.xlsx`), delimited tables (`.csv`, `.tsv`), JSON, and RESTful JSON APIs**.
+2) **Comprehensive DataFrame Management** utilities for efficient data handling, transformation, and optimization using **Pandas** and **Dask**.
+3) **Flexible Data Sharing** with client applications by writing to **data warehouses, local filesystems, and cloud storage platforms** such as **Amazon S3, Google Cloud Storage (GCS), and Azure Blob Storage**.
+4) **Microservices for Data Access**: build scalable **API-driven services** using **RESTful APIs (`Django REST Framework`, `FastAPI`) and gRPC** for high-performance data exchange.
+
+Supported Technologies
+--------------------
+- **Data Processing**: Pandas, Dask
+- **Machine Learning**: Scikit-Learn, XGBoost
+- **Databases & Storage**: SQLAlchemy, Django ORM, Parquet, Amazon S3, GCS, Azure Blob Storage
+- **Mapping & Geospatial Analysis**: OpenStreetMap, OSMnx, Geopy
+- **API Development**: Django REST Framework, gRPC
+
+Installation
+---------------------
+```bash
+pip install sibi-dst
+```
+
+Usage
+---------------------
+### Loading Data from SQLAlchemy
+```python
+from sibi_dst.df_helper import DfHelper
+from conf.transforms.fields.crm import customer_fields
+from conf.credentials import replica_db_conf
+from conf.storage import get_fs_instance
+
+config = {
+    'backend': 'sqlalchemy',
+    'connection_url': replica_db_conf.get('db_url'),
+    'table': 'crm_clientes_archivo',
+    'field_map': customer_fields,
+    'legacy_filters': True,
+    'fs': get_fs_instance()
+}
+
+df_helper = DfHelper(**config)
+result = df_helper.load(id__gte=1)
+```
+
+### Saving Data to ClickHouse
+```python
+clk_creds = {
+    'host': '192.168.3.171',
+    'port': 18123,
+    'user': 'username',
+    'database': 'xxxxxxx',
+    'table': 'customer_file',
+    'order_by': 'id'
+}
+
+df_helper.save_to_clickhouse(**clk_creds)
+```
+
+### Saving Data to Parquet
+```python
+df_helper.save_to_parquet(
+    parquet_filename='filename.parquet',
+    parquet_storage_path='/path/to/my/files/'
+)
+```
+
+Backends Supported
+---------------------
+| Backend      | Description |
+|--------------|-------------|
+| `sqlalchemy` | Load data from SQL databases using SQLAlchemy. |
+| `django_db`  | Load data from Django ORM models. |
+| `parquet`    | Load and save data from Parquet files. |
+| `http`       | Fetch data from HTTP endpoints. |
+| `osmnx`      | Geospatial mapping and routing using OpenStreetMap. |
+| `geopy`      | Geolocation services for address lookup and reverse geocoding. |
+
+Geospatial Utilities
+---------------------
+### **OSMnx Helper (`sibi_dst.osmnx_helper`)**
+Provides **OpenStreetMap-based mapping utilities** using `osmnx` and `folium`.
+
+#### 🔹 Key Features
+- **BaseOsmMap**: Manages interactive Folium-based maps.
+- **PBFHandler**: Loads `.pbf` (Protocolbuffer Binary Format) files for network graphs.
+
+#### Example: Generating an OSM Map
+```python
+from sibi_dst.osmnx_helper import BaseOsmMap
+osm_map = BaseOsmMap(osmnx_graph=my_graph, df=my_dataframe)
+osm_map.generate_map()
+```
+
+### **Geopy Helper (`sibi_dst.geopy_helper`)**
+Provides **geolocation services** using `Geopy` for forward and reverse geocoding.
+
+#### 🔹 Key Features
+- **GeolocationService**: Interfaces with the `Nominatim` API for geocoding.
+- **Error Handling**: Manages `GeocoderTimedOut` and `GeocoderServiceError` gracefully.
+- **Singleton Geolocator**: Efficiently reuses a global geolocator instance.
+
+#### Example: Reverse Geocoding
+```python
+from sibi_dst.geopy_helper import GeolocationService
+gs = GeolocationService()
+location = gs.reverse_geocode(lat=9.935, lon=-84.091)
+print(location)
+```
+
+Advanced Features
+---------------------
+### Querying with Custom Filters
+Filters can be applied dynamically using Django-style syntax:
+```python
+result = df_helper.load(date__gte='2023-01-01', status='active')
+```
+
+### Parallel Processing
+Leverage Dask for parallel execution:
+```python
+results = df_helper.load_parallel(status='active')
+```
+
+Testing
+---------------------
+To run unit tests, use:
+```bash
+pytest tests/
+```
+
+Contributing
+---------------------
+Contributions are welcome! Please submit pull requests or open issues for discussion.
+
+License
+---------------------
+sibi-dst is licensed under the MIT License.
+
+

sibi_dst-0.3.44/README.md
@@ -0,0 +1,145 @@
+# sibi-dst
+
+Data Science Toolkit
+---------------------
+Data Science Toolkit built with Python, Pandas, Dask, OpenStreetMap, Scikit-Learn, XGBoost, Django ORM, SQLAlchemy, Django REST Framework, and FastAPI.
+
+Major Functionality
+--------------------
+1) **Build DataCubes, DataSets, and DataObjects** from diverse data sources, including **relational databases, Parquet files, Excel (`.xlsx`), delimited tables (`.csv`, `.tsv`), JSON, and RESTful JSON APIs**.
+2) **Comprehensive DataFrame Management** utilities for efficient data handling, transformation, and optimization using **Pandas** and **Dask**.
+3) **Flexible Data Sharing** with client applications by writing to **data warehouses, local filesystems, and cloud storage platforms** such as **Amazon S3, Google Cloud Storage (GCS), and Azure Blob Storage**.
+4) **Microservices for Data Access**: build scalable **API-driven services** using **RESTful APIs (`Django REST Framework`, `FastAPI`) and gRPC** for high-performance data exchange.
+
+Supported Technologies
+--------------------
+- **Data Processing**: Pandas, Dask
+- **Machine Learning**: Scikit-Learn, XGBoost
+- **Databases & Storage**: SQLAlchemy, Django ORM, Parquet, Amazon S3, GCS, Azure Blob Storage
+- **Mapping & Geospatial Analysis**: OpenStreetMap, OSMnx, Geopy
+- **API Development**: Django REST Framework, gRPC
+
+Installation
+---------------------
+```bash
+pip install sibi-dst
+```
+
+Usage
+---------------------
+### Loading Data from SQLAlchemy
+```python
+from sibi_dst.df_helper import DfHelper
+from conf.transforms.fields.crm import customer_fields
+from conf.credentials import replica_db_conf
+from conf.storage import get_fs_instance
+
+config = {
+    'backend': 'sqlalchemy',
+    'connection_url': replica_db_conf.get('db_url'),
+    'table': 'crm_clientes_archivo',
+    'field_map': customer_fields,
+    'legacy_filters': True,
+    'fs': get_fs_instance()
+}
+
+df_helper = DfHelper(**config)
+result = df_helper.load(id__gte=1)
+```
+
+### Saving Data to ClickHouse
+```python
+clk_creds = {
+    'host': '192.168.3.171',
+    'port': 18123,
+    'user': 'username',
+    'database': 'xxxxxxx',
+    'table': 'customer_file',
+    'order_by': 'id'
+}
+
+df_helper.save_to_clickhouse(**clk_creds)
+```
+
+### Saving Data to Parquet
+```python
+df_helper.save_to_parquet(
+    parquet_filename='filename.parquet',
+    parquet_storage_path='/path/to/my/files/'
+)
+```
+
+Backends Supported
+---------------------
+| Backend      | Description |
+|--------------|-------------|
+| `sqlalchemy` | Load data from SQL databases using SQLAlchemy. |
+| `django_db`  | Load data from Django ORM models. |
+| `parquet`    | Load and save data from Parquet files. |
+| `http`       | Fetch data from HTTP endpoints. |
+| `osmnx`      | Geospatial mapping and routing using OpenStreetMap. |
+| `geopy`      | Geolocation services for address lookup and reverse geocoding. |
+
+Geospatial Utilities
+---------------------
+### **OSMnx Helper (`sibi_dst.osmnx_helper`)**
+Provides **OpenStreetMap-based mapping utilities** using `osmnx` and `folium`.
+
+#### 🔹 Key Features
+- **BaseOsmMap**: Manages interactive Folium-based maps.
+- **PBFHandler**: Loads `.pbf` (Protocolbuffer Binary Format) files for network graphs.
+
+#### Example: Generating an OSM Map
+```python
+from sibi_dst.osmnx_helper import BaseOsmMap
+osm_map = BaseOsmMap(osmnx_graph=my_graph, df=my_dataframe)
+osm_map.generate_map()
+```
+
+### **Geopy Helper (`sibi_dst.geopy_helper`)**
+Provides **geolocation services** using `Geopy` for forward and reverse geocoding.
+
+#### 🔹 Key Features
+- **GeolocationService**: Interfaces with the `Nominatim` API for geocoding.
+- **Error Handling**: Manages `GeocoderTimedOut` and `GeocoderServiceError` gracefully.
+- **Singleton Geolocator**: Efficiently reuses a global geolocator instance.
+
+#### Example: Reverse Geocoding
+```python
+from sibi_dst.geopy_helper import GeolocationService
+gs = GeolocationService()
+location = gs.reverse_geocode(lat=9.935, lon=-84.091)
+print(location)
+```
+
+Advanced Features
+---------------------
+### Querying with Custom Filters
+Filters can be applied dynamically using Django-style syntax:
+```python
+result = df_helper.load(date__gte='2023-01-01', status='active')
+```
+
+### Parallel Processing
+Leverage Dask for parallel execution:
+```python
+results = df_helper.load_parallel(status='active')
+```
+
+Testing
+---------------------
+To run unit tests, use:
+```bash
+pytest tests/
+```
+
+Contributing
+---------------------
+Contributions are welcome! Please submit pull requests or open issues for discussion.
+
+License
+---------------------
+sibi-dst is licensed under the MIT License.
+

{sibi_dst-0.3.42 → sibi_dst-0.3.44}/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "sibi-dst"
-version = "0.3.42"
+version = "0.3.44"
 description = "Data Science Toolkit"
 authors = ["Luis Valverde <lvalverdeb@gmail.com>"]
 readme = "README.md"

{sibi_dst-0.3.42 → sibi_dst-0.3.44}/sibi_dst/df_helper/_artifact_updater_multi_wrapper.py
@@ -25,7 +25,7 @@ class ArtifactUpdaterMultiWrapper:
     def __init__(self, wrapped_classes=None, debug=False, **kwargs):
         self.wrapped_classes = wrapped_classes or {}
         self.debug = debug
-        self.logger = Logger.default_logger(logger_name=self.__class__.__name__)
+        self.logger = kwargs.setdefault('logger', Logger.default_logger(logger_name=self.__class__.__name__))
         self.logger.set_level(logging.DEBUG if debug else logging.INFO)

         today = datetime.datetime.today()
@@ -73,7 +73,12 @@ class ArtifactUpdaterMultiWrapper:
             raise ValueError(f"Unsupported data type: {data_type}")

         return [
-            artifact_class(parquet_start_date=self.parquet_start_date, parquet_end_date=self.parquet_end_date)
+            artifact_class(
+                parquet_start_date=self.parquet_start_date,
+                parquet_end_date=self.parquet_end_date,
+                logger=self.logger,
+                debug=self.debug
+            )
             for artifact_class in self.wrapped_classes[data_type]
         ]

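With this change, `ArtifactUpdaterMultiWrapper` takes an optional shared logger from `kwargs` and forwards it, together with the `debug` flag, to every artifact class it instantiates. A minimal usage sketch: the artifact class and the `wrapped_classes` grouping are illustrative assumptions, and only the logger/debug plumbing comes from the diff.

```python
from sibi_dst.df_helper import ArtifactUpdaterMultiWrapper  # import path assumed
from sibi_dst.utils import Logger

class CustomersArtifact:  # hypothetical stand-in for a real artifact class
    def __init__(self, parquet_start_date=None, parquet_end_date=None, logger=None, debug=False):
        self.logger = logger  # receives the wrapper's shared logger

shared_logger = Logger.default_logger(logger_name='nightly_update')
wrapper = ArtifactUpdaterMultiWrapper(
    wrapped_classes={'customers': [CustomersArtifact]},  # keyed by data type
    debug=True,            # the shared logger's level becomes DEBUG
    logger=shared_logger,  # consumed via kwargs.setdefault('logger', ...)
)
```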

{sibi_dst-0.3.42 → sibi_dst-0.3.44}/sibi_dst/df_helper/_df_helper.py
@@ -112,6 +112,7 @@ class DfHelper:
         :return: None
         """
         self.logger.debug(f"backend used: {self.backend}")
+        self.logger.debug(f"kwargs passed to backend plugins: {kwargs}")
         self._backend_query = self.__get_config(QueryConfig, kwargs)
         self._backend_params = self.__get_config(ParamsConfig, kwargs)
         if self.backend == 'django_db':
@@ -124,8 +125,8 @@ class DfHelper:
         elif self.backend == 'sqlalchemy':
             self.backend_sqlalchemy = self.__get_config(SqlAlchemyConnectionConfig, kwargs)

-    @staticmethod
-    def __get_config(model: [T], kwargs: Dict[str, Any]) -> Union[T]:
+
+    def __get_config(self, model: [T], kwargs: Dict[str, Any]) -> Union[T]:
         """
         Initializes a Pydantic model with the keys it recognizes from the kwargs,
         and removes those keys from the kwargs dictionary.
@@ -135,7 +136,9 @@ class DfHelper:
         """
         # Extract keys that the model can accept
         recognized_keys = set(model.__annotations__.keys())
+        self.logger.debug(f"recognized keys: {recognized_keys}")
         model_kwargs = {k: kwargs.pop(k) for k in list(kwargs.keys()) if k in recognized_keys}
+        self.logger.debug(f"model_kwargs: {model_kwargs}")
         return model(**model_kwargs)

     def load_parallel(self, **options):
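The `__get_config` mechanism is easier to read in isolation: it partitions `kwargs` by a Pydantic model's annotated fields, popping the recognized keys so the next config model never sees them. A standalone sketch of that behavior (the model below is an illustrative stand-in, not the package's actual `QueryConfig`):

```python
from typing import Any, Dict
from pydantic import BaseModel

class QueryConfig(BaseModel):  # illustrative stand-in
    table: str | None = None
    connection_url: str | None = None

def get_config(model, kwargs: Dict[str, Any]):
    # Pop the keys the model declares; everything else stays in kwargs
    recognized_keys = set(model.__annotations__.keys())
    model_kwargs = {k: kwargs.pop(k) for k in list(kwargs.keys()) if k in recognized_keys}
    return model(**model_kwargs)

options = {'table': 'crm_clientes_archivo', 'legacy_filters': True}
cfg = get_config(QueryConfig, options)
print(cfg.table)  # 'crm_clientes_archivo'
print(options)    # {'legacy_filters': True} (the consumed key is gone)
```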

{sibi_dst-0.3.42 → sibi_dst-0.3.44}/sibi_dst/df_helper/_parquet_artifact.py
@@ -1,11 +1,12 @@
+import datetime
+import logging
 from typing import Optional, Any, Dict

 import dask.dataframe as dd
 import fsspec

 from sibi_dst.df_helper import DfHelper
-from sibi_dst.utils import DataWrapper
-from sibi_dst.utils import DateUtils
+from sibi_dst.utils import DataWrapper, DateUtils, Logger


 class ParquetArtifact(DfHelper):
@@ -82,7 +83,12 @@ class ParquetArtifact(DfHelper):
             **kwargs,
         }
         self.df: Optional[dd.DataFrame] = None
+        self.debug = self.config.setdefault('debug', False)
+        self.logger = self.config.setdefault('logger', Logger.default_logger(logger_name=f'parquet_artifact_{__class__.__name__}'))
+        self.logger.set_level(logging.DEBUG if self.debug else logging.INFO)
         self.data_wrapper_class = data_wrapper_class
+        self.class_params = self.config.setdefault('class_params', None)
+        self.load_params = self.config.setdefault('load_params', None)
         self.date_field = self.config.setdefault('date_field', None)
         if self.date_field is None:
             raise ValueError('date_field must be set')
@@ -131,7 +137,30 @@ class ParquetArtifact(DfHelper):

     def update_parquet(self, period: str = 'today', **kwargs) -> None:
         """Update the Parquet file with data from a specific period."""
-        kwargs.update(self.parse_parquet_period(period=period))
+
+        def itd_config():
+            try:
+                start_date = kwargs.pop('history_begins_on')
+            except KeyError:
+                raise ValueError("For period 'itd', you must provide 'history_begins_on' in kwargs.")
+            return {'parquet_start_date': start_date, 'parquet_end_date': datetime.date.today().strftime('%Y-%m-%d')}
+
+        def ytd_config():
+            return {
+                'parquet_start_date': datetime.date(datetime.date.today().year, 1, 1).strftime('%Y-%m-%d'),
+                'parquet_end_date': datetime.date.today().strftime('%Y-%m-%d')
+            }
+
+        config_map = {
+            'itd': itd_config,
+            'ytd': ytd_config
+        }
+
+        if period in config_map:
+            kwargs.update(config_map[period]())
+        else:
+            kwargs.update(self.parse_parquet_period(period=period))
+        print(kwargs)
         self.generate_parquet(**kwargs)

     def rebuild_parquet(self, **kwargs) -> None:
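The rewritten `update_parquet` adds two period shortcuts on top of `parse_parquet_period`: `'itd'` (inception to date, which requires a `history_begins_on` date in kwargs) and `'ytd'` (January 1 of the current year through today). A short usage sketch, assuming an already-constructed `ParquetArtifact` instance:

```python
# `artifact` is an existing ParquetArtifact (construction omitted here).

# Inception-to-date: the caller must say where history starts.
artifact.update_parquet(period='itd', history_begins_on='2020-01-01')

# Year-to-date: the start date is derived as Jan 1 of the current year.
artifact.update_parquet(period='ytd')

# Any other period still routes through parse_parquet_period:
artifact.update_parquet(period='today')
```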
@@ -150,6 +179,7 @@ class ParquetArtifact(DfHelper):

     def _prepare_params(self, kwargs: Dict[str, Any]) -> Dict[str, Any]:
         """Prepare the parameters for generating the Parquet file."""
+        kwargs = {**self.config, **kwargs}
         return {
             'class_params': kwargs.pop('class_params', None),
             'date_field': kwargs.pop('date_field', self.date_field),

{sibi_dst-0.3.42 → sibi_dst-0.3.44}/sibi_dst/df_helper/_parquet_reader.py
@@ -1,10 +1,11 @@
+import logging
 from typing import Optional

 import dask.dataframe as dd
 import fsspec

 from sibi_dst.df_helper import DfHelper
-
+from sibi_dst.utils import Logger

 class ParquetReader(DfHelper):
     """
@@ -53,6 +54,9 @@ class ParquetReader(DfHelper):
             **kwargs,
         }
         self.df: Optional[dd.DataFrame] = None
+        self.debug = self.config.setdefault('debug', False)
+        self.logger = self.config.setdefault('logger', Logger.default_logger(logger_name=self.__class__.__name__))
+        self.logger.set_level(logging.DEBUG if self.debug else logging.INFO)
         self.parquet_storage_path = self.config.setdefault('parquet_storage_path', None)
         if self.parquet_storage_path is None:
             raise ValueError('parquet_storage_path must be set')
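`ParquetReader` now resolves `debug` and `logger` from its config the same way, so callers can opt into verbose logging per reader. A sketch under the assumption that constructor keyword arguments flow into `self.config` as shown above and that `load()` is inherited from `DfHelper`:

```python
from sibi_dst.df_helper import ParquetReader  # import path assumed

reader = ParquetReader(
    parquet_storage_path='s3://bucket/warehouse/customers/',  # required; illustrative path
    debug=True,  # the reader's logger drops to DEBUG
)
df = reader.load()
```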

{sibi_dst-0.3.42 → sibi_dst-0.3.44}/sibi_dst/df_helper/backends/django/_load_from_db.py
@@ -64,6 +64,7 @@ class DjangoLoadFromDb:
         self.connection_config = db_connection
         self.debug = kwargs.pop('debug', False)
         self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
+        self.logger.set_level(Logger.DEBUG if self.debug else Logger.INFO)
         if self.connection_config.model is None:
             if self.debug:
                 self.logger.debug('Model must be specified')

{sibi_dst-0.3.42 → sibi_dst-0.3.44}/sibi_dst/df_helper/backends/parquet/_filter_handler.py
@@ -17,8 +17,9 @@ class ParquetFilterHandler(object):
     :ivar logger: Logger object to handle logging within the class. Defaults to the class-level logger.
     :type logger: Logger
     """
-    def __init__(self, logger=None):
+    def __init__(self, logger=None, debug=False):
         self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
+        self.logger.set_level(Logger.DEBUG if debug else Logger.INFO)

     @staticmethod
     def apply_filters_dask(df, filters):
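`ParquetFilterHandler` gains the same `debug` switch, while `apply_filters_dask` stays a staticmethod. A sketch assuming the Django-style filter dict used throughout the README:

```python
import dask.dataframe as dd
import pandas as pd
from sibi_dst.df_helper.backends.parquet import ParquetFilterHandler  # import path assumed

handler = ParquetFilterHandler(debug=True)  # instance logger drops to DEBUG

ddf = dd.from_pandas(
    pd.DataFrame({'status': ['active', 'closed'], 'id': [1, 2]}),
    npartitions=1,
)
# Filter keys follow the Django-style syntax shown in the README (assumed here).
filtered = ParquetFilterHandler.apply_filters_dask(ddf, {'status': 'active'})
```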

{sibi_dst-0.3.42 → sibi_dst-0.3.44}/sibi_dst/df_helper/backends/parquet/_parquet_options.py
@@ -62,6 +62,7 @@ class ParquetConfig(BaseModel):
     parquet_end_date: Optional[str] = None
     fs: Optional[fsspec.spec.AbstractFileSystem] = None  # Your fsspec filesystem object
     logger: Optional[Logger] = None
+    debug: bool = False
     model_config = ConfigDict(arbitrary_types_allowed=True)

     @model_validator(mode='after')
@@ -83,9 +84,7 @@ class ParquetConfig(BaseModel):
         # Configure paths based on fsspec
         if self.logger is None:
             self.logger = Logger.default_logger(logger_name=self.__class__.__name__)
-        #self.fs = fsspec.filesystem("file") if "://" not in str(self.parquet_storage_path) else fsspec.filesystem(
-        #    str(self.parquet_storage_path).split("://")[0])
-        # Validation for parquet path
+        self.logger.set_level(Logger.DEBUG if self.debug else Logger.INFO)


         if self.parquet_storage_path is None:

{sibi_dst-0.3.42 → sibi_dst-0.3.44}/sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py
@@ -63,3 +63,4 @@ class SqlAlchemyConnectionConfig(BaseModel):
                 connection.execute(text("SELECT 1"))
         except OperationalError as e:
             raise ValueError(f"Failed to connect to the database: {e}")
+

{sibi_dst-0.3.42 → sibi_dst-0.3.44}/sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py
@@ -29,6 +29,7 @@ class SQLAlchemyDask:
         self.engine = create_engine(engine_url)
         self.Session = sessionmaker(bind=self.engine)
         self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
+        self.logger.set_level(logger.DEBUG if debug else logger.INFO)

     @staticmethod
     def infer_dtypes_from_model(model):
@@ -70,11 +71,7 @@ class SQLAlchemyDask:
         # Build query
         self.query = select(self.model)
         if self.filters:
-            """
-            deprecated specific filter handling to a generic one
-            #self.query = SqlAlchemyFilterHandler.apply_filters_sqlalchemy(self.query, self.model, self.filters)
-            """
-            self.query = FilterHandler(backend="sqlalchemy", logger=self.logger).apply_filters(self.query,
+            self.query = FilterHandler(backend="sqlalchemy", logger=self.logger, debug=self.debug).apply_filters(self.query,
                                                                                                model=self.model,
                                                                                                filters=self.filters)
         else:

{sibi_dst-0.3.42 → sibi_dst-0.3.44}/sibi_dst/df_helper/core/_filter_handler.py
@@ -25,7 +25,7 @@ class FilterHandler:
     :ivar backend_methods: A dictionary mapping backend-specific methods for column retrieval and operation application.
     :type backend_methods: dict
     """
-    def __init__(self, backend, logger=None):
+    def __init__(self, backend, logger=None, debug=False):
        """
        Initialize the FilterHandler.

@@ -36,6 +36,7 @@ class FilterHandler:
         self.backend = backend
         self.logger = logger or Logger.default_logger(
             logger_name=self.__class__.__name__)  # No-op logger if none provided
+        self.logger.set_level(Logger.DEBUG if debug else Logger.INFO)
         self.backend_methods = self._get_backend_methods(backend)

     def apply_filters(self, query_or_df, model=None, filters=None):
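The core `FilterHandler` follows suit, and the `_io_dask` hunk above shows the intended call shape. A condensed sketch with an illustrative SQLAlchemy model:

```python
from sqlalchemy import select
from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column
from sibi_dst.df_helper.core import FilterHandler  # import path assumed

class Base(DeclarativeBase):
    pass

class Customer(Base):  # minimal mapped model for illustration
    __tablename__ = 'customers'
    id: Mapped[int] = mapped_column(primary_key=True)

query = select(Customer)
query = FilterHandler(backend="sqlalchemy", debug=True).apply_filters(
    query,
    model=Customer,
    filters={'id__gte': 1},  # Django-style operator, as in the README
)
```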

{sibi_dst-0.3.42 → sibi_dst-0.3.44}/sibi_dst/osmnx_helper/__init__.py
@@ -1,7 +1,7 @@
 from __future__ import annotations

-from .base_osm_map import BaseOsmMap
-from .utils import PBFHandler
+from .v1.base_osm_map import BaseOsmMap
+from .v1.utils import PBFHandler

 __all__ = [
     "BaseOsmMap",