sibi-dst 0.3.63__py3-none-any.whl → 2025.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sibi_dst/df_helper/_df_helper.py +186 -591
- sibi_dst/df_helper/backends/sqlalchemy/__init__.py +0 -2
- sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +161 -115
- sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +291 -97
- sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +34 -105
- sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +175 -162
- sibi_dst/df_helper/core/__init__.py +0 -4
- sibi_dst/df_helper/core/_defaults.py +1 -50
- sibi_dst/df_helper/core/_query_config.py +2 -2
- sibi_dst/utils/__init__.py +0 -2
- sibi_dst/utils/data_wrapper.py +9 -12
- sibi_dst/utils/log_utils.py +15 -11
- sibi_dst/utils/update_planner.py +2 -0
- sibi_dst/v2/df_helper/backends/sqlalchemy/_db_connection.py +325 -50
- sibi_dst/v2/df_helper/backends/sqlalchemy/_io_dask.py +2 -2
- sibi_dst/v2/df_helper/backends/sqlmodel/_db_connection.py +330 -51
- sibi_dst/v3/__init__.py +0 -0
- sibi_dst/v3/backends/__init__.py +0 -0
- sibi_dst/v3/df_helper/__init__.py +0 -0
- sibi_dst/v3/df_helper/_df_helper.py +91 -0
- sibi_dst-2025.1.1.dist-info/METADATA +55 -0
- {sibi_dst-0.3.63.dist-info → sibi_dst-2025.1.1.dist-info}/RECORD +23 -26
- sibi_dst/df_helper/backends/django/__init__.py +0 -11
- sibi_dst/df_helper/backends/django/_db_connection.py +0 -88
- sibi_dst/df_helper/backends/django/_io_dask.py +0 -450
- sibi_dst/df_helper/backends/django/_load_from_db.py +0 -227
- sibi_dst/df_helper/backends/django/_sql_model_builder.py +0 -493
- sibi_dst/df_helper/backends/sqlalchemy/_filter_handler.py +0 -119
- sibi_dst/utils/airflow_manager.py +0 -212
- sibi_dst-0.3.63.dist-info/METADATA +0 -90
- {sibi_dst-0.3.63.dist-info → sibi_dst-2025.1.1.dist-info}/WHEEL +0 -0
sibi_dst/utils/airflow_manager.py
@@ -1,212 +0,0 @@
-import os
-from datetime import datetime
-
-import fsspec
-import httpx
-from jinja2 import Template
-
-"""
-A manager to dynamically generate, save, and upload Airflow DAGs via SSH using fsspec.
-"""
-DAG_TEMPLATE = """
-from airflow import DAG
-from airflow.operators.python_operator import PythonOperator
-from datetime import datetime, timedelta
-from {{ wrapper_module_path }} import DataUpdateWrapper
-{% for module_path, classes in wrapped_classes.items() %}
-{% for class_name in classes %}
-from {{ module_path }} import {{ class_name }}
-{% endfor %}
-{% endfor %}
-
-wrapped_classes = {
-{% for group, items in wrapped_classes.items() %}
-    '{{ group }}': [{% for class_name in items %}{{ class_name }}, {% endfor %}],
-{% endfor %}
-}
-
-def update_data(group_name, params):
-    wrapper = DataUpdateWrapper(wrapped_classes)
-    wrapper.update_data(group_name, **params)
-
-default_args = {
-    'owner': 'airflow',
-    'depends_on_past': False,
-    'email_on_failure': False,
-    'email_on_retry': False,
-    'retries': 1,
-    'retry_delay': timedelta(minutes=5),
-}
-
-with DAG(
-    '{{ dag_id }}',
-    default_args=default_args,
-    description='{{ description }}',
-    schedule_interval='{{ schedule_interval }}',
-    start_date=datetime({{ start_year }}, {{ start_month }}, {{ start_day }}),
-    catchup=False,
-) as dag:
-{% for group in groups %}
-    PythonOperator(
-        task_id='{{ group }}_update',
-        python_callable=update_data,
-        op_kwargs={'group_name': '{{ group }}', 'params': {{ params }}},
-    )
-{% endfor %}
-"""
-
-
-class AirflowDAGManager:
-
-    def __init__(self, output_dir, remote_dags_path, ssh_host, ssh_user, ssh_password, url, auth, wrapper_module_path):
-        """
-        Initialize the Airflow DAG Manager.
-
-        Args:
-            output_dir (str): Local directory to save generated DAGs.
-            remote_dags_path (str): Path to the Airflow `dags` folder on the remote server.
-            ssh_host (str): Hostname or IP of the remote server.
-            ssh_user (str): SSH username for the remote server.
-            ssh_password (str): SSH password for the remote server.
-            wrapper_module_path (str): Path to the `DataUpdateWrapper` module.
-        """
-        self.output_dir = output_dir
-        self.remote_dags_path = remote_dags_path
-        self.ssh_host = ssh_host
-        self.ssh_user = ssh_user
-        self.ssh_password = ssh_password
-        self.url = url
-        self.wrapper_module_path = wrapper_module_path
-        self.auth = auth
-
-        os.makedirs(self.output_dir, exist_ok=True)
-
-    def generate_dag(self, dag_id, description, schedule_interval, wrapped_classes, groups, params):
-        """
-        Generate an Airflow DAG script from the provided template.
-
-        Args:
-            dag_id (str): Unique DAG ID.
-            description (str): Description of the DAG.
-            schedule_interval (str): Cron schedule for the DAG.
-            wrapped_classes (dict): Classes grouped by functionality.
-            groups (list): List of groups to include as tasks.
-            params (dict): Parameters for the `update_data` function.
-
-        Returns:
-            str: Path to the generated DAG file.
-        """
-        module_classes = {}
-        for group, classes in wrapped_classes.items():
-            for cls in classes:
-                module_path, class_name = cls.__module__, cls.__name__
-                if module_path not in module_classes:
-                    module_classes[module_path] = []
-                module_classes[module_path].append(class_name)
-
-        template = Template(DAG_TEMPLATE)
-        dag_script = template.render(
-            dag_id=dag_id,
-            description=description,
-            schedule_interval=schedule_interval,
-            start_year=datetime.now().year,
-            start_month=datetime.now().month,
-            start_day=datetime.now().day,
-            wrapper_module_path=self.wrapper_module_path,
-            wrapped_classes=module_classes,
-            groups=groups,
-            params=params,
-        )
-
-        file_path = os.path.join(self.output_dir, f"{dag_id}.py")
-        with open(file_path, "w") as f:
-            f.write(dag_script)
-
-        print(f"DAG for {dag_id} created at: {file_path}")
-        return file_path
-
-    def upload_dag(self, local_file, subdirectory=None):
-        """
-        Upload a DAG file to the Airflow server using SSH.
-
-        Args:
-            local_file (str): Path to the local DAG file.
-            subdirectory (str, optional): Subdirectory within the Airflow `dags` folder.
-        """
-        try:
-            # Destination path on the remote server
-            remote_path = os.path.join(self.remote_dags_path, subdirectory) if subdirectory else self.remote_dags_path
-
-            # Ensure subdirectory exists
-            fs = fsspec.filesystem(
-                "ssh",
-                host=self.ssh_host,
-                username=self.ssh_user,
-                password=self.ssh_password,
-            )
-            fs.makedirs(remote_path, exist_ok=True)
-
-            # Upload the DAG file
-            remote_file_path = os.path.join(remote_path, os.path.basename(local_file))
-            with open(local_file, "rb") as f, fs.open(remote_file_path, "wb") as remote_f:
-                remote_f.write(f.read())
-
-            print(f"Uploaded {local_file} to {remote_file_path}")
-        except Exception as e:
-            print(f"Failed to upload DAG: {e}")
-            raise
-
-    def manage_dags(self, wrapped_classes, schedule_interval, description, params, subdirectory=None):
-        """
-        Generate, upload, and manage Airflow DAGs for all groups in wrapped_classes.
-
-        Args:
-            wrapped_classes (dict): Dictionary of groups and their corresponding classes.
-            schedule_interval (str): Cron schedule for the DAGs.
-            description (str): Description for the DAGs.
-            params (dict): Parameters for the `update_data` function.
-            subdirectory (str, optional): Subdirectory within the Airflow `dags` folder.
-        """
-        groups = list(wrapped_classes.keys())
-        dag_id = "daily_data_update"
-
-        print("Generating DAG...")
-        dag_file = self.generate_dag(
-            dag_id=dag_id,
-            description=description,
-            schedule_interval=schedule_interval,
-            wrapped_classes=wrapped_classes,
-            groups=groups,
-            params=params,
-        )
-
-        print("Uploading DAG to Airflow server...")
-        self.upload_dag(dag_file, subdirectory)
-
-        print("DAG management completed successfully.")
-
-    def trigger_dag(self, dag_id, run_id=None, conf=None):
-        """
-        Trigger a DAG via Airflow's REST API.
-
-        Args:
-            dag_id (str): ID of the DAG to trigger.
-            run_id (str, optional): Custom run ID for the DAG run.
-            conf (dict, optional): Additional parameters for the DAG run.
-
-        Returns:
-            dict: Response from Airflow.
-        """
-        url = f"{self.url}/api/v1/dags/{dag_id}/dagRuns"
-        payload = {
-            "dag_run_id": run_id or f"manual_{datetime.now().isoformat()}",
-            "conf": conf or {}
-        }
-        try:
-            response = httpx.post(url, json=payload, auth=self.auth)
-            response.raise_for_status()
-            print(f"DAG {dag_id} triggered successfully.")
-            return response.json()
-        except httpx.RequestError as e:
-            print(f"Failed to trigger DAG {dag_id}: {e}")
-            raise
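For context on the removal: `AirflowDAGManager` rendered the Jinja2 template above into a DAG script, pushed the file to the scheduler's `dags/` folder over SSH via fsspec, and triggered runs through Airflow's REST API. Below is a minimal sketch of how it could have been driven before removal; the hosts, credentials, module paths, and wrapped classes are hypothetical placeholders, not values from the package:

```python
# Hypothetical usage of the removed AirflowDAGManager (all values illustrative).
from sibi_dst.utils.airflow_manager import AirflowDAGManager  # removed in 2025.1.1
from myproject.cubes import OrdersCube, InventoryCube         # hypothetical classes

manager = AirflowDAGManager(
    output_dir="/tmp/generated_dags",
    remote_dags_path="/opt/airflow/dags",
    ssh_host="airflow.example.com",
    ssh_user="airflow",
    ssh_password="secret",                    # plain-text SSH credentials were required
    url="http://airflow.example.com:8080",    # base URL of the Airflow REST API
    auth=("admin", "admin"),                  # passed straight through to httpx.post(auth=...)
    wrapper_module_path="myproject.wrappers", # module exporting DataUpdateWrapper
)

# Renders the template with a hardcoded dag_id of "daily_data_update", writes
# /tmp/generated_dags/daily_data_update.py, and uploads it over SSH.
manager.manage_dags(
    wrapped_classes={"orders": [OrdersCube], "inventory": [InventoryCube]},
    schedule_interval="@daily",
    description="Nightly data refresh",
    params={"period": "today"},
)

# Kicks off a run via POST {url}/api/v1/dags/daily_data_update/dagRuns.
manager.trigger_dag("daily_data_update", conf={"backfill": False})
```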
sibi_dst-0.3.63.dist-info/METADATA
@@ -1,90 +0,0 @@
-Metadata-Version: 2.1
-Name: sibi-dst
-Version: 0.3.63
-Summary: Data Science Toolkit
-Author: Luis Valverde
-Author-email: lvalverdeb@gmail.com
-Requires-Python: >=3.11,<4.0
-Classifier: Programming Language :: Python :: 3
-Classifier: Programming Language :: Python :: 3.11
-Classifier: Programming Language :: Python :: 3.12
-Classifier: Programming Language :: Python :: 3.13
-Provides-Extra: complete
-Provides-Extra: df-helper
-Provides-Extra: geospatial
-Requires-Dist: apache-airflow-client (>=2.10.0,<3.0.0)
-Requires-Dist: chardet (>=5.2.0,<6.0.0)
-Requires-Dist: charset-normalizer (>=3.4.0,<4.0.0)
-Requires-Dist: clickhouse-connect (>=0.8.7,<0.9.0)
-Requires-Dist: clickhouse-driver (>=0.2.9,<0.3.0)
-Requires-Dist: dask[complete] (>=2025.3.0,<2026.0.0)
-Requires-Dist: django (>=5.1.4,<6.0.0) ; extra == "df-helper" or extra == "complete"
-Requires-Dist: djangorestframework (>=3.15.2,<4.0.0) ; extra == "df-helper" or extra == "complete"
-Requires-Dist: folium (>=0.19.4,<0.20.0) ; extra == "geospatial" or extra == "complete"
-Requires-Dist: geopandas (>=1.0.1,<2.0.0) ; extra == "geospatial" or extra == "complete"
-Requires-Dist: geopy (>=2.4.1,<3.0.0) ; extra == "geospatial" or extra == "complete"
-Requires-Dist: gunicorn (>=23.0.0,<24.0.0)
-Requires-Dist: httpx (>=0.27.2,<0.28.0)
-Requires-Dist: ipython (>=8.29.0,<9.0.0)
-Requires-Dist: jinja2 (>=3.1.4,<4.0.0)
-Requires-Dist: mysqlclient (>=2.2.6,<3.0.0) ; extra == "df-helper" or extra == "complete"
-Requires-Dist: nltk (>=3.9.1,<4.0.0)
-Requires-Dist: openpyxl (>=3.1.5,<4.0.0)
-Requires-Dist: osmnx (>=2.0.1,<3.0.0) ; extra == "geospatial" or extra == "complete"
-Requires-Dist: pandas (>=2.2.3,<3.0.0)
-Requires-Dist: paramiko (>=3.5.0,<4.0.0)
-Requires-Dist: psutil (>=6.1.0,<7.0.0)
-Requires-Dist: psycopg2 (>=2.9.10,<3.0.0) ; extra == "df-helper" or extra == "complete"
-Requires-Dist: pyarrow (>=18.0.0,<19.0.0)
-Requires-Dist: pydantic (>=2.9.2,<3.0.0)
-Requires-Dist: pymysql (>=1.1.1,<2.0.0) ; extra == "df-helper" or extra == "complete"
-Requires-Dist: pytest (>=8.3.3,<9.0.0)
-Requires-Dist: pytest-mock (>=3.14.0,<4.0.0)
-Requires-Dist: python-dotenv (>=1.0.1,<2.0.0)
-Requires-Dist: s3fs (>=2024.12.0,<2025.0.0)
-Requires-Dist: sqlalchemy (>=2.0.36,<3.0.0) ; extra == "df-helper" or extra == "complete"
-Requires-Dist: sqlmodel (>=0.0.22,<0.0.23) ; extra == "df-helper" or extra == "complete"
-Requires-Dist: tornado (>=6.4.1,<7.0.0)
-Requires-Dist: tqdm (>=4.67.0,<5.0.0)
-Requires-Dist: uvicorn (>=0.34.0,<0.35.0)
-Requires-Dist: uvicorn-worker (>=0.3.0,<0.4.0)
-Requires-Dist: webdav4[fsspec] (>=0.10.0,<0.11.0)
-Description-Content-Type: text/markdown
-
-### SIBI-DST
-
-Data Science Toolkit built with Python, Pandas, Dask, OpenStreetMaps, NetworkX, SQLAlchemy, GeoPandas, and Folium.
-
-## Example Use Cases
-
-1. **Build DataCubes, DataSets, and DataObjects** from diverse data sources, including **relational databases, Parquet files, Excel (`.xlsx`), delimited tables (`.csv`, `.tsv`), JSON, and RESTful APIs**.
-2. **Comprehensive DataFrame Management** utilities for efficient data handling, transformation, and optimization using **Pandas** and **Dask**.
-3. **Flexible Data Sharing** with client applications by writing to **Data Warehouses in Clickhouse, local filesystems, and cloud storage platforms** such as **S3**.
-4. **Microservices for Data Access** – Build scalable **API-driven services** using **RESTful APIs (`Django REST Framework`, `FastAPI`)** for high-performance data exchange.
-5. **Geospatial Analysis** – Utilize **OpenStreetMaps** and **GeoPandas** for advanced geospatial data processing and visualization.
-
-## Supported Technologies
-
-- **Data Processing**: Pandas, Dask
-- **Databases & Storage**: SQLAlchemy, Parquet, S3, Clickhouse
-- **Mapping & Geospatial Analysis**: OpenStreetMaps, OSMnx, Geopy
-- **API Development**: Django REST Framework, FastAPI
-
-## Installation
-
-```bash
-# with pip
-
-pip install sibi-dst[complete]   # Install all dependencies
-pip install sibi-dst[df_helper]  # Install only df_helper dependencies
-pip install sibi-dst[geospatial] # Install only geospatial dependencies
-
-# with poetry
-
-poetry add "sibi-dst[complete]"   # Install all dependencies
-poetry add "sibi-dst[df_helper]"  # Install only df_helper dependencies
-poetry add "sibi-dst[geospatial]" # Install only geospatial dependencies
-
-
-```
-
{sibi_dst-0.3.63.dist-info → sibi_dst-2025.1.1.dist-info}/WHEEL
File without changes