sibi-dst 0.3.64__py3-none-any.whl → 2025.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,212 +0,0 @@
- import os
- from datetime import datetime
-
- import fsspec
- import httpx
- from jinja2 import Template
-
- """
- A manager to dynamically generate, save, and upload Airflow DAGs via SSH using fsspec.
- """
- DAG_TEMPLATE = """
- from airflow import DAG
- from airflow.operators.python import PythonOperator
- from datetime import datetime, timedelta
- from {{ wrapper_module_path }} import DataUpdateWrapper
- {% for module_path, classes in wrapped_classes.items() %}
- {% for class_name in classes %}
- from {{ module_path }} import {{ class_name }}
- {% endfor %}
- {% endfor %}
-
- wrapped_classes = {
- {% for group, items in grouped_classes.items() %}
-     '{{ group }}': [{% for class_name in items %}{{ class_name }}, {% endfor %}],
- {% endfor %}
- }
-
- def update_data(group_name, params):
-     wrapper = DataUpdateWrapper(wrapped_classes)
-     wrapper.update_data(group_name, **params)
-
- default_args = {
-     'owner': 'airflow',
-     'depends_on_past': False,
-     'email_on_failure': False,
-     'email_on_retry': False,
-     'retries': 1,
-     'retry_delay': timedelta(minutes=5),
- }
-
- with DAG(
-     '{{ dag_id }}',
-     default_args=default_args,
-     description='{{ description }}',
-     schedule_interval='{{ schedule_interval }}',
-     start_date=datetime({{ start_year }}, {{ start_month }}, {{ start_day }}),
-     catchup=False,
- ) as dag:
- {% for group in groups %}
-     PythonOperator(
-         task_id='{{ group }}_update',
-         python_callable=update_data,
-         op_kwargs={'group_name': '{{ group }}', 'params': {{ params }}},
-     )
- {% endfor %}
- """
-
-
- class AirflowDAGManager:
-
-     def __init__(self, output_dir, remote_dags_path, ssh_host, ssh_user, ssh_password, url, auth, wrapper_module_path):
-         """
-         Initialize the Airflow DAG Manager.
-
-         Args:
-             output_dir (str): Local directory in which to save generated DAGs.
-             remote_dags_path (str): Path to the Airflow `dags` folder on the remote server.
-             ssh_host (str): Hostname or IP of the remote server.
-             ssh_user (str): SSH username for the remote server.
-             ssh_password (str): SSH password for the remote server.
-             url (str): Base URL of the Airflow webserver, used for REST API calls.
-             auth (tuple): Credentials for the Airflow REST API, e.g. `(username, password)`.
-             wrapper_module_path (str): Import path of the module providing `DataUpdateWrapper`.
-         """
-         self.output_dir = output_dir
-         self.remote_dags_path = remote_dags_path
-         self.ssh_host = ssh_host
-         self.ssh_user = ssh_user
-         self.ssh_password = ssh_password
-         self.url = url
-         self.wrapper_module_path = wrapper_module_path
-         self.auth = auth
-
-         os.makedirs(self.output_dir, exist_ok=True)
-
-     def generate_dag(self, dag_id, description, schedule_interval, wrapped_classes, groups, params):
-         """
-         Generate an Airflow DAG script from the provided template.
-
-         Args:
-             dag_id (str): Unique DAG ID.
-             description (str): Description of the DAG.
-             schedule_interval (str): Cron schedule for the DAG.
-             wrapped_classes (dict): Classes grouped by functionality.
-             groups (list): Groups to include as tasks.
-             params (dict): Parameters for the `update_data` function.
-
-         Returns:
-             str: Path to the generated DAG file.
-         """
-         # Map each module to the class names it exports (for the import block)
-         # and each group to its class names (for the generated wrapped_classes dict).
-         module_classes = {}
-         grouped_classes = {}
-         for group, classes in wrapped_classes.items():
-             grouped_classes[group] = [cls.__name__ for cls in classes]
-             for cls in classes:
-                 module_classes.setdefault(cls.__module__, []).append(cls.__name__)
-
-         template = Template(DAG_TEMPLATE)
-         dag_script = template.render(
-             dag_id=dag_id,
-             description=description,
-             schedule_interval=schedule_interval,
-             start_year=datetime.now().year,
-             start_month=datetime.now().month,
-             start_day=datetime.now().day,
-             wrapper_module_path=self.wrapper_module_path,
-             wrapped_classes=module_classes,
-             grouped_classes=grouped_classes,
-             groups=groups,
-             params=params,
-         )
-
-         file_path = os.path.join(self.output_dir, f"{dag_id}.py")
-         with open(file_path, "w") as f:
-             f.write(dag_script)
-
-         print(f"DAG for {dag_id} created at: {file_path}")
-         return file_path
-
-     def upload_dag(self, local_file, subdirectory=None):
-         """
-         Upload a DAG file to the Airflow server using SSH.
-
-         Args:
-             local_file (str): Path to the local DAG file.
-             subdirectory (str, optional): Subdirectory within the Airflow `dags` folder.
-         """
-         try:
-             # Destination path on the remote server
-             remote_path = os.path.join(self.remote_dags_path, subdirectory) if subdirectory else self.remote_dags_path
-
-             # Ensure the destination directory exists
-             fs = fsspec.filesystem(
-                 "ssh",
-                 host=self.ssh_host,
-                 username=self.ssh_user,
-                 password=self.ssh_password,
-             )
-             fs.makedirs(remote_path, exist_ok=True)
-
-             # Upload the DAG file
-             remote_file_path = os.path.join(remote_path, os.path.basename(local_file))
-             with open(local_file, "rb") as f, fs.open(remote_file_path, "wb") as remote_f:
-                 remote_f.write(f.read())
-
-             print(f"Uploaded {local_file} to {remote_file_path}")
-         except Exception as e:
-             print(f"Failed to upload DAG: {e}")
-             raise
-
-     def manage_dags(self, wrapped_classes, schedule_interval, description, params, subdirectory=None):
-         """
-         Generate, upload, and manage Airflow DAGs for all groups in `wrapped_classes`.
-
-         Args:
-             wrapped_classes (dict): Groups mapped to their corresponding classes.
-             schedule_interval (str): Cron schedule for the DAGs.
-             description (str): Description for the DAGs.
-             params (dict): Parameters for the `update_data` function.
-             subdirectory (str, optional): Subdirectory within the Airflow `dags` folder.
-         """
-         groups = list(wrapped_classes.keys())
-         dag_id = "daily_data_update"
-
-         print("Generating DAG...")
-         dag_file = self.generate_dag(
-             dag_id=dag_id,
-             description=description,
-             schedule_interval=schedule_interval,
-             wrapped_classes=wrapped_classes,
-             groups=groups,
-             params=params,
-         )
-
-         print("Uploading DAG to Airflow server...")
-         self.upload_dag(dag_file, subdirectory)
-
-         print("DAG management completed successfully.")
-
-     def trigger_dag(self, dag_id, run_id=None, conf=None):
-         """
-         Trigger a DAG via Airflow's REST API.
-
-         Args:
-             dag_id (str): ID of the DAG to trigger.
-             run_id (str, optional): Custom run ID for the DAG run.
-             conf (dict, optional): Additional parameters for the DAG run.
-
-         Returns:
-             dict: Response from Airflow.
-         """
-         url = f"{self.url}/api/v1/dags/{dag_id}/dagRuns"
-         payload = {
-             "dag_run_id": run_id or f"manual_{datetime.now().isoformat()}",
-             "conf": conf or {}
-         }
-         try:
-             response = httpx.post(url, json=payload, auth=self.auth)
-             response.raise_for_status()
-             print(f"DAG {dag_id} triggered successfully.")
-             return response.json()
-         except httpx.HTTPError as e:
-             # httpx.HTTPError also covers the HTTPStatusError raised by
-             # raise_for_status(), not just transport-level failures.
-             print(f"Failed to trigger DAG {dag_id}: {e}")
-             raise
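
For orientation, here is a minimal usage sketch of the removed manager. The import path and the wrapped classes are hypothetical (the diff does not show where the module lived inside `sibi_dst`, nor any concrete `DataUpdateWrapper` targets), so read it as an illustration of the intended call sequence rather than a supported API:

```python
# Hypothetical import path; the module's actual location in sibi_dst is not shown in this diff.
from sibi_dst.utils.airflow_manager import AirflowDAGManager

# Placeholder classes; anything DataUpdateWrapper can drive would fit here.
from myproject.updaters import InventoryUpdater, SalesUpdater

manager = AirflowDAGManager(
    output_dir="./generated_dags",
    remote_dags_path="/opt/airflow/dags",
    ssh_host="airflow.example.com",
    ssh_user="airflow",
    ssh_password="example-password",           # example credentials only
    url="http://airflow.example.com:8080",     # Airflow webserver base URL
    auth=("admin", "admin"),                   # REST API credentials
    wrapper_module_path="myproject.wrappers",  # module exporting DataUpdateWrapper
)

# Render one DAG covering every group and upload it over SSH.
manager.manage_dags(
    wrapped_classes={"sales": [SalesUpdater], "inventory": [InventoryUpdater]},
    schedule_interval="0 2 * * *",
    description="Nightly data refresh",
    params={"period": "today", "overwrite": True},
)

# Kick off the uploaded DAG through Airflow's stable REST API.
manager.trigger_dag("daily_data_update", conf={"triggered_by": "example"})
```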
@@ -1,90 +0,0 @@
- Metadata-Version: 2.1
- Name: sibi-dst
- Version: 0.3.64
- Summary: Data Science Toolkit
- Author: Luis Valverde
- Author-email: lvalverdeb@gmail.com
- Requires-Python: >=3.11,<4.0
- Classifier: Programming Language :: Python :: 3
- Classifier: Programming Language :: Python :: 3.11
- Classifier: Programming Language :: Python :: 3.12
- Classifier: Programming Language :: Python :: 3.13
- Provides-Extra: complete
- Provides-Extra: df-helper
- Provides-Extra: geospatial
- Requires-Dist: apache-airflow-client (>=2.10.0,<3.0.0)
- Requires-Dist: chardet (>=5.2.0,<6.0.0)
- Requires-Dist: charset-normalizer (>=3.4.0,<4.0.0)
- Requires-Dist: clickhouse-connect (>=0.8.7,<0.9.0)
- Requires-Dist: clickhouse-driver (>=0.2.9,<0.3.0)
- Requires-Dist: dask[complete] (>=2025.3.0,<2026.0.0)
- Requires-Dist: django (>=5.1.4,<6.0.0) ; extra == "df-helper" or extra == "complete"
- Requires-Dist: djangorestframework (>=3.15.2,<4.0.0) ; extra == "df-helper" or extra == "complete"
- Requires-Dist: folium (>=0.19.4,<0.20.0) ; extra == "geospatial" or extra == "complete"
- Requires-Dist: geopandas (>=1.0.1,<2.0.0) ; extra == "geospatial" or extra == "complete"
- Requires-Dist: geopy (>=2.4.1,<3.0.0) ; extra == "geospatial" or extra == "complete"
- Requires-Dist: gunicorn (>=23.0.0,<24.0.0)
- Requires-Dist: httpx (>=0.27.2,<0.28.0)
- Requires-Dist: ipython (>=8.29.0,<9.0.0)
- Requires-Dist: jinja2 (>=3.1.4,<4.0.0)
- Requires-Dist: mysqlclient (>=2.2.6,<3.0.0) ; extra == "df-helper" or extra == "complete"
- Requires-Dist: nltk (>=3.9.1,<4.0.0)
- Requires-Dist: openpyxl (>=3.1.5,<4.0.0)
- Requires-Dist: osmnx (>=2.0.1,<3.0.0) ; extra == "geospatial" or extra == "complete"
- Requires-Dist: pandas (>=2.2.3,<3.0.0)
- Requires-Dist: paramiko (>=3.5.0,<4.0.0)
- Requires-Dist: psutil (>=6.1.0,<7.0.0)
- Requires-Dist: psycopg2 (>=2.9.10,<3.0.0) ; extra == "df-helper" or extra == "complete"
- Requires-Dist: pyarrow (>=18.0.0,<19.0.0)
- Requires-Dist: pydantic (>=2.9.2,<3.0.0)
- Requires-Dist: pymysql (>=1.1.1,<2.0.0) ; extra == "df-helper" or extra == "complete"
- Requires-Dist: pytest (>=8.3.3,<9.0.0)
- Requires-Dist: pytest-mock (>=3.14.0,<4.0.0)
- Requires-Dist: python-dotenv (>=1.0.1,<2.0.0)
- Requires-Dist: s3fs (>=2024.12.0,<2025.0.0)
- Requires-Dist: sqlalchemy (>=2.0.36,<3.0.0) ; extra == "df-helper" or extra == "complete"
- Requires-Dist: sqlmodel (>=0.0.22,<0.0.23) ; extra == "df-helper" or extra == "complete"
- Requires-Dist: tornado (>=6.4.1,<7.0.0)
- Requires-Dist: tqdm (>=4.67.0,<5.0.0)
- Requires-Dist: uvicorn (>=0.34.0,<0.35.0)
- Requires-Dist: uvicorn-worker (>=0.3.0,<0.4.0)
- Requires-Dist: webdav4[fsspec] (>=0.10.0,<0.11.0)
- Description-Content-Type: text/markdown
-
- # SIBI-DST
-
- Data Science Toolkit built with Python, Pandas, Dask, OpenStreetMap, NetworkX, SQLAlchemy, GeoPandas, and Folium.
-
- ## Example Use Cases
-
- 1. **Build DataCubes, DataSets, and DataObjects** – assemble them from diverse data sources, including **relational databases, Parquet files, Excel (`.xlsx`), delimited tables (`.csv`, `.tsv`), JSON, and RESTful APIs**.
- 2. **Comprehensive DataFrame Management** – utilities for efficient data handling, transformation, and optimization with **Pandas** and **Dask**.
- 3. **Flexible Data Sharing** – share results with client applications by writing to **ClickHouse data warehouses, local filesystems, and cloud storage platforms** such as **S3** (see the sketch after this diff).
- 4. **Microservices for Data Access** – build scalable **API-driven services** with **RESTful APIs (`Django REST Framework`, `FastAPI`)** for high-performance data exchange.
- 5. **Geospatial Analysis** – use **OpenStreetMap** and **GeoPandas** for advanced geospatial data processing and visualization.
-
- ## Supported Technologies
-
- - **Data Processing**: Pandas, Dask
- - **Databases & Storage**: SQLAlchemy, Parquet, S3, ClickHouse
- - **Mapping & Geospatial Analysis**: OpenStreetMap, OSMnx, Geopy
- - **API Development**: Django REST Framework, FastAPI
-
- ## Installation
-
- ```bash
- # with pip
- pip install "sibi-dst[complete]"    # all dependencies
- pip install "sibi-dst[df-helper]"   # df-helper dependencies only
- pip install "sibi-dst[geospatial]"  # geospatial dependencies only
-
- # with poetry
- poetry add "sibi-dst[complete]"     # all dependencies
- poetry add "sibi-dst[df-helper]"    # df-helper dependencies only
- poetry add "sibi-dst[geospatial]"   # geospatial dependencies only
- ```
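
As a rough illustration of use case 3 above (sharing data via local Parquet and S3), here is a sketch built only on dependencies declared in the metadata (Dask, pyarrow, s3fs). It shows the general pattern, not sibi-dst's own wrapper API; the bucket name and credentials are placeholders:

```python
import dask.dataframe as dd
import pandas as pd

# A small example frame, partitioned for Dask.
df = pd.DataFrame({"order_id": [1, 2, 3], "total": [9.99, 24.50, 7.25]})
ddf = dd.from_pandas(df, npartitions=1)

# Local filesystem target.
ddf.to_parquet("./warehouse/orders", engine="pyarrow", overwrite=True)

# S3 target via s3fs; "example-bucket" and the credentials are placeholders.
ddf.to_parquet(
    "s3://example-bucket/warehouse/orders",
    engine="pyarrow",
    storage_options={"key": "YOUR_KEY", "secret": "YOUR_SECRET"},
)
```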