sibi-flux 2025.12.0 (tar.gz)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sibi_flux-2025.12.0/PKG-INFO +283 -0
- sibi_flux-2025.12.0/README.md +233 -0
- sibi_flux-2025.12.0/pyproject.toml +163 -0
- sibi_flux-2025.12.0/src/sibi_dst/__init__.py +44 -0
- sibi_flux-2025.12.0/src/sibi_flux/__init__.py +49 -0
- sibi_flux-2025.12.0/src/sibi_flux/artifacts/__init__.py +7 -0
- sibi_flux-2025.12.0/src/sibi_flux/artifacts/base.py +166 -0
- sibi_flux-2025.12.0/src/sibi_flux/artifacts/parquet.py +360 -0
- sibi_flux-2025.12.0/src/sibi_flux/artifacts/parquet_engine/__init__.py +5 -0
- sibi_flux-2025.12.0/src/sibi_flux/artifacts/parquet_engine/executor.py +204 -0
- sibi_flux-2025.12.0/src/sibi_flux/artifacts/parquet_engine/manifest.py +101 -0
- sibi_flux-2025.12.0/src/sibi_flux/artifacts/parquet_engine/planner.py +544 -0
- sibi_flux-2025.12.0/src/sibi_flux/conf/settings.py +131 -0
- sibi_flux-2025.12.0/src/sibi_flux/core/__init__.py +5 -0
- sibi_flux-2025.12.0/src/sibi_flux/core/managed_resource/__init__.py +3 -0
- sibi_flux-2025.12.0/src/sibi_flux/core/managed_resource/_managed_resource.py +733 -0
- sibi_flux-2025.12.0/src/sibi_flux/core/type_maps/__init__.py +100 -0
- sibi_flux-2025.12.0/src/sibi_flux/dask_cluster/__init__.py +47 -0
- sibi_flux-2025.12.0/src/sibi_flux/dask_cluster/async_core.py +27 -0
- sibi_flux-2025.12.0/src/sibi_flux/dask_cluster/client_manager.py +549 -0
- sibi_flux-2025.12.0/src/sibi_flux/dask_cluster/core.py +322 -0
- sibi_flux-2025.12.0/src/sibi_flux/dask_cluster/exceptions.py +34 -0
- sibi_flux-2025.12.0/src/sibi_flux/dask_cluster/utils.py +49 -0
- sibi_flux-2025.12.0/src/sibi_flux/datacube/__init__.py +3 -0
- sibi_flux-2025.12.0/src/sibi_flux/datacube/_data_cube.py +332 -0
- sibi_flux-2025.12.0/src/sibi_flux/datacube/config_engine.py +152 -0
- sibi_flux-2025.12.0/src/sibi_flux/datacube/field_factory.py +48 -0
- sibi_flux-2025.12.0/src/sibi_flux/datacube/field_registry.py +122 -0
- sibi_flux-2025.12.0/src/sibi_flux/datacube/generator.py +677 -0
- sibi_flux-2025.12.0/src/sibi_flux/datacube/orchestrator.py +171 -0
- sibi_flux-2025.12.0/src/sibi_flux/dataset/__init__.py +3 -0
- sibi_flux-2025.12.0/src/sibi_flux/dataset/_dataset.py +162 -0
- sibi_flux-2025.12.0/src/sibi_flux/df_enricher/__init__.py +56 -0
- sibi_flux-2025.12.0/src/sibi_flux/df_enricher/async_enricher.py +201 -0
- sibi_flux-2025.12.0/src/sibi_flux/df_enricher/merger.py +253 -0
- sibi_flux-2025.12.0/src/sibi_flux/df_enricher/specs.py +45 -0
- sibi_flux-2025.12.0/src/sibi_flux/df_enricher/types.py +12 -0
- sibi_flux-2025.12.0/src/sibi_flux/df_helper/__init__.py +5 -0
- sibi_flux-2025.12.0/src/sibi_flux/df_helper/_df_helper.py +450 -0
- sibi_flux-2025.12.0/src/sibi_flux/df_helper/backends/__init__.py +34 -0
- sibi_flux-2025.12.0/src/sibi_flux/df_helper/backends/_params.py +173 -0
- sibi_flux-2025.12.0/src/sibi_flux/df_helper/backends/_strategies.py +295 -0
- sibi_flux-2025.12.0/src/sibi_flux/df_helper/backends/http/__init__.py +5 -0
- sibi_flux-2025.12.0/src/sibi_flux/df_helper/backends/http/_http_config.py +122 -0
- sibi_flux-2025.12.0/src/sibi_flux/df_helper/backends/parquet/__init__.py +7 -0
- sibi_flux-2025.12.0/src/sibi_flux/df_helper/backends/parquet/_parquet_options.py +268 -0
- sibi_flux-2025.12.0/src/sibi_flux/df_helper/backends/sqlalchemy/__init__.py +9 -0
- sibi_flux-2025.12.0/src/sibi_flux/df_helper/backends/sqlalchemy/_db_connection.py +256 -0
- sibi_flux-2025.12.0/src/sibi_flux/df_helper/backends/sqlalchemy/_db_gatekeeper.py +15 -0
- sibi_flux-2025.12.0/src/sibi_flux/df_helper/backends/sqlalchemy/_io_dask.py +386 -0
- sibi_flux-2025.12.0/src/sibi_flux/df_helper/backends/sqlalchemy/_load_from_db.py +134 -0
- sibi_flux-2025.12.0/src/sibi_flux/df_helper/backends/sqlalchemy/_model_registry.py +239 -0
- sibi_flux-2025.12.0/src/sibi_flux/df_helper/backends/sqlalchemy/_sql_model_builder.py +42 -0
- sibi_flux-2025.12.0/src/sibi_flux/df_helper/backends/utils.py +32 -0
- sibi_flux-2025.12.0/src/sibi_flux/df_helper/core/__init__.py +15 -0
- sibi_flux-2025.12.0/src/sibi_flux/df_helper/core/_defaults.py +104 -0
- sibi_flux-2025.12.0/src/sibi_flux/df_helper/core/_filter_handler.py +617 -0
- sibi_flux-2025.12.0/src/sibi_flux/df_helper/core/_params_config.py +185 -0
- sibi_flux-2025.12.0/src/sibi_flux/df_helper/core/_query_config.py +17 -0
- sibi_flux-2025.12.0/src/sibi_flux/df_validator/__init__.py +3 -0
- sibi_flux-2025.12.0/src/sibi_flux/df_validator/_df_validator.py +222 -0
- sibi_flux-2025.12.0/src/sibi_flux/logger/__init__.py +1 -0
- sibi_flux-2025.12.0/src/sibi_flux/logger/_logger.py +480 -0
- sibi_flux-2025.12.0/src/sibi_flux/mcp/__init__.py +26 -0
- sibi_flux-2025.12.0/src/sibi_flux/mcp/client.py +150 -0
- sibi_flux-2025.12.0/src/sibi_flux/mcp/router.py +126 -0
- sibi_flux-2025.12.0/src/sibi_flux/orchestration/__init__.py +9 -0
- sibi_flux-2025.12.0/src/sibi_flux/orchestration/_artifact_orchestrator.py +346 -0
- sibi_flux-2025.12.0/src/sibi_flux/orchestration/_pipeline_executor.py +212 -0
- sibi_flux-2025.12.0/src/sibi_flux/osmnx_helper/__init__.py +22 -0
- sibi_flux-2025.12.0/src/sibi_flux/osmnx_helper/_pbf_handler.py +384 -0
- sibi_flux-2025.12.0/src/sibi_flux/osmnx_helper/graph_loader.py +225 -0
- sibi_flux-2025.12.0/src/sibi_flux/osmnx_helper/utils.py +100 -0
- sibi_flux-2025.12.0/src/sibi_flux/pipelines/__init__.py +3 -0
- sibi_flux-2025.12.0/src/sibi_flux/pipelines/base.py +218 -0
- sibi_flux-2025.12.0/src/sibi_flux/py.typed +0 -0
- sibi_flux-2025.12.0/src/sibi_flux/readers/__init__.py +3 -0
- sibi_flux-2025.12.0/src/sibi_flux/readers/base.py +82 -0
- sibi_flux-2025.12.0/src/sibi_flux/readers/parquet.py +106 -0
- sibi_flux-2025.12.0/src/sibi_flux/utils/__init__.py +53 -0
- sibi_flux-2025.12.0/src/sibi_flux/utils/boilerplate/__init__.py +19 -0
- sibi_flux-2025.12.0/src/sibi_flux/utils/boilerplate/base_attacher.py +45 -0
- sibi_flux-2025.12.0/src/sibi_flux/utils/boilerplate/base_cube_router.py +283 -0
- sibi_flux-2025.12.0/src/sibi_flux/utils/boilerplate/base_data_cube.py +132 -0
- sibi_flux-2025.12.0/src/sibi_flux/utils/boilerplate/base_pipeline_template.py +54 -0
- sibi_flux-2025.12.0/src/sibi_flux/utils/boilerplate/hybrid_data_loader.py +193 -0
- sibi_flux-2025.12.0/src/sibi_flux/utils/clickhouse_writer/__init__.py +6 -0
- sibi_flux-2025.12.0/src/sibi_flux/utils/clickhouse_writer/_clickhouse_writer.py +225 -0
- sibi_flux-2025.12.0/src/sibi_flux/utils/common.py +7 -0
- sibi_flux-2025.12.0/src/sibi_flux/utils/credentials/__init__.py +3 -0
- sibi_flux-2025.12.0/src/sibi_flux/utils/credentials/_config_manager.py +155 -0
- sibi_flux-2025.12.0/src/sibi_flux/utils/dask_utils.py +14 -0
- sibi_flux-2025.12.0/src/sibi_flux/utils/data_utils/__init__.py +3 -0
- sibi_flux-2025.12.0/src/sibi_flux/utils/data_utils/_data_utils.py +389 -0
- sibi_flux-2025.12.0/src/sibi_flux/utils/dataframe_utils.py +52 -0
- sibi_flux-2025.12.0/src/sibi_flux/utils/date_utils/__init__.py +10 -0
- sibi_flux-2025.12.0/src/sibi_flux/utils/date_utils/_business_days.py +220 -0
- sibi_flux-2025.12.0/src/sibi_flux/utils/date_utils/_date_utils.py +311 -0
- sibi_flux-2025.12.0/src/sibi_flux/utils/date_utils/_file_age_checker.py +319 -0
- sibi_flux-2025.12.0/src/sibi_flux/utils/file_utils.py +48 -0
- sibi_flux-2025.12.0/src/sibi_flux/utils/filepath_generator/__init__.py +5 -0
- sibi_flux-2025.12.0/src/sibi_flux/utils/filepath_generator/_filepath_generator.py +185 -0
- sibi_flux-2025.12.0/src/sibi_flux/utils/parquet_saver/__init__.py +6 -0
- sibi_flux-2025.12.0/src/sibi_flux/utils/parquet_saver/_parquet_saver.py +436 -0
- sibi_flux-2025.12.0/src/sibi_flux/utils/parquet_saver/_write_gatekeeper.py +33 -0
- sibi_flux-2025.12.0/src/sibi_flux/utils/retry.py +46 -0
- sibi_flux-2025.12.0/src/sibi_flux/utils/storage/__init__.py +7 -0
- sibi_flux-2025.12.0/src/sibi_flux/utils/storage/_fs_registry.py +112 -0
- sibi_flux-2025.12.0/src/sibi_flux/utils/storage/_storage_manager.py +257 -0
- sibi_flux-2025.12.0/src/sibi_flux/utils/storage/factory.py +33 -0

sibi_flux-2025.12.0/PKG-INFO
@@ -0,0 +1,283 @@
Metadata-Version: 2.3
Name: sibi-flux
Version: 2025.12.0
Summary: Sibi Toolkit: A collection of tools for Data Analysis/Engineering.
Author: Luis Valverde
Author-email: Luis Valverde <lvalverdeb@gmail.com>
Requires-Dist: pandas>=2.3.3
Requires-Dist: pyarrow>=22.0.0
Requires-Dist: pydantic>=2.12.5
Requires-Dist: pydantic-settings>=2.12.0
Requires-Dist: dask>=2025.11.0
Requires-Dist: fsspec>=2025.10.0
Requires-Dist: s3fs>=2025.10.0
Requires-Dist: sqlalchemy>=2.0.44
Requires-Dist: psycopg2>=2.9.11
Requires-Dist: pymysql>=1.1.2
Requires-Dist: clickhouse-connect>=0.10.0
Requires-Dist: concurrent-log-handler>=0.9.28
Requires-Dist: rich>=14.2.0
Requires-Dist: filelock>=3.20.1
Requires-Dist: tqdm>=4.67.1
Requires-Dist: watchdog>=6.0.0
Requires-Dist: tornado==6.5.4
Requires-Dist: typer>=0.21.0
Requires-Dist: psutil>=6.1.1
Requires-Dist: httpx>=0.28.1
Requires-Dist: opentelemetry-api>=1.38.0
Requires-Dist: opentelemetry-exporter-otlp>=1.38.0
Requires-Dist: opentelemetry-sdk>=1.38.0
Requires-Dist: sibi-flux[distributed,geospatial,mcp] ; extra == 'complete'
Requires-Dist: distributed>=2025.11.0 ; extra == 'distributed'
Requires-Dist: osmnx>=2.0.7 ; extra == 'geospatial'
Requires-Dist: geopandas>=1.1.2 ; extra == 'geospatial'
Requires-Dist: geopy>=2.4.1 ; extra == 'geospatial'
Requires-Dist: folium>=0.20.0 ; extra == 'geospatial'
Requires-Dist: osmium>=4.2.0 ; extra == 'geospatial'
Requires-Dist: shapely>=2.0.0 ; extra == 'geospatial'
Requires-Dist: networkx>=3.6.1 ; extra == 'geospatial'
Requires-Dist: sibi-flux[distributed] ; extra == 'mcp'
Requires-Dist: mcp>=1.1.2 ; extra == 'mcp'
Requires-Dist: fastapi>=0.127.0 ; extra == 'mcp'
Requires-Dist: uvicorn>=0.40.0 ; extra == 'mcp'
Requires-Dist: httpx>=0.28.1 ; extra == 'mcp'
Requires-Python: >=3.11
Provides-Extra: complete
Provides-Extra: distributed
Provides-Extra: geospatial
Provides-Extra: mcp
Description-Content-Type: text/markdown

# SibiFlux

**SibiFlux** is a production-grade, resilient data engineering ecosystem designed to bridge the gap between local development, distributed computing, and agentic AI workflows. It provides a unified engine for hybrid data loading (batch + streaming), self-healing distributed operations, and native interfaces for AI agents via the Model Context Protocol (MCP).

```mermaid
graph TD
    subgraph "Agentic Interface (MCP)"
        Agent["AI Agent / Claude"] <--> Router["MCP Router"]
        Router <--> Resources["SibiFlux Resources"]
    end

    subgraph "Solutions Layer (Business Logic)"
        Logistics["Logistics Solutions"]
        Enrichment["Enrichment Pipelines"]
        Cubes["DataCubes"]
    end

    subgraph "SibiFlux Core Engine"
        DfHelper["DfHelper (Unified Loader)"]
        Cluster["Resilient Dask Cluster"]
        Managed["ManagedResource Lifecycle"]
    end

    Resources --> Cubes
    Logistics --> DfHelper
    Cubes --> DfHelper
    DfHelper --> Cluster
```

## Core Architecture

### 1. The Flux Engine (`sibi_flux`)
The foundational library providing resilient distributed primitives.
- **`DfHelper`**: A unified API for loading data from SQLAlchemy, Parquet, or HTTP sources into Pandas or Dask DataFrames.
- **`Dataset`**: A high-level abstraction for hybrid data loading, seamlessly merging historical (Parquet) and live (SQL) data sources.
- **`DfValidator`**: A robust schema enforcement tool that validates DataFrames against strict type maps and generates ClickHouse DDL.
- **`ArtifactOrchestrator`**: An async engine for managing concurrent artifact updates with retries, backoff, and worker isolation.
- **`ManagedResource`**: A rigorous lifecycle management system for async resources, ensuring clean shutdown, signal handling, and observability.
- **`Dask Cluster`**: A self-healing distributed runtime that detects worker failures, manages reconnection, and enforces "The Nuclear Option" for test isolation.

### 2. Agentic Interface (`mcp`)
Native support for the **Model Context Protocol (MCP)**, allowing AI agents to directly interact with SibiFlux data structures.
- **Expose DataCubes**: Automatically turn any `DataCube` into a queryable MCP Resource.
- **Tooling**: Register Python functions as tools callable by agents.

## Key Capabilities

### Hybrid Data Loading
SibiFlux implements a "Hot/Cold" architecture for seamless data access:
- **Historical Data**: Read efficiently from partitioned Parquet archives (S3/Local).
- **Live Data**: Query operational SQL databases for real-time changes.
- **Automatic Merge**: `DfHelper` and `Dataset` automatically stitch these sources together, handling schema evolution and deduplication (a rough sketch of this merge follows the list).

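For illustration only, the merge that `DfHelper` and `Dataset` automate looks roughly like this when written by hand with plain Dask calls; the paths, connection string, and column names below are placeholders, not part of the SibiFlux API:

```python
import pandas as pd
import dask.dataframe as dd

# Cold path: history already archived as partitioned Parquet (S3 or local).
historical = dd.read_parquet(
    "s3://warehouse/products/",  # placeholder path
    filters=[("created_at", ">=", pd.Timestamp("2023-01-01"))],
)

# Hot path: recent rows that still live only in the operational database.
live = dd.read_sql_table(
    "products",
    "postgresql://user:pass@db:5432/app",  # placeholder DSN
    index_col="product_id",
).reset_index()

# Stitch both sources together and deduplicate on the key so that rows
# present in both appear once, preferring the live copy.
combined = dd.concat([historical, live]).drop_duplicates(
    subset=["product_id"], keep="last"
)
```
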
### Resilient Distributed Compute
The SibiFlux Dask wrapper provides:
- **Auto-Healing**: Clients that automatically reconnect if the scheduler dies.
- **Safe Persistence**: Wrappers like `safe_persist` that retry operations on network jitter.
- **Smart Partitioning**: Automated repartitioning to prevent the "small file problem" in Parquet outputs (see the sketch below).

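A minimal sketch of how those wrappers combine before writing results; `safe_persist` is assumed here to be importable from `sibi_flux.dask_cluster` alongside `safe_compute`, and the 128 MB partition size is just an example value:

```python
import dask.dataframe as dd

from sibi_flux.dask_cluster import safe_persist  # assumed export, mirroring safe_compute

ddf = dd.read_parquet("s3://warehouse/orders/")  # placeholder path

# Re-chunk into reasonably sized partitions so the eventual Parquet output
# is not scattered across thousands of tiny files.
ddf = ddf.repartition(partition_size="128MB")

# Persist through the retrying wrapper instead of calling ddf.persist() directly.
ddf = safe_persist(ddf)
```
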
### Observability
Built-in integration with OpenTelemetry (OTel):
- Structured logging with correlation IDs.
- Distributed tracing across async boundaries (a minimal wiring sketch follows).

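The OTel integration builds on the standard `opentelemetry-api`, `opentelemetry-sdk`, and `opentelemetry-exporter-otlp` packages declared as core dependencies, so the underlying wiring can be sketched with stock OTel calls; the endpoint and service name are example values, and no SibiFlux-specific helper is shown here:

```python
from opentelemetry import trace
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
from opentelemetry.sdk.resources import Resource
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor

# Export spans to an OTLP collector.
provider = TracerProvider(resource=Resource.create({"service.name": "sibi-flux-pipeline"}))
provider.add_span_processor(BatchSpanProcessor(OTLPSpanExporter(endpoint="http://localhost:4317")))
trace.set_tracer_provider(provider)

tracer = trace.get_tracer("sibi_flux")

# OTel context propagates across await points, so spans opened here follow
# a pipeline through its async boundaries.
with tracer.start_as_current_span("load-products"):
    ...
```
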
## Quick Start

### Installation

```bash
# Base installation (Core Engine only)
pip install sibi-flux

# For Distributed Computing (Dask Cluster support)
pip install "sibi-flux[distributed]"

# For Geospatial capabilities (OSMnx, GeoPandas, etc.)
pip install "sibi-flux[geospatial]"

# For MCP Agentic Interface support
pip install "sibi-flux[mcp]"

# Complete installation (the "All-in-One" environment)
pip install "sibi-flux[complete]"
```

## API Examples

### 1. Data Loading (`Dataset`)

The `Dataset` class provides a high-level abstraction for hybrid loading.

```python
from sibi_flux import Dataset
from solutions.logistics.readers.products import ProductsParquetReader, ProductsSqlReader

class ProductsDataset(Dataset):
    historical_reader = ProductsParquetReader
    live_reader = ProductsSqlReader
    date_field = "created_at"

# Load hybrid data (merges Parquet + SQL)
ds = ProductsDataset(start_date="2023-01-01", end_date="2023-01-31")
df = await ds.aload()  # Returns a Dask DataFrame
```

### 2. Schema Validation (`DfValidator`)

Ensure your data meets strict type requirements and generate DDL for ClickHouse.

```python
from sibi_flux import DfValidator

# Define expected schema
SCHEMA = {
    "product_id": "Int64[pyarrow]",
    "price": "Float64[pyarrow]",
    "name": "string[pyarrow]"
}

validator = DfValidator(df)
validator.validate_schema(SCHEMA)
validator.standardize_data_quality()

# Generate ClickHouse DDL
ddl = validator.generate_clickhouse_ddl("products_table")
print(ddl)
```

### 3. Data Enrichment (`AsyncDfEnricher`)

Enrich a base DataFrame with data from other sources using `AttachmentSpec`.

```python
from sibi_flux.df_enricher import AsyncDfEnricher, AttachmentSpec

specs = [
    AttachmentSpec(
        key="customer_info",
        required_cols={"customer_id"},
        attachment_fn=fetch_customer_data,  # Async function returning DF
        left_on=["customer_id"],
        right_on=["id"],
        drop_cols=["id"]
    )
]

enricher = AsyncDfEnricher(base_df=orders_df, specs=specs)
enriched_df = await enricher.enrich()
```

### 4. Orchestration (`ArtifactOrchestrator`)

Manage concurrent updates of multiple artifacts with retries and worker isolation.

```python
from sibi_flux.orchestration import ArtifactOrchestrator

orchestrator = ArtifactOrchestrator(
    wrapped_classes={"daily": [ProductsDataset, OrdersDataset]},
    max_workers=3,
    retry_attempts=3
)

# Updates all 'daily' artifacts concurrently
results = await orchestrator.update_data("daily")
```

### 5. Distributed Compute (`Dask Cluster`)

Execute resilient operations that survive scheduler restarts.

```python
from sibi_flux.dask_cluster import safe_compute, get_persistent_client

client = get_persistent_client()

# Safe compute with auto-retry logic
result = safe_compute(df.groupby("category").sum())
```

### 6. Resource Management (`ManagedResource`)

Create lifecycle-safe components.

```python
from sibi_flux.core import ManagedResource

class MyResource(ManagedResource):
    async def _acleanup(self):
        await self.db_connection.close()
        self.logger.info("Cleaned up!")

async with MyResource() as res:
    await res.do_work()
# Automatically cleaned up here
```

### 7. Agentic Interface (`sibi_flux.mcp`)

Seamlessly bridge your data with AI Agents.

**Server Side (Expose Resources)**

```python
from sibi_flux.mcp import BaseMCPRouter
from products import ProductsDataset

# Create an MCP Router compatible with FastAPI
router = BaseMCPRouter(name="data-server")

# Automatically register a Dataset as an MCP Resource
# Agent can now read `sibi://ProductsDataset`
router.register_cube_resource(ProductsDataset)

# Register a custom tool
@router.tool()
def calculate_vat(amount: float) -> float:
    return amount * 0.2
```

**Client Side (Consume Resources)**

```python
from sibi_flux.mcp import GenericMcpClient

# Connect to the MCP Server
async with GenericMcpClient(url="http://localhost:8000/sse") as client:
    # Read the resource (returns JSON data from the Dataset)
    data = await client.read_resource("sibi://ProductsDataset")

    # Call a tool
    vat = await client.call_tool("calculate_vat", arguments={"amount": 100.0})
```

sibi_flux-2025.12.0/pyproject.toml
@@ -0,0 +1,163 @@
[project]
name = "sibi-flux"
version = "2025.12.0"
description = "Sibi Toolkit: A collection of tools for Data Analysis/Engineering."
readme = "README.md"
authors = [
    { name = "Luis Valverde", email = "lvalverdeb@gmail.com" }
]
requires-python = ">=3.11"
dependencies = [
    # Core Dependencies
    "pandas>=2.3.3",
    "pyarrow>=22.0.0",
    "pydantic>=2.12.5",
    "pydantic-settings>=2.12.0",
    "dask>=2025.11.0",
    "fsspec>=2025.10.0",
    "s3fs>=2025.10.0",
    "sqlalchemy>=2.0.44",
    "psycopg2>=2.9.11",
    "pymysql>=1.1.2",
    "clickhouse-connect>=0.10.0",
    "concurrent-log-handler>=0.9.28",
    "rich>=14.2.0",
    "filelock>=3.20.1",
    "tqdm>=4.67.1",
    "watchdog>=6.0.0", # Core utils usage
    "tornado==6.5.4",
    "typer>=0.21.0",
    "psutil>=6.1.1",
    "httpx>=0.28.1",
    # Logger Core Dependencies
    "opentelemetry-api>=1.38.0",
    "opentelemetry-exporter-otlp>=1.38.0",
    "opentelemetry-sdk>=1.38.0",
]

[project.optional-dependencies]
distributed = [
    "distributed>=2025.11.0",
]
mcp = [
    "sibi-flux[distributed]",
    "mcp>=1.1.2",
    "fastapi>=0.127.0",
    "uvicorn>=0.40.0",
    "httpx>=0.28.1",
]
geospatial = [
    "osmnx>=2.0.7",
    "geopandas>=1.1.2",
    "geopy>=2.4.1",
    "folium>=0.20.0",
    "osmium>=4.2.0",
    "shapely>=2.0.0",
    "networkx>=3.6.1",
]
complete = [
    "sibi-flux[distributed,geospatial,mcp]"
]

[dependency-groups]
dev = [
    "black>=25.11.0",
    "bokeh>=3.8.0",
    "graphviz>=0.21",
    "jupyter>=1.1.1",
    "poethepoet>=0.38.0",
    "notebook>=7.5.0",
    "pytest>=9.0.1",
    "pytest-asyncio>=1.3.0",
    "ruff>=0.14.9",
    "httpx>=0.28.1",
]
geospatial = [
    "folium>=0.20.0",
    "geopandas>=1.1.2",
    "geopy>=2.4.1",
    "networkx>=3.6.1",
    "osmnx>=2.0.7",
]

[build-system]
requires = ["uv_build>=0.9.5,<0.10.0"]
build-backend = "uv_build"

[tool.uv.build-backend]
module-root = "src"
module-name = ["sibi_flux", "sibi_dst"]

[tool.pytest.ini_options]
pythonpath = ["src", "."]
testpaths = ["tests"]
addopts = "-v"
filterwarnings = ["ignore::DeprecationWarning"]

[tool.poe.tasks]
dev = """
uvicorn solutions.main:app
--reload
--reload-dir solutions
--host 0.0.0.0
--port 6500
--env-file .env.linux
--workers 1
"""
test = { cmd = "pytest tests/" }
lint = "black src/"

[tool.commitizen]
name = "cz_conventional_commits"
version = "1.0.0"
tag_format = "v$version"

[tool.mypy]
python_version = "3.11"
mypy_path = "src"
explicit_package_bases = true
namespace_packages = true
warn_return_any = false
warn_unused_configs = true
check_untyped_defs = true
disallow_untyped_defs = false
ignore_missing_imports = false

# Ignore missing imports for specific 3rd party libraries without strict typing
[[tool.mypy.overrides]]
module = [
    "pandas.*",
    "dask.*",
    "distributed.*",
    "fsspec.*",
    "s3fs.*",
    "sqlalchemy.*",
    "pyarrow.*",
    "clickhouse_connect.*",
    "rich.*",
    "filelock.*",
    "tqdm.*",
    "watchdog.*",
    "tornado.*",
    "typer.*",
    "fastapi.*",
    "uvicorn.*",
    "httpx.*",
    "osmnx.*",
    "geopandas.*",
    "geopy.*",
    "folium.*",
    "osmium.*",
    "shapely.*",
    "networkx.*",
    "opentelemetry.*",
    "psutil.*",
    "pytest.*",
    "mcp.*",
    "yaml.*",
    "pydantic.*",
    "pydantic_settings.*",
]
ignore_missing_imports = true
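
For reference, the dependency groups and tasks declared above are typically driven with the stock `uv` and `poethepoet` CLIs (these are those tools' own commands, nothing SibiFlux-specific):

```bash
# Install the project together with the dev dependency group.
uv sync --group dev

# Run the tasks declared under [tool.poe.tasks].
poe test   # runs: pytest tests/
poe lint   # runs: black src/
```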