sibi-flux 2025.12.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (110)
  1. sibi_flux-2025.12.0/PKG-INFO +283 -0
  2. sibi_flux-2025.12.0/README.md +233 -0
  3. sibi_flux-2025.12.0/pyproject.toml +163 -0
  4. sibi_flux-2025.12.0/src/sibi_dst/__init__.py +44 -0
  5. sibi_flux-2025.12.0/src/sibi_flux/__init__.py +49 -0
  6. sibi_flux-2025.12.0/src/sibi_flux/artifacts/__init__.py +7 -0
  7. sibi_flux-2025.12.0/src/sibi_flux/artifacts/base.py +166 -0
  8. sibi_flux-2025.12.0/src/sibi_flux/artifacts/parquet.py +360 -0
  9. sibi_flux-2025.12.0/src/sibi_flux/artifacts/parquet_engine/__init__.py +5 -0
  10. sibi_flux-2025.12.0/src/sibi_flux/artifacts/parquet_engine/executor.py +204 -0
  11. sibi_flux-2025.12.0/src/sibi_flux/artifacts/parquet_engine/manifest.py +101 -0
  12. sibi_flux-2025.12.0/src/sibi_flux/artifacts/parquet_engine/planner.py +544 -0
  13. sibi_flux-2025.12.0/src/sibi_flux/conf/settings.py +131 -0
  14. sibi_flux-2025.12.0/src/sibi_flux/core/__init__.py +5 -0
  15. sibi_flux-2025.12.0/src/sibi_flux/core/managed_resource/__init__.py +3 -0
  16. sibi_flux-2025.12.0/src/sibi_flux/core/managed_resource/_managed_resource.py +733 -0
  17. sibi_flux-2025.12.0/src/sibi_flux/core/type_maps/__init__.py +100 -0
  18. sibi_flux-2025.12.0/src/sibi_flux/dask_cluster/__init__.py +47 -0
  19. sibi_flux-2025.12.0/src/sibi_flux/dask_cluster/async_core.py +27 -0
  20. sibi_flux-2025.12.0/src/sibi_flux/dask_cluster/client_manager.py +549 -0
  21. sibi_flux-2025.12.0/src/sibi_flux/dask_cluster/core.py +322 -0
  22. sibi_flux-2025.12.0/src/sibi_flux/dask_cluster/exceptions.py +34 -0
  23. sibi_flux-2025.12.0/src/sibi_flux/dask_cluster/utils.py +49 -0
  24. sibi_flux-2025.12.0/src/sibi_flux/datacube/__init__.py +3 -0
  25. sibi_flux-2025.12.0/src/sibi_flux/datacube/_data_cube.py +332 -0
  26. sibi_flux-2025.12.0/src/sibi_flux/datacube/config_engine.py +152 -0
  27. sibi_flux-2025.12.0/src/sibi_flux/datacube/field_factory.py +48 -0
  28. sibi_flux-2025.12.0/src/sibi_flux/datacube/field_registry.py +122 -0
  29. sibi_flux-2025.12.0/src/sibi_flux/datacube/generator.py +677 -0
  30. sibi_flux-2025.12.0/src/sibi_flux/datacube/orchestrator.py +171 -0
  31. sibi_flux-2025.12.0/src/sibi_flux/dataset/__init__.py +3 -0
  32. sibi_flux-2025.12.0/src/sibi_flux/dataset/_dataset.py +162 -0
  33. sibi_flux-2025.12.0/src/sibi_flux/df_enricher/__init__.py +56 -0
  34. sibi_flux-2025.12.0/src/sibi_flux/df_enricher/async_enricher.py +201 -0
  35. sibi_flux-2025.12.0/src/sibi_flux/df_enricher/merger.py +253 -0
  36. sibi_flux-2025.12.0/src/sibi_flux/df_enricher/specs.py +45 -0
  37. sibi_flux-2025.12.0/src/sibi_flux/df_enricher/types.py +12 -0
  38. sibi_flux-2025.12.0/src/sibi_flux/df_helper/__init__.py +5 -0
  39. sibi_flux-2025.12.0/src/sibi_flux/df_helper/_df_helper.py +450 -0
  40. sibi_flux-2025.12.0/src/sibi_flux/df_helper/backends/__init__.py +34 -0
  41. sibi_flux-2025.12.0/src/sibi_flux/df_helper/backends/_params.py +173 -0
  42. sibi_flux-2025.12.0/src/sibi_flux/df_helper/backends/_strategies.py +295 -0
  43. sibi_flux-2025.12.0/src/sibi_flux/df_helper/backends/http/__init__.py +5 -0
  44. sibi_flux-2025.12.0/src/sibi_flux/df_helper/backends/http/_http_config.py +122 -0
  45. sibi_flux-2025.12.0/src/sibi_flux/df_helper/backends/parquet/__init__.py +7 -0
  46. sibi_flux-2025.12.0/src/sibi_flux/df_helper/backends/parquet/_parquet_options.py +268 -0
  47. sibi_flux-2025.12.0/src/sibi_flux/df_helper/backends/sqlalchemy/__init__.py +9 -0
  48. sibi_flux-2025.12.0/src/sibi_flux/df_helper/backends/sqlalchemy/_db_connection.py +256 -0
  49. sibi_flux-2025.12.0/src/sibi_flux/df_helper/backends/sqlalchemy/_db_gatekeeper.py +15 -0
  50. sibi_flux-2025.12.0/src/sibi_flux/df_helper/backends/sqlalchemy/_io_dask.py +386 -0
  51. sibi_flux-2025.12.0/src/sibi_flux/df_helper/backends/sqlalchemy/_load_from_db.py +134 -0
  52. sibi_flux-2025.12.0/src/sibi_flux/df_helper/backends/sqlalchemy/_model_registry.py +239 -0
  53. sibi_flux-2025.12.0/src/sibi_flux/df_helper/backends/sqlalchemy/_sql_model_builder.py +42 -0
  54. sibi_flux-2025.12.0/src/sibi_flux/df_helper/backends/utils.py +32 -0
  55. sibi_flux-2025.12.0/src/sibi_flux/df_helper/core/__init__.py +15 -0
  56. sibi_flux-2025.12.0/src/sibi_flux/df_helper/core/_defaults.py +104 -0
  57. sibi_flux-2025.12.0/src/sibi_flux/df_helper/core/_filter_handler.py +617 -0
  58. sibi_flux-2025.12.0/src/sibi_flux/df_helper/core/_params_config.py +185 -0
  59. sibi_flux-2025.12.0/src/sibi_flux/df_helper/core/_query_config.py +17 -0
  60. sibi_flux-2025.12.0/src/sibi_flux/df_validator/__init__.py +3 -0
  61. sibi_flux-2025.12.0/src/sibi_flux/df_validator/_df_validator.py +222 -0
  62. sibi_flux-2025.12.0/src/sibi_flux/logger/__init__.py +1 -0
  63. sibi_flux-2025.12.0/src/sibi_flux/logger/_logger.py +480 -0
  64. sibi_flux-2025.12.0/src/sibi_flux/mcp/__init__.py +26 -0
  65. sibi_flux-2025.12.0/src/sibi_flux/mcp/client.py +150 -0
  66. sibi_flux-2025.12.0/src/sibi_flux/mcp/router.py +126 -0
  67. sibi_flux-2025.12.0/src/sibi_flux/orchestration/__init__.py +9 -0
  68. sibi_flux-2025.12.0/src/sibi_flux/orchestration/_artifact_orchestrator.py +346 -0
  69. sibi_flux-2025.12.0/src/sibi_flux/orchestration/_pipeline_executor.py +212 -0
  70. sibi_flux-2025.12.0/src/sibi_flux/osmnx_helper/__init__.py +22 -0
  71. sibi_flux-2025.12.0/src/sibi_flux/osmnx_helper/_pbf_handler.py +384 -0
  72. sibi_flux-2025.12.0/src/sibi_flux/osmnx_helper/graph_loader.py +225 -0
  73. sibi_flux-2025.12.0/src/sibi_flux/osmnx_helper/utils.py +100 -0
  74. sibi_flux-2025.12.0/src/sibi_flux/pipelines/__init__.py +3 -0
  75. sibi_flux-2025.12.0/src/sibi_flux/pipelines/base.py +218 -0
  76. sibi_flux-2025.12.0/src/sibi_flux/py.typed +0 -0
  77. sibi_flux-2025.12.0/src/sibi_flux/readers/__init__.py +3 -0
  78. sibi_flux-2025.12.0/src/sibi_flux/readers/base.py +82 -0
  79. sibi_flux-2025.12.0/src/sibi_flux/readers/parquet.py +106 -0
  80. sibi_flux-2025.12.0/src/sibi_flux/utils/__init__.py +53 -0
  81. sibi_flux-2025.12.0/src/sibi_flux/utils/boilerplate/__init__.py +19 -0
  82. sibi_flux-2025.12.0/src/sibi_flux/utils/boilerplate/base_attacher.py +45 -0
  83. sibi_flux-2025.12.0/src/sibi_flux/utils/boilerplate/base_cube_router.py +283 -0
  84. sibi_flux-2025.12.0/src/sibi_flux/utils/boilerplate/base_data_cube.py +132 -0
  85. sibi_flux-2025.12.0/src/sibi_flux/utils/boilerplate/base_pipeline_template.py +54 -0
  86. sibi_flux-2025.12.0/src/sibi_flux/utils/boilerplate/hybrid_data_loader.py +193 -0
  87. sibi_flux-2025.12.0/src/sibi_flux/utils/clickhouse_writer/__init__.py +6 -0
  88. sibi_flux-2025.12.0/src/sibi_flux/utils/clickhouse_writer/_clickhouse_writer.py +225 -0
  89. sibi_flux-2025.12.0/src/sibi_flux/utils/common.py +7 -0
  90. sibi_flux-2025.12.0/src/sibi_flux/utils/credentials/__init__.py +3 -0
  91. sibi_flux-2025.12.0/src/sibi_flux/utils/credentials/_config_manager.py +155 -0
  92. sibi_flux-2025.12.0/src/sibi_flux/utils/dask_utils.py +14 -0
  93. sibi_flux-2025.12.0/src/sibi_flux/utils/data_utils/__init__.py +3 -0
  94. sibi_flux-2025.12.0/src/sibi_flux/utils/data_utils/_data_utils.py +389 -0
  95. sibi_flux-2025.12.0/src/sibi_flux/utils/dataframe_utils.py +52 -0
  96. sibi_flux-2025.12.0/src/sibi_flux/utils/date_utils/__init__.py +10 -0
  97. sibi_flux-2025.12.0/src/sibi_flux/utils/date_utils/_business_days.py +220 -0
  98. sibi_flux-2025.12.0/src/sibi_flux/utils/date_utils/_date_utils.py +311 -0
  99. sibi_flux-2025.12.0/src/sibi_flux/utils/date_utils/_file_age_checker.py +319 -0
  100. sibi_flux-2025.12.0/src/sibi_flux/utils/file_utils.py +48 -0
  101. sibi_flux-2025.12.0/src/sibi_flux/utils/filepath_generator/__init__.py +5 -0
  102. sibi_flux-2025.12.0/src/sibi_flux/utils/filepath_generator/_filepath_generator.py +185 -0
  103. sibi_flux-2025.12.0/src/sibi_flux/utils/parquet_saver/__init__.py +6 -0
  104. sibi_flux-2025.12.0/src/sibi_flux/utils/parquet_saver/_parquet_saver.py +436 -0
  105. sibi_flux-2025.12.0/src/sibi_flux/utils/parquet_saver/_write_gatekeeper.py +33 -0
  106. sibi_flux-2025.12.0/src/sibi_flux/utils/retry.py +46 -0
  107. sibi_flux-2025.12.0/src/sibi_flux/utils/storage/__init__.py +7 -0
  108. sibi_flux-2025.12.0/src/sibi_flux/utils/storage/_fs_registry.py +112 -0
  109. sibi_flux-2025.12.0/src/sibi_flux/utils/storage/_storage_manager.py +257 -0
  110. sibi_flux-2025.12.0/src/sibi_flux/utils/storage/factory.py +33 -0
@@ -0,0 +1,283 @@
+ Metadata-Version: 2.3
+ Name: sibi-flux
+ Version: 2025.12.0
+ Summary: Sibi Toolkit: A collection of tools for Data Analysis/Engineering.
+ Author: Luis Valverde
+ Author-email: Luis Valverde <lvalverdeb@gmail.com>
+ Requires-Dist: pandas>=2.3.3
+ Requires-Dist: pyarrow>=22.0.0
+ Requires-Dist: pydantic>=2.12.5
+ Requires-Dist: pydantic-settings>=2.12.0
+ Requires-Dist: dask>=2025.11.0
+ Requires-Dist: fsspec>=2025.10.0
+ Requires-Dist: s3fs>=2025.10.0
+ Requires-Dist: sqlalchemy>=2.0.44
+ Requires-Dist: psycopg2>=2.9.11
+ Requires-Dist: pymysql>=1.1.2
+ Requires-Dist: clickhouse-connect>=0.10.0
+ Requires-Dist: concurrent-log-handler>=0.9.28
+ Requires-Dist: rich>=14.2.0
+ Requires-Dist: filelock>=3.20.1
+ Requires-Dist: tqdm>=4.67.1
+ Requires-Dist: watchdog>=6.0.0
+ Requires-Dist: tornado==6.5.4
+ Requires-Dist: typer>=0.21.0
+ Requires-Dist: psutil>=6.1.1
+ Requires-Dist: httpx>=0.28.1
+ Requires-Dist: opentelemetry-api>=1.38.0
+ Requires-Dist: opentelemetry-exporter-otlp>=1.38.0
+ Requires-Dist: opentelemetry-sdk>=1.38.0
+ Requires-Dist: sibi-flux[distributed,geospatial,mcp] ; extra == 'complete'
+ Requires-Dist: distributed>=2025.11.0 ; extra == 'distributed'
+ Requires-Dist: osmnx>=2.0.7 ; extra == 'geospatial'
+ Requires-Dist: geopandas>=1.1.2 ; extra == 'geospatial'
+ Requires-Dist: geopy>=2.4.1 ; extra == 'geospatial'
+ Requires-Dist: folium>=0.20.0 ; extra == 'geospatial'
+ Requires-Dist: osmium>=4.2.0 ; extra == 'geospatial'
+ Requires-Dist: shapely>=2.0.0 ; extra == 'geospatial'
+ Requires-Dist: networkx>=3.6.1 ; extra == 'geospatial'
+ Requires-Dist: sibi-flux[distributed] ; extra == 'mcp'
+ Requires-Dist: mcp>=1.1.2 ; extra == 'mcp'
+ Requires-Dist: fastapi>=0.127.0 ; extra == 'mcp'
+ Requires-Dist: uvicorn>=0.40.0 ; extra == 'mcp'
+ Requires-Dist: httpx>=0.28.1 ; extra == 'mcp'
+ Requires-Python: >=3.11
+ Provides-Extra: complete
+ Provides-Extra: distributed
+ Provides-Extra: geospatial
+ Provides-Extra: mcp
+ Description-Content-Type: text/markdown
+
+ # SibiFlux
+
+ **SibiFlux** is a production-grade, resilient data engineering ecosystem designed to bridge the gap between local development, distributed computing, and agentic AI workflows. It provides a unified engine for hybrid data loading (batch + streaming), self-healing distributed operations, and native interfaces for AI agents via the Model Context Protocol (MCP).
+
+ ```mermaid
+ graph TD
+     subgraph "Agentic Interface (MCP)"
+         Agent["AI Agent / Claude"] <--> Router["MCP Router"]
+         Router <--> Resources["SibiFlux Resources"]
+     end
+
+     subgraph "Solutions Layer (Business Logic)"
+         Logistics["Logistics Solutions"]
+         Enrichment["Enrichment Pipelines"]
+         Cubes["DataCubes"]
+     end
+
+     subgraph "SibiFlux Core Engine"
+         DfHelper["DfHelper (Unified Loader)"]
+         Cluster["Resilient Dask Cluster"]
+         Managed["ManagedResource Lifecycle"]
+     end
+
+     Resources --> Cubes
+     Logistics --> DfHelper
+     Cubes --> DfHelper
+     DfHelper --> Cluster
+ ```
+
+ ## Core Architecture
+
+ ### 1. The Flux Engine (`sibi_flux`)
+ The foundational library providing resilient distributed primitives.
+ - **`DfHelper`**: A unified API for loading data from SQLAlchemy, Parquet, or HTTP sources into Pandas or Dask DataFrames (a rough usage sketch follows this list).
+ - **`Dataset`**: A high-level abstraction for hybrid data loading, seamlessly merging historical (Parquet) and live (SQL) data sources.
+ - **`DfValidator`**: A robust schema enforcement tool that validates DataFrames against strict type maps and generates ClickHouse DDL.
+ - **`ArtifactOrchestrator`**: An async engine for managing concurrent artifact updates with retries, backoff, and worker isolation.
+ - **`ManagedResource`**: A rigorous lifecycle management system for async resources, ensuring clean shutdown, signal handling, and observability.
+ - **`Dask Cluster`**: A self-healing distributed runtime that detects worker failures, manages re-connection, and enforces "The Nuclear Option" for test isolation.
+
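+ As a rough illustration only: the exact `DfHelper` constructor and method names are not documented in this README, so the import path, parameters, and `load()` call below are assumptions rather than the confirmed API.
+
+ ```python
+ from sibi_flux.df_helper import DfHelper  # import path assumed from the package layout
+
+ # Hypothetical arguments -- substitute the real DfHelper configuration.
+ helper = DfHelper(
+     backend="sqlalchemy",                                   # or "parquet" / "http"
+     connection_url="postgresql://user:pass@db:5432/shop",   # illustrative DSN
+     table="products",
+ )
+
+ # Assumed filter syntax; returns a Pandas or Dask DataFrame per the bullet above.
+ df = helper.load(created_at__gte="2023-01-01")
+ ```
+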
+ ### 2. Agentic Interface (`mcp`)
+ Native support for the **Model Context Protocol (MCP)**, allowing AI agents to interact directly with SibiFlux data structures.
+ - **Expose DataCubes**: Automatically turn any `DataCube` into a queryable MCP Resource.
+ - **Tooling**: Register Python functions as tools callable by agents.
+
+ ## Key Capabilities
+
+ ### Hybrid Data Loading
+ SibiFlux implements a "Hot/Cold" architecture for seamless data access:
+ - **Historical Data**: Read efficiently from partitioned Parquet archives (S3/Local).
+ - **Live Data**: Query operational SQL databases for real-time changes.
+ - **Automatic Merge**: `DfHelper` and `Dataset` automatically stitch these sources together, handling schema evolution and deduplication.
+
+ ### Resilient Distributed Compute
+ The SibiFlux Dask wrapper provides:
+ - **Auto-Healing**: Clients that automatically reconnect if the scheduler dies.
+ - **Safe Persistence**: Wrappers like `safe_persist` that retry operations through transient network jitter (see the sketch after this list).
+ - **Smart Partitioning**: Automated repartitioning to prevent the "small file problem" in Parquet outputs.
+
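+ A minimal sketch of the retrying persist wrapper in use; it mirrors the `safe_compute` example further down, and the `sibi_flux.dask_cluster` import path for `safe_persist` is assumed rather than documented here.
+
+ ```python
+ import dask.dataframe as dd
+ from sibi_flux.dask_cluster import get_persistent_client, safe_persist  # safe_persist path assumed
+
+ client = get_persistent_client()
+ ddf = dd.read_parquet("s3://warehouse/orders/2023/")
+
+ # Persist the collection on the cluster; the wrapper retries on transient worker/network failures.
+ ddf = safe_persist(ddf)
+
+ daily_totals = ddf.groupby("order_date")["amount"].sum()
+ ```
+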
+ ### Observability
+ Built-in integration with OpenTelemetry (OTel), sketched below:
+ - Structured logging with correlation IDs.
+ - Distributed tracing across async boundaries.
+
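+ The SibiFlux logger API itself is not shown in this README; the sketch below uses the plain `opentelemetry` packages the project depends on to illustrate the span/correlation-ID pattern described above.
+
+ ```python
+ from opentelemetry import trace
+ from opentelemetry.sdk.trace import TracerProvider
+ from opentelemetry.sdk.trace.export import BatchSpanProcessor, ConsoleSpanExporter
+
+ # Standard OTel wiring; SibiFlux presumably swaps in an OTLP exporter.
+ provider = TracerProvider()
+ provider.add_span_processor(BatchSpanProcessor(ConsoleSpanExporter()))
+ trace.set_tracer_provider(provider)
+
+ tracer = trace.get_tracer("sibi_flux.example")
+
+ with tracer.start_as_current_span("load_products") as span:
+     span.set_attribute("rows_loaded", 1250)
+     # The active span's trace id can double as a correlation id on log records.
+     correlation_id = format(span.get_span_context().trace_id, "032x")
+ ```
+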
+ ## Quick Start
+
+ ### Installation
+
+ ```bash
+ # Base installation (Core Engine only)
+ pip install sibi-flux
+
+ # For Distributed Computing (Dask Cluster support)
+ pip install "sibi-flux[distributed]"
+
+ # For Geospatial capabilities (OSMnx, GeoPandas, etc.)
+ pip install "sibi-flux[geospatial]"
+
+ # For MCP Agentic Interface support
+ pip install "sibi-flux[mcp]"
+
+ # Complete installation (all optional extras: distributed, geospatial, mcp)
+ pip install "sibi-flux[complete]"
+ ```
+
+ ## API Examples
+
+ ### 1. Data Loading (`Dataset`)
+
+ The `Dataset` class provides a high-level abstraction for hybrid loading.
+
+ ```python
+ from sibi_flux import Dataset
+ from solutions.logistics.readers.products import ProductsParquetReader, ProductsSqlReader
+
+ class ProductsDataset(Dataset):
+     historical_reader = ProductsParquetReader
+     live_reader = ProductsSqlReader
+     date_field = "created_at"
+
+ # Load hybrid data (merges Parquet + SQL)
+ ds = ProductsDataset(start_date="2023-01-01", end_date="2023-01-31")
+ df = await ds.aload()  # Returns a Dask DataFrame
+ ```
+
+ ### 2. Schema Validation (`DfValidator`)
+
+ Ensure your data meets strict type requirements and generate DDL for ClickHouse.
+
+ ```python
+ from sibi_flux import DfValidator
+
+ # Define expected schema
+ SCHEMA = {
+     "product_id": "Int64[pyarrow]",
+     "price": "Float64[pyarrow]",
+     "name": "string[pyarrow]"
+ }
+
+ validator = DfValidator(df)
+ validator.validate_schema(SCHEMA)
+ validator.standardize_data_quality()
+
+ # Generate ClickHouse DDL
+ ddl = validator.generate_clickhouse_ddl("products_table")
+ print(ddl)
+ ```
+
+ ### 3. Data Enrichment (`AsyncDfEnricher`)
+
+ Enrich a base DataFrame with data from other sources using `AttachmentSpec`.
+
+ ```python
+ from sibi_flux.df_enricher import AsyncDfEnricher, AttachmentSpec
+
+ specs = [
+     AttachmentSpec(
+         key="customer_info",
+         required_cols={"customer_id"},
+         attachment_fn=fetch_customer_data,  # Async function returning a DataFrame
+         left_on=["customer_id"],
+         right_on=["id"],
+         drop_cols=["id"]
+     )
+ ]
+
+ enricher = AsyncDfEnricher(base_df=orders_df, specs=specs)
+ enriched_df = await enricher.enrich()
+ ```
+
+ ### 4. Orchestration (`ArtifactOrchestrator`)
+
+ Manage concurrent updates of multiple artifacts with retries and worker isolation.
+
+ ```python
+ from sibi_flux.orchestration import ArtifactOrchestrator
+
+ orchestrator = ArtifactOrchestrator(
+     wrapped_classes={"daily": [ProductsDataset, OrdersDataset]},
+     max_workers=3,
+     retry_attempts=3
+ )
+
+ # Updates all 'daily' artifacts concurrently
+ results = await orchestrator.update_data("daily")
+ ```
+
+ ### 5. Distributed Compute (`Dask Cluster`)
+
+ Execute resilient operations that survive scheduler restarts.
+
+ ```python
+ from sibi_flux.dask_cluster import safe_compute, get_persistent_client
+
+ client = get_persistent_client()
+
+ # Safe compute with auto-retry logic
+ result = safe_compute(df.groupby("category").sum())
+ ```
+
+ ### 6. Resource Management (`ManagedResource`)
+
+ Create lifecycle-safe components.
+
+ ```python
+ from sibi_flux.core import ManagedResource
+
+ class MyResource(ManagedResource):
+     async def _acleanup(self):
+         await self.db_connection.close()
+         self.logger.info("Cleaned up!")
+
+ async with MyResource() as res:
+     await res.do_work()
+ # Automatically cleaned up here
+ ```
+
+ ### 7. Agentic Interface (`sibi_flux.mcp`)
+
+ Seamlessly bridge your data with AI Agents.
+
+ **Server Side (Expose Resources)**
+
+ ```python
+ from sibi_flux.mcp import BaseMCPRouter
+ from products import ProductsDataset
+
+ # Create an MCP Router compatible with FastAPI
+ router = BaseMCPRouter(name="data-server")
+
+ # Automatically register a Dataset as an MCP Resource
+ # Agent can now read `sibi://ProductsDataset`
+ router.register_cube_resource(ProductsDataset)
+
+ # Register a custom tool
+ @router.tool()
+ def calculate_vat(amount: float) -> float:
+     return amount * 0.2
+ ```
+
+ **Client Side (Consume Resources)**
+
+ ```python
+ from sibi_flux.mcp import GenericMcpClient
+
+ # Connect to the MCP Server
+ async with GenericMcpClient(url="http://localhost:8000/sse") as client:
+     # Read the resource (returns JSON data from the Dataset)
+     data = await client.read_resource("sibi://ProductsDataset")
+
+     # Call a tool
+     vat = await client.call_tool("calculate_vat", arguments={"amount": 100.0})
+ ```
@@ -0,0 +1,233 @@
+ # SibiFlux
+
+ **SibiFlux** is a production-grade, resilient data engineering ecosystem designed to bridge the gap between local development, distributed computing, and agentic AI workflows. It provides a unified engine for hybrid data loading (batch + streaming), self-healing distributed operations, and native interfaces for AI agents via the Model Context Protocol (MCP).
+
+ ```mermaid
+ graph TD
+     subgraph "Agentic Interface (MCP)"
+         Agent["AI Agent / Claude"] <--> Router["MCP Router"]
+         Router <--> Resources["SibiFlux Resources"]
+     end
+
+     subgraph "Solutions Layer (Business Logic)"
+         Logistics["Logistics Solutions"]
+         Enrichment["Enrichment Pipelines"]
+         Cubes["DataCubes"]
+     end
+
+     subgraph "SibiFlux Core Engine"
+         DfHelper["DfHelper (Unified Loader)"]
+         Cluster["Resilient Dask Cluster"]
+         Managed["ManagedResource Lifecycle"]
+     end
+
+     Resources --> Cubes
+     Logistics --> DfHelper
+     Cubes --> DfHelper
+     DfHelper --> Cluster
+ ```
+
+ ## Core Architecture
+
+ ### 1. The Flux Engine (`sibi_flux`)
+ The foundational library providing resilient distributed primitives.
+ - **`DfHelper`**: A unified API for loading data from SQLAlchemy, Parquet, or HTTP sources into Pandas or Dask DataFrames (a rough usage sketch follows this list).
+ - **`Dataset`**: A high-level abstraction for hybrid data loading, seamlessly merging historical (Parquet) and live (SQL) data sources.
+ - **`DfValidator`**: A robust schema enforcement tool that validates DataFrames against strict type maps and generates ClickHouse DDL.
+ - **`ArtifactOrchestrator`**: An async engine for managing concurrent artifact updates with retries, backoff, and worker isolation.
+ - **`ManagedResource`**: A rigorous lifecycle management system for async resources, ensuring clean shutdown, signal handling, and observability.
+ - **`Dask Cluster`**: A self-healing distributed runtime that detects worker failures, manages re-connection, and enforces "The Nuclear Option" for test isolation.
+
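+ As a rough illustration only: the exact `DfHelper` constructor and method names are not documented in this README, so the import path, parameters, and `load()` call below are assumptions rather than the confirmed API.
+
+ ```python
+ from sibi_flux.df_helper import DfHelper  # import path assumed from the package layout
+
+ # Hypothetical arguments -- substitute the real DfHelper configuration.
+ helper = DfHelper(
+     backend="sqlalchemy",                                   # or "parquet" / "http"
+     connection_url="postgresql://user:pass@db:5432/shop",   # illustrative DSN
+     table="products",
+ )
+
+ # Assumed filter syntax; returns a Pandas or Dask DataFrame per the bullet above.
+ df = helper.load(created_at__gte="2023-01-01")
+ ```
+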
+ ### 2. Agentic Interface (`mcp`)
+ Native support for the **Model Context Protocol (MCP)**, allowing AI agents to interact directly with SibiFlux data structures.
+ - **Expose DataCubes**: Automatically turn any `DataCube` into a queryable MCP Resource.
+ - **Tooling**: Register Python functions as tools callable by agents.
+
+ ## Key Capabilities
+
+ ### Hybrid Data Loading
+ SibiFlux implements a "Hot/Cold" architecture for seamless data access:
+ - **Historical Data**: Read efficiently from partitioned Parquet archives (S3/Local).
+ - **Live Data**: Query operational SQL databases for real-time changes.
+ - **Automatic Merge**: `DfHelper` and `Dataset` automatically stitch these sources together, handling schema evolution and deduplication.
+
+ ### Resilient Distributed Compute
+ The SibiFlux Dask wrapper provides:
+ - **Auto-Healing**: Clients that automatically reconnect if the scheduler dies.
+ - **Safe Persistence**: Wrappers like `safe_persist` that retry operations through transient network jitter (see the sketch after this list).
+ - **Smart Partitioning**: Automated repartitioning to prevent the "small file problem" in Parquet outputs.
+
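+ A minimal sketch of the retrying persist wrapper in use; it mirrors the `safe_compute` example further down, and the `sibi_flux.dask_cluster` import path for `safe_persist` is assumed rather than documented here.
+
+ ```python
+ import dask.dataframe as dd
+ from sibi_flux.dask_cluster import get_persistent_client, safe_persist  # safe_persist path assumed
+
+ client = get_persistent_client()
+ ddf = dd.read_parquet("s3://warehouse/orders/2023/")
+
+ # Persist the collection on the cluster; the wrapper retries on transient worker/network failures.
+ ddf = safe_persist(ddf)
+
+ daily_totals = ddf.groupby("order_date")["amount"].sum()
+ ```
+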
+ ### Observability
+ Built-in integration with OpenTelemetry (OTel), sketched below:
+ - Structured logging with correlation IDs.
+ - Distributed tracing across async boundaries.
+
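+ The SibiFlux logger API itself is not shown in this README; the sketch below uses the plain `opentelemetry` packages the project depends on to illustrate the span/correlation-ID pattern described above.
+
+ ```python
+ from opentelemetry import trace
+ from opentelemetry.sdk.trace import TracerProvider
+ from opentelemetry.sdk.trace.export import BatchSpanProcessor, ConsoleSpanExporter
+
+ # Standard OTel wiring; SibiFlux presumably swaps in an OTLP exporter.
+ provider = TracerProvider()
+ provider.add_span_processor(BatchSpanProcessor(ConsoleSpanExporter()))
+ trace.set_tracer_provider(provider)
+
+ tracer = trace.get_tracer("sibi_flux.example")
+
+ with tracer.start_as_current_span("load_products") as span:
+     span.set_attribute("rows_loaded", 1250)
+     # The active span's trace id can double as a correlation id on log records.
+     correlation_id = format(span.get_span_context().trace_id, "032x")
+ ```
+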
+ ## Quick Start
+
+ ### Installation
+
+ ```bash
+ # Base installation (Core Engine only)
+ pip install sibi-flux
+
+ # For Distributed Computing (Dask Cluster support)
+ pip install "sibi-flux[distributed]"
+
+ # For Geospatial capabilities (OSMnx, GeoPandas, etc.)
+ pip install "sibi-flux[geospatial]"
+
+ # For MCP Agentic Interface support
+ pip install "sibi-flux[mcp]"
+
+ # Complete installation (all optional extras: distributed, geospatial, mcp)
+ pip install "sibi-flux[complete]"
+ ```
+
+ ## API Examples
+
+ ### 1. Data Loading (`Dataset`)
+
+ The `Dataset` class provides a high-level abstraction for hybrid loading.
+
+ ```python
+ from sibi_flux import Dataset
+ from solutions.logistics.readers.products import ProductsParquetReader, ProductsSqlReader
+
+ class ProductsDataset(Dataset):
+     historical_reader = ProductsParquetReader
+     live_reader = ProductsSqlReader
+     date_field = "created_at"
+
+ # Load hybrid data (merges Parquet + SQL)
+ ds = ProductsDataset(start_date="2023-01-01", end_date="2023-01-31")
+ df = await ds.aload()  # Returns a Dask DataFrame
+ ```
+
+ ### 2. Schema Validation (`DfValidator`)
+
+ Ensure your data meets strict type requirements and generate DDL for ClickHouse.
+
+ ```python
+ from sibi_flux import DfValidator
+
+ # Define expected schema
+ SCHEMA = {
+     "product_id": "Int64[pyarrow]",
+     "price": "Float64[pyarrow]",
+     "name": "string[pyarrow]"
+ }
+
+ validator = DfValidator(df)
+ validator.validate_schema(SCHEMA)
+ validator.standardize_data_quality()
+
+ # Generate ClickHouse DDL
+ ddl = validator.generate_clickhouse_ddl("products_table")
+ print(ddl)
+ ```
+
+ ### 3. Data Enrichment (`AsyncDfEnricher`)
+
+ Enrich a base DataFrame with data from other sources using `AttachmentSpec`.
+
+ ```python
+ from sibi_flux.df_enricher import AsyncDfEnricher, AttachmentSpec
+
+ specs = [
+     AttachmentSpec(
+         key="customer_info",
+         required_cols={"customer_id"},
+         attachment_fn=fetch_customer_data,  # Async function returning a DataFrame
+         left_on=["customer_id"],
+         right_on=["id"],
+         drop_cols=["id"]
+     )
+ ]
+
+ enricher = AsyncDfEnricher(base_df=orders_df, specs=specs)
+ enriched_df = await enricher.enrich()
+ ```
+
+ ### 4. Orchestration (`ArtifactOrchestrator`)
+
+ Manage concurrent updates of multiple artifacts with retries and worker isolation.
+
+ ```python
+ from sibi_flux.orchestration import ArtifactOrchestrator
+
+ orchestrator = ArtifactOrchestrator(
+     wrapped_classes={"daily": [ProductsDataset, OrdersDataset]},
+     max_workers=3,
+     retry_attempts=3
+ )
+
+ # Updates all 'daily' artifacts concurrently
+ results = await orchestrator.update_data("daily")
+ ```
+
+ ### 5. Distributed Compute (`Dask Cluster`)
+
+ Execute resilient operations that survive scheduler restarts.
+
+ ```python
+ from sibi_flux.dask_cluster import safe_compute, get_persistent_client
+
+ client = get_persistent_client()
+
+ # Safe compute with auto-retry logic
+ result = safe_compute(df.groupby("category").sum())
+ ```
+
+ ### 6. Resource Management (`ManagedResource`)
+
+ Create lifecycle-safe components.
+
+ ```python
+ from sibi_flux.core import ManagedResource
+
+ class MyResource(ManagedResource):
+     async def _acleanup(self):
+         await self.db_connection.close()
+         self.logger.info("Cleaned up!")
+
+ async with MyResource() as res:
+     await res.do_work()
+ # Automatically cleaned up here
+ ```
+
+ ### 7. Agentic Interface (`sibi_flux.mcp`)
+
+ Seamlessly bridge your data with AI Agents.
+
+ **Server Side (Expose Resources)**
+
+ ```python
+ from sibi_flux.mcp import BaseMCPRouter
+ from products import ProductsDataset
+
+ # Create an MCP Router compatible with FastAPI
+ router = BaseMCPRouter(name="data-server")
+
+ # Automatically register a Dataset as an MCP Resource
+ # Agent can now read `sibi://ProductsDataset`
+ router.register_cube_resource(ProductsDataset)
+
+ # Register a custom tool
+ @router.tool()
+ def calculate_vat(amount: float) -> float:
+     return amount * 0.2
+ ```
+
+ **Client Side (Consume Resources)**
+
+ ```python
+ from sibi_flux.mcp import GenericMcpClient
+
+ # Connect to the MCP Server
+ async with GenericMcpClient(url="http://localhost:8000/sse") as client:
+     # Read the resource (returns JSON data from the Dataset)
+     data = await client.read_resource("sibi://ProductsDataset")
+
+     # Call a tool
+     vat = await client.call_tool("calculate_vat", arguments={"amount": 100.0})
+ ```
@@ -0,0 +1,163 @@
+ [project]
+ name = "sibi-flux"
+ version = "2025.12.0"
+ description = "Sibi Toolkit: A collection of tools for Data Analysis/Engineering."
+ readme = "README.md"
+ authors = [
+     { name = "Luis Valverde", email = "lvalverdeb@gmail.com" }
+ ]
+ requires-python = ">=3.11"
+ dependencies = [
+     # Core Dependencies
+     "pandas>=2.3.3",
+     "pyarrow>=22.0.0",
+     "pydantic>=2.12.5",
+     "pydantic-settings>=2.12.0",
+     "dask>=2025.11.0",
+     "fsspec>=2025.10.0",
+     "s3fs>=2025.10.0",
+     "sqlalchemy>=2.0.44",
+     "psycopg2>=2.9.11",
+     "pymysql>=1.1.2",
+     "clickhouse-connect>=0.10.0",
+     "concurrent-log-handler>=0.9.28",
+     "rich>=14.2.0",
+     "filelock>=3.20.1",
+     "tqdm>=4.67.1",
+     "watchdog>=6.0.0", # Core utils usage
+     "tornado==6.5.4",
+     "typer>=0.21.0",
+     "psutil>=6.1.1",
+     "httpx>=0.28.1",
+     # Logger Core Dependencies
+     "opentelemetry-api>=1.38.0",
+     "opentelemetry-exporter-otlp>=1.38.0",
+     "opentelemetry-sdk>=1.38.0",
+ ]
+
+ [project.optional-dependencies]
+ distributed = [
+     "distributed>=2025.11.0",
+ ]
+ mcp = [
+     "sibi-flux[distributed]",
+     "mcp>=1.1.2",
+     "fastapi>=0.127.0",
+     "uvicorn>=0.40.0",
+     "httpx>=0.28.1",
+ ]
+ geospatial = [
+     "osmnx>=2.0.7",
+     "geopandas>=1.1.2",
+     "geopy>=2.4.1",
+     "folium>=0.20.0",
+     "osmium>=4.2.0",
+     "shapely>=2.0.0",
+     "networkx>=3.6.1",
+ ]
+ complete = [
+     "sibi-flux[distributed,geospatial,mcp]"
+ ]
+
+
+ [dependency-groups]
+ dev = [
+     "black>=25.11.0",
+     "bokeh>=3.8.0",
+     "graphviz>=0.21",
+     "jupyter>=1.1.1",
+     "poethepoet>=0.38.0",
+     "notebook>=7.5.0",
+     "pytest>=9.0.1",
+     "pytest-asyncio>=1.3.0",
+     "ruff>=0.14.9",
+     "httpx>=0.28.1",
+ ]
+ geospatial = [
+     "folium>=0.20.0",
+     "geopandas>=1.1.2",
+     "geopy>=2.4.1",
+     "networkx>=3.6.1",
+     "osmnx>=2.0.7",
+ ]
+
+ [build-system]
+ requires = ["uv_build>=0.9.5,<0.10.0"]
+ build-backend = "uv_build"
+
+ [tool.uv.build-backend]
+ module-root = "src"
+ module-name = ["sibi_flux", "sibi_dst"]
+
+ [tool.pytest.ini_options]
+ pythonpath = ["src", "."]
+ testpaths = ["tests"]
+ addopts = "-v"
+ filterwarnings = ["ignore::DeprecationWarning"]
+
+
+ [tool.poe.tasks]
+ dev = """
+ uvicorn solutions.main:app
+     --reload
+     --reload-dir solutions
+     --host 0.0.0.0
+     --port 6500
+     --env-file .env.linux
+     --workers 1
+ """
+ test = { cmd = "pytest tests/" }
+ lint = "black src/"
+
+ [tool.commitizen]
+ name = "cz_conventional_commits"
+ version = "1.0.0"
+ tag_format = "v$version"
+
+ [tool.mypy]
+ python_version = "3.11"
+ mypy_path = "src"
+ explicit_package_bases = true
+ namespace_packages = true
+ warn_return_any = false
+ warn_unused_configs = true
+ check_untyped_defs = true
+ disallow_untyped_defs = false
+ ignore_missing_imports = false
+
+ # Ignore missing imports for specific 3rd party libraries without strict typing
+ [[tool.mypy.overrides]]
+ module = [
+     "pandas.*",
+     "dask.*",
+     "distributed.*",
+     "fsspec.*",
+     "s3fs.*",
+     "sqlalchemy.*",
+     "pyarrow.*",
+     "clickhouse_connect.*",
+     "rich.*",
+     "filelock.*",
+     "tqdm.*",
+     "watchdog.*",
+     "tornado.*",
+     "typer.*",
+     "fastapi.*",
+     "uvicorn.*",
+     "httpx.*",
+     "osmnx.*",
+     "geopandas.*",
+     "geopy.*",
+     "folium.*",
+     "osmium.*",
+     "shapely.*",
+     "networkx.*",
+     "opentelemetry.*",
+     "psutil.*",
+     "pytest.*",
+     "mcp.*",
+     "yaml.*",
+     "pydantic.*",
+     "pydantic_settings.*",
+ ]
+ ignore_missing_imports = true