sibi-flux 2025.12.0-py3-none-any.whl → 2026.1.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sibi_flux/__init__.py +4 -4
- sibi_flux/artifacts/parquet_engine/executor.py +1 -1
- sibi_flux/cli.py +45 -0
- sibi_flux/config/__init__.py +3 -0
- sibi_flux/{conf → config}/settings.py +7 -7
- sibi_flux/dask_cluster/async_core.py +1 -0
- sibi_flux/dask_cluster/client_manager.py +5 -2
- sibi_flux/dask_cluster/core.py +3 -0
- sibi_flux/datacube/_data_cube.py +12 -3
- sibi_flux/datacube/cli.py +1247 -0
- sibi_flux/datacube/config_engine.py +88 -21
- sibi_flux/datacube/field_factory.py +105 -22
- sibi_flux/datacube/field_mapper.py +243 -0
- sibi_flux/datacube/field_registry.py +2 -0
- sibi_flux/datacube/generator.py +322 -90
- sibi_flux/datacube/orchestrator.py +167 -41
- sibi_flux/{utils/boilerplate/base_cube_router.py → datacube/router.py} +2 -3
- sibi_flux/dataset/_dataset.py +1 -1
- sibi_flux/df_helper/_df_helper.py +2 -1
- sibi_flux/df_helper/backends/_params.py +6 -6
- sibi_flux/df_validator/_df_validator.py +5 -0
- sibi_flux/init/__init__.py +0 -0
- sibi_flux/init/core.py +159 -0
- sibi_flux/init/discovery_updater.py +99 -0
- sibi_flux/init/env.py +86 -0
- sibi_flux/init/env_engine.py +151 -0
- sibi_flux/init/env_generator.py +554 -0
- sibi_flux/init/templates/__init__.py +0 -0
- sibi_flux/init/templates/discovery_params.yaml +45 -0
- sibi_flux/init/templates/gen_dc.py +137 -0
- sibi_flux/init/templates/property_template.yaml +10 -0
- sibi_flux/mcp/__init__.py +10 -2
- sibi_flux/mcp/router.py +1 -1
- sibi_flux/osmnx_helper/__init__.py +12 -4
- sibi_flux/parquet/__init__.py +8 -0
- sibi_flux/parquet/readers/__init__.py +4 -0
- sibi_flux/parquet/readers/base.py +82 -0
- sibi_flux/pipelines/base.py +1 -2
- sibi_flux/readers/__init__.py +2 -2
- sibi_flux/readers/base.py +2 -81
- sibi_flux/{utils/storage → storage}/_storage_manager.py +4 -4
- sibi_flux/{utils/storage → storage}/factory.py +1 -1
- sibi_flux/utils/__init__.py +1 -2
- {sibi_flux-2025.12.0.dist-info → sibi_flux-2026.1.2.dist-info}/METADATA +43 -1
- {sibi_flux-2025.12.0.dist-info → sibi_flux-2026.1.2.dist-info}/RECORD +57 -42
- sibi_flux-2026.1.2.dist-info/entry_points.txt +3 -0
- sibi_flux/utils/boilerplate/__init__.py +0 -19
- sibi_flux/utils/boilerplate/base_data_cube.py +0 -132
- sibi_flux/utils/credentials/__init__.py +0 -3
- /sibi_flux/{utils/credentials/_config_manager.py → config/manager.py} +0 -0
- /sibi_flux/{utils/boilerplate/hybrid_data_loader.py → dataset/hybrid_loader.py} +0 -0
- /sibi_flux/{utils/boilerplate/base_attacher.py → df_enricher/attacher.py} +0 -0
- /sibi_flux/{readers → parquet/readers}/parquet.py +0 -0
- /sibi_flux/{utils/parquet_saver → parquet/saver}/__init__.py +0 -0
- /sibi_flux/{utils/parquet_saver → parquet/saver}/_parquet_saver.py +0 -0
- /sibi_flux/{utils/parquet_saver → parquet/saver}/_write_gatekeeper.py +0 -0
- /sibi_flux/{utils/boilerplate/base_pipeline_template.py → pipelines/template.py} +0 -0
- /sibi_flux/{utils/storage → storage}/__init__.py +0 -0
- /sibi_flux/{utils/storage → storage}/_fs_registry.py +0 -0
- {sibi_flux-2025.12.0.dist-info → sibi_flux-2026.1.2.dist-info}/WHEEL +0 -0
sibi_flux/__init__.py
CHANGED
@@ -21,11 +21,12 @@ from sibi_flux.df_validator._df_validator import DfValidator
 
 # Artifacts
 from sibi_flux.artifacts import ParquetArtifact, BaseArtifact as Artifact
-from sibi_flux.…
+from sibi_flux.parquet import ParquetReader
 
 # Utilities (Sub-packages)
 from sibi_flux import dask_cluster
-from sibi_flux.utils import …
+from sibi_flux.utils import clickhouse_writer
+from sibi_flux import parquet
 
 
 __all__ = [
@@ -43,7 +44,6 @@ __all__ = [
     "Artifact",
     "ParquetReader",
     "dask_cluster",
-    "…
-    "parquet_saver",
+    "parquet",
    "clickhouse_writer",
 ]
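Taken together with the file list above, these import changes reflect the consolidation of parquet I/O under a new sibi_flux.parquet subpackage. A minimal sketch of the updated import surface; the exported names are taken from this diff, the usage is illustrative only:

# Illustrative: imports match the 2026.1.2 layout shown in this diff.
from sibi_flux.parquet import ParquetReader, ParquetSaver  # new subpackage
from sibi_flux import parquet, dask_cluster                # top-level re-exports

Code pinned to 2025.12.0 that imported from sibi_flux.readers or sibi_flux.utils.parquet_saver will need the new paths.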
sibi_flux/artifacts/parquet_engine/executor.py
CHANGED
@@ -13,7 +13,7 @@ import functools
 from tqdm import tqdm
 import pandas as pd
 from sibi_flux.core import ManagedResource
-from sibi_flux.…
+from sibi_flux.parquet import ParquetSaver
 
 from sibi_flux.utils import ensure_slash
 from sibi_flux.utils.retry import with_retry
sibi_flux/cli.py
ADDED
@@ -0,0 +1,45 @@
+import typer
+from typing import Optional
+from pathlib import Path
+from rich.console import Console
+from sibi_flux.init.core import initialize_project
+
+app = typer.Typer(help="Sibi Flux CLI")
+console = Console()
+
+@app.callback()
+def callback():
+    """
+    Sibi Flux CLI
+    """
+
+@app.command()
+def init(
+    project_name: str = typer.Argument(..., help="Name of the project to create"),
+    lib: bool = typer.Option(False, "--lib", help="Initialize as a library project (passed to uv init)"),
+    app: bool = typer.Option(False, "--app", help="Initialize as an application project (passed to uv init)")
+):
+    """
+    Initialize a new Sibi Flux project.
+
+    Creates a new directory <project_name>, initializes it with 'uv',
+    and adds 'sibi-flux' as a dependency.
+    """
+    initialize_project(project_name, lib, app)
+
+@app.command()
+def env(
+    project_path: Path = typer.Argument(Path("."), help="Project root directory"),
+    env_file: Optional[Path] = typer.Option(None, "--env-file", "-e", help="Path to environment file (defaults to .env)"),
+    cleanup: bool = typer.Option(False, "--cleanup", help="Remove existing configuration files"),
+    production: bool = typer.Option(False, "--production", "-p", help="Generate production skeleton (no hardcoded values)"),
+):
+    """
+    Initialize configuration files (settings.py, credentials) based on .env
+    """
+    from sibi_flux.init.env import init_env
+    init_env(project_path, env_file, cleanup=cleanup, production_mode=production)
+
+
+if __name__ == "__main__":
+    app()
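The new entry_points.txt gains three lines in this release (see the file list), so these commands are presumably also exposed as a console script. A hedged sketch of exercising the new Typer app programmatically; CliRunner is Typer's standard test harness and the flags match the definitions above, but the exact console-script name is not visible in this diff:

# Minimal sketch; assumes sibi-flux 2026.1.2 is installed.
from typer.testing import CliRunner
from sibi_flux.cli import app

runner = CliRunner()

# Scaffold a new project (wraps `uv init` per the docstring above).
# Note: this invokes the real command and creates files on disk.
result = runner.invoke(app, ["init", "demo-project", "--lib"])
print(result.exit_code, result.output)

# Generate a production settings/credentials skeleton from .env.
result = runner.invoke(app, ["env", ".", "--production"])
print(result.exit_code, result.output)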
sibi_flux/{conf → config}/settings.py
RENAMED
@@ -1,4 +1,4 @@
-from typing import Optional, Any
+from typing import Optional, Any, ClassVar
 from pydantic import SecretStr
 from pydantic_settings import BaseSettings, SettingsConfigDict
 
@@ -9,6 +9,8 @@ class SibiBaseSettings(BaseSettings):
     model_config = SettingsConfigDict(
         env_file=".env", env_file_encoding="utf-8", extra="ignore"
     )
+
+    conf_name: ClassVar[str] = ""
 
 
 class FsSettings(SibiBaseSettings):
@@ -84,13 +86,8 @@ class DatabaseSettings(SibiBaseSettings):
     """Generic SQL Database settings."""
 
     db_url: str = "sqlite:///:memory:"
-
-
-class ClickhouseBaseSettings(SibiBaseSettings):
-    """Base settings for ClickHouse connection."""
-
     host: str = "localhost"
-    port: int = …
+    port: int = 5432
     database: str = "default"
     user: str = "default"
     password: SecretStr = SecretStr("secret")
@@ -102,9 +99,12 @@ class ClickhouseBaseSettings(SibiBaseSettings):
             "dbname": self.database,
             "user": self.user,
             "password": self.password.get_secret_value() if self.password else None,
+            "db_url": self.db_url,
         }
 
 
+
+
 class RedisBaseSettings(SibiBaseSettings):
     """Base settings for Redis connection."""
 
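conf_name is declared as a ClassVar, so pydantic excludes it from model fields: it labels the settings class without ever being read from the environment. A hypothetical downstream subclass, assuming the renamed module path sibi_flux.config.settings from the file list; AppDbSettings and the APP_DB_ prefix are invented for illustration:

from typing import ClassVar
from pydantic_settings import SettingsConfigDict
from sibi_flux.config.settings import DatabaseSettings

class AppDbSettings(DatabaseSettings):
    # ClassVar: a label for this settings class, not an environment-backed field.
    conf_name: ClassVar[str] = "app_db"

    # APP_DB_HOST, APP_DB_PORT, etc. override the defaults now inherited
    # from DatabaseSettings (host="localhost", port=5432, db_url=...).
    model_config = SettingsConfigDict(
        env_prefix="APP_DB_", env_file=".env", env_file_encoding="utf-8", extra="ignore"
    )

settings = AppDbSettings()  # resolved from the environment / .env at instantiation
print(settings.host, settings.port, settings.db_url)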
|
@@ -23,6 +23,7 @@ import shutil
|
|
|
23
23
|
|
|
24
24
|
try:
|
|
25
25
|
from dask.distributed import Client, LocalCluster, get_client
|
|
26
|
+
|
|
26
27
|
HAS_DISTRIBUTED = True
|
|
27
28
|
except ImportError:
|
|
28
29
|
Client = object
|
|
@@ -231,8 +232,10 @@ class DaskClientMixin:
|
|
|
231
232
|
def _init_dask_client(self, **kwargs) -> None:
|
|
232
233
|
self._init_params = kwargs
|
|
233
234
|
if not HAS_DISTRIBUTED:
|
|
234
|
-
|
|
235
|
-
|
|
235
|
+
self.logger.info(
|
|
236
|
+
"Dask Distributed not installed. Skipping cluster initialization."
|
|
237
|
+
)
|
|
238
|
+
return
|
|
236
239
|
|
|
237
240
|
if kwargs.get("dask_client"):
|
|
238
241
|
self.dask_client = kwargs["dask_client"]
|
sibi_flux/dask_cluster/core.py
CHANGED
@@ -14,15 +14,18 @@ from typing import Any, Callable, Dict, List, Optional, TypeVar
 import dask
 import dask.dataframe as dd
 import pandas as pd
+
 try:
     from dask.distributed import Client, Future
     from dask.distributed import wait as dask_wait
 except ImportError:
     Client = object
     Future = object
+
     def dask_wait(*args, **kwargs):
         pass
 
+
 # Project-specific imports
 from .client_manager import get_persistent_client
 from .exceptions import RECOVERABLE_COMMS
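The whitespace-only additions above sit around a guard pattern that this module and client_manager.py both rely on: dask.distributed is treated as an optional dependency, with stub fallbacks that keep the module importable without it. A self-contained sketch of the same pattern with generic names (not sibi-flux API):

# Generic sketch of the optional-dependency guard shown above.
try:
    from dask.distributed import Client, Future
    from dask.distributed import wait as dask_wait
    HAS_DISTRIBUTED = True
except ImportError:
    # Stubs keep annotations and call sites valid when the extra is absent.
    Client = object
    Future = object
    HAS_DISTRIBUTED = False

    def dask_wait(*args, **kwargs):
        # No-op: nothing to wait on without a distributed scheduler.
        pass

def wait_if_distributed(futures):
    # Call sites branch on the flag rather than retrying the import.
    if HAS_DISTRIBUTED:
        dask_wait(futures)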
sibi_flux/datacube/_data_cube.py
CHANGED
@@ -175,9 +175,10 @@ class Datacube(DfHelper):
         # but we log it for debugging.
         self.logger.debug(f"Schema inference skipped: {e}")
 
-    def …
+    def validate_data(self, df: DataFrameType) -> DataFrameType:
         """
         Runs DfValidator if a schema is configured.
+        Overrides BaseDatacube hook.
         """
         schema = self.config.get("validation_schema")
         if not schema:
@@ -203,6 +204,14 @@ class Datacube(DfHelper):
 
         return validator.get_df()
 
+    async def avalidate_data(self, df: DataFrameType) -> DataFrameType:
+        """
+        Asynchronous validation hook.
+        Offloads synchronous validation (CPU bound) to a thread.
+        """
+        import asyncio
+        return await asyncio.to_thread(self.validate_data, df)
+
     def get_ddl(self, table_name: Optional[str] = None) -> str:
         """
         Generates ClickHouse DDL for the current cube.
@@ -230,7 +239,7 @@ class Datacube(DfHelper):
             # 3. Apply Transform Hook
             df = self.fix_data(df, **kwargs)
             # 4. Validate
-            df = self.…
+            df = self.validate_data(df)
         else:
             self.logger.debug(f"No data loaded by {self.__class__.__name__}")
 
@@ -254,7 +263,7 @@ class Datacube(DfHelper):
             # 3. Apply Async Transform Hook
             df = await self.afix_data(df, **kwargs)
             # 4. Validate (CPU bound)
-            df = await …
+            df = await self.avalidate_data(df)
         else:
             self.logger.debug(f"No data loaded by {self.__class__.__name__}")
 
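avalidate_data bridges the async load path to the synchronous validator: asyncio.to_thread runs the blocking call in a worker thread so the event loop stays responsive. The sync/async hook pairing in isolation, with generic names rather than the Datacube API:

import asyncio
import time

class HookedLoader:
    # Generic sketch of the sync/async validation hook pair used above.
    def validate_data(self, df):
        time.sleep(0.1)  # stand-in for CPU-bound validation
        return df

    async def avalidate_data(self, df):
        # Offload the blocking validator; the event loop keeps servicing
        # other coroutines while it runs in the default thread pool.
        return await asyncio.to_thread(self.validate_data, df)

async def main():
    validated = await HookedLoader().avalidate_data({"rows": 3})
    print(validated)

asyncio.run(main())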