sibi-flux 2025.12.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sibi_dst/__init__.py +44 -0
- sibi_flux/__init__.py +49 -0
- sibi_flux/artifacts/__init__.py +7 -0
- sibi_flux/artifacts/base.py +166 -0
- sibi_flux/artifacts/parquet.py +360 -0
- sibi_flux/artifacts/parquet_engine/__init__.py +5 -0
- sibi_flux/artifacts/parquet_engine/executor.py +204 -0
- sibi_flux/artifacts/parquet_engine/manifest.py +101 -0
- sibi_flux/artifacts/parquet_engine/planner.py +544 -0
- sibi_flux/conf/settings.py +131 -0
- sibi_flux/core/__init__.py +5 -0
- sibi_flux/core/managed_resource/__init__.py +3 -0
- sibi_flux/core/managed_resource/_managed_resource.py +733 -0
- sibi_flux/core/type_maps/__init__.py +100 -0
- sibi_flux/dask_cluster/__init__.py +47 -0
- sibi_flux/dask_cluster/async_core.py +27 -0
- sibi_flux/dask_cluster/client_manager.py +549 -0
- sibi_flux/dask_cluster/core.py +322 -0
- sibi_flux/dask_cluster/exceptions.py +34 -0
- sibi_flux/dask_cluster/utils.py +49 -0
- sibi_flux/datacube/__init__.py +3 -0
- sibi_flux/datacube/_data_cube.py +332 -0
- sibi_flux/datacube/config_engine.py +152 -0
- sibi_flux/datacube/field_factory.py +48 -0
- sibi_flux/datacube/field_registry.py +122 -0
- sibi_flux/datacube/generator.py +677 -0
- sibi_flux/datacube/orchestrator.py +171 -0
- sibi_flux/dataset/__init__.py +3 -0
- sibi_flux/dataset/_dataset.py +162 -0
- sibi_flux/df_enricher/__init__.py +56 -0
- sibi_flux/df_enricher/async_enricher.py +201 -0
- sibi_flux/df_enricher/merger.py +253 -0
- sibi_flux/df_enricher/specs.py +45 -0
- sibi_flux/df_enricher/types.py +12 -0
- sibi_flux/df_helper/__init__.py +5 -0
- sibi_flux/df_helper/_df_helper.py +450 -0
- sibi_flux/df_helper/backends/__init__.py +34 -0
- sibi_flux/df_helper/backends/_params.py +173 -0
- sibi_flux/df_helper/backends/_strategies.py +295 -0
- sibi_flux/df_helper/backends/http/__init__.py +5 -0
- sibi_flux/df_helper/backends/http/_http_config.py +122 -0
- sibi_flux/df_helper/backends/parquet/__init__.py +7 -0
- sibi_flux/df_helper/backends/parquet/_parquet_options.py +268 -0
- sibi_flux/df_helper/backends/sqlalchemy/__init__.py +9 -0
- sibi_flux/df_helper/backends/sqlalchemy/_db_connection.py +256 -0
- sibi_flux/df_helper/backends/sqlalchemy/_db_gatekeeper.py +15 -0
- sibi_flux/df_helper/backends/sqlalchemy/_io_dask.py +386 -0
- sibi_flux/df_helper/backends/sqlalchemy/_load_from_db.py +134 -0
- sibi_flux/df_helper/backends/sqlalchemy/_model_registry.py +239 -0
- sibi_flux/df_helper/backends/sqlalchemy/_sql_model_builder.py +42 -0
- sibi_flux/df_helper/backends/utils.py +32 -0
- sibi_flux/df_helper/core/__init__.py +15 -0
- sibi_flux/df_helper/core/_defaults.py +104 -0
- sibi_flux/df_helper/core/_filter_handler.py +617 -0
- sibi_flux/df_helper/core/_params_config.py +185 -0
- sibi_flux/df_helper/core/_query_config.py +17 -0
- sibi_flux/df_validator/__init__.py +3 -0
- sibi_flux/df_validator/_df_validator.py +222 -0
- sibi_flux/logger/__init__.py +1 -0
- sibi_flux/logger/_logger.py +480 -0
- sibi_flux/mcp/__init__.py +26 -0
- sibi_flux/mcp/client.py +150 -0
- sibi_flux/mcp/router.py +126 -0
- sibi_flux/orchestration/__init__.py +9 -0
- sibi_flux/orchestration/_artifact_orchestrator.py +346 -0
- sibi_flux/orchestration/_pipeline_executor.py +212 -0
- sibi_flux/osmnx_helper/__init__.py +22 -0
- sibi_flux/osmnx_helper/_pbf_handler.py +384 -0
- sibi_flux/osmnx_helper/graph_loader.py +225 -0
- sibi_flux/osmnx_helper/utils.py +100 -0
- sibi_flux/pipelines/__init__.py +3 -0
- sibi_flux/pipelines/base.py +218 -0
- sibi_flux/py.typed +0 -0
- sibi_flux/readers/__init__.py +3 -0
- sibi_flux/readers/base.py +82 -0
- sibi_flux/readers/parquet.py +106 -0
- sibi_flux/utils/__init__.py +53 -0
- sibi_flux/utils/boilerplate/__init__.py +19 -0
- sibi_flux/utils/boilerplate/base_attacher.py +45 -0
- sibi_flux/utils/boilerplate/base_cube_router.py +283 -0
- sibi_flux/utils/boilerplate/base_data_cube.py +132 -0
- sibi_flux/utils/boilerplate/base_pipeline_template.py +54 -0
- sibi_flux/utils/boilerplate/hybrid_data_loader.py +193 -0
- sibi_flux/utils/clickhouse_writer/__init__.py +6 -0
- sibi_flux/utils/clickhouse_writer/_clickhouse_writer.py +225 -0
- sibi_flux/utils/common.py +7 -0
- sibi_flux/utils/credentials/__init__.py +3 -0
- sibi_flux/utils/credentials/_config_manager.py +155 -0
- sibi_flux/utils/dask_utils.py +14 -0
- sibi_flux/utils/data_utils/__init__.py +3 -0
- sibi_flux/utils/data_utils/_data_utils.py +389 -0
- sibi_flux/utils/dataframe_utils.py +52 -0
- sibi_flux/utils/date_utils/__init__.py +10 -0
- sibi_flux/utils/date_utils/_business_days.py +220 -0
- sibi_flux/utils/date_utils/_date_utils.py +311 -0
- sibi_flux/utils/date_utils/_file_age_checker.py +319 -0
- sibi_flux/utils/file_utils.py +48 -0
- sibi_flux/utils/filepath_generator/__init__.py +5 -0
- sibi_flux/utils/filepath_generator/_filepath_generator.py +185 -0
- sibi_flux/utils/parquet_saver/__init__.py +6 -0
- sibi_flux/utils/parquet_saver/_parquet_saver.py +436 -0
- sibi_flux/utils/parquet_saver/_write_gatekeeper.py +33 -0
- sibi_flux/utils/retry.py +46 -0
- sibi_flux/utils/storage/__init__.py +7 -0
- sibi_flux/utils/storage/_fs_registry.py +112 -0
- sibi_flux/utils/storage/_storage_manager.py +257 -0
- sibi_flux/utils/storage/factory.py +33 -0
- sibi_flux-2025.12.0.dist-info/METADATA +283 -0
- sibi_flux-2025.12.0.dist-info/RECORD +110 -0
- sibi_flux-2025.12.0.dist-info/WHEEL +4 -0
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
from typing import Dict
|
|
2
|
+
|
|
3
|
+
# Map standard SQL/DDL types to our desired Dask/Pandas working schema.
|
|
4
|
+
# We prioritize PyArrow-backed types for performance and better null handling.
|
|
5
|
+
SQLALCHEMY_TO_DASK_DTYPE: Dict[str, str] = {
    # Lookup table: upper-case SQL/DDL type name -> dtype string used for the
    # Dask/Pandas working schema.
    #
    # Integers: PyArrow-backed integer dtypes (chosen for nullability).
    # SMALLINT is widened to a 32-bit integer; INTEGER and BIGINT both map
    # to 64 bits.
    # NOTE(review): the keys of DASK_TO_CLICKHOUSE_DTYPE suggest these dtypes
    # report themselves in lower case ("int64[pyarrow]") — confirm the
    # capitalised spelling is accepted wherever these strings are consumed.
    "INTEGER": "Int64[pyarrow]",
    "SMALLINT": "Int32[pyarrow]",
    "BIGINT": "Int64[pyarrow]",
    # Floats: FLOAT and DOUBLE are both read as 64-bit PyArrow-backed floats.
    "FLOAT": "Float64[pyarrow]",
    "DOUBLE": "Float64[pyarrow]",
    # Exact numbers: kept as strings to avoid floating point error.
    "NUMERIC": "string[pyarrow]",
    "DECIMAL": "string[pyarrow]",
    # Boolean: PyArrow-backed boolean (chosen for nullability).
    "BOOLEAN": "boolean[pyarrow]",
    # Strings: PyArrow-backed string (efficient storage/zero-copy potential).
    # UUID values are carried as text as well.
    "VARCHAR": "string[pyarrow]",
    "CHAR": "string[pyarrow]",
    "TEXT": "string[pyarrow]",
    "UUID": "string[pyarrow]",
    # Dates/Times: DATE, DATETIME and TIMESTAMP all share one timezone-aware
    # UTC datetime dtype for consistency.
    "DATE": "datetime64[ns, UTC]",
    "DATETIME": "datetime64[ns, UTC]",
    "TIMESTAMP": "datetime64[ns, UTC]",
    # TIME (time of day without a date) is kept as text.
    "TIME": "string[pyarrow]",
}
|
|
29
|
+
|
|
30
|
+
DASK_TO_CLICKHOUSE_DTYPE: Dict[str, str] = {
    # Lookup table: Dask/Pandas dtype string -> ClickHouse column type.
    #
    # --- 1. PyArrow-Backed Integer Types (Nullable) ---
    # Both the capitalised spelling (capital 'I') and the lower-case PyArrow
    # alias (lowercase 'i') are listed so either string resolves.
    "Int64[pyarrow]": "Nullable(Int64)",
    "Int32[pyarrow]": "Nullable(Int32)",
    "int64[pyarrow]": "Nullable(Int64)",  # Alias found in logs
    "int32[pyarrow]": "Nullable(Int32)",  # Alias found in logs
    # --- 2. PyArrow-Backed Float Types (Nullable) ---
    # Capitalised spelling (capital 'F') plus the common PyArrow aliases;
    # "double" and "float64" both mean a 64-bit float here.
    "Float64[pyarrow]": "Nullable(Float64)",
    "Float32[pyarrow]": "Nullable(Float32)",
    "double[pyarrow]": "Nullable(Float64)",  # Alias found in logs
    "float64[pyarrow]": "Nullable(Float64)",
    # --- 3. PyArrow-Backed Boolean and String Types ---
    "boolean[pyarrow]": "Nullable(Bool)",
    # Strings map to a plain (non-Nullable) String column.
    # NOTE(review): a plain ClickHouse String column is not Nullable (that
    # requires Nullable(String)) — confirm how missing strings are meant to
    # be stored before relying on NULLs surviving the write.
    "string[pyarrow]": "String",
    # --- 4. Timestamp and Datetime Types ---
    # Covers the Pandas datetime dtypes and the raw PyArrow timestamp aliases;
    # the DateTime64 precision follows the unit (ns -> 9, us -> 6, ms -> 3).
    "datetime64[ns, UTC]": "DateTime64(9, 'UTC')",
    "datetime64[ns]": "DateTime64(9)",
    # NOTE(review): the PyArrow keys below carry no timezone, yet they map to
    # UTC columns, unlike "datetime64[ns]" above — confirm this is intended.
    "timestamp[ns][pyarrow]": "DateTime64(9, 'UTC')",  # Alias found in logs
    "timestamp[us][pyarrow]": "DateTime64(6, 'UTC')",  # Microsecond precision
    "timestamp[ms][pyarrow]": "DateTime64(3, 'UTC')",  # Millisecond precision
    # --- 5. Standard Pandas/NumPy Dtypes (Fallback for non-PyArrow) ---
    # These NumPy dtypes are mapped to non-Nullable ClickHouse types.
    # Note: If these appear, it might indicate an issue earlier in the pipeline.
    "int64": "Int64",
    "int32": "Int32",
    "float64": "Float64",
    "float32": "Float32",
    # Generic types that must default to String
    "object": "String",
    "category": "String",  # Base type for LowCardinality wrapper in DDL logic
    "bool": "Bool",
}
|
|
65
|
+
|
|
66
|
+
# Map intended data type to the Dask/Pandas type needed for safe, fast *ingestion*
|
|
67
|
+
CSV_INGESTION_DTYPE: Dict[str, str] = {
    # Lookup table: intended (logical) column kind -> dtype string to request
    # when reading CSV data.
    #
    # Integers: a nullable integer dtype is required so that a column
    # containing NULLs does not become float64.
    "INT_WITH_NULLS": "Int64[pyarrow]",
    "SMALL_INT_WITH_NULLS": "Int32[pyarrow]",
    # Floats: read as 64-bit PyArrow-backed floats.
    "FLOAT": "float64[pyarrow]",
    # High-Precision Decimals/Strings: kept as PyArrow-backed strings.
    "DECIMAL_AS_TEXT": "string[pyarrow]",
    "STRING": "string[pyarrow]",
    "TEXT": "string[pyarrow]",
    # Dates: read as object/string first, clean later in the pipeline
    "DATE_OR_DATETIME": "object",
    "TIMESTAMP": "object",
    # Boolean: nullable PyArrow-backed boolean.
    "BOOLEAN_WITH_NULLS": "boolean[pyarrow]",
}
|
|
84
|
+
|
|
85
|
+
DASK_TO_SQLALCHEMY_DTYPE: Dict[str, str] = {
    # Lookup table: Dask/Pandas dtype string -> SQL type name used on export.
    # It is not an exact inverse of SQLALCHEMY_TO_DASK_DTYPE: several SQL types
    # collapse onto one dtype there, so one representative is chosen here.
    # NOTE(review): only the capitalised PyArrow spellings are listed; the
    # lower-case aliases handled in DASK_TO_CLICKHOUSE_DTYPE (for example
    # "int64[pyarrow]") have no entry — confirm callers normalise first.
    #
    # Integers: 64-bit values go to BIGINT, 32-bit values to INTEGER.
    "Int64[pyarrow]": "BIGINT",
    "Int32[pyarrow]": "INTEGER",
    # Floats: Prefer DOUBLE as it offers the highest precision (safer default)
    "Float64[pyarrow]": "DOUBLE",
    # Strings: written as TEXT, which also suits values that were NUMERIC or
    # DECIMAL at the source and are carried as strings to keep precision.
    "string[pyarrow]": "TEXT",
    "category": "VARCHAR",  # If exporting a categorical type
    # Boolean
    "boolean[pyarrow]": "BOOLEAN",
    # Dates/Times: timezone-aware and naive datetimes both become TIMESTAMP.
    "datetime64[ns, UTC]": "TIMESTAMP",
    "datetime64[ns]": "TIMESTAMP",  # Handle naive timestamp fallback
}
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Dask Resilience - A module for robust Dask operations with automatic recovery.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from .core import (
|
|
6
|
+
safe_compute,
|
|
7
|
+
safe_persist,
|
|
8
|
+
safe_gather,
|
|
9
|
+
safe_wait,
|
|
10
|
+
dask_is_empty,
|
|
11
|
+
dask_is_probably_empty,
|
|
12
|
+
dask_is_empty_truthful,
|
|
13
|
+
UniqueValuesExtractor,
|
|
14
|
+
)
|
|
15
|
+
|
|
16
|
+
from .client_manager import (
|
|
17
|
+
DaskClientMixin,
|
|
18
|
+
get_persistent_client,
|
|
19
|
+
shared_dask_session,
|
|
20
|
+
force_close_persistent_client,
|
|
21
|
+
)
|
|
22
|
+
|
|
23
|
+
from .async_core import (
|
|
24
|
+
async_compute,
|
|
25
|
+
async_persist,
|
|
26
|
+
)
|
|
27
|
+
|
|
28
|
+
# Define public API
|
|
29
|
+
# Public API of the package: every name listed here is imported above from
# .core, .client_manager or .async_core.
__all__ = [
    # Core operations (from .core)
    "safe_compute",
    "safe_persist",
    "safe_gather",
    "safe_wait",
    "dask_is_empty",
    "dask_is_probably_empty",
    "dask_is_empty_truthful",
    "UniqueValuesExtractor",
    # Client management (from .client_manager)
    "DaskClientMixin",
    "get_persistent_client",
    "shared_dask_session",
    "force_close_persistent_client",
    # Async operations (from .async_core)
    "async_compute",
    "async_persist",
]
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Async utilities for Dask operations.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from typing import Any, Optional
|
|
6
|
+
import asyncio
|
|
7
|
+
try:
|
|
8
|
+
from dask.distributed import Client
|
|
9
|
+
except ImportError:
|
|
10
|
+
Client = object
|
|
11
|
+
from .core import safe_compute, safe_persist
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
async def async_compute(obj: Any, dask_client: Optional[Client] = None) -> Any:
    """Compute a Dask object without blocking the running event loop.

    Args:
        obj: The object to compute: a single Dask collection, or a list/tuple
            of them.
        dask_client: Optional client. When its ``asynchronous`` attribute is
            truthy the work is submitted through it and awaited directly;
            otherwise the blocking ``safe_compute`` runs in a worker thread.

    Returns:
        The computed result. On the asynchronous path a sequence input yields
        the results as returned by ``dask_client.gather``.
    """
    if dask_client is not None and getattr(dask_client, "asynchronous", False):
        pending = dask_client.compute(obj)
        if isinstance(pending, (list, tuple)):
            # For sequence input compute() hands back a plain list of futures.
            # A list is not awaitable, so resolve all of them through gather().
            return await dask_client.gather(pending)
        if hasattr(pending, "__await__"):
            return await pending
        # Nothing to wait for (presumably the input held no Dask graph —
        # TODO confirm); hand the value back instead of failing on `await`.
        return pending
    # Offload sync compute (which calls .result()) to a thread
    return await asyncio.to_thread(safe_compute, obj, dask_client)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
async def async_persist(obj: Any, dask_client: Optional[Client] = None) -> Any:
    """Persist a Dask object without blocking the running event loop.

    Args:
        obj: The object to persist: a single Dask collection, or a list/tuple
            of them.
        dask_client: Optional client. When its ``asynchronous`` attribute is
            truthy the persist call goes through it and the result is awaited;
            otherwise the blocking ``safe_persist`` runs in a worker thread.

    Returns:
        The persisted object, or a list of persisted objects when the
        asynchronous path is given a sequence.
    """
    if dask_client is not None and getattr(dask_client, "asynchronous", False):
        persisted = dask_client.persist(obj)
        if isinstance(persisted, (list, tuple)):
            # For sequence input persist() hands back a plain list, which is
            # not awaitable as a whole; await each awaitable member instead.
            return [
                await item if hasattr(item, "__await__") else item
                for item in persisted
            ]
        if hasattr(persisted, "__await__"):
            return await persisted
        # Not awaitable: return it as-is rather than failing on `await`.
        return persisted
    # Offload sync persist (though usually fast, safe_persist might check active_client)
    return await asyncio.to_thread(safe_persist, obj, dask_client=dask_client)
|