sibi-flux 2025.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (110)
  1. sibi_dst/__init__.py +44 -0
  2. sibi_flux/__init__.py +49 -0
  3. sibi_flux/artifacts/__init__.py +7 -0
  4. sibi_flux/artifacts/base.py +166 -0
  5. sibi_flux/artifacts/parquet.py +360 -0
  6. sibi_flux/artifacts/parquet_engine/__init__.py +5 -0
  7. sibi_flux/artifacts/parquet_engine/executor.py +204 -0
  8. sibi_flux/artifacts/parquet_engine/manifest.py +101 -0
  9. sibi_flux/artifacts/parquet_engine/planner.py +544 -0
  10. sibi_flux/conf/settings.py +131 -0
  11. sibi_flux/core/__init__.py +5 -0
  12. sibi_flux/core/managed_resource/__init__.py +3 -0
  13. sibi_flux/core/managed_resource/_managed_resource.py +733 -0
  14. sibi_flux/core/type_maps/__init__.py +100 -0
  15. sibi_flux/dask_cluster/__init__.py +47 -0
  16. sibi_flux/dask_cluster/async_core.py +27 -0
  17. sibi_flux/dask_cluster/client_manager.py +549 -0
  18. sibi_flux/dask_cluster/core.py +322 -0
  19. sibi_flux/dask_cluster/exceptions.py +34 -0
  20. sibi_flux/dask_cluster/utils.py +49 -0
  21. sibi_flux/datacube/__init__.py +3 -0
  22. sibi_flux/datacube/_data_cube.py +332 -0
  23. sibi_flux/datacube/config_engine.py +152 -0
  24. sibi_flux/datacube/field_factory.py +48 -0
  25. sibi_flux/datacube/field_registry.py +122 -0
  26. sibi_flux/datacube/generator.py +677 -0
  27. sibi_flux/datacube/orchestrator.py +171 -0
  28. sibi_flux/dataset/__init__.py +3 -0
  29. sibi_flux/dataset/_dataset.py +162 -0
  30. sibi_flux/df_enricher/__init__.py +56 -0
  31. sibi_flux/df_enricher/async_enricher.py +201 -0
  32. sibi_flux/df_enricher/merger.py +253 -0
  33. sibi_flux/df_enricher/specs.py +45 -0
  34. sibi_flux/df_enricher/types.py +12 -0
  35. sibi_flux/df_helper/__init__.py +5 -0
  36. sibi_flux/df_helper/_df_helper.py +450 -0
  37. sibi_flux/df_helper/backends/__init__.py +34 -0
  38. sibi_flux/df_helper/backends/_params.py +173 -0
  39. sibi_flux/df_helper/backends/_strategies.py +295 -0
  40. sibi_flux/df_helper/backends/http/__init__.py +5 -0
  41. sibi_flux/df_helper/backends/http/_http_config.py +122 -0
  42. sibi_flux/df_helper/backends/parquet/__init__.py +7 -0
  43. sibi_flux/df_helper/backends/parquet/_parquet_options.py +268 -0
  44. sibi_flux/df_helper/backends/sqlalchemy/__init__.py +9 -0
  45. sibi_flux/df_helper/backends/sqlalchemy/_db_connection.py +256 -0
  46. sibi_flux/df_helper/backends/sqlalchemy/_db_gatekeeper.py +15 -0
  47. sibi_flux/df_helper/backends/sqlalchemy/_io_dask.py +386 -0
  48. sibi_flux/df_helper/backends/sqlalchemy/_load_from_db.py +134 -0
  49. sibi_flux/df_helper/backends/sqlalchemy/_model_registry.py +239 -0
  50. sibi_flux/df_helper/backends/sqlalchemy/_sql_model_builder.py +42 -0
  51. sibi_flux/df_helper/backends/utils.py +32 -0
  52. sibi_flux/df_helper/core/__init__.py +15 -0
  53. sibi_flux/df_helper/core/_defaults.py +104 -0
  54. sibi_flux/df_helper/core/_filter_handler.py +617 -0
  55. sibi_flux/df_helper/core/_params_config.py +185 -0
  56. sibi_flux/df_helper/core/_query_config.py +17 -0
  57. sibi_flux/df_validator/__init__.py +3 -0
  58. sibi_flux/df_validator/_df_validator.py +222 -0
  59. sibi_flux/logger/__init__.py +1 -0
  60. sibi_flux/logger/_logger.py +480 -0
  61. sibi_flux/mcp/__init__.py +26 -0
  62. sibi_flux/mcp/client.py +150 -0
  63. sibi_flux/mcp/router.py +126 -0
  64. sibi_flux/orchestration/__init__.py +9 -0
  65. sibi_flux/orchestration/_artifact_orchestrator.py +346 -0
  66. sibi_flux/orchestration/_pipeline_executor.py +212 -0
  67. sibi_flux/osmnx_helper/__init__.py +22 -0
  68. sibi_flux/osmnx_helper/_pbf_handler.py +384 -0
  69. sibi_flux/osmnx_helper/graph_loader.py +225 -0
  70. sibi_flux/osmnx_helper/utils.py +100 -0
  71. sibi_flux/pipelines/__init__.py +3 -0
  72. sibi_flux/pipelines/base.py +218 -0
  73. sibi_flux/py.typed +0 -0
  74. sibi_flux/readers/__init__.py +3 -0
  75. sibi_flux/readers/base.py +82 -0
  76. sibi_flux/readers/parquet.py +106 -0
  77. sibi_flux/utils/__init__.py +53 -0
  78. sibi_flux/utils/boilerplate/__init__.py +19 -0
  79. sibi_flux/utils/boilerplate/base_attacher.py +45 -0
  80. sibi_flux/utils/boilerplate/base_cube_router.py +283 -0
  81. sibi_flux/utils/boilerplate/base_data_cube.py +132 -0
  82. sibi_flux/utils/boilerplate/base_pipeline_template.py +54 -0
  83. sibi_flux/utils/boilerplate/hybrid_data_loader.py +193 -0
  84. sibi_flux/utils/clickhouse_writer/__init__.py +6 -0
  85. sibi_flux/utils/clickhouse_writer/_clickhouse_writer.py +225 -0
  86. sibi_flux/utils/common.py +7 -0
  87. sibi_flux/utils/credentials/__init__.py +3 -0
  88. sibi_flux/utils/credentials/_config_manager.py +155 -0
  89. sibi_flux/utils/dask_utils.py +14 -0
  90. sibi_flux/utils/data_utils/__init__.py +3 -0
  91. sibi_flux/utils/data_utils/_data_utils.py +389 -0
  92. sibi_flux/utils/dataframe_utils.py +52 -0
  93. sibi_flux/utils/date_utils/__init__.py +10 -0
  94. sibi_flux/utils/date_utils/_business_days.py +220 -0
  95. sibi_flux/utils/date_utils/_date_utils.py +311 -0
  96. sibi_flux/utils/date_utils/_file_age_checker.py +319 -0
  97. sibi_flux/utils/file_utils.py +48 -0
  98. sibi_flux/utils/filepath_generator/__init__.py +5 -0
  99. sibi_flux/utils/filepath_generator/_filepath_generator.py +185 -0
  100. sibi_flux/utils/parquet_saver/__init__.py +6 -0
  101. sibi_flux/utils/parquet_saver/_parquet_saver.py +436 -0
  102. sibi_flux/utils/parquet_saver/_write_gatekeeper.py +33 -0
  103. sibi_flux/utils/retry.py +46 -0
  104. sibi_flux/utils/storage/__init__.py +7 -0
  105. sibi_flux/utils/storage/_fs_registry.py +112 -0
  106. sibi_flux/utils/storage/_storage_manager.py +257 -0
  107. sibi_flux/utils/storage/factory.py +33 -0
  108. sibi_flux-2025.12.0.dist-info/METADATA +283 -0
  109. sibi_flux-2025.12.0.dist-info/RECORD +110 -0
  110. sibi_flux-2025.12.0.dist-info/WHEEL +4 -0
@@ -0,0 +1,100 @@
1
+ from typing import Dict
2
+
3
+ # Map standard SQL/DDL types to our desired Dask/Pandas working schema.
4
+ # We prioritize PyArrow-backed types for performance and better null handling.
5
# Lookup table: upper-case SQL/DDL type name -> dtype string used for the
# Dask/Pandas working schema. Every non-temporal target is a PyArrow-backed
# dtype, so missing values survive without falling back to object/float.
SQLALCHEMY_TO_DASK_DTYPE: Dict[str, str] = {
    # -- Integer family: nullable Arrow integers. SMALLINT is stored in a
    #    32-bit column; INTEGER and BIGINT both use 64 bits.
    "INTEGER": "Int64[pyarrow]",
    "SMALLINT": "Int32[pyarrow]",
    "BIGINT": "Int64[pyarrow]",
    # -- Floating point: both SQL names share one 64-bit nullable dtype.
    "FLOAT": "Float64[pyarrow]",
    "DOUBLE": "Float64[pyarrow]",
    # -- Exact numerics: carried as text so no binary floating-point rounding
    #    is introduced while the value moves through the pipeline.
    "NUMERIC": "string[pyarrow]",
    "DECIMAL": "string[pyarrow]",
    # -- Boolean: nullable Arrow boolean.
    "BOOLEAN": "boolean[pyarrow]",
    # -- Character data (UUIDs included): Arrow-backed strings.
    "VARCHAR": "string[pyarrow]",
    "CHAR": "string[pyarrow]",
    "TEXT": "string[pyarrow]",
    "UUID": "string[pyarrow]",
    # -- Temporal: dates and timestamps are normalised to a tz-aware UTC
    #    nanosecond dtype; a bare TIME has no date part and is kept as text.
    "DATE": "datetime64[ns, UTC]",
    "DATETIME": "datetime64[ns, UTC]",
    "TIMESTAMP": "datetime64[ns, UTC]",
    "TIME": "string[pyarrow]",
}
29
+
30
# Lookup table: Dask/Pandas dtype string -> ClickHouse column type.
#
# Keys cover both the spelling used when a schema is declared in this module
# (e.g. "Int64[pyarrow]") and the spelling pandas prints for an Arrow-backed
# column (e.g. "int64[pyarrow]", "double[pyarrow]"), because either form can
# reach the lookup depending on where the dtype string was obtained.
DASK_TO_CLICKHOUSE_DTYPE: Dict[str, str] = {
    # --- 1. PyArrow-backed integer types (nullable) ---
    "Int64[pyarrow]": "Nullable(Int64)",
    "Int32[pyarrow]": "Nullable(Int32)",
    "int64[pyarrow]": "Nullable(Int64)",  # Alias found in logs
    "int32[pyarrow]": "Nullable(Int32)",  # Alias found in logs
    # --- 2. PyArrow-backed float types (nullable) ---
    "Float64[pyarrow]": "Nullable(Float64)",
    "Float32[pyarrow]": "Nullable(Float32)",
    "double[pyarrow]": "Nullable(Float64)",  # Alias found in logs
    "float64[pyarrow]": "Nullable(Float64)",
    # PyArrow names its 32-bit float type "float", so that is what an
    # Arrow-backed float32 column reports as its dtype string.
    "float[pyarrow]": "Nullable(Float32)",
    # --- 3. PyArrow-backed boolean and string types ---
    "boolean[pyarrow]": "Nullable(Bool)",
    # PyArrow names its boolean type "bool"; this is the printed form.
    "bool[pyarrow]": "Nullable(Bool)",
    # A plain ClickHouse String is NOT nullable; only Nullable(String) can
    # store NULL. The non-nullable mapping is kept for compatibility.
    # NOTE(review): confirm how NULL strings are handled before insertion.
    "string[pyarrow]": "String",
    # --- 4. Timestamp / datetime types ---
    # Pandas canonical datetime dtypes plus raw PyArrow timestamp aliases.
    "datetime64[ns, UTC]": "DateTime64(9, 'UTC')",
    "datetime64[ns]": "DateTime64(9)",
    "timestamp[ns][pyarrow]": "DateTime64(9, 'UTC')",  # Alias found in logs
    "timestamp[us][pyarrow]": "DateTime64(6, 'UTC')",  # Microsecond precision
    "timestamp[ms][pyarrow]": "DateTime64(3, 'UTC')",  # Millisecond precision
    # Timezone-aware Arrow timestamps carry the zone inside the alias.
    "timestamp[ns, tz=UTC][pyarrow]": "DateTime64(9, 'UTC')",
    "timestamp[us, tz=UTC][pyarrow]": "DateTime64(6, 'UTC')",
    "timestamp[ms, tz=UTC][pyarrow]": "DateTime64(3, 'UTC')",
    # --- 5. Standard Pandas/NumPy dtypes (fallback for non-PyArrow) ---
    # NumPy dtypes have no missing-value slot, so they map to non-Nullable
    # ClickHouse types. Seeing these may indicate an upstream schema issue.
    "int64": "Int64",
    "int32": "Int32",
    "float64": "Float64",
    "float32": "Float32",
    # Generic types that must default to String
    "object": "String",
    "category": "String",  # Base type for LowCardinality wrapper in DDL logic
    "bool": "Bool",
}
65
+
66
+ # Map intended data type to the Dask/Pandas type needed for safe, fast *ingestion*
67
# Lookup table: logical column kind -> dtype string to request while a CSV
# is being read, i.e. the type used for ingestion rather than the final one.
CSV_INGESTION_DTYPE: Dict[str, str] = {
    # -- Integers that may contain blanks: a nullable Arrow integer keeps the
    #    column integral instead of letting it degrade to float64.
    "INT_WITH_NULLS": "Int64[pyarrow]",
    "SMALL_INT_WITH_NULLS": "Int32[pyarrow]",
    # -- Floats: read straight into an Arrow-backed 64-bit float.
    "FLOAT": "float64[pyarrow]",
    # -- Text, including decimals deliberately kept as text: Arrow strings.
    "DECIMAL_AS_TEXT": "string[pyarrow]",
    "STRING": "string[pyarrow]",
    "TEXT": "string[pyarrow]",
    # -- Temporal values: loaded untyped ("object") here and parsed/cleaned
    #    by a later pipeline stage.
    "DATE_OR_DATETIME": "object",
    "TIMESTAMP": "object",
    # -- Booleans that may contain blanks.
    "BOOLEAN_WITH_NULLS": "boolean[pyarrow]",
}
84
+
85
# Lookup table: Dask/Pandas dtype string -> SQL type name to use on export.
# NOTE(review): only the capitalised Arrow spellings are listed; lowercase
# aliases such as "int64[pyarrow]" / "float64[pyarrow]" have no entry here,
# so confirm which spelling callers pass in.
DASK_TO_SQLALCHEMY_DTYPE: Dict[str, str] = {
    # -- Integers: the 64-bit dtype goes to BIGINT, which is wide enough for
    #    values that originated as either INTEGER or BIGINT.
    "Int64[pyarrow]": "BIGINT",
    "Int32[pyarrow]": "INTEGER",
    # -- Floats: DOUBLE keeps the full 64-bit precision.
    "Float64[pyarrow]": "DOUBLE",
    # -- Strings: written as TEXT. Exact numerics that were carried as
    #    strings therefore also end up in a TEXT column.
    "string[pyarrow]": "TEXT",
    "category": "VARCHAR",  # categorical columns are exported as VARCHAR
    # -- Boolean
    "boolean[pyarrow]": "BOOLEAN",
    # -- Temporal: tz-aware UTC and naive timestamps share one SQL type.
    "datetime64[ns, UTC]": "TIMESTAMP",
    "datetime64[ns]": "TIMESTAMP",
}
@@ -0,0 +1,47 @@
1
+ """
2
+ Dask Resilience - A module for robust Dask operations with automatic recovery.
3
+ """
4
+
5
+ from .core import (
6
+ safe_compute,
7
+ safe_persist,
8
+ safe_gather,
9
+ safe_wait,
10
+ dask_is_empty,
11
+ dask_is_probably_empty,
12
+ dask_is_empty_truthful,
13
+ UniqueValuesExtractor,
14
+ )
15
+
16
+ from .client_manager import (
17
+ DaskClientMixin,
18
+ get_persistent_client,
19
+ shared_dask_session,
20
+ force_close_persistent_client,
21
+ )
22
+
23
+ from .async_core import (
24
+ async_compute,
25
+ async_persist,
26
+ )
27
+
28
+ # Define public API
29
+ __all__ = [
30
+ # Core operations
31
+ "safe_compute",
32
+ "safe_persist",
33
+ "safe_gather",
34
+ "safe_wait",
35
+ "dask_is_empty",
36
+ "dask_is_probably_empty",
37
+ "dask_is_empty_truthful",
38
+ "UniqueValuesExtractor",
39
+ # Client management
40
+ "DaskClientMixin",
41
+ "get_persistent_client",
42
+ "shared_dask_session",
43
+ "force_close_persistent_client",
44
+ # Async operations
45
+ "async_compute",
46
+ "async_persist",
47
+ ]
@@ -0,0 +1,27 @@
1
+ """
2
+ Async utilities for Dask operations.
3
+ """
4
+
5
+ from typing import Any, Optional
6
+ import asyncio
7
+ try:
8
+ from dask.distributed import Client
9
+ except ImportError:
10
+ Client = object
11
+ from .core import safe_compute, safe_persist
12
+
13
+
14
async def async_compute(obj: Any, dask_client: Optional[Client] = None) -> Any:
    """Compute a Dask object without blocking the running event loop.

    Args:
        obj: The Dask object (or list/tuple of Dask objects) to compute.
        dask_client: Optional client. When it reports ``asynchronous=True``
            the computation is awaited on that client; otherwise the
            synchronous ``safe_compute`` helper is run in a worker thread.

    Returns:
        The computed result(s).
    """
    # Local import keeps this function self-contained; ``inspect`` is stdlib.
    import inspect

    if dask_client and getattr(dask_client, "asynchronous", False):
        pending = dask_client.compute(obj)
        if inspect.isawaitable(pending):
            # Single collection: the client hands back one awaitable future.
            return await pending
        # A list/tuple of collections yields a plain container of futures,
        # which cannot be awaited directly; let the client resolve them all.
        return await dask_client.gather(pending)
    # Offload sync compute (which calls .result()) to a thread
    return await asyncio.to_thread(safe_compute, obj, dask_client)
20
+
21
+
22
async def async_persist(obj: Any, dask_client: Optional[Client] = None) -> Any:
    """Persist a Dask object without blocking the running event loop.

    Args:
        obj: The Dask object (or list/tuple of Dask objects) to persist.
        dask_client: Optional client. When it reports ``asynchronous=True``
            persistence is requested on that client; otherwise the
            synchronous ``safe_persist`` helper is run in a worker thread.

    Returns:
        The persisted object(s) as returned by the client or by
        ``safe_persist``.
    """
    # Local import keeps this function self-contained; ``inspect`` is stdlib.
    import inspect

    if dask_client and getattr(dask_client, "asynchronous", False):
        persisted = dask_client.persist(obj)
        if inspect.isawaitable(persisted):
            return await persisted
        # Sequence input comes back as a plain container, which cannot be
        # awaited; persisting is already scheduled, so return it as-is.
        return persisted
    # Offload sync persist (though usually fast, safe_persist might check active_client)
    return await asyncio.to_thread(safe_persist, obj, dask_client=dask_client)