sibi-flux 2025.12.0 (sibi_flux-2025.12.0-py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (110)
  1. sibi_dst/__init__.py +44 -0
  2. sibi_flux/__init__.py +49 -0
  3. sibi_flux/artifacts/__init__.py +7 -0
  4. sibi_flux/artifacts/base.py +166 -0
  5. sibi_flux/artifacts/parquet.py +360 -0
  6. sibi_flux/artifacts/parquet_engine/__init__.py +5 -0
  7. sibi_flux/artifacts/parquet_engine/executor.py +204 -0
  8. sibi_flux/artifacts/parquet_engine/manifest.py +101 -0
  9. sibi_flux/artifacts/parquet_engine/planner.py +544 -0
  10. sibi_flux/conf/settings.py +131 -0
  11. sibi_flux/core/__init__.py +5 -0
  12. sibi_flux/core/managed_resource/__init__.py +3 -0
  13. sibi_flux/core/managed_resource/_managed_resource.py +733 -0
  14. sibi_flux/core/type_maps/__init__.py +100 -0
  15. sibi_flux/dask_cluster/__init__.py +47 -0
  16. sibi_flux/dask_cluster/async_core.py +27 -0
  17. sibi_flux/dask_cluster/client_manager.py +549 -0
  18. sibi_flux/dask_cluster/core.py +322 -0
  19. sibi_flux/dask_cluster/exceptions.py +34 -0
  20. sibi_flux/dask_cluster/utils.py +49 -0
  21. sibi_flux/datacube/__init__.py +3 -0
  22. sibi_flux/datacube/_data_cube.py +332 -0
  23. sibi_flux/datacube/config_engine.py +152 -0
  24. sibi_flux/datacube/field_factory.py +48 -0
  25. sibi_flux/datacube/field_registry.py +122 -0
  26. sibi_flux/datacube/generator.py +677 -0
  27. sibi_flux/datacube/orchestrator.py +171 -0
  28. sibi_flux/dataset/__init__.py +3 -0
  29. sibi_flux/dataset/_dataset.py +162 -0
  30. sibi_flux/df_enricher/__init__.py +56 -0
  31. sibi_flux/df_enricher/async_enricher.py +201 -0
  32. sibi_flux/df_enricher/merger.py +253 -0
  33. sibi_flux/df_enricher/specs.py +45 -0
  34. sibi_flux/df_enricher/types.py +12 -0
  35. sibi_flux/df_helper/__init__.py +5 -0
  36. sibi_flux/df_helper/_df_helper.py +450 -0
  37. sibi_flux/df_helper/backends/__init__.py +34 -0
  38. sibi_flux/df_helper/backends/_params.py +173 -0
  39. sibi_flux/df_helper/backends/_strategies.py +295 -0
  40. sibi_flux/df_helper/backends/http/__init__.py +5 -0
  41. sibi_flux/df_helper/backends/http/_http_config.py +122 -0
  42. sibi_flux/df_helper/backends/parquet/__init__.py +7 -0
  43. sibi_flux/df_helper/backends/parquet/_parquet_options.py +268 -0
  44. sibi_flux/df_helper/backends/sqlalchemy/__init__.py +9 -0
  45. sibi_flux/df_helper/backends/sqlalchemy/_db_connection.py +256 -0
  46. sibi_flux/df_helper/backends/sqlalchemy/_db_gatekeeper.py +15 -0
  47. sibi_flux/df_helper/backends/sqlalchemy/_io_dask.py +386 -0
  48. sibi_flux/df_helper/backends/sqlalchemy/_load_from_db.py +134 -0
  49. sibi_flux/df_helper/backends/sqlalchemy/_model_registry.py +239 -0
  50. sibi_flux/df_helper/backends/sqlalchemy/_sql_model_builder.py +42 -0
  51. sibi_flux/df_helper/backends/utils.py +32 -0
  52. sibi_flux/df_helper/core/__init__.py +15 -0
  53. sibi_flux/df_helper/core/_defaults.py +104 -0
  54. sibi_flux/df_helper/core/_filter_handler.py +617 -0
  55. sibi_flux/df_helper/core/_params_config.py +185 -0
  56. sibi_flux/df_helper/core/_query_config.py +17 -0
  57. sibi_flux/df_validator/__init__.py +3 -0
  58. sibi_flux/df_validator/_df_validator.py +222 -0
  59. sibi_flux/logger/__init__.py +1 -0
  60. sibi_flux/logger/_logger.py +480 -0
  61. sibi_flux/mcp/__init__.py +26 -0
  62. sibi_flux/mcp/client.py +150 -0
  63. sibi_flux/mcp/router.py +126 -0
  64. sibi_flux/orchestration/__init__.py +9 -0
  65. sibi_flux/orchestration/_artifact_orchestrator.py +346 -0
  66. sibi_flux/orchestration/_pipeline_executor.py +212 -0
  67. sibi_flux/osmnx_helper/__init__.py +22 -0
  68. sibi_flux/osmnx_helper/_pbf_handler.py +384 -0
  69. sibi_flux/osmnx_helper/graph_loader.py +225 -0
  70. sibi_flux/osmnx_helper/utils.py +100 -0
  71. sibi_flux/pipelines/__init__.py +3 -0
  72. sibi_flux/pipelines/base.py +218 -0
  73. sibi_flux/py.typed +0 -0
  74. sibi_flux/readers/__init__.py +3 -0
  75. sibi_flux/readers/base.py +82 -0
  76. sibi_flux/readers/parquet.py +106 -0
  77. sibi_flux/utils/__init__.py +53 -0
  78. sibi_flux/utils/boilerplate/__init__.py +19 -0
  79. sibi_flux/utils/boilerplate/base_attacher.py +45 -0
  80. sibi_flux/utils/boilerplate/base_cube_router.py +283 -0
  81. sibi_flux/utils/boilerplate/base_data_cube.py +132 -0
  82. sibi_flux/utils/boilerplate/base_pipeline_template.py +54 -0
  83. sibi_flux/utils/boilerplate/hybrid_data_loader.py +193 -0
  84. sibi_flux/utils/clickhouse_writer/__init__.py +6 -0
  85. sibi_flux/utils/clickhouse_writer/_clickhouse_writer.py +225 -0
  86. sibi_flux/utils/common.py +7 -0
  87. sibi_flux/utils/credentials/__init__.py +3 -0
  88. sibi_flux/utils/credentials/_config_manager.py +155 -0
  89. sibi_flux/utils/dask_utils.py +14 -0
  90. sibi_flux/utils/data_utils/__init__.py +3 -0
  91. sibi_flux/utils/data_utils/_data_utils.py +389 -0
  92. sibi_flux/utils/dataframe_utils.py +52 -0
  93. sibi_flux/utils/date_utils/__init__.py +10 -0
  94. sibi_flux/utils/date_utils/_business_days.py +220 -0
  95. sibi_flux/utils/date_utils/_date_utils.py +311 -0
  96. sibi_flux/utils/date_utils/_file_age_checker.py +319 -0
  97. sibi_flux/utils/file_utils.py +48 -0
  98. sibi_flux/utils/filepath_generator/__init__.py +5 -0
  99. sibi_flux/utils/filepath_generator/_filepath_generator.py +185 -0
  100. sibi_flux/utils/parquet_saver/__init__.py +6 -0
  101. sibi_flux/utils/parquet_saver/_parquet_saver.py +436 -0
  102. sibi_flux/utils/parquet_saver/_write_gatekeeper.py +33 -0
  103. sibi_flux/utils/retry.py +46 -0
  104. sibi_flux/utils/storage/__init__.py +7 -0
  105. sibi_flux/utils/storage/_fs_registry.py +112 -0
  106. sibi_flux/utils/storage/_storage_manager.py +257 -0
  107. sibi_flux/utils/storage/factory.py +33 -0
  108. sibi_flux-2025.12.0.dist-info/METADATA +283 -0
  109. sibi_flux-2025.12.0.dist-info/RECORD +110 -0
  110. sibi_flux-2025.12.0.dist-info/WHEEL +4 -0
sibi_flux/dask_cluster/core.py
@@ -0,0 +1,322 @@
+"""
+Core functionality for resilient Dask operations.
+- Dry Run: Graph complexity inspection and logging to OpenObserve.
+- Resilience: Auto-healing via persistent client registry.
+- Invariants: Strict prohibition of local fallback for Dask DataFrames.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import logging
+from typing import Any, Callable, Dict, List, Optional, TypeVar
+
+import dask
+import dask.dataframe as dd
+import pandas as pd
+try:
+    from dask.distributed import Client, Future
+    from dask.distributed import wait as dask_wait
+except ImportError:
+    Client = object
+    Future = object
+    def dask_wait(*args, **kwargs):
+        pass
+
+# Project-specific imports
+from .client_manager import get_persistent_client
+from .exceptions import RECOVERABLE_COMMS
+from .utils import _to_int_safe
+
+T = TypeVar("T")
+
+# ---------------------------------------------------------------------------
+# Late-Binding & Helpers
+# ---------------------------------------------------------------------------
+
+
+def _get_log():
+    """Late-binds the Logger to prevent circular imports during init."""
+    try:
+        from sibi_flux.logger import Logger
+
+        return Logger.default_logger(logger_name="dask_cluster.core")
+    except ImportError:
+        return logging.getLogger("dask_cluster.core")
+
+
+def _is_dask_dataframe_like(obj: Any) -> bool:
+    """Checks if object is a Dask collection relying on distributed state."""
+    return isinstance(obj, (dd.DataFrame, dd.Series)) or hasattr(obj, "_meta")
+
+
+def _get_active_client(
+    provided_client: Optional[Client], logger=None
+) -> Optional[Client]:
+    """Retrieves a healthy client, healing the persistent one if necessary."""
+    if provided_client and provided_client.status == "running":
+        return provided_client
+    try:
+        # get_persistent_client handles internal healing/watchdog logic
+        return get_persistent_client(logger=logger)
+    except Exception:
+        return None
+
+
+# ---------------------------------------------------------------------------
+# Inspection & Dry Run
+# ---------------------------------------------------------------------------
+
+
+def get_graph_metrics(obj: Any) -> Dict[str, Any]:
+    """Extract complexity metrics from a Dask object for observability."""
+    try:
+        # If it's a list (e.g., for safe_gather), check if any item is dask-backed
+        if isinstance(obj, list):
+            obj = obj[0] if obj and hasattr(obj[0], "__dask_graph__") else None
+
+        if obj is None or not hasattr(obj, "__dask_graph__"):
+            return {"is_dask": False}
+
+        graph = obj.__dask_graph__()
+        return {
+            "type": type(obj).__name__,
+            "is_dask": True,
+            "task_count": len(graph),
+            "n_partitions": getattr(obj, "npartitions", "N/A"),
+            "layers": (
+                len(getattr(graph, "layers", []))
+                if hasattr(graph, "layers")
+                else "unknown"
+            ),
+        }
+    except Exception as e:
+        return {"error_extracting_metrics": str(e)}
+
+
+# ---------------------------------------------------------------------------
+# Resilient Execution Engine
+# ---------------------------------------------------------------------------
+
+
+def _execute_with_resilience(
+    op: Callable[..., T],
+    obj: Any,
+    dask_client: Optional[Client],
+    logger=None,
+    dry_run: bool = False,
+    **kwargs,
+) -> Optional[T]:
+    """
+    Orchestrates Dask operations with dry-run logging and a single-retry
+    healing mechanism for communication failures.
+    """
+    log = logger or _get_log()
+
+    # 1. Observability: Log graph complexity to OpenObserve
+    metrics = get_graph_metrics(obj)
+    if metrics.get("is_dask"):
+        log.info(
+            "Dask Graph Inspection",
+            extra={"graph_metrics": metrics, "dry_run": dry_run},
+        )
+    if dry_run:
+        log.info("Dry Run: Execution skipped.")
+        return None
+
+    # 2. Execution with Auto-Healing
+    active_client = _get_active_client(dask_client, logger=log)
+    try:
+        return op(obj, active_client, **kwargs)
+    except RECOVERABLE_COMMS as e:
+        log.warning(f"Dask comm failure ({type(e).__name__}). Healing and retrying.")
+
+        # Trigger explicit heal via singleton refresh
+        active_client = get_persistent_client(logger=log)
+
+        if active_client:
+            log.info("Client healed. Resubmitting task.")
+            return op(obj, active_client, **kwargs)
+
+        # Guard: Never fall back to local compute for DataFrames (Memory Safety)
+        if _is_dask_dataframe_like(obj):
+            raise RuntimeError(
+                "Distributed client lost and cannot be healed. "
+                "Local fallback forbidden for DataFrames."
+            ) from e
+
+        log.warning("Falling back to local threaded compute (safe for non-DataFrame).")
+        return obj.compute(scheduler="threads")
+
+
+# ---------------------------------------------------------------------------
+# Public API
+# ---------------------------------------------------------------------------
+
+
+def _compute_impl(obj: Any, client: Optional[Client]) -> Any:
+    if client:
+        res = client.compute(obj)
+        return res.result() if isinstance(res, Future) else res
+    return obj.compute()
+
+
+def safe_compute(
+    obj: Any, dask_client: Optional[Client] = None, logger=None, dry_run: bool = False
+) -> Any:
+    """Compute with auto-healing and optional dry-run complexity logging."""
+    return _execute_with_resilience(
+        _compute_impl, obj, dask_client, logger, dry_run=dry_run
+    )
+
+
+def safe_persist(obj: Any, dask_client: Optional[Client] = None, logger=None) -> Any:
+    """Persist a collection to distributed memory with auto-healing."""
+
+    def _persist_op(o, c):
+        return c.persist(o) if c else o.persist()
+
+    return _execute_with_resilience(_persist_op, obj, dask_client, logger)
+
+
+def safe_gather(
+    objs: List[Any], dask_client: Optional[Client] = None, logger=None
+) -> List[Any]:
+    """Gather multiple futures or collections into local memory."""
+    if not objs:
+        return []
+
+    def _gather_op(items, client):
+        if client:
+            return client.gather(client.compute(items))
+        return list(dask.compute(*items, scheduler="threads"))
+
+    return _execute_with_resilience(_gather_op, objs, dask_client, logger)
+
+
+def safe_wait(
+    obj: Any,
+    dask_client: Optional[Client] = None,
+    timeout: Optional[float] = None,
+    logger=None,
+) -> Any:
+    """Wait for completion. Safe from local-fallback for DataFrames."""
+    log = logger or _get_log()
+    client = _get_active_client(dask_client, logger=log)
+    try:
+        if client:
+            dask_wait(obj, timeout=timeout)
+        elif not _is_dask_dataframe_like(obj) and hasattr(obj, "compute"):
+            obj.compute(scheduler="threads")
+        return obj
+    except Exception as e:
+        log.warning(f"safe_wait: {type(e).__name__}: {e}")
+        return obj
+
+
+def safe_dry_run(obj: Any, logger=None) -> Dict[str, Any]:
+    """Utility to log and return graph metrics without execution."""
+    metrics = get_graph_metrics(obj)
+    (logger or _get_log()).info("Manual Dask Dry Run", extra={"graph_metrics": metrics})
+    return metrics
+
+
+# ---------------------------------------------------------------------------
+# Heuristic Emptiness Checks
+# ---------------------------------------------------------------------------
+
+
+def dask_is_probably_empty(ddf: dd.DataFrame) -> bool:
+    """Metadata check (zero partitions)."""
+    return getattr(ddf, "npartitions", 0) == 0
+
+
+def dask_is_empty_truthful(
+    ddf: dd.DataFrame, dask_client: Optional[Client] = None, logger=None
+) -> bool:
+    """Expensive but accurate full-table count check."""
+    total = safe_compute(
+        ddf.map_partitions(len, meta=("n", "int64")).sum(),
+        dask_client=dask_client,
+        logger=logger,
+    )
+    return int(_to_int_safe(total)) == 0
+
+
+def dask_is_empty(
+    ddf: dd.DataFrame,
+    *,
+    sample: int = 4,
+    dask_client: Optional[Client] = None,
+    logger=None,
+) -> bool:
+    """
+    Multi-stage check:
+    1. Metadata
+    2. Parallel sampling of first K partitions
+    3. Truthful sum (fallback)
+    """
+    if dask_is_probably_empty(ddf):
+        return True
+
+    k = min(max(sample, 1), ddf.npartitions)
+    try:
+        parts = [
+            ddf.get_partition(i).map_partitions(len, meta=("n", "int64"))
+            for i in range(k)
+        ]
+        probes = safe_gather(parts, dask_client, logger=logger)
+
+        if any(_to_int_safe(n) > 0 for n in probes):
+            return False
+
+        if k == ddf.npartitions:
+            return True
+    except Exception as e:
+        _get_log().warning(f"dask_is_empty probe failed: {e}")
+        return False
+
+    return dask_is_empty_truthful(ddf, dask_client=dask_client, logger=logger)
+
+
+# ---------------------------------------------------------------------------
+# Data Extraction
+# ---------------------------------------------------------------------------
+
+
+class UniqueValuesExtractor:
+    """Resilient unique value extraction from Dask columns."""
+
+    def __init__(self, dask_client: Optional[Client] = None, logger=None):
+        self.dask_client = dask_client
+        self.logger = logger
+
+    async def extract_unique_values(
+        self, df: dd.DataFrame, *columns: str, limit: int = 100_000
+    ) -> Dict[str, List[Any]]:
+        async def _extract(col):
+            # Optimization: drop duplicates on the distributed collection first
+            unique_dd = df[col].dropna().drop_duplicates()
+
+            # Fetch at most `limit` values so the local result stays bounded in memory.
+            # npartitions=-1 lets head() keep scanning partitions until `limit` rows
+            # have been collected, instead of stopping after the first partition.
+            # compute=True makes head() trigger the fetch immediately.
+
+            # We run this in a thread because head(compute=True) is blocking
+            res = await asyncio.to_thread(
+                lambda: unique_dd.head(limit, npartitions=-1, compute=True)
+            )
+
+            if len(res) >= limit:
+                if self.logger:
+                    self.logger.warning(
+                        f"Unique value extraction for column '{col}' truncated at {limit} items. "
+                        "High cardinality detected.",
+                        extra={"column": col, "limit": limit},
+                    )
+
+            return col, res.tolist()
+
+        results = await asyncio.gather(*(_extract(c) for c in columns))
+        return dict(results)
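The helpers above are meant to replace direct .compute()/.persist() calls so that graph inspection, dry runs, and client healing happen in one place. A minimal usage sketch, assuming a working sibi_flux installation whose get_persistent_client can reach or create a Dask client; the import path mirrors the file path, and the DataFrame, column names, and limit are illustrative only:

    import asyncio

    import dask.dataframe as dd
    import pandas as pd

    from sibi_flux.dask_cluster.core import (
        UniqueValuesExtractor,
        dask_is_empty,
        safe_compute,
        safe_persist,
    )

    # Illustrative data only.
    ddf = dd.from_pandas(
        pd.DataFrame({"city": ["SJO", "LIR", "SJO"], "qty": [1, 2, 3]}),
        npartitions=2,
    )

    # Log graph complexity without executing anything (returns None).
    safe_compute(ddf["qty"].sum(), dry_run=True)

    # Persist with auto-healing, then compute the reduction.
    ddf = safe_persist(ddf)
    total = safe_compute(ddf["qty"].sum())

    # Cheap multi-stage emptiness probe before paying for a full count.
    if not dask_is_empty(ddf, sample=2):
        print("total qty:", total)

    # Bounded unique-value extraction (async API).
    uniques = asyncio.run(
        UniqueValuesExtractor().extract_unique_values(ddf, "city", limit=1_000)
    )
    print(uniques["city"])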
sibi_flux/dask_cluster/exceptions.py
@@ -0,0 +1,34 @@
+"""
+Custom exceptions for the dask_cluster package.
+"""
+
+try:
+    # distributed >=2024 uses this location
+    from distributed.comm.core import CommClosedError  # type: ignore
+except ImportError:  # pragma: no cover
+
+    class CommClosedError(Exception):  # type: ignore[no-redef]
+        """Fallback CommClosedError for older distributed versions."""

+        pass
+
+
+try:
+    from tornado.iostream import StreamClosedError  # type: ignore
+except ImportError:  # pragma: no cover
+
+    class StreamClosedError(Exception):  # type: ignore[no-redef]
+        """Fallback StreamClosedError for missing tornado."""
+
+        pass
+
+
+# Common exception set considered recoverable by rebind-and-retry
+RECOVERABLE_COMMS = (
+    CommClosedError,
+    StreamClosedError,
+    TimeoutError,
+    ConnectionError,
+    OSError,
+    RuntimeError,
+)
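RECOVERABLE_COMMS casts a deliberately wide net (OSError and RuntimeError are included alongside the comm and stream errors), so it is intended for a bounded rebind-and-retry rather than an open-ended retry loop. A minimal sketch of that pattern, mirroring _execute_with_resilience in core.py; run_with_one_retry and submit_once are illustrative names, not part of the package:

    from sibi_flux.dask_cluster.client_manager import get_persistent_client
    from sibi_flux.dask_cluster.exceptions import RECOVERABLE_COMMS

    def run_with_one_retry(submit_once):
        """submit_once(client) wraps a single Dask submission or compute call."""
        client = get_persistent_client(logger=None)
        try:
            return submit_once(client)
        except RECOVERABLE_COMMS:
            # Re-resolve the persistent client (which may rebuild it) and retry once.
            client = get_persistent_client(logger=None)
            return submit_once(client)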
sibi_flux/dask_cluster/utils.py
@@ -0,0 +1,49 @@
+"""
+Utility functions for the dask_cluster package.
+"""
+
+import numpy as np  # type: ignore
+import pandas as pd
+from typing import Any
+
+
+def _to_int_safe(x: Any, default: int = 0) -> int:
+    """
+    Safely convert a value to integer with fallback defaults.
+    """
+    if x is None:
+        return default
+    if isinstance(x, (int, np.integer)) and not isinstance(x, bool):
+        return int(x)
+    if isinstance(x, (float, np.floating)):
+        try:
+            return int(x)
+        except Exception:
+            return default
+    if isinstance(x, np.generic):
+        try:
+            return int(x.item())
+        except Exception:
+            return default
+    if isinstance(x, (pd.Series, pd.Index, list, tuple, np.ndarray)):
+        try:
+            arr = np.asarray(x)
+            if arr.size == 0:
+                return default
+            return _to_int_safe(arr.ravel()[0], default=default)
+        except Exception:
+            return default
+    if hasattr(x, "item"):
+        try:
+            return _to_int_safe(x.item(), default=default)
+        except Exception:
+            return default
+    if hasattr(x, "iloc"):
+        try:
+            return _to_int_safe(x.iloc[0], default=default)
+        except Exception:
+            return default
+    try:
+        return int(x)
+    except Exception:
+        return default
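_to_int_safe exists so that whatever shape a Dask or pandas reduction returns (plain scalar, NumPy scalar, one-element Series) collapses to a plain int, with a default instead of an exception. A few illustrative calls, assuming the behaviour exactly as written above:

    import numpy as np
    import pandas as pd

    from sibi_flux.dask_cluster.utils import _to_int_safe

    _to_int_safe(None)                          # 0  (default)
    _to_int_safe(np.int64(7))                   # 7
    _to_int_safe(3.9)                           # 3  (int() truncates)
    _to_int_safe(pd.Series([5, 6]))             # 5  (first element of array-likes)
    _to_int_safe(pd.Series([], dtype="int64"))  # 0  (empty collapses to default)
    _to_int_safe("not a number", default=-1)    # -1 (final int() fails, default wins)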
sibi_flux/datacube/__init__.py
@@ -0,0 +1,3 @@
+from ._data_cube import Datacube, DatacubeConfig
+
+__all__ = ["Datacube", "DatacubeConfig"]