sibi-dst 2025.9.14__py3-none-any.whl → 2025.9.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sibi_dst/utils/dask_utils.py +325 -48
- {sibi_dst-2025.9.14.dist-info → sibi_dst-2025.9.15.dist-info}/METADATA +2 -1
- {sibi_dst-2025.9.14.dist-info → sibi_dst-2025.9.15.dist-info}/RECORD +5 -5
- {sibi_dst-2025.9.14.dist-info → sibi_dst-2025.9.15.dist-info}/WHEEL +0 -0
- {sibi_dst-2025.9.14.dist-info → sibi_dst-2025.9.15.dist-info}/top_level.txt +0 -0
sibi_dst/utils/dask_utils.py
CHANGED
@@ -63,52 +63,158 @@ class UniqueValuesExtractor:
         pairs = await asyncio.gather(*(one(c) for c in columns))
         return dict(pairs)

-
-
+import asyncio
+import json
+import logging
 import os
+import tempfile
+from contextlib import suppress, asynccontextmanager, contextmanager
+from typing import Optional
+from dask.distributed import Client, LocalCluster, get_client
+from filelock import FileLock
+

 class DaskClientMixin:
     """
-    Provides shared Dask client lifecycle management
-
-
+    Provides shared Dask client lifecycle management with:
+    - Shared registry (JSON + file lock)
+    - Automatic refcounting across processes
+    - Auto-cleanup of stale clusters
+    - Optional background watchdog to monitor cluster health
     """

+    REGISTRY_PATH = os.path.join(tempfile.gettempdir(), "shared_dask_cluster.json")
+    REGISTRY_LOCK = FileLock(REGISTRY_PATH + ".lock")
+    WATCHDOG_INTERVAL = 60  # seconds between health checks
+
+    def __init__(self, **kwargs):
+        self.dask_client: Optional[Client] = None
+        self.own_dask_client: bool = False
+        self.logger = kwargs.get("logger") or logging.getLogger(__name__)
+        self._watchdog_task: Optional[asyncio.Task] = None
+        self._watchdog_stop = asyncio.Event()
+
+    # ----------------------------------------------------------------------
+    # Registry management
+    # ----------------------------------------------------------------------
+    @classmethod
+    def _read_registry(cls) -> Optional[dict]:
+        """Read registry JSON if it exists and is valid."""
+        if not os.path.exists(cls.REGISTRY_PATH):
+            return None
+        try:
+            with open(cls.REGISTRY_PATH, "r") as f:
+                data = json.load(f)
+            if "address" not in data or not isinstance(data["address"], str):
+                return None
+            return data
+        except (json.JSONDecodeError, OSError):
+            return None
+
+    @classmethod
+    def _write_registry(cls, data: dict) -> None:
+        """Write updated registry JSON atomically."""
+        tmp_path = cls.REGISTRY_PATH + ".tmp"
+        with open(tmp_path, "w") as f:
+            json.dump(data, f)
+        os.replace(tmp_path, cls.REGISTRY_PATH)
+
+    @classmethod
+    def _remove_registry(cls) -> None:
+        """Delete the registry file if present."""
+        with suppress(FileNotFoundError):
+            os.remove(cls.REGISTRY_PATH)
+
+    @classmethod
+    def _cleanup_stale_registry(cls, logger=None):
+        """Detect and remove stale registry entries if cluster is unreachable."""
+        registry = cls._read_registry()
+        if not registry:
+            return
+        try:
+            client = Client(address=registry["address"], timeout=5)
+            client.close()
+        except Exception:
+            if logger:
+                logger.warning(
+                    f"Detected stale Dask cluster registry at {registry.get('address')}. Cleaning up."
+                )
+            cls._remove_registry()
+
+    # ----------------------------------------------------------------------
+    # Dask client initialization
+    # ----------------------------------------------------------------------
     def _init_dask_client(
         self,
-        dask_client=None,
-        logger=None,
+        dask_client: Optional[Client] = None,
         *,
-        n_workers: int = 1,
+        logger=None,
+        scheduler_address: Optional[str] = None,
+        use_remote_cluster: bool = False,
+        n_workers: int = 2,
         threads_per_worker: int = 1,
         processes: bool = False,
         asynchronous: bool = False,
         memory_limit: str = "auto",
-        #dashboard_address: str | None = None,
-        local_directory: str | None = None,
+        local_directory: Optional[str] = None,
         silence_logs: str = "info",
-        resources: dict | None = None,
+        resources: Optional[dict] = None,
         timeout: int = 30,
+        watchdog: bool = True,
     ):
+        """Initialize or attach to a shared Dask client."""
+        self.logger = logger or self.logger
         self.dask_client = dask_client
         self.own_dask_client = False
-        self.logger = logger
-        # Apply log filters globally
-        logging.getLogger("distributed.shuffle._scheduler_plugin").setLevel(
-            logging.ERROR
-        )
+
+        # Silence excessive logging
         logging.getLogger("distributed.scheduler").setLevel(logging.WARNING)
         logging.getLogger("distributed.worker").setLevel(logging.WARNING)
+        logging.getLogger("distributed.shuffle._scheduler_plugin").setLevel(logging.ERROR)

+        # 1️⃣ Try reusing existing client
         if self.dask_client is None:
             with suppress(ValueError, RuntimeError):
-                # Try to attach to an existing client (common in shared Dask setups)
                 self.dask_client = get_client()

-
-
-
+        # 2️⃣ Try remote cluster connection
+        if self.dask_client is None and use_remote_cluster and scheduler_address:
+            try:
+                self.dask_client = Client(address=scheduler_address, timeout=timeout)
+                self.own_dask_client = True
+                self.logger.info(
+                    f"Connected to external Dask scheduler at {scheduler_address}. "
+                    f"Dashboard: {self.dask_client.dashboard_link}"
+                )
+                if watchdog:
+                    self._start_watchdog()
+                return
+            except Exception as e:
+                self.logger.warning(
+                    f"Failed to connect to remote Dask scheduler: {e}. Falling back to local cluster."
+                )
+
+        # 3️⃣ Shared local cluster via registry
+        with self.REGISTRY_LOCK:
+            self._cleanup_stale_registry(self.logger)
+            registry = self._read_registry()

+            if registry:
+                try:
+                    self.dask_client = Client(address=registry["address"], timeout=timeout)
+                    registry["refcount"] = registry.get("refcount", 0) + 1
+                    self._write_registry(registry)
+                    self.logger.info(
+                        f"Reusing existing LocalCluster at {registry['address']} (refcount={registry['refcount']})."
+                    )
+                    if watchdog:
+                        self._start_watchdog()
+                    return
+                except Exception:
+                    self.logger.warning("Existing cluster unreachable. Recreating.")
+                    self._remove_registry()
+
+            # Create a new local cluster
             cluster = LocalCluster(
                 n_workers=n_workers,
                 threads_per_worker=threads_per_worker,
@@ -123,37 +229,208 @@ class DaskClientMixin:

             self.dask_client = Client(cluster)
             self.own_dask_client = True
+            registry = {"address": cluster.scheduler_address, "refcount": 1}
+            self._write_registry(registry)
+            self.logger.info(
+                f"Started new LocalCluster ({n_workers} workers × {threads_per_worker} threads). "
+                f"Dashboard: {self.dask_client.dashboard_link}"
+            )

-            if self.logger:
-                self.logger.info(
-                    f"Started local Dask cluster with {n_workers} workers × {threads_per_worker} threads "
-                    f"({memory_limit} memory per worker). Dashboard: {self.dask_client.dashboard_link}"
-                )
-        else:
-            if self.logger:
-                self.logger.debug(
-                    f"Using existing Dask client: {self.dask_client.dashboard_link}"
-                )
+            if watchdog:
+                self._start_watchdog()
+
+    # ----------------------------------------------------------------------
+    # Watchdog logic
+    # ----------------------------------------------------------------------
+    def _start_watchdog(self):
+        """Spawn a background watchdog that monitors registry health."""
+        async def watchdog_loop():
+            while not self._watchdog_stop.is_set():
+                await asyncio.sleep(self.WATCHDOG_INTERVAL)
+                try:
+                    self._cleanup_stale_registry(self.logger)
+                except Exception as e:
+                    self.logger.warning(f"Dask watchdog encountered an error: {e}")

+        try:
+            loop = asyncio.get_event_loop()
+            if loop.is_running():
+                self._watchdog_task = loop.create_task(watchdog_loop())
+                self.logger.debug("Started Dask registry watchdog (async).")
+        except RuntimeError:
+            # Fallback for synchronous usage
+            self.logger.debug("Watchdog skipped (no active event loop).")
+
+    async def _stop_watchdog(self):
+        """Stop the watchdog loop gracefully."""
+        self._watchdog_stop.set()
+        if self._watchdog_task:
+            await asyncio.wait([self._watchdog_task], timeout=5)
+        self._watchdog_task = None
+
+    # ----------------------------------------------------------------------
+    # Client cleanup
+    # ----------------------------------------------------------------------
     def _close_dask_client(self):
-        """Close the Dask client if this instance created it."""
-        if getattr(self, "own_dask_client", False) and self.dask_client is not None:
-            try:
-                cluster = getattr(self.dask_client, "cluster", None)
-                self.dask_client.close()
-                if cluster is not None:
-                    cluster.close()
-                if self.logger:
-                    self.logger.info("Closed local Dask client and cluster.")
-            except Exception as e:
-                if self.logger:
-                    self.logger.warning(f"Error while closing Dask client: {e}")
+        """Safely close client and update registry reference count."""
+        if not self.dask_client:
+            return
+
+        with self.REGISTRY_LOCK:
+            registry = self._read_registry()

-@asynccontextmanager
-async def shared_dask_session(**kwargs):
+            if registry and "refcount" in registry:
+                registry["refcount"] = max(0, registry["refcount"] - 1)
+                if registry["refcount"] == 0:
+                    self.logger.info("Reference count 0 — closing LocalCluster.")
+                    try:
+                        cluster = getattr(self.dask_client, "cluster", None)
+                        self.dask_client.close()
+                        if cluster:
+                            cluster.close()
+                    except Exception as e:
+                        self.logger.warning(f"Error closing Dask cluster: {e}")
+                    self._remove_registry()
+                else:
+                    self._write_registry(registry)
+                    self.logger.debug(
+                        f"Decremented LocalCluster refcount to {registry['refcount']}."
+                    )
+            else:
+                with suppress(Exception):
+                    self.dask_client.close()
+                self.logger.debug("Closed Dask client without registry tracking.")
+
+        # Stop watchdog if active
+        if self._watchdog_task:
+            asyncio.create_task(self._stop_watchdog())
+
+
+# ----------------------------------------------------------------------
+# Shared Dask session (sync + async)
+# ----------------------------------------------------------------------
+def shared_dask_session(*, async_mode: bool = True, **kwargs):
+    """
+    Context manager for a shared Dask session (supports async + sync).
+
+    Example:
+        async with shared_dask_session(logger=logger) as client:
+            ...
+
+        with shared_dask_session(async_mode=False) as client:
+            ...
+    """
     mixin = DaskClientMixin()
     mixin._init_dask_client(**kwargs)
-    try:
-        yield mixin.dask_client
-    finally:
-        mixin._close_dask_client()
+
+    if async_mode:
+        @asynccontextmanager
+        async def _async_manager():
+            try:
+                yield mixin.dask_client
+            finally:
+                mixin._close_dask_client()
+        return _async_manager()
+    else:
+        @contextmanager
+        def _sync_manager():
+            try:
+                yield mixin.dask_client
+            finally:
+                mixin._close_dask_client()
+        return _sync_manager()
+
+# from contextlib import suppress, asynccontextmanager
+# from dask.distributed import Client, LocalCluster, get_client
+# import os
+#
+# class DaskClientMixin:
+#     """
+#     Provides shared Dask client lifecycle management.
+#     Ensures reuse of an existing client if available,
+#     or creates a local in-process Dask cluster for fallback.
+#     """
+#
+#     def _init_dask_client(
+#         self,
+#         dask_client=None,
+#         logger=None,
+#         *,
+#         n_workers: int = 1,
+#         threads_per_worker: int = 1,
+#         processes: bool = False,
+#         asynchronous: bool = False,
+#         memory_limit: str = "auto",
+#         #dashboard_address: str | None = None,
+#         local_directory: str | None = None,
+#         silence_logs: str = "info",
+#         resources: dict | None = None,
+#         timeout: int = 30,
+#     ):
+#         self.dask_client = dask_client
+#         self.own_dask_client = False
+#         self.logger = logger
+#         # Apply log filters globally
+#         logging.getLogger("distributed.shuffle._scheduler_plugin").setLevel(
+#             logging.ERROR
+#         )
+#         logging.getLogger("distributed.scheduler").setLevel(logging.WARNING)
+#         logging.getLogger("distributed.worker").setLevel(logging.WARNING)
+#
+#         if self.dask_client is None:
+#             with suppress(ValueError, RuntimeError):
+#                 # Try to attach to an existing client (common in shared Dask setups)
+#                 self.dask_client = get_client()
+#
+#         if self.dask_client is None:
+#             # Default to half of logical cores if not specified
+#             n_workers = n_workers or max(2, os.cpu_count() // 2)
+#
+#             cluster = LocalCluster(
+#                 n_workers=n_workers,
+#                 threads_per_worker=threads_per_worker,
+#                 processes=processes,
+#                 asynchronous=asynchronous,
+#                 memory_limit=memory_limit,
+#                 local_directory=local_directory,
+#                 silence_logs=silence_logs,
+#                 resources=resources,
+#                 timeout=timeout,
+#             )
+#
+#             self.dask_client = Client(cluster)
+#             self.own_dask_client = True
+#
+#             if self.logger:
+#                 self.logger.info(
+#                     f"Started local Dask cluster with {n_workers} workers × {threads_per_worker} threads "
+#                     f"({memory_limit} memory per worker). Dashboard: {self.dask_client.dashboard_link}"
+#                 )
+#         else:
+#             if self.logger:
+#                 self.logger.debug(
+#                     f"Using existing Dask client: {self.dask_client.dashboard_link}"
+#                 )
+#
+#     def _close_dask_client(self):
+#         """Close the Dask client if this instance created it."""
+#         if getattr(self, "own_dask_client", False) and self.dask_client is not None:
+#             try:
+#                 cluster = getattr(self.dask_client, "cluster", None)
+#                 self.dask_client.close()
+#                 if cluster is not None:
+#                     cluster.close()
+#                 if self.logger:
+#                     self.logger.info("Closed local Dask client and cluster.")
+#             except Exception as e:
+#                 if self.logger:
+#                     self.logger.warning(f"Error while closing Dask client: {e}")
+#
+# @asynccontextmanager
+# async def shared_dask_session(**kwargs):
+#     mixin = DaskClientMixin()
+#     mixin._init_dask_client(**kwargs)
+#     try:
+#         yield mixin.dask_client
+#     finally:
+#         mixin._close_dask_client()
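The core of the new lifecycle management is the shared registry shown in the first hunk: every process that wants the cluster takes a cross-process file lock, reads a small JSON file, bumps a reference count, and writes the file back through a temp file plus os.replace so a crash can never leave a half-written registry behind. A minimal standalone sketch of that pattern, with an illustrative path and a hypothetical bump_refcount helper that are not part of the package:

import json
import os
import tempfile

from filelock import FileLock

# Illustrative path; the package itself uses tempfile.gettempdir()/shared_dask_cluster.json.
REGISTRY_PATH = os.path.join(tempfile.gettempdir(), "example_registry.json")
REGISTRY_LOCK = FileLock(REGISTRY_PATH + ".lock")


def bump_refcount(address: str) -> dict:
    """Atomically increment the refcount for a scheduler address (hypothetical helper)."""
    with REGISTRY_LOCK:  # serialize access across processes
        data = {"address": address, "refcount": 0}
        if os.path.exists(REGISTRY_PATH):
            try:
                with open(REGISTRY_PATH, "r") as f:
                    data = json.load(f)
            except (json.JSONDecodeError, OSError):
                pass  # treat a corrupt registry as empty
        data["refcount"] = data.get("refcount", 0) + 1
        tmp_path = REGISTRY_PATH + ".tmp"
        with open(tmp_path, "w") as f:
            json.dump(data, f)
        os.replace(tmp_path, REGISTRY_PATH)  # atomic rename, never a partial file
        return data


if __name__ == "__main__":
    print(bump_refcount("tcp://127.0.0.1:8786"))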
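Going by the docstring of the new shared_dask_session factory, calling code would look roughly like the sketch below. The import path simply follows the file being diffed, and the submitted computation is purely illustrative; treat the whole snippet as an assumption rather than documented API.

import asyncio

from sibi_dst.utils.dask_utils import shared_dask_session


def demo_sync() -> None:
    # async_mode=False yields a regular context manager backed by the shared cluster.
    with shared_dask_session(async_mode=False) as client:
        result = client.submit(sum, range(10)).result()
        print(result, client.dashboard_link)


async def demo_async() -> None:
    # The default async_mode=True returns an async context manager, per the docstring.
    async with shared_dask_session() as client:
        result = client.submit(sum, range(10)).result()
        print(result, client.dashboard_link)


if __name__ == "__main__":
    demo_sync()
    # asyncio.run(demo_async()) would exercise the async path shown in the docstring.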
{sibi_dst-2025.9.14.dist-info → sibi_dst-2025.9.15.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sibi-dst
-Version: 2025.9.14
+Version: 2025.9.15
 Summary: A data science toolkit for scalable data processing and analysis.
 Requires-Python: >=3.11
 Description-Content-Type: text/markdown
@@ -9,6 +9,7 @@ Requires-Dist: clickhouse-driver>=0.2.9
 Requires-Dist: dask>=2025.9.1
 Requires-Dist: distributed>=2025.9.1
 Requires-Dist: fastapi>=0.118.0
+Requires-Dist: filelock>=3.20.0
 Requires-Dist: folium>=0.20.0
 Requires-Dist: mysqlclient>=2.2.7
 Requires-Dist: opentelemetry-api>=1.37.0
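The single new requirement, filelock, is what backs the REGISTRY_LOCK used above: it provides an OS-level lock file that serializes registry access across independent processes. A small, hypothetical example of its timeout behaviour (the lock path is chosen purely for illustration):

import os
import tempfile

from filelock import FileLock, Timeout

lock_path = os.path.join(tempfile.gettempdir(), "example_registry.json.lock")
lock = FileLock(lock_path)

try:
    # acquire() waits up to `timeout` seconds, then raises Timeout instead of blocking forever.
    with lock.acquire(timeout=5):
        print("lock held; safe to read or rewrite the shared registry file")
except Timeout:
    print("another process currently holds the registry lock")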
{sibi_dst-2025.9.14.dist-info → sibi_dst-2025.9.15.dist-info}/RECORD
CHANGED
@@ -42,7 +42,7 @@ sibi_dst/utils/base.py,sha256=ycaaXlXTH4tn3fM954jGx8-zKWEmdjJkJKArQ2rfAH0,17527
 sibi_dst/utils/business_days.py,sha256=DPZExTXTt7n3IbAaEuVacm-vZgbR_Ug2bJTPBUaoP3g,6694
 sibi_dst/utils/clickhouse_writer.py,sha256=8W_dTEOKQp4pXANznVSxRqFA2H5oD8UJifiBAONpXWY,17001
 sibi_dst/utils/credentials.py,sha256=cHJPPsmVyijqbUQIq7WWPe-lIallA-mI5RAy3YUuRME,1724
-sibi_dst/utils/dask_utils.py,sha256=…
+sibi_dst/utils/dask_utils.py,sha256=F78smHtuJfZwCvCCBmPTofa0DPcm9oaNDH8YMOioweY,16383
 sibi_dst/utils/data_from_http_source.py,sha256=AcpKNsqTgN2ClNwuhgUpuNCx62r5_DdsAiKY8vcHEBA,1867
 sibi_dst/utils/data_utils.py,sha256=7bLidEjppieNoozDFb6OuRY0W995cxg4tiGAlkGfePI,7768
 sibi_dst/utils/data_wrapper.py,sha256=9HTuDXgvfhmFAOyNG_GEOaHuojxE3639yyzOoBt7Unc,18000
@@ -94,7 +94,7 @@ sibi_dst/v2/df_helper/core/_params_config.py,sha256=DYx2drDz3uF-lSPzizPkchhy-kxR
 sibi_dst/v2/df_helper/core/_query_config.py,sha256=Y8LVSyaKuVkrPluRDkQoOwuXHQxner1pFWG3HPfnDHM,441
 sibi_dst/v2/utils/__init__.py,sha256=6H4cvhqTiFufnFPETBF0f8beVVMpfJfvUs6Ne0TQZNY,58
 sibi_dst/v2/utils/log_utils.py,sha256=rfk5VsLAt-FKpv6aPTC1FToIPiyrnHAFFBAkHme24po,4123
-sibi_dst-2025.9.14.dist-info/METADATA,…
-sibi_dst-2025.9.14.dist-info/WHEEL,…
-sibi_dst-2025.9.14.dist-info/top_level.txt,…
-sibi_dst-2025.9.14.dist-info/RECORD,,
+sibi_dst-2025.9.15.dist-info/METADATA,sha256=UGjP-M2r9_h6jksc5bY-ltd-NA6G34N0Y3TH7OTvGCs,2445
+sibi_dst-2025.9.15.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+sibi_dst-2025.9.15.dist-info/top_level.txt,sha256=g3Cj4R-rciuNyJgcxuxNgw5nhN0n4TCB0ujcTEjZNiU,9
+sibi_dst-2025.9.15.dist-info/RECORD,,
{sibi_dst-2025.9.14.dist-info → sibi_dst-2025.9.15.dist-info}/WHEEL
File without changes
{sibi_dst-2025.9.14.dist-info → sibi_dst-2025.9.15.dist-info}/top_level.txt
File without changes