sibi-dst 2025.9.13.tar.gz → 2025.9.15.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sibi_dst-2025.9.13 → sibi_dst-2025.9.15}/PKG-INFO +2 -1
- {sibi_dst-2025.9.13 → sibi_dst-2025.9.15}/pyproject.toml +2 -1
- {sibi_dst-2025.9.13 → sibi_dst-2025.9.15}/sibi_dst/df_helper/_df_helper.py +1 -1
- {sibi_dst-2025.9.13 → sibi_dst-2025.9.15}/sibi_dst/utils/base.py +40 -18
- sibi_dst-2025.9.15/sibi_dst/utils/dask_utils.py +436 -0
- {sibi_dst-2025.9.13 → sibi_dst-2025.9.15}/sibi_dst.egg-info/PKG-INFO +2 -1
- {sibi_dst-2025.9.13 → sibi_dst-2025.9.15}/sibi_dst.egg-info/requires.txt +1 -0
- sibi_dst-2025.9.13/sibi_dst/utils/dask_utils.py +0 -200
- {sibi_dst-2025.9.13 → sibi_dst-2025.9.15}/README.md +0 -0
- {sibi_dst-2025.9.13 → sibi_dst-2025.9.15}/setup.cfg +0 -0
- {sibi_dst-2025.9.13 → sibi_dst-2025.9.15}/sibi_dst/__init__.py +0 -0
- {sibi_dst-2025.9.13 → sibi_dst-2025.9.15}/sibi_dst/df_helper/__init__.py +0 -0
- {sibi_dst-2025.9.13 → sibi_dst-2025.9.15}/sibi_dst/df_helper/_artifact_updater_async.py +0 -0
- {sibi_dst-2025.9.13 → sibi_dst-2025.9.15}/sibi_dst/df_helper/_artifact_updater_threaded.py +0 -0
- {sibi_dst-2025.9.13 → sibi_dst-2025.9.15}/sibi_dst/df_helper/_parquet_artifact.py +0 -0
- {sibi_dst-2025.9.13 → sibi_dst-2025.9.15}/sibi_dst/df_helper/_parquet_reader.py +0 -0
- {sibi_dst-2025.9.13 → sibi_dst-2025.9.15}/sibi_dst/df_helper/backends/__init__.py +0 -0
- {sibi_dst-2025.9.13 → sibi_dst-2025.9.15}/sibi_dst/df_helper/backends/http/__init__.py +0 -0
- {sibi_dst-2025.9.13 → sibi_dst-2025.9.15}/sibi_dst/df_helper/backends/http/_http_config.py +0 -0
- {sibi_dst-2025.9.13 → sibi_dst-2025.9.15}/sibi_dst/df_helper/backends/parquet/__init__.py +0 -0
- {sibi_dst-2025.9.13 → sibi_dst-2025.9.15}/sibi_dst/df_helper/backends/parquet/_parquet_options.py +0 -0
- {sibi_dst-2025.9.13 → sibi_dst-2025.9.15}/sibi_dst/df_helper/backends/sqlalchemy/__init__.py +0 -0
- {sibi_dst-2025.9.13 → sibi_dst-2025.9.15}/sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +0 -0
- {sibi_dst-2025.9.13 → sibi_dst-2025.9.15}/sibi_dst/df_helper/backends/sqlalchemy/_db_gatekeeper.py +0 -0
- {sibi_dst-2025.9.13 → sibi_dst-2025.9.15}/sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +0 -0
- {sibi_dst-2025.9.13 → sibi_dst-2025.9.15}/sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +0 -0
- {sibi_dst-2025.9.13 → sibi_dst-2025.9.15}/sibi_dst/df_helper/backends/sqlalchemy/_model_registry.py +0 -0
- {sibi_dst-2025.9.13 → sibi_dst-2025.9.15}/sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +0 -0
- {sibi_dst-2025.9.13 → sibi_dst-2025.9.15}/sibi_dst/df_helper/core/__init__.py +0 -0
- {sibi_dst-2025.9.13 → sibi_dst-2025.9.15}/sibi_dst/df_helper/core/_defaults.py +0 -0
- {sibi_dst-2025.9.13 → sibi_dst-2025.9.15}/sibi_dst/df_helper/core/_filter_handler.py +0 -0
- {sibi_dst-2025.9.13 → sibi_dst-2025.9.15}/sibi_dst/df_helper/core/_params_config.py +0 -0
- {sibi_dst-2025.9.13 → sibi_dst-2025.9.15}/sibi_dst/df_helper/core/_query_config.py +0 -0
- {sibi_dst-2025.9.13 → sibi_dst-2025.9.15}/sibi_dst/geopy_helper/__init__.py +0 -0
- {sibi_dst-2025.9.13 → sibi_dst-2025.9.15}/sibi_dst/geopy_helper/geo_location_service.py +0 -0
- {sibi_dst-2025.9.13 → sibi_dst-2025.9.15}/sibi_dst/geopy_helper/utils.py +0 -0
- {sibi_dst-2025.9.13 → sibi_dst-2025.9.15}/sibi_dst/osmnx_helper/__init__.py +0 -0
- {sibi_dst-2025.9.13 → sibi_dst-2025.9.15}/sibi_dst/osmnx_helper/base_osm_map.py +0 -0
- {sibi_dst-2025.9.13 → sibi_dst-2025.9.15}/sibi_dst/osmnx_helper/basemaps/__init__.py +0 -0
- {sibi_dst-2025.9.13 → sibi_dst-2025.9.15}/sibi_dst/osmnx_helper/basemaps/calendar_html.py +0 -0
- {sibi_dst-2025.9.13 → sibi_dst-2025.9.15}/sibi_dst/osmnx_helper/basemaps/route_map_plotter.py +0 -0
- {sibi_dst-2025.9.13 → sibi_dst-2025.9.15}/sibi_dst/osmnx_helper/basemaps/router_plotter.py +0 -0
- {sibi_dst-2025.9.13 → sibi_dst-2025.9.15}/sibi_dst/osmnx_helper/route_path_builder.py +0 -0
- {sibi_dst-2025.9.13 → sibi_dst-2025.9.15}/sibi_dst/osmnx_helper/utils.py +0 -0
- {sibi_dst-2025.9.13 → sibi_dst-2025.9.15}/sibi_dst/tests/__init__.py +0 -0
- {sibi_dst-2025.9.13 → sibi_dst-2025.9.15}/sibi_dst/tests/test_baseclass.py +0 -0
- {sibi_dst-2025.9.13 → sibi_dst-2025.9.15}/sibi_dst/tests/test_data_wrapper_class.py +0 -0
- {sibi_dst-2025.9.13 → sibi_dst-2025.9.15}/sibi_dst/utils/__init__.py +0 -0
- {sibi_dst-2025.9.13 → sibi_dst-2025.9.15}/sibi_dst/utils/async_utils.py +0 -0
- {sibi_dst-2025.9.13 → sibi_dst-2025.9.15}/sibi_dst/utils/boilerplate/__init__.py +0 -0
- {sibi_dst-2025.9.13 → sibi_dst-2025.9.15}/sibi_dst/utils/boilerplate/base_attacher.py +0 -0
- {sibi_dst-2025.9.13 → sibi_dst-2025.9.15}/sibi_dst/utils/boilerplate/base_data_cube.py +0 -0
- {sibi_dst-2025.9.13 → sibi_dst-2025.9.15}/sibi_dst/utils/boilerplate/base_parquet_artifact.py +0 -0
- {sibi_dst-2025.9.13 → sibi_dst-2025.9.15}/sibi_dst/utils/boilerplate/base_parquet_reader.py +0 -0
- {sibi_dst-2025.9.13 → sibi_dst-2025.9.15}/sibi_dst/utils/boilerplate/base_pipeline.py +0 -0
- {sibi_dst-2025.9.13 → sibi_dst-2025.9.15}/sibi_dst/utils/boilerplate/base_pipeline_template.py +0 -0
- {sibi_dst-2025.9.13 → sibi_dst-2025.9.15}/sibi_dst/utils/boilerplate/hybrid_data_loader.py +0 -0
- {sibi_dst-2025.9.13 → sibi_dst-2025.9.15}/sibi_dst/utils/business_days.py +0 -0
- {sibi_dst-2025.9.13 → sibi_dst-2025.9.15}/sibi_dst/utils/clickhouse_writer.py +0 -0
- {sibi_dst-2025.9.13 → sibi_dst-2025.9.15}/sibi_dst/utils/credentials.py +0 -0
- {sibi_dst-2025.9.13 → sibi_dst-2025.9.15}/sibi_dst/utils/data_from_http_source.py +0 -0
- {sibi_dst-2025.9.13 → sibi_dst-2025.9.15}/sibi_dst/utils/data_utils.py +0 -0
- {sibi_dst-2025.9.13 → sibi_dst-2025.9.15}/sibi_dst/utils/data_wrapper.py +0 -0
- {sibi_dst-2025.9.13 → sibi_dst-2025.9.15}/sibi_dst/utils/date_utils.py +0 -0
- {sibi_dst-2025.9.13 → sibi_dst-2025.9.15}/sibi_dst/utils/df_utils.py +0 -0
- {sibi_dst-2025.9.13 → sibi_dst-2025.9.15}/sibi_dst/utils/file_age_checker.py +0 -0
- {sibi_dst-2025.9.13 → sibi_dst-2025.9.15}/sibi_dst/utils/file_utils.py +0 -0
- {sibi_dst-2025.9.13 → sibi_dst-2025.9.15}/sibi_dst/utils/filepath_generator.py +0 -0
- {sibi_dst-2025.9.13 → sibi_dst-2025.9.15}/sibi_dst/utils/iceberg_saver.py +0 -0
- {sibi_dst-2025.9.13 → sibi_dst-2025.9.15}/sibi_dst/utils/log_utils.py +0 -0
- {sibi_dst-2025.9.13 → sibi_dst-2025.9.15}/sibi_dst/utils/manifest_manager.py +0 -0
- {sibi_dst-2025.9.13 → sibi_dst-2025.9.15}/sibi_dst/utils/parquet_saver.py +0 -0
- {sibi_dst-2025.9.13 → sibi_dst-2025.9.15}/sibi_dst/utils/periods.py +0 -0
- {sibi_dst-2025.9.13 → sibi_dst-2025.9.15}/sibi_dst/utils/phone_formatter.py +0 -0
- {sibi_dst-2025.9.13 → sibi_dst-2025.9.15}/sibi_dst/utils/progress/__init__.py +0 -0
- {sibi_dst-2025.9.13 → sibi_dst-2025.9.15}/sibi_dst/utils/progress/jobs.py +0 -0
- {sibi_dst-2025.9.13 → sibi_dst-2025.9.15}/sibi_dst/utils/progress/sse_runner.py +0 -0
- {sibi_dst-2025.9.13 → sibi_dst-2025.9.15}/sibi_dst/utils/storage_config.py +0 -0
- {sibi_dst-2025.9.13 → sibi_dst-2025.9.15}/sibi_dst/utils/storage_hive.py +0 -0
- {sibi_dst-2025.9.13 → sibi_dst-2025.9.15}/sibi_dst/utils/storage_manager.py +0 -0
- {sibi_dst-2025.9.13 → sibi_dst-2025.9.15}/sibi_dst/utils/update_planner.py +0 -0
- {sibi_dst-2025.9.13 → sibi_dst-2025.9.15}/sibi_dst/utils/webdav_client.py +0 -0
- {sibi_dst-2025.9.13 → sibi_dst-2025.9.15}/sibi_dst/utils/write_gatekeeper.py +0 -0
- {sibi_dst-2025.9.13 → sibi_dst-2025.9.15}/sibi_dst/v2/__init__.py +0 -0
- {sibi_dst-2025.9.13 → sibi_dst-2025.9.15}/sibi_dst/v2/df_helper/__init__.py +0 -0
- {sibi_dst-2025.9.13 → sibi_dst-2025.9.15}/sibi_dst/v2/df_helper/_df_helper.py +0 -0
- {sibi_dst-2025.9.13 → sibi_dst-2025.9.15}/sibi_dst/v2/df_helper/backends/__init__.py +0 -0
- {sibi_dst-2025.9.13 → sibi_dst-2025.9.15}/sibi_dst/v2/df_helper/backends/sqlalchemy/__init__.py +0 -0
- {sibi_dst-2025.9.13 → sibi_dst-2025.9.15}/sibi_dst/v2/df_helper/backends/sqlalchemy/_db_connection.py +0 -0
- {sibi_dst-2025.9.13 → sibi_dst-2025.9.15}/sibi_dst/v2/df_helper/backends/sqlalchemy/_io_dask.py +0 -0
- {sibi_dst-2025.9.13 → sibi_dst-2025.9.15}/sibi_dst/v2/df_helper/backends/sqlalchemy/_load_from_db.py +0 -0
- {sibi_dst-2025.9.13 → sibi_dst-2025.9.15}/sibi_dst/v2/df_helper/backends/sqlalchemy/_model_builder.py +0 -0
- {sibi_dst-2025.9.13 → sibi_dst-2025.9.15}/sibi_dst/v2/df_helper/backends/sqlmodel/__init__.py +0 -0
- {sibi_dst-2025.9.13 → sibi_dst-2025.9.15}/sibi_dst/v2/df_helper/backends/sqlmodel/_db_connection.py +0 -0
- {sibi_dst-2025.9.13 → sibi_dst-2025.9.15}/sibi_dst/v2/df_helper/backends/sqlmodel/_io_dask.py +0 -0
- {sibi_dst-2025.9.13 → sibi_dst-2025.9.15}/sibi_dst/v2/df_helper/backends/sqlmodel/_load_from_db.py +0 -0
- {sibi_dst-2025.9.13 → sibi_dst-2025.9.15}/sibi_dst/v2/df_helper/backends/sqlmodel/_model_builder.py +0 -0
- {sibi_dst-2025.9.13 → sibi_dst-2025.9.15}/sibi_dst/v2/df_helper/core/__init__.py +0 -0
- {sibi_dst-2025.9.13 → sibi_dst-2025.9.15}/sibi_dst/v2/df_helper/core/_filter_handler.py +0 -0
- {sibi_dst-2025.9.13 → sibi_dst-2025.9.15}/sibi_dst/v2/df_helper/core/_params_config.py +0 -0
- {sibi_dst-2025.9.13 → sibi_dst-2025.9.15}/sibi_dst/v2/df_helper/core/_query_config.py +0 -0
- {sibi_dst-2025.9.13 → sibi_dst-2025.9.15}/sibi_dst/v2/utils/__init__.py +0 -0
- {sibi_dst-2025.9.13 → sibi_dst-2025.9.15}/sibi_dst/v2/utils/log_utils.py +0 -0
- {sibi_dst-2025.9.13 → sibi_dst-2025.9.15}/sibi_dst.egg-info/SOURCES.txt +0 -0
- {sibi_dst-2025.9.13 → sibi_dst-2025.9.15}/sibi_dst.egg-info/dependency_links.txt +0 -0
- {sibi_dst-2025.9.13 → sibi_dst-2025.9.15}/sibi_dst.egg-info/top_level.txt +0 -0
{sibi_dst-2025.9.13 → sibi_dst-2025.9.15}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sibi-dst
-Version: 2025.9.13
+Version: 2025.9.15
 Summary: A data science toolkit for scalable data processing and analysis.
 Requires-Python: >=3.11
 Description-Content-Type: text/markdown
@@ -9,6 +9,7 @@ Requires-Dist: clickhouse-driver>=0.2.9
 Requires-Dist: dask>=2025.9.1
 Requires-Dist: distributed>=2025.9.1
 Requires-Dist: fastapi>=0.118.0
+Requires-Dist: filelock>=3.20.0
 Requires-Dist: folium>=0.20.0
 Requires-Dist: mysqlclient>=2.2.7
 Requires-Dist: opentelemetry-api>=1.37.0
{sibi_dst-2025.9.13 → sibi_dst-2025.9.15}/pyproject.toml

@@ -1,6 +1,6 @@
 [project]
 name = "sibi-dst"
-version = "2025.9.13"
+version = "2025.9.15"
 description = "A data science toolkit for scalable data processing and analysis."
 readme = "README.md"
 requires-python = ">=3.11"
@@ -10,6 +10,7 @@ dependencies = [
     "dask>=2025.9.1",
     "distributed>=2025.9.1",
     "fastapi>=0.118.0",
+    "filelock>=3.20.0",
     "folium>=0.20.0",
     "mysqlclient>=2.2.7",
     "opentelemetry-api>=1.37.0",
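The new `filelock` requirement backs the cross-process cluster registry added in `sibi_dst/utils/dask_utils.py` below. A minimal sketch of the lock-plus-atomic-rename pattern that module uses (the path and payload here are illustrative, not the module's actual values):

```python
import json
import os
import tempfile

from filelock import FileLock

# Illustrative registry location; the real module uses shared_dask_cluster.json.
path = os.path.join(tempfile.gettempdir(), "example_registry.json")
lock = FileLock(path + ".lock")

with lock:  # serialise access to the registry across processes
    tmp = path + ".tmp"
    with open(tmp, "w") as f:
        json.dump({"address": "tcp://127.0.0.1:8786", "refcount": 1}, f)
    os.replace(tmp, path)  # atomic swap so readers never see a partial file
```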
{sibi_dst-2025.9.13 → sibi_dst-2025.9.15}/sibi_dst/df_helper/_df_helper.py

@@ -137,7 +137,7 @@ class DfHelper(ManagedResource):
     def __init__(self, backend="sqlalchemy", **kwargs):
         self.default_config = self.default_config or {}
         kwargs = {**self.default_config.copy(), **kwargs}
-        kwargs.setdefault("auto_sse",
+        kwargs.setdefault("auto_sse", False)
         super().__init__(**kwargs)
         self.backend = backend

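This leaves the built-in SSE sink off unless a caller asks for it. A minimal opt-in sketch (assuming `DfHelper` is re-exported from `sibi_dst.df_helper`; real use needs backend-specific configuration as well):

```python
from sibi_dst.df_helper import DfHelper

# Opt back into the eager queue-backed SSE sink now that the default is False.
helper = DfHelper(backend="sqlalchemy", auto_sse=True)
```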
{sibi_dst-2025.9.13 → sibi_dst-2025.9.15}/sibi_dst/utils/base.py

@@ -14,7 +14,18 @@ from sibi_dst.utils import Logger

 # --------- Minimal built-in SSE sink (used when auto_sse=True) ----------
 class _QueueSSE:
-    """
+    """
+    Handles asynchronous streaming of events with structured data.
+
+    This class provides the ability to manage an asynchronous queue for handling
+    streamed Server-Sent Events (SSE). It supports operations like sending events
+    with associated data, manually enqueuing items, and iterating over items in an
+    asynchronous loop. The class also includes mechanisms for clean closure of the
+    stream.
+
+    :ivar q: An asynchronous queue used to store events and data.
+    :type q: asyncio.Queue
+    """
     __slots__ = ("q", "_closed")

     def __init__(self) -> None:
@@ -46,11 +57,31 @@ class _QueueSSE:
 # ------------------------------ Base class ------------------------------
 class ManagedResource(abc.ABC):
     """
-
-
-
-
-
+    Management of shared resources with configurable verbosity, logging,
+    and support for external file systems and server-sent events (SSE).
+
+    This class is designed to assist in managing resources such as logging,
+    file systems, and SSE within an asynchronous or synchronous environment.
+    It provides facilities for handling resource lifecycle, introspection,
+    and cleanup while ensuring resources are appropriately managed. The class
+    also supports lazy initialization of external dependencies via factories.
+
+    :ivar verbose: Controls verbosity of logging or operations. If set to True,
+        more detailed logging/output will be generated.
+    :type verbose: bool
+    :ivar debug: Enables debug-level logging and internal diagnostics when True.
+        Typically used for troubleshooting purposes.
+    :type debug: bool
+    :ivar logger: The logger instance used for this resource. If left unset,
+        a default logger will be created.
+    :type logger: Optional[Logger]
+    :ivar fs: The file system interface being used. Typically an instance of
+        `fsspec.AbstractFileSystem`. If not provided, it may be created lazily
+        using a supplied factory function.
+    :type fs: Optional[fsspec.AbstractFileSystem]
+    :ivar emitter: A callable, potentially asynchronous, function for emitting
+        events. Events are sent as a combination of event names and payload data.
+    :type emitter: Optional[Callable[[str, Dict[str, Any]], Awaitable[None]]]
     """

     __slots__ = (
@@ -74,28 +105,23 @@ class ManagedResource(abc.ABC):
         debug: bool = False,
         log_cleanup_errors: bool = True,
         logger: Optional[Logger] = None,
-        # filesystem
         fs: Optional[fsspec.AbstractFileSystem] = None,
         fs_factory: Optional[Callable[[], fsspec.AbstractFileSystem]] = None,
-        # SSE
         emitter: Optional[Callable[[str, Dict[str, Any]], Awaitable[None]]] = None,
         emitter_factory: Optional[Callable[[], Callable[[str, Dict[str, Any]], Awaitable[None]]]] = None,
         sse: Optional[object] = None,
         sse_factory: Optional[Callable[[], object]] = None,
-        auto_sse: bool = False,
+        auto_sse: bool = False,
         **_: object,
     ) -> None:
-        # flags
         self.verbose = verbose
         self.debug = debug
         self._log_cleanup_errors = log_cleanup_errors

-        # lifecycle
         self._is_closed = False
         self._closing = False
         self._close_lock = threading.RLock()

-        # logger
         if logger is None:
             self.logger = Logger.default_logger(logger_name=self.__class__.__name__)
             self._owns_logger = True
@@ -105,7 +131,6 @@ class ManagedResource(abc.ABC):
             self.logger = logger
             self._owns_logger = False

-        # fs
         self.fs: Optional[fsspec.AbstractFileSystem] = None
         self._fs_factory = None
         self._owns_fs = False
@@ -119,7 +144,6 @@ class ManagedResource(abc.ABC):
             self._fs_factory = fs_factory
             self._owns_fs = True

-        # sse / emitter
         self._sse: Optional[object] = None
         self._sse_factory: Optional[Callable[[], object]] = None
         self._owns_sse = False
@@ -140,16 +164,15 @@ class ManagedResource(abc.ABC):
             self._sse_factory = sse_factory
             self._owns_sse = True

-        # EAGER auto-SSE: create sink+emitter now if none supplied
         if self._auto_sse and self._sse is None and self._emitter is None and self._sse_factory is None:
             self._create_auto_sse()

-        #
+        # Garbage Collector finaliser
         self._finalizer = weakref.finalize(self, self._finalize_static, weakref.ref(self))

         if self.debug:
             with contextlib.suppress(Exception):
-                self.logger.debug("
+                self.logger.debug("Initialised %s %s", self.__class__.__name__, repr(self))

     # ---------- Introspection ----------
     @property
@@ -222,7 +245,6 @@ class ManagedResource(abc.ABC):

     # ---------- SSE ----------
     def _create_auto_sse(self) -> None:
-        # internal helper: create queue sink + emitter, mark as owned
         sink = _QueueSSE()
         self._sse = sink
         self._owns_sse = True
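The `_QueueSSE` docstring above describes a queue-backed sink supporting event sends, manual enqueueing, async iteration, and clean closure. An illustrative stand-in with that shape, built only from the docstring (the method names here are hypothetical; the class's real API is not shown in this diff):

```python
import asyncio
from typing import Any, Dict


class QueueSSESketch:
    """Queue-backed SSE sink mirroring the behaviour _QueueSSE's docstring describes."""

    def __init__(self) -> None:
        self.q: asyncio.Queue = asyncio.Queue()
        self._closed = False

    async def send(self, event: str, data: Dict[str, Any]) -> None:
        # Enqueue a named event together with its payload.
        await self.q.put({"event": event, "data": data})

    async def close(self) -> None:
        # A sentinel unblocks any consumer waiting in __anext__.
        self._closed = True
        await self.q.put(None)

    def __aiter__(self) -> "QueueSSESketch":
        return self

    async def __anext__(self) -> Dict[str, Any]:
        item = await self.q.get()
        if item is None:
            raise StopAsyncIteration
        return item
```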
sibi_dst-2025.9.15/sibi_dst/utils/dask_utils.py

@@ -0,0 +1,436 @@
+from __future__ import annotations
+
+import asyncio
+import logging
+from typing import List, Any, Dict
+
+import dask
+import dask.dataframe as dd
+
+def _to_int_safe(x) -> int:
+    """
+    Convert scalar-like to int safely.
+    Handles numpy scalars, pandas Series/DataFrame outputs.
+    """
+    if hasattr(x, "item"):  # numpy scalar, pandas scalar
+        return int(x.item())
+    if hasattr(x, "iloc"):  # Series-like
+        return int(x.iloc[0])
+    return int(x)
+
+def dask_is_probably_empty(ddf: dd.DataFrame) -> bool:
+    return getattr(ddf, "npartitions", 0) == 0 or len(ddf._meta.columns) == 0
+
+
+def dask_is_empty_truthful(ddf: dd.DataFrame) -> bool:
+    n = ddf.map_partitions(len).sum().compute()
+    return int(n) == 0
+
+
+def dask_is_empty(ddf: dd.DataFrame, *, sample: int = 4) -> bool:
+    if dask_is_probably_empty(ddf):
+        return True
+
+    k = min(max(sample, 1), ddf.npartitions)
+    probes = dask.compute(*[
+        ddf.get_partition(i).map_partitions(len) for i in range(k)
+    ], scheduler="threads")
+
+    if any(_to_int_safe(n) > 0 for n in probes):
+        return False
+    if k == ddf.npartitions and all(_to_int_safe(n) == 0 for n in probes):
+        return True
+
+    return dask_is_empty_truthful(ddf)
+
+class UniqueValuesExtractor:
+    @staticmethod
+    def _compute_to_list_sync(series) -> List[Any]:
+        """Run in a worker thread when Dask-backed."""
+        if hasattr(series, "compute"):
+            return series.compute().tolist()
+        return series.tolist()
+
+    async def compute_to_list(self, series) -> List[Any]:
+        # Offload potential Dask .compute() to a thread to avoid blocking the event loop
+        return await asyncio.to_thread(self._compute_to_list_sync, series)
+
+    async def extract_unique_values(self, df, *columns: str) -> Dict[str, List[Any]]:
+        async def one(col: str):
+            ser = df[col].dropna().unique()
+            return col, await self.compute_to_list(ser)
+
+        pairs = await asyncio.gather(*(one(c) for c in columns))
+        return dict(pairs)
+
+import asyncio
+import json
+import logging
+import os
+import tempfile
+from contextlib import suppress, asynccontextmanager, contextmanager
+from typing import Optional
+from dask.distributed import Client, LocalCluster, get_client
+from filelock import FileLock
+
+
+class DaskClientMixin:
+    """
+    Provides shared Dask client lifecycle management with:
+    - Shared registry (JSON + file lock)
+    - Automatic refcounting across processes
+    - Auto-cleanup of stale clusters
+    - Optional background watchdog to monitor cluster health
+    """
+
+    REGISTRY_PATH = os.path.join(tempfile.gettempdir(), "shared_dask_cluster.json")
+    REGISTRY_LOCK = FileLock(REGISTRY_PATH + ".lock")
+    WATCHDOG_INTERVAL = 60  # seconds between health checks
+
+    def __init__(self, **kwargs):
+        self.dask_client: Optional[Client] = None
+        self.own_dask_client: bool = False
+        self.logger = kwargs.get("logger") or logging.getLogger(__name__)
+        self._watchdog_task: Optional[asyncio.Task] = None
+        self._watchdog_stop = asyncio.Event()
+
+    # ----------------------------------------------------------------------
+    # Registry management
+    # ----------------------------------------------------------------------
+    @classmethod
+    def _read_registry(cls) -> Optional[dict]:
+        """Read registry JSON if it exists and is valid."""
+        if not os.path.exists(cls.REGISTRY_PATH):
+            return None
+        try:
+            with open(cls.REGISTRY_PATH, "r") as f:
+                data = json.load(f)
+            if "address" not in data or not isinstance(data["address"], str):
+                return None
+            return data
+        except (json.JSONDecodeError, OSError):
+            return None
+
+    @classmethod
+    def _write_registry(cls, data: dict) -> None:
+        """Write updated registry JSON atomically."""
+        tmp_path = cls.REGISTRY_PATH + ".tmp"
+        with open(tmp_path, "w") as f:
+            json.dump(data, f)
+        os.replace(tmp_path, cls.REGISTRY_PATH)
+
+    @classmethod
+    def _remove_registry(cls) -> None:
+        """Delete the registry file if present."""
+        with suppress(FileNotFoundError):
+            os.remove(cls.REGISTRY_PATH)
+
+    @classmethod
+    def _cleanup_stale_registry(cls, logger=None):
+        """Detect and remove stale registry entries if cluster is unreachable."""
+        registry = cls._read_registry()
+        if not registry:
+            return
+        try:
+            client = Client(address=registry["address"], timeout=5)
+            client.close()
+        except Exception:
+            if logger:
+                logger.warning(
+                    f"Detected stale Dask cluster registry at {registry.get('address')}. Cleaning up."
+                )
+            cls._remove_registry()
+
+    # ----------------------------------------------------------------------
+    # Dask client initialization
+    # ----------------------------------------------------------------------
+    def _init_dask_client(
+        self,
+        dask_client: Optional[Client] = None,
+        *,
+        logger=None,
+        scheduler_address: Optional[str] = None,
+        use_remote_cluster: bool = False,
+        n_workers: int = 2,
+        threads_per_worker: int = 1,
+        processes: bool = False,
+        asynchronous: bool = False,
+        memory_limit: str = "auto",
+        local_directory: Optional[str] = None,
+        silence_logs: str = "info",
+        resources: Optional[dict] = None,
+        timeout: int = 30,
+        watchdog: bool = True,
+    ):
+        """Initialize or attach to a shared Dask client."""
+        self.logger = logger or self.logger
+        self.dask_client = dask_client
+        self.own_dask_client = False
+
+        # Silence excessive logging
+        logging.getLogger("distributed.scheduler").setLevel(logging.WARNING)
+        logging.getLogger("distributed.worker").setLevel(logging.WARNING)
+        logging.getLogger("distributed.shuffle._scheduler_plugin").setLevel(logging.ERROR)
+
+        # 1️⃣ Try reusing existing client
+        if self.dask_client is None:
+            with suppress(ValueError, RuntimeError):
+                self.dask_client = get_client()
+
+        # 2️⃣ Try remote cluster connection
+        if self.dask_client is None and use_remote_cluster and scheduler_address:
+            try:
+                self.dask_client = Client(address=scheduler_address, timeout=timeout)
+                self.own_dask_client = True
+                self.logger.info(
+                    f"Connected to external Dask scheduler at {scheduler_address}. "
+                    f"Dashboard: {self.dask_client.dashboard_link}"
+                )
+                if watchdog:
+                    self._start_watchdog()
+                return
+            except Exception as e:
+                self.logger.warning(
+                    f"Failed to connect to remote Dask scheduler: {e}. Falling back to local cluster."
+                )
+
+        # 3️⃣ Shared local cluster via registry
+        with self.REGISTRY_LOCK:
+            self._cleanup_stale_registry(self.logger)
+            registry = self._read_registry()
+
+            if registry:
+                try:
+                    self.dask_client = Client(address=registry["address"], timeout=timeout)
+                    registry["refcount"] = registry.get("refcount", 0) + 1
+                    self._write_registry(registry)
+                    self.logger.info(
+                        f"Reusing existing LocalCluster at {registry['address']} (refcount={registry['refcount']})."
+                    )
+                    if watchdog:
+                        self._start_watchdog()
+                    return
+                except Exception:
+                    self.logger.warning("Existing cluster unreachable. Recreating.")
+                    self._remove_registry()
+
+            # Create a new local cluster
+            cluster = LocalCluster(
+                n_workers=n_workers,
+                threads_per_worker=threads_per_worker,
+                processes=processes,
+                asynchronous=asynchronous,
+                memory_limit=memory_limit,
+                local_directory=local_directory,
+                silence_logs=silence_logs,
+                resources=resources,
+                timeout=timeout,
+            )
+
+            self.dask_client = Client(cluster)
+            self.own_dask_client = True
+            registry = {"address": cluster.scheduler_address, "refcount": 1}
+            self._write_registry(registry)
+            self.logger.info(
+                f"Started new LocalCluster ({n_workers} workers × {threads_per_worker} threads). "
+                f"Dashboard: {self.dask_client.dashboard_link}"
+            )
+
+            if watchdog:
+                self._start_watchdog()
+
+    # ----------------------------------------------------------------------
+    # Watchdog logic
+    # ----------------------------------------------------------------------
+    def _start_watchdog(self):
+        """Spawn a background watchdog that monitors registry health."""
+        async def watchdog_loop():
+            while not self._watchdog_stop.is_set():
+                await asyncio.sleep(self.WATCHDOG_INTERVAL)
+                try:
+                    self._cleanup_stale_registry(self.logger)
+                except Exception as e:
+                    self.logger.warning(f"Dask watchdog encountered an error: {e}")
+
+        try:
+            loop = asyncio.get_event_loop()
+            if loop.is_running():
+                self._watchdog_task = loop.create_task(watchdog_loop())
+                self.logger.debug("Started Dask registry watchdog (async).")
+        except RuntimeError:
+            # Fallback for synchronous usage
+            self.logger.debug("Watchdog skipped (no active event loop).")
+
+    async def _stop_watchdog(self):
+        """Stop the watchdog loop gracefully."""
+        self._watchdog_stop.set()
+        if self._watchdog_task:
+            await asyncio.wait([self._watchdog_task], timeout=5)
+            self._watchdog_task = None
+
+    # ----------------------------------------------------------------------
+    # Client cleanup
+    # ----------------------------------------------------------------------
+    def _close_dask_client(self):
+        """Safely close client and update registry reference count."""
+        if not self.dask_client:
+            return
+
+        with self.REGISTRY_LOCK:
+            registry = self._read_registry()
+
+            if registry and "refcount" in registry:
+                registry["refcount"] = max(0, registry["refcount"] - 1)
+                if registry["refcount"] == 0:
+                    self.logger.info("Reference count 0 — closing LocalCluster.")
+                    try:
+                        cluster = getattr(self.dask_client, "cluster", None)
+                        self.dask_client.close()
+                        if cluster:
+                            cluster.close()
+                    except Exception as e:
+                        self.logger.warning(f"Error closing Dask cluster: {e}")
+                    self._remove_registry()
+                else:
+                    self._write_registry(registry)
+                    self.logger.debug(
+                        f"Decremented LocalCluster refcount to {registry['refcount']}."
+                    )
+            else:
+                with suppress(Exception):
+                    self.dask_client.close()
+                self.logger.debug("Closed Dask client without registry tracking.")
+
+        # Stop watchdog if active
+        if self._watchdog_task:
+            asyncio.create_task(self._stop_watchdog())
+
+
+# ----------------------------------------------------------------------
+# Shared Dask session (sync + async)
+# ----------------------------------------------------------------------
+def shared_dask_session(*, async_mode: bool = True, **kwargs):
+    """
+    Context manager for a shared Dask session (supports async + sync).
+
+    Example:
+        async with shared_dask_session(logger=logger) as client:
+            ...
+
+        with shared_dask_session(async_mode=False) as client:
+            ...
+    """
+    mixin = DaskClientMixin()
+    mixin._init_dask_client(**kwargs)
+
+    if async_mode:
+        @asynccontextmanager
+        async def _async_manager():
+            try:
+                yield mixin.dask_client
+            finally:
+                mixin._close_dask_client()
+        return _async_manager()
+    else:
+        @contextmanager
+        def _sync_manager():
+            try:
+                yield mixin.dask_client
+            finally:
+                mixin._close_dask_client()
+        return _sync_manager()
+
+# from contextlib import suppress, asynccontextmanager
+# from dask.distributed import Client, LocalCluster, get_client
+# import os
+#
+# class DaskClientMixin:
+#     """
+#     Provides shared Dask client lifecycle management.
+#     Ensures reuse of an existing client if available,
+#     or creates a local in-process Dask cluster for fallback.
+#     """
+#
+#     def _init_dask_client(
+#         self,
+#         dask_client=None,
+#         logger=None,
+#         *,
+#         n_workers: int = 1,
+#         threads_per_worker: int = 1,
+#         processes: bool = False,
+#         asynchronous: bool = False,
+#         memory_limit: str = "auto",
+#         #dashboard_address: str | None = None,
+#         local_directory: str | None = None,
+#         silence_logs: str = "info",
+#         resources: dict | None = None,
+#         timeout: int = 30,
+#     ):
+#         self.dask_client = dask_client
+#         self.own_dask_client = False
+#         self.logger = logger
+#         # Apply log filters globally
+#         logging.getLogger("distributed.shuffle._scheduler_plugin").setLevel(
+#             logging.ERROR
+#         )
+#         logging.getLogger("distributed.scheduler").setLevel(logging.WARNING)
+#         logging.getLogger("distributed.worker").setLevel(logging.WARNING)
+#
+#         if self.dask_client is None:
+#             with suppress(ValueError, RuntimeError):
+#                 # Try to attach to an existing client (common in shared Dask setups)
+#                 self.dask_client = get_client()
+#
+#         if self.dask_client is None:
+#             # Default to half of logical cores if not specified
+#             n_workers = n_workers or max(2, os.cpu_count() // 2)
+#
+#             cluster = LocalCluster(
+#                 n_workers=n_workers,
+#                 threads_per_worker=threads_per_worker,
+#                 processes=processes,
+#                 asynchronous=asynchronous,
+#                 memory_limit=memory_limit,
+#                 local_directory=local_directory,
+#                 silence_logs=silence_logs,
+#                 resources=resources,
+#                 timeout=timeout,
+#             )
+#
+#             self.dask_client = Client(cluster)
+#             self.own_dask_client = True
+#
+#             if self.logger:
+#                 self.logger.info(
+#                     f"Started local Dask cluster with {n_workers} workers × {threads_per_worker} threads "
+#                     f"({memory_limit} memory per worker). Dashboard: {self.dask_client.dashboard_link}"
+#                 )
+#         else:
+#             if self.logger:
+#                 self.logger.debug(
+#                     f"Using existing Dask client: {self.dask_client.dashboard_link}"
+#                 )
+#
+#     def _close_dask_client(self):
+#         """Close the Dask client if this instance created it."""
+#         if getattr(self, "own_dask_client", False) and self.dask_client is not None:
+#             try:
+#                 cluster = getattr(self.dask_client, "cluster", None)
+#                 self.dask_client.close()
+#                 if cluster is not None:
+#                     cluster.close()
+#                 if self.logger:
+#                     self.logger.info("Closed local Dask client and cluster.")
+#             except Exception as e:
+#                 if self.logger:
+#                     self.logger.warning(f"Error while closing Dask client: {e}")
+#
+# @asynccontextmanager
+# async def shared_dask_session(**kwargs):
+#     mixin = DaskClientMixin()
+#     mixin._init_dask_client(**kwargs)
+#     try:
+#         yield mixin.dask_client
+#     finally:
+#         mixin._close_dask_client()
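A hedged usage sketch for the helpers added above (the module path is taken from this diff; whether these names are also re-exported from `sibi_dst.utils` is not shown here):

```python
import dask.dataframe as dd
import pandas as pd

from sibi_dst.utils.dask_utils import dask_is_empty, shared_dask_session

ddf = dd.from_pandas(pd.DataFrame({"a": [1, 2, 3]}), npartitions=2)

# Synchronous form: joins the registered LocalCluster (or starts one and
# records it in the registry) and decrements the refcount on exit.
with shared_dask_session(async_mode=False) as client:
    print(client.dashboard_link)
    print(dask_is_empty(ddf))  # probes a few partitions before falling back to a full count
```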
{sibi_dst-2025.9.13 → sibi_dst-2025.9.15}/sibi_dst.egg-info/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sibi-dst
-Version: 2025.9.13
+Version: 2025.9.15
 Summary: A data science toolkit for scalable data processing and analysis.
 Requires-Python: >=3.11
 Description-Content-Type: text/markdown
@@ -9,6 +9,7 @@ Requires-Dist: clickhouse-driver>=0.2.9
 Requires-Dist: dask>=2025.9.1
 Requires-Dist: distributed>=2025.9.1
 Requires-Dist: fastapi>=0.118.0
+Requires-Dist: filelock>=3.20.0
 Requires-Dist: folium>=0.20.0
 Requires-Dist: mysqlclient>=2.2.7
 Requires-Dist: opentelemetry-api>=1.37.0
sibi_dst-2025.9.13/sibi_dst/utils/dask_utils.py

@@ -1,200 +0,0 @@
-from __future__ import annotations
-
-import asyncio
-import logging
-from typing import List, Any, Dict
-
-import dask
-# dask.config.set({"distributed.worker.daemon": False})
-import dask.dataframe as dd
-
-def _to_int_safe(x) -> int:
-    """
-    Convert scalar-like to int safely.
-    Handles numpy scalars, pandas Series/DataFrame outputs.
-    """
-    if hasattr(x, "item"):  # numpy scalar, pandas scalar
-        return int(x.item())
-    if hasattr(x, "iloc"):  # Series-like
-        return int(x.iloc[0])
-    return int(x)
-
-def dask_is_probably_empty(ddf: dd.DataFrame) -> bool:
-    return getattr(ddf, "npartitions", 0) == 0 or len(ddf._meta.columns) == 0
-
-
-def dask_is_empty_truthful(ddf: dd.DataFrame) -> bool:
-    n = ddf.map_partitions(len).sum().compute()
-    return int(n) == 0
-
-
-def dask_is_empty(ddf: dd.DataFrame, *, sample: int = 4) -> bool:
-    if dask_is_probably_empty(ddf):
-        return True
-
-    k = min(max(sample, 1), ddf.npartitions)
-    probes = dask.compute(*[
-        ddf.get_partition(i).map_partitions(len) for i in range(k)
-    ], scheduler="threads")
-
-    if any(_to_int_safe(n) > 0 for n in probes):
-        return False
-    if k == ddf.npartitions and all(_to_int_safe(n) == 0 for n in probes):
-        return True
-
-    return dask_is_empty_truthful(ddf)
-
-class UniqueValuesExtractor:
-    @staticmethod
-    def _compute_to_list_sync(series) -> List[Any]:
-        """Run in a worker thread when Dask-backed."""
-        if hasattr(series, "compute"):
-            return series.compute().tolist()
-        return series.tolist()
-
-    async def compute_to_list(self, series) -> List[Any]:
-        # Offload potential Dask .compute() to a thread to avoid blocking the event loop
-        return await asyncio.to_thread(self._compute_to_list_sync, series)
-
-    async def extract_unique_values(self, df, *columns: str) -> Dict[str, List[Any]]:
-        async def one(col: str):
-            ser = df[col].dropna().unique()
-            return col, await self.compute_to_list(ser)
-
-        pairs = await asyncio.gather(*(one(c) for c in columns))
-        return dict(pairs)
-
-from contextlib import suppress, asynccontextmanager
-from dask.distributed import Client, LocalCluster, get_client
-import os
-
-class DaskClientMixin:
-    """
-    Provides shared Dask client lifecycle management.
-    Ensures reuse of an existing client if available,
-    or creates a local in-process Dask cluster for fallback.
-    """
-
-    def _init_dask_client(
-        self,
-        dask_client=None,
-        logger=None,
-        *,
-        n_workers: int = 1,
-        threads_per_worker: int = 1,
-        processes: bool = False,
-        asynchronous: bool = False,
-        memory_limit: str = "auto",
-        #dashboard_address: str | None = None,
-        local_directory: str | None = None,
-        silence_logs: str = "info",
-        resources: dict | None = None,
-        timeout: int = 30,
-    ):
-        self.dask_client = dask_client
-        self.own_dask_client = False
-        self.logger = logger
-        # Apply log filters globally
-        logging.getLogger("distributed.shuffle._scheduler_plugin").setLevel(
-            logging.ERROR
-        )
-        logging.getLogger("distributed.scheduler").setLevel(logging.WARNING)
-        logging.getLogger("distributed.worker").setLevel(logging.WARNING)
-
-        if self.dask_client is None:
-            with suppress(ValueError, RuntimeError):
-                # Try to attach to an existing client (common in shared Dask setups)
-                self.dask_client = get_client()
-
-        if self.dask_client is None:
-            # Default to half of logical cores if not specified
-            n_workers = n_workers or max(2, os.cpu_count() // 2)
-
-            cluster = LocalCluster(
-                n_workers=n_workers,
-                threads_per_worker=threads_per_worker,
-                processes=processes,
-                asynchronous=asynchronous,
-                memory_limit=memory_limit,
-                local_directory=local_directory,
-                silence_logs=silence_logs,
-                resources=resources,
-                timeout=timeout,
-            )
-
-            self.dask_client = Client(cluster)
-            self.own_dask_client = True
-
-            if self.logger:
-                self.logger.info(
-                    f"Started local Dask cluster with {n_workers} workers × {threads_per_worker} threads "
-                    f"({memory_limit} memory per worker). Dashboard: {self.dask_client.dashboard_link}"
-                )
-        else:
-            if self.logger:
-                self.logger.debug(
-                    f"Using existing Dask client: {self.dask_client.dashboard_link}"
-                )
-
-    def _close_dask_client(self):
-        """Close the Dask client if this instance created it."""
-        if getattr(self, "own_dask_client", False) and self.dask_client is not None:
-            try:
-                cluster = getattr(self.dask_client, "cluster", None)
-                self.dask_client.close()
-                if cluster is not None:
-                    cluster.close()
-                if self.logger:
-                    self.logger.info("Closed local Dask client and cluster.")
-            except Exception as e:
-                if self.logger:
-                    self.logger.warning(f"Error while closing Dask client: {e}")
-
-@asynccontextmanager
-async def shared_dask_session(**kwargs):
-    mixin = DaskClientMixin()
-    mixin._init_dask_client(**kwargs)
-    try:
-        yield mixin.dask_client
-    finally:
-        mixin._close_dask_client()
-
-# from contextlib import suppress
-# from dask.distributed import Client, get_client
-#
-# class DaskClientMixin:
-#     """
-#     Provides shared Dask client lifecycle management.
-#     Ensures reuse of existing client when available, otherwise creates a lightweight local one.
-#     """
-#
-#     def _init_dask_client(self, dask_client=None, logger=None):
-#         self.dask_client = dask_client
-#         self.own_dask_client = False
-#         self.logger = logger
-#
-#         if self.dask_client is None:
-#             with suppress(ValueError, RuntimeError):
-#                 # Try to attach to an existing active client if running inside a Dask context
-#                 self.dask_client = get_client()
-#
-#         if self.dask_client is None:
-#             # Start a local in-process scheduler for fallback
-#             self.dask_client = Client(processes=False)
-#             self.own_dask_client = True
-#             if self.logger:
-#                 self.logger.info(f"Started local Dask client: {self.dask_client.dashboard_link}")
-#         else:
-#             if self.logger:
-#                 self.logger.debug(f"Using existing Dask client: {self.dask_client.dashboard_link}")
-#
-#     def _close_dask_client(self):
-#         """Close client only if this instance created it."""
-#         if getattr(self, "own_dask_client", False) and self.dask_client is not None:
-#             try:
-#                 self.dask_client.close()
-#                 if self.logger:
-#                     self.logger.info("Closed local Dask client.")
-#             except Exception as e:
-#                 if self.logger:
-#                     self.logger.warning(f"Error while closing Dask client: {e}")