sibi-dst 2025.9.11__py3-none-any.whl → 2025.9.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sibi_dst/__init__.py +11 -6
- sibi_dst/df_helper/__init__.py +0 -1
- sibi_dst/df_helper/_artifact_updater_async.py +199 -175
- sibi_dst/osmnx_helper/__init__.py +3 -1
- sibi_dst/utils/__init__.py +2 -1
- sibi_dst/utils/boilerplate/base_pipeline.py +1 -2
- sibi_dst/utils/dask_utils.py +140 -1
- sibi_dst/utils/data_wrapper.py +0 -11
- {sibi_dst-2025.9.11.dist-info → sibi_dst-2025.9.13.dist-info}/METADATA +26 -30
- {sibi_dst-2025.9.11.dist-info → sibi_dst-2025.9.13.dist-info}/RECORD +24 -24
- {sibi_dst-2025.9.11.dist-info → sibi_dst-2025.9.13.dist-info}/WHEEL +2 -1
- sibi_dst-2025.9.13.dist-info/top_level.txt +1 -0
- sibi_dst/df_helper/data_cleaner.py +0 -132
sibi_dst/__init__.py
CHANGED
@@ -10,12 +10,17 @@ try:
 except version_reader.PackageNotFoundError:
     __version__ = "unknown"
 
-
-    "__version__",
-]
-
-import sibi_dst.df_helper as df_helper
+from sibi_dst.df_helper import *
 from sibi_dst.osmnx_helper import *
 from sibi_dst.geopy_helper import *
-from sibi_dst
+from sibi_dst import utils as sibiutils
 
+
+__all__ = [
+    "__version__",
+    "DfHelper",
+    "ParquetArtifact",
+    "ParquetReader",
+    "ArtifactUpdaterMultiWrapperAsync",
+    "sibiutils"
+]
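The net effect is that the package's public surface is now declared explicitly. A minimal usage sketch (names as listed in the new __all__; note that sibiutils is an alias for the utils module, not a class):

    import sibi_dst

    print(sibi_dst.__version__)           # resolved via importlib metadata above
    helper_cls = sibi_dst.DfHelper        # re-exported from sibi_dst.df_helper
    utils_mod = sibi_dst.sibiutils        # alias for the sibi_dst.utils module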
sibi_dst/df_helper/__init__.py
CHANGED
@@ -3,7 +3,6 @@ from __future__ import annotations
 from ._df_helper import DfHelper
 from ._parquet_artifact import ParquetArtifact
 from ._parquet_reader import ParquetReader
-#from ._artifact_updater_multi_wrapper import ArtifactUpdaterMultiWrapperThreaded, ArtifactUpdaterMultiWrapperAsync
 from ._artifact_updater_async import ArtifactUpdaterMultiWrapperAsync
 from ._artifact_updater_threaded import ArtifactUpdaterMultiWrapperThreaded
 
sibi_dst/df_helper/_artifact_updater_async.py
CHANGED
@@ -4,28 +4,24 @@ import asyncio
 import datetime
 import random
 import time
-
+import pickle
+from contextlib import ExitStack, suppress
 from dataclasses import dataclass
 from typing import Any, Callable, Dict, List, Optional, Sequence, Type
 
-from sibi_dst.utils import ManagedResource
-
-try:
-    from dask.distributed import Client, LocalCluster
-except ImportError:
-    Client = None
-    LocalCluster = None
+from sibi_dst.utils import ManagedResource, Logger
+from sibi_dst.utils.dask_utils import DaskClientMixin
 
 
 @dataclass(slots=True)
 class _RetryCfg:
+    """Retry and backoff configuration."""
     attempts: int = 3
     backoff_base: float = 2.0
    backoff_max: float = 60.0
     jitter: float = 0.15
 
 
-# ---------------- Worker (safe for Dask pickling) ----------------
 def run_artifact_update(
     cls: Type,
     artifact_class_kwargs: Dict[str, Any],
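For reference, the retry schedule encoded by _RetryCfg and used later in this file (in run_artifact_update and again in _run_one_async) is exponential backoff with a cap and multiplicative jitter. A standalone sketch with the default values from the dataclass above:

    import random

    def backoff_delay(attempt: int, base: float = 2.0, cap: float = 60.0, jitter: float = 0.15) -> float:
        """Delay before the next retry: base**attempt, capped, inflated by up to `jitter`."""
        delay = min(base ** attempt, cap)
        return delay * (1 + random.uniform(0, jitter))

    # attempt 1 -> ~2s, attempt 2 -> ~4s, attempt 3 -> ~8s, each up to 15% longer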
@@ -33,21 +29,43 @@ def run_artifact_update(
     period: str,
     artifact_kwargs: Dict[str, Any],
 ) -> Dict[str, Any]:
-    """
+    """
+    Executed inside Dask worker.
+    Instantiates artifact and runs update_parquet() with retry logic.
+    Reconstructs logger and filesystem if not provided (worker isolation safe).
+    """
     import logging
+    import fsspec
+    from sibi_dst.utils import Logger
 
-    logger
+    # ---- Reinitialize a lightweight logger for the worker
+    worker_logger = Logger.default_logger(logger_name=cls.__name__) if hasattr(Logger, "default_logger") else logging.getLogger(cls.__name__)
+    worker_logger.set_level(logging.INFO)
+
+    # ---- Ensure fs is recreated if missing
+    fs = artifact_class_kwargs.get("fs")
+    if fs is None or isinstance(fs, str):
+        try:
+            fs_protocol = fs if isinstance(fs, str) else "file"
+            fs = fsspec.filesystem(fs_protocol)
+        except Exception:
+            fs = fsspec.filesystem("file")
+
+    # ---- Merge reconstructed environment into kwargs
+    artifact_kwargs_final = {
+        **artifact_class_kwargs,
+        "logger": worker_logger,
+        "fs": fs,
+    }
 
-
-
-    success = False
-    error_msg = None
+    start_time = datetime.datetime.now()
+    success, error_msg, attempts = False, None, 0
 
     for attempt in range(1, retry.attempts + 1):
-
+        attempts = attempt
         try:
             with ExitStack() as stack:
-                inst = cls(**
+                inst = cls(**artifact_kwargs_final)
                 inst = stack.enter_context(inst)
                 inst.update_parquet(period=period, **artifact_kwargs)
                 success = True
@@ -59,31 +77,40 @@ def run_artifact_update(
                 delay *= 1 + random.uniform(0, retry.jitter)
                 time.sleep(delay)
 
-
-
+    duration = (datetime.datetime.now() - start_time).total_seconds()
+    status = "😀" if success else "😩"
+    worker_logger.info(
+        f"{status} {cls.__name__} [{period}] finished in {duration:.2f}s ({attempts} attempt(s))"
+    )
 
     return {
         "artifact": cls.__name__,
         "period": period,
-        "start": start_wall.isoformat(),
-        "end": end_wall.isoformat(),
-        "processing_time": duration,
-        "retries": attempt_count - 1 if success else attempt_count,
         "success": success,
         "error": error_msg,
+        "attempts": attempts,
+        "duration_seconds": duration,
+        "started_at": start_time.isoformat(),
+        "ended_at": datetime.datetime.now().isoformat(),
     }
 
 
-
+# ---------------- Async Orchestrator ----------------
+class ArtifactUpdaterMultiWrapperAsync(DaskClientMixin, ManagedResource):
     """
-    Async
-
+    Async orchestrator for concurrent artifact updates.
+
+    • Uses Dask client (via DaskClientMixin) or local threads.
+    • Automatically sanitizes non-picklable arguments (e.g., loggers, fs).
+    • Provides structured retries, async orchestration, and safe cleanup.
     """
 
     def __init__(
         self,
         wrapped_classes: Dict[str, Sequence[Type]],
         *,
+        logger: Logger,
+        fs,
         max_workers: int = 3,
         retry_attempts: int = 3,
         update_timeout_seconds: int = 600,
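The worker now returns a richer result record. A sketch of how a caller might consume these dicts (the artifact name, period key, and values are made up; only the keys come from the diff above):

    results = [{
        "artifact": "DemoArtifact",           # hypothetical name
        "period": "itd",                      # hypothetical period key
        "success": True,
        "error": None,
        "attempts": 1,
        "duration_seconds": 12.3,
        "started_at": "2025-09-13T08:00:00",
        "ended_at": "2025-09-13T08:00:12",
    }]
    failed = [r["artifact"] for r in results if not r["success"]]
    total_seconds = sum(r["duration_seconds"] for r in results)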
@@ -92,88 +119,127 @@ class ArtifactUpdaterMultiWrapperAsync(ManagedResource):
         backoff_jitter: float = 0.15,
         priority_fn: Optional[Callable[[Type], int]] = None,
         artifact_class_kwargs: Optional[Dict[str, Any]] = None,
-        dask_client: Optional[Client] = None,
         use_dask: bool = True,
+        dask_client: Optional[Any] = None,
+        debug: bool = False,
+        verbose: bool = False,
         **kwargs: Any,
     ) -> None:
-        super().__init__(
+        super().__init__(logger=logger, fs=fs, debug=debug, verbose=verbose)
+
+        # ---- Client lifecycle management
+
+        self.own_dask_client = dask_client is None
+        self._init_dask_client(dask_client, logger=logger)
+        self.use_dask = use_dask
 
+        # ---- Core configuration
         self.wrapped_classes = wrapped_classes
-        self.max_workers =
-        self.update_timeout_seconds = int(update_timeout_seconds)
+        self.max_workers = max_workers
         self.priority_fn = priority_fn
-        self.
-        self.client: Optional[Client] = dask_client
-        self._owns_client = False
+        self.update_timeout_seconds = update_timeout_seconds
 
+        # ---- Retry configuration
         self._retry = _RetryCfg(
-            attempts=
-            backoff_base=
-            backoff_max=
-            jitter=
+            attempts=retry_attempts,
+            backoff_base=backoff_base,
+            backoff_max=backoff_max,
+            jitter=backoff_jitter,
         )
 
-        #
-
-
-
-
-
-        }
-
-        self.artifact_class_kwargs = {
-            "logger": self.logger,
-            "fs": self.fs,
-            "debug": self.debug,
-            "verbose": self.verbose,
-            **(artifact_class_kwargs or {}),
-        }
+        # ---- Artifact instantiation arguments
+        self.artifact_class_kwargs = {
+            "logger": logger,
+            "fs": fs,
+            "debug": debug,
+            "verbose": verbose,
+            **(artifact_class_kwargs or {}),
+        }
 
+        # ---- Runtime tracking
         self.completion_secs: Dict[str, float] = {}
         self.failed: List[str] = []
-        self.
-
-        if self.use_dask and Client is None:
-            raise RuntimeError("Dask is not installed, cannot use Dask mode")
-
-        # auto-start local client if requested
-        if self.use_dask and not self.client:
-            self.client = Client(
-                LocalCluster(
-                    n_workers=max_workers,
-                    threads_per_worker=1,
-                    dashboard_address=None,
-                )
-            )
-            self._owns_client = True
+        self._stop_event = asyncio.Event()
 
-
+        self.logger_extra = {"sibi_dst_component": self.__class__.__name__}
+
+        if self.use_dask:
+            self.logger.debug(f"Initialized with Dask client: {self.dask_client}")
+        else:
+            self.logger.debug(f"Running in local thread-based mode.")
+
+    async def update_data(self, period: str, **kwargs: Any) -> List[Dict[str, Any]]:
+        """Runs updates for all artifacts in a given period."""
+        self.completion_secs.clear()
+        self.failed.clear()
+        classes = self._classes_for(period)
+
+        self.logger.info(
+            f"Starting artifact updates for period '{period}' ({len(classes)} artifacts).",
+            extra=self.logger_extra,
+        )
 
-    def _classes_for(self, period: str) -> List[Type]:
         try:
-
-
-
-
-
-
+            if self.use_dask:
+                futures = [self._submit_one_dask(cls, period, kwargs) for cls in classes]
+                results = await asyncio.to_thread(lambda: self.dask_client.gather(futures))
+            else:
+                sem = asyncio.Semaphore(self.max_workers)
+                tasks = [self._run_one_async(cls, period, sem, kwargs) for cls in classes]
+                results = await asyncio.gather(*tasks)
+
+            self.logger.info(
+                f"Completed {len(results)} artifact updates for period '{period}'.",
+                extra=self.logger_extra,
+            )
+            return results
+
+        finally:
+            # Always cleanup if we own the client
+            if getattr(self, "own_dask_client", False):
+                self._close_dask_client()
+
+
+    def _sanitize_kwargs_for_dask(self, kwargs: Dict[str, Any]) -> Dict[str, Any]:
+        """
+        Removes non-picklable runtime objects (e.g., loggers, fs) before sending to Dask.
+        """
+        clean: Dict[str, Any] = {}
+        for k, v in kwargs.items():
             try:
-
-
-
-
+                pickle.dumps(v)
+                clean[k] = v
+            except Exception:
+                self.logger.debug(f"Skipping non-picklable key '{k}' for Dask worker.")
+        return clean
 
     def _submit_one_dask(self, cls: Type, period: str, artifact_kwargs: Dict[str, Any]):
-
+        """Submit one artifact job to Dask."""
+        safe_kwargs = self._sanitize_kwargs_for_dask(self.artifact_class_kwargs)
+        return self.dask_client.submit(
             run_artifact_update,
             cls,
-
+            safe_kwargs,
             self._retry,
             period,
             artifact_kwargs,
             pure=False,
         )
 
+    def _classes_for(self, period: str) -> List[Type]:
+        """Selects artifact classes for the given period."""
+        try:
+            classes = list(self.wrapped_classes[period])
+        except KeyError:
+            raise ValueError(f"No artifacts configured for period '{period}'.")
+        if not classes:
+            raise ValueError(f"No artifact classes found for '{period}'.")
+
+        if self.priority_fn:
+            with suppress(Exception):
+                classes.sort(key=self.priority_fn)
+        return classes
+
     async def _run_one_async(
         self,
         cls: Type,
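The _sanitize_kwargs_for_dask helper above relies on a simple pickle round-trip test before submission. A self-contained sketch of the same idea, outside the class:

    import pickle
    import threading

    def sanitize_for_workers(kwargs: dict) -> dict:
        """Keep only values that survive pickling; loggers, filesystems and locks usually do not."""
        clean = {}
        for key, value in kwargs.items():
            try:
                pickle.dumps(value)
                clean[key] = value
            except Exception:
                pass  # dropped here; the Dask worker rebuilds its own logger/fs
        return clean

    print(sanitize_for_workers({"debug": True, "lock": threading.Lock()}))  # {'debug': True}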
@@ -181,112 +247,70 @@ class ArtifactUpdaterMultiWrapperAsync(ManagedResource):
         sem: asyncio.Semaphore,
         artifact_kwargs: Dict[str, Any],
     ) -> Dict[str, Any]:
-        """
+        """Fallback local async execution (no Dask)."""
         name = cls.__name__
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                "success": success,
-                "error": error_msg,
-            }
-
-            if success:
-                self.logger.info(f"✅ Artifact {name} succeeded", extra=result)
-                self.completion_secs[name] = duration
-            else:
-                self.logger.error(f"❌ Artifact {name} failed", extra=result)
-                self.failed.append(name)
-
-            return result
+        start_time = datetime.datetime.now()
+
+        async with sem:
+            for attempt in range(1, self._retry.attempts + 1):
+                try:
+                    def _sync_block():
+                        with ExitStack() as stack:
+                            inst = cls(**self.artifact_class_kwargs)
+                            inst = stack.enter_context(inst)
+                            inst.update_parquet(period=period, **artifact_kwargs)
+
+                    await asyncio.wait_for(
+                        asyncio.to_thread(_sync_block),
+                        timeout=self.update_timeout_seconds,
+                    )
+                    duration = (datetime.datetime.now() - start_time).total_seconds()
+                    self.completion_secs[name] = duration
+                    self.logger.info(f"✅ {name} completed in {duration:.2f}s")
+                    return {
+                        "artifact": name,
+                        "period": period,
+                        "success": True,
+                        "attempts": attempt,
+                        "duration_seconds": duration,
+                    }
+
+                except Exception as e:
+                    if attempt < self._retry.attempts:
+                        delay = min(self._retry.backoff_base ** attempt, self._retry.backoff_max)
+                        delay *= 1 + random.uniform(0, self._retry.jitter)
+                        self.logger.warning(f"Retry {attempt}/{self._retry.attempts} for {name}: {e}")
+                        await asyncio.sleep(delay)
+                    else:
+                        duration = (datetime.datetime.now() - start_time).total_seconds()
+                        self.failed.append(name)
+                        self.logger.error(f"❌ {name} failed after {attempt} attempts: {e}")
+                        return {
+                            "artifact": name,
+                            "period": period,
+                            "success": False,
+                            "attempts": attempt,
+                            "error": str(e),
+                            "duration_seconds": duration,
+                        }
 
-    # ---- Public API -----------------------------------------------------------
-
-    async def update_data(self, period: str, **kwargs: Any) -> List[Dict[str, Any]]:
-        self.completion_secs.clear()
-        self.failed.clear()
-        classes = self._classes_for(period)
-
-        try:
-            if self.use_dask:
-                futures = [self._submit_one_dask(cls, period, kwargs) for cls in classes]
-                results = await asyncio.to_thread(lambda: self.client.gather(futures))
-            else:
-                sem = asyncio.Semaphore(self.max_workers)
-                tasks = [
-                    asyncio.create_task(self._run_one_async(cls, period, sem, kwargs))
-                    for cls in classes
-                ]
-                results = await asyncio.gather(*tasks)
-            return results
-        finally:
-            # only shut down if we own the client
-            if self._owns_client:
-                self.close()
 
     def get_update_status(self) -> Dict[str, Any]:
+        """Returns summary of completed, failed, and pending artifacts."""
         done = set(self.completion_secs)
         fail = set(self.failed)
-        all_names = {
+        all_names = {cls.__name__ for v in self.wrapped_classes.values() for cls in v}
         return {
             "total": len(all_names),
             "completed": sorted(done),
             "failed": sorted(fail),
             "pending": sorted(all_names - done - fail),
-            "completion_times":
+            "completion_times": self.completion_secs,
         }
 
-    # ---- Lifecycle ------------------------------------------------------------
-
     def _cleanup(self) -> None:
-        """
-
-
-
-            self.client.close()
-            if cluster is not None:
-                cluster.close()
-        finally:
-            self.client = None
-            self._owns_client = False
+        """Ensures safe resource closure."""
+        with suppress(Exception):
+            if getattr(self, "own_dask_client", False):
+                self._close_dask_client()
 
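Taken together, the reworked orchestrator is constructed with an explicit logger and filesystem and driven through update_data(). A hypothetical wiring sketch (the artifact class and logger are placeholders; only the constructor arguments and method names come from the diff, and a real sibi_dst Logger instance would normally be passed instead of the stdlib logger):

    import asyncio
    import logging
    import fsspec
    from sibi_dst.df_helper import ArtifactUpdaterMultiWrapperAsync

    class DemoArtifact:
        """Placeholder artifact: a context manager exposing update_parquet()."""
        def __init__(self, **kwargs): self.kwargs = kwargs
        def __enter__(self): return self
        def __exit__(self, *exc): return False
        def update_parquet(self, period, **kw): print(f"updating for {period}")

    async def main():
        updater = ArtifactUpdaterMultiWrapperAsync(
            wrapped_classes={"itd": [DemoArtifact]},     # hypothetical period -> classes mapping
            logger=logging.getLogger("demo"),            # assumption: stands in for a sibi_dst Logger
            fs=fsspec.filesystem("file"),
            use_dask=False,                              # exercise the thread-based fallback path
        )
        results = await updater.update_data(period="itd")
        print(results, updater.get_update_status())

    # asyncio.run(main())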
sibi_dst/utils/__init__.py
CHANGED
@@ -24,6 +24,7 @@ from .manifest_manager import MissingManifestManager
 __all__ = [
     "Logger",
     "ManagedResource",
+
     "ConfigManager",
     "ConfigLoader",
     "DateUtils",
@@ -42,5 +43,5 @@ __all__ = [
     "FsRegistry",
     "DataFromHttpSource",
     "WebDAVClient",
-    "MissingManifestManager"
+    "MissingManifestManager",
 ]
sibi_dst/utils/boilerplate/base_pipeline.py
CHANGED
@@ -93,7 +93,7 @@ class BasePipeline(ManagedResource):
         df[self.date_field] = dd.to_datetime(df[self.date_field], errors="coerce")
         df["partition_date"] = df[self.date_field].dt.date.astype(str)
 
-        out_path = self.storage_path.rstrip("/")
+        out_path = self.storage_path.rstrip("/")
         self.logger.info("Saving dataset to %s", out_path)
         ps = ParquetSaver(
             df_result=df,
@@ -111,7 +111,6 @@ class BasePipeline(ManagedResource):
             parquet_start_date=self.start_date,
             parquet_end_date=self.end_date,
             parquet_storage_path=self.storage_path,
-            parquet_filename=self._get_output_filename(),
             fs=self.fs,
             debug=self.debug,
             logger=self.logger,
sibi_dst/utils/dask_utils.py
CHANGED
@@ -1,7 +1,11 @@
+from __future__ import annotations
+
 import asyncio
+import logging
 from typing import List, Any, Dict
 
 import dask
+# dask.config.set({"distributed.worker.daemon": False})
 import dask.dataframe as dd
 
 def _to_int_safe(x) -> int:
@@ -58,4 +62,139 @@ class UniqueValuesExtractor:
             return col, await self.compute_to_list(ser)
 
         pairs = await asyncio.gather(*(one(c) for c in columns))
-        return dict(pairs)
+        return dict(pairs)
+
+from contextlib import suppress, asynccontextmanager
+from dask.distributed import Client, LocalCluster, get_client
+import os
+
+class DaskClientMixin:
+    """
+    Provides shared Dask client lifecycle management.
+    Ensures reuse of an existing client if available,
+    or creates a local in-process Dask cluster for fallback.
+    """
+
+    def _init_dask_client(
+        self,
+        dask_client=None,
+        logger=None,
+        *,
+        n_workers: int = 1,
+        threads_per_worker: int = 1,
+        processes: bool = False,
+        asynchronous: bool = False,
+        memory_limit: str = "auto",
+        #dashboard_address: str | None = None,
+        local_directory: str | None = None,
+        silence_logs: str = "info",
+        resources: dict | None = None,
+        timeout: int = 30,
+    ):
+        self.dask_client = dask_client
+        self.own_dask_client = False
+        self.logger = logger
+        # Apply log filters globally
+        logging.getLogger("distributed.shuffle._scheduler_plugin").setLevel(
+            logging.ERROR
+        )
+        logging.getLogger("distributed.scheduler").setLevel(logging.WARNING)
+        logging.getLogger("distributed.worker").setLevel(logging.WARNING)
+
+        if self.dask_client is None:
+            with suppress(ValueError, RuntimeError):
+                # Try to attach to an existing client (common in shared Dask setups)
+                self.dask_client = get_client()
+
+        if self.dask_client is None:
+            # Default to half of logical cores if not specified
+            n_workers = n_workers or max(2, os.cpu_count() // 2)
+
+            cluster = LocalCluster(
+                n_workers=n_workers,
+                threads_per_worker=threads_per_worker,
+                processes=processes,
+                asynchronous=asynchronous,
+                memory_limit=memory_limit,
+                local_directory=local_directory,
+                silence_logs=silence_logs,
+                resources=resources,
+                timeout=timeout,
+            )
+
+            self.dask_client = Client(cluster)
+            self.own_dask_client = True
+
+            if self.logger:
+                self.logger.info(
+                    f"Started local Dask cluster with {n_workers} workers × {threads_per_worker} threads "
+                    f"({memory_limit} memory per worker). Dashboard: {self.dask_client.dashboard_link}"
+                )
+        else:
+            if self.logger:
+                self.logger.debug(
+                    f"Using existing Dask client: {self.dask_client.dashboard_link}"
+                )
+
+    def _close_dask_client(self):
+        """Close the Dask client if this instance created it."""
+        if getattr(self, "own_dask_client", False) and self.dask_client is not None:
+            try:
+                cluster = getattr(self.dask_client, "cluster", None)
+                self.dask_client.close()
+                if cluster is not None:
+                    cluster.close()
+                if self.logger:
+                    self.logger.info("Closed local Dask client and cluster.")
+            except Exception as e:
+                if self.logger:
+                    self.logger.warning(f"Error while closing Dask client: {e}")
+
+@asynccontextmanager
+async def shared_dask_session(**kwargs):
+    mixin = DaskClientMixin()
+    mixin._init_dask_client(**kwargs)
+    try:
+        yield mixin.dask_client
+    finally:
+        mixin._close_dask_client()
+
+# from contextlib import suppress
+# from dask.distributed import Client, get_client
+#
+# class DaskClientMixin:
+#     """
+#     Provides shared Dask client lifecycle management.
+#     Ensures reuse of existing client when available, otherwise creates a lightweight local one.
+#     """
+#
+#     def _init_dask_client(self, dask_client=None, logger=None):
+#         self.dask_client = dask_client
+#         self.own_dask_client = False
+#         self.logger = logger
+#
+#         if self.dask_client is None:
+#             with suppress(ValueError, RuntimeError):
+#                 # Try to attach to an existing active client if running inside a Dask context
#                 self.dask_client = get_client()
+#
+#         if self.dask_client is None:
+#             # Start a local in-process scheduler for fallback
+#             self.dask_client = Client(processes=False)
+#             self.own_dask_client = True
+#             if self.logger:
+#                 self.logger.info(f"Started local Dask client: {self.dask_client.dashboard_link}")
+#         else:
+#             if self.logger:
+#                 self.logger.debug(f"Using existing Dask client: {self.dask_client.dashboard_link}")
+#
+#     def _close_dask_client(self):
+#         """Close client only if this instance created it."""
+#         if getattr(self, "own_dask_client", False) and self.dask_client is not None:
+#             try:
+#                 self.dask_client.close()
+#                 if self.logger:
+#                     self.logger.info("Closed local Dask client.")
+#             except Exception as e:
+#                 if self.logger:
+#                     self.logger.warning(f"Error while closing Dask client: {e}")
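A usage sketch for the new shared_dask_session helper (argument values are illustrative; the helper and DaskClientMixin come from the diff above):

    import asyncio
    import pandas as pd
    import dask.dataframe as dd
    from sibi_dst.utils.dask_utils import shared_dask_session

    async def demo():
        async with shared_dask_session(n_workers=2, threads_per_worker=1) as client:
            ddf = dd.from_pandas(pd.DataFrame({"x": range(10)}), npartitions=2)
            print(ddf["x"].sum().compute())   # computed on the shared client
            print(client.dashboard_link)

    # asyncio.run(demo())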
sibi_dst/utils/data_wrapper.py
CHANGED
@@ -87,17 +87,6 @@ class DataWrapper(ManagedResource):
             "dataclass": self.dataclass.__name__
         })
 
-    # --------------------- Context Management ---------------------
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        """Ensure manifest is saved and resources are cleaned up on context exit."""
-        if self.mmanifest:
-            try:
-                self.mmanifest.save()
-            except Exception as e:
-                self.logger.error(f"Failed to save manifest in __exit__: {e}", extra=self.logger_extra)
-        # Call parent's __exit__ which triggers _cleanup
-        return super().__exit__(exc_type, exc_val, exc_tb)
-
     # --------------------- Cleanup ---------------------
     def _cleanup(self) -> None:
         """Signal shutdown during class-specific cleanup."""
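With the __exit__ override gone, DataWrapper relies on the base class's context-manager exit to call _cleanup() (the removed comment itself notes that the parent __exit__ triggers _cleanup). A pattern sketch, not the library's code:

    class ManagedResourceLike:
        """Stand-in for the base class: __exit__ delegates to a _cleanup() hook."""
        def __enter__(self): return self
        def __exit__(self, exc_type, exc, tb):
            self._cleanup()
            return False
        def _cleanup(self): pass

    class DataWrapperLike(ManagedResourceLike):
        def _cleanup(self):
            print("flush manifest, signal shutdown")

    with DataWrapperLike():
        pass  # _cleanup() runs on exit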
{sibi_dst-2025.9.11.dist-info → sibi_dst-2025.9.13.dist-info}/METADATA
CHANGED
@@ -1,34 +1,31 @@
-Metadata-Version: 2.
+Metadata-Version: 2.4
 Name: sibi-dst
-Version: 2025.9.
-Summary:
-
-Author-email: lvalverdeb@gmail.com
-Requires-Python: >=3.11,<4.0
-Classifier: Programming Language :: Python :: 3
-Classifier: Programming Language :: Python :: 3.11
-Classifier: Programming Language :: Python :: 3.12
-Classifier: Programming Language :: Python :: 3.13
-Requires-Dist: clickhouse-connect (>=0.8.18,<0.9.0)
-Requires-Dist: clickhouse-driver (>=0.2.9,<0.3.0)
-Requires-Dist: dask[complete] (>=2025.9.0,<2026.0.0)
-Requires-Dist: distributed (>=2025.9.1,<2026.0.0)
-Requires-Dist: mysqlclient (>=2.2.7,<3.0.0)
-Requires-Dist: opentelemetry-exporter-otlp (>=1.35.0,<2.0.0)
-Requires-Dist: opentelemetry-sdk (>=1.35.0,<2.0.0)
-Requires-Dist: pandas (>=2.3.1,<3.0.0)
-Requires-Dist: psycopg2 (>=2.9.10,<3.0.0)
-Requires-Dist: pyarrow (>=20.0.0,<21.0.0)
-Requires-Dist: pydantic (>=2.11.7,<3.0.0)
-Requires-Dist: pyiceberg[hive,s3fs] (>=0.9.1,<0.10.0)
-Requires-Dist: pymysql (>=1.1.1,<2.0.0)
-Requires-Dist: pyrosm (>=0.6.2,<0.7.0)
-Requires-Dist: s3fs (>=2025.5.1,<2026.0.0)
-Requires-Dist: sqlalchemy (>=2.0.41,<3.0.0)
-Requires-Dist: sse-starlette (>=3.0.2,<4.0.0)
-Requires-Dist: tqdm (>=4.67.1,<5.0.0)
-Requires-Dist: webdav4 (>=0.10.0,<0.11.0)
+Version: 2025.9.13
+Summary: A data science toolkit for scalable data processing and analysis.
+Requires-Python: >=3.11
 Description-Content-Type: text/markdown
+Requires-Dist: clickhouse-connect>=0.9.2
+Requires-Dist: clickhouse-driver>=0.2.9
+Requires-Dist: dask>=2025.9.1
+Requires-Dist: distributed>=2025.9.1
+Requires-Dist: fastapi>=0.118.0
+Requires-Dist: folium>=0.20.0
+Requires-Dist: mysqlclient>=2.2.7
+Requires-Dist: opentelemetry-api>=1.37.0
+Requires-Dist: opentelemetry-exporter-otlp>=1.37.0
+Requires-Dist: opentelemetry-sdk>=1.37.0
+Requires-Dist: pandas>=2.3.3
+Requires-Dist: psycopg2>=2.9.10
+Requires-Dist: pyarrow>=21.0.0
+Requires-Dist: pydantic>=2.11.10
+Requires-Dist: pymysql>=1.1.2
+Requires-Dist: redis>=6.4.0
+Requires-Dist: s3fs>=2025.9.0
+Requires-Dist: sqlalchemy>=2.0.43
+Requires-Dist: tqdm>=4.67.1
+Requires-Dist: uvicorn>=0.37.0
+Requires-Dist: webdav4>=0.10.0
+Requires-Dist: wheel>=0.45.1
 
 ### SIBI-DST
 
@@ -60,4 +57,3 @@ pip install sibi-dst[dev,test,geospatial] # Install all optional dependencies
 
 
 ```
-
{sibi_dst-2025.9.11.dist-info → sibi_dst-2025.9.13.dist-info}/RECORD
CHANGED
@@ -1,6 +1,6 @@
-sibi_dst/__init__.py,sha256=
-sibi_dst/df_helper/__init__.py,sha256=
-sibi_dst/df_helper/_artifact_updater_async.py,sha256=
+sibi_dst/__init__.py,sha256=QQVT3Xlj8iZN17sSMfRQFSb_DHr8A7giJP8hn02K2Oo,585
+sibi_dst/df_helper/__init__.py,sha256=7rUdMybgCNZhQL_J7IFTTHz_xtFin81xavi5-PUExkA,463
+sibi_dst/df_helper/_artifact_updater_async.py,sha256=AZp0vM3vji0tjiaScr8a9SUMH15NjPIKYPdRQ7SJe3Y,11372
 sibi_dst/df_helper/_artifact_updater_threaded.py,sha256=M5GNZismOqMmBrcyfolP1DPv87VILQf_P18is_epn50,7238
 sibi_dst/df_helper/_df_helper.py,sha256=rgVP4ggiCW6tTHmUz2UqUvLznwOtY5IyoVS3WSlg73U,17005
 sibi_dst/df_helper/_parquet_artifact.py,sha256=UXkhDSAVRNKp9DykVhJd3agnryCZT0Sj2qhdhUZomuM,19421
@@ -22,39 +22,30 @@ sibi_dst/df_helper/core/_defaults.py,sha256=9UMEMu2wXznO5UzEhnQ82f_ZazZ20JRyRXIi
 sibi_dst/df_helper/core/_filter_handler.py,sha256=9C30zrT8wSGy1X8ryiTWc0XfnbpeoHndHgoOcHKOPOo,19309
 sibi_dst/df_helper/core/_params_config.py,sha256=DYx2drDz3uF-lSPzizPkchhy-kxRrQKE5FQRxcEWsac,6736
 sibi_dst/df_helper/core/_query_config.py,sha256=1ApqmuSGXTC3CdF-xMsSbCa3V2Z5hOP3Wq5huhzZwqY,439
-sibi_dst/df_helper/data_cleaner.py,sha256=lkxQoXLvGzXCicFUimnA5nen5qkrO1oxgl_p2Be2o8w,5183
 sibi_dst/geopy_helper/__init__.py,sha256=Q1RJiUZIOlV0QNNLjxZ_2IZS5LqIe5jRbeQkfD1Vm60,112
 sibi_dst/geopy_helper/geo_location_service.py,sha256=1ArI980QF_gRw096ZsABHwJt-m55jrfOlB8tPwL1BvY,2959
 sibi_dst/geopy_helper/utils.py,sha256=Sb7qfSqIyWh-AZ4GBdB9-z5FrQPWtrdtQLLcNjph0yw,3351
-sibi_dst/osmnx_helper/__init__.py,sha256=
+sibi_dst/osmnx_helper/__init__.py,sha256=hTv1uCnN35MSUEYNWk30OHHSflDb1qidwfLDXhUU8cE,243
 sibi_dst/osmnx_helper/base_osm_map.py,sha256=L7g3VBiayHX41BcCBTOCS0iJOKzp2ZZYcrp8N-mnU90,19392
+sibi_dst/osmnx_helper/route_path_builder.py,sha256=XJJyu4YXegAkCRjE-knyQncwXaxDVXZhalYacLcb7e0,3557
+sibi_dst/osmnx_helper/utils.py,sha256=7-lFVhGn4rHjGz6FvpXtC2jY8UzGIVyKR3MVyEfB7nw,14407
 sibi_dst/osmnx_helper/basemaps/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sibi_dst/osmnx_helper/basemaps/calendar_html.py,sha256=UArt6FDgoCgoRte45Xo3IHqd-RNzW0YgitgZYfOFasY,4031
 sibi_dst/osmnx_helper/basemaps/route_map_plotter.py,sha256=rsJidieojcqIoe0kBanZbrxcelrS6nWoAyWoQXWdPiQ,11849
 sibi_dst/osmnx_helper/basemaps/router_plotter.py,sha256=UAiijn-J-jjX4YnL0_P9SFqTadrxMx-YK4djYhqPqfQ,10941
-sibi_dst/osmnx_helper/route_path_builder.py,sha256=XJJyu4YXegAkCRjE-knyQncwXaxDVXZhalYacLcb7e0,3557
-sibi_dst/osmnx_helper/utils.py,sha256=7-lFVhGn4rHjGz6FvpXtC2jY8UzGIVyKR3MVyEfB7nw,14407
 sibi_dst/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sibi_dst/tests/test_baseclass.py,sha256=5huAwjWo_SOEZR2_0y5w9qUmw5G7pVdm8X1OTG87JK0,11562
 sibi_dst/tests/test_data_wrapper_class.py,sha256=6uFmZR2DxnxQz49L5jT2ehlKvlLnpUHMLFB_PqqUq7k,3336
-sibi_dst/utils/__init__.py,sha256=
+sibi_dst/utils/__init__.py,sha256=eoW7iROrCUVYjNT1owMgGvW6U7lolbNy3FrBb_wInPs,1304
 sibi_dst/utils/async_utils.py,sha256=53aywfgq1Q6-0OVr9qR1Sf6g7Qv3I9qunAAR4fjFXBE,351
 sibi_dst/utils/base.py,sha256=sFngliI7Ku8bZMz0YdVhppuaPNZ0dvqRwCsPe9XdF1A,16256
-sibi_dst/utils/boilerplate/__init__.py,sha256=89oepkDCnjegnhzd6Kgga71AH17KSvNMlgD83CAAmg8,582
-sibi_dst/utils/boilerplate/base_attacher.py,sha256=iZftWNUx8y370OJP_kGCs5v3t2RgPuARIK_jQeFfbAU,2089
-sibi_dst/utils/boilerplate/base_data_cube.py,sha256=ErKTM2kT8LsSXADcyYvT436O_Mp0J2hm8xs1IUircb4,2760
-sibi_dst/utils/boilerplate/base_parquet_artifact.py,sha256=oqPbjHFfChA9j1WL-eDAh7XLA3zmf-Rq7s_kzITVniA,3753
-sibi_dst/utils/boilerplate/base_parquet_reader.py,sha256=3kN9_bbxyX-WuJLMBWejeApW2V_BDArSljhSUOAOhVU,692
-sibi_dst/utils/boilerplate/base_pipeline.py,sha256=LQZBACksqHO3tQ8OhWShfqjiGyda7UhrmllRq3eWQfU,5690
-sibi_dst/utils/boilerplate/base_pipeline_template.py,sha256=D5HFA4odsR2wlTY6iLg1tm57Tsh91QkoYjjX8eUgrjU,1574
-sibi_dst/utils/boilerplate/hybrid_data_loader.py,sha256=Tazn7QL3FmWKVMXxzkvxPrG_2ucsPHvSotIW9dBLoNc,6018
 sibi_dst/utils/business_days.py,sha256=DPZExTXTt7n3IbAaEuVacm-vZgbR_Ug2bJTPBUaoP3g,6694
 sibi_dst/utils/clickhouse_writer.py,sha256=8W_dTEOKQp4pXANznVSxRqFA2H5oD8UJifiBAONpXWY,17001
 sibi_dst/utils/credentials.py,sha256=cHJPPsmVyijqbUQIq7WWPe-lIallA-mI5RAy3YUuRME,1724
-sibi_dst/utils/dask_utils.py,sha256=
+sibi_dst/utils/dask_utils.py,sha256=FT85GPvHxaR7BDhxgeQT4mgQ_lFmNT5CVmiyhRv9Fns,7251
 sibi_dst/utils/data_from_http_source.py,sha256=AcpKNsqTgN2ClNwuhgUpuNCx62r5_DdsAiKY8vcHEBA,1867
 sibi_dst/utils/data_utils.py,sha256=7bLidEjppieNoozDFb6OuRY0W995cxg4tiGAlkGfePI,7768
-sibi_dst/utils/data_wrapper.py,sha256
+sibi_dst/utils/data_wrapper.py,sha256=9HTuDXgvfhmFAOyNG_GEOaHuojxE3639yyzOoBt7Unc,18000
 sibi_dst/utils/date_utils.py,sha256=hBVWu9_cqiZ-XsLR7QY9Iek09DQKLwrY1ZlYxWlXj7g,31101
 sibi_dst/utils/df_utils.py,sha256=bQGromLOEdRTvbVVcuHq0vQ0fIgqhwOoD_eIp5v7VEY,10899
 sibi_dst/utils/file_age_checker.py,sha256=44B3lwH_PLwzMfiKkgvJKjKx-qSgITIXxKfNbdf_VeA,11552
@@ -66,15 +57,23 @@ sibi_dst/utils/manifest_manager.py,sha256=9y4cV-Ig8O-ekhApp_UObTY-cTsl-bGnvKIThI
 sibi_dst/utils/parquet_saver.py,sha256=Itctsf8UnBCnD6NrP00FK0y9KzZgKYfjUk1CC-0x-F0,20486
 sibi_dst/utils/periods.py,sha256=8eTGi-bToa6_a8Vwyg4fkBPryyzft9Nzy-3ToxjqC8c,1434
 sibi_dst/utils/phone_formatter.py,sha256=oeM22nLjhObENrpItCNeVpkYS4pXRm5hSxdk0M4nvwU,4580
-sibi_dst/utils/progress/__init__.py,sha256=VELVxzo2cePN_-LL0veel8-F3po6tokY5MOOpu6pz1A,92
-sibi_dst/utils/progress/jobs.py,sha256=nE58ng9GPCPZhnaCDltr1tQgu3AJVqBJ1dWbGcCH4xo,3089
-sibi_dst/utils/progress/sse_runner.py,sha256=NttASZH_ayXo1Zi6I4tSwYnWySLxexOYQGlqzOZiXlI,4965
 sibi_dst/utils/storage_config.py,sha256=DLtP5jKVM0mdFdgRw6LQfRqyavMjJcCVU7GhsUCRH78,4427
 sibi_dst/utils/storage_hive.py,sha256=eZ3nq2YWLUUG-06iJubSC15cwSHEbKKdKIwoVhD_I_E,8568
 sibi_dst/utils/storage_manager.py,sha256=La1NY79bhRAmHWXp7QcXJZtbHoRboJMgoXOSXbIl1SA,6643
 sibi_dst/utils/update_planner.py,sha256=1UOh4MjZSfaA_ZO-nKailOGal5EY-xVR8KSCJzo7p_g,16834
 sibi_dst/utils/webdav_client.py,sha256=D9J5d1f1qQwHGm5FE5AMVpOPwcU5oD7K8JZoKGP8NpM,5811
 sibi_dst/utils/write_gatekeeper.py,sha256=V8sY9YMO-JuN8Ps7prqwVSjP4f1HGH9KiVV-aTPCC_k,569
+sibi_dst/utils/boilerplate/__init__.py,sha256=89oepkDCnjegnhzd6Kgga71AH17KSvNMlgD83CAAmg8,582
+sibi_dst/utils/boilerplate/base_attacher.py,sha256=iZftWNUx8y370OJP_kGCs5v3t2RgPuARIK_jQeFfbAU,2089
+sibi_dst/utils/boilerplate/base_data_cube.py,sha256=ErKTM2kT8LsSXADcyYvT436O_Mp0J2hm8xs1IUircb4,2760
+sibi_dst/utils/boilerplate/base_parquet_artifact.py,sha256=oqPbjHFfChA9j1WL-eDAh7XLA3zmf-Rq7s_kzITVniA,3753
+sibi_dst/utils/boilerplate/base_parquet_reader.py,sha256=3kN9_bbxyX-WuJLMBWejeApW2V_BDArSljhSUOAOhVU,692
+sibi_dst/utils/boilerplate/base_pipeline.py,sha256=cPVvjGxd0tDDvP_c27MNDHNuNnM6k3yx5bivN5ZFgrQ,5587
+sibi_dst/utils/boilerplate/base_pipeline_template.py,sha256=D5HFA4odsR2wlTY6iLg1tm57Tsh91QkoYjjX8eUgrjU,1574
+sibi_dst/utils/boilerplate/hybrid_data_loader.py,sha256=Tazn7QL3FmWKVMXxzkvxPrG_2ucsPHvSotIW9dBLoNc,6018
+sibi_dst/utils/progress/__init__.py,sha256=VELVxzo2cePN_-LL0veel8-F3po6tokY5MOOpu6pz1A,92
+sibi_dst/utils/progress/jobs.py,sha256=nE58ng9GPCPZhnaCDltr1tQgu3AJVqBJ1dWbGcCH4xo,3089
+sibi_dst/utils/progress/sse_runner.py,sha256=NttASZH_ayXo1Zi6I4tSwYnWySLxexOYQGlqzOZiXlI,4965
 sibi_dst/v2/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sibi_dst/v2/df_helper/__init__.py,sha256=XuH6jKYAPg2DdRbsxxBSxp9X3x-ARyaT0xe27uILrVo,99
 sibi_dst/v2/df_helper/_df_helper.py,sha256=9pED3bjQ2Z81zqzJrZ9e7SguoO4-hBmNTJK4WOKrr4M,9297
@@ -95,6 +94,7 @@ sibi_dst/v2/df_helper/core/_params_config.py,sha256=DYx2drDz3uF-lSPzizPkchhy-kxR
 sibi_dst/v2/df_helper/core/_query_config.py,sha256=Y8LVSyaKuVkrPluRDkQoOwuXHQxner1pFWG3HPfnDHM,441
 sibi_dst/v2/utils/__init__.py,sha256=6H4cvhqTiFufnFPETBF0f8beVVMpfJfvUs6Ne0TQZNY,58
 sibi_dst/v2/utils/log_utils.py,sha256=rfk5VsLAt-FKpv6aPTC1FToIPiyrnHAFFBAkHme24po,4123
-sibi_dst-2025.9.
-sibi_dst-2025.9.
-sibi_dst-2025.9.
+sibi_dst-2025.9.13.dist-info/METADATA,sha256=v8QE7j-yP0-1s6i4UcOhHIjsDpvSJGtBvzSkLy3f-v8,2413
+sibi_dst-2025.9.13.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+sibi_dst-2025.9.13.dist-info/top_level.txt,sha256=g3Cj4R-rciuNyJgcxuxNgw5nhN0n4TCB0ujcTEjZNiU,9
+sibi_dst-2025.9.13.dist-info/RECORD,,
sibi_dst-2025.9.13.dist-info/top_level.txt
ADDED
@@ -0,0 +1 @@
+sibi_dst
sibi_dst/df_helper/data_cleaner.py
DELETED
@@ -1,132 +0,0 @@
-import re
-from nltk.corpus import stopwords
-from nltk.stem import SnowballStemmer
-import dask.dataframe as dd
-from dask_ml.preprocessing import OneHotEncoder, LabelEncoder
-import nltk
-
-class DataCleaner:
-    def __init__(self, dataframe):
-        self.original_df = dataframe
-        self.df = dataframe.copy()
-        self.duplicates_df = None
-
-    def handle_missing_values(self, strategy='mean'):
-        if strategy == 'mean':
-            self.df = self.df.fillna(self.df.mean())
-        elif strategy == 'median':
-            self.df = self.df.fillna(self.df.median())
-        elif strategy == 'mode':
-            self.df = self.df.fillna(self.df.mode().iloc[0])
-        elif strategy == 'drop':
-            self.df = self.df.dropna()
-        return self
-
-    def identify_duplicates(self, subset=None):
-        self.duplicates_df = self.df.map_partitions(lambda df: df[df.duplicated(subset=subset, keep=False)])
-        return self.duplicates_df
-
-    def remove_duplicates(self):
-        if self.duplicates_df is not None:
-            self.df = self.df[~self.df.index.isin(self.duplicates_df.index)]
-        return self
-
-    def validate_date_fields(self, date_columns=None):
-        if date_columns is None:
-            date_columns = self.df.select_dtypes(include=['datetime', 'datetime64[ns]', 'datetime64[ns, UTC]']).columns
-        for col in date_columns:
-            print('Validating date field: ', col)
-            self.df[col] = dd.to_datetime(self.df[col], errors='coerce')
-        return self
-
-    def clean_text(self, text_columns=None, language='english'):
-        nltk.download('stopwords')
-        stop_words = set(stopwords.words(language))
-        stemmer = SnowballStemmer(language)
-
-        def clean_text(text):
-            if isinstance(text, str):
-                text = text.strip().lower()  # Remove leading/trailing whitespace and convert to lowercase
-                text = re.sub(r'[^\w\s]', '', text)  # Remove special characters and punctuation
-                words = text.split()
-                words = [word for word in words if word not in stop_words]  # Remove stop words
-                words = [stemmer.stem(word) for word in words]  # Apply stemming
-                return ' '.join(words)
-            return text
-
-        if text_columns is None:
-            text_columns = self.df.select_dtypes(include=['object', 'string']).columns
-            text_columns = [col for col in text_columns if self.df[col].dtype != 'bool']
-
-        for col in text_columns:
-            print('Cleaning text field: ', col)
-            self.df[col] = self.df[col].map(clean_text, meta=('cleaned_text', 'object'))
-        return self
-
-    def validate_numeric_fields(self, int_columns=None, float_columns=None):
-        if int_columns is None:
-            int_columns = self.df.select_dtypes(include=['int64', 'int32']).columns
-        if float_columns is None:
-            float_columns = self.df.select_dtypes(include=['float64', 'float32']).columns
-
-        for col in int_columns:
-            print('Validating integer field: ', col)
-            self.df[col] = dd.to_numeric(self.df[col], errors='coerce', downcast='integer')
-
-        for col in float_columns:
-            print('Validating float field: ', col)
-            self.df[col] = dd.to_numeric(self.df[col], errors='coerce', downcast='float')
-
-        return self
-
-    def detect_categorical_columns(self, threshold=0.05):
-        """
-        Detect columns that can be converted to 'category' dtype.
-
-        Parameters:
-        threshold (float): The maximum ratio of unique values to total values for a column to be considered categorical.
-
-        Returns:
-        List of column names that can be converted to 'category' dtype.
-        """
-        categorical_columns = []
-
-        def unique_ratio(partition, col):
-            return partition[col].nunique() / len(partition)
-
-        for col in self.df.columns:
-            print("Detecting categorical columns: ", col)
-            unique_ratios = self.df.map_partitions(unique_ratio, col=col).compute()
-            overall_unique_ratio = unique_ratios.sum() / len(self.df)
-            if overall_unique_ratio < threshold:
-                print(f'Column {col} is categorical')
-                categorical_columns.append(col)
-
-        return categorical_columns
-
-    def handle_categorical_variables(self, columns=None, method='onehot', threshold=0.05):
-        if columns is None:
-            columns = self.detect_categorical_columns(threshold)
-
-        if method == 'onehot':
-            for col in columns:
-                self.df[col] = self.df[col].astype('category')
-            encoder = OneHotEncoder(sparse_output=False)
-            self.df = encoder.fit_transform(self.df)
-        elif method == 'label':
-            encoder = LabelEncoder()
-            for col in columns:
-                self.df[col] = encoder.fit_transform(self.df[col])
-        return self
-
-    def analyze_dtypes(self):
-        return self.df.dtypes
-
-    def get_cleaned_dataframe(self):
-        return self.df
-
-    def get_original_dataframe(self):
-        return self.original_df
-
-    def get_duplicates_dataframe(self):
-        return self.duplicates_df