sibi-dst 2025.9.9__py3-none-any.whl → 2025.9.10__py3-none-any.whl
This diff shows the changes between two publicly available versions of a package, each of which has been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
- sibi_dst/df_helper/_artifact_updater_async.py +191 -137
- sibi_dst/df_helper/_parquet_artifact.py +6 -326
- sibi_dst/df_helper/_parquet_reader.py +2 -1
- sibi_dst/df_helper/backends/parquet/_parquet_options.py +26 -2
- sibi_dst/utils/boilerplate/__init__.py +5 -3
- sibi_dst/utils/boilerplate/base_pipeline.py +14 -29
- sibi_dst/utils/clickhouse_writer.py +1 -1
- sibi_dst/utils/data_wrapper.py +46 -312
- sibi_dst/utils/parquet_saver.py +29 -16
- sibi_dst/utils/progress/sse_runner.py +39 -11
- sibi_dst/utils/update_planner.py +161 -805
- {sibi_dst-2025.9.9.dist-info → sibi_dst-2025.9.10.dist-info}/METADATA +2 -1
- {sibi_dst-2025.9.9.dist-info → sibi_dst-2025.9.10.dist-info}/RECORD +14 -14
- {sibi_dst-2025.9.9.dist-info → sibi_dst-2025.9.10.dist-info}/WHEEL +0 -0
@@ -6,10 +6,16 @@ import random
|
|
6
6
|
import time
|
7
7
|
from contextlib import ExitStack
|
8
8
|
from dataclasses import dataclass
|
9
|
-
from typing import Any, Callable, Dict, List, Optional, Sequence,
|
9
|
+
from typing import Any, Callable, Dict, List, Optional, Sequence, Type
|
10
10
|
|
11
11
|
from sibi_dst.utils import ManagedResource
|
12
12
|
|
13
|
+
try:
|
14
|
+
from dask.distributed import Client, LocalCluster
|
15
|
+
except ImportError:
|
16
|
+
Client = None
|
17
|
+
LocalCluster = None
|
18
|
+
|
13
19
|
|
14
20
|
@dataclass(slots=True)
|
15
21
|
class _RetryCfg:
|
@@ -19,30 +25,59 @@ class _RetryCfg:
|
|
19
25
|
jitter: float = 0.15
|
20
26
|
|
21
27
|
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
"
|
31
|
-
|
28
|
+
# ---------------- Worker (safe for Dask pickling) ----------------
|
29
|
+
def run_artifact_update(
|
30
|
+
cls: Type,
|
31
|
+
artifact_class_kwargs: Dict[str, Any],
|
32
|
+
retry: _RetryCfg,
|
33
|
+
period: str,
|
34
|
+
artifact_kwargs: Dict[str, Any],
|
35
|
+
) -> Dict[str, Any]:
|
36
|
+
"""Standalone worker — safe for Dask distributed execution."""
|
37
|
+
import logging
|
38
|
+
|
39
|
+
logger = logging.getLogger(cls.__name__)
|
32
40
|
|
41
|
+
start_wall = datetime.datetime.now()
|
42
|
+
attempt_count = 0
|
43
|
+
success = False
|
44
|
+
error_msg = None
|
45
|
+
|
46
|
+
for attempt in range(1, retry.attempts + 1):
|
47
|
+
attempt_count = attempt
|
48
|
+
try:
|
49
|
+
with ExitStack() as stack:
|
50
|
+
inst = cls(**artifact_class_kwargs)
|
51
|
+
inst = stack.enter_context(inst)
|
52
|
+
inst.update_parquet(period=period, **artifact_kwargs)
|
53
|
+
success = True
|
54
|
+
break
|
55
|
+
except Exception as e:
|
56
|
+
error_msg = str(e)
|
57
|
+
if attempt < retry.attempts:
|
58
|
+
delay = min(retry.backoff_base ** (attempt - 1), retry.backoff_max)
|
59
|
+
delay *= 1 + random.uniform(0, retry.jitter)
|
60
|
+
time.sleep(delay)
|
61
|
+
|
62
|
+
end_wall = datetime.datetime.now()
|
63
|
+
duration = (end_wall - start_wall).total_seconds()
|
33
64
|
|
34
|
-
def _default_artifact_kwargs(resource: ManagedResource) -> Dict[str, Any]:
|
35
65
|
return {
|
36
|
-
"
|
37
|
-
"
|
38
|
-
"
|
39
|
-
"
|
66
|
+
"artifact": cls.__name__,
|
67
|
+
"period": period,
|
68
|
+
"start": start_wall.isoformat(),
|
69
|
+
"end": end_wall.isoformat(),
|
70
|
+
"processing_time": duration,
|
71
|
+
"retries": attempt_count - 1 if success else attempt_count,
|
72
|
+
"success": success,
|
73
|
+
"error": error_msg,
|
40
74
|
}
|
41
75
|
|
42
76
|
|
43
77
|
class ArtifactUpdaterMultiWrapperAsync(ManagedResource):
|
44
78
|
"""
|
45
|
-
|
79
|
+
Async/Threaded orchestrator.
|
80
|
+
Dask-enabled if a Client is passed (or created automatically).
|
46
81
|
"""
|
47
82
|
|
48
83
|
def __init__(
|
@@ -57,13 +92,19 @@ class ArtifactUpdaterMultiWrapperAsync(ManagedResource):
|
|
57
92
|
backoff_jitter: float = 0.15,
|
58
93
|
priority_fn: Optional[Callable[[Type], int]] = None,
|
59
94
|
artifact_class_kwargs: Optional[Dict[str, Any]] = None,
|
95
|
+
dask_client: Optional[Client] = None,
|
96
|
+
use_dask: bool = True,
|
60
97
|
**kwargs: Any,
|
61
98
|
) -> None:
|
62
99
|
super().__init__(**kwargs)
|
100
|
+
|
63
101
|
self.wrapped_classes = wrapped_classes
|
64
102
|
self.max_workers = int(max_workers)
|
65
103
|
self.update_timeout_seconds = int(update_timeout_seconds)
|
66
104
|
self.priority_fn = priority_fn
|
105
|
+
self.use_dask = use_dask
|
106
|
+
self.client: Optional[Client] = dask_client
|
107
|
+
self._owns_client = False
|
67
108
|
|
68
109
|
self._retry = _RetryCfg(
|
69
110
|
attempts=int(retry_attempts),
|
@@ -72,29 +113,41 @@ class ArtifactUpdaterMultiWrapperAsync(ManagedResource):
|
|
72
113
|
jitter=float(backoff_jitter),
|
73
114
|
)
|
74
115
|
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
116
|
+
# Safe kwargs for artifacts
|
117
|
+
if self.use_dask:
|
118
|
+
self.artifact_class_kwargs = {
|
119
|
+
"debug": self.debug,
|
120
|
+
"verbose": self.verbose,
|
121
|
+
**(artifact_class_kwargs or {}),
|
122
|
+
}
|
123
|
+
else:
|
124
|
+
self.artifact_class_kwargs = {
|
125
|
+
"logger": self.logger,
|
126
|
+
"fs": self.fs,
|
127
|
+
"debug": self.debug,
|
128
|
+
"verbose": self.verbose,
|
129
|
+
**(artifact_class_kwargs or {}),
|
130
|
+
}
|
79
131
|
|
80
132
|
self.completion_secs: Dict[str, float] = {}
|
81
133
|
self.failed: List[str] = []
|
82
|
-
|
83
|
-
# NEW: async stop gate — tripped on cleanup/cancel
|
84
134
|
self._stop = asyncio.Event()
|
85
135
|
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
136
|
+
if self.use_dask and Client is None:
|
137
|
+
raise RuntimeError("Dask is not installed, cannot use Dask mode")
|
138
|
+
|
139
|
+
# auto-start local client if requested
|
140
|
+
if self.use_dask and not self.client:
|
141
|
+
self.client = Client(
|
142
|
+
LocalCluster(
|
143
|
+
n_workers=max_workers,
|
144
|
+
threads_per_worker=1,
|
145
|
+
dashboard_address=None,
|
146
|
+
)
|
147
|
+
)
|
148
|
+
self._owns_client = True
|
96
149
|
|
97
|
-
# ----
|
150
|
+
# ---- Internals ------------------------------------------------------------
|
98
151
|
|
99
152
|
def _classes_for(self, period: str) -> List[Type]:
|
100
153
|
try:
|
@@ -102,7 +155,7 @@ class ArtifactUpdaterMultiWrapperAsync(ManagedResource):
|
|
102
155
|
except KeyError:
|
103
156
|
raise ValueError(f"Unsupported period '{period}'.")
|
104
157
|
if not classes:
|
105
|
-
raise ValueError(f"No artifact classes configured for
|
158
|
+
raise ValueError(f"No artifact classes configured for '{period}'.")
|
106
159
|
if self.priority_fn:
|
107
160
|
try:
|
108
161
|
classes.sort(key=self.priority_fn)
|
@@ -110,120 +163,106 @@ class ArtifactUpdaterMultiWrapperAsync(ManagedResource):
|
|
110
163
|
self.logger.warning(f"priority_fn failed; using listed order: {e}")
|
111
164
|
return classes
|
112
165
|
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
166
|
+
def _submit_one_dask(self, cls: Type, period: str, artifact_kwargs: Dict[str, Any]):
|
167
|
+
return self.client.submit(
|
168
|
+
run_artifact_update,
|
169
|
+
cls,
|
170
|
+
dict(self.artifact_class_kwargs),
|
171
|
+
self._retry,
|
172
|
+
period,
|
173
|
+
artifact_kwargs,
|
174
|
+
pure=False,
|
175
|
+
)
|
123
176
|
|
124
|
-
async def
|
177
|
+
async def _run_one_async(
|
178
|
+
self,
|
179
|
+
cls: Type,
|
180
|
+
period: str,
|
181
|
+
sem: asyncio.Semaphore,
|
182
|
+
artifact_kwargs: Dict[str, Any],
|
183
|
+
) -> Dict[str, Any]:
|
184
|
+
"""Async/threaded fallback execution."""
|
125
185
|
name = cls.__name__
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
def _sync_block() -> None:
|
138
|
-
with ExitStack() as stack:
|
139
|
-
inst = cls(**self.artifact_class_kwargs)
|
140
|
-
inst = stack.enter_context(inst)
|
141
|
-
inst.update_parquet(period=period, **artifact_kwargs)
|
142
|
-
|
143
|
-
await asyncio.wait_for(asyncio.to_thread(_sync_block), timeout=self.update_timeout_seconds)
|
144
|
-
dt_secs = loop.time() - start
|
145
|
-
self.completion_secs[name] = dt_secs
|
146
|
-
self.logger.info(f"✅ {name} ({period}) in {dt_secs:.2f}s")
|
147
|
-
return
|
148
|
-
|
149
|
-
except asyncio.TimeoutError:
|
150
|
-
self.logger.warning(f"Timeout in {name} attempt {attempt}/{self._retry.attempts}")
|
151
|
-
except asyncio.CancelledError:
|
152
|
-
raise
|
153
|
-
except Exception as e:
|
154
|
-
self.logger.error(
|
155
|
-
f"{name} attempt {attempt}/{self._retry.attempts} failed: {e}",
|
156
|
-
exc_info=self.debug,
|
157
|
-
)
|
158
|
-
|
159
|
-
if attempt < self._retry.attempts and not self._stop.is_set():
|
160
|
-
delay = min(self._retry.backoff_base ** (attempt - 1), self._retry.backoff_max)
|
161
|
-
delay *= 1 + random.uniform(0, self._retry.jitter)
|
186
|
+
self.logger.info(f"▶️ Starting {name} for period '{period}'")
|
187
|
+
start_wall = datetime.datetime.now()
|
188
|
+
|
189
|
+
attempt_count = 0
|
190
|
+
success = False
|
191
|
+
error_msg = None
|
192
|
+
|
193
|
+
try:
|
194
|
+
async with sem:
|
195
|
+
for attempt in range(1, self._retry.attempts + 1):
|
196
|
+
attempt_count = attempt
|
162
197
|
try:
|
163
|
-
|
164
|
-
|
165
|
-
|
198
|
+
def _sync_block():
|
199
|
+
with ExitStack() as stack:
|
200
|
+
inst = cls(**self.artifact_class_kwargs)
|
201
|
+
inst = stack.enter_context(inst)
|
202
|
+
inst.update_parquet(period=period, **artifact_kwargs)
|
203
|
+
|
204
|
+
await asyncio.wait_for(
|
205
|
+
asyncio.to_thread(_sync_block),
|
206
|
+
timeout=self.update_timeout_seconds,
|
207
|
+
)
|
208
|
+
success = True
|
209
|
+
break
|
210
|
+
except Exception as e:
|
211
|
+
error_msg = str(e)
|
212
|
+
if attempt < self._retry.attempts and not self._stop.is_set():
|
213
|
+
delay = min(
|
214
|
+
self._retry.backoff_base ** (attempt - 1),
|
215
|
+
self._retry.backoff_max,
|
216
|
+
)
|
217
|
+
delay *= 1 + random.uniform(0, self._retry.jitter)
|
218
|
+
await asyncio.sleep(delay)
|
219
|
+
finally:
|
220
|
+
end_wall = datetime.datetime.now()
|
221
|
+
duration = (end_wall - start_wall).total_seconds()
|
166
222
|
|
167
|
-
|
168
|
-
|
223
|
+
result = {
|
224
|
+
"artifact": name,
|
225
|
+
"period": period,
|
226
|
+
"start": start_wall.isoformat(),
|
227
|
+
"end": end_wall.isoformat(),
|
228
|
+
"processing_time": duration,
|
229
|
+
"retries": attempt_count - 1 if success else attempt_count,
|
230
|
+
"success": success,
|
231
|
+
"error": error_msg,
|
232
|
+
}
|
233
|
+
|
234
|
+
if success:
|
235
|
+
self.logger.info(f"✅ Artifact {name} succeeded", extra=result)
|
236
|
+
self.completion_secs[name] = duration
|
237
|
+
else:
|
238
|
+
self.logger.error(f"❌ Artifact {name} failed", extra=result)
|
239
|
+
self.failed.append(name)
|
169
240
|
|
170
|
-
|
241
|
+
return result
|
171
242
|
|
172
|
-
|
173
|
-
"""
|
174
|
-
Backward-compatible:
|
175
|
-
- Accepts orchestrator knobs in kwargs (we consume them).
|
176
|
-
- Forwards only artifact-friendly kwargs to update_parquet.
|
177
|
-
"""
|
178
|
-
_, artifact_kwargs = self._split_kwargs(kwargs)
|
243
|
+
# ---- Public API -----------------------------------------------------------
|
179
244
|
|
245
|
+
async def update_data(self, period: str, **kwargs: Any) -> List[Dict[str, Any]]:
|
180
246
|
self.completion_secs.clear()
|
181
247
|
self.failed.clear()
|
182
|
-
|
183
248
|
classes = self._classes_for(period)
|
184
|
-
self.logger.info(
|
185
|
-
f"Starting update of {len(classes)} artifacts for period '{period}'",
|
186
|
-
extra={
|
187
|
-
"action_module_name": self.__class__.__name__,
|
188
|
-
"date_of_update": time.strftime('%Y-%m-%d'),
|
189
|
-
"start_time": time.strftime('%H:%M:%S'),
|
190
|
-
"period": period,
|
191
|
-
},
|
192
|
-
)
|
193
|
-
|
194
|
-
sem = asyncio.Semaphore(self.max_workers)
|
195
|
-
tasks = [asyncio.create_task(self._run_one(cls, period, sem, dict(artifact_kwargs))) for cls in classes]
|
196
249
|
|
197
250
|
try:
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
251
|
+
if self.use_dask:
|
252
|
+
futures = [self._submit_one_dask(cls, period, kwargs) for cls in classes]
|
253
|
+
results = await asyncio.to_thread(lambda: self.client.gather(futures))
|
254
|
+
else:
|
255
|
+
sem = asyncio.Semaphore(self.max_workers)
|
256
|
+
tasks = [
|
257
|
+
asyncio.create_task(self._run_one_async(cls, period, sem, kwargs))
|
258
|
+
for cls in classes
|
259
|
+
]
|
260
|
+
results = await asyncio.gather(*tasks)
|
261
|
+
return results
|
207
262
|
finally:
|
208
|
-
#
|
209
|
-
|
210
|
-
|
211
|
-
t.cancel()
|
212
|
-
await asyncio.gather(*tasks, return_exceptions=True)
|
213
|
-
|
214
|
-
self.logger.info(
|
215
|
-
f"Update completed for period: {period}",
|
216
|
-
extra={
|
217
|
-
"action_module_name": self.__class__.__name__,
|
218
|
-
"date_of_update": datetime.date.today().strftime('%Y-%m-%d'),
|
219
|
-
"end_time": datetime.datetime.now().strftime('%H:%M:%S'),
|
220
|
-
"period": period,
|
221
|
-
},
|
222
|
-
)
|
223
|
-
self.logger.info(
|
224
|
-
f"Artifacts processed: total={len(classes)}, "
|
225
|
-
f"completed={len(self.completion_secs)}, failed={len(self.failed)}"
|
226
|
-
)
|
263
|
+
# only shut down if we own the client
|
264
|
+
if self._owns_client:
|
265
|
+
self.close()
|
227
266
|
|
228
267
|
def get_update_status(self) -> Dict[str, Any]:
|
229
268
|
done = set(self.completion_secs)
|
@@ -235,4 +274,19 @@ class ArtifactUpdaterMultiWrapperAsync(ManagedResource):
|
|
235
274
|
"failed": sorted(fail),
|
236
275
|
"pending": sorted(all_names - done - fail),
|
237
276
|
"completion_times": dict(self.completion_secs),
|
238
|
-
}
|
277
|
+
}
|
278
|
+
|
279
|
+
# ---- Lifecycle ------------------------------------------------------------
|
280
|
+
|
281
|
+
def _cleanup(self) -> None:
|
282
|
+
"""Release any resources created by this wrapper."""
|
283
|
+
if self._owns_client and self.client is not None:
|
284
|
+
try:
|
285
|
+
cluster = getattr(self.client, "cluster", None)
|
286
|
+
self.client.close()
|
287
|
+
if cluster is not None:
|
288
|
+
cluster.close()
|
289
|
+
finally:
|
290
|
+
self.client = None
|
291
|
+
self._owns_client = False
|
292
|
+
|