sibi-dst 2025.9.9__py3-none-any.whl → 2025.9.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -6,10 +6,16 @@ import random
6
6
  import time
7
7
  from contextlib import ExitStack
8
8
  from dataclasses import dataclass
9
- from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Type
9
+ from typing import Any, Callable, Dict, List, Optional, Sequence, Type
10
10
 
11
11
  from sibi_dst.utils import ManagedResource
12
12
 
13
+ try:
14
+ from dask.distributed import Client, LocalCluster
15
+ except ImportError:
16
+ Client = None
17
+ LocalCluster = None
18
+
13
19
 
14
20
  @dataclass(slots=True)
15
21
  class _RetryCfg:
@@ -19,30 +25,59 @@ class _RetryCfg:
19
25
  jitter: float = 0.15
20
26
 
21
27
 
22
- _ORCHESTRATOR_KEYS = {
23
- "retry_attempts",
24
- "backoff_base",
25
- "backoff_max",
26
- "backoff_jitter",
27
- "update_timeout_seconds",
28
- "max_workers",
29
- "priority_fn",
30
- "artifact_class_kwargs",
31
- }
28
+ # ---------------- Worker (safe for Dask pickling) ----------------
29
+ def run_artifact_update(
30
+ cls: Type,
31
+ artifact_class_kwargs: Dict[str, Any],
32
+ retry: _RetryCfg,
33
+ period: str,
34
+ artifact_kwargs: Dict[str, Any],
35
+ ) -> Dict[str, Any]:
36
+ """Standalone worker — safe for Dask distributed execution."""
37
+ import logging
38
+
39
+ logger = logging.getLogger(cls.__name__)
32
40
 
41
+ start_wall = datetime.datetime.now()
42
+ attempt_count = 0
43
+ success = False
44
+ error_msg = None
45
+
46
+ for attempt in range(1, retry.attempts + 1):
47
+ attempt_count = attempt
48
+ try:
49
+ with ExitStack() as stack:
50
+ inst = cls(**artifact_class_kwargs)
51
+ inst = stack.enter_context(inst)
52
+ inst.update_parquet(period=period, **artifact_kwargs)
53
+ success = True
54
+ break
55
+ except Exception as e:
56
+ error_msg = str(e)
57
+ if attempt < retry.attempts:
58
+ delay = min(retry.backoff_base ** (attempt - 1), retry.backoff_max)
59
+ delay *= 1 + random.uniform(0, retry.jitter)
60
+ time.sleep(delay)
61
+
62
+ end_wall = datetime.datetime.now()
63
+ duration = (end_wall - start_wall).total_seconds()
33
64
 
34
- def _default_artifact_kwargs(resource: ManagedResource) -> Dict[str, Any]:
35
65
  return {
36
- "logger": resource.logger,
37
- "debug": resource.debug,
38
- "fs": resource.fs,
39
- "verbose": resource.verbose,
66
+ "artifact": cls.__name__,
67
+ "period": period,
68
+ "start": start_wall.isoformat(),
69
+ "end": end_wall.isoformat(),
70
+ "processing_time": duration,
71
+ "retries": attempt_count - 1 if success else attempt_count,
72
+ "success": success,
73
+ "error": error_msg,
40
74
  }
41
75
 
42
76
 
43
77
  class ArtifactUpdaterMultiWrapperAsync(ManagedResource):
44
78
  """
45
- Backward-compatible async orchestrator with shutdown-aware scheduling.
79
+ Async/Threaded orchestrator.
80
+ Dask-enabled if a Client is passed (or created automatically).
46
81
  """
47
82
 
48
83
  def __init__(
@@ -57,13 +92,19 @@ class ArtifactUpdaterMultiWrapperAsync(ManagedResource):
57
92
  backoff_jitter: float = 0.15,
58
93
  priority_fn: Optional[Callable[[Type], int]] = None,
59
94
  artifact_class_kwargs: Optional[Dict[str, Any]] = None,
95
+ dask_client: Optional[Client] = None,
96
+ use_dask: bool = True,
60
97
  **kwargs: Any,
61
98
  ) -> None:
62
99
  super().__init__(**kwargs)
100
+
63
101
  self.wrapped_classes = wrapped_classes
64
102
  self.max_workers = int(max_workers)
65
103
  self.update_timeout_seconds = int(update_timeout_seconds)
66
104
  self.priority_fn = priority_fn
105
+ self.use_dask = use_dask
106
+ self.client: Optional[Client] = dask_client
107
+ self._owns_client = False
67
108
 
68
109
  self._retry = _RetryCfg(
69
110
  attempts=int(retry_attempts),
@@ -72,29 +113,41 @@ class ArtifactUpdaterMultiWrapperAsync(ManagedResource):
72
113
  jitter=float(backoff_jitter),
73
114
  )
74
115
 
75
- self.artifact_class_kwargs = {
76
- **_default_artifact_kwargs(self),
77
- **(artifact_class_kwargs or {}),
78
- }
116
+ # Safe kwargs for artifacts
117
+ if self.use_dask:
118
+ self.artifact_class_kwargs = {
119
+ "debug": self.debug,
120
+ "verbose": self.verbose,
121
+ **(artifact_class_kwargs or {}),
122
+ }
123
+ else:
124
+ self.artifact_class_kwargs = {
125
+ "logger": self.logger,
126
+ "fs": self.fs,
127
+ "debug": self.debug,
128
+ "verbose": self.verbose,
129
+ **(artifact_class_kwargs or {}),
130
+ }
79
131
 
80
132
  self.completion_secs: Dict[str, float] = {}
81
133
  self.failed: List[str] = []
82
-
83
- # NEW: async stop gate — tripped on cleanup/cancel
84
134
  self._stop = asyncio.Event()
85
135
 
86
- # Trip stop gate on close paths
87
- def _cleanup(self) -> None:
88
- try:
89
- loop = asyncio.get_running_loop()
90
- loop.call_soon_threadsafe(self._stop.set)
91
- except RuntimeError:
92
- self._stop.set()
93
-
94
- async def _acleanup(self) -> None:
95
- self._stop.set()
136
+ if self.use_dask and Client is None:
137
+ raise RuntimeError("Dask is not installed, cannot use Dask mode")
138
+
139
+ # auto-start local client if requested
140
+ if self.use_dask and not self.client:
141
+ self.client = Client(
142
+ LocalCluster(
143
+ n_workers=max_workers,
144
+ threads_per_worker=1,
145
+ dashboard_address=None,
146
+ )
147
+ )
148
+ self._owns_client = True
96
149
 
97
- # ---- internals -----------------------------------------------------------
150
+ # ---- Internals ------------------------------------------------------------
98
151
 
99
152
  def _classes_for(self, period: str) -> List[Type]:
100
153
  try:
@@ -102,7 +155,7 @@ class ArtifactUpdaterMultiWrapperAsync(ManagedResource):
102
155
  except KeyError:
103
156
  raise ValueError(f"Unsupported period '{period}'.")
104
157
  if not classes:
105
- raise ValueError(f"No artifact classes configured for period '{period}'.")
158
+ raise ValueError(f"No artifact classes configured for '{period}'.")
106
159
  if self.priority_fn:
107
160
  try:
108
161
  classes.sort(key=self.priority_fn)
@@ -110,120 +163,106 @@ class ArtifactUpdaterMultiWrapperAsync(ManagedResource):
110
163
  self.logger.warning(f"priority_fn failed; using listed order: {e}")
111
164
  return classes
112
165
 
113
- @staticmethod
114
- def _split_kwargs(raw: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]:
115
- orch: Dict[str, Any] = {}
116
- art: Dict[str, Any] = {}
117
- for k, v in raw.items():
118
- if k in _ORCHESTRATOR_KEYS:
119
- orch[k] = v
120
- else:
121
- art[k] = v
122
- return orch, art
166
+ def _submit_one_dask(self, cls: Type, period: str, artifact_kwargs: Dict[str, Any]):
167
+ return self.client.submit(
168
+ run_artifact_update,
169
+ cls,
170
+ dict(self.artifact_class_kwargs),
171
+ self._retry,
172
+ period,
173
+ artifact_kwargs,
174
+ pure=False,
175
+ )
123
176
 
124
- async def _run_one(self, cls: Type, period: str, sem: asyncio.Semaphore, artifact_kwargs: Dict[str, Any]) -> None:
177
+ async def _run_one_async(
178
+ self,
179
+ cls: Type,
180
+ period: str,
181
+ sem: asyncio.Semaphore,
182
+ artifact_kwargs: Dict[str, Any],
183
+ ) -> Dict[str, Any]:
184
+ """Async/threaded fallback execution."""
125
185
  name = cls.__name__
126
- if self._stop.is_set() or self.closed:
127
- raise asyncio.CancelledError()
128
-
129
- self.logger.info(f"Running {name} with period '{period}'", extra={"artifact": name, "period": period})
130
- async with sem:
131
- loop = asyncio.get_running_loop()
132
- start = loop.time()
133
- for attempt in range(1, self._retry.attempts + 1):
134
- if self._stop.is_set() or self.closed:
135
- raise asyncio.CancelledError()
136
- try:
137
- def _sync_block() -> None:
138
- with ExitStack() as stack:
139
- inst = cls(**self.artifact_class_kwargs)
140
- inst = stack.enter_context(inst)
141
- inst.update_parquet(period=period, **artifact_kwargs)
142
-
143
- await asyncio.wait_for(asyncio.to_thread(_sync_block), timeout=self.update_timeout_seconds)
144
- dt_secs = loop.time() - start
145
- self.completion_secs[name] = dt_secs
146
- self.logger.info(f"✅ {name} ({period}) in {dt_secs:.2f}s")
147
- return
148
-
149
- except asyncio.TimeoutError:
150
- self.logger.warning(f"Timeout in {name} attempt {attempt}/{self._retry.attempts}")
151
- except asyncio.CancelledError:
152
- raise
153
- except Exception as e:
154
- self.logger.error(
155
- f"{name} attempt {attempt}/{self._retry.attempts} failed: {e}",
156
- exc_info=self.debug,
157
- )
158
-
159
- if attempt < self._retry.attempts and not self._stop.is_set():
160
- delay = min(self._retry.backoff_base ** (attempt - 1), self._retry.backoff_max)
161
- delay *= 1 + random.uniform(0, self._retry.jitter)
186
+ self.logger.info(f"▶️ Starting {name} for period '{period}'")
187
+ start_wall = datetime.datetime.now()
188
+
189
+ attempt_count = 0
190
+ success = False
191
+ error_msg = None
192
+
193
+ try:
194
+ async with sem:
195
+ for attempt in range(1, self._retry.attempts + 1):
196
+ attempt_count = attempt
162
197
  try:
163
- await asyncio.sleep(delay)
164
- except asyncio.CancelledError:
165
- raise
198
+ def _sync_block():
199
+ with ExitStack() as stack:
200
+ inst = cls(**self.artifact_class_kwargs)
201
+ inst = stack.enter_context(inst)
202
+ inst.update_parquet(period=period, **artifact_kwargs)
203
+
204
+ await asyncio.wait_for(
205
+ asyncio.to_thread(_sync_block),
206
+ timeout=self.update_timeout_seconds,
207
+ )
208
+ success = True
209
+ break
210
+ except Exception as e:
211
+ error_msg = str(e)
212
+ if attempt < self._retry.attempts and not self._stop.is_set():
213
+ delay = min(
214
+ self._retry.backoff_base ** (attempt - 1),
215
+ self._retry.backoff_max,
216
+ )
217
+ delay *= 1 + random.uniform(0, self._retry.jitter)
218
+ await asyncio.sleep(delay)
219
+ finally:
220
+ end_wall = datetime.datetime.now()
221
+ duration = (end_wall - start_wall).total_seconds()
166
222
 
167
- self.failed.append(name)
168
- self.logger.error(f"✖️ {name} permanently failed")
223
+ result = {
224
+ "artifact": name,
225
+ "period": period,
226
+ "start": start_wall.isoformat(),
227
+ "end": end_wall.isoformat(),
228
+ "processing_time": duration,
229
+ "retries": attempt_count - 1 if success else attempt_count,
230
+ "success": success,
231
+ "error": error_msg,
232
+ }
233
+
234
+ if success:
235
+ self.logger.info(f"✅ Artifact {name} succeeded", extra=result)
236
+ self.completion_secs[name] = duration
237
+ else:
238
+ self.logger.error(f"❌ Artifact {name} failed", extra=result)
239
+ self.failed.append(name)
169
240
 
170
- # ---- public API ----------------------------------------------------------
241
+ return result
171
242
 
172
- async def update_data(self, period: str, **kwargs: Any) -> None:
173
- """
174
- Backward-compatible:
175
- - Accepts orchestrator knobs in kwargs (we consume them).
176
- - Forwards only artifact-friendly kwargs to update_parquet.
177
- """
178
- _, artifact_kwargs = self._split_kwargs(kwargs)
243
+ # ---- Public API -----------------------------------------------------------
179
244
 
245
+ async def update_data(self, period: str, **kwargs: Any) -> List[Dict[str, Any]]:
180
246
  self.completion_secs.clear()
181
247
  self.failed.clear()
182
-
183
248
  classes = self._classes_for(period)
184
- self.logger.info(
185
- f"Starting update of {len(classes)} artifacts for period '{period}'",
186
- extra={
187
- "action_module_name": self.__class__.__name__,
188
- "date_of_update": time.strftime('%Y-%m-%d'),
189
- "start_time": time.strftime('%H:%M:%S'),
190
- "period": period,
191
- },
192
- )
193
-
194
- sem = asyncio.Semaphore(self.max_workers)
195
- tasks = [asyncio.create_task(self._run_one(cls, period, sem, dict(artifact_kwargs))) for cls in classes]
196
249
 
197
250
  try:
198
- for t in asyncio.as_completed(tasks):
199
- if self._stop.is_set():
200
- break
201
- await t
202
- except (asyncio.CancelledError, KeyboardInterrupt):
203
- self._stop.set()
204
- for t in tasks:
205
- t.cancel()
206
- raise
251
+ if self.use_dask:
252
+ futures = [self._submit_one_dask(cls, period, kwargs) for cls in classes]
253
+ results = await asyncio.to_thread(lambda: self.client.gather(futures))
254
+ else:
255
+ sem = asyncio.Semaphore(self.max_workers)
256
+ tasks = [
257
+ asyncio.create_task(self._run_one_async(cls, period, sem, kwargs))
258
+ for cls in classes
259
+ ]
260
+ results = await asyncio.gather(*tasks)
261
+ return results
207
262
  finally:
208
- # Drain/cancel everything deterministically
209
- for t in tasks:
210
- if not t.done():
211
- t.cancel()
212
- await asyncio.gather(*tasks, return_exceptions=True)
213
-
214
- self.logger.info(
215
- f"Update completed for period: {period}",
216
- extra={
217
- "action_module_name": self.__class__.__name__,
218
- "date_of_update": datetime.date.today().strftime('%Y-%m-%d'),
219
- "end_time": datetime.datetime.now().strftime('%H:%M:%S'),
220
- "period": period,
221
- },
222
- )
223
- self.logger.info(
224
- f"Artifacts processed: total={len(classes)}, "
225
- f"completed={len(self.completion_secs)}, failed={len(self.failed)}"
226
- )
263
+ # only shut down if we own the client
264
+ if self._owns_client:
265
+ self.close()
227
266
 
228
267
  def get_update_status(self) -> Dict[str, Any]:
229
268
  done = set(self.completion_secs)
@@ -235,4 +274,19 @@ class ArtifactUpdaterMultiWrapperAsync(ManagedResource):
235
274
  "failed": sorted(fail),
236
275
  "pending": sorted(all_names - done - fail),
237
276
  "completion_times": dict(self.completion_secs),
238
- }
277
+ }
278
+
279
+ # ---- Lifecycle ------------------------------------------------------------
280
+
281
+ def _cleanup(self) -> None:
282
+ """Release any resources created by this wrapper."""
283
+ if self._owns_client and self.client is not None:
284
+ try:
285
+ cluster = getattr(self.client, "cluster", None)
286
+ self.client.close()
287
+ if cluster is not None:
288
+ cluster.close()
289
+ finally:
290
+ self.client = None
291
+ self._owns_client = False
292
+