sibi-dst 2025.1.12__py3-none-any.whl → 2025.8.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. sibi_dst/__init__.py +7 -1
  2. sibi_dst/df_helper/_artifact_updater_multi_wrapper.py +235 -342
  3. sibi_dst/df_helper/_df_helper.py +417 -117
  4. sibi_dst/df_helper/_parquet_artifact.py +255 -283
  5. sibi_dst/df_helper/backends/parquet/_parquet_options.py +8 -4
  6. sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +68 -107
  7. sibi_dst/df_helper/backends/sqlalchemy/_db_gatekeeper.py +15 -0
  8. sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +105 -255
  9. sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +90 -42
  10. sibi_dst/df_helper/backends/sqlalchemy/_model_registry.py +192 -0
  11. sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +122 -72
  12. sibi_dst/osmnx_helper/__init__.py +1 -0
  13. sibi_dst/osmnx_helper/basemaps/route_map_plotter.py +203 -0
  14. sibi_dst/osmnx_helper/route_path_builder.py +97 -0
  15. sibi_dst/osmnx_helper/utils.py +2 -0
  16. sibi_dst/utils/base.py +302 -96
  17. sibi_dst/utils/clickhouse_writer.py +472 -206
  18. sibi_dst/utils/data_utils.py +139 -186
  19. sibi_dst/utils/data_wrapper.py +317 -73
  20. sibi_dst/utils/date_utils.py +1 -0
  21. sibi_dst/utils/df_utils.py +193 -213
  22. sibi_dst/utils/file_utils.py +3 -2
  23. sibi_dst/utils/filepath_generator.py +314 -152
  24. sibi_dst/utils/log_utils.py +581 -242
  25. sibi_dst/utils/manifest_manager.py +60 -76
  26. sibi_dst/utils/parquet_saver.py +33 -27
  27. sibi_dst/utils/phone_formatter.py +88 -95
  28. sibi_dst/utils/update_planner.py +180 -178
  29. sibi_dst/utils/webdav_client.py +116 -166
  30. {sibi_dst-2025.1.12.dist-info → sibi_dst-2025.8.1.dist-info}/METADATA +1 -1
  31. {sibi_dst-2025.1.12.dist-info → sibi_dst-2025.8.1.dist-info}/RECORD +32 -28
  32. {sibi_dst-2025.1.12.dist-info → sibi_dst-2025.8.1.dist-info}/WHEEL +0 -0
sibi_dst/df_helper/_artifact_updater_multi_wrapper.py
@@ -1,422 +1,315 @@
+from __future__ import annotations
+
 import time
 from concurrent.futures import ThreadPoolExecutor, as_completed
-from typing import Any, Callable, Dict, List, Optional, Type
+from dataclasses import dataclass
+from typing import Any, Dict

 from sibi_dst.utils import ManagedResource

+
+@dataclass(slots=True)
+class _RetryCfg:
+    attempts: int = 3
+    backoff_base: float = 2.0
+    backoff_max: float = 60.0
+    jitter: float = 0.15
+
+
+_ORCHESTRATOR_KEYS = {
+    "retry_attempts",
+    "backoff_base",
+    "backoff_max",
+    "backoff_jitter",
+    "update_timeout_seconds",  # accepted but unused in pure-threads version
+    "max_workers",
+    "priority_fn",
+    "artifact_class_kwargs",
+}
+
+
+def _default_artifact_kwargs(resource: ManagedResource) -> Dict[str, Any]:
+    return {
+        "logger": resource.logger,
+        "debug": resource.debug,
+        "fs": resource.fs,
+        "verbose": resource.verbose,
+    }
+
+
 class ArtifactUpdaterMultiWrapperThreaded(ManagedResource):
     """
-    Updates artifacts concurrently using a ThreadPoolExecutor.
-
-    This version is refactored for a pure multi-threaded environment, aligning
-    the orchestration model with the underlying threaded workers (DataWrapper).
+    Backward-compatible threaded orchestrator.
     """
-    wrapped_classes: Dict[str, List[Type]]
+
     def __init__(
-        self,
-        wrapped_classes: Dict[str, List[Type]],
-        *,
-        max_workers: int = 4,
-        retry_attempts: int = 3,
-        backoff_base: int = 2,
-        backoff_max: int = 60,
-        backoff_jitter: float = 0.1,
-        priority_fn: Optional[Callable[[Type], int]] = None,
-        artifact_class_kwargs: Optional[Dict[str, Any]] = None,
-        **kwargs: Dict[str, Any]
+        self,
+        wrapped_classes: Dict[str, Sequence[Type]],
+        *,
+        max_workers: int = 4,
+        retry_attempts: int = 3,
+        backoff_base: float = 2.0,
+        backoff_max: float = 60.0,
+        backoff_jitter: float = 0.15,
+        priority_fn: Optional[Callable[[Type], int]] = None,
+        artifact_class_kwargs: Optional[Dict[str, Any]] = None,
+        **kwargs: Any,
     ) -> None:
         super().__init__(**kwargs)
         self.wrapped_classes = wrapped_classes
-        self.max_workers = max_workers
-        self.retry_attempts = retry_attempts
-        self.backoff_base = backoff_base
-        self.backoff_max = backoff_max
-        self.backoff_jitter = backoff_jitter
+        self.max_workers = int(max_workers)
         self.priority_fn = priority_fn
-        # Default artifact init kwargs
-        today = datetime.datetime.today() + datetime.timedelta(days=1)
-        default_kwargs = {
-            'parquet_start_date': today.strftime('%Y-%m-%d'),
-            'parquet_end_date': today.strftime('%Y-%m-%d'),
-            'logger': self.logger,
-            'debug': self.debug,
-            'fs': self.fs,
-            'verbose': self.verbose,
+        self._retry = _RetryCfg(
+            attempts=int(retry_attempts),
+            backoff_base=float(backoff_base),
+            backoff_max=float(backoff_max),
+            jitter=float(backoff_jitter),
+        )
+        self.artifact_class_kwargs = {
+            **_default_artifact_kwargs(self),
+            **(artifact_class_kwargs or {}),
         }
-        self.artifact_class_kwargs = artifact_class_kwargs or default_kwargs.copy()
-
-        # State tracking
-        self.completion_times: Dict[str, float] = {}
+        self.completion_secs: Dict[str, float] = {}
         self.failed: List[str] = []
-        self.original_classes: List[Type] = []
-        self.logger.info("ArtifactUpdaterMultiWrapperThreaded initialized")

-    def get_artifact_classes(self, data_type: str) -> List[Type]:
-        """Retrieve artifact classes by data type."""
-        self.logger.info(f"Fetching artifact classes for '{data_type}'")
-        classes = self.wrapped_classes.get(data_type)
+    def _classes_for(self, period: str) -> List[Type]:
+        try:
+            classes = list(self.wrapped_classes[period])
+        except KeyError:
+            raise ValueError(f"Unsupported period '{period}'.")
         if not classes:
-            raise ValueError(f"Unsupported data type: {data_type}")
-        self.logger.info(f"Found {len(classes)} artifact classes for '{data_type}'")
-        return classes
-
-    def estimate_priority(self, artifact_cls: Type) -> int:
-        """
-        Determines task priority. Lower values run first.
-        Note: This is a blocking call and will run sequentially before updates start.
-        """
-        name = artifact_cls.__name__
-        # Custom priority function takes precedence
+            raise ValueError(f"No artifact classes configured for period '{period}'.")
         if self.priority_fn:
             try:
-                return self.priority_fn(artifact_cls)
+                classes.sort(key=self.priority_fn)
             except Exception as e:
-                self.logger.warning(f"priority_fn error for {name}: {e}")
-
-        # # Fallback to size estimate if available
-        # if hasattr(artifact_cls, 'get_size_estimate'):
-        #     try:
-        #         # This performs blocking I/O
-        #         return artifact_cls(**self.artifact_class_kwargs).get_size_estimate()
-        #
-        #     except Exception as e:
-        #         self.logger.warning(f"get_size_estimate failed for {name}: {e}")
-
-        # Default priority
-        return 999
-
-    def _update_artifact_with_retry(self, artifact_cls: Type, update_kwargs: Dict[str, Any]) -> str:
-        """
-        A blocking worker function that handles instantiation, update, and retries for a single artifact.
-        This function is designed to be run in a ThreadPoolExecutor.
-        """
-        name = artifact_cls.__name__
-        self.logger.debug(f"Worker thread starting update for {name}")
+                self.logger.warning(f"priority_fn failed; using listed order: {e}")
+        return classes

-        for attempt in range(1, self.retry_attempts + 1):
+    @staticmethod
+    def _split_kwargs(raw: Dict[str, Any]) -> tuple[Dict[str, Any], Dict[str, Any]]:
+        orch: Dict[str, Any] = {}
+        art: Dict[str, Any] = {}
+        for k, v in raw.items():
+            if k in _ORCHESTRATOR_KEYS:
+                orch[k] = v
+            else:
+                art[k] = v
+        return orch, art
+
+    def _run_one(self, cls: Type, period: str, artifact_kwargs: Dict[str, Any]) -> str:
+        name = cls.__name__
+        start = time.monotonic()
+        for attempt in range(1, self._retry.attempts + 1):
             try:
-                # Instantiate and update directly within the worker thread
-                artifact_instance = artifact_cls(**self.artifact_class_kwargs)
-                artifact_instance.update_parquet(**update_kwargs)
-
-                self.logger.info(f"✅ {name} updated successfully on attempt {attempt}")
-                return name  # Return the name on success
-
+                with ExitStack() as stack:
+                    inst = cls(**self.artifact_class_kwargs)
+                    inst = stack.enter_context(inst)
+                    inst.update_parquet(period=period, **artifact_kwargs)
+                self.completion_secs[name] = time.monotonic() - start
+                return name
             except Exception as e:
-                self.logger.error(f"Error on {name} attempt {attempt}/{self.retry_attempts}: {e}", exc_info=self.debug)
-                if attempt < self.retry_attempts:
-                    delay = min(self.backoff_base ** (attempt - 1), self.backoff_max)
-                    delay *= 1 + random.uniform(0, self.backoff_jitter)
-                    self.logger.info(f"Sleeping {delay:.1f}s before retrying {name}")
+                if attempt < self._retry.attempts:
+                    delay = min(self._retry.backoff_base ** (attempt - 1), self._retry.backoff_max)
+                    delay *= 1 + random.uniform(0, self._retry.jitter)
                     time.sleep(delay)
+                else:
+                    raise RuntimeError(f"{name} failed after {self._retry.attempts} attempts: {e}") from e

-        # If all retries fail, raise an exception to be caught by the main loop
-        raise RuntimeError(f"{name} failed after {self.retry_attempts} attempts.")
+    def update_data(self, period: str, **kwargs: Any) -> None:
+        # Split kwargs to preserve backward compatibility
+        _, artifact_kwargs = self._split_kwargs(kwargs)

-    async def update_data(self, data_type: str, **kwargs: Any) -> None:
-        """
-        Entry point to update all artifacts of a given type using a ThreadPoolExecutor.
-        """
-        self.logger.debug(f"Starting multi-threaded update for '{data_type}' with kwargs={kwargs}")
-
-        # Reset state for this run
-        self.completion_times.clear()
+        self.completion_secs.clear()
         self.failed.clear()
-        self.original_classes = self.get_artifact_classes(data_type)
-
-        # Sequentially estimate priorities and sort classes before execution
-        self.logger.debug("Estimating priorities to order tasks...")
-        ordered_classes = sorted(self.original_classes, key=self.estimate_priority)
-        self.logger.debug("Priority estimation complete. Submitting tasks to thread pool.")
-
-        start_time = time.monotonic()

-        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
-            future_to_class_name = {
-                executor.submit(self._update_artifact_with_retry, cls, kwargs): cls.__name__
-                for cls in ordered_classes
-            }
-
-            for future in as_completed(future_to_class_name):
-                name = future_to_class_name[future]
+        classes = self._classes_for(period)
+        with ThreadPoolExecutor(max_workers=self.max_workers) as pool:
+            fut2name = {pool.submit(self._run_one, cls, period, dict(artifact_kwargs)): cls.__name__ for cls in classes}
+            for fut in as_completed(fut2name):
+                name = fut2name[fut]
                 try:
-                    # result() will re-raise the exception from the worker if one occurred
-                    future.result()
-                    # If no exception, the task succeeded
-                    self.completion_times[name] = time.monotonic() - start_time
+                    fut.result()
+                    self.logger.info(f"✅ {name} ({period}) in {self.completion_secs[name]:.2f}s")
                 except Exception as e:
-                    self.logger.error(f"✖️ {name} permanently failed. See error log above.")
                     self.failed.append(name)
+                    self.logger.error(f"✖️ {name} permanently failed: {e}")

-        # Log final status
-        total = len(self.original_classes)
-        completed = len(self.completion_times)
-        failed_count = len(self.failed)
-        self.logger.info(f"All artifacts processed: total={total}, completed={completed}, failed={failed_count}")
+        self.logger.info(
+            f"Artifacts processed: total={len(classes)}, "
+            f"completed={len(self.completion_secs)}, failed={len(self.failed)}"
+        )

     def get_update_status(self) -> Dict[str, Any]:
-        """Returns a summary status including completion times."""
-        completed_set = set(self.completion_times.keys())
-        failed_set = set(self.failed)
-        pending_set = {cls.__name__ for cls in self.original_classes} - completed_set - failed_set
-
+        done = set(self.completion_secs)
+        fail = set(self.failed)
+        all_names = {c.__name__ for v in self.wrapped_classes.values() for c in v}
         return {
-            'total': len(self.original_classes),
-            'completed': list(completed_set),
-            'failed': list(failed_set),
-            'pending': list(pending_set),
-            'completion_times': self.completion_times,
+            "total": len(all_names),
+            "completed": sorted(done),
+            "failed": sorted(fail),
+            "pending": sorted(all_names - done - fail),
+            "completion_times": dict(self.completion_secs),
         }

-    @staticmethod
-    def format_status_table(status: Dict[str, Any]) -> str:
-        """Formats the status dictionary into a readable table."""
-        lines = [
-            f"Total: {status['total']}",
-            f"Completed: {len(status['completed'])}",
-            f"Failed: {len(status['failed'])}",
-            f"Pending: {len(status['pending'])}",
-            "\nPer-artifact completion times (seconds):"
-        ]
-        sorted_times = sorted(status['completion_times'].items(), key=lambda item: item[1], reverse=True)
-        for name, duration in sorted_times:
-            lines.append(f" - {name:<30}: {duration:.2f}s")
-        if status['failed']:
-            lines.append("\nFailed artifacts:")
-            for name in status['failed']:
-                lines.append(f" - {name}")
-        return "\n".join(lines)
-
-
 import asyncio
-import datetime
 import random
-from typing import Any, Callable, Dict, List, Optional, Type
+from contextlib import ExitStack
+from typing import Any, Callable, Dict, List, Optional, Sequence, Type

 class ArtifactUpdaterMultiWrapperAsync(ManagedResource):
     """
-    Simplified wrapper that updates artifacts concurrently using an asyncio.Semaphore.
-
-    Features:
-      - Caps concurrency at max_workers via semaphore
-      - Optionally prioritises tasks via a priority function or static method on artifact classes
-      - Tracks per-artifact completion times
-      - Configurable retry/backoff strategy
-      - Optional metrics integration
-      - Thread-safe within a single asyncio loop
-
-    Usage:
-        wrapper = ArtifactUpdaterMultiWrapper(
-            wrapped_classes={
-                'mydata': [DataArtifactA, DataArtifactB],
-            },
-            max_workers=4,
-            retry_attempts=3,
-            update_timeout_seconds=600,
-            backoff_base=2,
-            backoff_max=60,
-            backoff_jitter=0.1,
-            priority_fn=None,  # or custom
-            metrics_client=None,
-            debug=True,
-            logger=None,
-            artifact_class_kwargs={
-                'fs': my_fs,
-                'parquet_storage_path': 's3://bucket/data',
-                'logger': my_logger,
-                'debug': True,
-            }
-        )
-        await wrapper.update_data('mydata', period='ytd', overwrite=True)
+    Backward-compatible async orchestrator.
+
+    Public API preserved:
+      __init__(wrapped_classes, *, max_workers=..., retry_attempts=..., backoff_*=..., update_timeout_seconds=..., priority_fn=..., artifact_class_kwargs=..., **kwargs)
+      update_data(period, **kwargs) -> forwards only artifact-friendly kwargs to update_parquet
     """
+
     def __init__(
         self,
-        wrapped_classes: Dict[str, List[Type]],
+        wrapped_classes: Dict[str, Sequence[Type]],
         *,
         max_workers: int = 3,
         retry_attempts: int = 3,
         update_timeout_seconds: int = 600,
-        backoff_base: int = 2,
-        backoff_max: Optional[int] = 60,
-        backoff_jitter: float = 0.1,
+        backoff_base: float = 2.0,
+        backoff_max: float = 60.0,
+        backoff_jitter: float = 0.15,
         priority_fn: Optional[Callable[[Type], int]] = None,
-        metrics_client: Any = None,
         artifact_class_kwargs: Optional[Dict[str, Any]] = None,
-        **kwargs: Dict[str, Any]
+        **kwargs: Any,
     ) -> None:
         super().__init__(**kwargs)
         self.wrapped_classes = wrapped_classes
-        self.max_workers = max_workers
-        self.retry_attempts = retry_attempts
-        self.update_timeout_seconds = update_timeout_seconds
-        self.backoff_base = backoff_base
-        self.backoff_max = backoff_max
-        self.backoff_jitter = backoff_jitter
+        self.max_workers = int(max_workers)
+        self.update_timeout_seconds = int(update_timeout_seconds)
         self.priority_fn = priority_fn
-        self.metrics_client = metrics_client
-
-        # Default artifact init kwargs
-        today = datetime.datetime.today() + datetime.timedelta(days=1)
-        default_kwargs = {
-            'parquet_start_date': today.strftime('%Y-%m-%d'),
-            'parquet_end_date': today.strftime('%Y-%m-%d'),
-            'logger': self.logger,
-            'debug': self.debug,
-            'fs': self.fs,
-            'verbose': self.verbose,
+
+        self._retry = _RetryCfg(
+            attempts=int(retry_attempts),
+            backoff_base=float(backoff_base),
+            backoff_max=float(backoff_max),
+            jitter=float(backoff_jitter),
+        )
+
+        self.artifact_class_kwargs = {
+            **_default_artifact_kwargs(self),
+            **(artifact_class_kwargs or {}),
         }
-        self.artifact_class_kwargs = artifact_class_kwargs or default_kwargs.copy()

-        # State
-        self.completion_times: Dict[str, float] = {}
+        self.completion_secs: Dict[str, float] = {}
         self.failed: List[str] = []
-        self.original_classes: List[Type] = []
-        self.logger.info("ArtifactUpdaterMultiWrapperAsync initialized")

-    def get_artifact_classes(self, data_type: str) -> List[Type]:
-        """
-        Retrieve artifact classes by data type.
-        """
-        self.logger.info(f"Fetching artifact classes for '{data_type}'")
-        if data_type not in self.wrapped_classes:
-            raise ValueError(f"Unsupported data type: {data_type}")
-        classes = self.wrapped_classes[data_type]
-        self.logger.info(f"Found {len(classes)} artifact classes for '{data_type}'")
-        return classes
+    # ---- internals -----------------------------------------------------------

-    def estimate_priority(self, artifact_cls: Type) -> int:
-        """
-        Determine task priority for ordering. Lower values run first.
-        """
-        name = artifact_cls.__name__
+    def _classes_for(self, period: str) -> List[Type]:
+        try:
+            classes = list(self.wrapped_classes[period])
+        except KeyError:
+            raise ValueError(f"Unsupported period '{period}'.")
+        if not classes:
+            raise ValueError(f"No artifact classes configured for period '{period}'.")
         if self.priority_fn:
             try:
-                pr = self.priority_fn(artifact_cls)
-                self.logger.debug(f"priority_fn for {name}: {pr}")
-                return pr
+                classes.sort(key=self.priority_fn)
             except Exception as e:
-                self.logger.warning(f"priority_fn error for {name}: {e}")
-        try:
-            fs = self.artifact_class_kwargs.get('fs')
-            path = self.artifact_class_kwargs.get('parquet_storage_path')
-            pr=1
-            if hasattr(artifact_cls, 'get_size_estimate'):
-                pr = artifact_cls.get_size_estimate(fs, path)
-            self.logger.debug(f"Estimated priority for {name}: {pr}")
-            return pr
-        except Exception:
-            return 1
-
-    async def _bounded_update(self, artifact_cls: Type, sem: asyncio.Semaphore, **update_kwargs) -> None:
+                self.logger.warning(f"priority_fn failed; using listed order: {e}")
+        return classes
+
+    @staticmethod
+    def _split_kwargs(raw: Dict[str, Any]) -> tuple[Dict[str, Any], Dict[str, Any]]:
         """
-        Wrap update_artifact in a semaphore slot to limit concurrency.
+        Split kwargs into (orchestrator-only, artifact-forwarded).
+        Keeps backward compatibility: callers can pass all knobs in one dict.
         """
+        orch: Dict[str, Any] = {}
+        art: Dict[str, Any] = {}
+        for k, v in raw.items():
+            if k in _ORCHESTRATOR_KEYS:
+                orch[k] = v
+            else:
+                art[k] = v
+        return orch, art
+
+    async def _run_one(self, cls: Type, period: str, sem: asyncio.Semaphore, artifact_kwargs: Dict[str, Any]) -> None:
+        name = cls.__name__
         async with sem:
-            name = artifact_cls.__name__
-            start = asyncio.get_event_loop().time()
-            self.logger.info(f"Starting update for {name}")
-            try:
-                for attempt in range(1, self.retry_attempts + 1):
-                    try:
-                        artifact = await asyncio.to_thread(
-                            artifact_cls, **self.artifact_class_kwargs
-                        )
-                        await asyncio.wait_for(
-                            asyncio.to_thread(
-                                artifact.update_parquet, **update_kwargs
-                            ),
-                            timeout=self.update_timeout_seconds
-                        )
-                        duration = asyncio.get_event_loop().time() - start
-                        self.completion_times[name] = duration
-                        self.logger.info(f"✅ {name} updated in {duration:.2f}s (attempt {attempt})")
-                        if self.metrics_client:
-                            self.metrics_client.increment('task_succeeded')
-                        return
-                    except asyncio.TimeoutError:
-                        self.logger.warning(f"Timeout on {name}, attempt {attempt}")
-                    except Exception as e:
-                        self.logger.error(f"Error on {name} attempt {attempt}: {e}")
-
-                    delay = min(self.backoff_base ** (attempt - 1), self.backoff_max)
-                    delay *= 1 + random.uniform(0, self.backoff_jitter)
-                    self.logger.info(f"Sleeping {delay:.1f}s before retrying {name}")
+            start = asyncio.get_running_loop().time()
+            for attempt in range(1, self._retry.attempts + 1):
+                try:
+                    # Run sync context + method in thread
+                    def _sync_block() -> None:
+                        with ExitStack() as stack:
+                            inst = cls(**self.artifact_class_kwargs)
+                            inst = stack.enter_context(inst)
+                            inst.update_parquet(period=period, **artifact_kwargs)
+
+                    await asyncio.wait_for(
+                        asyncio.to_thread(_sync_block),
+                        timeout=self.update_timeout_seconds,
+                    )
+                    dt_secs = asyncio.get_running_loop().time() - start
+                    self.completion_secs[name] = dt_secs
+                    self.logger.info(f"✅ {name} ({period}) in {dt_secs:.2f}s")
+                    return
+
+                except asyncio.TimeoutError:
+                    self.logger.warning(f"Timeout in {name} attempt {attempt}/{self._retry.attempts}")
+                except Exception as e:
+                    self.logger.error(
+                        f"{name} attempt {attempt}/{self._retry.attempts} failed: {e}",
+                        exc_info=self.debug,
+                    )
+
+                if attempt < self._retry.attempts:
+                    delay = min(self._retry.backoff_base ** (attempt - 1), self._retry.backoff_max)
+                    delay *= 1 + random.uniform(0, self._retry.jitter)
                     await asyncio.sleep(delay)

-            except asyncio.CancelledError:
-                self.logger.warning(f"{name} update cancelled")
-                raise
-
-        # permanent failure
-        self.logger.error(f"✖️ {name} permanently failed after {self.retry_attempts} attempts")
-        if self.metrics_client:
-            self.metrics_client.increment('task_failed')
         self.failed.append(name)
+        self.logger.error(f"✖️ {name} permanently failed")
+
+    # ---- public API ----------------------------------------------------------

-    async def update_data(self, data_type: str, **kwargs: Any) -> None:
+    async def update_data(self, period: str, **kwargs: Any) -> None:
         """
-        Entry point to update all artifacts of a given type concurrently.
+        Backward-compatible:
+          - Accepts orchestrator knobs in kwargs (we consume them).
+          - Forwards only artifact-friendly kwargs to update_parquet.
         """
-        self.logger.info(f"Starting update_data for '{data_type}' with kwargs={kwargs}")
+        # split kwargs; ignore any runtime attempts to mutate orchestrator config mid-call
+        _, artifact_kwargs = self._split_kwargs(kwargs)

-        # RESET STATE
-        self.completion_times.clear()
+        self.completion_secs.clear()
         self.failed.clear()
-        self.original_classes = self.get_artifact_classes(data_type)
-
-        # NON-DESTRUCTIVE SORTING
-        ordered = sorted(self.original_classes, key=self.estimate_priority)

+        classes = self._classes_for(period)
         sem = asyncio.Semaphore(self.max_workers)
-        tasks = [
-            asyncio.create_task(self._bounded_update(cls, sem, **kwargs))
-            for cls in ordered
-        ]
+        tasks = [asyncio.create_task(self._run_one(cls, period, sem, dict(artifact_kwargs))) for cls in classes]

-        try:
-            for coro in asyncio.as_completed(tasks):
-                await coro
-        except asyncio.CancelledError:
-            self.logger.warning("update_data was cancelled—aborting remaining retries")
-            for t in tasks:
-                t.cancel()
-            raise
-        finally:
-            total = len(self.original_classes)
-            completed = len(self.completion_times)
-            failed = len(self.failed)
-            self.logger.info(f"All artifacts processed: total={total}, completed={completed}, failed={failed}")
+        for t in asyncio.as_completed(tasks):
+            try:
+                await t
+            except asyncio.CancelledError:
+                for rest in tasks:
+                    rest.cancel()
+                raise

-    def get_update_status(self) -> Dict[str, Any]:
-        """
-        Returns summary status including completion times.
-        """
-        total = len(self.original_classes)
-        completed = set(self.completion_times.keys())
-        failed = set(self.failed)
-        pending = {cls.__name__ for cls in self.original_classes} - completed - failed
+        self.logger.info(
+            f"Artifacts processed: total={len(classes)}, "
+            f"completed={len(self.completion_secs)}, failed={len(self.failed)}"
+        )

+    # Optional helper
+    def get_update_status(self) -> Dict[str, Any]:
+        done = set(self.completion_secs)
+        fail = set(self.failed)
+        all_names = {c.__name__ for v in self.wrapped_classes.values() for c in v}
         return {
-            'total': total,
-            'completed': list(completed),
-            'failed': list(failed),
-            'pending': list(pending),
-            'completion_times': self.completion_times,
-        }
-
-    @staticmethod
-    def format_status_table(status: Dict[str, Any]) -> str:
-        """
-        Formats the status dict into a readable table.
-        """
-        lines = [
-            f"Total: {status['total']}",
-            f"Completed: {len(status['completed'])} {status['completed']}",
-            f"Failed: {len(status['failed'])} {status['failed']}",
-            f"Pending: {len(status['pending'])} {status['pending']}",
-            "",
-            "Per-artifact timings:"
-        ]
-        for name, dur in status['completion_times'].items():
-            lines.append(f" {name}: {dur:.2f}s")
-        return "\n".join(lines)
+            "total": len(all_names),
+            "completed": sorted(done),
+            "failed": sorted(fail),
+            "pending": sorted(all_names - done - fail),
+            "completion_times": dict(self.completion_secs),
+        }
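
Below is a minimal usage sketch of the refactored orchestrator API, for orientation only; it is not part of the diff. It assumes the classes are imported directly from the module shown above (the package may also re-export them elsewhere), that ManagedResource supplies workable defaults for logger, debug, fs, and verbose when constructed without them, and it uses a hypothetical SalesArtifact class standing in for any artifact that accepts those kwargs, supports the context-manager protocol, and exposes update_parquet(period=..., **kwargs).

    from sibi_dst.df_helper._artifact_updater_multi_wrapper import (
        ArtifactUpdaterMultiWrapperThreaded,
    )


    class SalesArtifact:
        """Hypothetical artifact; any class with this interface fits the orchestrator."""

        def __init__(self, **kwargs):  # receives logger/debug/fs/verbose plus overrides
            self.kwargs = kwargs

        def __enter__(self):  # _run_one enters each instance through an ExitStack
            return self

        def __exit__(self, *exc):  # cleanup hook; returning False propagates errors
            return False

        def update_parquet(self, period, **kwargs):
            print(f"updating '{period}' with {kwargs}")


    wrapper = ArtifactUpdaterMultiWrapperThreaded(
        # In 2025.8.1 the wrapped_classes key doubles as the period name that
        # _classes_for() looks up and _run_one() passes to update_parquet().
        wrapped_classes={"ytd": [SalesArtifact]},
        max_workers=4,
        retry_attempts=3,
        backoff_base=2.0,  # retry delay: min(2.0 ** (attempt - 1), 60.0) * (1 + U(0, 0.15))
        backoff_max=60.0,  # i.e. roughly 1 s, then 2 s, between the three attempts
    )
    wrapper.update_data("ytd", overwrite=True)  # non-orchestrator kwargs reach update_parquet
    print(wrapper.get_update_status())

The async wrapper exposes the same surface as a coroutine, so the last two calls become await wrapper.update_data("ytd", overwrite=True) inside an event loop, with each attempt additionally bounded by update_timeout_seconds.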