sibi-dst 2025.1.12__py3-none-any.whl → 2025.8.1__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only.
- sibi_dst/__init__.py +7 -1
- sibi_dst/df_helper/_artifact_updater_multi_wrapper.py +235 -342
- sibi_dst/df_helper/_df_helper.py +417 -117
- sibi_dst/df_helper/_parquet_artifact.py +255 -283
- sibi_dst/df_helper/backends/parquet/_parquet_options.py +8 -4
- sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +68 -107
- sibi_dst/df_helper/backends/sqlalchemy/_db_gatekeeper.py +15 -0
- sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +105 -255
- sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +90 -42
- sibi_dst/df_helper/backends/sqlalchemy/_model_registry.py +192 -0
- sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +122 -72
- sibi_dst/osmnx_helper/__init__.py +1 -0
- sibi_dst/osmnx_helper/basemaps/route_map_plotter.py +203 -0
- sibi_dst/osmnx_helper/route_path_builder.py +97 -0
- sibi_dst/osmnx_helper/utils.py +2 -0
- sibi_dst/utils/base.py +302 -96
- sibi_dst/utils/clickhouse_writer.py +472 -206
- sibi_dst/utils/data_utils.py +139 -186
- sibi_dst/utils/data_wrapper.py +317 -73
- sibi_dst/utils/date_utils.py +1 -0
- sibi_dst/utils/df_utils.py +193 -213
- sibi_dst/utils/file_utils.py +3 -2
- sibi_dst/utils/filepath_generator.py +314 -152
- sibi_dst/utils/log_utils.py +581 -242
- sibi_dst/utils/manifest_manager.py +60 -76
- sibi_dst/utils/parquet_saver.py +33 -27
- sibi_dst/utils/phone_formatter.py +88 -95
- sibi_dst/utils/update_planner.py +180 -178
- sibi_dst/utils/webdav_client.py +116 -166
- {sibi_dst-2025.1.12.dist-info → sibi_dst-2025.8.1.dist-info}/METADATA +1 -1
- {sibi_dst-2025.1.12.dist-info → sibi_dst-2025.8.1.dist-info}/RECORD +32 -28
- {sibi_dst-2025.1.12.dist-info → sibi_dst-2025.8.1.dist-info}/WHEEL +0 -0
sibi_dst/df_helper/_artifact_updater_multi_wrapper.py
@@ -1,422 +1,315 @@

The previous 422-line implementation is replaced by the following module:

```python
from __future__ import annotations

import asyncio
import random
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from contextlib import ExitStack
from dataclasses import dataclass
from typing import Any, Callable, Dict, List, Optional, Sequence, Type

from sibi_dst.utils import ManagedResource


@dataclass(slots=True)
class _RetryCfg:
    attempts: int = 3
    backoff_base: float = 2.0
    backoff_max: float = 60.0
    jitter: float = 0.15


_ORCHESTRATOR_KEYS = {
    "retry_attempts",
    "backoff_base",
    "backoff_max",
    "backoff_jitter",
    "update_timeout_seconds",  # accepted but unused in pure-threads version
    "max_workers",
    "priority_fn",
    "artifact_class_kwargs",
}


def _default_artifact_kwargs(resource: ManagedResource) -> Dict[str, Any]:
    return {
        "logger": resource.logger,
        "debug": resource.debug,
        "fs": resource.fs,
        "verbose": resource.verbose,
    }


class ArtifactUpdaterMultiWrapperThreaded(ManagedResource):
    """
    Backward-compatible threaded orchestrator.
    """

    def __init__(
        self,
        wrapped_classes: Dict[str, Sequence[Type]],
        *,
        max_workers: int = 4,
        retry_attempts: int = 3,
        backoff_base: float = 2.0,
        backoff_max: float = 60.0,
        backoff_jitter: float = 0.15,
        priority_fn: Optional[Callable[[Type], int]] = None,
        artifact_class_kwargs: Optional[Dict[str, Any]] = None,
        **kwargs: Any,
    ) -> None:
        super().__init__(**kwargs)
        self.wrapped_classes = wrapped_classes
        self.max_workers = int(max_workers)
        self.priority_fn = priority_fn
        self._retry = _RetryCfg(
            attempts=int(retry_attempts),
            backoff_base=float(backoff_base),
            backoff_max=float(backoff_max),
            jitter=float(backoff_jitter),
        )
        self.artifact_class_kwargs = {
            **_default_artifact_kwargs(self),
            **(artifact_class_kwargs or {}),
        }
        self.completion_secs: Dict[str, float] = {}
        self.failed: List[str] = []

    def _classes_for(self, period: str) -> List[Type]:
        try:
            classes = list(self.wrapped_classes[period])
        except KeyError:
            raise ValueError(f"Unsupported period '{period}'.")
        if not classes:
            raise ValueError(f"No artifact classes configured for period '{period}'.")
        if self.priority_fn:
            try:
                classes.sort(key=self.priority_fn)
            except Exception as e:
                self.logger.warning(f"priority_fn failed; using listed order: {e}")
        return classes

    @staticmethod
    def _split_kwargs(raw: Dict[str, Any]) -> tuple[Dict[str, Any], Dict[str, Any]]:
        orch: Dict[str, Any] = {}
        art: Dict[str, Any] = {}
        for k, v in raw.items():
            if k in _ORCHESTRATOR_KEYS:
                orch[k] = v
            else:
                art[k] = v
        return orch, art

    def _run_one(self, cls: Type, period: str, artifact_kwargs: Dict[str, Any]) -> str:
        name = cls.__name__
        start = time.monotonic()
        for attempt in range(1, self._retry.attempts + 1):
            try:
                with ExitStack() as stack:
                    inst = cls(**self.artifact_class_kwargs)
                    inst = stack.enter_context(inst)
                    inst.update_parquet(period=period, **artifact_kwargs)
                self.completion_secs[name] = time.monotonic() - start
                return name
            except Exception as e:
                if attempt < self._retry.attempts:
                    delay = min(self._retry.backoff_base ** (attempt - 1), self._retry.backoff_max)
                    delay *= 1 + random.uniform(0, self._retry.jitter)
                    time.sleep(delay)
                else:
                    raise RuntimeError(f"{name} failed after {self._retry.attempts} attempts: {e}") from e

    def update_data(self, period: str, **kwargs: Any) -> None:
        # Split kwargs to preserve backward compatibility
        _, artifact_kwargs = self._split_kwargs(kwargs)

        self.completion_secs.clear()
        self.failed.clear()

        classes = self._classes_for(period)
        with ThreadPoolExecutor(max_workers=self.max_workers) as pool:
            fut2name = {
                pool.submit(self._run_one, cls, period, dict(artifact_kwargs)): cls.__name__
                for cls in classes
            }
            for fut in as_completed(fut2name):
                name = fut2name[fut]
                try:
                    fut.result()
                    self.logger.info(f"✅ {name} ({period}) in {self.completion_secs[name]:.2f}s")
                except Exception as e:
                    self.failed.append(name)
                    self.logger.error(f"✖️ {name} permanently failed: {e}")

        self.logger.info(
            f"Artifacts processed: total={len(classes)}, "
            f"completed={len(self.completion_secs)}, failed={len(self.failed)}"
        )

    def get_update_status(self) -> Dict[str, Any]:
        done = set(self.completion_secs)
        fail = set(self.failed)
        all_names = {c.__name__ for v in self.wrapped_classes.values() for c in v}
        return {
            "total": len(all_names),
            "completed": sorted(done),
            "failed": sorted(fail),
            "pending": sorted(all_names - done - fail),
            "completion_times": dict(self.completion_secs),
        }


class ArtifactUpdaterMultiWrapperAsync(ManagedResource):
    """
    Backward-compatible async orchestrator.

    Public API preserved:
      • __init__(wrapped_classes, *, max_workers=..., retry_attempts=..., backoff_*=...,
        update_timeout_seconds=..., priority_fn=..., artifact_class_kwargs=..., **kwargs)
      • update_data(period, **kwargs) -> forwards only artifact-friendly kwargs to update_parquet
    """

    def __init__(
        self,
        wrapped_classes: Dict[str, Sequence[Type]],
        *,
        max_workers: int = 3,
        retry_attempts: int = 3,
        update_timeout_seconds: int = 600,
        backoff_base: float = 2.0,
        backoff_max: float = 60.0,
        backoff_jitter: float = 0.15,
        priority_fn: Optional[Callable[[Type], int]] = None,
        artifact_class_kwargs: Optional[Dict[str, Any]] = None,
        **kwargs: Any,
    ) -> None:
        super().__init__(**kwargs)
        self.wrapped_classes = wrapped_classes
        self.max_workers = int(max_workers)
        self.update_timeout_seconds = int(update_timeout_seconds)
        self.priority_fn = priority_fn

        self._retry = _RetryCfg(
            attempts=int(retry_attempts),
            backoff_base=float(backoff_base),
            backoff_max=float(backoff_max),
            jitter=float(backoff_jitter),
        )

        self.artifact_class_kwargs = {
            **_default_artifact_kwargs(self),
            **(artifact_class_kwargs or {}),
        }

        self.completion_secs: Dict[str, float] = {}
        self.failed: List[str] = []

    # ---- internals -----------------------------------------------------------

    def _classes_for(self, period: str) -> List[Type]:
        try:
            classes = list(self.wrapped_classes[period])
        except KeyError:
            raise ValueError(f"Unsupported period '{period}'.")
        if not classes:
            raise ValueError(f"No artifact classes configured for period '{period}'.")
        if self.priority_fn:
            try:
                classes.sort(key=self.priority_fn)
            except Exception as e:
                self.logger.warning(f"priority_fn failed; using listed order: {e}")
        return classes

    @staticmethod
    def _split_kwargs(raw: Dict[str, Any]) -> tuple[Dict[str, Any], Dict[str, Any]]:
        """
        Split kwargs into (orchestrator-only, artifact-forwarded).
        Keeps backward compatibility: callers can pass all knobs in one dict.
        """
        orch: Dict[str, Any] = {}
        art: Dict[str, Any] = {}
        for k, v in raw.items():
            if k in _ORCHESTRATOR_KEYS:
                orch[k] = v
            else:
                art[k] = v
        return orch, art

    async def _run_one(
        self, cls: Type, period: str, sem: asyncio.Semaphore, artifact_kwargs: Dict[str, Any]
    ) -> None:
        name = cls.__name__
        async with sem:
            start = asyncio.get_running_loop().time()
            for attempt in range(1, self._retry.attempts + 1):
                try:
                    # Run sync context + method in a thread
                    def _sync_block() -> None:
                        with ExitStack() as stack:
                            inst = cls(**self.artifact_class_kwargs)
                            inst = stack.enter_context(inst)
                            inst.update_parquet(period=period, **artifact_kwargs)

                    await asyncio.wait_for(
                        asyncio.to_thread(_sync_block),
                        timeout=self.update_timeout_seconds,
                    )
                    dt_secs = asyncio.get_running_loop().time() - start
                    self.completion_secs[name] = dt_secs
                    self.logger.info(f"✅ {name} ({period}) in {dt_secs:.2f}s")
                    return

                except asyncio.TimeoutError:
                    self.logger.warning(f"Timeout in {name} attempt {attempt}/{self._retry.attempts}")
                except Exception as e:
                    self.logger.error(
                        f"{name} attempt {attempt}/{self._retry.attempts} failed: {e}",
                        exc_info=self.debug,
                    )

                if attempt < self._retry.attempts:
                    delay = min(self._retry.backoff_base ** (attempt - 1), self._retry.backoff_max)
                    delay *= 1 + random.uniform(0, self._retry.jitter)
                    await asyncio.sleep(delay)

            self.failed.append(name)
            self.logger.error(f"✖️ {name} permanently failed")

    # ---- public API ----------------------------------------------------------

    async def update_data(self, period: str, **kwargs: Any) -> None:
        """
        Backward-compatible:
        - Accepts orchestrator knobs in kwargs (we consume them).
        - Forwards only artifact-friendly kwargs to update_parquet.
        """
        # Split kwargs; ignore any runtime attempts to mutate orchestrator config mid-call
        _, artifact_kwargs = self._split_kwargs(kwargs)

        self.completion_secs.clear()
        self.failed.clear()

        classes = self._classes_for(period)
        sem = asyncio.Semaphore(self.max_workers)
        tasks = [
            asyncio.create_task(self._run_one(cls, period, sem, dict(artifact_kwargs)))
            for cls in classes
        ]

        for t in asyncio.as_completed(tasks):
            try:
                await t
            except asyncio.CancelledError:
                for rest in tasks:
                    rest.cancel()
                raise

        self.logger.info(
            f"Artifacts processed: total={len(classes)}, "
            f"completed={len(self.completion_secs)}, failed={len(self.failed)}"
        )

    # Optional helper
    def get_update_status(self) -> Dict[str, Any]:
        done = set(self.completion_secs)
        fail = set(self.failed)
        all_names = {c.__name__ for v in self.wrapped_classes.values() for c in v}
        return {
            "total": len(all_names),
            "completed": sorted(done),
            "failed": sorted(fail),
            "pending": sorted(all_names - done - fail),
            "completion_times": dict(self.completion_secs),
        }
```
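For orientation, a minimal usage sketch of both orchestrators, adapted from the usage example in the docstring this release removed. `DataArtifactA`, `DataArtifactB`, the import path, and the `overwrite` flag are hypothetical stand-ins: the orchestrators accept any classes that can be constructed from `artifact_class_kwargs`, used as context managers, and that expose `update_parquet(period=..., **kwargs)`; all non-orchestrator kwargs are forwarded to that call.

```python
import asyncio

# Hypothetical artifact classes; any ManagedResource-style context manager
# exposing update_parquet(period=..., **kwargs) satisfies the contract.
from my_project.artifacts import DataArtifactA, DataArtifactB  # hypothetical

# Threaded orchestrator: wrapped_classes is keyed by period.
wrapper = ArtifactUpdaterMultiWrapperThreaded(
    wrapped_classes={"ytd": [DataArtifactA, DataArtifactB]},
    max_workers=4,
    retry_attempts=3,
)
wrapper.update_data("ytd", overwrite=True)  # overwrite is forwarded to update_parquet
print(wrapper.get_update_status())

# Async orchestrator: same contract, plus a per-attempt timeout.
async def main() -> None:
    aw = ArtifactUpdaterMultiWrapperAsync(
        wrapped_classes={"ytd": [DataArtifactA, DataArtifactB]},
        max_workers=3,
        update_timeout_seconds=600,  # enforced per attempt via asyncio.wait_for
    )
    await aw.update_data("ytd", overwrite=True)
    print(aw.get_update_status())

asyncio.run(main())
```

With the default retry settings (attempts=3, backoff_base=2.0, backoff_jitter=0.15), a failing artifact sleeps roughly 1 s and then 2 s between attempts (each delay scaled by a random factor of up to 1.15 and capped at backoff_max); after the final attempt the artifact is recorded in `failed` and reported by `get_update_status()`.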