sibi-dst 2025.1.3__py3-none-any.whl → 2025.1.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sibi_dst/__init__.py +4 -1
- sibi_dst/df_helper/__init__.py +2 -2
- sibi_dst/df_helper/_artifact_updater_multi_wrapper.py +355 -163
- sibi_dst/df_helper/_df_helper.py +47 -30
- sibi_dst/df_helper/_parquet_artifact.py +57 -47
- sibi_dst/df_helper/_parquet_reader.py +9 -13
- sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +15 -11
- sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +23 -16
- sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +17 -11
- sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +1 -103
- sibi_dst/utils/__init__.py +3 -2
- sibi_dst/utils/base.py +97 -0
- sibi_dst/utils/clickhouse_writer.py +5 -4
- sibi_dst/utils/data_wrapper.py +69 -84
- sibi_dst/utils/date_utils.py +2 -1
- sibi_dst/utils/log_utils.py +309 -77
- sibi_dst/utils/manifest_manager.py +96 -375
- sibi_dst/utils/parquet_saver.py +98 -173
- sibi_dst/utils/storage_config.py +6 -0
- sibi_dst/utils/storage_manager.py +2 -1
- sibi_dst/utils/update_planner.py +72 -22
- {sibi_dst-2025.1.3.dist-info → sibi_dst-2025.1.5.dist-info}/METADATA +3 -1
- {sibi_dst-2025.1.3.dist-info → sibi_dst-2025.1.5.dist-info}/RECORD +24 -27
- sibi_dst/v3/__init__.py +0 -0
- sibi_dst/v3/backends/__init__.py +0 -0
- sibi_dst/v3/df_helper/__init__.py +0 -0
- sibi_dst/v3/df_helper/_df_helper.py +0 -91
- {sibi_dst-2025.1.3.dist-info → sibi_dst-2025.1.5.dist-info}/WHEEL +0 -0
sibi_dst/__init__.py
CHANGED
@@ -1,4 +1,3 @@
-
 try:
     import importlib.metadata as version_reader
 except ImportError:
@@ -8,3 +7,7 @@ try:
     __version__ = version_reader.version("sibi-dst")
 except version_reader.PackageNotFoundError:
     __version__ = "unknown"
+
+__all__ = [
+    "__version__",
+]
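With this change the package keeps resolving `__version__` through importlib.metadata (falling back to "unknown" when distribution metadata is absent) and now declares it as the module's only exported name. A minimal consumer-side sketch, assuming the wheel is installed in the current environment:

```python
# Minimal sketch; assumes sibi-dst is installed so importlib.metadata can
# locate its distribution metadata.
import sibi_dst

print(sibi_dst.__version__)  # e.g. "2025.1.5", or "unknown" if metadata is missing
```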
sibi_dst/df_helper/__init__.py
CHANGED
@@ -3,11 +3,11 @@ from __future__ import annotations
 from ._df_helper import DfHelper
 from ._parquet_artifact import ParquetArtifact
 from ._parquet_reader import ParquetReader
-from ._artifact_updater_multi_wrapper import
+from ._artifact_updater_multi_wrapper import ArtifactUpdaterMultiWrapperThreaded

 __all__ = [
     'DfHelper',
     'ParquetArtifact',
     'ParquetReader',
-    '
+    'ArtifactUpdaterMultiWrapperThreaded',
 ]
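The package-level re-export changes accordingly (the full name of the old export is truncated in this rendering). A short sketch of the updated import path, assuming 2025.1.5 is installed:

```python
# New public name re-exported from sibi_dst.df_helper as of 2025.1.5
from sibi_dst.df_helper import ArtifactUpdaterMultiWrapperThreaded
```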
sibi_dst/df_helper/_artifact_updater_multi_wrapper.py
CHANGED
@@ -1,80 +1,41 @@
-import asyncio
-import logging
 import datetime
+import time
 import random
+from concurrent.futures import ThreadPoolExecutor, as_completed
 from typing import Any, Callable, Dict, List, Optional, Type

-from sibi_dst.utils import
+from sibi_dst.utils import ManagedResource


-class
+class ArtifactUpdaterMultiWrapperThreaded(ManagedResource):
     """
-
-
-
-
-    - Optionally prioritises tasks via a priority function or static method on artifact classes
-    - Tracks per-artifact completion times
-    - Configurable retry/backoff strategy
-    - Optional metrics integration
-    - Thread-safe within a single asyncio loop
-
-    Usage:
-        wrapper = ArtifactUpdaterMultiWrapper(
-            wrapped_classes={
-                'mydata': [DataArtifactA, DataArtifactB],
-            },
-            max_workers=4,
-            retry_attempts=3,
-            update_timeout_seconds=600,
-            backoff_base=2,
-            backoff_max=60,
-            backoff_jitter=0.1,
-            priority_fn=None,  # or custom
-            metrics_client=None,
-            debug=True,
-            logger=None,
-            artifact_class_kwargs={
-                'fs': my_fs,
-                'parquet_storage_path': 's3://bucket/data',
-                'logger': my_logger,
-                'debug': True,
-            }
-        )
-        await wrapper.update_data('mydata', period='ytd', overwrite=True)
+    Updates artifacts concurrently using a ThreadPoolExecutor.
+
+    This version is refactored for a pure multi-threaded environment, aligning
+    the orchestration model with the underlying threaded workers (DataWrapper).
     """
+
     def __init__(
-
-
-
-
-
-
-
-
-
-
-
-        debug: bool = False,
-        logger: Optional[logging.Logger] = None,
-        artifact_class_kwargs: Optional[Dict[str, Any]] = None,
+        self,
+        wrapped_classes: Dict[str, List[Type]],
+        *,
+        max_workers: int = 4,
+        retry_attempts: int = 3,
+        backoff_base: int = 2,
+        backoff_max: int = 60,
+        backoff_jitter: float = 0.1,
+        priority_fn: Optional[Callable[[Type], int]] = None,
+        artifact_class_kwargs: Optional[Dict[str, Any]] = None,
+        **kwargs: Any,
     ) -> None:
+        super().__init__(**kwargs)
         self.wrapped_classes = wrapped_classes
         self.max_workers = max_workers
         self.retry_attempts = retry_attempts
-        self.update_timeout_seconds = update_timeout_seconds
         self.backoff_base = backoff_base
         self.backoff_max = backoff_max
         self.backoff_jitter = backoff_jitter
         self.priority_fn = priority_fn
-        self.metrics_client = metrics_client
-
-        self.debug = debug
-        self.logger = logger or Logger.default_logger(
-            logger_name=self.__class__.__name__,
-            log_level=Logger.DEBUG if debug else Logger.INFO
-        )
-
         # Default artifact init kwargs
         today = datetime.datetime.today() + datetime.timedelta(days=1)
         default_kwargs = {
@@ -82,158 +43,389 @@ class ArtifactUpdaterMultiWrapper:
             'parquet_end_date': today.strftime('%Y-%m-%d'),
             'logger': self.logger,
             'debug': self.debug,
+            'fs': self.fs,
+            'verbose': self.verbose,
         }
         self.artifact_class_kwargs = artifact_class_kwargs or default_kwargs.copy()

-        # State
+        # State tracking
         self.completion_times: Dict[str, float] = {}
         self.failed: List[str] = []
         self.original_classes: List[Type] = []

     def get_artifact_classes(self, data_type: str) -> List[Type]:
-        """
-        Retrieve artifact classes by data type.
-        """
+        """Retrieve artifact classes by data type."""
         self.logger.info(f"Fetching artifact classes for '{data_type}'")
-
+        classes = self.wrapped_classes.get(data_type)
+        if not classes:
             raise ValueError(f"Unsupported data type: {data_type}")
-        classes = self.wrapped_classes[data_type]
         self.logger.info(f"Found {len(classes)} artifact classes for '{data_type}'")
         return classes

     def estimate_priority(self, artifact_cls: Type) -> int:
         """
-
+        Determines task priority. Lower values run first.
+        Note: This is a blocking call and will run sequentially before updates start.
         """
         name = artifact_cls.__name__
+        # Custom priority function takes precedence
         if self.priority_fn:
             try:
-
-                self.logger.debug(f"priority_fn for {name}: {pr}")
-                return pr
+                return self.priority_fn(artifact_cls)
             except Exception as e:
                 self.logger.warning(f"priority_fn error for {name}: {e}")
-
-
-
-
-
-
-
-
-
-
-
-
+
+        # Fallback to size estimate if available
+        if hasattr(artifact_cls, 'get_size_estimate'):
+            try:
+                # This performs blocking I/O
+                return artifact_cls(**self.artifact_class_kwargs).get_size_estimate()
+
+            except Exception as e:
+                self.logger.warning(f"get_size_estimate failed for {name}: {e}")
+
+        # Default priority
+        return 999
+
+    def _update_artifact_with_retry(self, artifact_cls: Type, update_kwargs: Dict[str, Any]) -> str:
         """
-
+        A blocking worker function that handles instantiation, update, and retries for a single artifact.
+        This function is designed to be run in a ThreadPoolExecutor.
         """
-
-
-
-
+        name = artifact_cls.__name__
+        self.logger.debug(f"Worker thread starting update for {name}")
+
+        for attempt in range(1, self.retry_attempts + 1):
             try:
-
-
-
-                            artifact_cls, **self.artifact_class_kwargs
-                        )
-                        await asyncio.wait_for(
-                            asyncio.to_thread(
-                                artifact.update_parquet, **update_kwargs
-                            ),
-                            timeout=self.update_timeout_seconds
-                        )
-                        duration = asyncio.get_event_loop().time() - start
-                        self.completion_times[name] = duration
-                        self.logger.info(f"✅ {name} updated in {duration:.2f}s (attempt {attempt})")
-                        if self.metrics_client:
-                            self.metrics_client.increment('task_succeeded')
-                        return
-                    except asyncio.TimeoutError:
-                        self.logger.warning(f"Timeout on {name}, attempt {attempt}")
-                    except Exception as e:
-                        self.logger.error(f"Error on {name} attempt {attempt}: {e}")
+                # Instantiate and update directly within the worker thread
+                artifact_instance = artifact_cls(**self.artifact_class_kwargs)
+                artifact_instance.update_parquet(**update_kwargs)

+                self.logger.info(f"✅ {name} updated successfully on attempt {attempt}")
+                return name  # Return the name on success
+
+            except Exception as e:
+                self.logger.error(f"Error on {name} attempt {attempt}/{self.retry_attempts}: {e}", exc_info=self.debug)
+                if attempt < self.retry_attempts:
                     delay = min(self.backoff_base ** (attempt - 1), self.backoff_max)
                     delay *= 1 + random.uniform(0, self.backoff_jitter)
                     self.logger.info(f"Sleeping {delay:.1f}s before retrying {name}")
-
+                    time.sleep(delay)

-
-
-                raise
+        # If all retries fail, raise an exception to be caught by the main loop
+        raise RuntimeError(f"{name} failed after {self.retry_attempts} attempts.")

-
-            self.logger.error(f"✖️ {name} permanently failed after {self.retry_attempts} attempts")
-            if self.metrics_client:
-                self.metrics_client.increment('task_failed')
-            self.failed.append(name)
-
-    async def update_data(self, data_type: str, **kwargs: Any) -> None:
+    def update_data(self, data_type: str, **kwargs: Any) -> None:
         """
-        Entry point to update all artifacts of a given type
+        Entry point to update all artifacts of a given type using a ThreadPoolExecutor.
         """
-        self.logger.
+        self.logger.debug(f"Starting multi-threaded update for '{data_type}' with kwargs={kwargs}")

-        #
+        # Reset state for this run
         self.completion_times.clear()
         self.failed.clear()
         self.original_classes = self.get_artifact_classes(data_type)

-        #
-
+        # Sequentially estimate priorities and sort classes before execution
+        self.logger.debug("Estimating priorities to order tasks...")
+        ordered_classes = sorted(self.original_classes, key=self.estimate_priority)
+        self.logger.debug("Priority estimation complete. Submitting tasks to thread pool.")

-
-        tasks = [
-            asyncio.create_task(self._bounded_update(cls, sem, **kwargs))
-            for cls in ordered
-        ]
+        start_time = time.monotonic()

-
-
-
-
-
-            for t in tasks:
-                t.cancel()
-            raise
-        finally:
-            total = len(self.original_classes)
-            completed = len(self.completion_times)
-            failed = len(self.failed)
-            self.logger.info(f"All artifacts processed: total={total}, completed={completed}, failed={failed}")
+        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
+            future_to_class_name = {
+                executor.submit(self._update_artifact_with_retry, cls, kwargs): cls.__name__
+                for cls in ordered_classes
+            }

-
-
-
-
+            for future in as_completed(future_to_class_name):
+                name = future_to_class_name[future]
+                try:
+                    # result() will re-raise the exception from the worker if one occurred
+                    future.result()
+                    # If no exception, the task succeeded
+                    self.completion_times[name] = time.monotonic() - start_time
+                except Exception as e:
+                    self.logger.error(f"✖️ {name} permanently failed. See error log above.")
+                    self.failed.append(name)
+
+        # Log final status
         total = len(self.original_classes)
-        completed =
-
-
+        completed = len(self.completion_times)
+        failed_count = len(self.failed)
+        self.logger.info(f"All artifacts processed: total={total}, completed={completed}, failed={failed_count}")
+
+    def get_update_status(self) -> Dict[str, Any]:
+        """Returns a summary status including completion times."""
+        completed_set = set(self.completion_times.keys())
+        failed_set = set(self.failed)
+        pending_set = {cls.__name__ for cls in self.original_classes} - completed_set - failed_set

         return {
-            'total':
-            'completed': list(
-            'failed':
-            'pending':
+            'total': len(self.original_classes),
+            'completed': list(completed_set),
+            'failed': list(failed_set),
+            'pending': list(pending_set),
             'completion_times': self.completion_times,
         }

     @staticmethod
     def format_status_table(status: Dict[str, Any]) -> str:
-        """
-        Formats the status dict into a readable table.
-        """
+        """Formats the status dictionary into a readable table."""
         lines = [
             f"Total: {status['total']}",
-            f"Completed: {len(status['completed'])}
-            f"Failed: {len(status['failed'])}
-            f"Pending: {len(status['pending'])}
-            ""
-            "Per-artifact timings:"
+            f"Completed: {len(status['completed'])}",
+            f"Failed: {len(status['failed'])}",
+            f"Pending: {len(status['pending'])}",
+            "\nPer-artifact completion times (seconds):"
         ]
-
-
+        sorted_times = sorted(status['completion_times'].items(), key=lambda item: item[1], reverse=True)
+        for name, duration in sorted_times:
+            lines.append(f"  - {name:<30}: {duration:.2f}s")
+        if status['failed']:
+            lines.append("\nFailed artifacts:")
+            for name in status['failed']:
+                lines.append(f"  - {name}")
         return "\n".join(lines)
+
+
+# import asyncio
+# import logging
+# import datetime
+# import random
+# from typing import Any, Callable, Dict, List, Optional, Type
+#
+# from sibi_dst.utils import Logger
+#
+#
+# class ArtifactUpdaterMultiWrapperAsync:
+#     """
+#     Simplified wrapper that updates artifacts concurrently using an asyncio.Semaphore.
+#
+#     Features:
+#       - Caps concurrency at max_workers via semaphore
+#       - Optionally prioritises tasks via a priority function or static method on artifact classes
+#       - Tracks per-artifact completion times
+#       - Configurable retry/backoff strategy
+#       - Optional metrics integration
+#       - Thread-safe within a single asyncio loop
+#
+#     Usage:
+#         wrapper = ArtifactUpdaterMultiWrapper(
+#             wrapped_classes={
+#                 'mydata': [DataArtifactA, DataArtifactB],
+#             },
+#             max_workers=4,
+#             retry_attempts=3,
+#             update_timeout_seconds=600,
+#             backoff_base=2,
+#             backoff_max=60,
+#             backoff_jitter=0.1,
+#             priority_fn=None,  # or custom
+#             metrics_client=None,
+#             debug=True,
+#             logger=None,
+#             artifact_class_kwargs={
+#                 'fs': my_fs,
+#                 'parquet_storage_path': 's3://bucket/data',
+#                 'logger': my_logger,
+#                 'debug': True,
+#             }
+#         )
+#         await wrapper.update_data('mydata', period='ytd', overwrite=True)
+#     """
+#     def __init__(
+#         self,
+#         wrapped_classes: Dict[str, List[Type]],
+#         *,
+#         max_workers: int = 3,
+#         retry_attempts: int = 3,
+#         update_timeout_seconds: int = 600,
+#         backoff_base: int = 2,
+#         backoff_max: Optional[int] = 60,
+#         backoff_jitter: float = 0.1,
+#         priority_fn: Optional[Callable[[Type], int]] = None,
+#         metrics_client: Any = None,
+#         debug: bool = False,
+#         logger: Optional[logging.Logger] = None,
+#         artifact_class_kwargs: Optional[Dict[str, Any]] = None,
+#     ) -> None:
+#         self.wrapped_classes = wrapped_classes
+#         self.max_workers = max_workers
+#         self.retry_attempts = retry_attempts
+#         self.update_timeout_seconds = update_timeout_seconds
+#         self.backoff_base = backoff_base
+#         self.backoff_max = backoff_max
+#         self.backoff_jitter = backoff_jitter
+#         self.priority_fn = priority_fn
+#         self.metrics_client = metrics_client
+#
+#         self.debug = debug
+#         self.logger = logger or Logger.default_logger(
+#             logger_name=self.__class__.__name__,
+#             log_level=Logger.DEBUG if debug else Logger.INFO
+#         )
+#
+#         # Default artifact init kwargs
+#         today = datetime.datetime.today() + datetime.timedelta(days=1)
+#         default_kwargs = {
+#             'parquet_start_date': today.strftime('%Y-%m-%d'),
+#             'parquet_end_date': today.strftime('%Y-%m-%d'),
+#             'logger': self.logger,
+#             'debug': self.debug,
+#         }
+#         self.artifact_class_kwargs = artifact_class_kwargs or default_kwargs.copy()
+#
+#         # State
+#         self.completion_times: Dict[str, float] = {}
+#         self.failed: List[str] = []
+#         self.original_classes: List[Type] = []
+#
+#     def get_artifact_classes(self, data_type: str) -> List[Type]:
+#         """
+#         Retrieve artifact classes by data type.
+#         """
+#         self.logger.info(f"Fetching artifact classes for '{data_type}'")
+#         if data_type not in self.wrapped_classes:
+#             raise ValueError(f"Unsupported data type: {data_type}")
+#         classes = self.wrapped_classes[data_type]
+#         self.logger.info(f"Found {len(classes)} artifact classes for '{data_type}'")
+#         return classes
+#
+#     def estimate_priority(self, artifact_cls: Type) -> int:
+#         """
+#         Determine task priority for ordering. Lower values run first.
+#         """
+#         name = artifact_cls.__name__
+#         if self.priority_fn:
+#             try:
+#                 pr = self.priority_fn(artifact_cls)
+#                 self.logger.debug(f"priority_fn for {name}: {pr}")
+#                 return pr
+#             except Exception as e:
+#                 self.logger.warning(f"priority_fn error for {name}: {e}")
+#         try:
+#             fs = self.artifact_class_kwargs.get('fs')
+#             path = self.artifact_class_kwargs.get('parquet_storage_path')
+#             pr=1
+#             if hasattr(artifact_cls, 'get_size_estimate'):
+#                 pr = artifact_cls.get_size_estimate(fs, path)
+#             self.logger.debug(f"Estimated priority for {name}: {pr}")
+#             return pr
+#         except Exception:
+#             return 1
+#
+#     async def _bounded_update(self, artifact_cls: Type, sem: asyncio.Semaphore, **update_kwargs) -> None:
+#         """
+#         Wrap update_artifact in a semaphore slot to limit concurrency.
+#         """
+#         async with sem:
+#             name = artifact_cls.__name__
+#             start = asyncio.get_event_loop().time()
+#             self.logger.info(f"Starting update for {name}")
+#             try:
+#                 for attempt in range(1, self.retry_attempts + 1):
+#                     try:
+#                         artifact = await asyncio.to_thread(
+#                             artifact_cls, **self.artifact_class_kwargs
+#                         )
+#                         await asyncio.wait_for(
+#                             asyncio.to_thread(
+#                                 artifact.update_parquet, **update_kwargs
+#                             ),
+#                             timeout=self.update_timeout_seconds
+#                         )
+#                         duration = asyncio.get_event_loop().time() - start
+#                         self.completion_times[name] = duration
+#                         self.logger.info(f"✅ {name} updated in {duration:.2f}s (attempt {attempt})")
+#                         if self.metrics_client:
+#                             self.metrics_client.increment('task_succeeded')
+#                         return
+#                     except asyncio.TimeoutError:
+#                         self.logger.warning(f"Timeout on {name}, attempt {attempt}")
+#                     except Exception as e:
+#                         self.logger.error(f"Error on {name} attempt {attempt}: {e}")
+#
+#                     delay = min(self.backoff_base ** (attempt - 1), self.backoff_max)
+#                     delay *= 1 + random.uniform(0, self.backoff_jitter)
+#                     self.logger.info(f"Sleeping {delay:.1f}s before retrying {name}")
+#                     await asyncio.sleep(delay)
+#
+#             except asyncio.CancelledError:
+#                 self.logger.warning(f"{name} update cancelled")
+#                 raise
+#
+#             # permanent failure
+#             self.logger.error(f"✖️ {name} permanently failed after {self.retry_attempts} attempts")
+#             if self.metrics_client:
+#                 self.metrics_client.increment('task_failed')
+#             self.failed.append(name)
+#
+#     async def update_data(self, data_type: str, **kwargs: Any) -> None:
+#         """
+#         Entry point to update all artifacts of a given type concurrently.
+#         """
+#         self.logger.info(f"Starting update_data for '{data_type}' with kwargs={kwargs}")
+#
+#         # RESET STATE
+#         self.completion_times.clear()
+#         self.failed.clear()
+#         self.original_classes = self.get_artifact_classes(data_type)
+#
+#         # NON-DESTRUCTIVE SORTING
+#         ordered = sorted(self.original_classes, key=self.estimate_priority)
+#
+#         sem = asyncio.Semaphore(self.max_workers)
+#         tasks = [
+#             asyncio.create_task(self._bounded_update(cls, sem, **kwargs))
+#             for cls in ordered
+#         ]
+#
+#         try:
+#             for coro in asyncio.as_completed(tasks):
+#                 await coro
+#         except asyncio.CancelledError:
+#             self.logger.warning("update_data was cancelled—aborting remaining retries")
+#             for t in tasks:
+#                 t.cancel()
+#             raise
+#         finally:
+#             total = len(self.original_classes)
+#             completed = len(self.completion_times)
+#             failed = len(self.failed)
+#             self.logger.info(f"All artifacts processed: total={total}, completed={completed}, failed={failed}")
+#
+#     def get_update_status(self) -> Dict[str, Any]:
+#         """
+#         Returns summary status including completion times.
+#         """
+#         total = len(self.original_classes)
+#         completed = set(self.completion_times.keys())
+#         failed = set(self.failed)
+#         pending = {cls.__name__ for cls in self.original_classes} - completed - failed
+#
+#         return {
+#             'total': total,
+#             'completed': list(completed),
+#             'failed': list(failed),
+#             'pending': list(pending),
+#             'completion_times': self.completion_times,
+#         }
+#
+#     @staticmethod
+#     def format_status_table(status: Dict[str, Any]) -> str:
+#         """
+#         Formats the status dict into a readable table.
+#         """
+#         lines = [
+#             f"Total: {status['total']}",
+#             f"Completed: {len(status['completed'])} {status['completed']}",
+#             f"Failed: {len(status['failed'])} {status['failed']}",
+#             f"Pending: {len(status['pending'])} {status['pending']}",
+#             "",
+#             "Per-artifact timings:"
+#         ]
+#         for name, dur in status['completion_times'].items():
+#             lines.append(f" {name}: {dur:.2f}s")
+#         return "\n".join(lines)