sibi-dst 2025.1.6.tar.gz → 2025.1.8.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/PKG-INFO +1 -1
- {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/pyproject.toml +1 -1
- {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/df_helper/__init__.py +2 -1
- sibi_dst-2025.1.8/sibi_dst/df_helper/_artifact_updater_multi_wrapper.py +420 -0
- {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/df_helper/_parquet_reader.py +4 -18
- {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/utils/data_wrapper.py +33 -30
- {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/utils/parquet_saver.py +0 -4
- sibi_dst-2025.1.6/sibi_dst/df_helper/_artifact_updater_multi_wrapper.py +0 -431
- {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/README.md +0 -0
- {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/__init__.py +0 -0
- {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/df_helper/_df_helper.py +0 -0
- {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/df_helper/_parquet_artifact.py +0 -0
- {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/df_helper/backends/__init__.py +0 -0
- {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/df_helper/backends/http/__init__.py +0 -0
- {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/df_helper/backends/http/_http_config.py +0 -0
- {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/df_helper/backends/parquet/__init__.py +0 -0
- {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/df_helper/backends/parquet/_filter_handler.py +0 -0
- {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/df_helper/backends/parquet/_parquet_options.py +0 -0
- {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/df_helper/backends/sqlalchemy/__init__.py +0 -0
- {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +0 -0
- {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +0 -0
- {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +0 -0
- {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +0 -0
- {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/df_helper/core/__init__.py +0 -0
- {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/df_helper/core/_defaults.py +0 -0
- {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/df_helper/core/_filter_handler.py +0 -0
- {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/df_helper/core/_params_config.py +0 -0
- {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/df_helper/core/_query_config.py +0 -0
- {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/df_helper/data_cleaner.py +0 -0
- {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/geopy_helper/__init__.py +0 -0
- {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/geopy_helper/geo_location_service.py +0 -0
- {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/geopy_helper/utils.py +0 -0
- {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/osmnx_helper/__init__.py +0 -0
- {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/osmnx_helper/base_osm_map.py +0 -0
- {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/osmnx_helper/basemaps/__init__.py +0 -0
- {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/osmnx_helper/basemaps/calendar_html.py +0 -0
- {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/osmnx_helper/basemaps/router_plotter.py +0 -0
- {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/osmnx_helper/utils.py +0 -0
- {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/tests/__init__.py +0 -0
- {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/tests/test_data_wrapper_class.py +0 -0
- {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/utils/__init__.py +0 -0
- {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/utils/base.py +0 -0
- {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/utils/clickhouse_writer.py +0 -0
- {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/utils/credentials.py +0 -0
- {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/utils/data_from_http_source.py +0 -0
- {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/utils/data_utils.py +0 -0
- {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/utils/date_utils.py +0 -0
- {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/utils/df_utils.py +0 -0
- {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/utils/file_utils.py +0 -0
- {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/utils/filepath_generator.py +0 -0
- {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/utils/log_utils.py +0 -0
- {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/utils/manifest_manager.py +0 -0
- {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/utils/phone_formatter.py +0 -0
- {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/utils/storage_config.py +0 -0
- {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/utils/storage_manager.py +0 -0
- {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/utils/update_planner.py +0 -0
- {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/utils/webdav_client.py +0 -0
- {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/v2/__init__.py +0 -0
- {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/v2/df_helper/__init__.py +0 -0
- {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/v2/df_helper/_df_helper.py +0 -0
- {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/v2/df_helper/backends/__init__.py +0 -0
- {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/v2/df_helper/backends/sqlalchemy/__init__.py +0 -0
- {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/v2/df_helper/backends/sqlalchemy/_db_connection.py +0 -0
- {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/v2/df_helper/backends/sqlalchemy/_io_dask.py +0 -0
- {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/v2/df_helper/backends/sqlalchemy/_load_from_db.py +0 -0
- {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/v2/df_helper/backends/sqlalchemy/_model_builder.py +0 -0
- {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/v2/df_helper/backends/sqlmodel/__init__.py +0 -0
- {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/v2/df_helper/backends/sqlmodel/_db_connection.py +0 -0
- {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/v2/df_helper/backends/sqlmodel/_io_dask.py +0 -0
- {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/v2/df_helper/backends/sqlmodel/_load_from_db.py +0 -0
- {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/v2/df_helper/backends/sqlmodel/_model_builder.py +0 -0
- {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/v2/df_helper/core/__init__.py +0 -0
- {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/v2/df_helper/core/_filter_handler.py +0 -0
- {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/v2/df_helper/core/_params_config.py +0 -0
- {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/v2/df_helper/core/_query_config.py +0 -0
- {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/v2/utils/__init__.py +0 -0
- {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/v2/utils/log_utils.py +0 -0
sibi_dst/df_helper/__init__.py (+2 -1)

```diff
@@ -3,11 +3,12 @@ from __future__ import annotations
 from ._df_helper import DfHelper
 from ._parquet_artifact import ParquetArtifact
 from ._parquet_reader import ParquetReader
-from ._artifact_updater_multi_wrapper import ArtifactUpdaterMultiWrapperThreaded
+from ._artifact_updater_multi_wrapper import ArtifactUpdaterMultiWrapperThreaded, ArtifactUpdaterMultiWrapperAsync
 
 __all__ = [
     'DfHelper',
     'ParquetArtifact',
     'ParquetReader',
     'ArtifactUpdaterMultiWrapperThreaded',
+    'ArtifactUpdaterMultiWrapperAsync',
 ]
```
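With this `__init__.py` change, both orchestrators are importable directly from `sibi_dst.df_helper`. As a rough, non-authoritative sketch (mirroring the usage block in the new module's docstring), driving the async wrapper could look like the following; `DataArtifactA`, `DataArtifactB`, `my_fs`, and `my_logger` are placeholders for your own artifact classes, fsspec filesystem, and logger:

```python
import asyncio

from sibi_dst.df_helper import (
    ArtifactUpdaterMultiWrapperAsync,
    ArtifactUpdaterMultiWrapperThreaded,  # threaded variant, exported as before
)

# Placeholders: substitute real artifact classes (anything exposing
# update_parquet(**kwargs)) plus your own fsspec filesystem and logger.
wrapper = ArtifactUpdaterMultiWrapperAsync(
    wrapped_classes={"mydata": [DataArtifactA, DataArtifactB]},
    max_workers=4,
    retry_attempts=3,
    update_timeout_seconds=600,
    artifact_class_kwargs={
        "fs": my_fs,
        "parquet_storage_path": "s3://bucket/data",
        "logger": my_logger,
        "debug": True,
    },
)

# update_data() is a coroutine, so it has to be awaited (here via asyncio.run).
asyncio.run(wrapper.update_data("mydata", period="ytd", overwrite=True))
print(wrapper.format_status_table(wrapper.get_update_status()))
```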
sibi_dst/df_helper/_artifact_updater_multi_wrapper.py (new file, +420 lines)

```python
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Any, Callable, Dict, List, Optional, Type

from sibi_dst.utils import ManagedResource

class ArtifactUpdaterMultiWrapperThreaded(ManagedResource):
    """
    Updates artifacts concurrently using a ThreadPoolExecutor.

    This version is refactored for a pure multi-threaded environment, aligning
    the orchestration model with the underlying threaded workers (DataWrapper).
    """
    wrapped_classes: Dict[str, List[Type]]
    def __init__(
        self,
        wrapped_classes: Dict[str, List[Type]],
        *,
        max_workers: int = 4,
        retry_attempts: int = 3,
        backoff_base: int = 2,
        backoff_max: int = 60,
        backoff_jitter: float = 0.1,
        priority_fn: Optional[Callable[[Type], int]] = None,
        artifact_class_kwargs: Optional[Dict[str, Any]] = None,
        **kwargs: Dict[str, Any]
    ) -> None:
        super().__init__(**kwargs)
        self.wrapped_classes = wrapped_classes
        self.max_workers = max_workers
        self.retry_attempts = retry_attempts
        self.backoff_base = backoff_base
        self.backoff_max = backoff_max
        self.backoff_jitter = backoff_jitter
        self.priority_fn = priority_fn
        # Default artifact init kwargs
        today = datetime.datetime.today() + datetime.timedelta(days=1)
        default_kwargs = {
            'parquet_start_date': today.strftime('%Y-%m-%d'),
            'parquet_end_date': today.strftime('%Y-%m-%d'),
            'logger': self.logger,
            'debug': self.debug,
            'fs': self.fs,
            'verbose': self.verbose,
        }
        self.artifact_class_kwargs = artifact_class_kwargs or default_kwargs.copy()

        # State tracking
        self.completion_times: Dict[str, float] = {}
        self.failed: List[str] = []
        self.original_classes: List[Type] = []

    def get_artifact_classes(self, data_type: str) -> List[Type]:
        """Retrieve artifact classes by data type."""
        self.logger.info(f"Fetching artifact classes for '{data_type}'")
        classes = self.wrapped_classes.get(data_type)
        if not classes:
            raise ValueError(f"Unsupported data type: {data_type}")
        self.logger.info(f"Found {len(classes)} artifact classes for '{data_type}'")
        return classes

    def estimate_priority(self, artifact_cls: Type) -> int:
        """
        Determines task priority. Lower values run first.
        Note: This is a blocking call and will run sequentially before updates start.
        """
        name = artifact_cls.__name__
        # Custom priority function takes precedence
        if self.priority_fn:
            try:
                return self.priority_fn(artifact_cls)
            except Exception as e:
                self.logger.warning(f"priority_fn error for {name}: {e}")

        # # Fallback to size estimate if available
        # if hasattr(artifact_cls, 'get_size_estimate'):
        #     try:
        #         # This performs blocking I/O
        #         return artifact_cls(**self.artifact_class_kwargs).get_size_estimate()
        #
        #     except Exception as e:
        #         self.logger.warning(f"get_size_estimate failed for {name}: {e}")

        # Default priority
        return 999

    def _update_artifact_with_retry(self, artifact_cls: Type, update_kwargs: Dict[str, Any]) -> str:
        """
        A blocking worker function that handles instantiation, update, and retries for a single artifact.
        This function is designed to be run in a ThreadPoolExecutor.
        """
        name = artifact_cls.__name__
        self.logger.debug(f"Worker thread starting update for {name}")

        for attempt in range(1, self.retry_attempts + 1):
            try:
                # Instantiate and update directly within the worker thread
                artifact_instance = artifact_cls(**self.artifact_class_kwargs)
                artifact_instance.update_parquet(**update_kwargs)

                self.logger.info(f"✅ {name} updated successfully on attempt {attempt}")
                return name  # Return the name on success

            except Exception as e:
                self.logger.error(f"Error on {name} attempt {attempt}/{self.retry_attempts}: {e}", exc_info=self.debug)
                if attempt < self.retry_attempts:
                    delay = min(self.backoff_base ** (attempt - 1), self.backoff_max)
                    delay *= 1 + random.uniform(0, self.backoff_jitter)
                    self.logger.info(f"Sleeping {delay:.1f}s before retrying {name}")
                    time.sleep(delay)

        # If all retries fail, raise an exception to be caught by the main loop
        raise RuntimeError(f"{name} failed after {self.retry_attempts} attempts.")

    async def update_data(self, data_type: str, **kwargs: Any) -> None:
        """
        Entry point to update all artifacts of a given type using a ThreadPoolExecutor.
        """
        self.logger.debug(f"Starting multi-threaded update for '{data_type}' with kwargs={kwargs}")

        # Reset state for this run
        self.completion_times.clear()
        self.failed.clear()
        self.original_classes = self.get_artifact_classes(data_type)

        # Sequentially estimate priorities and sort classes before execution
        self.logger.debug("Estimating priorities to order tasks...")
        ordered_classes = sorted(self.original_classes, key=self.estimate_priority)
        self.logger.debug("Priority estimation complete. Submitting tasks to thread pool.")

        start_time = time.monotonic()

        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            future_to_class_name = {
                executor.submit(self._update_artifact_with_retry, cls, kwargs): cls.__name__
                for cls in ordered_classes
            }

            for future in as_completed(future_to_class_name):
                name = future_to_class_name[future]
                try:
                    # result() will re-raise the exception from the worker if one occurred
                    future.result()
                    # If no exception, the task succeeded
                    self.completion_times[name] = time.monotonic() - start_time
                except Exception as e:
                    self.logger.error(f"✖️ {name} permanently failed. See error log above.")
                    self.failed.append(name)

        # Log final status
        total = len(self.original_classes)
        completed = len(self.completion_times)
        failed_count = len(self.failed)
        self.logger.info(f"All artifacts processed: total={total}, completed={completed}, failed={failed_count}")

    def get_update_status(self) -> Dict[str, Any]:
        """Returns a summary status including completion times."""
        completed_set = set(self.completion_times.keys())
        failed_set = set(self.failed)
        pending_set = {cls.__name__ for cls in self.original_classes} - completed_set - failed_set

        return {
            'total': len(self.original_classes),
            'completed': list(completed_set),
            'failed': list(failed_set),
            'pending': list(pending_set),
            'completion_times': self.completion_times,
        }

    @staticmethod
    def format_status_table(status: Dict[str, Any]) -> str:
        """Formats the status dictionary into a readable table."""
        lines = [
            f"Total: {status['total']}",
            f"Completed: {len(status['completed'])}",
            f"Failed: {len(status['failed'])}",
            f"Pending: {len(status['pending'])}",
            "\nPer-artifact completion times (seconds):"
        ]
        sorted_times = sorted(status['completion_times'].items(), key=lambda item: item[1], reverse=True)
        for name, duration in sorted_times:
            lines.append(f"  - {name:<30}: {duration:.2f}s")
        if status['failed']:
            lines.append("\nFailed artifacts:")
            for name in status['failed']:
                lines.append(f"  - {name}")
        return "\n".join(lines)


import asyncio
import datetime
import random
from typing import Any, Callable, Dict, List, Optional, Type

class ArtifactUpdaterMultiWrapperAsync(ManagedResource):
    """
    Simplified wrapper that updates artifacts concurrently using an asyncio.Semaphore.

    Features:
      - Caps concurrency at max_workers via semaphore
      - Optionally prioritises tasks via a priority function or static method on artifact classes
      - Tracks per-artifact completion times
      - Configurable retry/backoff strategy
      - Optional metrics integration
      - Thread-safe within a single asyncio loop

    Usage:
        wrapper = ArtifactUpdaterMultiWrapper(
            wrapped_classes={
                'mydata': [DataArtifactA, DataArtifactB],
            },
            max_workers=4,
            retry_attempts=3,
            update_timeout_seconds=600,
            backoff_base=2,
            backoff_max=60,
            backoff_jitter=0.1,
            priority_fn=None,  # or custom
            metrics_client=None,
            debug=True,
            logger=None,
            artifact_class_kwargs={
                'fs': my_fs,
                'parquet_storage_path': 's3://bucket/data',
                'logger': my_logger,
                'debug': True,
            }
        )
        await wrapper.update_data('mydata', period='ytd', overwrite=True)
    """
    def __init__(
        self,
        wrapped_classes: Dict[str, List[Type]],
        *,
        max_workers: int = 3,
        retry_attempts: int = 3,
        update_timeout_seconds: int = 600,
        backoff_base: int = 2,
        backoff_max: Optional[int] = 60,
        backoff_jitter: float = 0.1,
        priority_fn: Optional[Callable[[Type], int]] = None,
        metrics_client: Any = None,
        artifact_class_kwargs: Optional[Dict[str, Any]] = None,
        **kwargs: Dict[str, Any]
    ) -> None:
        super().__init__(**kwargs)
        self.wrapped_classes = wrapped_classes
        self.max_workers = max_workers
        self.retry_attempts = retry_attempts
        self.update_timeout_seconds = update_timeout_seconds
        self.backoff_base = backoff_base
        self.backoff_max = backoff_max
        self.backoff_jitter = backoff_jitter
        self.priority_fn = priority_fn
        self.metrics_client = metrics_client

        # Default artifact init kwargs
        today = datetime.datetime.today() + datetime.timedelta(days=1)
        default_kwargs = {
            'parquet_start_date': today.strftime('%Y-%m-%d'),
            'parquet_end_date': today.strftime('%Y-%m-%d'),
            'logger': self.logger,
            'debug': self.debug,
            'fs': self.fs,
            'verbose': self.verbose,
        }
        self.artifact_class_kwargs = artifact_class_kwargs or default_kwargs.copy()

        # State
        self.completion_times: Dict[str, float] = {}
        self.failed: List[str] = []
        self.original_classes: List[Type] = []

    def get_artifact_classes(self, data_type: str) -> List[Type]:
        """
        Retrieve artifact classes by data type.
        """
        self.logger.info(f"Fetching artifact classes for '{data_type}'")
        if data_type not in self.wrapped_classes:
            raise ValueError(f"Unsupported data type: {data_type}")
        classes = self.wrapped_classes[data_type]
        self.logger.info(f"Found {len(classes)} artifact classes for '{data_type}'")
        return classes

    def estimate_priority(self, artifact_cls: Type) -> int:
        """
        Determine task priority for ordering. Lower values run first.
        """
        name = artifact_cls.__name__
        if self.priority_fn:
            try:
                pr = self.priority_fn(artifact_cls)
                self.logger.debug(f"priority_fn for {name}: {pr}")
                return pr
            except Exception as e:
                self.logger.warning(f"priority_fn error for {name}: {e}")
        try:
            fs = self.artifact_class_kwargs.get('fs')
            path = self.artifact_class_kwargs.get('parquet_storage_path')
            pr=1
            if hasattr(artifact_cls, 'get_size_estimate'):
                pr = artifact_cls.get_size_estimate(fs, path)
            self.logger.debug(f"Estimated priority for {name}: {pr}")
            return pr
        except Exception:
            return 1

    async def _bounded_update(self, artifact_cls: Type, sem: asyncio.Semaphore, **update_kwargs) -> None:
        """
        Wrap update_artifact in a semaphore slot to limit concurrency.
        """
        async with sem:
            name = artifact_cls.__name__
            start = asyncio.get_event_loop().time()
            self.logger.info(f"Starting update for {name}")
            try:
                for attempt in range(1, self.retry_attempts + 1):
                    try:
                        artifact = await asyncio.to_thread(
                            artifact_cls, **self.artifact_class_kwargs
                        )
                        await asyncio.wait_for(
                            asyncio.to_thread(
                                artifact.update_parquet, **update_kwargs
                            ),
                            timeout=self.update_timeout_seconds
                        )
                        duration = asyncio.get_event_loop().time() - start
                        self.completion_times[name] = duration
                        self.logger.info(f"✅ {name} updated in {duration:.2f}s (attempt {attempt})")
                        if self.metrics_client:
                            self.metrics_client.increment('task_succeeded')
                        return
                    except asyncio.TimeoutError:
                        self.logger.warning(f"Timeout on {name}, attempt {attempt}")
                    except Exception as e:
                        self.logger.error(f"Error on {name} attempt {attempt}: {e}")

                    delay = min(self.backoff_base ** (attempt - 1), self.backoff_max)
                    delay *= 1 + random.uniform(0, self.backoff_jitter)
                    self.logger.info(f"Sleeping {delay:.1f}s before retrying {name}")
                    await asyncio.sleep(delay)

            except asyncio.CancelledError:
                self.logger.warning(f"{name} update cancelled")
                raise

            # permanent failure
            self.logger.error(f"✖️ {name} permanently failed after {self.retry_attempts} attempts")
            if self.metrics_client:
                self.metrics_client.increment('task_failed')
            self.failed.append(name)

    async def update_data(self, data_type: str, **kwargs: Any) -> None:
        """
        Entry point to update all artifacts of a given type concurrently.
        """
        self.logger.info(f"Starting update_data for '{data_type}' with kwargs={kwargs}")

        # RESET STATE
        self.completion_times.clear()
        self.failed.clear()
        self.original_classes = self.get_artifact_classes(data_type)

        # NON-DESTRUCTIVE SORTING
        ordered = sorted(self.original_classes, key=self.estimate_priority)

        sem = asyncio.Semaphore(self.max_workers)
        tasks = [
            asyncio.create_task(self._bounded_update(cls, sem, **kwargs))
            for cls in ordered
        ]

        try:
            for coro in asyncio.as_completed(tasks):
                await coro
        except asyncio.CancelledError:
            self.logger.warning("update_data was cancelled—aborting remaining retries")
            for t in tasks:
                t.cancel()
            raise
        finally:
            total = len(self.original_classes)
            completed = len(self.completion_times)
            failed = len(self.failed)
            self.logger.info(f"All artifacts processed: total={total}, completed={completed}, failed={failed}")

    def get_update_status(self) -> Dict[str, Any]:
        """
        Returns summary status including completion times.
        """
        total = len(self.original_classes)
        completed = set(self.completion_times.keys())
        failed = set(self.failed)
        pending = {cls.__name__ for cls in self.original_classes} - completed - failed

        return {
            'total': total,
            'completed': list(completed),
            'failed': list(failed),
            'pending': list(pending),
            'completion_times': self.completion_times,
        }

    @staticmethod
    def format_status_table(status: Dict[str, Any]) -> str:
        """
        Formats the status dict into a readable table.
        """
        lines = [
            f"Total: {status['total']}",
            f"Completed: {len(status['completed'])} {status['completed']}",
            f"Failed: {len(status['failed'])} {status['failed']}",
            f"Pending: {len(status['pending'])} {status['pending']}",
            "",
            "Per-artifact timings:"
        ]
        for name, dur in status['completion_times'].items():
            lines.append(f"  {name}: {dur:.2f}s")
        return "\n".join(lines)
```
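Both classes above compute their retry delay the same way: exponential backoff capped at `backoff_max`, plus up to `backoff_jitter` of proportional random jitter. A minimal, self-contained illustration of that schedule with the default settings shown in the code:

```python
import random

# Defaults used by both wrappers: backoff_base=2, backoff_max=60, backoff_jitter=0.1
backoff_base, backoff_max, backoff_jitter = 2, 60, 0.1

for attempt in range(1, 6):
    delay = min(backoff_base ** (attempt - 1), backoff_max)  # 1, 2, 4, 8, 16 ... capped at 60
    delay *= 1 + random.uniform(0, backoff_jitter)           # add up to 10% jitter
    print(f"attempt {attempt}: sleep ~{delay:.1f}s before retrying")
```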
sibi_dst/df_helper/_parquet_reader.py (+4 -18)

```diff
@@ -1,4 +1,3 @@
-import logging
 from typing import Optional, ClassVar, Dict
 
 import dask.dataframe as dd
@@ -47,15 +46,13 @@ class ParquetReader(DfHelper):
         'backend': 'parquet'
     }
 
-    def __init__(self,
+    def __init__(self, **kwargs):
         self.config = {
             **self.DEFAULT_CONFIG,
             **kwargs,
         }
-        self.
-
-        #self.logger = self.config.setdefault('logger', Logger.default_logger(logger_name=self.__class__.__name__))
-        #self.logger.set_level(logging.DEBUG if self.debug else logging.INFO)
+        super().__init__(**self.config)
+
         self.parquet_storage_path = self.config.setdefault('parquet_storage_path', None)
         if self.parquet_storage_path is None:
             raise ValueError('parquet_storage_path must be set')
@@ -67,19 +64,9 @@ class ParquetReader(DfHelper):
         if self.parquet_end_date is None:
             raise ValueError('parquet_end_date must be set')
 
-        # Filesystem setup
-        #self.filesystem_type = filesystem_type
-        #self.filesystem_options = filesystem_options or {}
-        #self.fs = self.config.setdefault('fs', None)
-        #if self.fs is None:
-        #    self.fs = fsspec.filesystem(self.filesystem_type, **self.filesystem_options)
-        #self.config.setdefault('fs', self.fs)
-
         if not self.directory_exists():
             raise ValueError(f"{self.parquet_storage_path} does not exist")
 
-        super().__init__(**self.config)
-
     def load(self, **kwargs):
         self.df = super().load(**kwargs)
         return self.df
@@ -89,5 +76,4 @@ class ParquetReader(DfHelper):
             info = self.fs.info(self.parquet_storage_path)
             return info['type'] == 'directory'
         except FileNotFoundError:
-            return False
-
+            return False
```
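After this refactor the reader takes a single `**kwargs` mapping and hands everything to `DfHelper` via `super().__init__(**self.config)` before validating the storage path and date range. A hedged sketch of a call site follows; the path, dates, and `my_fs` filesystem are placeholders, and the full set of accepted keys is defined by `DfHelper`, not shown in this diff:

```python
from sibi_dst.df_helper import ParquetReader

reader = ParquetReader(
    parquet_storage_path="s3://bucket/warehouse/orders",  # must exist, or ValueError is raised
    parquet_start_date="2025-01-01",
    parquet_end_date="2025-01-31",
    fs=my_fs,  # placeholder fsspec filesystem
    debug=False,
)
df = reader.load()  # delegates to DfHelper.load(**kwargs)
```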
sibi_dst/utils/data_wrapper.py (+33 -30)

```diff
@@ -1,16 +1,13 @@
 import datetime
-import logging
 import threading
 import time
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from typing import Type, Any, Dict, Optional, Union, List, ClassVar
 
-import fsspec
 import pandas as pd
 from tqdm import tqdm
 
 from . import ManagedResource
-from .log_utils import Logger
 from .parquet_saver import ParquetSaver
 
 
@@ -60,8 +57,7 @@ class DataWrapper(ManagedResource):
         self.processed_dates: List[datetime.date] = []
         self.benchmarks: Dict[datetime.date, Dict[str, float]] = {}
         self.mmanifest = kwargs.get("mmanifest", None)
-        self.update_planner=kwargs.get("update_planner", None)
-
+        self.update_planner = kwargs.get("update_planner", None)
 
     def __exit__(self, exc_type, exc_val, exc_tb):
         """Context manager exit"""
@@ -104,7 +100,6 @@ class DataWrapper(ManagedResource):
         if self.update_planner.show_progress:
             self.show_benchmark_summary()
 
-
     def _execute_task_batch(self, priority: int, dates: List[datetime.date], max_retries: int):
         """Executes a single batch of tasks (dates) using a thread pool."""
         desc = f"Processing {self.dataclass.__name__}, priority: {priority}"
@@ -135,7 +130,7 @@ class DataWrapper(ManagedResource):
                         time.sleep(2 ** attempt)  # Exponential backoff
                     else:
                         self.logger.error(f"Failed processing {date} after {max_retries} attempts.")
-                        #raise
+                        # raise
 
     def _process_single_date(self, date: datetime.date):
         """Core date processing logic with load/save timing and thread reporting"""
@@ -146,8 +141,8 @@ class DataWrapper(ManagedResource):
             return
         full_path = f"{path}{self.parquet_filename}"
 
-        #thread_name = threading.current_thread().name
-        #self.logger.debug(f"[{thread_name}] Executing date: {date} -> saving to: {full_path}")
+        # thread_name = threading.current_thread().name
+        # self.logger.debug(f"[{thread_name}] Executing date: {date} -> saving to: {full_path}")
 
         overall_start = time.perf_counter()
         try:
@@ -159,35 +154,43 @@ class DataWrapper(ManagedResource):
             local_load_params = self.load_params.copy()
             local_load_params.update(date_filter)
             local_class_instance = self.dataclass(**self.class_params)
-            df=local_class_instance.load(**local_load_params)
+            df = local_class_instance.load(**local_load_params)
             load_time = time.perf_counter() - load_start
 
             if hasattr(local_class_instance, "total_records"):
-                self.logger.debug(
+                self.logger.debug(
+                    f"Total records loaded by {local_class_instance.__class__.__name__}: {local_class_instance.total_records}")
                 if int(local_class_instance.total_records) == 0:  # If no records were loaded but not due to an error
                     if self.mmanifest:
                         self.mmanifest.record(
                             full_path=path
                         )
-                    self.logger.info(f"No data found for {
-                    [removed lines 173-190 are blank or truncated in the source diff view]
+                    self.logger.info(f"No data found for {full_path}. Logged to missing manifest.")
+                elif int(local_class_instance.total_records) < 0:
+                    self.logger.warning(
+                        f"Negative record count ({local_class_instance.total_records}) for {full_path}. "
+                        "This may indicate an error in the data loading process."
+                    )
+                else:
+                    save_start = time.perf_counter()
+                    parquet_params ={
+                        "df_result": df,
+                        "parquet_storage_path": path,
+                        "fs": self.fs,
+                        "logger": self.logger,
+                        "debug": self.debug,
+                    }
+                    with ParquetSaver(**parquet_params) as ps:
+                        ps.save_to_parquet(self.parquet_filename, overwrite=True)
+                    save_time = time.perf_counter() - save_start
+
+                    total_time = time.perf_counter() - overall_start
+                    self.benchmarks[date] = {
+                        "load_duration": load_time,
+                        "save_duration": save_time,
+                        "total_duration": total_time
+                    }
+                    self._log_success(date, total_time, full_path)
         except Exception as e:
             self._log_failure(date, e)
             raise
```
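The rewritten save path in `_process_single_date` now builds a parameter dict and uses `ParquetSaver` as a context manager. A hedged standalone sketch of that pattern is below; `df`, `path`, `fs`, and `logger` stand in for the per-date values DataWrapper computes, and the filename is illustrative only:

```python
from sibi_dst.utils.parquet_saver import ParquetSaver

parquet_params = {
    "df_result": df,               # DataFrame returned by the artifact's load()
    "parquet_storage_path": path,  # target directory for this date partition
    "fs": fs,                      # fsspec filesystem (required by ParquetSaver)
    "logger": logger,
    "debug": False,
}
with ParquetSaver(**parquet_params) as ps:
    ps.save_to_parquet("data.parquet", overwrite=True)
```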
sibi_dst/utils/parquet_saver.py (+0 -4)

```diff
@@ -26,10 +26,6 @@ class ParquetSaver(ManagedResource):
         super().__init__(**kwargs)
         self.df_result = df_result
         self.parquet_storage_path = parquet_storage_path.rstrip("/")
-        #self.debug = debug
-        #self.logger = logger or Logger.default_logger(self.__class__.__name__)
-        #self.logger.set_level(logging.DEBUG if self.debug else logging.INFO)
-        #self.fs = fs
         # Determine protocol for special handling (e.g., 's3')
         if not self.fs:
             raise ValueError("File system (fs) must be provided to ParquetSaver.")
```