sibi-dst 0.3.58__py3-none-any.whl → 0.3.60__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sibi_dst/df_helper/_artifact_updater_multi_wrapper.py +211 -233
- sibi_dst/df_helper/_df_helper.py +7 -3
- sibi_dst/df_helper/_parquet_artifact.py +143 -52
- sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +3 -3
- sibi_dst/utils/__init__.py +6 -3
- sibi_dst/utils/data_wrapper.py +149 -140
- sibi_dst/utils/date_utils.py +8 -8
- sibi_dst/utils/log_utils.py +1 -1
- sibi_dst/utils/manifest_manager.py +154 -0
- sibi_dst/utils/storage_config.py +59 -1
- sibi_dst/utils/update_planner.py +96 -85
- {sibi_dst-0.3.58.dist-info → sibi_dst-0.3.60.dist-info}/METADATA +1 -1
- {sibi_dst-0.3.58.dist-info → sibi_dst-0.3.60.dist-info}/RECORD +14 -13
- {sibi_dst-0.3.58.dist-info → sibi_dst-0.3.60.dist-info}/WHEEL +0 -0
@@ -1,261 +1,239 @@
|
|
1
1
|
import asyncio
|
2
2
|
import logging
|
3
3
|
import datetime
|
4
|
-
import
|
5
|
-
import
|
6
|
-
from functools import total_ordering
|
7
|
-
from collections import defaultdict
|
8
|
-
from contextlib import asynccontextmanager
|
9
|
-
import signal
|
10
|
-
from sibi_dst.utils import Logger
|
11
|
-
|
12
|
-
@total_ordering
|
13
|
-
class PrioritizedItem:
|
14
|
-
def __init__(self, priority, artifact):
|
15
|
-
self.priority = priority
|
16
|
-
self.artifact = artifact
|
4
|
+
import random
|
5
|
+
from typing import Any, Callable, Dict, List, Optional, Type
|
17
6
|
|
18
|
-
|
19
|
-
return self.priority < other.priority
|
7
|
+
from sibi_dst.utils import Logger
|
20
8
|
|
21
|
-
def __eq__(self, other):
|
22
|
-
return self.priority == other.priority
|
23
9
|
|
24
10
|
class ArtifactUpdaterMultiWrapper:
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
11
|
+
"""
|
12
|
+
Simplified wrapper that updates artifacts concurrently using an asyncio.Semaphore.
|
13
|
+
|
14
|
+
Features:
|
15
|
+
- Caps concurrency at max_workers via semaphore
|
16
|
+
- Optionally prioritises tasks via a priority function or static method on artifact classes
|
17
|
+
- Tracks per-artifact completion times
|
18
|
+
- Configurable retry/backoff strategy
|
19
|
+
- Optional metrics integration
|
20
|
+
- Thread-safe within a single asyncio loop
|
21
|
+
|
22
|
+
Usage:
|
23
|
+
wrapper = ArtifactUpdaterMultiWrapper(
|
24
|
+
wrapped_classes={
|
25
|
+
'mydata': [DataArtifactA, DataArtifactB],
|
26
|
+
},
|
27
|
+
max_workers=4,
|
28
|
+
retry_attempts=3,
|
29
|
+
update_timeout_seconds=600,
|
30
|
+
backoff_base=2,
|
31
|
+
backoff_max=60,
|
32
|
+
backoff_jitter=0.1,
|
33
|
+
priority_fn=None, # or custom
|
34
|
+
metrics_client=None,
|
35
|
+
debug=True,
|
36
|
+
logger=None,
|
37
|
+
artifact_class_kwargs={
|
38
|
+
'fs': my_fs,
|
39
|
+
'parquet_storage_path': 's3://bucket/data',
|
40
|
+
'logger': my_logger,
|
41
|
+
'debug': True,
|
42
|
+
}
|
30
43
|
)
|
31
|
-
|
44
|
+
await wrapper.update_data('mydata', period='ytd', overwrite=True)
|
45
|
+
"""
|
46
|
+
def __init__(
|
47
|
+
self,
|
48
|
+
wrapped_classes: Dict[str, List[Type]],
|
49
|
+
*,
|
50
|
+
max_workers: int = 3,
|
51
|
+
retry_attempts: int = 3,
|
52
|
+
update_timeout_seconds: int = 600,
|
53
|
+
backoff_base: int = 2,
|
54
|
+
backoff_max: Optional[int] = 60,
|
55
|
+
backoff_jitter: float = 0.1,
|
56
|
+
priority_fn: Optional[Callable[[Type], int]] = None,
|
57
|
+
metrics_client: Any = None,
|
58
|
+
debug: bool = False,
|
59
|
+
logger: Optional[logging.Logger] = None,
|
60
|
+
artifact_class_kwargs: Optional[Dict[str, Any]] = None,
|
61
|
+
) -> None:
|
62
|
+
self.wrapped_classes = wrapped_classes
|
63
|
+
self.max_workers = max_workers
|
64
|
+
self.retry_attempts = retry_attempts
|
65
|
+
self.update_timeout_seconds = update_timeout_seconds
|
66
|
+
self.backoff_base = backoff_base
|
67
|
+
self.backoff_max = backoff_max
|
68
|
+
self.backoff_jitter = backoff_jitter
|
69
|
+
self.priority_fn = priority_fn
|
70
|
+
self.metrics_client = metrics_client
|
32
71
|
|
33
|
-
|
34
|
-
self.
|
35
|
-
|
36
|
-
|
37
|
-
)
|
38
|
-
self.parquet_end_date = kwargs.get(
|
39
|
-
'parquet_end_date',
|
40
|
-
today.strftime('%Y-%m-%d')
|
72
|
+
self.debug = debug
|
73
|
+
self.logger = logger or Logger.default_logger(
|
74
|
+
logger_name=self.__class__.__name__,
|
75
|
+
log_level=Logger.DEBUG if debug else Logger.INFO
|
41
76
|
)
|
42
77
|
|
43
|
-
#
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
self.
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
self.
|
56
|
-
self.
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
async def get_lock_for_artifact(self, artifact):
|
64
|
-
key = artifact.__class__.__name__
|
65
|
-
async with self.locks_lock:
|
66
|
-
if key not in self.locks:
|
67
|
-
self.locks[key] = asyncio.Lock()
|
68
|
-
return self.locks[key]
|
69
|
-
|
70
|
-
def get_artifacts(self, data_type):
|
78
|
+
# Default artifact init kwargs
|
79
|
+
today = datetime.datetime.today() + datetime.timedelta(days=1)
|
80
|
+
default_kwargs = {
|
81
|
+
'parquet_start_date': today.strftime('%Y-%m-%d'),
|
82
|
+
'parquet_end_date': today.strftime('%Y-%m-%d'),
|
83
|
+
'logger': self.logger,
|
84
|
+
'debug': self.debug,
|
85
|
+
}
|
86
|
+
self.artifact_class_kwargs = artifact_class_kwargs or default_kwargs.copy()
|
87
|
+
|
88
|
+
# State
|
89
|
+
self.completion_times: Dict[str, float] = {}
|
90
|
+
self.failed: List[str] = []
|
91
|
+
self.original_classes: List[Type] = []
|
92
|
+
|
93
|
+
def get_artifact_classes(self, data_type: str) -> List[Type]:
|
94
|
+
"""
|
95
|
+
Retrieve artifact classes by data type.
|
96
|
+
"""
|
97
|
+
self.logger.info(f"Fetching artifact classes for '{data_type}'")
|
71
98
|
if data_type not in self.wrapped_classes:
|
72
99
|
raise ValueError(f"Unsupported data type: {data_type}")
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
self.
|
83
|
-
return artifacts
|
84
|
-
|
85
|
-
def estimate_complexity(self, artifact):
|
86
|
-
try:
|
87
|
-
return artifact.get_size_estimate()
|
88
|
-
except Exception:
|
89
|
-
return 1
|
90
|
-
|
91
|
-
def prioritize_tasks(self, artifacts):
|
92
|
-
queue = asyncio.PriorityQueue()
|
93
|
-
for art in artifacts:
|
94
|
-
queue.put_nowait(PrioritizedItem(self.estimate_complexity(art), art))
|
95
|
-
return queue
|
96
|
-
|
97
|
-
async def resource_monitor(self, queue, workers):
|
98
|
-
while not queue.empty():
|
100
|
+
classes = self.wrapped_classes[data_type]
|
101
|
+
self.logger.info(f"Found {len(classes)} artifact classes for '{data_type}'")
|
102
|
+
return classes
|
103
|
+
|
104
|
+
def estimate_priority(self, artifact_cls: Type) -> int:
|
105
|
+
"""
|
106
|
+
Determine task priority for ordering. Lower values run first.
|
107
|
+
"""
|
108
|
+
name = artifact_cls.__name__
|
109
|
+
if self.priority_fn:
|
99
110
|
try:
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
min(psutil.cpu_count(), max_by_mem, self.max_workers))
|
104
|
-
async with self.workers_lock:
|
105
|
-
current = len(workers)
|
106
|
-
if optimal > current:
|
107
|
-
for _ in range(optimal - current):
|
108
|
-
wid = len(workers)
|
109
|
-
workers.append(asyncio.create_task(self.worker(queue, wid)))
|
110
|
-
self.logger.info(f"Added worker {wid}")
|
111
|
-
elif optimal < current:
|
112
|
-
for _ in range(current - optimal):
|
113
|
-
w = workers.pop()
|
114
|
-
w.cancel()
|
115
|
-
self.logger.info("Removed a worker")
|
116
|
-
await asyncio.sleep(self.monitor_interval)
|
117
|
-
except asyncio.CancelledError:
|
118
|
-
break
|
111
|
+
pr = self.priority_fn(artifact_cls)
|
112
|
+
self.logger.debug(f"priority_fn for {name}: {pr}")
|
113
|
+
return pr
|
119
114
|
except Exception as e:
|
120
|
-
self.logger.
|
121
|
-
await asyncio.sleep(self.monitor_interval)
|
122
|
-
|
123
|
-
@asynccontextmanager
|
124
|
-
async def artifact_lock(self, artifact):
|
125
|
-
lock = await self.get_lock_for_artifact(artifact)
|
115
|
+
self.logger.warning(f"priority_fn error for {name}: {e}")
|
126
116
|
try:
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
if
|
131
|
-
|
117
|
+
fs = self.artifact_class_kwargs.get('fs')
|
118
|
+
path = self.artifact_class_kwargs.get('parquet_storage_path')
|
119
|
+
pr=1
|
120
|
+
if hasattr(artifact_cls, 'get_size_estimate'):
|
121
|
+
pr = artifact_cls.get_size_estimate(fs, path)
|
122
|
+
self.logger.debug(f"Estimated priority for {name}: {pr}")
|
123
|
+
return pr
|
124
|
+
except Exception:
|
125
|
+
return 1
|
132
126
|
|
133
|
-
async def
|
134
|
-
|
135
|
-
|
127
|
+
async def _bounded_update(self, artifact_cls: Type, sem: asyncio.Semaphore, **update_kwargs) -> None:
|
128
|
+
"""
|
129
|
+
Wrap update_artifact in a semaphore slot to limit concurrency.
|
130
|
+
"""
|
131
|
+
async with sem:
|
132
|
+
name = artifact_cls.__name__
|
133
|
+
start = asyncio.get_event_loop().time()
|
134
|
+
self.logger.info(f"Starting update for {name}")
|
136
135
|
try:
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
f"{
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
lock.release()
|
164
|
-
await asyncio.sleep(2 ** (attempt - 1))
|
136
|
+
for attempt in range(1, self.retry_attempts + 1):
|
137
|
+
try:
|
138
|
+
artifact = await asyncio.to_thread(
|
139
|
+
artifact_cls, **self.artifact_class_kwargs
|
140
|
+
)
|
141
|
+
await asyncio.wait_for(
|
142
|
+
asyncio.to_thread(
|
143
|
+
artifact.update_parquet, **update_kwargs
|
144
|
+
),
|
145
|
+
timeout=self.update_timeout_seconds
|
146
|
+
)
|
147
|
+
duration = asyncio.get_event_loop().time() - start
|
148
|
+
self.completion_times[name] = duration
|
149
|
+
self.logger.info(f"✅ {name} updated in {duration:.2f}s (attempt {attempt})")
|
150
|
+
if self.metrics_client:
|
151
|
+
self.metrics_client.increment('task_succeeded')
|
152
|
+
return
|
153
|
+
except asyncio.TimeoutError:
|
154
|
+
self.logger.warning(f"Timeout on {name}, attempt {attempt}")
|
155
|
+
except Exception as e:
|
156
|
+
self.logger.error(f"Error on {name} attempt {attempt}: {e}")
|
157
|
+
|
158
|
+
delay = min(self.backoff_base ** (attempt - 1), self.backoff_max)
|
159
|
+
delay *= 1 + random.uniform(0, self.backoff_jitter)
|
160
|
+
self.logger.info(f"Sleeping {delay:.1f}s before retrying {name}")
|
161
|
+
await asyncio.sleep(delay)
|
165
162
|
|
166
|
-
# all retries exhausted -> mark failure
|
167
|
-
async with self.workers_lock:
|
168
|
-
self.pending.discard(artifact)
|
169
|
-
self.failed.add(artifact)
|
170
|
-
self.logger.error(f"✖️ Permanently failed {artifact.__class__.__name__}")
|
171
|
-
|
172
|
-
async def worker(self, queue, worker_id, **kwargs):
|
173
|
-
while True:
|
174
|
-
try:
|
175
|
-
item = await queue.get()
|
176
|
-
art = item.artifact
|
177
|
-
self.worker_heartbeat[worker_id] = time.time()
|
178
|
-
await self.async_update_artifact(art, **kwargs)
|
179
163
|
except asyncio.CancelledError:
|
180
|
-
self.logger.
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
monitor.cancel()
|
200
|
-
for t in tasks:
|
201
|
-
t.cancel()
|
202
|
-
await asyncio.gather(*tasks, return_exceptions=True)
|
203
|
-
self.logger.info(self.format_results_table())
|
204
|
-
self.logger.info("All artifacts processed.")
|
205
|
-
|
206
|
-
def format_results_table(self):
|
207
|
-
results = self.get_update_status()
|
208
|
-
headers = ["Metric", "Value"]
|
209
|
-
rows = [
|
210
|
-
["Total", results['total']],
|
211
|
-
["Completed", results['completed']],
|
212
|
-
["Pending", results['pending']],
|
213
|
-
["Failed", results['failed']],
|
214
|
-
["Pending Items", len(results['pending_items'])],
|
215
|
-
["Failed Items", len(results['failed_items'])]
|
216
|
-
]
|
217
|
-
|
218
|
-
# Find max lengths for alignment
|
219
|
-
max_metric = max(len(str(row[0])) for row in rows)
|
220
|
-
max_value = max(len(str(row[1])) for row in rows)
|
164
|
+
self.logger.warning(f"{name} update cancelled")
|
165
|
+
raise
|
166
|
+
|
167
|
+
# permanent failure
|
168
|
+
self.logger.error(f"✖️ {name} permanently failed after {self.retry_attempts} attempts")
|
169
|
+
if self.metrics_client:
|
170
|
+
self.metrics_client.increment('task_failed')
|
171
|
+
self.failed.append(name)
|
172
|
+
|
173
|
+
async def update_data(self, data_type: str, **kwargs: Any) -> None:
|
174
|
+
"""
|
175
|
+
Entry point to update all artifacts of a given type concurrently.
|
176
|
+
"""
|
177
|
+
self.logger.info(f"Starting update_data for '{data_type}' with kwargs={kwargs}")
|
178
|
+
|
179
|
+
# RESET STATE
|
180
|
+
self.completion_times.clear()
|
181
|
+
self.failed.clear()
|
182
|
+
self.original_classes = self.get_artifact_classes(data_type)
|
221
183
|
|
222
|
-
|
184
|
+
# NON-DESTRUCTIVE SORTING
|
185
|
+
ordered = sorted(self.original_classes, key=self.estimate_priority)
|
223
186
|
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
|
187
|
+
sem = asyncio.Semaphore(self.max_workers)
|
188
|
+
tasks = [
|
189
|
+
asyncio.create_task(self._bounded_update(cls, sem, **kwargs))
|
190
|
+
for cls in ordered
|
228
191
|
]
|
229
192
|
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
193
|
+
try:
|
194
|
+
for coro in asyncio.as_completed(tasks):
|
195
|
+
await coro
|
196
|
+
except asyncio.CancelledError:
|
197
|
+
self.logger.warning("update_data was cancelled—aborting remaining retries")
|
198
|
+
for t in tasks:
|
199
|
+
t.cancel()
|
200
|
+
raise
|
201
|
+
finally:
|
202
|
+
total = len(self.original_classes)
|
203
|
+
completed = len(self.completion_times)
|
204
|
+
failed = len(self.failed)
|
205
|
+
self.logger.info(f"All artifacts processed: total={total}, completed={completed}, failed={failed}")
|
206
|
+
|
207
|
+
def get_update_status(self) -> Dict[str, Any]:
|
208
|
+
"""
|
209
|
+
Returns summary status including completion times.
|
210
|
+
"""
|
211
|
+
total = len(self.original_classes)
|
212
|
+
completed = set(self.completion_times.keys())
|
213
|
+
failed = set(self.failed)
|
214
|
+
pending = {cls.__name__ for cls in self.original_classes} - completed - failed
|
234
215
|
|
235
|
-
def get_update_status(self):
|
236
|
-
total = len(self.pending) + len(self.completed) + len(self.failed)
|
237
216
|
return {
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
"failed_items": [a.__class__.__name__ for a in self.failed]
|
217
|
+
'total': total,
|
218
|
+
'completed': list(completed),
|
219
|
+
'failed': list(failed),
|
220
|
+
'pending': list(pending),
|
221
|
+
'completion_times': self.completion_times,
|
244
222
|
}
|
245
223
|
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
|
250
|
-
|
251
|
-
|
252
|
-
|
253
|
-
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
|
258
|
-
|
259
|
-
|
260
|
-
|
261
|
-
|
224
|
+
@staticmethod
|
225
|
+
def format_status_table(status: Dict[str, Any]) -> str:
|
226
|
+
"""
|
227
|
+
Formats the status dict into a readable table.
|
228
|
+
"""
|
229
|
+
lines = [
|
230
|
+
f"Total: {status['total']}",
|
231
|
+
f"Completed: {len(status['completed'])} {status['completed']}",
|
232
|
+
f"Failed: {len(status['failed'])} {status['failed']}",
|
233
|
+
f"Pending: {len(status['pending'])} {status['pending']}",
|
234
|
+
"",
|
235
|
+
"Per-artifact timings:"
|
236
|
+
]
|
237
|
+
for name, dur in status['completion_times'].items():
|
238
|
+
lines.append(f" {name}: {dur:.2f}s")
|
239
|
+
return "\n".join(lines)
|
sibi_dst/df_helper/_df_helper.py
CHANGED
@@ -46,8 +46,7 @@ class DfHelper:
|
|
46
46
|
|
47
47
|
:ivar df: The DataFrame currently being processed or loaded.
|
48
48
|
:type df: Union[dd.DataFrame, pd.DataFrame]
|
49
|
-
:
|
50
|
-
:type backend_connection: Optional[DjangoConnectionConfig]
|
49
|
+
:type backend_connection: Optional[DjangoConnectionConfig | SqlAlchemyConnectionConfig]
|
51
50
|
:ivar _backend_query: Internal configuration for query handling.
|
52
51
|
:type _backend_query: Optional[QueryConfig]
|
53
52
|
:ivar _backend_params: Internal parameters configuration for DataFrame handling.
|
@@ -81,9 +80,10 @@ class DfHelper:
|
|
81
80
|
self.debug = kwargs.setdefault("debug", False)
|
82
81
|
self.logger = kwargs.get("logger", Logger.default_logger(logger_name=self.__class__.__name__))
|
83
82
|
# Configure logger level
|
84
|
-
self.logger.set_level(
|
83
|
+
self.logger.set_level(Logger.DEBUG if self.debug else Logger.INFO)
|
85
84
|
self.logger.debug("Logger initialized in DEBUG mode.")
|
86
85
|
self.parquet_storage_path = kwargs.setdefault("parquet_storage_path", None)
|
86
|
+
self.parquet_filename = kwargs.setdefault("parquet_filename", None)
|
87
87
|
self.dt_field = kwargs.setdefault("dt_field", None)
|
88
88
|
self.as_pandas = kwargs.setdefault("as_pandas", False)
|
89
89
|
self.filesystem = kwargs.pop('filesystem', 'file')
|
@@ -429,7 +429,11 @@ class DfHelper:
|
|
429
429
|
the instance's attribute for storage path.
|
430
430
|
:return: None
|
431
431
|
"""
|
432
|
+
if self.df.map_partitions(len).compute().sum() == 0:
|
433
|
+
self.logger.debug("Cannot save to parquet since DataFrame is empty")
|
434
|
+
return
|
432
435
|
fs = kwargs.pop('fs', self.fs)
|
436
|
+
parquet_filename = parquet_filename or self.parquet_filename
|
433
437
|
parquet_storage_path = kwargs.pop('parquet_storage_path', self.parquet_storage_path)
|
434
438
|
ps = ParquetSaver(df_result=self.df, parquet_storage_path=parquet_storage_path, logger=self.logger, fs=fs)
|
435
439
|
ps.save_to_parquet(parquet_filename)
|