sibi-dst 0.3.58__py3-none-any.whl → 0.3.59__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,261 +1,239 @@
  import asyncio
  import logging
  import datetime
- import psutil
- import time
- from functools import total_ordering
- from collections import defaultdict
- from contextlib import asynccontextmanager
- import signal
- from sibi_dst.utils import Logger
-
- @total_ordering
- class PrioritizedItem:
-     def __init__(self, priority, artifact):
-         self.priority = priority
-         self.artifact = artifact
+ import random
+ from typing import Any, Callable, Dict, List, Optional, Type

-     def __lt__(self, other):
-         return self.priority < other.priority
+ from sibi_dst.utils import Logger

-     def __eq__(self, other):
-         return self.priority == other.priority

  class ArtifactUpdaterMultiWrapper:
-     def __init__(self, wrapped_classes=None, debug=False, **kwargs):
-         self.wrapped_classes = wrapped_classes or {}
-         self.debug = debug
-         self.logger = kwargs.setdefault(
-             'logger', Logger.default_logger(logger_name=self.__class__.__name__)
+     """
+     Simplified wrapper that updates artifacts concurrently using an asyncio.Semaphore.
+
+     Features:
+       - Caps concurrency at max_workers via semaphore
+       - Optionally prioritises tasks via a priority function or static method on artifact classes
+       - Tracks per-artifact completion times
+       - Configurable retry/backoff strategy
+       - Optional metrics integration
+       - Thread-safe within a single asyncio loop
+
+     Usage:
+         wrapper = ArtifactUpdaterMultiWrapper(
+             wrapped_classes={
+                 'mydata': [DataArtifactA, DataArtifactB],
+             },
+             max_workers=4,
+             retry_attempts=3,
+             update_timeout_seconds=600,
+             backoff_base=2,
+             backoff_max=60,
+             backoff_jitter=0.1,
+             priority_fn=None,  # or custom
+             metrics_client=None,
+             debug=True,
+             logger=None,
+             artifact_class_kwargs={
+                 'fs': my_fs,
+                 'parquet_storage_path': 's3://bucket/data',
+                 'logger': my_logger,
+                 'debug': True,
+             }
          )
-         self.logger.set_level(logging.DEBUG if debug else logging.INFO)
+         await wrapper.update_data('mydata', period='ytd', overwrite=True)
+     """
+     def __init__(
+         self,
+         wrapped_classes: Dict[str, List[Type]],
+         *,
+         max_workers: int = 3,
+         retry_attempts: int = 3,
+         update_timeout_seconds: int = 600,
+         backoff_base: int = 2,
+         backoff_max: Optional[int] = 60,
+         backoff_jitter: float = 0.1,
+         priority_fn: Optional[Callable[[Type], int]] = None,
+         metrics_client: Any = None,
+         debug: bool = False,
+         logger: Optional[logging.Logger] = None,
+         artifact_class_kwargs: Optional[Dict[str, Any]] = None,
+     ) -> None:
+         self.wrapped_classes = wrapped_classes
+         self.max_workers = max_workers
+         self.retry_attempts = retry_attempts
+         self.update_timeout_seconds = update_timeout_seconds
+         self.backoff_base = backoff_base
+         self.backoff_max = backoff_max
+         self.backoff_jitter = backoff_jitter
+         self.priority_fn = priority_fn
+         self.metrics_client = metrics_client

-         today = datetime.datetime.today()
-         self.parquet_start_date = kwargs.get(
-             'parquet_start_date',
-             datetime.date(today.year, 1, 1).strftime('%Y-%m-%d')
-         )
-         self.parquet_end_date = kwargs.get(
-             'parquet_end_date',
-             today.strftime('%Y-%m-%d')
+         self.debug = debug
+         self.logger = logger or Logger.default_logger(
+             logger_name=self.__class__.__name__,
+             log_level=Logger.DEBUG if debug else Logger.INFO
          )

-         # track pending/completed/failed artifacts
-         self.pending = set()
-         self.completed = set()
-         self.failed = set()
-
-         # concurrency primitives
-         self.locks = {}
-         self.locks_lock = asyncio.Lock()
-         self.worker_heartbeat = defaultdict(float)
-         self.workers_lock = asyncio.Lock()
-
-         # dynamic scaling config
-         self.min_workers = kwargs.get('min_workers', 1)
-         self.max_workers = kwargs.get('max_workers', 3)
-         self.memory_per_worker_gb = kwargs.get('memory_per_worker_gb', 1)
-         self.monitor_interval = kwargs.get('monitor_interval', 10)
-         self.retry_attempts = kwargs.get('retry_attempts', 3)
-         self.update_timeout_seconds = kwargs.get('update_timeout_seconds', 600)
-         self.lock_acquire_timeout_seconds = kwargs.get('lock_acquire_timeout_seconds', 10)
-
-     async def get_lock_for_artifact(self, artifact):
-         key = artifact.__class__.__name__
-         async with self.locks_lock:
-             if key not in self.locks:
-                 self.locks[key] = asyncio.Lock()
-             return self.locks[key]
-
-     def get_artifacts(self, data_type):
+         # Default artifact init kwargs
+         today = datetime.datetime.today() + datetime.timedelta(days=1)
+         default_kwargs = {
+             'parquet_start_date': today.strftime('%Y-%m-%d'),
+             'parquet_end_date': today.strftime('%Y-%m-%d'),
+             'logger': self.logger,
+             'debug': self.debug,
+         }
+         self.artifact_class_kwargs = artifact_class_kwargs or default_kwargs.copy()
+
+         # State
+         self.completion_times: Dict[str, float] = {}
+         self.failed: List[str] = []
+         self.original_classes: List[Type] = []
+
+     def get_artifact_classes(self, data_type: str) -> List[Type]:
+         """
+         Retrieve artifact classes by data type.
+         """
+         self.logger.info(f"Fetching artifact classes for '{data_type}'")
          if data_type not in self.wrapped_classes:
              raise ValueError(f"Unsupported data type: {data_type}")
-         artifacts = [cls(
-             parquet_start_date=self.parquet_start_date,
-             parquet_end_date=self.parquet_end_date,
-             logger=self.logger,
-             debug=self.debug
-         ) for cls in self.wrapped_classes[data_type]]
-         # seed pending set and clear others
-         self.pending = set(artifacts)
-         self.completed.clear()
-         self.failed.clear()
-         return artifacts
-
-     def estimate_complexity(self, artifact):
-         try:
-             return artifact.get_size_estimate()
-         except Exception:
-             return 1
-
-     def prioritize_tasks(self, artifacts):
-         queue = asyncio.PriorityQueue()
-         for art in artifacts:
-             queue.put_nowait(PrioritizedItem(self.estimate_complexity(art), art))
-         return queue
-
-     async def resource_monitor(self, queue, workers):
-         while not queue.empty():
+         classes = self.wrapped_classes[data_type]
+         self.logger.info(f"Found {len(classes)} artifact classes for '{data_type}'")
+         return classes
+
+     def estimate_priority(self, artifact_cls: Type) -> int:
+         """
+         Determine task priority for ordering. Lower values run first.
+         """
+         name = artifact_cls.__name__
+         if self.priority_fn:
              try:
-                 avail = psutil.virtual_memory().available
-                 max_by_mem = avail // (self.memory_per_worker_gb * 2**30)
-                 optimal = max(self.min_workers,
-                               min(psutil.cpu_count(), max_by_mem, self.max_workers))
-                 async with self.workers_lock:
-                     current = len(workers)
-                     if optimal > current:
-                         for _ in range(optimal - current):
-                             wid = len(workers)
-                             workers.append(asyncio.create_task(self.worker(queue, wid)))
-                             self.logger.info(f"Added worker {wid}")
-                     elif optimal < current:
-                         for _ in range(current - optimal):
-                             w = workers.pop()
-                             w.cancel()
-                         self.logger.info("Removed a worker")
-                 await asyncio.sleep(self.monitor_interval)
-             except asyncio.CancelledError:
-                 break
+                 pr = self.priority_fn(artifact_cls)
+                 self.logger.debug(f"priority_fn for {name}: {pr}")
+                 return pr
              except Exception as e:
-                 self.logger.error(f"Monitor error: {e}")
-                 await asyncio.sleep(self.monitor_interval)
-
-     @asynccontextmanager
-     async def artifact_lock(self, artifact):
-         lock = await self.get_lock_for_artifact(artifact)
+                 self.logger.warning(f"priority_fn error for {name}: {e}")
          try:
-             await asyncio.wait_for(lock.acquire(), timeout=self.lock_acquire_timeout_seconds)
-             yield
-         finally:
-             if lock.locked():
-                 lock.release()
+             fs = self.artifact_class_kwargs.get('fs')
+             path = self.artifact_class_kwargs.get('parquet_storage_path')
+             pr = 1
+             if hasattr(artifact_cls, 'get_size_estimate'):
+                 pr = artifact_cls.get_size_estimate(fs, path)
+             self.logger.debug(f"Estimated priority for {name}: {pr}")
+             return pr
+         except Exception:
+             return 1

-     async def async_update_artifact(self, artifact, **kwargs):
-         for attempt in range(1, self.retry_attempts + 1):
-             lock = await self.get_lock_for_artifact(artifact)
+     async def _bounded_update(self, artifact_cls: Type, sem: asyncio.Semaphore, **update_kwargs) -> None:
+         """
+         Wrap update_artifact in a semaphore slot to limit concurrency.
+         """
+         async with sem:
+             name = artifact_cls.__name__
+             start = asyncio.get_event_loop().time()
+             self.logger.info(f"Starting update for {name}")
              try:
-                 await asyncio.wait_for(lock.acquire(), timeout=self.lock_acquire_timeout_seconds)
-                 try:
-                     self.logger.info(f"Updating {artifact.__class__.__name__} (attempt {attempt})")
-                     await asyncio.wait_for(
-                         asyncio.to_thread(artifact.update_parquet, **kwargs),
-                         timeout=self.update_timeout_seconds
-                     )
-                     # mark success
-                     async with self.workers_lock:
-                         self.pending.discard(artifact)
-                         self.completed.add(artifact)
-                     self.logger.info(
-                         f"✅ {artifact.__class__.__name__} done — "
-                         f"{len(self.completed)}/{len(self.completed) + len(self.pending) + len(self.failed)} completed, "
-                         f"{len(self.failed)} failed"
-                     )
-                     return
-                 finally:
-                     if lock.locked():
-                         lock.release()
-             except asyncio.TimeoutError:
-                 self.logger.warning(f"Timeout on {artifact.__class__.__name__}, attempt {attempt}")
-             except Exception as e:
-                 self.logger.error(f"Error on {artifact}: {e}")
-             finally:
-                 if lock.locked():
-                     lock.release()
-             await asyncio.sleep(2 ** (attempt - 1))
+                 for attempt in range(1, self.retry_attempts + 1):
+                     try:
+                         artifact = await asyncio.to_thread(
+                             artifact_cls, **self.artifact_class_kwargs
+                         )
+                         await asyncio.wait_for(
+                             asyncio.to_thread(
+                                 artifact.update_parquet, **update_kwargs
+                             ),
+                             timeout=self.update_timeout_seconds
+                         )
+                         duration = asyncio.get_event_loop().time() - start
+                         self.completion_times[name] = duration
+                         self.logger.info(f"{name} updated in {duration:.2f}s (attempt {attempt})")
+                         if self.metrics_client:
+                             self.metrics_client.increment('task_succeeded')
+                         return
+                     except asyncio.TimeoutError:
+                         self.logger.warning(f"Timeout on {name}, attempt {attempt}")
+                     except Exception as e:
+                         self.logger.error(f"Error on {name} attempt {attempt}: {e}")
+
+                     delay = min(self.backoff_base ** (attempt - 1), self.backoff_max)
+                     delay *= 1 + random.uniform(0, self.backoff_jitter)
+                     self.logger.info(f"Sleeping {delay:.1f}s before retrying {name}")
+                     await asyncio.sleep(delay)

-         # all retries exhausted -> mark failure
-         async with self.workers_lock:
-             self.pending.discard(artifact)
-             self.failed.add(artifact)
-         self.logger.error(f"✖️ Permanently failed {artifact.__class__.__name__}")
-
-     async def worker(self, queue, worker_id, **kwargs):
-         while True:
-             try:
-                 item = await queue.get()
-                 art = item.artifact
-                 self.worker_heartbeat[worker_id] = time.time()
-                 await self.async_update_artifact(art, **kwargs)
              except asyncio.CancelledError:
-                 self.logger.info(f"Worker {worker_id} stopped")
-                 break
-             finally:
-                 queue.task_done()
-
-     def calculate_initial_workers(self, count: int) -> int:
-         avail = psutil.virtual_memory().available
-         max_by_mem = avail // (self.memory_per_worker_gb * 2**30)
-         return max(self.min_workers,
-                    min(psutil.cpu_count(), max_by_mem, count, self.max_workers))
-
-     async def update_data(self, data_type, **kwargs):
-         self.logger.info(f"Starting update for {data_type}")
-         artifacts = self.get_artifacts(data_type)
-         queue = self.prioritize_tasks(artifacts)
-         init = self.calculate_initial_workers(len(artifacts))
-         tasks = [asyncio.create_task(self.worker(queue, i, **kwargs)) for i in range(init)]
-         monitor = asyncio.create_task(self.resource_monitor(queue, tasks))
-         await queue.join()
-         monitor.cancel()
-         for t in tasks:
-             t.cancel()
-         await asyncio.gather(*tasks, return_exceptions=True)
-         self.logger.info(self.format_results_table())
-         self.logger.info("All artifacts processed.")
-
-     def format_results_table(self):
-         results = self.get_update_status()
-         headers = ["Metric", "Value"]
-         rows = [
-             ["Total", results['total']],
-             ["Completed", results['completed']],
-             ["Pending", results['pending']],
-             ["Failed", results['failed']],
-             ["Pending Items", len(results['pending_items'])],
-             ["Failed Items", len(results['failed_items'])]
-         ]
-
-         # Find max lengths for alignment
-         max_metric = max(len(str(row[0])) for row in rows)
-         max_value = max(len(str(row[1])) for row in rows)
+                 self.logger.warning(f"{name} update cancelled")
+                 raise
+
+             # permanent failure
+             self.logger.error(f"✖️ {name} permanently failed after {self.retry_attempts} attempts")
+             if self.metrics_client:
+                 self.metrics_client.increment('task_failed')
+             self.failed.append(name)
+
+     async def update_data(self, data_type: str, **kwargs: Any) -> None:
+         """
+         Entry point to update all artifacts of a given type concurrently.
+         """
+         self.logger.info(f"Starting update_data for '{data_type}' with kwargs={kwargs}")
+
+         # RESET STATE
+         self.completion_times.clear()
+         self.failed.clear()
+         self.original_classes = self.get_artifact_classes(data_type)

-         format_str = "{:<%d} {:>%d}" % (max_metric, max_value)
+         # NON-DESTRUCTIVE SORTING
+         ordered = sorted(self.original_classes, key=self.estimate_priority)

-         table = [
-             "\n",
-             format_str.format(*headers),
-             "-" * (max_metric + max_value + 2)
+         sem = asyncio.Semaphore(self.max_workers)
+         tasks = [
+             asyncio.create_task(self._bounded_update(cls, sem, **kwargs))
+             for cls in ordered
          ]

-         for row in rows:
-             table.append(format_str.format(row[0], row[1]))
-
-         return "\n".join(table)
+         try:
+             for coro in asyncio.as_completed(tasks):
+                 await coro
+         except asyncio.CancelledError:
+             self.logger.warning("update_data was cancelled—aborting remaining retries")
+             for t in tasks:
+                 t.cancel()
+             raise
+         finally:
+             total = len(self.original_classes)
+             completed = len(self.completion_times)
+             failed = len(self.failed)
+             self.logger.info(f"All artifacts processed: total={total}, completed={completed}, failed={failed}")
+
+     def get_update_status(self) -> Dict[str, Any]:
+         """
+         Returns summary status including completion times.
+         """
+         total = len(self.original_classes)
+         completed = set(self.completion_times.keys())
+         failed = set(self.failed)
+         pending = {cls.__name__ for cls in self.original_classes} - completed - failed

-     def get_update_status(self):
-         total = len(self.pending) + len(self.completed) + len(self.failed)
          return {
-             "total": total,
-             "completed": len(self.completed),
-             "pending": len(self.pending),
-             "failed": len(self.failed),
-             "pending_items": [a.__class__.__name__ for a in self.pending],
-             "failed_items": [a.__class__.__name__ for a in self.failed]
+             'total': total,
+             'completed': list(completed),
+             'failed': list(failed),
+             'pending': list(pending),
+             'completion_times': self.completion_times,
          }

- # Top-level driver
- # environment = None # fill this in with your wrapped_classes dict
- #
- # async def main():
- #     wrapper = ArtifactUpdaterMultiWrapper(
- #         wrapped_classes=environment,
- #         debug=True
- #     )
- #     loop = asyncio.get_running_loop()
- #     for sig in (signal.SIGINT, signal.SIGTERM):
- #         loop.add_signal_handler(sig, lambda: asyncio.create_task(wrapper.shutdown()))
- #     await wrapper.update_data("your_data_type")
- #
- # if __name__ == "__main__":
- #     asyncio.run(main())
-
+     @staticmethod
+     def format_status_table(status: Dict[str, Any]) -> str:
+         """
+         Formats the status dict into a readable table.
+         """
+         lines = [
+             f"Total: {status['total']}",
+             f"Completed: {len(status['completed'])} {status['completed']}",
+             f"Failed: {len(status['failed'])} {status['failed']}",
+             f"Pending: {len(status['pending'])} {status['pending']}",
+             "",
+             "Per-artifact timings:"
+         ]
+         for name, dur in status['completion_times'].items():
+             lines.append(f" {name}: {dur:.2f}s")
+         return "\n".join(lines)
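Note that the commented-out top-level driver removed above has no counterpart in the new module. A minimal driver against the 0.3.59 API might look like the sketch below; DataArtifactA and DataArtifactB are hypothetical placeholders for concrete artifact classes that expose update_parquet(), while update_data, get_update_status, and format_status_table are the methods introduced in this hunk.

# Hypothetical driver for the new API; artifact class names are placeholders.
import asyncio

async def main():
    wrapper = ArtifactUpdaterMultiWrapper(
        wrapped_classes={'mydata': [DataArtifactA, DataArtifactB]},
        max_workers=4,
        retry_attempts=3,
    )
    # Extra kwargs are forwarded to each artifact's update_parquet().
    await wrapper.update_data('mydata', period='ytd', overwrite=True)
    print(ArtifactUpdaterMultiWrapper.format_status_table(wrapper.get_update_status()))

if __name__ == '__main__':
    asyncio.run(main())

Unlike the removed driver, there is no shutdown() hook here: cancelling the running task propagates through update_data's CancelledError handler, which cancels the remaining per-artifact tasks and re-raises.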
@@ -46,8 +46,7 @@ class DfHelper:

      :ivar df: The DataFrame currently being processed or loaded.
      :type df: Union[dd.DataFrame, pd.DataFrame]
-     :ivar backend_django: Configuration for interacting with Django database backends.
-     :type backend_connection: Optional[DjangoConnectionConfig]
+     :type backend_connection: Optional[DjangoConnectionConfig | SqlAlchemyConnectionConfig]
      :ivar _backend_query: Internal configuration for query handling.
      :type _backend_query: Optional[QueryConfig]
      :ivar _backend_params: Internal parameters configuration for DataFrame handling.
@@ -81,9 +80,10 @@ class DfHelper:
          self.debug = kwargs.setdefault("debug", False)
          self.logger = kwargs.get("logger", Logger.default_logger(logger_name=self.__class__.__name__))
          # Configure logger level
-         self.logger.set_level(logging.DEBUG if self.debug else logging.INFO)
+         self.logger.set_level(Logger.DEBUG if self.debug else Logger.INFO)
          self.logger.debug("Logger initialized in DEBUG mode.")
          self.parquet_storage_path = kwargs.setdefault("parquet_storage_path", None)
+         self.parquet_filename = kwargs.setdefault("parquet_filename", None)
          self.dt_field = kwargs.setdefault("dt_field", None)
          self.as_pandas = kwargs.setdefault("as_pandas", False)
          self.filesystem = kwargs.pop('filesystem', 'file')
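The new parquet_filename option pairs with the save_to_parquet change in the following hunk: a filename fixed at construction becomes the fallback when save_to_parquet is called without one. A rough sketch of the resulting call pattern, assuming a concrete DfHelper subclass (MyHelper is hypothetical) and that save_to_parquet's filename parameter defaults to None:

# Sketch only: MyHelper stands in for a concrete DfHelper subclass.
helper = MyHelper(
    parquet_storage_path='s3://bucket/data',
    parquet_filename='daily_snapshot.parquet',  # new kwarg in 0.3.59
)
# ... populate helper.df through the helper's usual load path ...
helper.save_to_parquet()  # no filename passed: falls back to self.parquet_filename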
@@ -429,7 +429,11 @@ class DfHelper:
          the instance's attribute for storage path.
          :return: None
          """
+         if self.df.map_partitions(len).compute().sum() == 0:
+             self.logger.debug("Cannot save to parquet since DataFrame is empty")
+             return
          fs = kwargs.pop('fs', self.fs)
+         parquet_filename = parquet_filename or self.parquet_filename
          parquet_storage_path = kwargs.pop('parquet_storage_path', self.parquet_storage_path)
          ps = ParquetSaver(df_result=self.df, parquet_storage_path=parquet_storage_path, logger=self.logger, fs=fs)
          ps.save_to_parquet(parquet_filename)
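The new emptiness guard works by computing the length of every partition. A standalone illustration of the same check, using only public dask.dataframe APIs:

import pandas as pd
import dask.dataframe as dd

# Build an empty single-partition Dask DataFrame.
ddf = dd.from_pandas(pd.DataFrame({'a': pd.Series(dtype='float64')}), npartitions=1)

# map_partitions(len) yields one row count per partition; summing the
# computed counts gives the total row count for the whole DataFrame.
total_rows = ddf.map_partitions(len).compute().sum()
print(total_rows == 0)  # True, so save_to_parquet would return early

Note that this check executes the full task graph once before saving, which is the price of an exact row count.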