sibi-dst 2025.1.13__py3-none-any.whl → 2025.8.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. sibi_dst/__init__.py +7 -1
  2. sibi_dst/df_helper/__init__.py +3 -2
  3. sibi_dst/df_helper/_artifact_updater_async.py +238 -0
  4. sibi_dst/df_helper/_artifact_updater_threaded.py +195 -0
  5. sibi_dst/df_helper/_df_helper.py +418 -118
  6. sibi_dst/df_helper/_parquet_artifact.py +275 -283
  7. sibi_dst/df_helper/_parquet_reader.py +9 -10
  8. sibi_dst/df_helper/backends/parquet/_parquet_options.py +8 -4
  9. sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +68 -107
  10. sibi_dst/df_helper/backends/sqlalchemy/_db_gatekeeper.py +15 -0
  11. sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +105 -255
  12. sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +90 -42
  13. sibi_dst/df_helper/backends/sqlalchemy/_model_registry.py +192 -0
  14. sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +122 -72
  15. sibi_dst/osmnx_helper/route_path_builder.py +45 -46
  16. sibi_dst/utils/__init__.py +2 -0
  17. sibi_dst/utils/base.py +235 -100
  18. sibi_dst/utils/business_days.py +248 -0
  19. sibi_dst/utils/clickhouse_writer.py +472 -206
  20. sibi_dst/utils/data_utils.py +139 -186
  21. sibi_dst/utils/data_wrapper.py +392 -88
  22. sibi_dst/utils/date_utils.py +711 -393
  23. sibi_dst/utils/df_utils.py +193 -213
  24. sibi_dst/utils/file_age_checker.py +301 -0
  25. sibi_dst/utils/file_utils.py +3 -2
  26. sibi_dst/utils/filepath_generator.py +314 -152
  27. sibi_dst/utils/log_utils.py +581 -242
  28. sibi_dst/utils/manifest_manager.py +60 -76
  29. sibi_dst/utils/parquet_saver.py +33 -27
  30. sibi_dst/utils/periods.py +42 -0
  31. sibi_dst/utils/phone_formatter.py +88 -95
  32. sibi_dst/utils/update_planner.py +180 -178
  33. sibi_dst/utils/webdav_client.py +116 -166
  34. {sibi_dst-2025.1.13.dist-info → sibi_dst-2025.8.2.dist-info}/METADATA +1 -1
  35. {sibi_dst-2025.1.13.dist-info → sibi_dst-2025.8.2.dist-info}/RECORD +36 -30
  36. sibi_dst/df_helper/_artifact_updater_multi_wrapper.py +0 -422
  37. {sibi_dst-2025.1.13.dist-info → sibi_dst-2025.8.2.dist-info}/WHEEL +0 -0
@@ -1,422 +0,0 @@
- import time
- from concurrent.futures import ThreadPoolExecutor, as_completed
- from typing import Any, Callable, Dict, List, Optional, Type
-
- from sibi_dst.utils import ManagedResource
-
- class ArtifactUpdaterMultiWrapperThreaded(ManagedResource):
-     """
-     Updates artifacts concurrently using a ThreadPoolExecutor.
-
-     This version is refactored for a pure multi-threaded environment, aligning
-     the orchestration model with the underlying threaded workers (DataWrapper).
-     """
-     wrapped_classes: Dict[str, List[Type]]
-     def __init__(
-         self,
-         wrapped_classes: Dict[str, List[Type]],
-         *,
-         max_workers: int = 4,
-         retry_attempts: int = 3,
-         backoff_base: int = 2,
-         backoff_max: int = 60,
-         backoff_jitter: float = 0.1,
-         priority_fn: Optional[Callable[[Type], int]] = None,
-         artifact_class_kwargs: Optional[Dict[str, Any]] = None,
-         **kwargs: Dict[str, Any]
-     ) -> None:
-         super().__init__(**kwargs)
-         self.wrapped_classes = wrapped_classes
-         self.max_workers = max_workers
-         self.retry_attempts = retry_attempts
-         self.backoff_base = backoff_base
-         self.backoff_max = backoff_max
-         self.backoff_jitter = backoff_jitter
-         self.priority_fn = priority_fn
-         # Default artifact init kwargs
-         today = datetime.datetime.today() + datetime.timedelta(days=1)
-         default_kwargs = {
-             'parquet_start_date': today.strftime('%Y-%m-%d'),
-             'parquet_end_date': today.strftime('%Y-%m-%d'),
-             'logger': self.logger,
-             'debug': self.debug,
-             'fs': self.fs,
-             'verbose': self.verbose,
-         }
-         self.artifact_class_kwargs = artifact_class_kwargs or default_kwargs.copy()
-
-         # State tracking
-         self.completion_times: Dict[str, float] = {}
-         self.failed: List[str] = []
-         self.original_classes: List[Type] = []
-         self.logger.info("ArtifactUpdaterMultiWrapperThreaded initialized")
-
-     def get_artifact_classes(self, data_type: str) -> List[Type]:
-         """Retrieve artifact classes by data type."""
-         self.logger.info(f"Fetching artifact classes for '{data_type}'")
-         classes = self.wrapped_classes.get(data_type)
-         if not classes:
-             raise ValueError(f"Unsupported data type: {data_type}")
-         self.logger.info(f"Found {len(classes)} artifact classes for '{data_type}'")
-         return classes
-
-     def estimate_priority(self, artifact_cls: Type) -> int:
-         """
-         Determines task priority. Lower values run first.
-         Note: This is a blocking call and will run sequentially before updates start.
-         """
-         name = artifact_cls.__name__
-         # Custom priority function takes precedence
-         if self.priority_fn:
-             try:
-                 return self.priority_fn(artifact_cls)
-             except Exception as e:
-                 self.logger.warning(f"priority_fn error for {name}: {e}")
-
-         # # Fallback to size estimate if available
-         # if hasattr(artifact_cls, 'get_size_estimate'):
-         #     try:
-         #         # This performs blocking I/O
-         #         return artifact_cls(**self.artifact_class_kwargs).get_size_estimate()
-         #
-         #     except Exception as e:
-         #         self.logger.warning(f"get_size_estimate failed for {name}: {e}")
-
-         # Default priority
-         return 999
-
-     def _update_artifact_with_retry(self, artifact_cls: Type, update_kwargs: Dict[str, Any]) -> str:
-         """
-         A blocking worker function that handles instantiation, update, and retries for a single artifact.
-         This function is designed to be run in a ThreadPoolExecutor.
-         """
-         name = artifact_cls.__name__
-         self.logger.debug(f"Worker thread starting update for {name}")
-
-         for attempt in range(1, self.retry_attempts + 1):
-             try:
-                 # Instantiate and update directly within the worker thread
-                 artifact_instance = artifact_cls(**self.artifact_class_kwargs)
-                 artifact_instance.update_parquet(**update_kwargs)
-
-                 self.logger.info(f"✅ {name} updated successfully on attempt {attempt}")
-                 return name  # Return the name on success
-
-             except Exception as e:
-                 self.logger.error(f"Error on {name} attempt {attempt}/{self.retry_attempts}: {e}", exc_info=self.debug)
-                 if attempt < self.retry_attempts:
-                     delay = min(self.backoff_base ** (attempt - 1), self.backoff_max)
-                     delay *= 1 + random.uniform(0, self.backoff_jitter)
-                     self.logger.info(f"Sleeping {delay:.1f}s before retrying {name}")
-                     time.sleep(delay)
-
-         # If all retries fail, raise an exception to be caught by the main loop
-         raise RuntimeError(f"{name} failed after {self.retry_attempts} attempts.")
-
-     async def update_data(self, data_type: str, **kwargs: Any) -> None:
-         """
-         Entry point to update all artifacts of a given type using a ThreadPoolExecutor.
-         """
-         self.logger.debug(f"Starting multi-threaded update for '{data_type}' with kwargs={kwargs}")
-
-         # Reset state for this run
-         self.completion_times.clear()
-         self.failed.clear()
-         self.original_classes = self.get_artifact_classes(data_type)
-
-         # Sequentially estimate priorities and sort classes before execution
-         self.logger.debug("Estimating priorities to order tasks...")
-         ordered_classes = sorted(self.original_classes, key=self.estimate_priority)
-         self.logger.debug("Priority estimation complete. Submitting tasks to thread pool.")
-
-         start_time = time.monotonic()
-
-         with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
-             future_to_class_name = {
-                 executor.submit(self._update_artifact_with_retry, cls, kwargs): cls.__name__
-                 for cls in ordered_classes
-             }
-
-             for future in as_completed(future_to_class_name):
-                 name = future_to_class_name[future]
-                 try:
-                     # result() will re-raise the exception from the worker if one occurred
-                     future.result()
-                     # If no exception, the task succeeded
-                     self.completion_times[name] = time.monotonic() - start_time
-                 except Exception as e:
-                     self.logger.error(f"✖️ {name} permanently failed. See error log above.")
-                     self.failed.append(name)
-
-         # Log final status
-         total = len(self.original_classes)
-         completed = len(self.completion_times)
-         failed_count = len(self.failed)
-         self.logger.info(f"All artifacts processed: total={total}, completed={completed}, failed={failed_count}")
-
-     def get_update_status(self) -> Dict[str, Any]:
-         """Returns a summary status including completion times."""
-         completed_set = set(self.completion_times.keys())
-         failed_set = set(self.failed)
-         pending_set = {cls.__name__ for cls in self.original_classes} - completed_set - failed_set
-
-         return {
-             'total': len(self.original_classes),
-             'completed': list(completed_set),
-             'failed': list(failed_set),
-             'pending': list(pending_set),
-             'completion_times': self.completion_times,
-         }
-
-     @staticmethod
-     def format_status_table(status: Dict[str, Any]) -> str:
-         """Formats the status dictionary into a readable table."""
-         lines = [
-             f"Total: {status['total']}",
-             f"Completed: {len(status['completed'])}",
-             f"Failed: {len(status['failed'])}",
-             f"Pending: {len(status['pending'])}",
-             "\nPer-artifact completion times (seconds):"
-         ]
-         sorted_times = sorted(status['completion_times'].items(), key=lambda item: item[1], reverse=True)
-         for name, duration in sorted_times:
-             lines.append(f" - {name:<30}: {duration:.2f}s")
-         if status['failed']:
-             lines.append("\nFailed artifacts:")
-             for name in status['failed']:
-                 lines.append(f" - {name}")
-         return "\n".join(lines)
-
-
- import asyncio
- import datetime
- import random
- from typing import Any, Callable, Dict, List, Optional, Type
-
- class ArtifactUpdaterMultiWrapperAsync(ManagedResource):
-     """
-     Simplified wrapper that updates artifacts concurrently using an asyncio.Semaphore.
-
-     Features:
-       - Caps concurrency at max_workers via semaphore
-       - Optionally prioritises tasks via a priority function or static method on artifact classes
-       - Tracks per-artifact completion times
-       - Configurable retry/backoff strategy
-       - Optional metrics integration
-       - Thread-safe within a single asyncio loop
-
-     Usage:
-         wrapper = ArtifactUpdaterMultiWrapper(
-             wrapped_classes={
-                 'mydata': [DataArtifactA, DataArtifactB],
-             },
-             max_workers=4,
-             retry_attempts=3,
-             update_timeout_seconds=600,
-             backoff_base=2,
-             backoff_max=60,
-             backoff_jitter=0.1,
-             priority_fn=None,  # or custom
-             metrics_client=None,
-             debug=True,
-             logger=None,
-             artifact_class_kwargs={
-                 'fs': my_fs,
-                 'parquet_storage_path': 's3://bucket/data',
-                 'logger': my_logger,
-                 'debug': True,
-             }
-         )
-         await wrapper.update_data('mydata', period='ytd', overwrite=True)
-     """
-     def __init__(
-         self,
-         wrapped_classes: Dict[str, List[Type]],
-         *,
-         max_workers: int = 3,
-         retry_attempts: int = 3,
-         update_timeout_seconds: int = 600,
-         backoff_base: int = 2,
-         backoff_max: Optional[int] = 60,
-         backoff_jitter: float = 0.1,
-         priority_fn: Optional[Callable[[Type], int]] = None,
-         metrics_client: Any = None,
-         artifact_class_kwargs: Optional[Dict[str, Any]] = None,
-         **kwargs: Dict[str, Any]
-     ) -> None:
-         super().__init__(**kwargs)
-         self.wrapped_classes = wrapped_classes
-         self.max_workers = max_workers
-         self.retry_attempts = retry_attempts
-         self.update_timeout_seconds = update_timeout_seconds
-         self.backoff_base = backoff_base
-         self.backoff_max = backoff_max
-         self.backoff_jitter = backoff_jitter
-         self.priority_fn = priority_fn
-         self.metrics_client = metrics_client
-
-         # Default artifact init kwargs
-         today = datetime.datetime.today() + datetime.timedelta(days=1)
-         default_kwargs = {
-             'parquet_start_date': today.strftime('%Y-%m-%d'),
-             'parquet_end_date': today.strftime('%Y-%m-%d'),
-             'logger': self.logger,
-             'debug': self.debug,
-             'fs': self.fs,
-             'verbose': self.verbose,
-         }
-         self.artifact_class_kwargs = artifact_class_kwargs or default_kwargs.copy()
-
-         # State
-         self.completion_times: Dict[str, float] = {}
-         self.failed: List[str] = []
-         self.original_classes: List[Type] = []
-         self.logger.info("ArtifactUpdaterMultiWrapperAsync initialized")
-
-     def get_artifact_classes(self, data_type: str) -> List[Type]:
-         """
-         Retrieve artifact classes by data type.
-         """
-         self.logger.info(f"Fetching artifact classes for '{data_type}'")
-         if data_type not in self.wrapped_classes:
-             raise ValueError(f"Unsupported data type: {data_type}")
-         classes = self.wrapped_classes[data_type]
-         self.logger.info(f"Found {len(classes)} artifact classes for '{data_type}'")
-         return classes
-
-     def estimate_priority(self, artifact_cls: Type) -> int:
-         """
-         Determine task priority for ordering. Lower values run first.
-         """
-         name = artifact_cls.__name__
-         if self.priority_fn:
-             try:
-                 pr = self.priority_fn(artifact_cls)
-                 self.logger.debug(f"priority_fn for {name}: {pr}")
-                 return pr
-             except Exception as e:
-                 self.logger.warning(f"priority_fn error for {name}: {e}")
-         try:
-             fs = self.artifact_class_kwargs.get('fs')
-             path = self.artifact_class_kwargs.get('parquet_storage_path')
-             pr = 1
-             if hasattr(artifact_cls, 'get_size_estimate'):
-                 pr = artifact_cls.get_size_estimate(fs, path)
-             self.logger.debug(f"Estimated priority for {name}: {pr}")
-             return pr
-         except Exception:
-             return 1
-
-     async def _bounded_update(self, artifact_cls: Type, sem: asyncio.Semaphore, **update_kwargs) -> None:
-         """
-         Wrap update_artifact in a semaphore slot to limit concurrency.
-         """
-         async with sem:
-             name = artifact_cls.__name__
-             start = asyncio.get_event_loop().time()
-             self.logger.info(f"Starting update for {name}")
-             try:
-                 for attempt in range(1, self.retry_attempts + 1):
-                     try:
-                         artifact = await asyncio.to_thread(
-                             artifact_cls, **self.artifact_class_kwargs
-                         )
-                         await asyncio.wait_for(
-                             asyncio.to_thread(
-                                 artifact.update_parquet, **update_kwargs
-                             ),
-                             timeout=self.update_timeout_seconds
-                         )
-                         duration = asyncio.get_event_loop().time() - start
-                         self.completion_times[name] = duration
-                         self.logger.info(f"✅ {name} updated in {duration:.2f}s (attempt {attempt})")
-                         if self.metrics_client:
-                             self.metrics_client.increment('task_succeeded')
-                         return
-                     except asyncio.TimeoutError:
-                         self.logger.warning(f"Timeout on {name}, attempt {attempt}")
-                     except Exception as e:
-                         self.logger.error(f"Error on {name} attempt {attempt}: {e}")
-
-                     delay = min(self.backoff_base ** (attempt - 1), self.backoff_max)
-                     delay *= 1 + random.uniform(0, self.backoff_jitter)
-                     self.logger.info(f"Sleeping {delay:.1f}s before retrying {name}")
-                     await asyncio.sleep(delay)
-
-             except asyncio.CancelledError:
-                 self.logger.warning(f"{name} update cancelled")
-                 raise
-
-             # permanent failure
-             self.logger.error(f"✖️ {name} permanently failed after {self.retry_attempts} attempts")
-             if self.metrics_client:
-                 self.metrics_client.increment('task_failed')
-             self.failed.append(name)
-
-     async def update_data(self, data_type: str, **kwargs: Any) -> None:
-         """
-         Entry point to update all artifacts of a given type concurrently.
-         """
-         self.logger.info(f"Starting update_data for '{data_type}' with kwargs={kwargs}")
-
-         # RESET STATE
-         self.completion_times.clear()
-         self.failed.clear()
-         self.original_classes = self.get_artifact_classes(data_type)
-
-         # NON-DESTRUCTIVE SORTING
-         ordered = sorted(self.original_classes, key=self.estimate_priority)
-
-         sem = asyncio.Semaphore(self.max_workers)
-         tasks = [
-             asyncio.create_task(self._bounded_update(cls, sem, **kwargs))
-             for cls in ordered
-         ]
-
-         try:
-             for coro in asyncio.as_completed(tasks):
-                 await coro
-         except asyncio.CancelledError:
-             self.logger.warning("update_data was cancelled—aborting remaining retries")
-             for t in tasks:
-                 t.cancel()
-             raise
-         finally:
-             total = len(self.original_classes)
-             completed = len(self.completion_times)
-             failed = len(self.failed)
-             self.logger.info(f"All artifacts processed: total={total}, completed={completed}, failed={failed}")
-
-     def get_update_status(self) -> Dict[str, Any]:
-         """
-         Returns summary status including completion times.
-         """
-         total = len(self.original_classes)
-         completed = set(self.completion_times.keys())
-         failed = set(self.failed)
-         pending = {cls.__name__ for cls in self.original_classes} - completed - failed
-
-         return {
-             'total': total,
-             'completed': list(completed),
-             'failed': list(failed),
-             'pending': list(pending),
-             'completion_times': self.completion_times,
-         }
-
-     @staticmethod
-     def format_status_table(status: Dict[str, Any]) -> str:
-         """
-         Formats the status dict into a readable table.
-         """
-         lines = [
-             f"Total: {status['total']}",
-             f"Completed: {len(status['completed'])} {status['completed']}",
-             f"Failed: {len(status['failed'])} {status['failed']}",
-             f"Pending: {len(status['pending'])} {status['pending']}",
-             "",
-             "Per-artifact timings:"
-         ]
-         for name, dur in status['completion_times'].items():
-             lines.append(f" {name}: {dur:.2f}s")
-         return "\n".join(lines)
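Both removed classes bound concurrency to `max_workers`: the threaded variant through `ThreadPoolExecutor(max_workers=...)`, the async variant through an `asyncio.Semaphore` around `asyncio.to_thread` calls so the blocking `update_parquet` work runs off the event loop. A minimal, self-contained sketch of that semaphore-plus-thread pattern follows; the `work` function is a stand-in for an artifact's `update_parquet` and is not part of sibi-dst.

```python
import asyncio
import time


def work(name: str) -> None:
    # Stand-in for a blocking update_parquet() call.
    time.sleep(0.1)
    print(f"{name} done")


async def bounded(sem: asyncio.Semaphore, name: str) -> None:
    # Wait for a semaphore slot, then run the blocking work in a worker
    # thread so the event loop stays free to schedule other updates.
    async with sem:
        await asyncio.to_thread(work, name)


async def main() -> None:
    sem = asyncio.Semaphore(2)  # at most two "artifacts" update at once
    await asyncio.gather(*(bounded(sem, f"artifact-{i}") for i in range(5)))


asyncio.run(main())
```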
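Both variants also share the same retry delay: exponential in the attempt number, capped at `backoff_max`, with up to `backoff_jitter` of proportional random jitter. A small sketch of that calculation using the defaults from the removed code; the function name here is illustrative only.

```python
import random


def backoff_delay(attempt: int, base: int = 2, cap: int = 60, jitter: float = 0.1) -> float:
    # min(base ** (attempt - 1), cap) grows 1s, 2s, 4s, ... up to the cap;
    # a random factor in [1, 1 + jitter] then spreads retries apart.
    delay = min(base ** (attempt - 1), cap)
    return delay * (1 + random.uniform(0, jitter))


# Attempts 1..3 with the defaults sleep roughly 1s, 2s, 4s plus jitter.
for attempt in range(1, 4):
    print(f"attempt {attempt}: sleep ~{backoff_delay(attempt):.2f}s")
```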