sibi-dst 2025.1.6.tar.gz → 2025.1.8.tar.gz

This diff compares the contents of two publicly released versions of the package as published to a supported public registry. It is provided for informational purposes only and reflects the changes between those published versions.
Files changed (77)
  1. {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/PKG-INFO +1 -1
  2. {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/pyproject.toml +1 -1
  3. {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/df_helper/__init__.py +2 -1
  4. sibi_dst-2025.1.8/sibi_dst/df_helper/_artifact_updater_multi_wrapper.py +420 -0
  5. {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/df_helper/_parquet_reader.py +4 -18
  6. {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/utils/data_wrapper.py +33 -30
  7. {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/utils/parquet_saver.py +0 -4
  8. sibi_dst-2025.1.6/sibi_dst/df_helper/_artifact_updater_multi_wrapper.py +0 -431
  9. {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/README.md +0 -0
  10. {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/__init__.py +0 -0
  11. {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/df_helper/_df_helper.py +0 -0
  12. {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/df_helper/_parquet_artifact.py +0 -0
  13. {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/df_helper/backends/__init__.py +0 -0
  14. {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/df_helper/backends/http/__init__.py +0 -0
  15. {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/df_helper/backends/http/_http_config.py +0 -0
  16. {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/df_helper/backends/parquet/__init__.py +0 -0
  17. {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/df_helper/backends/parquet/_filter_handler.py +0 -0
  18. {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/df_helper/backends/parquet/_parquet_options.py +0 -0
  19. {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/df_helper/backends/sqlalchemy/__init__.py +0 -0
  20. {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +0 -0
  21. {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +0 -0
  22. {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +0 -0
  23. {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +0 -0
  24. {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/df_helper/core/__init__.py +0 -0
  25. {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/df_helper/core/_defaults.py +0 -0
  26. {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/df_helper/core/_filter_handler.py +0 -0
  27. {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/df_helper/core/_params_config.py +0 -0
  28. {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/df_helper/core/_query_config.py +0 -0
  29. {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/df_helper/data_cleaner.py +0 -0
  30. {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/geopy_helper/__init__.py +0 -0
  31. {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/geopy_helper/geo_location_service.py +0 -0
  32. {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/geopy_helper/utils.py +0 -0
  33. {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/osmnx_helper/__init__.py +0 -0
  34. {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/osmnx_helper/base_osm_map.py +0 -0
  35. {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/osmnx_helper/basemaps/__init__.py +0 -0
  36. {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/osmnx_helper/basemaps/calendar_html.py +0 -0
  37. {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/osmnx_helper/basemaps/router_plotter.py +0 -0
  38. {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/osmnx_helper/utils.py +0 -0
  39. {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/tests/__init__.py +0 -0
  40. {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/tests/test_data_wrapper_class.py +0 -0
  41. {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/utils/__init__.py +0 -0
  42. {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/utils/base.py +0 -0
  43. {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/utils/clickhouse_writer.py +0 -0
  44. {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/utils/credentials.py +0 -0
  45. {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/utils/data_from_http_source.py +0 -0
  46. {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/utils/data_utils.py +0 -0
  47. {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/utils/date_utils.py +0 -0
  48. {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/utils/df_utils.py +0 -0
  49. {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/utils/file_utils.py +0 -0
  50. {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/utils/filepath_generator.py +0 -0
  51. {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/utils/log_utils.py +0 -0
  52. {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/utils/manifest_manager.py +0 -0
  53. {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/utils/phone_formatter.py +0 -0
  54. {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/utils/storage_config.py +0 -0
  55. {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/utils/storage_manager.py +0 -0
  56. {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/utils/update_planner.py +0 -0
  57. {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/utils/webdav_client.py +0 -0
  58. {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/v2/__init__.py +0 -0
  59. {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/v2/df_helper/__init__.py +0 -0
  60. {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/v2/df_helper/_df_helper.py +0 -0
  61. {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/v2/df_helper/backends/__init__.py +0 -0
  62. {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/v2/df_helper/backends/sqlalchemy/__init__.py +0 -0
  63. {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/v2/df_helper/backends/sqlalchemy/_db_connection.py +0 -0
  64. {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/v2/df_helper/backends/sqlalchemy/_io_dask.py +0 -0
  65. {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/v2/df_helper/backends/sqlalchemy/_load_from_db.py +0 -0
  66. {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/v2/df_helper/backends/sqlalchemy/_model_builder.py +0 -0
  67. {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/v2/df_helper/backends/sqlmodel/__init__.py +0 -0
  68. {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/v2/df_helper/backends/sqlmodel/_db_connection.py +0 -0
  69. {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/v2/df_helper/backends/sqlmodel/_io_dask.py +0 -0
  70. {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/v2/df_helper/backends/sqlmodel/_load_from_db.py +0 -0
  71. {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/v2/df_helper/backends/sqlmodel/_model_builder.py +0 -0
  72. {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/v2/df_helper/core/__init__.py +0 -0
  73. {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/v2/df_helper/core/_filter_handler.py +0 -0
  74. {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/v2/df_helper/core/_params_config.py +0 -0
  75. {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/v2/df_helper/core/_query_config.py +0 -0
  76. {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/v2/utils/__init__.py +0 -0
  77. {sibi_dst-2025.1.6 → sibi_dst-2025.1.8}/sibi_dst/v2/utils/log_utils.py +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: sibi-dst
- Version: 2025.1.6
+ Version: 2025.1.8
  Summary: Data Science Toolkit
  Author: Luis Valverde
  Author-email: lvalverdeb@gmail.com
pyproject.toml
@@ -1,6 +1,6 @@
  [tool.poetry]
  name = "sibi-dst"
- version = "2025.1.6"
+ version = "2025.1.8"
  description = "Data Science Toolkit"
  authors = ["Luis Valverde <lvalverdeb@gmail.com>"]
  readme = "README.md"
sibi_dst/df_helper/__init__.py
@@ -3,11 +3,12 @@ from __future__ import annotations
  from ._df_helper import DfHelper
  from ._parquet_artifact import ParquetArtifact
  from ._parquet_reader import ParquetReader
- from ._artifact_updater_multi_wrapper import ArtifactUpdaterMultiWrapperThreaded
+ from ._artifact_updater_multi_wrapper import ArtifactUpdaterMultiWrapperThreaded, ArtifactUpdaterMultiWrapperAsync

  __all__ = [
      'DfHelper',
      'ParquetArtifact',
      'ParquetReader',
      'ArtifactUpdaterMultiWrapperThreaded',
+     'ArtifactUpdaterMultiWrapperAsync',
  ]
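Both wrapper classes are now importable from the package's df_helper namespace. A minimal sketch of the new import surface (any real use also needs artifact classes that implement update_parquet):

from sibi_dst.df_helper import (
    ArtifactUpdaterMultiWrapperThreaded,
    ArtifactUpdaterMultiWrapperAsync,
)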
sibi_dst-2025.1.8/sibi_dst/df_helper/_artifact_updater_multi_wrapper.py (new file)
@@ -0,0 +1,420 @@
+ import time
+ from concurrent.futures import ThreadPoolExecutor, as_completed
+ from typing import Any, Callable, Dict, List, Optional, Type
+
+ from sibi_dst.utils import ManagedResource
+
+ class ArtifactUpdaterMultiWrapperThreaded(ManagedResource):
+     """
+     Updates artifacts concurrently using a ThreadPoolExecutor.
+
+     This version is refactored for a pure multi-threaded environment, aligning
+     the orchestration model with the underlying threaded workers (DataWrapper).
+     """
+     wrapped_classes: Dict[str, List[Type]]
+     def __init__(
+         self,
+         wrapped_classes: Dict[str, List[Type]],
+         *,
+         max_workers: int = 4,
+         retry_attempts: int = 3,
+         backoff_base: int = 2,
+         backoff_max: int = 60,
+         backoff_jitter: float = 0.1,
+         priority_fn: Optional[Callable[[Type], int]] = None,
+         artifact_class_kwargs: Optional[Dict[str, Any]] = None,
+         **kwargs: Dict[str, Any]
+     ) -> None:
+         super().__init__(**kwargs)
+         self.wrapped_classes = wrapped_classes
+         self.max_workers = max_workers
+         self.retry_attempts = retry_attempts
+         self.backoff_base = backoff_base
+         self.backoff_max = backoff_max
+         self.backoff_jitter = backoff_jitter
+         self.priority_fn = priority_fn
+         # Default artifact init kwargs
+         today = datetime.datetime.today() + datetime.timedelta(days=1)
+         default_kwargs = {
+             'parquet_start_date': today.strftime('%Y-%m-%d'),
+             'parquet_end_date': today.strftime('%Y-%m-%d'),
+             'logger': self.logger,
+             'debug': self.debug,
+             'fs': self.fs,
+             'verbose': self.verbose,
+         }
+         self.artifact_class_kwargs = artifact_class_kwargs or default_kwargs.copy()
+
+         # State tracking
+         self.completion_times: Dict[str, float] = {}
+         self.failed: List[str] = []
+         self.original_classes: List[Type] = []
+
+     def get_artifact_classes(self, data_type: str) -> List[Type]:
+         """Retrieve artifact classes by data type."""
+         self.logger.info(f"Fetching artifact classes for '{data_type}'")
+         classes = self.wrapped_classes.get(data_type)
+         if not classes:
+             raise ValueError(f"Unsupported data type: {data_type}")
+         self.logger.info(f"Found {len(classes)} artifact classes for '{data_type}'")
+         return classes
+
+     def estimate_priority(self, artifact_cls: Type) -> int:
+         """
+         Determines task priority. Lower values run first.
+         Note: This is a blocking call and will run sequentially before updates start.
+         """
+         name = artifact_cls.__name__
+         # Custom priority function takes precedence
+         if self.priority_fn:
+             try:
+                 return self.priority_fn(artifact_cls)
+             except Exception as e:
+                 self.logger.warning(f"priority_fn error for {name}: {e}")
+
+         # # Fallback to size estimate if available
+         # if hasattr(artifact_cls, 'get_size_estimate'):
+         #     try:
+         #         # This performs blocking I/O
+         #         return artifact_cls(**self.artifact_class_kwargs).get_size_estimate()
+         #
+         #     except Exception as e:
+         #         self.logger.warning(f"get_size_estimate failed for {name}: {e}")
+
+         # Default priority
+         return 999
+
+     def _update_artifact_with_retry(self, artifact_cls: Type, update_kwargs: Dict[str, Any]) -> str:
+         """
+         A blocking worker function that handles instantiation, update, and retries for a single artifact.
+         This function is designed to be run in a ThreadPoolExecutor.
+         """
+         name = artifact_cls.__name__
+         self.logger.debug(f"Worker thread starting update for {name}")
+
+         for attempt in range(1, self.retry_attempts + 1):
+             try:
+                 # Instantiate and update directly within the worker thread
+                 artifact_instance = artifact_cls(**self.artifact_class_kwargs)
+                 artifact_instance.update_parquet(**update_kwargs)
+
+                 self.logger.info(f"✅ {name} updated successfully on attempt {attempt}")
+                 return name  # Return the name on success
+
+             except Exception as e:
+                 self.logger.error(f"Error on {name} attempt {attempt}/{self.retry_attempts}: {e}", exc_info=self.debug)
+                 if attempt < self.retry_attempts:
+                     delay = min(self.backoff_base ** (attempt - 1), self.backoff_max)
+                     delay *= 1 + random.uniform(0, self.backoff_jitter)
+                     self.logger.info(f"Sleeping {delay:.1f}s before retrying {name}")
+                     time.sleep(delay)
+
+         # If all retries fail, raise an exception to be caught by the main loop
+         raise RuntimeError(f"{name} failed after {self.retry_attempts} attempts.")
+
+     async def update_data(self, data_type: str, **kwargs: Any) -> None:
+         """
+         Entry point to update all artifacts of a given type using a ThreadPoolExecutor.
+         """
+         self.logger.debug(f"Starting multi-threaded update for '{data_type}' with kwargs={kwargs}")
+
+         # Reset state for this run
+         self.completion_times.clear()
+         self.failed.clear()
+         self.original_classes = self.get_artifact_classes(data_type)
+
+         # Sequentially estimate priorities and sort classes before execution
+         self.logger.debug("Estimating priorities to order tasks...")
+         ordered_classes = sorted(self.original_classes, key=self.estimate_priority)
+         self.logger.debug("Priority estimation complete. Submitting tasks to thread pool.")
+
+         start_time = time.monotonic()
+
+         with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
+             future_to_class_name = {
+                 executor.submit(self._update_artifact_with_retry, cls, kwargs): cls.__name__
+                 for cls in ordered_classes
+             }
+
+             for future in as_completed(future_to_class_name):
+                 name = future_to_class_name[future]
+                 try:
+                     # result() will re-raise the exception from the worker if one occurred
+                     future.result()
+                     # If no exception, the task succeeded
+                     self.completion_times[name] = time.monotonic() - start_time
+                 except Exception as e:
+                     self.logger.error(f"✖️ {name} permanently failed. See error log above.")
+                     self.failed.append(name)
+
+         # Log final status
+         total = len(self.original_classes)
+         completed = len(self.completion_times)
+         failed_count = len(self.failed)
+         self.logger.info(f"All artifacts processed: total={total}, completed={completed}, failed={failed_count}")
+
+     def get_update_status(self) -> Dict[str, Any]:
+         """Returns a summary status including completion times."""
+         completed_set = set(self.completion_times.keys())
+         failed_set = set(self.failed)
+         pending_set = {cls.__name__ for cls in self.original_classes} - completed_set - failed_set
+
+         return {
+             'total': len(self.original_classes),
+             'completed': list(completed_set),
+             'failed': list(failed_set),
+             'pending': list(pending_set),
+             'completion_times': self.completion_times,
+         }
+
+     @staticmethod
+     def format_status_table(status: Dict[str, Any]) -> str:
+         """Formats the status dictionary into a readable table."""
+         lines = [
+             f"Total: {status['total']}",
+             f"Completed: {len(status['completed'])}",
+             f"Failed: {len(status['failed'])}",
+             f"Pending: {len(status['pending'])}",
+             "\nPer-artifact completion times (seconds):"
+         ]
+         sorted_times = sorted(status['completion_times'].items(), key=lambda item: item[1], reverse=True)
+         for name, duration in sorted_times:
+             lines.append(f" - {name:<30}: {duration:.2f}s")
+         if status['failed']:
+             lines.append("\nFailed artifacts:")
+             for name in status['failed']:
+                 lines.append(f" - {name}")
+         return "\n".join(lines)
+
+
+ import asyncio
+ import datetime
+ import random
+ from typing import Any, Callable, Dict, List, Optional, Type
+
+ class ArtifactUpdaterMultiWrapperAsync(ManagedResource):
+     """
+     Simplified wrapper that updates artifacts concurrently using an asyncio.Semaphore.
+
+     Features:
+       - Caps concurrency at max_workers via semaphore
+       - Optionally prioritises tasks via a priority function or static method on artifact classes
+       - Tracks per-artifact completion times
+       - Configurable retry/backoff strategy
+       - Optional metrics integration
+       - Thread-safe within a single asyncio loop
+
+     Usage:
+         wrapper = ArtifactUpdaterMultiWrapper(
+             wrapped_classes={
+                 'mydata': [DataArtifactA, DataArtifactB],
+             },
+             max_workers=4,
+             retry_attempts=3,
+             update_timeout_seconds=600,
+             backoff_base=2,
+             backoff_max=60,
+             backoff_jitter=0.1,
+             priority_fn=None,  # or custom
+             metrics_client=None,
+             debug=True,
+             logger=None,
+             artifact_class_kwargs={
+                 'fs': my_fs,
+                 'parquet_storage_path': 's3://bucket/data',
+                 'logger': my_logger,
+                 'debug': True,
+             }
+         )
+         await wrapper.update_data('mydata', period='ytd', overwrite=True)
+     """
+     def __init__(
+         self,
+         wrapped_classes: Dict[str, List[Type]],
+         *,
+         max_workers: int = 3,
+         retry_attempts: int = 3,
+         update_timeout_seconds: int = 600,
+         backoff_base: int = 2,
+         backoff_max: Optional[int] = 60,
+         backoff_jitter: float = 0.1,
+         priority_fn: Optional[Callable[[Type], int]] = None,
+         metrics_client: Any = None,
+         artifact_class_kwargs: Optional[Dict[str, Any]] = None,
+         **kwargs: Dict[str, Any]
+     ) -> None:
+         super().__init__(**kwargs)
+         self.wrapped_classes = wrapped_classes
+         self.max_workers = max_workers
+         self.retry_attempts = retry_attempts
+         self.update_timeout_seconds = update_timeout_seconds
+         self.backoff_base = backoff_base
+         self.backoff_max = backoff_max
+         self.backoff_jitter = backoff_jitter
+         self.priority_fn = priority_fn
+         self.metrics_client = metrics_client
+
+         # Default artifact init kwargs
+         today = datetime.datetime.today() + datetime.timedelta(days=1)
+         default_kwargs = {
+             'parquet_start_date': today.strftime('%Y-%m-%d'),
+             'parquet_end_date': today.strftime('%Y-%m-%d'),
+             'logger': self.logger,
+             'debug': self.debug,
+             'fs': self.fs,
+             'verbose': self.verbose,
+         }
+         self.artifact_class_kwargs = artifact_class_kwargs or default_kwargs.copy()
+
+         # State
+         self.completion_times: Dict[str, float] = {}
+         self.failed: List[str] = []
+         self.original_classes: List[Type] = []
+
+     def get_artifact_classes(self, data_type: str) -> List[Type]:
+         """
+         Retrieve artifact classes by data type.
+         """
+         self.logger.info(f"Fetching artifact classes for '{data_type}'")
+         if data_type not in self.wrapped_classes:
+             raise ValueError(f"Unsupported data type: {data_type}")
+         classes = self.wrapped_classes[data_type]
+         self.logger.info(f"Found {len(classes)} artifact classes for '{data_type}'")
+         return classes
+
+     def estimate_priority(self, artifact_cls: Type) -> int:
+         """
+         Determine task priority for ordering. Lower values run first.
+         """
+         name = artifact_cls.__name__
+         if self.priority_fn:
+             try:
+                 pr = self.priority_fn(artifact_cls)
+                 self.logger.debug(f"priority_fn for {name}: {pr}")
+                 return pr
+             except Exception as e:
+                 self.logger.warning(f"priority_fn error for {name}: {e}")
+         try:
+             fs = self.artifact_class_kwargs.get('fs')
+             path = self.artifact_class_kwargs.get('parquet_storage_path')
+             pr = 1
+             if hasattr(artifact_cls, 'get_size_estimate'):
+                 pr = artifact_cls.get_size_estimate(fs, path)
+             self.logger.debug(f"Estimated priority for {name}: {pr}")
+             return pr
+         except Exception:
+             return 1
+
+     async def _bounded_update(self, artifact_cls: Type, sem: asyncio.Semaphore, **update_kwargs) -> None:
+         """
+         Wrap update_artifact in a semaphore slot to limit concurrency.
+         """
+         async with sem:
+             name = artifact_cls.__name__
+             start = asyncio.get_event_loop().time()
+             self.logger.info(f"Starting update for {name}")
+             try:
+                 for attempt in range(1, self.retry_attempts + 1):
+                     try:
+                         artifact = await asyncio.to_thread(
+                             artifact_cls, **self.artifact_class_kwargs
+                         )
+                         await asyncio.wait_for(
+                             asyncio.to_thread(
+                                 artifact.update_parquet, **update_kwargs
+                             ),
+                             timeout=self.update_timeout_seconds
+                         )
+                         duration = asyncio.get_event_loop().time() - start
+                         self.completion_times[name] = duration
+                         self.logger.info(f"✅ {name} updated in {duration:.2f}s (attempt {attempt})")
+                         if self.metrics_client:
+                             self.metrics_client.increment('task_succeeded')
+                         return
+                     except asyncio.TimeoutError:
+                         self.logger.warning(f"Timeout on {name}, attempt {attempt}")
+                     except Exception as e:
+                         self.logger.error(f"Error on {name} attempt {attempt}: {e}")
+
+                     delay = min(self.backoff_base ** (attempt - 1), self.backoff_max)
+                     delay *= 1 + random.uniform(0, self.backoff_jitter)
+                     self.logger.info(f"Sleeping {delay:.1f}s before retrying {name}")
+                     await asyncio.sleep(delay)
+
+             except asyncio.CancelledError:
+                 self.logger.warning(f"{name} update cancelled")
+                 raise
+
+             # permanent failure
+             self.logger.error(f"✖️ {name} permanently failed after {self.retry_attempts} attempts")
+             if self.metrics_client:
+                 self.metrics_client.increment('task_failed')
+             self.failed.append(name)
+
+     async def update_data(self, data_type: str, **kwargs: Any) -> None:
+         """
+         Entry point to update all artifacts of a given type concurrently.
+         """
+         self.logger.info(f"Starting update_data for '{data_type}' with kwargs={kwargs}")
+
+         # RESET STATE
+         self.completion_times.clear()
+         self.failed.clear()
+         self.original_classes = self.get_artifact_classes(data_type)
+
+         # NON-DESTRUCTIVE SORTING
+         ordered = sorted(self.original_classes, key=self.estimate_priority)
+
+         sem = asyncio.Semaphore(self.max_workers)
+         tasks = [
+             asyncio.create_task(self._bounded_update(cls, sem, **kwargs))
+             for cls in ordered
+         ]
+
+         try:
+             for coro in asyncio.as_completed(tasks):
+                 await coro
+         except asyncio.CancelledError:
+             self.logger.warning("update_data was cancelled—aborting remaining retries")
+             for t in tasks:
+                 t.cancel()
+             raise
+         finally:
+             total = len(self.original_classes)
+             completed = len(self.completion_times)
+             failed = len(self.failed)
+             self.logger.info(f"All artifacts processed: total={total}, completed={completed}, failed={failed}")
+
+     def get_update_status(self) -> Dict[str, Any]:
+         """
+         Returns summary status including completion times.
+         """
+         total = len(self.original_classes)
+         completed = set(self.completion_times.keys())
+         failed = set(self.failed)
+         pending = {cls.__name__ for cls in self.original_classes} - completed - failed
+
+         return {
+             'total': total,
+             'completed': list(completed),
+             'failed': list(failed),
+             'pending': list(pending),
+             'completion_times': self.completion_times,
+         }
+
+     @staticmethod
+     def format_status_table(status: Dict[str, Any]) -> str:
+         """
+         Formats the status dict into a readable table.
+         """
+         lines = [
+             f"Total: {status['total']}",
+             f"Completed: {len(status['completed'])} {status['completed']}",
+             f"Failed: {len(status['failed'])} {status['failed']}",
+             f"Pending: {len(status['pending'])} {status['pending']}",
+             "",
+             "Per-artifact timings:"
+         ]
+         for name, dur in status['completion_times'].items():
+             lines.append(f" {name}: {dur:.2f}s")
+         return "\n".join(lines)
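Both wrappers expose update_data as a coroutine, so even the threaded variant has to be driven from an event loop. A minimal, hedged sketch of driving the async wrapper from synchronous code, assuming a hypothetical artifact class that follows the update_parquet(**kwargs) contract used above and that ManagedResource supplies usable logger and filesystem defaults:

import asyncio

from sibi_dst.df_helper import ArtifactUpdaterMultiWrapperAsync


class DailySalesArtifact:
    """Hypothetical artifact: accepts the wrapper's init kwargs, writes parquet on update."""

    def __init__(self, **kwargs):
        self.kwargs = kwargs

    def update_parquet(self, **update_kwargs):
        pass  # a real artifact (e.g. a ParquetArtifact subclass) would write files here


async def main():
    wrapper = ArtifactUpdaterMultiWrapperAsync(
        wrapped_classes={"sales": [DailySalesArtifact]},
        max_workers=2,
        retry_attempts=2,
    )
    await wrapper.update_data("sales", period="ytd", overwrite=True)
    print(wrapper.format_status_table(wrapper.get_update_status()))


asyncio.run(main())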
sibi_dst/df_helper/_parquet_reader.py
@@ -1,4 +1,3 @@
- import logging
  from typing import Optional, ClassVar, Dict

  import dask.dataframe as dd
@@ -47,15 +46,13 @@ class ParquetReader(DfHelper):
          'backend': 'parquet'
      }

-     def __init__(self, filesystem_type="file", filesystem_options=None, **kwargs):
+     def __init__(self, **kwargs):
          self.config = {
              **self.DEFAULT_CONFIG,
              **kwargs,
          }
-         self.df: Optional[dd.DataFrame] = None
-         #self.debug = self.config.setdefault('debug', False)
-         #self.logger = self.config.setdefault('logger', Logger.default_logger(logger_name=self.__class__.__name__))
-         #self.logger.set_level(logging.DEBUG if self.debug else logging.INFO)
+         super().__init__(**self.config)
+
          self.parquet_storage_path = self.config.setdefault('parquet_storage_path', None)
          if self.parquet_storage_path is None:
              raise ValueError('parquet_storage_path must be set')
@@ -67,19 +64,9 @@ class ParquetReader(DfHelper):
          if self.parquet_end_date is None:
              raise ValueError('parquet_end_date must be set')

-         # Filesystem setup
-         #self.filesystem_type = filesystem_type
-         #self.filesystem_options = filesystem_options or {}
-         #self.fs = self.config.setdefault('fs', None)
-         #if self.fs is None:
-         #    self.fs = fsspec.filesystem(self.filesystem_type, **self.filesystem_options)
-         #self.config.setdefault('fs', self.fs)
-
          if not self.directory_exists():
              raise ValueError(f"{self.parquet_storage_path} does not exist")

-         super().__init__(**self.config)
-
      def load(self, **kwargs):
          self.df = super().load(**kwargs)
          return self.df
@@ -89,5 +76,4 @@ class ParquetReader(DfHelper):
              info = self.fs.info(self.parquet_storage_path)
              return info['type'] == 'directory'
          except FileNotFoundError:
-             return False
-
+             return False
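With this change, ParquetReader's constructor no longer accepts filesystem_type or filesystem_options; filesystem wiring now comes through the config handled by the DfHelper base (for example an fsspec filesystem passed as fs), and the storage path must already exist. A hedged construction sketch with placeholder path and dates (the exact accepted config keys come from DfHelper/ManagedResource):

import fsspec

from sibi_dst.df_helper import ParquetReader

reader = ParquetReader(
    fs=fsspec.filesystem("file"),
    parquet_storage_path="/data/parquet/my_dataset",
    parquet_start_date="2025-01-01",
    parquet_end_date="2025-01-31",
)
df = reader.load()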
sibi_dst/utils/data_wrapper.py
@@ -1,16 +1,13 @@
  import datetime
- import logging
  import threading
  import time
  from concurrent.futures import ThreadPoolExecutor, as_completed
  from typing import Type, Any, Dict, Optional, Union, List, ClassVar

- import fsspec
  import pandas as pd
  from tqdm import tqdm

  from . import ManagedResource
- from .log_utils import Logger
  from .parquet_saver import ParquetSaver


@@ -60,8 +57,7 @@ class DataWrapper(ManagedResource):
          self.processed_dates: List[datetime.date] = []
          self.benchmarks: Dict[datetime.date, Dict[str, float]] = {}
          self.mmanifest = kwargs.get("mmanifest", None)
-         self.update_planner=kwargs.get("update_planner", None)
-
+         self.update_planner = kwargs.get("update_planner", None)

      def __exit__(self, exc_type, exc_val, exc_tb):
          """Context manager exit"""
@@ -104,7 +100,6 @@ class DataWrapper(ManagedResource):
          if self.update_planner.show_progress:
              self.show_benchmark_summary()

-
      def _execute_task_batch(self, priority: int, dates: List[datetime.date], max_retries: int):
          """Executes a single batch of tasks (dates) using a thread pool."""
          desc = f"Processing {self.dataclass.__name__}, priority: {priority}"
@@ -135,7 +130,7 @@ class DataWrapper(ManagedResource):
                      time.sleep(2 ** attempt)  # Exponential backoff
                  else:
                      self.logger.error(f"Failed processing {date} after {max_retries} attempts.")
-                     #raise
+                     # raise

      def _process_single_date(self, date: datetime.date):
          """Core date processing logic with load/save timing and thread reporting"""
@@ -146,8 +141,8 @@ class DataWrapper(ManagedResource):
              return
          full_path = f"{path}{self.parquet_filename}"

-         #thread_name = threading.current_thread().name
-         #self.logger.debug(f"[{thread_name}] Executing date: {date} -> saving to: {full_path}")
+         # thread_name = threading.current_thread().name
+         # self.logger.debug(f"[{thread_name}] Executing date: {date} -> saving to: {full_path}")

          overall_start = time.perf_counter()
          try:
@@ -159,35 +154,43 @@ class DataWrapper(ManagedResource):
              local_load_params = self.load_params.copy()
              local_load_params.update(date_filter)
              local_class_instance = self.dataclass(**self.class_params)
-             df=local_class_instance.load(**local_load_params)
+             df = local_class_instance.load(**local_load_params)
              load_time = time.perf_counter() - load_start

              if hasattr(local_class_instance, "total_records"):
-                 self.logger.debug(f"Total records loaded by {local_class_instance.__class__.__name__}: {local_class_instance.total_records}")
+                 self.logger.debug(
+                     f"Total records loaded by {local_class_instance.__class__.__name__}: {local_class_instance.total_records}")
                  if int(local_class_instance.total_records) == 0:  # If no records were loaded but not due to an error
                      if self.mmanifest:
                          self.mmanifest.record(
                              full_path=path
                          )
-                     self.logger.info(f"No data found for {date}. Logged to missing manifest.")
-                     return
-             save_start = time.perf_counter()
-             with ParquetSaver(
-                 df_result=df,
-                 parquet_storage_path=path,
-                 fs=self.fs,
-                 logger=self.logger
-             ) as ps:
-                 ps.save_to_parquet(self.parquet_filename, overwrite=True)
-             save_time = time.perf_counter() - save_start
-
-             total_time = time.perf_counter() - overall_start
-             self.benchmarks[date] = {
-                 "load_duration": load_time,
-                 "save_duration": save_time,
-                 "total_duration": total_time
-             }
-             self._log_success(date, total_time, full_path)
+                     self.logger.info(f"No data found for {full_path}. Logged to missing manifest.")
+                 elif int(local_class_instance.total_records) < 0:
+                     self.logger.warning(
+                         f"Negative record count ({local_class_instance.total_records}) for {full_path}. "
+                         "This may indicate an error in the data loading process."
+                     )
+                 else:
+                     save_start = time.perf_counter()
+                     parquet_params = {
+                         "df_result": df,
+                         "parquet_storage_path": path,
+                         "fs": self.fs,
+                         "logger": self.logger,
+                         "debug": self.debug,
+                     }
+                     with ParquetSaver(**parquet_params) as ps:
+                         ps.save_to_parquet(self.parquet_filename, overwrite=True)
+                     save_time = time.perf_counter() - save_start
+
+                     total_time = time.perf_counter() - overall_start
+                     self.benchmarks[date] = {
+                         "load_duration": load_time,
+                         "save_duration": save_time,
+                         "total_duration": total_time
+                     }
+                     self._log_success(date, total_time, full_path)
          except Exception as e:
              self._log_failure(date, e)
              raise
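The save step in _process_single_date now branches on the loaded record count and forwards debug to ParquetSaver. A condensed, hedged sketch of that logic as a standalone helper (df, path, filename, fs, and logger are placeholders supplied by the caller):

from sibi_dst.utils.parquet_saver import ParquetSaver


def save_if_loaded(total_records, df, path, filename, fs, logger, debug=False):
    # Mirrors the new branch: skip empty loads, warn on negative counts, save otherwise.
    if int(total_records) == 0:
        logger.info(f"No data found for {path}{filename}.")
    elif int(total_records) < 0:
        logger.warning(f"Negative record count ({total_records}) for {path}{filename}.")
    else:
        with ParquetSaver(df_result=df, parquet_storage_path=path,
                          fs=fs, logger=logger, debug=debug) as ps:
            ps.save_to_parquet(filename, overwrite=True)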
sibi_dst/utils/parquet_saver.py
@@ -26,10 +26,6 @@ class ParquetSaver(ManagedResource):
          super().__init__(**kwargs)
          self.df_result = df_result
          self.parquet_storage_path = parquet_storage_path.rstrip("/")
-         #self.debug = debug
-         #self.logger = logger or Logger.default_logger(self.__class__.__name__)
-         #self.logger.set_level(logging.DEBUG if self.debug else logging.INFO)
-         #self.fs = fs
          # Determine protocol for special handling (e.g., 's3')
          if not self.fs:
              raise ValueError("File system (fs) must be provided to ParquetSaver.")