sibi-dst 2025.1.3__tar.gz → 2025.1.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85) hide show
  1. {sibi_dst-2025.1.3 → sibi_dst-2025.1.5}/PKG-INFO +3 -1
  2. {sibi_dst-2025.1.3 → sibi_dst-2025.1.5}/pyproject.toml +9 -1
  3. {sibi_dst-2025.1.3 → sibi_dst-2025.1.5}/sibi_dst/__init__.py +4 -1
  4. {sibi_dst-2025.1.3 → sibi_dst-2025.1.5}/sibi_dst/df_helper/__init__.py +2 -2
  5. sibi_dst-2025.1.5/sibi_dst/df_helper/_artifact_updater_multi_wrapper.py +431 -0
  6. {sibi_dst-2025.1.3 → sibi_dst-2025.1.5}/sibi_dst/df_helper/_df_helper.py +47 -30
  7. {sibi_dst-2025.1.3 → sibi_dst-2025.1.5}/sibi_dst/df_helper/_parquet_artifact.py +57 -47
  8. {sibi_dst-2025.1.3 → sibi_dst-2025.1.5}/sibi_dst/df_helper/_parquet_reader.py +9 -13
  9. {sibi_dst-2025.1.3 → sibi_dst-2025.1.5}/sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +15 -11
  10. {sibi_dst-2025.1.3 → sibi_dst-2025.1.5}/sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +23 -16
  11. {sibi_dst-2025.1.3 → sibi_dst-2025.1.5}/sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +17 -11
  12. sibi_dst-2025.1.5/sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +104 -0
  13. {sibi_dst-2025.1.3 → sibi_dst-2025.1.5}/sibi_dst/utils/__init__.py +3 -2
  14. sibi_dst-2025.1.5/sibi_dst/utils/base.py +97 -0
  15. {sibi_dst-2025.1.3 → sibi_dst-2025.1.5}/sibi_dst/utils/clickhouse_writer.py +5 -4
  16. {sibi_dst-2025.1.3 → sibi_dst-2025.1.5}/sibi_dst/utils/data_wrapper.py +69 -84
  17. {sibi_dst-2025.1.3 → sibi_dst-2025.1.5}/sibi_dst/utils/date_utils.py +2 -1
  18. sibi_dst-2025.1.5/sibi_dst/utils/log_utils.py +372 -0
  19. sibi_dst-2025.1.5/sibi_dst/utils/manifest_manager.py +207 -0
  20. sibi_dst-2025.1.5/sibi_dst/utils/parquet_saver.py +121 -0
  21. {sibi_dst-2025.1.3 → sibi_dst-2025.1.5}/sibi_dst/utils/storage_config.py +6 -0
  22. {sibi_dst-2025.1.3 → sibi_dst-2025.1.5}/sibi_dst/utils/storage_manager.py +2 -1
  23. {sibi_dst-2025.1.3 → sibi_dst-2025.1.5}/sibi_dst/utils/update_planner.py +72 -22
  24. sibi_dst-2025.1.3/sibi_dst/df_helper/_artifact_updater_multi_wrapper.py +0 -239
  25. sibi_dst-2025.1.3/sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +0 -206
  26. sibi_dst-2025.1.3/sibi_dst/utils/log_utils.py +0 -140
  27. sibi_dst-2025.1.3/sibi_dst/utils/manifest_manager.py +0 -486
  28. sibi_dst-2025.1.3/sibi_dst/utils/parquet_saver.py +0 -196
  29. sibi_dst-2025.1.3/sibi_dst/v3/__init__.py +0 -0
  30. sibi_dst-2025.1.3/sibi_dst/v3/backends/__init__.py +0 -0
  31. sibi_dst-2025.1.3/sibi_dst/v3/df_helper/__init__.py +0 -0
  32. sibi_dst-2025.1.3/sibi_dst/v3/df_helper/_df_helper.py +0 -91
  33. {sibi_dst-2025.1.3 → sibi_dst-2025.1.5}/README.md +0 -0
  34. {sibi_dst-2025.1.3 → sibi_dst-2025.1.5}/sibi_dst/df_helper/backends/__init__.py +0 -0
  35. {sibi_dst-2025.1.3 → sibi_dst-2025.1.5}/sibi_dst/df_helper/backends/http/__init__.py +0 -0
  36. {sibi_dst-2025.1.3 → sibi_dst-2025.1.5}/sibi_dst/df_helper/backends/http/_http_config.py +0 -0
  37. {sibi_dst-2025.1.3 → sibi_dst-2025.1.5}/sibi_dst/df_helper/backends/parquet/__init__.py +0 -0
  38. {sibi_dst-2025.1.3 → sibi_dst-2025.1.5}/sibi_dst/df_helper/backends/parquet/_filter_handler.py +0 -0
  39. {sibi_dst-2025.1.3 → sibi_dst-2025.1.5}/sibi_dst/df_helper/backends/parquet/_parquet_options.py +0 -0
  40. {sibi_dst-2025.1.3 → sibi_dst-2025.1.5}/sibi_dst/df_helper/backends/sqlalchemy/__init__.py +0 -0
  41. {sibi_dst-2025.1.3 → sibi_dst-2025.1.5}/sibi_dst/df_helper/core/__init__.py +0 -0
  42. {sibi_dst-2025.1.3 → sibi_dst-2025.1.5}/sibi_dst/df_helper/core/_defaults.py +0 -0
  43. {sibi_dst-2025.1.3 → sibi_dst-2025.1.5}/sibi_dst/df_helper/core/_filter_handler.py +0 -0
  44. {sibi_dst-2025.1.3 → sibi_dst-2025.1.5}/sibi_dst/df_helper/core/_params_config.py +0 -0
  45. {sibi_dst-2025.1.3 → sibi_dst-2025.1.5}/sibi_dst/df_helper/core/_query_config.py +0 -0
  46. {sibi_dst-2025.1.3 → sibi_dst-2025.1.5}/sibi_dst/df_helper/data_cleaner.py +0 -0
  47. {sibi_dst-2025.1.3 → sibi_dst-2025.1.5}/sibi_dst/geopy_helper/__init__.py +0 -0
  48. {sibi_dst-2025.1.3 → sibi_dst-2025.1.5}/sibi_dst/geopy_helper/geo_location_service.py +0 -0
  49. {sibi_dst-2025.1.3 → sibi_dst-2025.1.5}/sibi_dst/geopy_helper/utils.py +0 -0
  50. {sibi_dst-2025.1.3 → sibi_dst-2025.1.5}/sibi_dst/osmnx_helper/__init__.py +0 -0
  51. {sibi_dst-2025.1.3 → sibi_dst-2025.1.5}/sibi_dst/osmnx_helper/base_osm_map.py +0 -0
  52. {sibi_dst-2025.1.3 → sibi_dst-2025.1.5}/sibi_dst/osmnx_helper/basemaps/__init__.py +0 -0
  53. {sibi_dst-2025.1.3 → sibi_dst-2025.1.5}/sibi_dst/osmnx_helper/basemaps/calendar_html.py +0 -0
  54. {sibi_dst-2025.1.3 → sibi_dst-2025.1.5}/sibi_dst/osmnx_helper/basemaps/router_plotter.py +0 -0
  55. {sibi_dst-2025.1.3 → sibi_dst-2025.1.5}/sibi_dst/osmnx_helper/utils.py +0 -0
  56. {sibi_dst-2025.1.3 → sibi_dst-2025.1.5}/sibi_dst/tests/__init__.py +0 -0
  57. {sibi_dst-2025.1.3 → sibi_dst-2025.1.5}/sibi_dst/tests/test_data_wrapper_class.py +0 -0
  58. {sibi_dst-2025.1.3 → sibi_dst-2025.1.5}/sibi_dst/utils/credentials.py +0 -0
  59. {sibi_dst-2025.1.3 → sibi_dst-2025.1.5}/sibi_dst/utils/data_from_http_source.py +0 -0
  60. {sibi_dst-2025.1.3 → sibi_dst-2025.1.5}/sibi_dst/utils/data_utils.py +0 -0
  61. {sibi_dst-2025.1.3 → sibi_dst-2025.1.5}/sibi_dst/utils/df_utils.py +0 -0
  62. {sibi_dst-2025.1.3 → sibi_dst-2025.1.5}/sibi_dst/utils/file_utils.py +0 -0
  63. {sibi_dst-2025.1.3 → sibi_dst-2025.1.5}/sibi_dst/utils/filepath_generator.py +0 -0
  64. {sibi_dst-2025.1.3 → sibi_dst-2025.1.5}/sibi_dst/utils/phone_formatter.py +0 -0
  65. {sibi_dst-2025.1.3 → sibi_dst-2025.1.5}/sibi_dst/utils/webdav_client.py +0 -0
  66. {sibi_dst-2025.1.3 → sibi_dst-2025.1.5}/sibi_dst/v2/__init__.py +0 -0
  67. {sibi_dst-2025.1.3 → sibi_dst-2025.1.5}/sibi_dst/v2/df_helper/__init__.py +0 -0
  68. {sibi_dst-2025.1.3 → sibi_dst-2025.1.5}/sibi_dst/v2/df_helper/_df_helper.py +0 -0
  69. {sibi_dst-2025.1.3 → sibi_dst-2025.1.5}/sibi_dst/v2/df_helper/backends/__init__.py +0 -0
  70. {sibi_dst-2025.1.3 → sibi_dst-2025.1.5}/sibi_dst/v2/df_helper/backends/sqlalchemy/__init__.py +0 -0
  71. {sibi_dst-2025.1.3 → sibi_dst-2025.1.5}/sibi_dst/v2/df_helper/backends/sqlalchemy/_db_connection.py +0 -0
  72. {sibi_dst-2025.1.3 → sibi_dst-2025.1.5}/sibi_dst/v2/df_helper/backends/sqlalchemy/_io_dask.py +0 -0
  73. {sibi_dst-2025.1.3 → sibi_dst-2025.1.5}/sibi_dst/v2/df_helper/backends/sqlalchemy/_load_from_db.py +0 -0
  74. {sibi_dst-2025.1.3 → sibi_dst-2025.1.5}/sibi_dst/v2/df_helper/backends/sqlalchemy/_model_builder.py +0 -0
  75. {sibi_dst-2025.1.3 → sibi_dst-2025.1.5}/sibi_dst/v2/df_helper/backends/sqlmodel/__init__.py +0 -0
  76. {sibi_dst-2025.1.3 → sibi_dst-2025.1.5}/sibi_dst/v2/df_helper/backends/sqlmodel/_db_connection.py +0 -0
  77. {sibi_dst-2025.1.3 → sibi_dst-2025.1.5}/sibi_dst/v2/df_helper/backends/sqlmodel/_io_dask.py +0 -0
  78. {sibi_dst-2025.1.3 → sibi_dst-2025.1.5}/sibi_dst/v2/df_helper/backends/sqlmodel/_load_from_db.py +0 -0
  79. {sibi_dst-2025.1.3 → sibi_dst-2025.1.5}/sibi_dst/v2/df_helper/backends/sqlmodel/_model_builder.py +0 -0
  80. {sibi_dst-2025.1.3 → sibi_dst-2025.1.5}/sibi_dst/v2/df_helper/core/__init__.py +0 -0
  81. {sibi_dst-2025.1.3 → sibi_dst-2025.1.5}/sibi_dst/v2/df_helper/core/_filter_handler.py +0 -0
  82. {sibi_dst-2025.1.3 → sibi_dst-2025.1.5}/sibi_dst/v2/df_helper/core/_params_config.py +0 -0
  83. {sibi_dst-2025.1.3 → sibi_dst-2025.1.5}/sibi_dst/v2/df_helper/core/_query_config.py +0 -0
  84. {sibi_dst-2025.1.3 → sibi_dst-2025.1.5}/sibi_dst/v2/utils/__init__.py +0 -0
  85. {sibi_dst-2025.1.3 → sibi_dst-2025.1.5}/sibi_dst/v2/utils/log_utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: sibi-dst
3
- Version: 2025.1.3
3
+ Version: 2025.1.5
4
4
  Summary: Data Science Toolkit
5
5
  Author: Luis Valverde
6
6
  Author-email: lvalverdeb@gmail.com
@@ -15,8 +15,10 @@ Requires-Dist: dask[complete] (>=2025.5.1,<2026.0.0)
15
15
  Requires-Dist: mysqlclient (>=2.2.7,<3.0.0)
16
16
  Requires-Dist: pandas (>=2.3.1,<3.0.0)
17
17
  Requires-Dist: psycopg2 (>=2.9.10,<3.0.0)
18
+ Requires-Dist: pyarrow (>=20.0.0,<21.0.0)
18
19
  Requires-Dist: pydantic (>=2.11.7,<3.0.0)
19
20
  Requires-Dist: pymysql (>=1.1.1,<2.0.0)
21
+ Requires-Dist: rich (>=14.0.0,<15.0.0)
20
22
  Requires-Dist: s3fs (>=2025.5.1,<2026.0.0)
21
23
  Requires-Dist: sqlalchemy (>=2.0.41,<3.0.0)
22
24
  Requires-Dist: tqdm (>=4.67.1,<5.0.0)
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "sibi-dst"
3
- version = "2025.1.3"
3
+ version = "2025.1.5"
4
4
  description = "Data Science Toolkit"
5
5
  authors = ["Luis Valverde <lvalverdeb@gmail.com>"]
6
6
  readme = "README.md"
@@ -20,6 +20,8 @@ s3fs = "^2025.5.1"
20
20
  pydantic = "^2.11.7"
21
21
  sqlalchemy = "^2.0.41"
22
22
  pymysql = "^1.1.1"
23
+ pyarrow = "^20.0.0"
24
+ rich = "^14.0.0"
23
25
 
24
26
  [tool.poetry.group.dev]
25
27
  optional = true
@@ -45,6 +47,12 @@ geopy = "^2.4.1"
45
47
  folium = "^0.20.0"
46
48
  geopandas = "^1.1.1"
47
49
 
50
+
51
+ [tool.poetry.group.loggers.dependencies]
52
+ opentelemetry-sdk = "^1.34.1"
53
+ opentelemetry-api = "^1.34.1"
54
+ opentelemetry-exporter-otlp = "^1.34.1"
55
+
48
56
  [build-system]
49
57
  requires = ["poetry-core"]
50
58
  build-backend = "poetry.core.masonry.api"
@@ -1,4 +1,3 @@
1
-
2
1
  try:
3
2
  import importlib.metadata as version_reader
4
3
  except ImportError:
@@ -8,3 +7,7 @@ try:
8
7
  __version__ = version_reader.version("sibi-dst")
9
8
  except version_reader.PackageNotFoundError:
10
9
  __version__ = "unknown"
10
+
11
+ __all__ = [
12
+ "__version__",
13
+ ]
@@ -3,11 +3,11 @@ from __future__ import annotations
3
3
  from ._df_helper import DfHelper
4
4
  from ._parquet_artifact import ParquetArtifact
5
5
  from ._parquet_reader import ParquetReader
6
- from ._artifact_updater_multi_wrapper import ArtifactUpdaterMultiWrapper
6
+ from ._artifact_updater_multi_wrapper import ArtifactUpdaterMultiWrapperThreaded
7
7
 
8
8
  __all__ = [
9
9
  'DfHelper',
10
10
  'ParquetArtifact',
11
11
  'ParquetReader',
12
- 'ArtifactUpdaterMultiWrapper',
12
+ 'ArtifactUpdaterMultiWrapperThreaded',
13
13
  ]
@@ -0,0 +1,431 @@
1
+ import datetime
2
+ import time
3
+ import random
4
+ from concurrent.futures import ThreadPoolExecutor, as_completed
5
+ from typing import Any, Callable, Dict, List, Optional, Type
6
+
7
+ from sibi_dst.utils import ManagedResource
8
+
9
+
10
class ArtifactUpdaterMultiWrapperThreaded(ManagedResource):
    """
    Updates artifacts concurrently using a ThreadPoolExecutor.

    This version is refactored for a pure multi-threaded environment, aligning
    the orchestration model with the underlying threaded workers (DataWrapper).

    Each artifact class is instantiated with ``artifact_class_kwargs`` and its
    ``update_parquet`` method is called inside a worker thread, with retries
    and exponential backoff on failure.

    NOTE(review): ``self.logger``, ``self.debug``, ``self.fs`` and
    ``self.verbose`` are read right after ``super().__init__(**kwargs)``, so
    they are presumably set by ``ManagedResource`` -- confirm against that class.
    """

    def __init__(
        self,
        wrapped_classes: Dict[str, List[Type]],
        *,
        max_workers: int = 4,
        retry_attempts: int = 3,
        backoff_base: int = 2,
        backoff_max: int = 60,
        backoff_jitter: float = 0.1,
        priority_fn: Optional[Callable[[Type], int]] = None,
        artifact_class_kwargs: Optional[Dict[str, Any]] = None,
        **kwargs: Any,
    ) -> None:
        """
        Configure the updater.

        :param wrapped_classes: Mapping of data type name to the artifact classes to update.
        :param max_workers: Size of the thread pool used by ``update_data``.
        :param retry_attempts: Maximum number of attempts per artifact.
        :param backoff_base: Base of the exponential backoff (``base ** (attempt - 1)`` seconds).
        :param backoff_max: Upper bound, in seconds, for the backoff before jitter is applied.
        :param backoff_jitter: Maximum fraction by which the delay is randomly increased.
        :param priority_fn: Optional callable returning a sort key for an artifact class
            (lower values are submitted first).
        :param artifact_class_kwargs: Keyword arguments used to instantiate every artifact
            class. When omitted (or empty) a default set is built, see below.
        :param kwargs: Forwarded unchanged to ``ManagedResource.__init__``.
        """
        super().__init__(**kwargs)
        self.wrapped_classes = wrapped_classes
        self.max_workers = max_workers
        self.retry_attempts = retry_attempts
        self.backoff_base = backoff_base
        self.backoff_max = backoff_max
        self.backoff_jitter = backoff_jitter
        self.priority_fn = priority_fn

        # Default artifact init kwargs: both parquet dates are set to tomorrow's date.
        tomorrow = datetime.datetime.today() + datetime.timedelta(days=1)
        default_kwargs = {
            'parquet_start_date': tomorrow.strftime('%Y-%m-%d'),
            'parquet_end_date': tomorrow.strftime('%Y-%m-%d'),
            'logger': self.logger,
            'debug': self.debug,
            'fs': self.fs,
            'verbose': self.verbose,
        }
        # Caller-supplied kwargs replace the defaults wholesale; they are not merged.
        self.artifact_class_kwargs = artifact_class_kwargs or default_kwargs

        # State tracking (reset at the start of every update_data call)
        self.completion_times: Dict[str, float] = {}
        self.failed: List[str] = []
        self.original_classes: List[Type] = []

    def get_artifact_classes(self, data_type: str) -> List[Type]:
        """
        Retrieve artifact classes by data type.

        :raises ValueError: if ``data_type`` is unknown or maps to an empty list.
        """
        self.logger.info(f"Fetching artifact classes for '{data_type}'")
        classes = self.wrapped_classes.get(data_type)
        if not classes:
            raise ValueError(f"Unsupported data type: {data_type}")
        self.logger.info(f"Found {len(classes)} artifact classes for '{data_type}'")
        return classes

    def estimate_priority(self, artifact_cls: Type) -> int:
        """
        Determines task priority. Lower values run first.

        Resolution order: ``priority_fn`` (if set), then the artifact's
        ``get_size_estimate()`` (if the class defines it), then a default of 999.
        A failure in either of the first two is logged as a warning and the next
        option is tried.

        Note: This is a blocking call and will run sequentially before updates start.
        """
        name = artifact_cls.__name__
        # Custom priority function takes precedence
        if self.priority_fn:
            try:
                return self.priority_fn(artifact_cls)
            except Exception as e:
                self.logger.warning(f"priority_fn error for {name}: {e}")

        # Fallback to size estimate if available
        if hasattr(artifact_cls, 'get_size_estimate'):
            try:
                # Instantiates the artifact; this performs blocking I/O
                return artifact_cls(**self.artifact_class_kwargs).get_size_estimate()
            except Exception as e:
                self.logger.warning(f"get_size_estimate failed for {name}: {e}")

        # Default priority
        return 999

    def _update_artifact_with_retry(self, artifact_cls: Type, update_kwargs: Dict[str, Any]) -> str:
        """
        A blocking worker function that handles instantiation, update, and retries for a single artifact.
        This function is designed to be run in a ThreadPoolExecutor.

        :param artifact_cls: Artifact class to instantiate with ``artifact_class_kwargs``.
        :param update_kwargs: Keyword arguments passed to ``update_parquet``.
        :return: The artifact class name on success.
        :raises RuntimeError: when every attempt failed; the last underlying
            exception is attached as ``__cause__``.
        """
        name = artifact_cls.__name__
        self.logger.debug(f"Worker thread starting update for {name}")

        last_error: Optional[Exception] = None
        for attempt in range(1, self.retry_attempts + 1):
            try:
                # Instantiate and update directly within the worker thread
                artifact_instance = artifact_cls(**self.artifact_class_kwargs)
                artifact_instance.update_parquet(**update_kwargs)
            except Exception as e:
                last_error = e
                self.logger.error(f"Error on {name} attempt {attempt}/{self.retry_attempts}: {e}", exc_info=self.debug)
                if attempt < self.retry_attempts:
                    # Exponential backoff capped at backoff_max, then stretched by random jitter
                    delay = min(self.backoff_base ** (attempt - 1), self.backoff_max)
                    delay *= 1 + random.uniform(0, self.backoff_jitter)
                    self.logger.info(f"Sleeping {delay:.1f}s before retrying {name}")
                    time.sleep(delay)
            else:
                self.logger.info(f"✅ {name} updated successfully on attempt {attempt}")
                return name  # Return the name on success

        # If all retries fail, raise an exception to be caught by the main loop,
        # chaining the last real error so the root cause is not lost.
        raise RuntimeError(f"{name} failed after {self.retry_attempts} attempts.") from last_error

    def update_data(self, data_type: str, **kwargs: Any) -> None:
        """
        Entry point to update all artifacts of a given type using a ThreadPoolExecutor.

        Resets the run state, orders the classes by ``estimate_priority`` and
        submits one task per class. ``completion_times`` records, per artifact
        name, the seconds elapsed between the start of the batch and the moment
        its result was collected; names of artifacts that exhausted their
        retries are appended to ``failed``.

        :param data_type: Key into ``wrapped_classes``.
        :param kwargs: Forwarded to each artifact's ``update_parquet``.
        :raises ValueError: if ``data_type`` is not supported.
        """
        self.logger.debug(f"Starting multi-threaded update for '{data_type}' with kwargs={kwargs}")

        # Reset state for this run
        self.completion_times.clear()
        self.failed.clear()
        self.original_classes = self.get_artifact_classes(data_type)

        # Sequentially estimate priorities and sort classes before execution
        self.logger.debug("Estimating priorities to order tasks...")
        ordered_classes = sorted(self.original_classes, key=self.estimate_priority)
        self.logger.debug("Priority estimation complete. Submitting tasks to thread pool.")

        start_time = time.monotonic()

        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            future_to_class_name = {
                executor.submit(self._update_artifact_with_retry, cls, kwargs): cls.__name__
                for cls in ordered_classes
            }

            for future in as_completed(future_to_class_name):
                name = future_to_class_name[future]
                try:
                    # result() will re-raise the exception from the worker if one occurred
                    future.result()
                except Exception:
                    self.logger.error(f"✖️ {name} permanently failed. See error log above.")
                    self.failed.append(name)
                else:
                    # If no exception, the task succeeded
                    self.completion_times[name] = time.monotonic() - start_time

        # Log final status
        total = len(self.original_classes)
        completed = len(self.completion_times)
        failed_count = len(self.failed)
        self.logger.info(f"All artifacts processed: total={total}, completed={completed}, failed={failed_count}")

    def get_update_status(self) -> Dict[str, Any]:
        """
        Returns a summary status including completion times.

        The returned structure is a snapshot: the name lists are sorted for a
        deterministic order and ``completion_times`` is a copy, so a later
        ``update_data`` run (which clears the internal state) does not alter it.
        """
        completed_set = set(self.completion_times)
        failed_set = set(self.failed)
        pending_set = {cls.__name__ for cls in self.original_classes} - completed_set - failed_set

        return {
            'total': len(self.original_classes),
            'completed': sorted(completed_set),
            'failed': sorted(failed_set),
            'pending': sorted(pending_set),
            'completion_times': dict(self.completion_times),
        }

    @staticmethod
    def format_status_table(status: Dict[str, Any]) -> str:
        """
        Formats the status dictionary into a readable table.

        :param status: A dictionary shaped like the return value of ``get_update_status``.
        :return: Multi-line text with the counters, the completion times sorted
            from slowest to fastest and, when present, the failed artifacts.
        """
        lines = [
            f"Total: {status['total']}",
            f"Completed: {len(status['completed'])}",
            f"Failed: {len(status['failed'])}",
            f"Pending: {len(status['pending'])}",
            "\nPer-artifact completion times (seconds):"
        ]
        sorted_times = sorted(status['completion_times'].items(), key=lambda item: item[1], reverse=True)
        lines.extend(f" - {name:<30}: {duration:.2f}s" for name, duration in sorted_times)
        if status['failed']:
            lines.append("\nFailed artifacts:")
            lines.extend(f" - {name}" for name in status['failed'])
        return "\n".join(lines)
191
+
192
+
193
+ # import asyncio
194
+ # import logging
195
+ # import datetime
196
+ # import random
197
+ # from typing import Any, Callable, Dict, List, Optional, Type
198
+ #
199
+ # from sibi_dst.utils import Logger
200
+ #
201
+ #
202
+ # class ArtifactUpdaterMultiWrapperAsync:
203
+ # """
204
+ # Simplified wrapper that updates artifacts concurrently using an asyncio.Semaphore.
205
+ #
206
+ # Features:
207
+ # - Caps concurrency at max_workers via semaphore
208
+ # - Optionally prioritises tasks via a priority function or static method on artifact classes
209
+ # - Tracks per-artifact completion times
210
+ # - Configurable retry/backoff strategy
211
+ # - Optional metrics integration
212
+ # - Thread-safe within a single asyncio loop
213
+ #
214
+ # Usage:
215
+ # wrapper = ArtifactUpdaterMultiWrapperAsync(
216
+ # wrapped_classes={
217
+ # 'mydata': [DataArtifactA, DataArtifactB],
218
+ # },
219
+ # max_workers=4,
220
+ # retry_attempts=3,
221
+ # update_timeout_seconds=600,
222
+ # backoff_base=2,
223
+ # backoff_max=60,
224
+ # backoff_jitter=0.1,
225
+ # priority_fn=None, # or custom
226
+ # metrics_client=None,
227
+ # debug=True,
228
+ # logger=None,
229
+ # artifact_class_kwargs={
230
+ # 'fs': my_fs,
231
+ # 'parquet_storage_path': 's3://bucket/data',
232
+ # 'logger': my_logger,
233
+ # 'debug': True,
234
+ # }
235
+ # )
236
+ # await wrapper.update_data('mydata', period='ytd', overwrite=True)
237
+ # """
238
+ # def __init__(
239
+ # self,
240
+ # wrapped_classes: Dict[str, List[Type]],
241
+ # *,
242
+ # max_workers: int = 3,
243
+ # retry_attempts: int = 3,
244
+ # update_timeout_seconds: int = 600,
245
+ # backoff_base: int = 2,
246
+ # backoff_max: Optional[int] = 60,
247
+ # backoff_jitter: float = 0.1,
248
+ # priority_fn: Optional[Callable[[Type], int]] = None,
249
+ # metrics_client: Any = None,
250
+ # debug: bool = False,
251
+ # logger: Optional[logging.Logger] = None,
252
+ # artifact_class_kwargs: Optional[Dict[str, Any]] = None,
253
+ # ) -> None:
254
+ # self.wrapped_classes = wrapped_classes
255
+ # self.max_workers = max_workers
256
+ # self.retry_attempts = retry_attempts
257
+ # self.update_timeout_seconds = update_timeout_seconds
258
+ # self.backoff_base = backoff_base
259
+ # self.backoff_max = backoff_max
260
+ # self.backoff_jitter = backoff_jitter
261
+ # self.priority_fn = priority_fn
262
+ # self.metrics_client = metrics_client
263
+ #
264
+ # self.debug = debug
265
+ # self.logger = logger or Logger.default_logger(
266
+ # logger_name=self.__class__.__name__,
267
+ # log_level=Logger.DEBUG if debug else Logger.INFO
268
+ # )
269
+ #
270
+ # # Default artifact init kwargs
271
+ # today = datetime.datetime.today() + datetime.timedelta(days=1)
272
+ # default_kwargs = {
273
+ # 'parquet_start_date': today.strftime('%Y-%m-%d'),
274
+ # 'parquet_end_date': today.strftime('%Y-%m-%d'),
275
+ # 'logger': self.logger,
276
+ # 'debug': self.debug,
277
+ # }
278
+ # self.artifact_class_kwargs = artifact_class_kwargs or default_kwargs.copy()
279
+ #
280
+ # # State
281
+ # self.completion_times: Dict[str, float] = {}
282
+ # self.failed: List[str] = []
283
+ # self.original_classes: List[Type] = []
284
+ #
285
+ # def get_artifact_classes(self, data_type: str) -> List[Type]:
286
+ # """
287
+ # Retrieve artifact classes by data type.
288
+ # """
289
+ # self.logger.info(f"Fetching artifact classes for '{data_type}'")
290
+ # if data_type not in self.wrapped_classes:
291
+ # raise ValueError(f"Unsupported data type: {data_type}")
292
+ # classes = self.wrapped_classes[data_type]
293
+ # self.logger.info(f"Found {len(classes)} artifact classes for '{data_type}'")
294
+ # return classes
295
+ #
296
+ # def estimate_priority(self, artifact_cls: Type) -> int:
297
+ # """
298
+ # Determine task priority for ordering. Lower values run first.
299
+ # """
300
+ # name = artifact_cls.__name__
301
+ # if self.priority_fn:
302
+ # try:
303
+ # pr = self.priority_fn(artifact_cls)
304
+ # self.logger.debug(f"priority_fn for {name}: {pr}")
305
+ # return pr
306
+ # except Exception as e:
307
+ # self.logger.warning(f"priority_fn error for {name}: {e}")
308
+ # try:
309
+ # fs = self.artifact_class_kwargs.get('fs')
310
+ # path = self.artifact_class_kwargs.get('parquet_storage_path')
311
+ # pr=1
312
+ # if hasattr(artifact_cls, 'get_size_estimate'):
313
+ # pr = artifact_cls.get_size_estimate(fs, path)
314
+ # self.logger.debug(f"Estimated priority for {name}: {pr}")
315
+ # return pr
316
+ # except Exception:
317
+ # return 1
318
+ #
319
+ # async def _bounded_update(self, artifact_cls: Type, sem: asyncio.Semaphore, **update_kwargs) -> None:
320
+ # """
321
+ # Wrap update_artifact in a semaphore slot to limit concurrency.
322
+ # """
323
+ # async with sem:
324
+ # name = artifact_cls.__name__
325
+ # start = asyncio.get_event_loop().time()
326
+ # self.logger.info(f"Starting update for {name}")
327
+ # try:
328
+ # for attempt in range(1, self.retry_attempts + 1):
329
+ # try:
330
+ # artifact = await asyncio.to_thread(
331
+ # artifact_cls, **self.artifact_class_kwargs
332
+ # )
333
+ # await asyncio.wait_for(
334
+ # asyncio.to_thread(
335
+ # artifact.update_parquet, **update_kwargs
336
+ # ),
337
+ # timeout=self.update_timeout_seconds
338
+ # )
339
+ # duration = asyncio.get_event_loop().time() - start
340
+ # self.completion_times[name] = duration
341
+ # self.logger.info(f"✅ {name} updated in {duration:.2f}s (attempt {attempt})")
342
+ # if self.metrics_client:
343
+ # self.metrics_client.increment('task_succeeded')
344
+ # return
345
+ # except asyncio.TimeoutError:
346
+ # self.logger.warning(f"Timeout on {name}, attempt {attempt}")
347
+ # except Exception as e:
348
+ # self.logger.error(f"Error on {name} attempt {attempt}: {e}")
349
+ #
350
+ # delay = min(self.backoff_base ** (attempt - 1), self.backoff_max)
351
+ # delay *= 1 + random.uniform(0, self.backoff_jitter)
352
+ # self.logger.info(f"Sleeping {delay:.1f}s before retrying {name}")
353
+ # await asyncio.sleep(delay)
354
+ #
355
+ # except asyncio.CancelledError:
356
+ # self.logger.warning(f"{name} update cancelled")
357
+ # raise
358
+ #
359
+ # # permanent failure
360
+ # self.logger.error(f"✖️ {name} permanently failed after {self.retry_attempts} attempts")
361
+ # if self.metrics_client:
362
+ # self.metrics_client.increment('task_failed')
363
+ # self.failed.append(name)
364
+ #
365
+ # async def update_data(self, data_type: str, **kwargs: Any) -> None:
366
+ # """
367
+ # Entry point to update all artifacts of a given type concurrently.
368
+ # """
369
+ # self.logger.info(f"Starting update_data for '{data_type}' with kwargs={kwargs}")
370
+ #
371
+ # # RESET STATE
372
+ # self.completion_times.clear()
373
+ # self.failed.clear()
374
+ # self.original_classes = self.get_artifact_classes(data_type)
375
+ #
376
+ # # NON-DESTRUCTIVE SORTING
377
+ # ordered = sorted(self.original_classes, key=self.estimate_priority)
378
+ #
379
+ # sem = asyncio.Semaphore(self.max_workers)
380
+ # tasks = [
381
+ # asyncio.create_task(self._bounded_update(cls, sem, **kwargs))
382
+ # for cls in ordered
383
+ # ]
384
+ #
385
+ # try:
386
+ # for coro in asyncio.as_completed(tasks):
387
+ # await coro
388
+ # except asyncio.CancelledError:
389
+ # self.logger.warning("update_data was cancelled—aborting remaining retries")
390
+ # for t in tasks:
391
+ # t.cancel()
392
+ # raise
393
+ # finally:
394
+ # total = len(self.original_classes)
395
+ # completed = len(self.completion_times)
396
+ # failed = len(self.failed)
397
+ # self.logger.info(f"All artifacts processed: total={total}, completed={completed}, failed={failed}")
398
+ #
399
+ # def get_update_status(self) -> Dict[str, Any]:
400
+ # """
401
+ # Returns summary status including completion times.
402
+ # """
403
+ # total = len(self.original_classes)
404
+ # completed = set(self.completion_times.keys())
405
+ # failed = set(self.failed)
406
+ # pending = {cls.__name__ for cls in self.original_classes} - completed - failed
407
+ #
408
+ # return {
409
+ # 'total': total,
410
+ # 'completed': list(completed),
411
+ # 'failed': list(failed),
412
+ # 'pending': list(pending),
413
+ # 'completion_times': self.completion_times,
414
+ # }
415
+ #
416
+ # @staticmethod
417
+ # def format_status_table(status: Dict[str, Any]) -> str:
418
+ # """
419
+ # Formats the status dict into a readable table.
420
+ # """
421
+ # lines = [
422
+ # f"Total: {status['total']}",
423
+ # f"Completed: {len(status['completed'])} {status['completed']}",
424
+ # f"Failed: {len(status['failed'])} {status['failed']}",
425
+ # f"Pending: {len(status['pending'])} {status['pending']}",
426
+ # "",
427
+ # "Per-artifact timings:"
428
+ # ]
429
+ # for name, dur in status['completion_times'].items():
430
+ # lines.append(f" {name}: {dur:.2f}s")
431
+ # return "\n".join(lines)