sibi-dst 2025.8.1__tar.gz → 2025.8.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87)
  1. {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/PKG-INFO +1 -1
  2. {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/pyproject.toml +1 -1
  3. {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/df_helper/__init__.py +3 -2
  4. sibi_dst-2025.8.3/sibi_dst/df_helper/_artifact_updater_async.py +238 -0
  5. sibi_dst-2025.8.3/sibi_dst/df_helper/_artifact_updater_threaded.py +195 -0
  6. {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/df_helper/_df_helper.py +26 -7
  7. {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/df_helper/_parquet_artifact.py +24 -4
  8. {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/df_helper/_parquet_reader.py +9 -10
  9. {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/df_helper/core/_filter_handler.py +116 -37
  10. {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/utils/__init__.py +2 -0
  11. sibi_dst-2025.8.3/sibi_dst/utils/base.py +252 -0
  12. sibi_dst-2025.8.3/sibi_dst/utils/business_days.py +248 -0
  13. {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/utils/data_wrapper.py +166 -106
  14. sibi_dst-2025.8.3/sibi_dst/utils/date_utils.py +778 -0
  15. sibi_dst-2025.8.3/sibi_dst/utils/file_age_checker.py +301 -0
  16. sibi_dst-2025.8.3/sibi_dst/utils/periods.py +42 -0
  17. {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/utils/update_planner.py +2 -2
  18. sibi_dst-2025.8.1/sibi_dst/df_helper/_artifact_updater_multi_wrapper.py +0 -315
  19. sibi_dst-2025.8.1/sibi_dst/utils/base.py +0 -323
  20. sibi_dst-2025.8.1/sibi_dst/utils/date_utils.py +0 -461
  21. {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/README.md +0 -0
  22. {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/__init__.py +0 -0
  23. {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/df_helper/backends/__init__.py +0 -0
  24. {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/df_helper/backends/http/__init__.py +0 -0
  25. {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/df_helper/backends/http/_http_config.py +0 -0
  26. {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/df_helper/backends/parquet/__init__.py +0 -0
  27. {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/df_helper/backends/parquet/_filter_handler.py +0 -0
  28. {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/df_helper/backends/parquet/_parquet_options.py +0 -0
  29. {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/df_helper/backends/sqlalchemy/__init__.py +0 -0
  30. {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +0 -0
  31. {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/df_helper/backends/sqlalchemy/_db_gatekeeper.py +0 -0
  32. {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +0 -0
  33. {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +0 -0
  34. {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/df_helper/backends/sqlalchemy/_model_registry.py +0 -0
  35. {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +0 -0
  36. {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/df_helper/core/__init__.py +0 -0
  37. {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/df_helper/core/_defaults.py +0 -0
  38. {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/df_helper/core/_params_config.py +0 -0
  39. {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/df_helper/core/_query_config.py +0 -0
  40. {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/df_helper/data_cleaner.py +0 -0
  41. {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/geopy_helper/__init__.py +0 -0
  42. {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/geopy_helper/geo_location_service.py +0 -0
  43. {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/geopy_helper/utils.py +0 -0
  44. {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/osmnx_helper/__init__.py +0 -0
  45. {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/osmnx_helper/base_osm_map.py +0 -0
  46. {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/osmnx_helper/basemaps/__init__.py +0 -0
  47. {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/osmnx_helper/basemaps/calendar_html.py +0 -0
  48. {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/osmnx_helper/basemaps/route_map_plotter.py +0 -0
  49. {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/osmnx_helper/basemaps/router_plotter.py +0 -0
  50. {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/osmnx_helper/route_path_builder.py +0 -0
  51. {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/osmnx_helper/utils.py +0 -0
  52. {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/tests/__init__.py +0 -0
  53. {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/tests/test_data_wrapper_class.py +0 -0
  54. {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/utils/clickhouse_writer.py +0 -0
  55. {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/utils/credentials.py +0 -0
  56. {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/utils/data_from_http_source.py +0 -0
  57. {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/utils/data_utils.py +0 -0
  58. {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/utils/df_utils.py +0 -0
  59. {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/utils/file_utils.py +0 -0
  60. {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/utils/filepath_generator.py +0 -0
  61. {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/utils/log_utils.py +0 -0
  62. {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/utils/manifest_manager.py +0 -0
  63. {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/utils/parquet_saver.py +0 -0
  64. {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/utils/phone_formatter.py +0 -0
  65. {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/utils/storage_config.py +0 -0
  66. {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/utils/storage_manager.py +0 -0
  67. {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/utils/webdav_client.py +0 -0
  68. {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/v2/__init__.py +0 -0
  69. {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/v2/df_helper/__init__.py +0 -0
  70. {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/v2/df_helper/_df_helper.py +0 -0
  71. {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/v2/df_helper/backends/__init__.py +0 -0
  72. {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/v2/df_helper/backends/sqlalchemy/__init__.py +0 -0
  73. {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/v2/df_helper/backends/sqlalchemy/_db_connection.py +0 -0
  74. {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/v2/df_helper/backends/sqlalchemy/_io_dask.py +0 -0
  75. {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/v2/df_helper/backends/sqlalchemy/_load_from_db.py +0 -0
  76. {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/v2/df_helper/backends/sqlalchemy/_model_builder.py +0 -0
  77. {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/v2/df_helper/backends/sqlmodel/__init__.py +0 -0
  78. {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/v2/df_helper/backends/sqlmodel/_db_connection.py +0 -0
  79. {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/v2/df_helper/backends/sqlmodel/_io_dask.py +0 -0
  80. {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/v2/df_helper/backends/sqlmodel/_load_from_db.py +0 -0
  81. {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/v2/df_helper/backends/sqlmodel/_model_builder.py +0 -0
  82. {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/v2/df_helper/core/__init__.py +0 -0
  83. {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/v2/df_helper/core/_filter_handler.py +0 -0
  84. {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/v2/df_helper/core/_params_config.py +0 -0
  85. {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/v2/df_helper/core/_query_config.py +0 -0
  86. {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/v2/utils/__init__.py +0 -0
  87. {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/v2/utils/log_utils.py +0 -0
--- sibi_dst-2025.8.1/PKG-INFO
+++ sibi_dst-2025.8.3/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sibi-dst
-Version: 2025.8.1
+Version: 2025.8.3
 Summary: Data Science Toolkit
 Author: Luis Valverde
 Author-email: lvalverdeb@gmail.com
--- sibi_dst-2025.8.1/pyproject.toml
+++ sibi_dst-2025.8.3/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "sibi-dst"
-version = "2025.8.1"
+version = "2025.8.3"
 description = "Data Science Toolkit"
 authors = ["Luis Valverde <lvalverdeb@gmail.com>"]
 readme = "README.md"
--- sibi_dst-2025.8.1/sibi_dst/df_helper/__init__.py
+++ sibi_dst-2025.8.3/sibi_dst/df_helper/__init__.py
@@ -3,8 +3,9 @@ from __future__ import annotations
 from ._df_helper import DfHelper
 from ._parquet_artifact import ParquetArtifact
 from ._parquet_reader import ParquetReader
-from ._artifact_updater_multi_wrapper import ArtifactUpdaterMultiWrapperThreaded, ArtifactUpdaterMultiWrapperAsync
-
+#from ._artifact_updater_multi_wrapper import ArtifactUpdaterMultiWrapperThreaded, ArtifactUpdaterMultiWrapperAsync
+from ._artifact_updater_async import ArtifactUpdaterMultiWrapperAsync
+from ._artifact_updater_threaded import ArtifactUpdaterMultiWrapperThreaded
 __all__ = [
     'DfHelper',
     'ParquetArtifact',
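The monolithic wrapper module is replaced by two single-purpose modules, but the public import surface is unchanged; both names still resolve from the package root. A quick smoke test (illustrative only):

    from sibi_dst.df_helper import (
        ArtifactUpdaterMultiWrapperAsync,
        ArtifactUpdaterMultiWrapperThreaded,
    )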
--- /dev/null
+++ sibi_dst-2025.8.3/sibi_dst/df_helper/_artifact_updater_async.py
@@ -0,0 +1,238 @@
+from __future__ import annotations
+
+import asyncio
+import datetime
+import random
+import time
+from contextlib import ExitStack
+from dataclasses import dataclass
+from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Type
+
+from sibi_dst.utils import ManagedResource
+
+
+@dataclass(slots=True)
+class _RetryCfg:
+    attempts: int = 3
+    backoff_base: float = 2.0
+    backoff_max: float = 60.0
+    jitter: float = 0.15
+
+
+_ORCHESTRATOR_KEYS = {
+    "retry_attempts",
+    "backoff_base",
+    "backoff_max",
+    "backoff_jitter",
+    "update_timeout_seconds",
+    "max_workers",
+    "priority_fn",
+    "artifact_class_kwargs",
+}
+
+
+def _default_artifact_kwargs(resource: ManagedResource) -> Dict[str, Any]:
+    return {
+        "logger": resource.logger,
+        "debug": resource.debug,
+        "fs": resource.fs,
+        "verbose": resource.verbose,
+    }
+
+
+class ArtifactUpdaterMultiWrapperAsync(ManagedResource):
+    """
+    Backward-compatible async orchestrator with shutdown-aware scheduling.
+    """
+
+    def __init__(
+        self,
+        wrapped_classes: Dict[str, Sequence[Type]],
+        *,
+        max_workers: int = 3,
+        retry_attempts: int = 3,
+        update_timeout_seconds: int = 600,
+        backoff_base: float = 2.0,
+        backoff_max: float = 60.0,
+        backoff_jitter: float = 0.15,
+        priority_fn: Optional[Callable[[Type], int]] = None,
+        artifact_class_kwargs: Optional[Dict[str, Any]] = None,
+        **kwargs: Any,
+    ) -> None:
+        super().__init__(**kwargs)
+        self.wrapped_classes = wrapped_classes
+        self.max_workers = int(max_workers)
+        self.update_timeout_seconds = int(update_timeout_seconds)
+        self.priority_fn = priority_fn
+
+        self._retry = _RetryCfg(
+            attempts=int(retry_attempts),
+            backoff_base=float(backoff_base),
+            backoff_max=float(backoff_max),
+            jitter=float(backoff_jitter),
+        )
+
+        self.artifact_class_kwargs = {
+            **_default_artifact_kwargs(self),
+            **(artifact_class_kwargs or {}),
+        }
+
+        self.completion_secs: Dict[str, float] = {}
+        self.failed: List[str] = []
+
+        # NEW: async stop gate — tripped on cleanup/cancel
+        self._stop = asyncio.Event()
+
+    # Trip stop gate on close paths
+    def _cleanup(self) -> None:
+        try:
+            loop = asyncio.get_running_loop()
+            loop.call_soon_threadsafe(self._stop.set)
+        except RuntimeError:
+            self._stop.set()
+
+    async def _acleanup(self) -> None:
+        self._stop.set()
+
+    # ---- internals -----------------------------------------------------------
+
+    def _classes_for(self, period: str) -> List[Type]:
+        try:
+            classes = list(self.wrapped_classes[period])
+        except KeyError:
+            raise ValueError(f"Unsupported period '{period}'.")
+        if not classes:
+            raise ValueError(f"No artifact classes configured for period '{period}'.")
+        if self.priority_fn:
+            try:
+                classes.sort(key=self.priority_fn)
+            except Exception as e:
+                self.logger.warning(f"priority_fn failed; using listed order: {e}")
+        return classes
+
+    @staticmethod
+    def _split_kwargs(raw: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]:
+        orch: Dict[str, Any] = {}
+        art: Dict[str, Any] = {}
+        for k, v in raw.items():
+            if k in _ORCHESTRATOR_KEYS:
+                orch[k] = v
+            else:
+                art[k] = v
+        return orch, art
+
+    async def _run_one(self, cls: Type, period: str, sem: asyncio.Semaphore, artifact_kwargs: Dict[str, Any]) -> None:
+        name = cls.__name__
+        if self._stop.is_set() or self.closed:
+            raise asyncio.CancelledError()
+
+        self.logger.info(f"Running {name} with period '{period}'", extra={"artifact": name, "period": period})
+        async with sem:
+            loop = asyncio.get_running_loop()
+            start = loop.time()
+            for attempt in range(1, self._retry.attempts + 1):
+                if self._stop.is_set() or self.closed:
+                    raise asyncio.CancelledError()
+                try:
+                    def _sync_block() -> None:
+                        with ExitStack() as stack:
+                            inst = cls(**self.artifact_class_kwargs)
+                            inst = stack.enter_context(inst)
+                            inst.update_parquet(period=period, **artifact_kwargs)
+
+                    await asyncio.wait_for(asyncio.to_thread(_sync_block), timeout=self.update_timeout_seconds)
+                    dt_secs = loop.time() - start
+                    self.completion_secs[name] = dt_secs
+                    self.logger.info(f"✅ {name} ({period}) in {dt_secs:.2f}s")
+                    return
+
+                except asyncio.TimeoutError:
+                    self.logger.warning(f"Timeout in {name} attempt {attempt}/{self._retry.attempts}")
+                except asyncio.CancelledError:
+                    raise
+                except Exception as e:
+                    self.logger.error(
+                        f"{name} attempt {attempt}/{self._retry.attempts} failed: {e}",
+                        exc_info=self.debug,
+                    )
+
+                if attempt < self._retry.attempts and not self._stop.is_set():
+                    delay = min(self._retry.backoff_base ** (attempt - 1), self._retry.backoff_max)
+                    delay *= 1 + random.uniform(0, self._retry.jitter)
+                    try:
+                        await asyncio.sleep(delay)
+                    except asyncio.CancelledError:
+                        raise
+
+            self.failed.append(name)
+            self.logger.error(f"✖️ {name} permanently failed")
+
+    # ---- public API ----------------------------------------------------------
+
+    async def update_data(self, period: str, **kwargs: Any) -> None:
+        """
+        Backward-compatible:
+        - Accepts orchestrator knobs in kwargs (we consume them).
+        - Forwards only artifact-friendly kwargs to update_parquet.
+        """
+        _, artifact_kwargs = self._split_kwargs(kwargs)
+
+        self.completion_secs.clear()
+        self.failed.clear()
+
+        classes = self._classes_for(period)
+        self.logger.info(
+            f"Starting update of {len(classes)} artifacts for period '{period}'",
+            extra={
+                "action_module_name": self.__class__.__name__,
+                "date_of_update": time.strftime('%Y-%m-%d'),
+                "start_time": time.strftime('%H:%M:%S'),
+                "period": period,
+            },
+        )
+
+        sem = asyncio.Semaphore(self.max_workers)
+        tasks = [asyncio.create_task(self._run_one(cls, period, sem, dict(artifact_kwargs))) for cls in classes]
+
+        try:
+            for t in asyncio.as_completed(tasks):
+                if self._stop.is_set():
+                    break
+                await t
+        except (asyncio.CancelledError, KeyboardInterrupt):
+            self._stop.set()
+            for t in tasks:
+                t.cancel()
+            raise
+        finally:
+            # Drain/cancel everything deterministically
+            for t in tasks:
+                if not t.done():
+                    t.cancel()
+            await asyncio.gather(*tasks, return_exceptions=True)
+
+        self.logger.info(
+            f"Update completed for period: {period}",
+            extra={
+                "action_module_name": self.__class__.__name__,
+                "date_of_update": datetime.date.today().strftime('%Y-%m-%d'),
+                "end_time": datetime.datetime.now().strftime('%H:%M:%S'),
+                "period": period,
+            },
+        )
+        self.logger.info(
+            f"Artifacts processed: total={len(classes)}, "
+            f"completed={len(self.completion_secs)}, failed={len(self.failed)}"
+        )
+
+    def get_update_status(self) -> Dict[str, Any]:
+        done = set(self.completion_secs)
+        fail = set(self.failed)
+        all_names = {c.__name__ for v in self.wrapped_classes.values() for c in v}
+        return {
+            "total": len(all_names),
+            "completed": sorted(done),
+            "failed": sorted(fail),
+            "pending": sorted(all_names - done - fail),
+            "completion_times": dict(self.completion_secs),
+        }
--- /dev/null
+++ sibi_dst-2025.8.3/sibi_dst/df_helper/_artifact_updater_threaded.py
@@ -0,0 +1,195 @@
+from __future__ import annotations
+
+import datetime
+import time
+import random
+import threading
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from contextlib import ExitStack
+from dataclasses import dataclass
+from typing import Any, Callable, Dict, List, Optional, Sequence, Type, Tuple
+
+from sibi_dst.utils import ManagedResource
+
+
+@dataclass(slots=True)
+class _RetryCfg:
+    attempts: int = 3
+    backoff_base: float = 2.0
+    backoff_max: float = 60.0
+    jitter: float = 0.15
+
+
+_ORCHESTRATOR_KEYS = {
+    "retry_attempts",
+    "backoff_base",
+    "backoff_max",
+    "backoff_jitter",
+    "update_timeout_seconds",  # accepted but unused in pure-threads version
+    "max_workers",
+    "priority_fn",
+    "artifact_class_kwargs",
+}
+
+
+def _default_artifact_kwargs(resource: ManagedResource) -> Dict[str, Any]:
+    return {
+        "logger": resource.logger,
+        "debug": resource.debug,
+        "fs": resource.fs,
+        "verbose": resource.verbose,
+    }
+
+
+class ArtifactUpdaterMultiWrapperThreaded(ManagedResource):
+    """
+    Backward-compatible threaded orchestrator with shutdown-aware scheduling.
+    """
+
+    def __init__(
+        self,
+        wrapped_classes: Dict[str, Sequence[Type]],
+        *,
+        max_workers: int = 4,
+        retry_attempts: int = 3,
+        backoff_base: float = 2.0,
+        backoff_max: float = 60.0,
+        backoff_jitter: float = 0.15,
+        priority_fn: Optional[Callable[[Type], int]] = None,
+        artifact_class_kwargs: Optional[Dict[str, Any]] = None,
+        **kwargs: Any,
+    ) -> None:
+        super().__init__(**kwargs)
+        self.wrapped_classes = wrapped_classes
+        self.max_workers = int(max_workers)
+        self.priority_fn = priority_fn
+        self._retry = _RetryCfg(
+            attempts=int(retry_attempts),
+            backoff_base=float(backoff_base),
+            backoff_max=float(backoff_max),
+            jitter=float(backoff_jitter),
+        )
+        self.artifact_class_kwargs = {
+            **_default_artifact_kwargs(self),
+            **(artifact_class_kwargs or {}),
+        }
+        self.completion_secs: Dict[str, float] = {}
+        self.failed: List[str] = []
+
+        # NEW: stop gate — tripped on cleanup / Ctrl-C to stop scheduling & retries
+        self._stop_event = threading.Event()
+
+    # Trip the stop gate when this wrapper is closed
+    def _cleanup(self) -> None:
+        self._stop_event.set()
+
+    def _classes_for(self, period: str) -> List[Type]:
+        try:
+            classes = list(self.wrapped_classes[period])
+        except KeyError:
+            raise ValueError(f"Unsupported period '{period}'.")
+        if not classes:
+            raise ValueError(f"No artifact classes configured for period '{period}'.")
+        if self.priority_fn:
+            try:
+                classes.sort(key=self.priority_fn)
+            except Exception as e:
+                self.logger.warning(f"priority_fn failed; using listed order: {e}")
+        return classes
+
+    @staticmethod
+    def _split_kwargs(raw: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]:
+        orch: Dict[str, Any] = {}
+        art: Dict[str, Any] = {}
+        for k, v in raw.items():
+            if k in _ORCHESTRATOR_KEYS:
+                orch[k] = v
+            else:
+                art[k] = v
+        return orch, art
+
+    def _run_one(self, cls: Type, period: str, artifact_kwargs: Dict[str, Any]) -> str:
+        name = cls.__name__
+        start = time.monotonic()
+        for attempt in range(1, self._retry.attempts + 1):
+            if self._stop_event.is_set() or self.closed:
+                raise RuntimeError("shutting_down")
+            try:
+                with ExitStack() as stack:
+                    inst = cls(**self.artifact_class_kwargs)
+                    inst = stack.enter_context(inst)
+                    inst.update_parquet(period=period, **artifact_kwargs)
+                self.completion_secs[name] = time.monotonic() - start
+                return name
+            except Exception as e:
+                if attempt < self._retry.attempts and not self._stop_event.is_set():
+                    # interruptible backoff sleep
+                    delay = min(self._retry.backoff_base ** (attempt - 1), self._retry.backoff_max)
+                    delay *= 1 + random.uniform(0, self._retry.jitter)
+                    end = time.monotonic() + delay
+                    while not self._stop_event.is_set() and time.monotonic() < end:
+                        time.sleep(min(0.1, end - time.monotonic()))
+                    continue
+                raise RuntimeError(f"{name} failed after {self._retry.attempts} attempts: {e}") from e
+
+    def update_data(self, period: str, **kwargs: Any) -> None:
+        # Split kwargs to preserve backward compatibility
+        _, artifact_kwargs = self._split_kwargs(kwargs)
+
+        self.completion_secs.clear()
+        self.failed.clear()
+
+        classes = self._classes_for(period)
+
+        executor = ThreadPoolExecutor(max_workers=self.max_workers, thread_name_prefix="artifact-updater")
+        try:
+            fut2name: Dict[Any, str] = {}
+            for cls in classes:
+                if self._stop_event.is_set() or self.closed:
+                    break
+                try:
+                    fut = executor.submit(self._run_one, cls, period, dict(artifact_kwargs))
+                except RuntimeError as e:
+                    if "cannot schedule new futures after shutdown" in str(e).lower():
+                        self.logger.warning("Executor shutting down; halting new submissions.")
+                        break
+                    raise
+                fut2name[fut] = cls.__name__
+
+            for fut in as_completed(fut2name):
+                name = fut2name[fut]
+                try:
+                    fut.result()
+                    self.logger.info(f"✅ {name} ({period}) in {self.completion_secs[name]:.2f}s")
+                except Exception as e:
+                    self.failed.append(name)
+                    self.logger.error(f"✖️ {name} permanently failed: {e}")
+        except KeyboardInterrupt:
+            self.logger.warning("KeyboardInterrupt — stopping scheduling and shutting down.")
+            self._stop_event.set()
+            raise
+        finally:
+            # Ensure queued-but-not-started tasks are canceled
+            executor.shutdown(wait=True, cancel_futures=True)
+
+        self.logger.info(
+            f"Artifacts processed: total={len(classes)}, "
+            f"completed={len(self.completion_secs)}, failed={len(self.failed)}",
+            extra={
+                "date_of_update": time.strftime('%Y-%m-%d'),
+                "start_time": time.strftime('%H:%M:%S'),
+                "period": period,
+            },
+        )

+    def get_update_status(self) -> Dict[str, Any]:
+        done = set(self.completion_secs)
+        fail = set(self.failed)
+        all_names = {c.__name__ for v in self.wrapped_classes.values() for c in v}
+        return {
+            "total": len(all_names),
+            "completed": sorted(done),
+            "failed": sorted(fail),
+            "pending": sorted(all_names - done - fail),
+            "completion_times": dict(self.completion_secs),
+        }
--- sibi_dst-2025.8.1/sibi_dst/df_helper/_df_helper.py
+++ sibi_dst-2025.8.3/sibi_dst/df_helper/_df_helper.py
@@ -55,23 +55,42 @@ class ParquetBackend(BaseBackend):
     def load(self, **options):
         try:
             df = self.helper.backend_parquet.load_files()
-            if len(df.head(1)) == 0:
-                return -1, dd.from_pandas(pd.DataFrame(), npartitions=1)
+            if self._is_empty(df):
+                return -1, self._empty_like(df)
 
             if options and df is not None:
                 df = FilterHandler("dask", logger=self.logger, debug=False).apply_filters(df, filters=options)
-                if len(df.head(1)) == 0:
+                nrows = self._row_count(df)
+                if nrows == 0:
                     self.logger.debug("No records after filters; returning empty DataFrame.")
-                    return -1, dd.from_pandas(pd.DataFrame(), npartitions=1)
+                    return 0, self._empty_like(df)
 
             df = df.persist()
-            self.total_records = len(df) or -1
+            self.total_records = self._row_count(df) or -1
             return self.total_records, df
+
         except Exception as e:
-            self.total_records = -1
+            self.total_records = -1  # Reset total_records on failure
             self.logger.error(f"Failed to load data from parquet: {e}", exc_info=self.debug)
             return -1, dd.from_pandas(pd.DataFrame(), npartitions=1)
 
+    def _is_empty(self, ddf) -> bool:
+        """True if no rows across all partitions."""
+        try:
+            # head with npartitions=-1 walks partitions until it gets n rows
+            return ddf.head(1, npartitions=-1).shape[0] == 0
+        except Exception:
+            return True
+
+    def _row_count(self, ddf) -> int:
+        """Reliable row count for Dask DataFrame."""
+        return int(ddf.map_partitions(len).sum().compute())
+
+    def _empty_like(self, ddf):
+        """Return an empty Dask DF with the SAME columns/dtypes."""
+        empty_pdf = ddf._meta.iloc[0:0]
+        return dd.from_pandas(empty_pdf, npartitions=1)
+
 
 class HttpBackend(BaseBackend):
     def load(self, **options):
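The new helpers replace `len(df.head(1))` probes with a schema-preserving empty frame and a partition-wise count. A minimal sketch of the difference using plain dask (column names are illustrative):

    import dask.dataframe as dd
    import pandas as pd

    ddf = dd.from_pandas(pd.DataFrame({"id": [1, 2], "amount": [10.5, 20.0]}), npartitions=2)
    filtered = ddf[ddf["amount"] > 999]  # no rows survive the filter

    # Old behaviour: the fallback empty frame loses the schema entirely
    print(list(dd.from_pandas(pd.DataFrame(), npartitions=1).columns))  # []

    # New behaviour: _empty_like keeps columns/dtypes via Dask's _meta
    empty = dd.from_pandas(filtered._meta.iloc[0:0], npartitions=1)
    print(empty.dtypes.to_dict())  # {'id': int64, 'amount': float64}

    # _row_count sums len() across all partitions instead of peeking at the first
    print(int(filtered.map_partitions(len).sum().compute()))  # 0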
@@ -250,7 +269,7 @@ class DfHelper(ManagedResource):
         if hasattr(df, "npartitions") and df.npartitions == 1 and not len(df.head(1)):
             self.logger.warning("Cannot write to ClickHouse; DataFrame is empty.")
             return
-        with ClickHouseWriter(debug=self.debug, logger=self.logger, verbose=self.verbose, **credentials) as writer:
+        with ClickHouseWriter(debug=self.debug, logger=self.logger, fs=self.fs, verbose=self.verbose, **credentials) as writer:
             writer.save_to_clickhouse(df)
             self.logger.debug("Save to ClickHouse completed.")
 
--- sibi_dst-2025.8.1/sibi_dst/df_helper/_parquet_artifact.py
+++ sibi_dst-2025.8.3/sibi_dst/df_helper/_parquet_artifact.py
@@ -242,13 +242,33 @@ class ParquetArtifact(ManagedResource):
         }
 
         def custom_config():
-            if "start_on" not in final_kwargs or "end_on" not in final_kwargs:
+            """
+            Prepare parameters for 'custom' period execution, ensuring `start_on` and `end_on`
+            are provided (with backward compatibility for `start_date`/`end_date` aliases).
+            """
+            # Backward compatibility: normalize aliases
+            alias_map = {
+                "start_on": ("start_date", "start"),
+                "end_on": ("end_date", "end"),
+            }
+            normalized_kwargs = dict(kwargs)  # shallow copy so we don't mutate original
+            for target, aliases in alias_map.items():
+                if target not in normalized_kwargs:
+                    for alias in aliases:
+                        if alias in normalized_kwargs:
+                            normalized_kwargs[target] = normalized_kwargs[alias]
+                            break
+
+            # Validation
+            missing = [k for k in ("start_on", "end_on") if k not in normalized_kwargs]
+            if missing:
                 raise ValueError(
-                    "For period 'custom', provide 'start_on' and 'end_on'."
+                    f"For period 'custom', the following required parameters are missing: {', '.join(missing)}"
                 )
+
             return {
-                "parquet_start_date": final_kwargs["start_on"],
-                "parquet_end_date": final_kwargs["end_on"],
+                "parquet_start_date": normalized_kwargs["start_on"],
+                "parquet_end_date": normalized_kwargs["end_on"],
             }
 
         if period == "itd":
--- sibi_dst-2025.8.1/sibi_dst/df_helper/_parquet_reader.py
+++ sibi_dst-2025.8.3/sibi_dst/df_helper/_parquet_reader.py
@@ -1,7 +1,8 @@
-from typing import Optional, ClassVar, Dict
+from typing import Optional, ClassVar, Dict, Any, Union
 
 import dask.dataframe as dd
 import fsspec
+import pandas as pd
 
 from sibi_dst.df_helper import DfHelper
 
@@ -23,7 +24,7 @@ class ParquetReader(DfHelper):
     :type config: dict
     :ivar df: Stores the loaded Dask DataFrame after the `load()` method is
         invoked. Initially set to None.
-    :type df: Optional[dd.DataFrame]
+    :type df: Optional[dd.DataFrame | pd.DataFrame]
     :ivar parquet_storage_path: The path to the Parquet storage directory.
     :type parquet_storage_path: str
     :ivar parquet_start_date: Start date for Parquet data selection. Must
@@ -32,19 +33,15 @@ class ParquetReader(DfHelper):
     :ivar parquet_end_date: End date for Parquet data selection. Must be
         set in the configuration.
     :type parquet_end_date: str
-    :ivar filesystem_type: The type of filesystem the Parquet files are
-        stored on (e.g., "file", "s3").
-    :type filesystem_type: str
-    :ivar filesystem_options: Any additional options required for the
-        specified filesystem type.
-    :type filesystem_options: dict
+
     :ivar fs: Instance of `fsspec` filesystem used to interact with the
         Parquet storage.
     :type fs: fsspec.AbstractFileSystem
     """
-    DEFAULT_CONFIG: ClassVar[Dict[str, int]] = {
+    DEFAULT_CONFIG: ClassVar[Dict[str, Any]] = {
         'backend': 'parquet'
     }
+    df: Optional[Union[dd.DataFrame, pd.DataFrame]] = None
 
     def __init__(self, **kwargs):
         self.config = {
@@ -63,11 +60,13 @@ class ParquetReader(DfHelper):
         self.parquet_end_date = self.config.setdefault('parquet_end_date', None)
         if self.parquet_end_date is None:
             raise ValueError('parquet_end_date must be set')
+        if self.fs is None:
+            raise ValueError('Parquet Reader mush be supplied a fs instance')
 
         if not self.directory_exists():
             raise ValueError(f"{self.parquet_storage_path} does not exist")
 
-    def load(self, **kwargs):
+    def load(self, **kwargs) -> Union[pd.DataFrame, dd.DataFrame]:
         self.df = super().load(**kwargs)
         return self.df
 
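Taken together, the reader changes tighten the construction contract: an `fs` instance is now mandatory and `load()` advertises a pandas-or-Dask return. A minimal usage sketch, assuming local-file storage and an illustrative path:

    import fsspec
    from sibi_dst.df_helper import ParquetReader

    reader = ParquetReader(
        parquet_storage_path="/data/parquet/events",  # illustrative
        parquet_start_date="2025-08-01",
        parquet_end_date="2025-08-03",
        fs=fsspec.filesystem("file"),  # required; omitting it now raises ValueError
    )
    df = reader.load()  # Union[pd.DataFrame, dd.DataFrame]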