sibi-dst 2025.8.1.tar.gz → 2025.8.3.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/PKG-INFO +1 -1
- {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/pyproject.toml +1 -1
- {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/df_helper/__init__.py +3 -2
- sibi_dst-2025.8.3/sibi_dst/df_helper/_artifact_updater_async.py +238 -0
- sibi_dst-2025.8.3/sibi_dst/df_helper/_artifact_updater_threaded.py +195 -0
- {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/df_helper/_df_helper.py +26 -7
- {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/df_helper/_parquet_artifact.py +24 -4
- {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/df_helper/_parquet_reader.py +9 -10
- {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/df_helper/core/_filter_handler.py +116 -37
- {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/utils/__init__.py +2 -0
- sibi_dst-2025.8.3/sibi_dst/utils/base.py +252 -0
- sibi_dst-2025.8.3/sibi_dst/utils/business_days.py +248 -0
- {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/utils/data_wrapper.py +166 -106
- sibi_dst-2025.8.3/sibi_dst/utils/date_utils.py +778 -0
- sibi_dst-2025.8.3/sibi_dst/utils/file_age_checker.py +301 -0
- sibi_dst-2025.8.3/sibi_dst/utils/periods.py +42 -0
- {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/utils/update_planner.py +2 -2
- sibi_dst-2025.8.1/sibi_dst/df_helper/_artifact_updater_multi_wrapper.py +0 -315
- sibi_dst-2025.8.1/sibi_dst/utils/base.py +0 -323
- sibi_dst-2025.8.1/sibi_dst/utils/date_utils.py +0 -461
- {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/README.md +0 -0
- {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/__init__.py +0 -0
- {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/df_helper/backends/__init__.py +0 -0
- {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/df_helper/backends/http/__init__.py +0 -0
- {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/df_helper/backends/http/_http_config.py +0 -0
- {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/df_helper/backends/parquet/__init__.py +0 -0
- {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/df_helper/backends/parquet/_filter_handler.py +0 -0
- {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/df_helper/backends/parquet/_parquet_options.py +0 -0
- {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/df_helper/backends/sqlalchemy/__init__.py +0 -0
- {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +0 -0
- {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/df_helper/backends/sqlalchemy/_db_gatekeeper.py +0 -0
- {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +0 -0
- {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +0 -0
- {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/df_helper/backends/sqlalchemy/_model_registry.py +0 -0
- {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +0 -0
- {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/df_helper/core/__init__.py +0 -0
- {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/df_helper/core/_defaults.py +0 -0
- {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/df_helper/core/_params_config.py +0 -0
- {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/df_helper/core/_query_config.py +0 -0
- {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/df_helper/data_cleaner.py +0 -0
- {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/geopy_helper/__init__.py +0 -0
- {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/geopy_helper/geo_location_service.py +0 -0
- {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/geopy_helper/utils.py +0 -0
- {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/osmnx_helper/__init__.py +0 -0
- {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/osmnx_helper/base_osm_map.py +0 -0
- {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/osmnx_helper/basemaps/__init__.py +0 -0
- {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/osmnx_helper/basemaps/calendar_html.py +0 -0
- {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/osmnx_helper/basemaps/route_map_plotter.py +0 -0
- {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/osmnx_helper/basemaps/router_plotter.py +0 -0
- {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/osmnx_helper/route_path_builder.py +0 -0
- {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/osmnx_helper/utils.py +0 -0
- {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/tests/__init__.py +0 -0
- {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/tests/test_data_wrapper_class.py +0 -0
- {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/utils/clickhouse_writer.py +0 -0
- {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/utils/credentials.py +0 -0
- {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/utils/data_from_http_source.py +0 -0
- {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/utils/data_utils.py +0 -0
- {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/utils/df_utils.py +0 -0
- {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/utils/file_utils.py +0 -0
- {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/utils/filepath_generator.py +0 -0
- {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/utils/log_utils.py +0 -0
- {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/utils/manifest_manager.py +0 -0
- {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/utils/parquet_saver.py +0 -0
- {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/utils/phone_formatter.py +0 -0
- {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/utils/storage_config.py +0 -0
- {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/utils/storage_manager.py +0 -0
- {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/utils/webdav_client.py +0 -0
- {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/v2/__init__.py +0 -0
- {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/v2/df_helper/__init__.py +0 -0
- {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/v2/df_helper/_df_helper.py +0 -0
- {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/v2/df_helper/backends/__init__.py +0 -0
- {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/v2/df_helper/backends/sqlalchemy/__init__.py +0 -0
- {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/v2/df_helper/backends/sqlalchemy/_db_connection.py +0 -0
- {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/v2/df_helper/backends/sqlalchemy/_io_dask.py +0 -0
- {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/v2/df_helper/backends/sqlalchemy/_load_from_db.py +0 -0
- {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/v2/df_helper/backends/sqlalchemy/_model_builder.py +0 -0
- {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/v2/df_helper/backends/sqlmodel/__init__.py +0 -0
- {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/v2/df_helper/backends/sqlmodel/_db_connection.py +0 -0
- {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/v2/df_helper/backends/sqlmodel/_io_dask.py +0 -0
- {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/v2/df_helper/backends/sqlmodel/_load_from_db.py +0 -0
- {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/v2/df_helper/backends/sqlmodel/_model_builder.py +0 -0
- {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/v2/df_helper/core/__init__.py +0 -0
- {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/v2/df_helper/core/_filter_handler.py +0 -0
- {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/v2/df_helper/core/_params_config.py +0 -0
- {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/v2/df_helper/core/_query_config.py +0 -0
- {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/v2/utils/__init__.py +0 -0
- {sibi_dst-2025.8.1 → sibi_dst-2025.8.3}/sibi_dst/v2/utils/log_utils.py +0 -0
sibi_dst/df_helper/__init__.py

@@ -3,8 +3,9 @@ from __future__ import annotations
 from ._df_helper import DfHelper
 from ._parquet_artifact import ParquetArtifact
 from ._parquet_reader import ParquetReader
-from ._artifact_updater_multi_wrapper import ArtifactUpdaterMultiWrapperThreaded, ArtifactUpdaterMultiWrapperAsync
-
+#from ._artifact_updater_multi_wrapper import ArtifactUpdaterMultiWrapperThreaded, ArtifactUpdaterMultiWrapperAsync
+from ._artifact_updater_async import ArtifactUpdaterMultiWrapperAsync
+from ._artifact_updater_threaded import ArtifactUpdaterMultiWrapperThreaded
 __all__ = [
     'DfHelper',
     'ParquetArtifact',
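Despite the module split, both orchestrators remain importable from `sibi_dst.df_helper`, so downstream code does not need to change. A minimal sketch of the import surface (imports only; constructor arguments are covered by the new modules below):

```python
# The public names are unchanged in 2025.8.3; only their defining modules moved
# from _artifact_updater_multi_wrapper to _artifact_updater_async / _artifact_updater_threaded.
from sibi_dst.df_helper import (
    ArtifactUpdaterMultiWrapperAsync,
    ArtifactUpdaterMultiWrapperThreaded,
    DfHelper,
    ParquetArtifact,
    ParquetReader,
)
```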
sibi_dst-2025.8.3/sibi_dst/df_helper/_artifact_updater_async.py (new file)

@@ -0,0 +1,238 @@
+from __future__ import annotations
+
+import asyncio
+import datetime
+import random
+import time
+from contextlib import ExitStack
+from dataclasses import dataclass
+from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Type
+
+from sibi_dst.utils import ManagedResource
+
+
+@dataclass(slots=True)
+class _RetryCfg:
+    attempts: int = 3
+    backoff_base: float = 2.0
+    backoff_max: float = 60.0
+    jitter: float = 0.15
+
+
+_ORCHESTRATOR_KEYS = {
+    "retry_attempts",
+    "backoff_base",
+    "backoff_max",
+    "backoff_jitter",
+    "update_timeout_seconds",
+    "max_workers",
+    "priority_fn",
+    "artifact_class_kwargs",
+}
+
+
+def _default_artifact_kwargs(resource: ManagedResource) -> Dict[str, Any]:
+    return {
+        "logger": resource.logger,
+        "debug": resource.debug,
+        "fs": resource.fs,
+        "verbose": resource.verbose,
+    }
+
+
+class ArtifactUpdaterMultiWrapperAsync(ManagedResource):
+    """
+    Backward-compatible async orchestrator with shutdown-aware scheduling.
+    """
+
+    def __init__(
+        self,
+        wrapped_classes: Dict[str, Sequence[Type]],
+        *,
+        max_workers: int = 3,
+        retry_attempts: int = 3,
+        update_timeout_seconds: int = 600,
+        backoff_base: float = 2.0,
+        backoff_max: float = 60.0,
+        backoff_jitter: float = 0.15,
+        priority_fn: Optional[Callable[[Type], int]] = None,
+        artifact_class_kwargs: Optional[Dict[str, Any]] = None,
+        **kwargs: Any,
+    ) -> None:
+        super().__init__(**kwargs)
+        self.wrapped_classes = wrapped_classes
+        self.max_workers = int(max_workers)
+        self.update_timeout_seconds = int(update_timeout_seconds)
+        self.priority_fn = priority_fn
+
+        self._retry = _RetryCfg(
+            attempts=int(retry_attempts),
+            backoff_base=float(backoff_base),
+            backoff_max=float(backoff_max),
+            jitter=float(backoff_jitter),
+        )
+
+        self.artifact_class_kwargs = {
+            **_default_artifact_kwargs(self),
+            **(artifact_class_kwargs or {}),
+        }
+
+        self.completion_secs: Dict[str, float] = {}
+        self.failed: List[str] = []
+
+        # NEW: async stop gate — tripped on cleanup/cancel
+        self._stop = asyncio.Event()
+
+    # Trip stop gate on close paths
+    def _cleanup(self) -> None:
+        try:
+            loop = asyncio.get_running_loop()
+            loop.call_soon_threadsafe(self._stop.set)
+        except RuntimeError:
+            self._stop.set()
+
+    async def _acleanup(self) -> None:
+        self._stop.set()
+
+    # ---- internals -----------------------------------------------------------
+
+    def _classes_for(self, period: str) -> List[Type]:
+        try:
+            classes = list(self.wrapped_classes[period])
+        except KeyError:
+            raise ValueError(f"Unsupported period '{period}'.")
+        if not classes:
+            raise ValueError(f"No artifact classes configured for period '{period}'.")
+        if self.priority_fn:
+            try:
+                classes.sort(key=self.priority_fn)
+            except Exception as e:
+                self.logger.warning(f"priority_fn failed; using listed order: {e}")
+        return classes
+
+    @staticmethod
+    def _split_kwargs(raw: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]:
+        orch: Dict[str, Any] = {}
+        art: Dict[str, Any] = {}
+        for k, v in raw.items():
+            if k in _ORCHESTRATOR_KEYS:
+                orch[k] = v
+            else:
+                art[k] = v
+        return orch, art
+
+    async def _run_one(self, cls: Type, period: str, sem: asyncio.Semaphore, artifact_kwargs: Dict[str, Any]) -> None:
+        name = cls.__name__
+        if self._stop.is_set() or self.closed:
+            raise asyncio.CancelledError()
+
+        self.logger.info(f"Running {name} with period '{period}'", extra={"artifact": name, "period": period})
+        async with sem:
+            loop = asyncio.get_running_loop()
+            start = loop.time()
+            for attempt in range(1, self._retry.attempts + 1):
+                if self._stop.is_set() or self.closed:
+                    raise asyncio.CancelledError()
+                try:
+                    def _sync_block() -> None:
+                        with ExitStack() as stack:
+                            inst = cls(**self.artifact_class_kwargs)
+                            inst = stack.enter_context(inst)
+                            inst.update_parquet(period=period, **artifact_kwargs)
+
+                    await asyncio.wait_for(asyncio.to_thread(_sync_block), timeout=self.update_timeout_seconds)
+                    dt_secs = loop.time() - start
+                    self.completion_secs[name] = dt_secs
+                    self.logger.info(f"✅ {name} ({period}) in {dt_secs:.2f}s")
+                    return
+
+                except asyncio.TimeoutError:
+                    self.logger.warning(f"Timeout in {name} attempt {attempt}/{self._retry.attempts}")
+                except asyncio.CancelledError:
+                    raise
+                except Exception as e:
+                    self.logger.error(
+                        f"{name} attempt {attempt}/{self._retry.attempts} failed: {e}",
+                        exc_info=self.debug,
+                    )
+
+                if attempt < self._retry.attempts and not self._stop.is_set():
+                    delay = min(self._retry.backoff_base ** (attempt - 1), self._retry.backoff_max)
+                    delay *= 1 + random.uniform(0, self._retry.jitter)
+                    try:
+                        await asyncio.sleep(delay)
+                    except asyncio.CancelledError:
+                        raise
+
+            self.failed.append(name)
+            self.logger.error(f"✖️ {name} permanently failed")
+
+    # ---- public API ----------------------------------------------------------
+
+    async def update_data(self, period: str, **kwargs: Any) -> None:
+        """
+        Backward-compatible:
+          - Accepts orchestrator knobs in kwargs (we consume them).
+          - Forwards only artifact-friendly kwargs to update_parquet.
+        """
+        _, artifact_kwargs = self._split_kwargs(kwargs)
+
+        self.completion_secs.clear()
+        self.failed.clear()
+
+        classes = self._classes_for(period)
+        self.logger.info(
+            f"Starting update of {len(classes)} artifacts for period '{period}'",
+            extra={
+                "action_module_name": self.__class__.__name__,
+                "date_of_update": time.strftime('%Y-%m-%d'),
+                "start_time": time.strftime('%H:%M:%S'),
+                "period": period,
+            },
+        )
+
+        sem = asyncio.Semaphore(self.max_workers)
+        tasks = [asyncio.create_task(self._run_one(cls, period, sem, dict(artifact_kwargs))) for cls in classes]
+
+        try:
+            for t in asyncio.as_completed(tasks):
+                if self._stop.is_set():
+                    break
+                await t
+        except (asyncio.CancelledError, KeyboardInterrupt):
+            self._stop.set()
+            for t in tasks:
+                t.cancel()
+            raise
+        finally:
+            # Drain/cancel everything deterministically
+            for t in tasks:
+                if not t.done():
+                    t.cancel()
+            await asyncio.gather(*tasks, return_exceptions=True)
+
+        self.logger.info(
+            f"Update completed for period: {period}",
+            extra={
+                "action_module_name": self.__class__.__name__,
+                "date_of_update": datetime.date.today().strftime('%Y-%m-%d'),
+                "end_time": datetime.datetime.now().strftime('%H:%M:%S'),
+                "period": period,
+            },
+        )
+        self.logger.info(
+            f"Artifacts processed: total={len(classes)}, "
+            f"completed={len(self.completion_secs)}, failed={len(self.failed)}"
+        )
+
+    def get_update_status(self) -> Dict[str, Any]:
+        done = set(self.completion_secs)
+        fail = set(self.failed)
+        all_names = {c.__name__ for v in self.wrapped_classes.values() for c in v}
+        return {
+            "total": len(all_names),
+            "completed": sorted(done),
+            "failed": sorted(fail),
+            "pending": sorted(all_names - done - fail),
+            "completion_times": dict(self.completion_secs),
+        }
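The new module keeps the old entry points (`update_data`, `get_update_status`) while pushing each artifact's blocking `update_parquet` call onto a worker thread guarded by a semaphore, a per-attempt timeout, and jittered exponential backoff. A minimal usage sketch under assumptions: `DailySalesArtifact` is a stand-in defined here, not part of the package, and `ManagedResource` is assumed to accept (or ignore) the `fs` keyword argument forwarded to it and to provide a default logger.

```python
import asyncio
import fsspec

from sibi_dst.df_helper import ArtifactUpdaterMultiWrapperAsync


class DailySalesArtifact:
    """Stand-in artifact: a context manager exposing update_parquet(period=..., **kwargs)."""

    def __init__(self, **kwargs):          # receives logger/debug/fs/verbose from the wrapper
        self.kwargs = kwargs

    def __enter__(self):
        return self

    def __exit__(self, *exc):
        return False

    def update_parquet(self, period: str, **kwargs) -> None:
        print(f"updating {period} with {kwargs}")


async def main() -> None:
    updater = ArtifactUpdaterMultiWrapperAsync(
        wrapped_classes={"ytd": [DailySalesArtifact]},
        max_workers=2,                 # bound on concurrent update_parquet calls
        retry_attempts=3,              # attempts per artifact before it is marked failed
        update_timeout_seconds=300,    # wall-clock cap per attempt
        fs=fsspec.filesystem("file"),  # assumption: accepted by ManagedResource, then passed to artifacts
    )
    await updater.update_data("ytd", overwrite=True)  # non-orchestrator kwargs reach update_parquet
    print(updater.get_update_status())


asyncio.run(main())
```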
sibi_dst-2025.8.3/sibi_dst/df_helper/_artifact_updater_threaded.py (new file)

@@ -0,0 +1,195 @@
+from __future__ import annotations
+
+import datetime
+import time
+import random
+import threading
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from contextlib import ExitStack
+from dataclasses import dataclass
+from typing import Any, Callable, Dict, List, Optional, Sequence, Type, Tuple
+
+from sibi_dst.utils import ManagedResource
+
+
+@dataclass(slots=True)
+class _RetryCfg:
+    attempts: int = 3
+    backoff_base: float = 2.0
+    backoff_max: float = 60.0
+    jitter: float = 0.15
+
+
+_ORCHESTRATOR_KEYS = {
+    "retry_attempts",
+    "backoff_base",
+    "backoff_max",
+    "backoff_jitter",
+    "update_timeout_seconds",  # accepted but unused in pure-threads version
+    "max_workers",
+    "priority_fn",
+    "artifact_class_kwargs",
+}
+
+
+def _default_artifact_kwargs(resource: ManagedResource) -> Dict[str, Any]:
+    return {
+        "logger": resource.logger,
+        "debug": resource.debug,
+        "fs": resource.fs,
+        "verbose": resource.verbose,
+    }
+
+
+class ArtifactUpdaterMultiWrapperThreaded(ManagedResource):
+    """
+    Backward-compatible threaded orchestrator with shutdown-aware scheduling.
+    """
+
+    def __init__(
+        self,
+        wrapped_classes: Dict[str, Sequence[Type]],
+        *,
+        max_workers: int = 4,
+        retry_attempts: int = 3,
+        backoff_base: float = 2.0,
+        backoff_max: float = 60.0,
+        backoff_jitter: float = 0.15,
+        priority_fn: Optional[Callable[[Type], int]] = None,
+        artifact_class_kwargs: Optional[Dict[str, Any]] = None,
+        **kwargs: Any,
+    ) -> None:
+        super().__init__(**kwargs)
+        self.wrapped_classes = wrapped_classes
+        self.max_workers = int(max_workers)
+        self.priority_fn = priority_fn
+        self._retry = _RetryCfg(
+            attempts=int(retry_attempts),
+            backoff_base=float(backoff_base),
+            backoff_max=float(backoff_max),
+            jitter=float(backoff_jitter),
+        )
+        self.artifact_class_kwargs = {
+            **_default_artifact_kwargs(self),
+            **(artifact_class_kwargs or {}),
+        }
+        self.completion_secs: Dict[str, float] = {}
+        self.failed: List[str] = []
+
+        # NEW: stop gate — tripped on cleanup / Ctrl-C to stop scheduling & retries
+        self._stop_event = threading.Event()
+
+    # Trip the stop gate when this wrapper is closed
+    def _cleanup(self) -> None:
+        self._stop_event.set()
+
+    def _classes_for(self, period: str) -> List[Type]:
+        try:
+            classes = list(self.wrapped_classes[period])
+        except KeyError:
+            raise ValueError(f"Unsupported period '{period}'.")
+        if not classes:
+            raise ValueError(f"No artifact classes configured for period '{period}'.")
+        if self.priority_fn:
+            try:
+                classes.sort(key=self.priority_fn)
+            except Exception as e:
+                self.logger.warning(f"priority_fn failed; using listed order: {e}")
+        return classes
+
+    @staticmethod
+    def _split_kwargs(raw: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]:
+        orch: Dict[str, Any] = {}
+        art: Dict[str, Any] = {}
+        for k, v in raw.items():
+            if k in _ORCHESTRATOR_KEYS:
+                orch[k] = v
+            else:
+                art[k] = v
+        return orch, art
+
+    def _run_one(self, cls: Type, period: str, artifact_kwargs: Dict[str, Any]) -> str:
+        name = cls.__name__
+        start = time.monotonic()
+        for attempt in range(1, self._retry.attempts + 1):
+            if self._stop_event.is_set() or self.closed:
+                raise RuntimeError("shutting_down")
+            try:
+                with ExitStack() as stack:
+                    inst = cls(**self.artifact_class_kwargs)
+                    inst = stack.enter_context(inst)
+                    inst.update_parquet(period=period, **artifact_kwargs)
+                self.completion_secs[name] = time.monotonic() - start
+                return name
+            except Exception as e:
+                if attempt < self._retry.attempts and not self._stop_event.is_set():
+                    # interruptible backoff sleep
+                    delay = min(self._retry.backoff_base ** (attempt - 1), self._retry.backoff_max)
+                    delay *= 1 + random.uniform(0, self._retry.jitter)
+                    end = time.monotonic() + delay
+                    while not self._stop_event.is_set() and time.monotonic() < end:
+                        time.sleep(min(0.1, end - time.monotonic()))
+                    continue
+                raise RuntimeError(f"{name} failed after {self._retry.attempts} attempts: {e}") from e
+
+    def update_data(self, period: str, **kwargs: Any) -> None:
+        # Split kwargs to preserve backward compatibility
+        _, artifact_kwargs = self._split_kwargs(kwargs)
+
+        self.completion_secs.clear()
+        self.failed.clear()
+
+        classes = self._classes_for(period)
+
+        executor = ThreadPoolExecutor(max_workers=self.max_workers, thread_name_prefix="artifact-updater")
+        try:
+            fut2name: Dict[Any, str] = {}
+            for cls in classes:
+                if self._stop_event.is_set() or self.closed:
+                    break
+                try:
+                    fut = executor.submit(self._run_one, cls, period, dict(artifact_kwargs))
+                except RuntimeError as e:
+                    if "cannot schedule new futures after shutdown" in str(e).lower():
+                        self.logger.warning("Executor shutting down; halting new submissions.")
+                        break
+                    raise
+                fut2name[fut] = cls.__name__
+
+            for fut in as_completed(fut2name):
+                name = fut2name[fut]
+                try:
+                    fut.result()
+                    self.logger.info(f"✅ {name} ({period}) in {self.completion_secs[name]:.2f}s")
+                except Exception as e:
+                    self.failed.append(name)
+                    self.logger.error(f"✖️ {name} permanently failed: {e}")
+        except KeyboardInterrupt:
+            self.logger.warning("KeyboardInterrupt — stopping scheduling and shutting down.")
+            self._stop_event.set()
+            raise
+        finally:
+            # Ensure queued-but-not-started tasks are canceled
+            executor.shutdown(wait=True, cancel_futures=True)
+
+        self.logger.info(
+            f"Artifacts processed: total={len(classes)}, "
+            f"completed={len(self.completion_secs)}, failed={len(self.failed)}",
+            extra={
+                "date_of_update": time.strftime('%Y-%m-%d'),
+                "start_time": time.strftime('%H:%M:%S'),
+                "period": period,
+            },
+        )
+
+    def get_update_status(self) -> Dict[str, Any]:
+        done = set(self.completion_secs)
+        fail = set(self.failed)
+        all_names = {c.__name__ for v in self.wrapped_classes.values() for c in v}
+        return {
+            "total": len(all_names),
+            "completed": sorted(done),
+            "failed": sorted(fail),
+            "pending": sorted(all_names - done - fail),
+            "completion_times": dict(self.completion_secs),
+        }
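The threaded variant exposes the same `update_data`/`get_update_status` surface but runs artifacts on a `ThreadPoolExecutor`, so it can be driven from plain synchronous code. A short sketch, reusing the stand-in `DailySalesArtifact` from the async example above and making the same assumption about `ManagedResource` accepting the forwarded keyword arguments:

```python
import fsspec

from sibi_dst.df_helper import ArtifactUpdaterMultiWrapperThreaded

updater = ArtifactUpdaterMultiWrapperThreaded(
    wrapped_classes={"itd": [DailySalesArtifact]},   # stand-in artifact from the sketch above
    max_workers=4,
    retry_attempts=2,
    priority_fn=lambda cls: len(cls.__name__),       # optional: lower value = submitted first
    fs=fsspec.filesystem("file"),                    # assumed ManagedResource kwarg
)

updater.update_data("itd")            # blocks until all artifacts finish or permanently fail
status = updater.get_update_status()
print(status["completed"], status["failed"], status["pending"])
```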
sibi_dst/df_helper/_df_helper.py

@@ -55,23 +55,42 @@ class ParquetBackend(BaseBackend):
     def load(self, **options):
         try:
             df = self.helper.backend_parquet.load_files()
-            if
-                return -1,
+            if self._is_empty(df):
+                return -1, self._empty_like(df)
 
             if options and df is not None:
                 df = FilterHandler("dask", logger=self.logger, debug=False).apply_filters(df, filters=options)
-
+            nrows = self._row_count(df)
+            if nrows == 0:
                 self.logger.debug("No records after filters; returning empty DataFrame.")
-                return
+                return 0, self._empty_like(df)
 
             df = df.persist()
-            self.total_records =
+            self.total_records = self._row_count(df) or -1
             return self.total_records, df
+
         except Exception as e:
-            self.total_records = -1
+            self.total_records = -1  # Reset total_records on failure
            self.logger.error(f"Failed to load data from parquet: {e}", exc_info=self.debug)
             return -1, dd.from_pandas(pd.DataFrame(), npartitions=1)
 
+    def _is_empty(self, ddf) -> bool:
+        """True if no rows across all partitions."""
+        try:
+            # head with npartitions=-1 walks partitions until it gets n rows
+            return ddf.head(1, npartitions=-1).shape[0] == 0
+        except Exception:
+            return True
+
+    def _row_count(self, ddf) -> int:
+        """Reliable row count for Dask DataFrame."""
+        return int(ddf.map_partitions(len).sum().compute())
+
+    def _empty_like(self, ddf):
+        """Return an empty Dask DF with the SAME columns/dtypes."""
+        empty_pdf = ddf._meta.iloc[0:0]
+        return dd.from_pandas(empty_pdf, npartitions=1)
+
 
 class HttpBackend(BaseBackend):
     def load(self, **options):
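The three new helpers make the empty-result paths explicit: `_is_empty` probes for at least one row without computing the whole graph, `_row_count` sums per-partition lengths, and `_empty_like` rebuilds a zero-row frame that keeps the original schema. A small self-contained sketch of the same pattern outside the class (only `dask.dataframe` and `pandas` are assumed):

```python
import dask.dataframe as dd
import pandas as pd

pdf = pd.DataFrame({"id": [1, 2, 3], "amount": [10.0, 20.0, 30.0]})
ddf = dd.from_pandas(pdf, npartitions=2)

# Probe for emptiness: head(1, npartitions=-1) walks partitions until one row is found.
is_empty = ddf.head(1, npartitions=-1).shape[0] == 0

# Count rows without relying on the index: sum of per-partition lengths.
nrows = int(ddf.map_partitions(len).sum().compute())

# Build an empty frame with the same columns/dtypes from the _meta schema.
empty = dd.from_pandas(ddf._meta.iloc[0:0], npartitions=1)

print(is_empty, nrows, list(empty.columns), len(empty))  # False 3 ['id', 'amount'] 0
```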
sibi_dst/df_helper/_df_helper.py

@@ -250,7 +269,7 @@ class DfHelper(ManagedResource):
         if hasattr(df, "npartitions") and df.npartitions == 1 and not len(df.head(1)):
             self.logger.warning("Cannot write to ClickHouse; DataFrame is empty.")
             return
-        with ClickHouseWriter(debug=self.debug, logger=self.logger, verbose=self.verbose, **credentials) as writer:
+        with ClickHouseWriter(debug=self.debug, logger=self.logger, fs=self.fs, verbose=self.verbose, **credentials) as writer:
             writer.save_to_clickhouse(df)
             self.logger.debug("Save to ClickHouse completed.")
 
sibi_dst/df_helper/_parquet_artifact.py

@@ -242,13 +242,33 @@ class ParquetArtifact(ManagedResource):
         }
 
         def custom_config():
-
+            """
+            Prepare parameters for 'custom' period execution, ensuring `start_on` and `end_on`
+            are provided (with backward compatibility for `start_date`/`end_date` aliases).
+            """
+            # Backward compatibility: normalize aliases
+            alias_map = {
+                "start_on": ("start_date", "start"),
+                "end_on": ("end_date", "end"),
+            }
+            normalized_kwargs = dict(kwargs)  # shallow copy so we don't mutate original
+            for target, aliases in alias_map.items():
+                if target not in normalized_kwargs:
+                    for alias in aliases:
+                        if alias in normalized_kwargs:
+                            normalized_kwargs[target] = normalized_kwargs[alias]
+                            break
+
+            # Validation
+            missing = [k for k in ("start_on", "end_on") if k not in normalized_kwargs]
+            if missing:
                 raise ValueError(
-                    "For period 'custom',
+                    f"For period 'custom', the following required parameters are missing: {', '.join(missing)}"
                 )
+
             return {
-                "parquet_start_date":
-                "parquet_end_date":
+                "parquet_start_date": normalized_kwargs["start_on"],
+                "parquet_end_date": normalized_kwargs["end_on"],
             }
 
         if period == "itd":
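The normalization step lets callers keep passing the older `start_date`/`end_date` (or `start`/`end`) names while the artifact standardizes on `start_on`/`end_on` internally. A standalone sketch of the same normalize-then-validate pattern (the function name and sample values are illustrative, not part of the package):

```python
from typing import Any, Dict

def normalize_custom_period(**kwargs: Any) -> Dict[str, str]:
    """Map legacy aliases onto start_on/end_on and fail fast if either is missing."""
    alias_map = {"start_on": ("start_date", "start"), "end_on": ("end_date", "end")}
    normalized = dict(kwargs)  # shallow copy; the caller's kwargs stay untouched
    for target, aliases in alias_map.items():
        if target not in normalized:
            for alias in aliases:
                if alias in normalized:
                    normalized[target] = normalized[alias]
                    break
    missing = [k for k in ("start_on", "end_on") if k not in normalized]
    if missing:
        raise ValueError(f"For period 'custom', missing: {', '.join(missing)}")
    return {"parquet_start_date": normalized["start_on"], "parquet_end_date": normalized["end_on"]}

# Legacy call style still works after normalization:
print(normalize_custom_period(start_date="2025-01-01", end="2025-01-31"))
# {'parquet_start_date': '2025-01-01', 'parquet_end_date': '2025-01-31'}
```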
sibi_dst/df_helper/_parquet_reader.py

@@ -1,7 +1,8 @@
-from typing import Optional, ClassVar, Dict
+from typing import Optional, ClassVar, Dict, Any, Union
 
 import dask.dataframe as dd
 import fsspec
+import pandas as pd
 
 from sibi_dst.df_helper import DfHelper
 
@@ -23,7 +24,7 @@ class ParquetReader(DfHelper):
     :type config: dict
     :ivar df: Stores the loaded Dask DataFrame after the `load()` method is
         invoked. Initially set to None.
-    :type df: Optional[dd.DataFrame]
+    :type df: Optional[dd.DataFrame | pd.DataFrame]
     :ivar parquet_storage_path: The path to the Parquet storage directory.
     :type parquet_storage_path: str
     :ivar parquet_start_date: Start date for Parquet data selection. Must
@@ -32,19 +33,15 @@
     :ivar parquet_end_date: End date for Parquet data selection. Must be
         set in the configuration.
     :type parquet_end_date: str
-
-        stored on (e.g., "file", "s3").
-    :type filesystem_type: str
-    :ivar filesystem_options: Any additional options required for the
-        specified filesystem type.
-    :type filesystem_options: dict
+
     :ivar fs: Instance of `fsspec` filesystem used to interact with the
         Parquet storage.
     :type fs: fsspec.AbstractFileSystem
     """
-    DEFAULT_CONFIG: ClassVar[Dict[str,
+    DEFAULT_CONFIG: ClassVar[Dict[str, Any]] = {
         'backend': 'parquet'
     }
+    df: Optional[Union[dd.DataFrame, pd.DataFrame]] = None
 
     def __init__(self, **kwargs):
         self.config = {
@@ -63,11 +60,13 @@ class ParquetReader(DfHelper):
         self.parquet_end_date = self.config.setdefault('parquet_end_date', None)
         if self.parquet_end_date is None:
             raise ValueError('parquet_end_date must be set')
+        if self.fs is None:
+            raise ValueError('Parquet Reader mush be supplied a fs instance')
 
         if not self.directory_exists():
             raise ValueError(f"{self.parquet_storage_path} does not exist")
 
-    def load(self, **kwargs):
+    def load(self, **kwargs) -> Union[pd.DataFrame, dd.DataFrame]:
         self.df = super().load(**kwargs)
         return self.df
 
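With these changes a `ParquetReader` must be handed an fsspec filesystem explicitly and `load()` is typed to return either a pandas or a Dask DataFrame. A minimal usage sketch under assumptions (the storage path and date window are placeholders, and any further configuration accepted by `DfHelper` is omitted):

```python
import fsspec

from sibi_dst.df_helper import ParquetReader

reader = ParquetReader(
    fs=fsspec.filesystem("file"),            # now mandatory; a missing fs raises ValueError
    parquet_storage_path="/data/warehouse",   # placeholder path; must already exist
    parquet_start_date="2025-01-01",
    parquet_end_date="2025-01-31",
)

df = reader.load()   # Union[pd.DataFrame, dd.DataFrame] as of 2025.8.3
print(type(df))
```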