sibi-dst 0.3.58__tar.gz → 0.3.59__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sibi_dst-0.3.58 → sibi_dst-0.3.59}/PKG-INFO +1 -1
- {sibi_dst-0.3.58 → sibi_dst-0.3.59}/pyproject.toml +1 -1
- sibi_dst-0.3.59/sibi_dst/df_helper/_artifact_updater_multi_wrapper.py +239 -0
- {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/df_helper/_df_helper.py +7 -3
- {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/df_helper/_parquet_artifact.py +143 -52
- {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +3 -3
- {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/utils/__init__.py +3 -2
- sibi_dst-0.3.59/sibi_dst/utils/data_wrapper.py +258 -0
- {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/utils/date_utils.py +8 -8
- {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/utils/log_utils.py +1 -1
- sibi_dst-0.3.59/sibi_dst/utils/manifest_manager.py +154 -0
- {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/utils/update_planner.py +96 -85
- sibi_dst-0.3.58/sibi_dst/df_helper/_artifact_updater_multi_wrapper.py +0 -261
- sibi_dst-0.3.58/sibi_dst/utils/data_wrapper.py +0 -249
- {sibi_dst-0.3.58 → sibi_dst-0.3.59}/README.md +0 -0
- {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/__init__.py +0 -0
- {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/df_helper/__init__.py +0 -0
- {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/df_helper/_parquet_reader.py +0 -0
- {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/df_helper/backends/__init__.py +0 -0
- {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/df_helper/backends/django/__init__.py +0 -0
- {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/df_helper/backends/django/_db_connection.py +0 -0
- {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/df_helper/backends/django/_io_dask.py +0 -0
- {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/df_helper/backends/django/_load_from_db.py +0 -0
- {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/df_helper/backends/django/_sql_model_builder.py +0 -0
- {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/df_helper/backends/http/__init__.py +0 -0
- {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/df_helper/backends/http/_http_config.py +0 -0
- {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/df_helper/backends/parquet/__init__.py +0 -0
- {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/df_helper/backends/parquet/_filter_handler.py +0 -0
- {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/df_helper/backends/parquet/_parquet_options.py +0 -0
- {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/df_helper/backends/sqlalchemy/__init__.py +0 -0
- {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/df_helper/backends/sqlalchemy/_filter_handler.py +0 -0
- {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +0 -0
- {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +0 -0
- {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +0 -0
- {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/df_helper/core/__init__.py +0 -0
- {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/df_helper/core/_defaults.py +0 -0
- {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/df_helper/core/_filter_handler.py +0 -0
- {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/df_helper/core/_params_config.py +0 -0
- {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/df_helper/core/_query_config.py +0 -0
- {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/df_helper/data_cleaner.py +0 -0
- {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/geopy_helper/__init__.py +0 -0
- {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/geopy_helper/geo_location_service.py +0 -0
- {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/geopy_helper/utils.py +0 -0
- {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/osmnx_helper/__init__.py +0 -0
- {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/osmnx_helper/base_osm_map.py +0 -0
- {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/osmnx_helper/basemaps/__init__.py +0 -0
- {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/osmnx_helper/basemaps/calendar_html.py +0 -0
- {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/osmnx_helper/basemaps/router_plotter.py +0 -0
- {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/osmnx_helper/utils.py +0 -0
- {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/tests/__init__.py +0 -0
- {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/tests/test_data_wrapper_class.py +0 -0
- {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/utils/airflow_manager.py +0 -0
- {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/utils/clickhouse_writer.py +0 -0
- {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/utils/credentials.py +0 -0
- {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/utils/data_from_http_source.py +0 -0
- {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/utils/data_utils.py +0 -0
- {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/utils/df_utils.py +0 -0
- {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/utils/file_utils.py +0 -0
- {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/utils/filepath_generator.py +0 -0
- {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/utils/parquet_saver.py +0 -0
- {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/utils/phone_formatter.py +0 -0
- {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/utils/storage_config.py +0 -0
- {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/utils/storage_manager.py +0 -0
- {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/utils/webdav_client.py +0 -0
- {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/v2/__init__.py +0 -0
- {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/v2/df_helper/__init__.py +0 -0
- {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/v2/df_helper/_df_helper.py +0 -0
- {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/v2/df_helper/backends/__init__.py +0 -0
- {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/v2/df_helper/backends/sqlalchemy/__init__.py +0 -0
- {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/v2/df_helper/backends/sqlalchemy/_db_connection.py +0 -0
- {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/v2/df_helper/backends/sqlalchemy/_io_dask.py +0 -0
- {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/v2/df_helper/backends/sqlalchemy/_load_from_db.py +0 -0
- {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/v2/df_helper/backends/sqlalchemy/_model_builder.py +0 -0
- {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/v2/df_helper/backends/sqlmodel/__init__.py +0 -0
- {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/v2/df_helper/backends/sqlmodel/_db_connection.py +0 -0
- {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/v2/df_helper/backends/sqlmodel/_io_dask.py +0 -0
- {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/v2/df_helper/backends/sqlmodel/_load_from_db.py +0 -0
- {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/v2/df_helper/backends/sqlmodel/_model_builder.py +0 -0
- {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/v2/df_helper/core/__init__.py +0 -0
- {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/v2/df_helper/core/_filter_handler.py +0 -0
- {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/v2/df_helper/core/_params_config.py +0 -0
- {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/v2/df_helper/core/_query_config.py +0 -0
- {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/v2/utils/__init__.py +0 -0
- {sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/v2/utils/log_utils.py +0 -0
sibi_dst-0.3.59/sibi_dst/df_helper/_artifact_updater_multi_wrapper.py
ADDED
@@ -0,0 +1,239 @@
+import asyncio
+import logging
+import datetime
+import random
+from typing import Any, Callable, Dict, List, Optional, Type
+
+from sibi_dst.utils import Logger
+
+
+class ArtifactUpdaterMultiWrapper:
+    """
+    Simplified wrapper that updates artifacts concurrently using an asyncio.Semaphore.
+
+    Features:
+      - Caps concurrency at max_workers via semaphore
+      - Optionally prioritises tasks via a priority function or static method on artifact classes
+      - Tracks per-artifact completion times
+      - Configurable retry/backoff strategy
+      - Optional metrics integration
+      - Thread-safe within a single asyncio loop
+
+    Usage:
+        wrapper = ArtifactUpdaterMultiWrapper(
+            wrapped_classes={
+                'mydata': [DataArtifactA, DataArtifactB],
+            },
+            max_workers=4,
+            retry_attempts=3,
+            update_timeout_seconds=600,
+            backoff_base=2,
+            backoff_max=60,
+            backoff_jitter=0.1,
+            priority_fn=None,  # or custom
+            metrics_client=None,
+            debug=True,
+            logger=None,
+            artifact_class_kwargs={
+                'fs': my_fs,
+                'parquet_storage_path': 's3://bucket/data',
+                'logger': my_logger,
+                'debug': True,
+            }
+        )
+        await wrapper.update_data('mydata', period='ytd', overwrite=True)
+    """
+    def __init__(
+        self,
+        wrapped_classes: Dict[str, List[Type]],
+        *,
+        max_workers: int = 3,
+        retry_attempts: int = 3,
+        update_timeout_seconds: int = 600,
+        backoff_base: int = 2,
+        backoff_max: Optional[int] = 60,
+        backoff_jitter: float = 0.1,
+        priority_fn: Optional[Callable[[Type], int]] = None,
+        metrics_client: Any = None,
+        debug: bool = False,
+        logger: Optional[logging.Logger] = None,
+        artifact_class_kwargs: Optional[Dict[str, Any]] = None,
+    ) -> None:
+        self.wrapped_classes = wrapped_classes
+        self.max_workers = max_workers
+        self.retry_attempts = retry_attempts
+        self.update_timeout_seconds = update_timeout_seconds
+        self.backoff_base = backoff_base
+        self.backoff_max = backoff_max
+        self.backoff_jitter = backoff_jitter
+        self.priority_fn = priority_fn
+        self.metrics_client = metrics_client
+
+        self.debug = debug
+        self.logger = logger or Logger.default_logger(
+            logger_name=self.__class__.__name__,
+            log_level=Logger.DEBUG if debug else Logger.INFO
+        )
+
+        # Default artifact init kwargs
+        today = datetime.datetime.today() + datetime.timedelta(days=1)
+        default_kwargs = {
+            'parquet_start_date': today.strftime('%Y-%m-%d'),
+            'parquet_end_date': today.strftime('%Y-%m-%d'),
+            'logger': self.logger,
+            'debug': self.debug,
+        }
+        self.artifact_class_kwargs = artifact_class_kwargs or default_kwargs.copy()
+
+        # State
+        self.completion_times: Dict[str, float] = {}
+        self.failed: List[str] = []
+        self.original_classes: List[Type] = []
+
+    def get_artifact_classes(self, data_type: str) -> List[Type]:
+        """
+        Retrieve artifact classes by data type.
+        """
+        self.logger.info(f"Fetching artifact classes for '{data_type}'")
+        if data_type not in self.wrapped_classes:
+            raise ValueError(f"Unsupported data type: {data_type}")
+        classes = self.wrapped_classes[data_type]
+        self.logger.info(f"Found {len(classes)} artifact classes for '{data_type}'")
+        return classes
+
+    def estimate_priority(self, artifact_cls: Type) -> int:
+        """
+        Determine task priority for ordering. Lower values run first.
+        """
+        name = artifact_cls.__name__
+        if self.priority_fn:
+            try:
+                pr = self.priority_fn(artifact_cls)
+                self.logger.debug(f"priority_fn for {name}: {pr}")
+                return pr
+            except Exception as e:
+                self.logger.warning(f"priority_fn error for {name}: {e}")
+        try:
+            fs = self.artifact_class_kwargs.get('fs')
+            path = self.artifact_class_kwargs.get('parquet_storage_path')
+            pr = 1
+            if hasattr(artifact_cls, 'get_size_estimate'):
+                pr = artifact_cls.get_size_estimate(fs, path)
+            self.logger.debug(f"Estimated priority for {name}: {pr}")
+            return pr
+        except Exception:
+            return 1
+
+    async def _bounded_update(self, artifact_cls: Type, sem: asyncio.Semaphore, **update_kwargs) -> None:
+        """
+        Wrap update_artifact in a semaphore slot to limit concurrency.
+        """
+        async with sem:
+            name = artifact_cls.__name__
+            start = asyncio.get_event_loop().time()
+            self.logger.info(f"Starting update for {name}")
+            try:
+                for attempt in range(1, self.retry_attempts + 1):
+                    try:
+                        artifact = await asyncio.to_thread(
+                            artifact_cls, **self.artifact_class_kwargs
+                        )
+                        await asyncio.wait_for(
+                            asyncio.to_thread(
+                                artifact.update_parquet, **update_kwargs
+                            ),
+                            timeout=self.update_timeout_seconds
+                        )
+                        duration = asyncio.get_event_loop().time() - start
+                        self.completion_times[name] = duration
+                        self.logger.info(f"✅ {name} updated in {duration:.2f}s (attempt {attempt})")
+                        if self.metrics_client:
+                            self.metrics_client.increment('task_succeeded')
+                        return
+                    except asyncio.TimeoutError:
+                        self.logger.warning(f"Timeout on {name}, attempt {attempt}")
+                    except Exception as e:
+                        self.logger.error(f"Error on {name} attempt {attempt}: {e}")
+
+                    delay = min(self.backoff_base ** (attempt - 1), self.backoff_max)
+                    delay *= 1 + random.uniform(0, self.backoff_jitter)
+                    self.logger.info(f"Sleeping {delay:.1f}s before retrying {name}")
+                    await asyncio.sleep(delay)
+
+            except asyncio.CancelledError:
+                self.logger.warning(f"{name} update cancelled")
+                raise
+
+            # permanent failure
+            self.logger.error(f"✖️ {name} permanently failed after {self.retry_attempts} attempts")
+            if self.metrics_client:
+                self.metrics_client.increment('task_failed')
+            self.failed.append(name)
+
+    async def update_data(self, data_type: str, **kwargs: Any) -> None:
+        """
+        Entry point to update all artifacts of a given type concurrently.
+        """
+        self.logger.info(f"Starting update_data for '{data_type}' with kwargs={kwargs}")
+
+        # RESET STATE
+        self.completion_times.clear()
+        self.failed.clear()
+        self.original_classes = self.get_artifact_classes(data_type)
+
+        # NON-DESTRUCTIVE SORTING
+        ordered = sorted(self.original_classes, key=self.estimate_priority)
+
+        sem = asyncio.Semaphore(self.max_workers)
+        tasks = [
+            asyncio.create_task(self._bounded_update(cls, sem, **kwargs))
+            for cls in ordered
+        ]
+
+        try:
+            for coro in asyncio.as_completed(tasks):
+                await coro
+        except asyncio.CancelledError:
+            self.logger.warning("update_data was cancelled—aborting remaining retries")
+            for t in tasks:
+                t.cancel()
+            raise
+        finally:
+            total = len(self.original_classes)
+            completed = len(self.completion_times)
+            failed = len(self.failed)
+            self.logger.info(f"All artifacts processed: total={total}, completed={completed}, failed={failed}")
+
+    def get_update_status(self) -> Dict[str, Any]:
+        """
+        Returns summary status including completion times.
+        """
+        total = len(self.original_classes)
+        completed = set(self.completion_times.keys())
+        failed = set(self.failed)
+        pending = {cls.__name__ for cls in self.original_classes} - completed - failed
+
+        return {
+            'total': total,
+            'completed': list(completed),
+            'failed': list(failed),
+            'pending': list(pending),
+            'completion_times': self.completion_times,
+        }
+
+    @staticmethod
+    def format_status_table(status: Dict[str, Any]) -> str:
+        """
+        Formats the status dict into a readable table.
+        """
+        lines = [
+            f"Total: {status['total']}",
+            f"Completed: {len(status['completed'])} {status['completed']}",
+            f"Failed: {len(status['failed'])} {status['failed']}",
+            f"Pending: {len(status['pending'])} {status['pending']}",
+            "",
+            "Per-artifact timings:"
+        ]
+        for name, dur in status['completion_times'].items():
+            lines.append(f"  {name}: {dur:.2f}s")
+        return "\n".join(lines)
{sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/df_helper/_df_helper.py
RENAMED
@@ -46,8 +46,7 @@ class DfHelper:
 
     :ivar df: The DataFrame currently being processed or loaded.
     :type df: Union[dd.DataFrame, pd.DataFrame]
-    :
-    :type backend_connection: Optional[DjangoConnectionConfig]
+    :type backend_connection: Optional[DjangoConnectionConfig | SqlAlchemyConnectionConfig]
     :ivar _backend_query: Internal configuration for query handling.
     :type _backend_query: Optional[QueryConfig]
     :ivar _backend_params: Internal parameters configuration for DataFrame handling.
@@ -81,9 +80,10 @@ class DfHelper:
         self.debug = kwargs.setdefault("debug", False)
         self.logger = kwargs.get("logger", Logger.default_logger(logger_name=self.__class__.__name__))
         # Configure logger level
-        self.logger.set_level(
+        self.logger.set_level(Logger.DEBUG if self.debug else Logger.INFO)
         self.logger.debug("Logger initialized in DEBUG mode.")
         self.parquet_storage_path = kwargs.setdefault("parquet_storage_path", None)
+        self.parquet_filename = kwargs.setdefault("parquet_filename", None)
         self.dt_field = kwargs.setdefault("dt_field", None)
         self.as_pandas = kwargs.setdefault("as_pandas", False)
         self.filesystem = kwargs.pop('filesystem', 'file')
@@ -429,7 +429,11 @@ class DfHelper:
        the instance's attribute for storage path.
        :return: None
        """
+        if self.df.map_partitions(len).compute().sum() == 0:
+            self.logger.debug("Cannot save to parquet since DataFrame is empty")
+            return
         fs = kwargs.pop('fs', self.fs)
+        parquet_filename = parquet_filename or self.parquet_filename
         parquet_storage_path = kwargs.pop('parquet_storage_path', self.parquet_storage_path)
         ps = ParquetSaver(df_result=self.df, parquet_storage_path=parquet_storage_path, logger=self.logger, fs=fs)
         ps.save_to_parquet(parquet_filename)
{sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/df_helper/_parquet_artifact.py
RENAMED
@@ -7,7 +7,8 @@ import dask.dataframe as dd
 import fsspec
 
 from sibi_dst.df_helper import DfHelper
-from sibi_dst.utils import DataWrapper, DateUtils, Logger
+from sibi_dst.utils import DataWrapper, DateUtils, Logger, ParquetSaver, UpdatePlanner
+from sibi_dst.utils import MissingManifestManager
 
 
 class ParquetArtifact(DfHelper):
@@ -57,7 +58,16 @@ class ParquetArtifact(DfHelper):
         'backend': 'parquet'
     }
 
-
+    # DEFAULT_UPDATE_PLANNER_CONFIG = {
+    #     'reverse_order': True,
+    #     'overwrite': False,
+    #     'ignore_missing': True,
+    #     'history_days_threshold': 30,
+    #     'max_age_minutes': 10,
+    #     'show_progress': False
+    # }
+
+    def __init__(self, data_wrapper_class, **kwargs):
         """
         Initializes an instance of the class with given configuration and validates
         required parameters. Sets up the filesystem to handle storage, ensuring
@@ -79,46 +89,89 @@ class ParquetArtifact(DfHelper):
         `parquet_filename`, `parquet_start_date`,
         or `parquet_end_date`) are missing or not set properly.
         """
+
+        """Initialize with config, validate required fields, and setup filesystem."""
         self._lock = threading.Lock()
         self.config = {
             **self.DEFAULT_CONFIG,
             **kwargs,
         }
         self.df: Optional[dd.DataFrame] = None
-        self.
-        self.logger = self.config.setdefault('logger',Logger.default_logger(logger_name=f'parquet_artifact_{__class__.__name__}'))
-        self.logger.set_level(logging.DEBUG if self.debug else logging.INFO)
+        self._setup_logging()
         self.data_wrapper_class = data_wrapper_class
-
-        self.
-        self.
-
-
-        self.
-        if self.parquet_storage_path is None:
-            raise ValueError('parquet_storage_path must be set')
-
-        self.parquet_filename = self.config.setdefault('parquet_filename', None)
-        if self.parquet_filename is None:
-            raise ValueError('parquet_filename must be set')
-        self.parquet_start_date = self.config.setdefault('parquet_start_date', None)
-        if self.parquet_start_date is None:
-            raise ValueError('parquet_start_date must be set')
-
-        self.parquet_end_date = self.config.setdefault('parquet_end_date', None)
-        if self.parquet_end_date is None:
-            raise ValueError('parquet_end_date must be set')
+
+        self.date_field = self._validate_required('date_field')
+        self.parquet_storage_path = self._validate_required('parquet_storage_path')
+        self.parquet_filename = self._validate_required('parquet_filename')
+        self.parquet_start_date = self._validate_required('parquet_start_date')
+        self.parquet_end_date = self._validate_required('parquet_end_date')
 
         # Filesystem setup
         self.filesystem_type = self.config.setdefault('filesystem_type', 'file')
         self.filesystem_options = self.config.setdefault('filesystem_options', {})
         self.fs = self.config.setdefault('fs', None)
+        self._own_fs = self.fs is None
         if self.fs is None:
             self.fs = fsspec.filesystem(self.filesystem_type, **self.filesystem_options)
+            self._own_fs = True
         self.config.setdefault('fs', self.fs)
+        ## Populate to parameters to pass to data_wrapper_class
+        self.class_params = self.config.pop('class_params', {
+            'debug': self.debug,
+            'logger': self.logger,
+            'fs': self.fs,
+        })
+        # Populate parameters to pass to load method of DataWrapper class
+        self.load_params = self.config.setdefault('load_params', {})
         # Ensure the directory exists
         self.ensure_directory_exists(self.parquet_storage_path)
         super().__init__(**self.config)
+        self.update_planner_params = {}
+        self.datawrapper_params = {}
+
+    def _setup_logging(self):
+        """Initialize logger and debug settings."""
+        self.debug = self.config.get('debug', False)
+        self.logger = self.config.get('logger',
+                                      Logger.default_logger(
+                                          logger_name=f'Parquet_Artifact_InstanceOf_{self.__class__.__name__}'))
+        self.logger.set_level(Logger.DEBUG if self.debug else Logger.INFO)
+
+    def _validate_required(self, key: str) -> Any:
+        """Validate required configuration fields."""
+        value = self.config.setdefault(key, None)
+        if value is None:
+            raise ValueError(f'{key} must be set')
+        return value
+
+    def _setup_manifest(self, overwrite: bool = False, ignore_missing: bool = False):
+        self.skipped = []
+        self.missing_manifest_path = f"{self.parquet_storage_path}_manifests/missing.parquet"
+        self.mmanifest = MissingManifestManager(
+            fs=self.fs,
+            manifest_path=self.missing_manifest_path,
+            clear_existing=overwrite
+        )
+
+        # Initialize skipped files
+        manifest_exists = self.mmanifest._safe_exists(self.missing_manifest_path)
+        if not manifest_exists:
+            self.logger.info(f"Creating new manifest at {self.missing_manifest_path}")
+            self.mmanifest.save()
+        else:
+            self.logger.info(f"Manifest already exists at {self.missing_manifest_path}")
+
+        # Load skipped files if manifest exists and ignore_missing is True
+        self.skipped = self.mmanifest.load_existing()  # if ignore_missing and manifest_exists else []
+        self.logger.info(f"Skipped: {self.skipped}")
+        if overwrite:
+            self.skipped = []
+            self.ignore_missing = False
+
+    def _setup_update_planner(self, **kwargs) -> None:
+        self._prepare_update_params(**kwargs)
+        self.update_planner = UpdatePlanner(**self.update_planner_params)
+        self.update_planner.generate_plan(self.start_date, self.end_date)
 
     def load(self, **kwargs):
         with self._lock:
@@ -130,20 +183,30 @@ class ParquetArtifact(DfHelper):
         Generate a Parquet file using the configured DataWrapper class.
         """
         with self._lock:
-
-
-
+            overwrite = kwargs.get('overwrite', False)
+            ignore_missing = kwargs.get('ignore_missing', False)
+            self._setup_manifest(overwrite, ignore_missing)
+            self._setup_update_planner(**kwargs)
+            params = self.datawrapper_params.copy()
+            params.update({
+                'mmanifest': self.mmanifest,
+                'update_planner': self.update_planner
+            })
+
+            with DataWrapper(self.data_wrapper_class, **params) as dw:
+                dw.process()
 
     def __enter__(self):
         if getattr(self, "_entered", False):
             return self
         self._entered = True
-        self.ensure_directory_exists(self.parquet_storage_path)
         return self
 
     def __exit__(self, exc_type, exc_value, traceback):
         try:
-            if
+            if self.mmanifest and self.mmanifest._new_records:
+                self.mmanifest.save()
+            if getattr(self, "_entered", False) and self.fs and self._own_fs:
                 self.fs.close()
         except Exception as e:
             self.logger.warning(f"Error closing filesystem: {e}")
@@ -152,6 +215,18 @@ class ParquetArtifact(DfHelper):
         # return False so exceptions aren’t suppressed
         return False
 
+    @classmethod
+    def get_size_estimate(cls, parquet_path: str, **kwargs) -> int:
+        """
+        Estimate complexity as total bytes of all .parquet files under parquet_path.
+        Returns size in megabytes (so you can cap or scale priority sensibly).
+        """
+        fs, _, paths = cls.fs.get_fs_token_paths(parquet_path)
+        files = fs.glob(f"{parquet_path}/*.parquet")
+        total_bytes = sum(fs.size(f) for f in files)
+        # convert to “units” (e.g. MB) so priorities stay in a reasonable range
+        return max(1, int(total_bytes / (1024 ** 2)))
+
     def update_parquet(self, period: str = 'today', **kwargs) -> None:
         """Update the Parquet file with data from a specific period."""
 
@@ -206,36 +281,52 @@ class ParquetArtifact(DfHelper):
            'end_date': kwargs.get('parquet_end_date', self.parquet_end_date),
         }
 
-    def
-
-
-
-
-
+    def _prepare_update_params(self, **kwargs) -> Dict[str, Any]:
+        self.reverse_order = kwargs.pop('reverse_order', True)
+        self.overwrite = kwargs.pop('overwrite', False)
+        self.ignore_missing = kwargs.pop('ignore_missing', False)
+        self.history_days_threshold = kwargs.pop('history_days_threshold', 30)
+        self.max_age_minutes = kwargs.pop('max_age_minutes', 10)
+        self.show_progress = kwargs.pop('show_progress', False)
+        self.start_date = kwargs.pop('parquet_start_date', self.parquet_start_date)
+        self.end_date = kwargs.pop('parquet_end_date', self.parquet_end_date)
+        self.parquet_filename = kwargs.pop('parquet_filename', self.parquet_filename)
+        self.verbose = kwargs.pop('verbose', False)
+
+        self.update_planner_params.update({
+            'filename': self.parquet_filename,
            'data_path': self.parquet_storage_path,
-            '
-            '
-            'end_date': kwargs.pop('parquet_end_date', self.parquet_end_date),
-            'verbose': kwargs.pop('verbose', False),
-            'load_params': kwargs.pop('load_params', None),
-            'reverse_order': kwargs.pop('reverse_order', True),
-            'overwrite': kwargs.pop('overwrite', False),
-            'ignore_missing': kwargs.pop('ignore_missing', False),
+            'fs': self.fs,
+            'debug': self.debug,
            'logger': self.logger,
-            '
-            '
-            '
+            'reverse_order': self.reverse_order,
+            'overwrite': self.overwrite,
+            'ignore_missing': self.ignore_missing,
+            'history_days_threshold': self.history_days_threshold,
+            'max_age_minutes': self.max_age_minutes,
+            'show_progress': self.show_progress,
+            'description': f"{self.data_wrapper_class.__name__}",
+            'skipped': self.skipped,
+            'verbose': self.verbose,
+        })
+
+        self.datawrapper_params = {
+            'parquet_filename': self.parquet_filename,
+            'data_path': self.parquet_storage_path,
            'fs': self.fs,
-            '
-            '
+            'class_params': self.class_params,
+            'date_field': self.date_field,
+            'load_params': self.load_params,
+            'verbose': self.verbose
         }
 
-
-    def parse_parquet_period(**kwargs):
+    def parse_parquet_period(self, **kwargs):
         start_date, end_date = DateUtils.parse_period(**kwargs)
+        self.parquet_start_date = start_date.strftime('%Y-%m-%d')
+        self.parquet_end_date = end_date.strftime('%Y-%m-%d')
         return {
-            'parquet_start_date':
-            'parquet_end_date':
+            'parquet_start_date': self.parquet_start_date,
+            'parquet_end_date': self.parquet_end_date,
         }
 
     def ensure_directory_exists(self, path: str) -> None:
{sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py
RENAMED
@@ -155,7 +155,7 @@ class SqlAlchemyConnectionConfig(BaseModel):
         key = self._engine_key()
         with self._registry_lock:
             if self._engine_registry.get(key) is not self.engine:
-                self.logger.
+                self.logger.debug("Engine changed")
                 return 0
         pool = self.engine.pool
         if isinstance(pool, QueuePool):
@@ -191,10 +191,10 @@ class SqlAlchemyConnectionConfig(BaseModel):
         with self._registry_lock:
             key = self._engine_key()
             if not self._owns_engine:
-                self.logger.
+                self.logger.debug("Not owner, skipping close")
                 return
             if self._engine_registry.get(key) != self.engine:
-                self.logger.
+                self.logger.debug("Engine not in registry")
                 return
             self.engine.dispose()
             del self._engine_registry[key]
{sibi_dst-0.3.58 → sibi_dst-0.3.59}/sibi_dst/utils/__init__.py
RENAMED
@@ -17,7 +17,7 @@ from .data_wrapper import DataWrapper
 from .storage_config import StorageConfig
 from .data_from_http_source import DataFromHttpSource
 from .webdav_client import WebDAVClient
-
+from .manifest_manager import MissingManifestManager
 
 __all__ = [
     "Logger",
@@ -38,5 +38,6 @@ __all__ = [
     "AirflowDAGManager",
     "StorageConfig",
     "DataFromHttpSource",
-    "WebDAVClient"
+    "WebDAVClient",
+    "MissingManifestManager",
 ]
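
MissingManifestManager is the new utils export that ParquetArtifact wires up in _setup_manifest; it appears to record paths that produced no data so later update plans can skip them. A short sketch of that wiring, based only on the calls visible in this diff (the path is a placeholder and the manager's wider API is not shown here):

import fsspec

from sibi_dst.utils import MissingManifestManager

fs = fsspec.filesystem('file')
manifest_path = "/tmp/data/some_table_manifests/missing.parquet"  # hypothetical path

mmanifest = MissingManifestManager(fs=fs, manifest_path=manifest_path, clear_existing=False)
if not mmanifest._safe_exists(manifest_path):
    mmanifest.save()                 # create the manifest on first run
skipped = mmanifest.load_existing()  # previously recorded missing paths
print(skipped)
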
|