sibi-dst 2025.1.13__py3-none-any.whl → 2025.8.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sibi_dst/__init__.py +7 -1
- sibi_dst/df_helper/_artifact_updater_multi_wrapper.py +235 -342
- sibi_dst/df_helper/_df_helper.py +417 -117
- sibi_dst/df_helper/_parquet_artifact.py +255 -283
- sibi_dst/df_helper/backends/parquet/_parquet_options.py +8 -4
- sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +68 -107
- sibi_dst/df_helper/backends/sqlalchemy/_db_gatekeeper.py +15 -0
- sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +105 -255
- sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +90 -42
- sibi_dst/df_helper/backends/sqlalchemy/_model_registry.py +192 -0
- sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +122 -72
- sibi_dst/osmnx_helper/route_path_builder.py +45 -46
- sibi_dst/utils/base.py +302 -96
- sibi_dst/utils/clickhouse_writer.py +472 -206
- sibi_dst/utils/data_utils.py +139 -186
- sibi_dst/utils/data_wrapper.py +317 -73
- sibi_dst/utils/date_utils.py +1 -0
- sibi_dst/utils/df_utils.py +193 -213
- sibi_dst/utils/file_utils.py +3 -2
- sibi_dst/utils/filepath_generator.py +314 -152
- sibi_dst/utils/log_utils.py +581 -242
- sibi_dst/utils/manifest_manager.py +60 -76
- sibi_dst/utils/parquet_saver.py +33 -27
- sibi_dst/utils/phone_formatter.py +88 -95
- sibi_dst/utils/update_planner.py +180 -178
- sibi_dst/utils/webdav_client.py +116 -166
- {sibi_dst-2025.1.13.dist-info → sibi_dst-2025.8.1.dist-info}/METADATA +1 -1
- {sibi_dst-2025.1.13.dist-info → sibi_dst-2025.8.1.dist-info}/RECORD +29 -27
- {sibi_dst-2025.1.13.dist-info → sibi_dst-2025.8.1.dist-info}/WHEEL +0 -0
sibi_dst/utils/update_planner.py
CHANGED
@@ -1,61 +1,21 @@
|
|
1
|
-
import datetime
|
1
|
+
import datetime as dt
|
2
2
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
3
3
|
from typing import List, Optional, Dict, Union, Tuple, Set, Iterator, ClassVar
|
4
|
+
|
4
5
|
import pandas as pd
|
5
|
-
from .date_utils import FileAgeChecker
|
6
|
-
from pydantic import BaseModel, Field
|
7
|
-
from rich.console import Console
|
8
|
-
from rich.table import Table
|
9
6
|
|
10
7
|
from sibi_dst.utils import ManagedResource
|
8
|
+
from .date_utils import FileAgeChecker
|
11
9
|
|
12
10
|
|
13
|
-
class UpdateConfig(BaseModel):
|
14
|
-
"""
|
15
|
-
A unified Pydantic model for the data update process configuration.
|
16
|
-
Acts as a single source of truth for all settings.
|
17
|
-
"""
|
18
|
-
overwrite: bool = False
|
19
|
-
reverse_order: bool = True
|
20
|
-
ignore_missing: bool = False
|
21
|
-
history_days_threshold: int = 30
|
22
|
-
max_age_minutes: int = 1440 # 24 hours
|
23
|
-
show_progress: bool = False
|
24
|
-
verbose: bool = False
|
25
|
-
debug: bool = False
|
26
|
-
start_date: datetime.date
|
27
|
-
end_date: datetime.date
|
28
|
-
custom_priority_map: Optional[Dict[str, int]] = None
|
29
|
-
max_threads: int = 3
|
30
|
-
timeout: float = 30.0
|
31
|
-
|
32
|
-
class Config:
|
33
|
-
arbitrary_types_allowed = True
|
34
|
-
|
35
11
|
class UpdatePlanner(ManagedResource):
|
36
12
|
"""
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
Attributes:
|
41
|
-
data_path: Base path (always ends with '/').
|
42
|
-
filename: Filename inside each date folder.
|
43
|
-
fs: fsspec filesystem instance.
|
44
|
-
age_checker: FileAgeChecker for computing file ages.
|
45
|
-
reference_date: The "today" date used for history windows (date or ISO string).
|
46
|
-
history_days_threshold: Number of days considered "in history".
|
47
|
-
max_age_minutes: File staleness threshold in minutes.
|
48
|
-
overwrite: If True, forces updates for all dates.
|
49
|
-
ignore_missing: If True, skips missing files outside history.
|
50
|
-
reverse_order: If True, sorts dates descending in output.
|
51
|
-
show_progress: If True, displays a tqdm progress bar.
|
52
|
-
logger: Logger for informational messages.
|
53
|
-
|
54
|
-
Note:
|
55
|
-
generate_plan() will overwrite self.plan and self.df_req, and returns a DataFrame of required updates.
|
13
|
+
Scans date-partitioned storage and builds an 'update plan' for dates that need processing.
|
14
|
+
Produces a Pandas DataFrame plan; it does *not* load data frames, so Dask-vs-Pandas
|
15
|
+
concerns do not apply here.
|
56
16
|
"""
|
57
17
|
|
58
|
-
DEFAULT_PRIORITY_MAP: ClassVar[Dict[str, int]]={
|
18
|
+
DEFAULT_PRIORITY_MAP: ClassVar[Dict[str, int]] = {
|
59
19
|
"file_is_recent": 0,
|
60
20
|
"missing_ignored": 0,
|
61
21
|
"overwrite_forced": 1,
|
@@ -68,183 +28,221 @@ class UpdatePlanner(ManagedResource):
|
|
68
28
|
DEFAULT_HISTORY_DAYS_THRESHOLD: int = 30
|
69
29
|
|
70
30
|
def __init__(
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
31
|
+
self,
|
32
|
+
parquet_storage_path: str,
|
33
|
+
parquet_filename: str,
|
34
|
+
description: str = "Update Planner",
|
35
|
+
reference_date: Union[str, dt.date, None] = None,
|
36
|
+
history_days_threshold: int = DEFAULT_HISTORY_DAYS_THRESHOLD,
|
37
|
+
max_age_minutes: int = DEFAULT_MAX_AGE_MINUTES,
|
38
|
+
overwrite: bool = False,
|
39
|
+
ignore_missing: bool = False,
|
40
|
+
custom_priority_map: Optional[Dict[str, int]] = None,
|
41
|
+
reverse_order: bool = False,
|
42
|
+
show_progress: bool = False,
|
43
|
+
skipped: Optional[List[str]] = None,
|
44
|
+
**kwargs,
|
85
45
|
):
|
86
|
-
# Initialize state
|
87
46
|
super().__init__(**kwargs)
|
88
|
-
|
89
|
-
|
47
|
+
|
48
|
+
# Public-ish attributes
|
90
49
|
self.description = description
|
91
|
-
self.data_path = self._ensure_trailing_slash(
|
92
|
-
self.filename =
|
50
|
+
self.data_path = self._ensure_trailing_slash(parquet_storage_path)
|
51
|
+
self.filename = parquet_filename
|
93
52
|
self.reverse_order = reverse_order
|
94
53
|
self.show_progress = show_progress
|
95
|
-
self.
|
54
|
+
self.overwrite = overwrite
|
55
|
+
self.ignore_missing = ignore_missing
|
56
|
+
self.history_days_threshold = history_days_threshold
|
57
|
+
self.max_age_minutes = max_age_minutes
|
58
|
+
self.priority_map = custom_priority_map or self.DEFAULT_PRIORITY_MAP
|
59
|
+
self.skipped = set(skipped or [])
|
96
60
|
|
97
|
-
#
|
61
|
+
# Execution knobs from kwargs (fed by upstream config)
|
62
|
+
self.max_threads: int = int(kwargs.get("max_threads", 3))
|
63
|
+
self.timeout: float = float(kwargs.get("timeout", 30.0))
|
64
|
+
|
65
|
+
# Date window
|
66
|
+
self.start_date = kwargs.get("parquet_start_date")
|
67
|
+
self.end_date = kwargs.get("parquet_end_date")
|
68
|
+
|
69
|
+
# Reference "today"
|
98
70
|
if reference_date is None:
|
99
|
-
self.reference_date =
|
71
|
+
self.reference_date = dt.date.today()
|
100
72
|
else:
|
101
73
|
self.reference_date = pd.to_datetime(reference_date).date()
|
102
74
|
|
103
|
-
#
|
104
|
-
self.
|
105
|
-
self.
|
106
|
-
self.
|
107
|
-
self.ignore_missing = ignore_missing
|
108
|
-
self.priority_map = custom_priority_map or self.DEFAULT_PRIORITY_MAP
|
109
|
-
self.skipped = skipped or []
|
110
|
-
|
111
|
-
@staticmethod
|
112
|
-
def _ensure_trailing_slash(path: str) -> str:
|
113
|
-
"""Ensure that the provided path ends with a single '/'."""
|
114
|
-
return path.rstrip('/') + '/'
|
115
|
-
|
116
|
-
def _generate_plan(
|
117
|
-
self,
|
118
|
-
start: datetime.date,
|
119
|
-
end: datetime.date,
|
120
|
-
freq: str = "D"
|
121
|
-
) -> None:
|
122
|
-
"""
|
123
|
-
Internal: populates self.plan with all dates, and self.df_req with only those needing update.
|
124
|
-
"""
|
125
|
-
dates = pd.date_range(start=start, end=end, freq=freq).date.tolist()
|
126
|
-
history_start = self.reference_date - datetime.timedelta(days=self.history_days_threshold)
|
127
|
-
rows: List[Dict] = []
|
75
|
+
# Helpers & state
|
76
|
+
self.age_checker = FileAgeChecker(debug=self.debug, logger=self.logger)
|
77
|
+
self.plan: pd.DataFrame = pd.DataFrame()
|
78
|
+
self.df_req: pd.DataFrame = pd.DataFrame()
|
128
79
|
|
129
|
-
#
|
130
|
-
|
131
|
-
futures = {executor.submit(self._get_file_status, d): d for d in dates}
|
132
|
-
iterator = as_completed(futures)
|
133
|
-
if self.show_progress:
|
134
|
-
from tqdm import tqdm
|
135
|
-
iterator = tqdm(
|
136
|
-
iterator,
|
137
|
-
total=len(futures),
|
138
|
-
desc=f"Scanning dates for {self.description}",
|
139
|
-
unit="date",
|
140
|
-
leave=False
|
141
|
-
)
|
142
|
-
for future in iterator:
|
143
|
-
d = futures[future]
|
144
|
-
try:
|
145
|
-
exists, age = future.result()
|
146
|
-
rows.append(self._make_row(d, history_start, exists, age))
|
147
|
-
except Exception as exc:
|
148
|
-
self.logger.error(f"Error processing date {d}: {exc}")
|
149
|
-
rows.append(self._make_row(d, history_start, False, None))
|
80
|
+
# internal run flag to print once per run if caller reuses instance
|
81
|
+
self._printed_this_run: bool = False
|
150
82
|
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
).reset_index(drop=True)
|
83
|
+
# --------------------- public helpers ---------------------
|
84
|
+
def has_plan(self) -> bool:
|
85
|
+
"""Safe truthiness for plan existence."""
|
86
|
+
return isinstance(self.plan, pd.DataFrame) and not self.plan.empty
|
156
87
|
|
157
|
-
|
158
|
-
self.df_req
|
88
|
+
def required_count(self) -> int:
|
89
|
+
return 0 if not isinstance(self.df_req, pd.DataFrame) else len(self.df_req)
|
159
90
|
|
91
|
+
# --------------------- core API ---------------------
|
160
92
|
def generate_plan(
|
161
|
-
|
162
|
-
|
163
|
-
|
93
|
+
self,
|
94
|
+
start: Union[str, dt.date, None] = None,
|
95
|
+
end: Union[str, dt.date, None] = None,
|
96
|
+
freq: str = "D",
|
164
97
|
) -> pd.DataFrame:
|
165
98
|
"""
|
166
|
-
|
167
|
-
sorted by update_priority and date (descending if reverse_order=True).
|
99
|
+
Build a plan for [start, end]. Returns rows that require update (df_req).
|
168
100
|
"""
|
101
|
+
start = start or self.start_date
|
102
|
+
end = end or self.end_date
|
169
103
|
sd = pd.to_datetime(start).date()
|
170
104
|
ed = pd.to_datetime(end).date()
|
171
105
|
if sd > ed:
|
172
106
|
raise ValueError(f"Start date ({sd}) must be on or before end date ({ed}).")
|
173
107
|
|
174
108
|
self.logger.info(f"Generating update plan for {self.description} from {sd} to {ed}")
|
175
|
-
self._generate_plan(sd, ed)
|
109
|
+
self._generate_plan(sd, ed, freq=freq)
|
176
110
|
self.logger.info(
|
177
111
|
f"Plan built for {self.description}: {len(self.plan)} dates evaluated, "
|
178
112
|
f"{len(self.df_req)} require update"
|
179
113
|
)
|
180
|
-
|
181
114
|
return self.df_req
|
182
115
|
|
183
116
|
def show_update_plan(self) -> None:
|
184
|
-
"""
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
117
|
+
"""Pretty-print the current plan once per run."""
|
118
|
+
if not self.has_plan():
|
119
|
+
self.logger.info("No update plan to show.")
|
120
|
+
return
|
121
|
+
if self._printed_this_run:
|
189
122
|
return
|
190
123
|
|
191
|
-
|
124
|
+
try:
|
125
|
+
from rich.console import Console
|
126
|
+
from rich.table import Table
|
127
|
+
except Exception:
|
128
|
+
# Fallback: plain text
|
129
|
+
self.logger.info(f"Update Plan (plain list):\n{self.plan.to_string(index=False)}")
|
130
|
+
self._printed_this_run = True
|
131
|
+
return
|
192
132
|
|
193
|
-
table = Table(
|
133
|
+
table = Table(
|
134
|
+
title=f"Update Plan for {self.data_path}",
|
135
|
+
show_header=True,
|
136
|
+
header_style="bold magenta",
|
137
|
+
)
|
194
138
|
for column in self.plan.columns:
|
195
139
|
table.add_column(column, justify="left")
|
140
|
+
|
196
141
|
for _, row in self.plan.iterrows():
|
197
|
-
table.add_row(*(str(
|
142
|
+
table.add_row(*(str(row[col]) for col in self.plan.columns))
|
198
143
|
|
199
144
|
console = Console()
|
200
145
|
with console.capture() as capture:
|
201
146
|
console.print(table)
|
202
|
-
|
203
|
-
|
204
|
-
self.logger.info(f"Full Update Plan:\n{plan_string.strip()}")
|
147
|
+
self.logger.info(f"Full Update Plan:\n{capture.get().strip()}")
|
148
|
+
self._printed_this_run = True
|
205
149
|
|
206
|
-
def get_tasks_by_priority(self) -> Iterator[Tuple[int, List[
|
207
|
-
"""
|
208
|
-
|
150
|
+
def get_tasks_by_priority(self) -> Iterator[Tuple[int, List[dt.date]]]:
|
151
|
+
"""
|
152
|
+
Yield (priority, [dates...]) batches, smallest priority first.
|
153
|
+
"""
|
154
|
+
if not self.has_plan():
|
209
155
|
return
|
210
|
-
|
211
|
-
|
212
|
-
if required_updates.empty:
|
156
|
+
req = self.plan[self.plan["update_required"]]
|
157
|
+
if req.empty:
|
213
158
|
return
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
dates = sorted_dates["date"].tolist()
|
159
|
+
for priority in sorted(req["update_priority"].unique()):
|
160
|
+
dates_df = req[req["update_priority"] == priority]
|
161
|
+
# sort within group
|
162
|
+
dates_df = dates_df.sort_values(by="date", ascending=not self.reverse_order)
|
163
|
+
dates = dates_df["date"].tolist()
|
220
164
|
if dates:
|
221
|
-
yield priority, dates
|
165
|
+
yield int(priority), dates
|
166
|
+
|
167
|
+
# --------------------- internals ---------------------
|
168
|
+
@staticmethod
|
169
|
+
def _ensure_trailing_slash(path: str) -> str:
|
170
|
+
return path.rstrip("/") + "/"
|
222
171
|
|
223
|
-
def
|
224
|
-
|
225
|
-
|
226
|
-
|
172
|
+
def _generate_plan(self, start: dt.date, end: dt.date, freq: str = "D") -> None:
|
173
|
+
"""
|
174
|
+
Populate self.plan with all dates and self.df_req with the subset to update.
|
175
|
+
"""
|
176
|
+
dates = pd.date_range(start=start, end=end, freq=freq).date.tolist()
|
177
|
+
history_start = self.reference_date - dt.timedelta(days=self.history_days_threshold)
|
178
|
+
rows: List[Dict] = []
|
179
|
+
|
180
|
+
# bound threads
|
181
|
+
max_workers = max(1, int(self.max_threads))
|
182
|
+
|
183
|
+
with ThreadPoolExecutor(max_workers=max_workers) as executor:
|
184
|
+
futures = {executor.submit(self._get_file_status, d): d for d in dates}
|
185
|
+
iterator = as_completed(futures)
|
186
|
+
if self.show_progress:
|
187
|
+
try:
|
188
|
+
from tqdm import tqdm
|
189
|
+
iterator = tqdm(
|
190
|
+
iterator, total=len(futures),
|
191
|
+
desc=f"Scanning dates for {self.description}",
|
192
|
+
unit="date", leave=False
|
193
|
+
)
|
194
|
+
except Exception:
|
195
|
+
pass # no tqdm → proceed without progress bar
|
196
|
+
|
197
|
+
for future in iterator:
|
198
|
+
d = futures[future]
|
199
|
+
try:
|
200
|
+
exists, age = future.result(timeout=self.timeout)
|
201
|
+
rows.append(self._make_row(d, history_start, exists, age))
|
202
|
+
except Exception as exc:
|
203
|
+
self.logger.error(f"Error processing date {d}: {exc}")
|
204
|
+
rows.append(self._make_row(d, history_start, False, None))
|
205
|
+
|
206
|
+
df = pd.DataFrame(rows)
|
207
|
+
# consistent types
|
208
|
+
if not df.empty:
|
209
|
+
df["date"] = pd.to_datetime(df["date"]).dt.date
|
210
|
+
df["update_priority"] = df["update_priority"].astype(int)
|
211
|
+
|
212
|
+
df = df.sort_values(
|
213
|
+
by=["update_priority", "date"],
|
214
|
+
ascending=[True, not self.reverse_order],
|
215
|
+
kind="mergesort", # stable
|
216
|
+
).reset_index(drop=True)
|
217
|
+
|
218
|
+
self.plan = df
|
219
|
+
self.df_req = df[df["update_required"]].copy()
|
220
|
+
self._printed_this_run = False
|
221
|
+
|
222
|
+
def _get_file_status(self, date: dt.date) -> Tuple[bool, Optional[float]]:
|
227
223
|
"""
|
228
224
|
Check file existence and age for the given date.
|
229
225
|
"""
|
230
226
|
just_path = f"{self.data_path}{date.year}/{date.month:02d}/{date.day:02d}/"
|
231
227
|
if just_path in self.skipped:
|
232
|
-
self.logger.debug(f"
|
228
|
+
self.logger.debug(f"Skipping {date}: path in skipped list.")
|
233
229
|
return False, None
|
230
|
+
|
234
231
|
path = f"{just_path}{self.filename}"
|
235
232
|
try:
|
236
233
|
exists = self.fs.exists(path)
|
237
234
|
age = self.age_checker.get_file_or_dir_age_minutes(path, self.fs) if exists else None
|
238
|
-
return exists, age
|
239
|
-
except Exception:
|
235
|
+
return bool(exists), age
|
236
|
+
except Exception as e:
|
237
|
+
self.logger.warning(f"exists/age check failed for {path}: {e}")
|
240
238
|
return False, None
|
241
239
|
|
242
240
|
def _make_row(
|
243
|
-
|
244
|
-
|
245
|
-
|
246
|
-
|
247
|
-
|
241
|
+
self,
|
242
|
+
date: dt.date,
|
243
|
+
history_start: dt.date,
|
244
|
+
file_exists: bool,
|
245
|
+
file_age: Optional[float],
|
248
246
|
) -> Dict:
|
249
247
|
"""
|
250
248
|
Build a single plan row based on flags and thresholds.
|
@@ -252,11 +250,11 @@ class UpdatePlanner(ManagedResource):
|
|
252
250
|
within_history = history_start <= date <= self.reference_date
|
253
251
|
update_required = False
|
254
252
|
|
255
|
-
# 1
|
253
|
+
# 1) Overwrite forces update
|
256
254
|
if self.overwrite:
|
257
255
|
category = "overwrite_forced"
|
258
256
|
update_required = True
|
259
|
-
# 2
|
257
|
+
# 2) Inside history window
|
260
258
|
elif within_history:
|
261
259
|
if not file_exists:
|
262
260
|
category = "missing_in_history"
|
@@ -266,33 +264,37 @@ class UpdatePlanner(ManagedResource):
|
|
266
264
|
update_required = True
|
267
265
|
else:
|
268
266
|
category = "file_is_recent"
|
269
|
-
# 3
|
267
|
+
# 3) Outside history, missing file (and not ignoring)
|
270
268
|
elif not file_exists and not self.ignore_missing:
|
271
269
|
category = "create_missing"
|
272
270
|
update_required = True
|
273
|
-
# 4
|
271
|
+
# 4) Everything else
|
274
272
|
else:
|
275
273
|
category = "missing_ignored" if not file_exists else "file_is_recent"
|
276
274
|
|
277
275
|
return {
|
278
276
|
"date": date,
|
279
|
-
"file_exists": file_exists,
|
277
|
+
"file_exists": bool(file_exists),
|
280
278
|
"file_age_minutes": file_age,
|
281
279
|
"update_category": category,
|
282
280
|
"update_priority": self.priority_map.get(category, 99),
|
283
|
-
"update_required": update_required,
|
281
|
+
"update_required": bool(update_required),
|
284
282
|
"description": self.description,
|
285
283
|
}
|
286
284
|
|
287
|
-
def exclude_dates(self, dates: Set[
|
285
|
+
def exclude_dates(self, dates: Set[dt.date]) -> None:
|
288
286
|
"""
|
289
287
|
Exclude specific dates from the update plan.
|
290
288
|
"""
|
291
289
|
if not isinstance(dates, set):
|
292
|
-
raise ValueError("dates must be a set
|
293
|
-
if self.
|
294
|
-
self.logger.
|
290
|
+
raise ValueError("dates must be a set[date].")
|
291
|
+
if not self.has_plan():
|
292
|
+
self.logger.info("No update plan to modify. Call generate_plan() first.")
|
295
293
|
return
|
296
|
-
|
297
|
-
|
298
|
-
self.
|
294
|
+
|
295
|
+
before = len(self.plan)
|
296
|
+
self.plan = self.plan[~self.plan["date"].isin(dates)]
|
297
|
+
self.df_req = self.plan[self.plan["update_required"]].copy()
|
298
|
+
self.logger.info(
|
299
|
+
f"Excluded {len(dates)} dates from the update plan (from {before} to {len(self.plan)} rows)."
|
300
|
+
)
|