sibi-dst 2025.9.8__py3-none-any.whl → 2025.9.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sibi_dst/df_helper/_artifact_updater_async.py +191 -137
- sibi_dst/df_helper/_parquet_artifact.py +6 -326
- sibi_dst/df_helper/_parquet_reader.py +2 -1
- sibi_dst/df_helper/backends/parquet/_parquet_options.py +26 -2
- sibi_dst/utils/boilerplate/__init__.py +7 -3
- sibi_dst/utils/boilerplate/base_attacher.py +57 -12
- sibi_dst/utils/boilerplate/base_pipeline.py +14 -29
- sibi_dst/utils/boilerplate/base_pipeline_template.py +54 -0
- sibi_dst/utils/clickhouse_writer.py +1 -1
- sibi_dst/utils/data_wrapper.py +46 -312
- sibi_dst/utils/parquet_saver.py +29 -16
- sibi_dst/utils/progress/sse_runner.py +39 -11
- sibi_dst/utils/update_planner.py +161 -805
- {sibi_dst-2025.9.8.dist-info → sibi_dst-2025.9.10.dist-info}/METADATA +2 -1
- {sibi_dst-2025.9.8.dist-info → sibi_dst-2025.9.10.dist-info}/RECORD +16 -15
- {sibi_dst-2025.9.8.dist-info → sibi_dst-2025.9.10.dist-info}/WHEEL +0 -0
sibi_dst/utils/update_planner.py
CHANGED
```diff
@@ -1,54 +1,52 @@
-# update_planner.py
 from __future__ import annotations
 
 import datetime as dt
+import re
 from concurrent.futures import ThreadPoolExecutor, wait
 from typing import List, Optional, Dict, Union, Tuple, Set, Iterator, ClassVar, Any, Callable
 
 import pandas as pd
 
 from sibi_dst.utils import ManagedResource
-from . import FileAgeChecker
+from . import FileAgeChecker
 
 
 class UpdatePlanner(ManagedResource):
     """
-
-    Backward compatible: public API and legacy attributes preserved; enhancements are opt-in via kwargs.
+    Update planner for datasets organized either as:
 
-
-
-
-
-    - Real timeouts using concurrent.futures.wait(...).
-    - Future dates marked as 'future' (not actionable).
+      - Legacy layout: /YYYY/MM/DD/file.parquet
+      - Hive layout: /partition_date=YYYY-MM-DD/[other=val]/file.parquet
+
+    Public API is unchanged (`generate_plan`, `show_update_plan`, etc.).
     """
 
-    # -------- Defaults (extended, but original keys retained) --------
     DEFAULT_PRIORITY_MAP: ClassVar[Dict[str, int]] = {
         "file_is_recent": 0,
         "missing_ignored": 0,
         "overwrite_forced": 1,
-        "incomplete": 1,
+        "incomplete": 1,
         "create_missing": 2,
         "missing_in_history": 3,
         "stale_in_history": 4,
-        "future": 99,
+        "future": 99,
     }
 
     DEFAULT_MAX_AGE_MINUTES: int = 1440
     DEFAULT_HISTORY_DAYS_THRESHOLD: int = 30
 
-    # Data/Control file heuristics (can be overridden)
     DATA_FILE_PATTERNS: ClassVar[Tuple[str, ...]] = (".parquet", ".orc", ".csv", ".json")
     CONTROL_BASENAMES: ClassVar[Set[str]] = {"_SUCCESS", "_metadata", "_common_metadata"}
 
+    HIVE_PARTITION_RE: ClassVar[re.Pattern] = re.compile(r"([^/=]+)=([^/]+)")
+
     logger_extra = {"sibi_dst_component": __name__}
 
     def __init__(
         self,
         parquet_storage_path: str,
-
+        *,
+        partition_on: Optional[List[str]] = None,
         description: str = "Update Planner",
         reference_date: Union[str, dt.date, None] = None,
         history_days_threshold: int = DEFAULT_HISTORY_DAYS_THRESHOLD,
```
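The new class docstring names the two supported layouts, and the added `HIVE_PARTITION_RE` is what recognizes Hive-style `key=value` path segments. A minimal standalone sketch of how that pattern classifies paths (the helper name and sample paths are illustrative, not from the package; the loop mirrors the `_extract_partitions` helper added later in this diff):

```python
import re

# Same pattern the diff adds as HIVE_PARTITION_RE.
HIVE_PARTITION_RE = re.compile(r"([^/=]+)=([^/]+)")

def partitions_from(path: str) -> dict:
    """Collect key=value segments from a path, Hive-style."""
    out = {}
    for seg in path.strip("/").split("/"):
        m = HIVE_PARTITION_RE.match(seg)
        if m:
            out[m.group(1)] = m.group(2)
    return out

# Hive layout: the date is recoverable from the segment itself.
print(partitions_from("bucket/ds/partition_date=2025-09-10/part-0.parquet"))
# -> {'partition_date': '2025-09-10'}

# Legacy layout: no key=value segments, so the dict stays empty.
print(partitions_from("bucket/ds/2025/09/10/part-0.parquet"))
# -> {}
```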
```diff
@@ -58,57 +56,61 @@ class UpdatePlanner(ManagedResource):
         custom_priority_map: Optional[Dict[str, int]] = None,
         reverse_order: bool = False,
         show_progress: bool = False,
+        hive_style: bool = False,
         skipped: Optional[List[Union[str, dt.date]]] = None,
         **kwargs,
     ):
         super().__init__(**kwargs)
 
-        # ----
-        self.description: str = description
+        # ---- core config ----
         self.data_path: str = self._ensure_trailing_slash(parquet_storage_path)
-        self.
+        self.description: str = description
         self.reverse_order: bool = reverse_order
         self.show_progress: bool = show_progress
         self.overwrite: bool = overwrite
         self.ignore_missing: bool = ignore_missing
         self.history_days_threshold: int = history_days_threshold
         self.max_age_minutes: int = max_age_minutes
-        # Copy to avoid shared mutation
         self.priority_map: Dict[str, int] = dict(custom_priority_map) if custom_priority_map else dict(self.DEFAULT_PRIORITY_MAP)
 
-        # ----
+        # ---- NEW: Hive partition support ----
+        self.hive_style: bool = hive_style
+        self.partition_on: List[str] = list(partition_on or ["partition_date"] if self.hive_style else ["year", "month", "day"])
+
+        # ---- execution knobs ----
         self.max_threads: int = int(kwargs.get("max_threads", 3))
-        self.timeout: float = float(kwargs.get("timeout", 30.0))
+        self.timeout: float = float(kwargs.get("timeout", 30.0))
+        self.list_timeout: float = float(kwargs.get("list_timeout", self.timeout))
+        self.total_timeout: float = float(kwargs.get("total_timeout", self.timeout))
 
-        # ----
+        # ---- date window ----
         self.start_date = kwargs.get("parquet_start_date")
         self.end_date = kwargs.get("parquet_end_date")
 
-        # ----
+        # ---- reference date ----
         if reference_date is not None:
             self.reference_date: dt.date = pd.to_datetime(reference_date).date()
         else:
             self.reference_date: dt.date = dt.date.today()
 
-        # ----
-        self.check_completeness: bool = bool(kwargs.get("check_completeness",
-        self.require_success_marker: bool = bool(kwargs.get("require_success_marker",
+        # ---- completeness/heuristics ----
+        self.check_completeness: bool = bool(kwargs.get("check_completeness", False))
+        self.require_success_marker: bool = bool(kwargs.get("require_success_marker", False))
         self.list_granularity: str = str(kwargs.get("list_granularity", "month"))
         self.data_file_suffixes: Tuple[str, ...] = tuple(kwargs.get("data_file_suffixes", self.DATA_FILE_PATTERNS))
-
-
-        # Dependency-injected clock (UTC) for tests
+
+        # ---- clock for tests ----
         self._utcnow: Callable[[], dt.datetime] = kwargs.get("utcnow_func", None) or (lambda: dt.datetime.utcnow())
 
-        # ----
-        # Keep legacy attribute and derive new internal canonical sets.
+        # ---- skipped (back-compat) ----
         self.skipped = list(skipped or kwargs.get("skipped", []) or [])
         self.skipped_paths: Set[str] = {p.rstrip("/") + "/" for p in self.skipped if isinstance(p, str)}
         self.skipped_dates: Set[dt.date] = {p for p in self.skipped if isinstance(p, dt.date)}
 
-        # ---- Helpers & State ----
         if not getattr(self, "fs", None):
             raise ValueError("UpdatePlanner requires a valid fsspec filesystem (fs).")
+
+        # ---- state ----
         self.age_checker = FileAgeChecker(debug=self.debug, logger=self.logger)
         self.plan: pd.DataFrame = pd.DataFrame()
         self.df_req: pd.DataFrame = pd.DataFrame()
```
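One subtlety in the new `partition_on` default: Python's conditional expression binds more loosely than `or`, so the added line parses as `(partition_on or ["partition_date"]) if self.hive_style else ["year", "month", "day"]`. Under that reading, an explicit `partition_on` takes effect only when `hive_style=True`. A small sketch of the expression's behavior (inferred from the expression alone, not from package docs):

```python
def resolve_partition_on(partition_on, hive_style):
    # Verbatim shape of the expression in the diff; the ternary binds more
    # loosely than `or`, so it reads as:
    #   (partition_on or ["partition_date"]) if hive_style else ["year", "month", "day"]
    return list(partition_on or ["partition_date"] if hive_style else ["year", "month", "day"])

print(resolve_partition_on(None, True))    # ['partition_date']
print(resolve_partition_on(["dt"], True))  # ['dt']
print(resolve_partition_on(["dt"], False)) # ['year', 'month', 'day'] — explicit value ignored
```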
```diff
@@ -117,41 +119,20 @@ class UpdatePlanner(ManagedResource):
     # --------------------- Back-compat property bridge ---------------------
     @property
     def skipped(self) -> List[Union[str, dt.date]]:
-        """
-        Backward-compatible view of skip configuration.
-        Returns a merged list of path-strings and dates.
-        """
-        paths = sorted(self.skipped_paths)
-        dates = sorted(self.skipped_dates)
-        return [*paths, *dates]
+        return [*sorted(self.skipped_paths), *sorted(self.skipped_dates)]
 
     @skipped.setter
     def skipped(self, value: List[Union[str, dt.date]]) -> None:
-        """
-        Accepts legacy assignments like:
-            planner.skipped = ["s3://.../2025/01/03/", date(2025,1,4)]
-        and keeps new internals in sync.
-        """
-        value = list(value or [])
         self.skipped_paths = {p.rstrip("/") + "/" for p in value if isinstance(p, str)}
         self.skipped_dates = {p for p in value if isinstance(p, dt.date)}
 
     # --------------------- Public API ---------------------
-    def has_plan(self) -> bool:
-        """Check if a plan DataFrame exists and is not empty."""
-        return isinstance(self.plan, pd.DataFrame) and not self.plan.empty
-
-    def required_count(self) -> int:
-        """Get the number of dates that require an update."""
-        return len(self.df_req) if isinstance(self.df_req, pd.DataFrame) else 0
-
     def generate_plan(
         self,
         start: Union[str, dt.date, None] = None,
         end: Union[str, dt.date, None] = None,
         freq: str = "D",
     ) -> pd.DataFrame:
-        """Build a plan for [start, end]. Returns rows that require update (df_req)."""
         start = start or self.start_date
         end = end or self.end_date
         if start is None or end is None:
```
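The property bridge keeps the legacy `planner.skipped = [...]` assignment style working while the planner itself consults the derived `skipped_paths`/`skipped_dates` sets. A standalone sketch of the normalization the setter performs, using the example from the removed docstring (bucket name illustrative):

```python
import datetime as dt

# Mixed legacy value: path strings and date objects in one list.
value = ["s3://bucket/ds/2025/01/03", dt.date(2025, 1, 4)]

# Same split the setter performs: strings become canonical trailing-slash
# paths, dates go into their own set.
skipped_paths = {p.rstrip("/") + "/" for p in value if isinstance(p, str)}
skipped_dates = {p for p in value if isinstance(p, dt.date)}

print(skipped_paths)  # {'s3://bucket/ds/2025/01/03/'}
print(skipped_dates)  # {datetime.date(2025, 1, 4)}
```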
```diff
@@ -162,330 +143,196 @@ class UpdatePlanner(ManagedResource):
         if sd > ed:
             raise ValueError(f"Start date ({sd}) must be on or before end date ({ed}).")
 
-
-        self.logger.info(f"Generating update plan for {self.description} from {sd} to {ed}", extra=log_extra)
+        self.logger.info(f"Generating update plan for {self.description} from {sd} to {ed}", extra=self._log_extra())
         self._generate_plan(sd, ed, freq=freq)
-        self.logger.info(
-            f"Plan built for {self.description}: {len(self.plan)} dates evaluated, "
-            f"{len(self.df_req)} require update",
-            extra=log_extra,
-        )
         return self.df_req
 
     def show_update_plan(self) -> None:
-
-        if not self.has_plan():
-            self.logger.info("No update plan to show.", extra=self._log_extra())
+        if not self.has_plan() or self._printed_this_run:
             return
-        if self._printed_this_run:
-            return
-
         try:
             from rich.console import Console
             from rich.table import Table
-
             console = Console()
-            terminal_width = console.size.width
-
             table = Table(
-                title=f"Update Plan for {self.data_path}",
-                show_header=True,
-                header_style="bold magenta",
-                expand=True,
-                pad_edge=False,
+                title=f"Update Plan for {self.data_path} [{'Hive' if 'partition_date' in self.partition_on else 'Legacy'}]",
+                show_header=True, header_style="bold magenta", expand=True, pad_edge=False,
             )
-            max_w = max(terminal_width - 50, 640)
             for col in self.plan.columns:
-
-                    table.add_column(col, justify="left", no_wrap=True, overflow="fold", max_width=max_w)
-                elif col == "description":
-                    table.add_column(col, justify="left", overflow="fold", max_width=max_w)
-                else:
-                    table.add_column(col, justify="left", overflow="fold")
-
+                table.add_column(col, justify="left", overflow="fold")
             for _, row in self.plan.iterrows():
                 table.add_row(*(str(row[c]) for c in self.plan.columns))
-
-
-
-            self.logger.info(f"Full Update Plan:\n{cap.get().strip()}", extra=self._log_extra())
-
-        except Exception as e:
-            self.logger.debug(f"Falling back to plain text plan display due to: {e}", extra=self._log_extra())
-            preview = self.plan.head(200).to_string(index=False)
-            self.logger.info(f"Update Plan (first 200 rows):\n{preview}", extra=self._log_extra())
-
+            console.print(table)
+        except Exception:
+            self.logger.info(f"Update Plan:\n{self.plan.head(50)}", extra=self._log_extra())
         self._printed_this_run = True
 
     def get_tasks_by_priority(self) -> Iterator[Tuple[int, List[dt.date]]]:
-        """Yield (priority, [dates...]) batches, smallest priority first."""
         if not self.has_plan():
             return
         req = self.plan[self.plan["update_required"]]
-        if req.empty:
-            return
         for priority in sorted(req["update_priority"].unique()):
-
-
-
+            dates = req[req["update_priority"] == priority].sort_values(
+                by="date", ascending=not self.reverse_order
+            )["date"].tolist()
             if dates:
                 yield int(priority), dates
 
-
+    def has_plan(self) -> bool:
+        return not self.plan.empty
+
+    def required_count(self) -> int:
+        return len(self.df_req)
+
+    # --------------------- Internals ---------------------
     def _generate_plan(self, start: dt.date, end: dt.date, freq: str = "D") -> None:
-        """
-        Populate self.plan with all dates and self.df_req with the subset to update.
-        - Pre-lists months or days (configurable) with timeouts that actually apply
-        - Computes staleness from newest *data* file
-        - Flags partitions without _SUCCESS as 'incomplete' (unless disabled)
-        - Marks future dates as 'future' (not actionable)
-        """
         dates: List[dt.date] = pd.date_range(start=start, end=end, freq=freq).date.tolist()
         history_start = self.reference_date - dt.timedelta(days=self.history_days_threshold)
         rows: List[Dict[str, Any]] = []
 
-
-
-
-
-        units: List[Tuple[str, dt.date]] = []
-        if self.list_granularity == "day":
-            units = [("day", d) for d in dates]
-        else:  # Default to month
+        if "partition_date" in self.partition_on:
+            caches: Dict[dt.date, Dict[str, Any]] = self._list_prefix(self.data_path)
+        else:
+            caches: Dict[dt.date, Dict[str, Any]] = {}
             months = list(self._iter_month_starts(self._month_floor(start), self._month_floor(end)))
-
-
-
-
-
-
-
-
-
-        max_workers = max(1, self.max_threads)  # Ensure at least 1 worker
-
-        with ThreadPoolExecutor(max_workers=max_workers, thread_name_prefix="update_planner") as executor:
-            future_to_unit: Dict[Any, Tuple[str, dt.date]] = {}
-            for kind, val in units:
-                prefix = self._day_prefix(val) if kind == "day" else self._month_prefix(val)
-                future = executor.submit(self._list_prefix, prefix)
-                future_to_unit[future] = (kind, val)
-
-            # Wait for all futures with a total timeout
-            done_futures, not_done_futures = wait(future_to_unit.keys(), timeout=self.total_timeout or None)
-
-            # Process completed futures
-            for future in done_futures:
-                kind, val = future_to_unit[future]
-                try:
-                    # Get the result with a per-listing timeout
-                    cache = future.result(timeout=self.list_timeout or None)
-                except Exception as e:
-                    self.logger.warning(f"Listing failed for {kind}:{val} — {e}", extra=self._log_extra())
-                    cache = {}
-
-                if kind == "month":
-                    caches[val] = cache
-                else:  # day
-                    # Store day listing results in its month's bucket for summarization
-                    month_key = val.replace(day=1)
-                    caches.setdefault(month_key, {}).update(cache)
-
-            # Handle timed-out futures
-            for future in not_done_futures:
-                kind, val = future_to_unit[future]
-                self.logger.error(f"Listing timed out for {kind}:{val}", extra=self._log_extra())
-                if kind == "month":
-                    caches[val] = {}
-                else:  # day
-                    month_key = val.replace(day=1)
-                    caches.setdefault(month_key, {})
+            with ThreadPoolExecutor(max_workers=max(1, self.max_threads)) as ex:
+                future_to_unit = {ex.submit(self._list_prefix, self._month_prefix(m)): m for m in months}
+                done, _ = wait(future_to_unit.keys(), timeout=self.total_timeout or None)
+                for fut in done:
+                    m = future_to_unit[fut]
+                    try:
+                        caches[m] = fut.result(timeout=self.list_timeout or None)
+                    except Exception:
+                        caches[m] = {}
 
-        # --- Summarize Each Date and Build Plan ---
         for d in dates:
-            if
-                rows.append(
-                    "date": d, "file_exists": False, "file_age_minutes": None,
-                    "update_category": "future", "update_priority": self.priority_map.get("future", 99),
-                    "update_required": False, "description": self.description,
-                })
+            if d > self.reference_date:
+                rows.append(self._row_future(d))
                 continue
-
             if self._is_skipped(d):
-                self.logger.debug(f"Skipping {d}: in skipped set.", extra=self._log_extra())
-                # Append a row even for skipped dates, using default policy logic
                 rows.append(self._make_row(d, history_start, False, None))
                 continue
 
-
-            month_key = d.replace(day=1)
-            cache = caches.get(month_key, {})
+            cache = caches if "partition_date" in self.partition_on else caches.get(d.replace(day=1), {})
             exists, age_min, incomplete = self._summarize_partition(d, cache)
-
-            # Incomplete partitions get their own category (unless overwrite forces update)
             if incomplete and not self.overwrite:
-                rows.append(
-
-
-                    "update_required": True, "description": self.description,
-                })
-                continue
-
-            # Fall back to the standard policy logic (overwrite / history / staleness / missing)
-            rows.append(self._make_row(d, history_start, exists, age_min))
+                rows.append(self._row_incomplete(d, age_min))
+            else:
+                rows.append(self._make_row(d, history_start, exists, age_min))
 
-        # --- Finalize DataFrame ---
         df = pd.DataFrame.from_records(rows)
         if not df.empty:
             df["date"] = pd.to_datetime(df["date"]).dt.date
             df["update_priority"] = df["update_priority"].astype(int)
-
-        df = df.sort_values(
+            self.plan = df.sort_values(
                 by=["update_priority", "date"],
                 ascending=[True, not self.reverse_order],
-            kind="mergesort",
+                kind="mergesort",
             ).reset_index(drop=True)
+            self.df_req = self.plan[self.plan["update_required"]].copy()
 
-        self.plan = df
-        self.df_req = df[df["update_required"]].copy()
-        self._printed_this_run = False
-
-    # --------------------- File System Interaction ---------------------
     def _list_prefix(self, prefix: str) -> Dict[dt.date, Dict[str, Any]]:
-        """
-        Return {date: {'files': [paths], 'has_success': bool, 'newest_ts': datetime|None}} under prefix.
-        Uses fsspec.find(detail=True) for one-shot listing with metadata (mtime).
-        """
         try:
-            # Returns {path: info_dict} when detail=True
             items: Dict[str, Any] = self.fs.find(prefix, withdirs=False, detail=True)
-        except Exception
-            self.logger.warning(f"Listing failed for {prefix}: {e}", extra=self._log_extra())
+        except Exception:
             return {}
 
         out: Dict[dt.date, Dict[str, Any]] = {}
         for path, info in items.items():
-
-
-
-
-
-
-
-
-
+            d: Optional[dt.date] = None
+            if "partition_date" in self.partition_on:
+                parts = self._extract_partitions(path)
+                if "partition_date" in parts:
+                    try:
+                        d = dt.date.fromisoformat(parts["partition_date"])
+                    except Exception:
+                        continue
+            else:
+                segs = path.strip("/").split("/")
+                if len(segs) >= 3:
+                    try:
+                        y, m, dd = int(segs[-3]), int(segs[-2]), int(segs[-1])
+                        d = dt.date(y, m, dd)
+                    except Exception:
+                        continue
+            if d is None:
                 continue
 
-            # Initialize or get the record for this date
             rec = out.setdefault(d, {"files": [], "has_success": False, "newest_ts": None})
-
-
-            # Check for _SUCCESS marker
-            if base_name == "_SUCCESS":
+            base = path.rsplit("/", 1)[-1]
+            if base == "_SUCCESS":
                 rec["has_success"] = True
-
-            # Check if it's a relevant data file
             if self._is_data_file(path):
                 rec["files"].append(path)
-
-
-
-                if isinstance(mtime, (int, float)):
-                    ts = dt.datetime.utcfromtimestamp(mtime)
-                elif isinstance(mtime, str):
-                    try:
-                        ts = pd.to_datetime(mtime, utc=True).to_pydatetime()
-                    except Exception:
-                        ts = None
-                elif isinstance(mtime, dt.datetime):
-                    # Ensure timezone awareness for comparison
-                    ts = mtime if mtime.tzinfo else mtime.replace(tzinfo=dt.timezone.utc)
-
-                # Update the newest timestamp for this partition
-                if ts:
-                    current_newest = rec["newest_ts"]
-                    # Naive comparison after ensuring tz awareness
-                    ts_naive = ts.replace(tzinfo=None) if ts.tzinfo else ts
-                    current_naive = current_newest.replace(tzinfo=None) if current_newest and current_newest.tzinfo else current_newest
-                    if current_naive is None or ts_naive > current_naive:
-                        rec["newest_ts"] = ts
-
+                ts = self._extract_mtime(info)
+                if ts and (rec["newest_ts"] is None or ts > rec["newest_ts"]):
+                    rec["newest_ts"] = ts
         return out
 
-    def
-
-
-
-
+    def _extract_partitions(self, path: str) -> Dict[str, str]:
+        out: Dict[str, str] = {}
+        for seg in path.strip("/").split("/"):
+            m = self.HIVE_PARTITION_RE.match(seg)
+            if m:
+                out[m.group(1)] = m.group(2)
+        return out
 
-
-        Tuple[bool, Optional[float], bool]: (exists, age_minutes, incomplete)
-        - exists: True iff at least one *data* file is present for day `d`
-        - age_minutes: minutes since the NEWEST data file (UTC 'now'), or None if not determinable
-        - incomplete: True if files exist but required _SUCCESS is missing (and checks are enabled)
-        """
+    def _summarize_partition(self, d: dt.date, cache: Dict[dt.date, Dict[str, Any]]) -> Tuple[bool, Optional[float], bool]:
         rec = cache.get(d, {})
         files = rec.get("files", [])
-
-        exists = len(files) > 0
-
+        exists = bool(files)
         if not exists:
             return False, None, False
-
+        has_success = rec.get("has_success", False)
         newest_ts = rec.get("newest_ts")
-        age_min
+        age_min = None
         if newest_ts:
-
-
-            age_min = max(0.0, (
-
+            now = self._utcnow().replace(tzinfo=None)
+            ts = newest_ts.replace(tzinfo=None) if newest_ts.tzinfo else newest_ts
+            age_min = max(0.0, (now - ts).total_seconds() / 60.0)
         incomplete = self.check_completeness and self.require_success_marker and not has_success
         return exists, age_min, incomplete
 
-
-
-
-        date: dt.date,
-        history_start: dt.date,
-        file_exists: bool,
-        file_age: Optional[float],
-    ) -> Dict[str, Any]:
-        """
-        Build a single plan row based on flags and thresholds.
-        (Categories 'future'/'incomplete' are injected earlier by _generate_plan.)
-        """
-        within_history = history_start <= date <= self.reference_date
-        update_required = False
-        category = "unknown"
-
+    def _make_row(self, d: dt.date, history_start: dt.date, exists: bool, age_min: Optional[float]) -> Dict[str, Any]:
+        within_history = history_start <= d <= self.reference_date
+        category, update_required = "unknown", False
         if self.overwrite:
-            category = "overwrite_forced"
-            update_required = True
+            category, update_required = "overwrite_forced", True
         elif within_history:
-            if not
-                category = "missing_in_history"
-
-
-                category = "stale_in_history"
-                update_required = True
+            if not exists:
+                category, update_required = "missing_in_history", True
+            elif age_min is not None and age_min > self.max_age_minutes:
+                category, update_required = "stale_in_history", True
             else:
                 category = "file_is_recent"
-        elif not
-            category = "create_missing"
-            update_required = True
+        elif not exists and not self.ignore_missing:
+            category, update_required = "create_missing", True
         else:
-            category = "missing_ignored" if not
-
+            category = "missing_ignored" if not exists else "file_is_recent"
         return {
-            "date":
-            "file_exists":
-            "file_age_minutes":
+            "date": d,
+            "file_exists": exists,
+            "file_age_minutes": age_min,
             "update_category": category,
             "update_priority": self.priority_map.get(category, 99),
             "update_required": update_required,
             "description": self.description,
         }
 
+    def _row_future(self, d: dt.date) -> Dict[str, Any]:
+        return {
+            "date": d, "file_exists": False, "file_age_minutes": None,
+            "update_category": "future", "update_priority": self.priority_map.get("future", 99),
+            "update_required": False, "description": self.description,
+        }
+
+    def _row_incomplete(self, d: dt.date, age_min: Optional[float]) -> Dict[str, Any]:
+        return {
+            "date": d, "file_exists": True, "file_age_minutes": age_min,
+            "update_category": "incomplete", "update_priority": self.priority_map.get("incomplete", 1),
+            "update_required": True, "description": self.description,
+        }
+
     # --------------------- Utilities ---------------------
     @staticmethod
     def _ensure_trailing_slash(path: str) -> str:
```
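The listing fan-out is now simpler: in Hive mode the dataset root is listed once, synchronously, while the legacy layout still fans out one listing per month with a single overall `wait(...)` and a per-future `result(timeout=...)`. A minimal standalone sketch of that pattern (function and parameter names are illustrative, not the package's API):

```python
from concurrent.futures import ThreadPoolExecutor, wait

def list_all(prefixes, list_prefix, max_threads=3, total_timeout=30.0, list_timeout=30.0):
    """Sketch of the month pre-listing loop: one future per prefix, one
    overall wait, and an empty dict for any listing that raised."""
    caches = {}
    with ThreadPoolExecutor(max_workers=max(1, max_threads)) as ex:
        future_to_prefix = {ex.submit(list_prefix, p): p for p in prefixes}
        done, _ = wait(future_to_prefix.keys(), timeout=total_timeout or None)
        for fut in done:
            p = future_to_prefix[fut]
            try:
                caches[p] = fut.result(timeout=list_timeout or None)
            except Exception:
                caches[p] = {}  # failed listings degrade to "nothing found"
    return caches

fake = lambda p: {p: "listing"}  # stand-in for UpdatePlanner._list_prefix
print(list_all(["2025/08/", "2025/09/"], fake))
```

One behavioral change worth noting: futures still pending when `total_timeout` expires are now discarded (`done, _ = wait(...)`), so a slow month simply has no `caches` entry and its dates summarize as missing, whereas the old code logged each timeout explicitly.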
```diff
@@ -501,29 +348,36 @@ class UpdatePlanner(ManagedResource):
         while cur <= end:
             yield cur
             y, m = cur.year, cur.month
-
-            if m == 12:
-                cur = dt.date(y + 1, 1, 1)
-            else:
-                cur = dt.date(y, m + 1, 1)
+            cur = dt.date(y + 1, 1, 1) if m == 12 else dt.date(y, m + 1, 1)
 
     def _month_prefix(self, month_start: dt.date) -> str:
         return f"{self.data_path}{month_start.year}/{month_start.month:02d}/"
 
-    def _day_prefix(self, d: dt.date) -> str:
-        return f"{self.data_path}{d.year}/{d.month:02d}/{d.day:02d}/"
-
     def _is_data_file(self, path: str) -> bool:
         base = path.rsplit("/", 1)[-1]
-        # Skip hidden files, directories, and control files
         if not base or base.startswith(".") or base in self.CONTROL_BASENAMES:
             return False
-
-
+        return any(base.lower().endswith(suf) for suf in self.data_file_suffixes)
+
+    @staticmethod
+    def _extract_mtime(info: Dict[str, Any]) -> Optional[dt.datetime]:
+        mtime = info.get("mtime") or info.get("LastModified") or info.get("last_modified")
+        if isinstance(mtime, (int, float)):
+            return dt.datetime.utcfromtimestamp(mtime)
+        if isinstance(mtime, str):
+            try:
+                return pd.to_datetime(mtime, utc=True).to_pydatetime()
+            except Exception:
+                return None
+        if isinstance(mtime, dt.datetime):
+            return mtime if mtime.tzinfo else mtime.replace(tzinfo=dt.timezone.utc)
+        return None
 
     def _is_skipped(self, d: dt.date) -> bool:
-        ""
-
+        if "partition_date" in self.partition_on:
+            canonical_path = f"{self.data_path}partition_date={d.isoformat()}/"
+        else:
+            canonical_path = f"{self.data_path}{d.year}/{d.month:02d}/{d.day:02d}/"
         return (d in self.skipped_dates) or (canonical_path in self.skipped_paths)
 
     def _log_extra(self, **overrides) -> Dict[str, Any]:
```
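The new `_extract_mtime` helper pulls the inline timestamp normalization out of `_list_prefix`: fsspec backends report modification times variously as epoch seconds, ISO strings, or datetime objects, and the helper maps all three to a datetime (or `None`). A standalone sketch with the same branches (sample `info` dicts are illustrative; which key a given backend uses is backend-specific):

```python
import datetime as dt
import pandas as pd

def extract_mtime(info):
    # Same normalization the new helper applies: epoch seconds, ISO strings,
    # or datetime objects all come back as a datetime (tz-aware when known).
    mtime = info.get("mtime") or info.get("LastModified") or info.get("last_modified")
    if isinstance(mtime, (int, float)):
        return dt.datetime.utcfromtimestamp(mtime)
    if isinstance(mtime, str):
        try:
            return pd.to_datetime(mtime, utc=True).to_pydatetime()
        except Exception:
            return None
    if isinstance(mtime, dt.datetime):
        return mtime if mtime.tzinfo else mtime.replace(tzinfo=dt.timezone.utc)
    return None

print(extract_mtime({"mtime": 1757462400}))                    # local fs: epoch seconds
print(extract_mtime({"LastModified": "2025-09-10T00:00:00Z"})) # string timestamp
```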
```diff
@@ -534,502 +388,4 @@ class UpdatePlanner(ManagedResource):
             "action_module_name": "update_plan",
         }
         base.update(overrides)
-        return base
-
-
-# import datetime as dt
-# from concurrent.futures import ThreadPoolExecutor, wait
-# from typing import List, Optional, Dict, Union, Tuple, Set, Iterator, ClassVar
-#
-# import pandas as pd
-#
-# from sibi_dst.utils import ManagedResource
-# from . import FileAgeChecker
-#
-#
-# class UpdatePlanner(ManagedResource):
-#     """
-#     Scans date-partitioned storage and builds an 'update plan' for dates that need processing.
-#     Backward compatible: public API and legacy attributes preserved; enhancements are opt-in via kwargs.
-#
-#     Enhancements:
-#     - Batch listings via fsspec.find(..., detail=True) to avoid N×exists() roundtrips.
-#     - Age computed from the NEWEST data file (ignoring control files).
-#     - Optional completeness check: partitions with files but no _SUCCESS => 'incomplete'.
-#     - Real timeouts using concurrent.futures.wait(...).
-#     - Future dates marked as 'future' (not actionable).
-#     """
[... ~470 further deleted lines: the remainder of this commented-out copy of the legacy UpdatePlanner implementation (its __init__, skipped property bridge, generate_plan, show_update_plan, get_tasks_by_priority, _list_prefix, _summarize_partition, _generate_plan, and _make_row), truncated here ...]
+        return base
```

The bulk of this final hunk is the removal of the commented-out legacy implementation that had been left at the bottom of the module in 2025.9.8, which accounts for most of the file's net shrinkage (+161 −805).
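Taken together, the public flow is unchanged: build a plan for a date window, then drain it in priority batches. A hypothetical end-to-end sketch (the import path is inferred from the file location; it assumes ManagedResource forwards an fsspec filesystem via the `fs` keyword, which the diff implies but this section does not show; bucket paths and dates are illustrative):

```python
import fsspec
from sibi_dst.utils.update_planner import UpdatePlanner

fs = fsspec.filesystem("memory")
# Fake a Hive-layout data file so the planner has something to find.
fs.pipe("/ds/partition_date=2025-09-01/part-0.parquet", b"")

planner = UpdatePlanner(
    "memory://ds/",
    hive_style=True,               # selects partition_date=YYYY-MM-DD parsing
    fs=fs,                         # assumption: accepted by ManagedResource
    parquet_start_date="2025-09-01",
    parquet_end_date="2025-09-10",
)
planner.generate_plan()            # returns the rows that require an update
planner.show_update_plan()         # rich table, or a plain head() fallback
for priority, dates in planner.get_tasks_by_priority():
    print(priority, dates)         # smallest priority first
```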