sibi-dst 2025.9.3-py3-none-any.whl → 2025.9.4-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sibi_dst/__init__.py +6 -4
- sibi_dst/df_helper/__init__.py +1 -0
- sibi_dst/df_helper/_parquet_artifact.py +533 -113
- sibi_dst/df_helper/backends/parquet/_parquet_options.py +1 -281
- sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +349 -142
- sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +17 -0
- sibi_dst/utils/data_wrapper.py +460 -61
- sibi_dst/utils/parquet_saver.py +403 -161
- sibi_dst/utils/update_planner.py +553 -319
- sibi_dst/utils/write_gatekeeper.py +18 -0
- {sibi_dst-2025.9.3.dist-info → sibi_dst-2025.9.4.dist-info}/METADATA +2 -2
- {sibi_dst-2025.9.3.dist-info → sibi_dst-2025.9.4.dist-info}/RECORD +13 -12
- {sibi_dst-2025.9.3.dist-info → sibi_dst-2025.9.4.dist-info}/WHEEL +0 -0
sibi_dst/utils/update_planner.py
CHANGED
@@ -1,11 +1,14 @@
+# update_planner.py
+from __future__ import annotations
+
 import datetime as dt
 from concurrent.futures import ThreadPoolExecutor, wait
-from typing import List, Optional, Dict, Union, Tuple, Set, Iterator, ClassVar
+from typing import List, Optional, Dict, Union, Tuple, Set, Iterator, ClassVar, Any, Callable

 import pandas as pd

 from sibi_dst.utils import ManagedResource
-from . import FileAgeChecker
+from . import FileAgeChecker  # Assuming FileAgeChecker is in the same package


 class UpdatePlanner(ManagedResource):
@@ -26,11 +29,11 @@ class UpdatePlanner(ManagedResource):
         "file_is_recent": 0,
         "missing_ignored": 0,
         "overwrite_forced": 1,
-        "incomplete": 1,
+        "incomplete": 1,  # new: prioritize just under overwrite
         "create_missing": 2,
         "missing_in_history": 3,
         "stale_in_history": 4,
-        "future": 99,
+        "future": 99,  # new: not actionable
     }

     DEFAULT_MAX_AGE_MINUTES: int = 1440
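The `incomplete` and `future` categories slot into the existing ordering, where a lower number is dispatched first and 99 marks rows that are never actionable. A minimal standalone sketch of how that ordering plays out (the map mirrors the hunk above; the consuming loop is illustrative, not the package's code):

    # Lower priority number = handled first; 99 is effectively "skip".
    priority_map = {
        "file_is_recent": 0,
        "missing_ignored": 0,
        "overwrite_forced": 1,
        "incomplete": 1,   # files present but _SUCCESS missing: retried early
        "create_missing": 2,
        "missing_in_history": 3,
        "stale_in_history": 4,
        "future": 99,      # dates after the reference date: never scheduled
    }

    categories = ["stale_in_history", "future", "incomplete", "create_missing"]
    for cat in sorted(categories, key=lambda c: priority_map.get(c, 99)):
        print(cat)
    # -> incomplete, create_missing, stale_in_history, future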
@@ -60,63 +63,60 @@ class UpdatePlanner(ManagedResource):
     ):
         super().__init__(**kwargs)

-        # ----
-        self.description = description
-        self.data_path = self._ensure_trailing_slash(parquet_storage_path)
-        self.filename = parquet_filename
-        self.reverse_order = reverse_order
-        self.show_progress = show_progress
-        self.overwrite = overwrite
-        self.ignore_missing = ignore_missing
-        self.history_days_threshold = history_days_threshold
-        self.max_age_minutes = max_age_minutes
-        #
-        self.priority_map = dict(custom_priority_map) if custom_priority_map else dict(self.DEFAULT_PRIORITY_MAP)
-
-        # Execution
+        # ---- Core Configuration ----
+        self.description: str = description
+        self.data_path: str = self._ensure_trailing_slash(parquet_storage_path)
+        self.filename: str = parquet_filename
+        self.reverse_order: bool = reverse_order
+        self.show_progress: bool = show_progress
+        self.overwrite: bool = overwrite
+        self.ignore_missing: bool = ignore_missing
+        self.history_days_threshold: int = history_days_threshold
+        self.max_age_minutes: int = max_age_minutes
+        # Copy to avoid shared mutation
+        self.priority_map: Dict[str, int] = dict(custom_priority_map) if custom_priority_map else dict(self.DEFAULT_PRIORITY_MAP)
+
+        # ---- Execution Parameters ----
         self.max_threads: int = int(kwargs.get("max_threads", 3))
         self.timeout: float = float(kwargs.get("timeout", 30.0))  # legacy overall timeout

-        # Date
+        # ---- Date Window ----
         self.start_date = kwargs.get("parquet_start_date")
         self.end_date = kwargs.get("parquet_end_date")

-        # Reference
-
-
-
-
-        self.plan: pd.DataFrame = pd.DataFrame()
-        self.df_req: pd.DataFrame = pd.DataFrame()
-        self._printed_this_run: bool = False
+        # ---- Reference Date ----
+        if reference_date is not None:
+            self.reference_date: dt.date = pd.to_datetime(reference_date).date()
+        else:
+            self.reference_date: dt.date = dt.date.today()

-        # ----
-        # Completeness check via _SUCCESS
+        # ---- Feature Flags / Advanced Knobs ----
         self.check_completeness: bool = bool(kwargs.get("check_completeness", True))
         self.require_success_marker: bool = bool(kwargs.get("require_success_marker", True))
-        # Listing granularity: 'month' (default) or 'day'
         self.list_granularity: str = str(kwargs.get("list_granularity", "month"))
-        # Data file suffixes to consider for age (default common formats)
         self.data_file_suffixes: Tuple[str, ...] = tuple(kwargs.get("data_file_suffixes", self.DATA_FILE_PATTERNS))
-
-        self.
-        self.total_timeout: float = float(kwargs.get("total_timeout", self.timeout))  # across all listings
+        self.list_timeout: float = float(kwargs.get("list_timeout", self.timeout))
+        self.total_timeout: float = float(kwargs.get("total_timeout", self.timeout))
         # Dependency-injected clock (UTC) for tests
-        self._utcnow = kwargs.get("utcnow_func", None) or (lambda: dt.datetime.utcnow())
+        self._utcnow: Callable[[], dt.datetime] = kwargs.get("utcnow_func", None) or (lambda: dt.datetime.utcnow())

-        #
+        # ---- Backward-Compatible Skip Handling ----
         # Keep legacy attribute and derive new internal canonical sets.
         self.skipped = list(skipped or kwargs.get("skipped", []) or [])
-        self.skipped_paths = {p.rstrip("/") + "/" for p in self.skipped if isinstance(p, str)}
-        self.skipped_dates = {p for p in self.skipped if isinstance(p, dt.date)}
+        self.skipped_paths: Set[str] = {p.rstrip("/") + "/" for p in self.skipped if isinstance(p, str)}
+        self.skipped_dates: Set[dt.date] = {p for p in self.skipped if isinstance(p, dt.date)}

-        #
+        # ---- Helpers & State ----
         if not getattr(self, "fs", None):
             raise ValueError("UpdatePlanner requires a valid fsspec filesystem (fs).")
+        self.age_checker = FileAgeChecker(debug=self.debug, logger=self.logger)
+        self.plan: pd.DataFrame = pd.DataFrame()
+        self.df_req: pd.DataFrame = pd.DataFrame()
+        self._printed_this_run: bool = False

     # --------------------- Back-compat property bridge ---------------------
     @property
-    def skipped(self) -> List[Union[str, dt.date]]:
+    def skipped(self) -> List[Union[str, dt.date]]:
         """
         Backward-compatible view of skip configuration.
         Returns a merged list of path-strings and dates.
@@ -126,7 +126,7 @@ class UpdatePlanner(ManagedResource):
         return [*paths, *dates]

     @skipped.setter
-    def skipped(self, value: List[Union[str, dt.date]]) -> None:
+    def skipped(self, value: List[Union[str, dt.date]]) -> None:
         """
         Accepts legacy assignments like:
             planner.skipped = ["s3://.../2025/01/03/", date(2025,1,4)]
@@ -136,14 +136,15 @@ class UpdatePlanner(ManagedResource):
         self.skipped_paths = {p.rstrip("/") + "/" for p in value if isinstance(p, str)}
         self.skipped_dates = {p for p in value if isinstance(p, dt.date)}

-    # ---------------------
+    # --------------------- Public API ---------------------
     def has_plan(self) -> bool:
+        """Check if a plan DataFrame exists and is not empty."""
         return isinstance(self.plan, pd.DataFrame) and not self.plan.empty

     def required_count(self) -> int:
-
+        """Get the number of dates that require an update."""
+        return len(self.df_req) if isinstance(self.df_req, pd.DataFrame) else 0

-    # --------------------- core API (kept) ---------------------
     def generate_plan(
         self,
         start: Union[str, dt.date, None] = None,
@@ -161,20 +162,18 @@ class UpdatePlanner(ManagedResource):
         if sd > ed:
             raise ValueError(f"Start date ({sd}) must be on or before end date ({ed}).")

-        self.logger.info(
-            f"Generating update plan for {self.description} from {sd} to {ed}",
-            extra=self._log_extra(),
-        )
+        log_extra = self._log_extra()
+        self.logger.info(f"Generating update plan for {self.description} from {sd} to {ed}", extra=log_extra)
         self._generate_plan(sd, ed, freq=freq)
         self.logger.info(
             f"Plan built for {self.description}: {len(self.plan)} dates evaluated, "
             f"{len(self.df_req)} require update",
-            extra=self._log_extra(),
+            extra=log_extra,
         )
         return self.df_req

     def show_update_plan(self) -> None:
-        """Pretty-print the current plan once per run
+        """Pretty-print the current plan once per run."""
         if not self.has_plan():
             self.logger.info("No update plan to show.", extra=self._log_extra())
             return
@@ -185,14 +184,14 @@ class UpdatePlanner(ManagedResource):
             from rich.console import Console
             from rich.table import Table

-            console = Console()  # auto-detect terminal size
+            console = Console()
             terminal_width = console.size.width

             table = Table(
                 title=f"Update Plan for {self.data_path}",
                 show_header=True,
                 header_style="bold magenta",
-                expand=True,  # fill available width
+                expand=True,
                 pad_edge=False,
             )
             max_w = max(terminal_width - 50, 640)
@@ -200,7 +199,6 @@ class UpdatePlanner(ManagedResource):
             if col in {"date", "update_category", "update_priority", "update_required", "file_exists"}:
                 table.add_column(col, justify="left", no_wrap=True, overflow="fold", max_width=max_w)
             elif col == "description":
-                # Let description wrap, but set a max width to avoid huge columns
                 table.add_column(col, justify="left", overflow="fold", max_width=max_w)
             else:
                 table.add_column(col, justify="left", overflow="fold")
@@ -208,12 +206,12 @@ class UpdatePlanner(ManagedResource):
         for _, row in self.plan.iterrows():
             table.add_row(*(str(row[c]) for c in self.plan.columns))

-            # Capture with the same console so width stays consistent
             with console.capture() as cap:
                 console.print(table)
             self.logger.info(f"Full Update Plan:\n{cap.get().strip()}", extra=self._log_extra())

-        except Exception:
+        except Exception as e:
+            self.logger.debug(f"Falling back to plain text plan display due to: {e}", extra=self._log_extra())
             preview = self.plan.head(200).to_string(index=False)
             self.logger.info(f"Update Plan (first 200 rows):\n{preview}", extra=self._log_extra())

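The fallback above now logs why rich rendering failed before printing the plain-text preview. For reference, a minimal sketch of the capture-to-logger pattern the method relies on (standard rich API; the table contents here are made up):

    from rich.console import Console
    from rich.table import Table

    table = Table(title="Update Plan", show_header=True, header_style="bold magenta")
    table.add_column("date", no_wrap=True)
    table.add_column("update_category")
    table.add_row("2025-01-03", "stale_in_history")

    console = Console()
    with console.capture() as cap:   # render to a string instead of stdout
        console.print(table)
    print(cap.get().strip())         # the planner sends this to logger.info(...)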
@@ -233,121 +231,7 @@ class UpdatePlanner(ManagedResource):
             if dates:
                 yield int(priority), dates

-    # ---------------------
-    @staticmethod
-    def _ensure_trailing_slash(path: str) -> str:
-        return path.rstrip("/") + "/"
-
-    @staticmethod
-    def _month_floor(d: dt.date) -> dt.date:
-        return d.replace(day=1)
-
-    @staticmethod
-    def _iter_month_starts(start: dt.date, end: dt.date) -> Iterator[dt.date]:
-        cur = start.replace(day=1)
-        while cur <= end:
-            yield cur
-            y, m = cur.year, cur.month
-            cur = dt.date(y + (m == 12), 1 if m == 12 else m + 1, 1)
-
-    def _month_prefix(self, month_start: dt.date) -> str:
-        return f"{self.data_path}{month_start.year}/{month_start.month:02d}/"
-
-    def _day_prefix(self, d: dt.date) -> str:
-        return f"{self.data_path}{d.year}/{d.month:02d}/{d.day:02d}/"
-
-    def _log_extra(self, **overrides) -> dict:
-        base = {
-            "sibi_dst_component": __name__,
-            "date_of_update": self.reference_date.strftime("%Y-%m-%d"),
-            "dataclass": self.description,
-            "action_module_name": "update_plan",
-        }
-        base.update(overrides)
-        return base
-
-    def _is_data_file(self, path: str) -> bool:
-        base = path.rsplit("/", 1)[-1]
-        if not base or base.startswith(".") or base in self.CONTROL_BASENAMES:
-            return False
-        lower = base.lower()
-        return any(lower.endswith(suf) for suf in self.data_file_suffixes)
-
-    def _is_skipped(self, d: dt.date) -> bool:
-        """True if the date or its canonical path is in the skip config."""
-        just_path = f"{self.data_path}{d.year}/{d.month:02d}/{d.day:02d}/"
-        return (d in self.skipped_dates) or (just_path in self.skipped_paths)
-
-    def _list_prefix(self, prefix: str) -> Dict[dt.date, Dict[str, object]]:
-        """
-        Return {date: {'files': [paths], 'has_success': bool, 'newest_ts': datetime|None}} under prefix.
-        Uses fsspec.find(detail=True) for one-shot listing with metadata (mtime).
-        """
-        try:
-            items = self.fs.find(prefix, withdirs=False, detail=True)  # returns {path: info} when detail=True
-        except Exception as e:
-            self.logger.warning(f"Listing failed for {prefix}: {e}", extra=self._log_extra())
-            return {}
-
-        out: Dict[dt.date, Dict[str, object]] = {}
-        for path, info in items.items():
-            parts = path.strip("/").split("/")
-            if len(parts) < 4:
-                continue
-            try:
-                y, m, dd = int(parts[-4]), int(parts[-3]), int(parts[-2])
-                d = dt.date(y, m, dd)
-            except Exception:
-                continue
-
-            rec = out.setdefault(d, {"files": [], "has_success": False, "newest_ts": None})
-            base = path.rsplit("/", 1)[-1]
-            if base == "_SUCCESS":
-                rec["has_success"] = True
-
-            if self._is_data_file(path):
-                rec["files"].append(path)
-                mtime = info.get("mtime") or info.get("LastModified") or info.get("last_modified")
-                ts = None
-                if isinstance(mtime, (int, float)):
-                    ts = dt.datetime.utcfromtimestamp(mtime)
-                elif isinstance(mtime, str):
-                    try:
-                        ts = pd.to_datetime(mtime, utc=True).to_pydatetime()
-                    except Exception:
-                        ts = None
-                elif isinstance(mtime, dt.datetime):
-                    ts = mtime if mtime.tzinfo else mtime.replace(tzinfo=dt.timezone.utc)
-                if ts:
-                    cur = rec["newest_ts"]
-                    rec["newest_ts"] = ts if (cur is None or ts > cur) else cur
-        return out
-
-    def _summarize_partition(
-        self, d: dt.date, cache: Dict[dt.date, Dict[str, object]]
-    ) -> Tuple[bool, Optional[float], bool]:
-        """
-        (exists, age_minutes, incomplete)
-        - exists: True iff at least one *data* file is present for day `d`
-        - age_minutes: minutes since the NEWEST data file (UTC 'now')
-        - incomplete: True if files exist but required _SUCCESS is missing
-        """
-        rec = cache.get(d, {})
-        files = rec.get("files", [])
-        has_success = bool(rec.get("has_success", False))
-        exists = len(files) > 0
-        if not exists:
-            return False, None, False
-        newest_ts = rec.get("newest_ts")
-        if newest_ts:
-            now_utc = self._utcnow().replace(tzinfo=None)
-            ts_naive = newest_ts.replace(tzinfo=None) if newest_ts.tzinfo else newest_ts
-            age_min = max(0.0, (now_utc - ts_naive).total_seconds() / 60.0)
-        else:
-            age_min = None
-        incomplete = self.check_completeness and self.require_success_marker and not has_success
-        return True, age_min, incomplete
-
+    # --------------------- Plan Generation Internals ---------------------
     def _generate_plan(self, start: dt.date, end: dt.date, freq: str = "D") -> None:
         """
         Populate self.plan with all dates and self.df_req with the subset to update.
@@ -358,15 +242,16 @@ class UpdatePlanner(ManagedResource):
         """
         dates: List[dt.date] = pd.date_range(start=start, end=end, freq=freq).date.tolist()
         history_start = self.reference_date - dt.timedelta(days=self.history_days_threshold)
-        rows: List[Dict] = []
+        rows: List[Dict[str, Any]] = []

         def is_future(d: dt.date) -> bool:
             return d > self.reference_date

         # Choose listing units
+        units: List[Tuple[str, dt.date]] = []
         if self.list_granularity == "day":
-            units
-        else:
+            units = [("day", d) for d in dates]
+        else:  # Default to month
             months = list(self._iter_month_starts(self._month_floor(start), self._month_floor(end)))
             units = [("month", m) for m in months]

@@ -375,37 +260,48 @@ class UpdatePlanner(ManagedResource):
             extra=self._log_extra(),
         )

-        # Parallel
-        caches: Dict[dt.date, Dict[dt.date, Dict[str,
-        max_workers = max(1,
-
-
+        # --- Parallel File Listing with Realistic Timeouts ---
+        caches: Dict[dt.date, Dict[dt.date, Dict[str, Any]]] = {}
+        max_workers = max(1, self.max_threads)  # Ensure at least 1 worker
+
+        with ThreadPoolExecutor(max_workers=max_workers, thread_name_prefix="update_planner") as executor:
+            future_to_unit: Dict[Any, Tuple[str, dt.date]] = {}
             for kind, val in units:
                 prefix = self._day_prefix(val) if kind == "day" else self._month_prefix(val)
-
-
-
-
+                future = executor.submit(self._list_prefix, prefix)
+                future_to_unit[future] = (kind, val)
+
+            # Wait for all futures with a total timeout
+            done_futures, not_done_futures = wait(future_to_unit.keys(), timeout=self.total_timeout or None)
+
+            # Process completed futures
+            for future in done_futures:
+                kind, val = future_to_unit[future]
                 try:
-
+                    # Get the result with a per-listing timeout
+                    cache = future.result(timeout=self.list_timeout or None)
                 except Exception as e:
                     self.logger.warning(f"Listing failed for {kind}:{val} — {e}", extra=self._log_extra())
                     cache = {}
+
                 if kind == "month":
                     caches[val] = cache
-                else:
-                    # day
-
-                    caches.setdefault(
-
-
+                else:  # day
+                    # Store day listing results in its month's bucket for summarization
+                    month_key = val.replace(day=1)
+                    caches.setdefault(month_key, {}).update(cache)
+
+            # Handle timed-out futures
+            for future in not_done_futures:
+                kind, val = future_to_unit[future]
                 self.logger.error(f"Listing timed out for {kind}:{val}", extra=self._log_extra())
                 if kind == "month":
                     caches[val] = {}
-                else:
-
+                else:  # day
+                    month_key = val.replace(day=1)
+                    caches.setdefault(month_key, {})

-        # Summarize
+        # --- Summarize Each Date and Build Plan ---
         for d in dates:
             if is_future(d):
                 rows.append({
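This hunk is the heart of the release: listings are fanned out to a thread pool, `concurrent.futures.wait` bounds the whole batch, and `future.result(timeout=...)` bounds each retrieval. A self-contained sketch of that double-timeout pattern, with illustrative names rather than the package's own:

    from concurrent.futures import ThreadPoolExecutor, wait

    def list_all(prefixes, list_fn, max_threads=3, total_timeout=30.0, list_timeout=5.0):
        """Run list_fn over prefixes with an overall deadline plus a per-result timeout."""
        results = {}
        with ThreadPoolExecutor(max_workers=max(1, max_threads)) as ex:
            futures = {ex.submit(list_fn, p): p for p in prefixes}
            # Bound the whole batch: anything unfinished lands in not_done.
            done, not_done = wait(futures, timeout=total_timeout or None)
            for f in done:
                try:
                    results[futures[f]] = f.result(timeout=list_timeout or None)
                except Exception:
                    results[futures[f]] = {}   # listing failed: treat as empty
            for f in not_done:
                results[futures[f]] = {}       # timed out: treat as empty
        return results

    print(list_all(["2025/01/", "2025/02/"], lambda p: {p: "ok"}))

One caveat of the pattern (shared by the code above): leaving the `with` block still joins the timed-out workers, so the deadline caps result collection rather than thread lifetime.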
@@ -417,14 +313,16 @@ class UpdatePlanner(ManagedResource):

             if self._is_skipped(d):
                 self.logger.debug(f"Skipping {d}: in skipped set.", extra=self._log_extra())
+                # Append a row even for skipped dates, using default policy logic
                 rows.append(self._make_row(d, history_start, False, None))
                 continue

+            # Get the cache for the month containing this date
             month_key = d.replace(day=1)
             cache = caches.get(month_key, {})
             exists, age_min, incomplete = self._summarize_partition(d, cache)

-            # Incomplete partitions get their own category (unless overwrite)
+            # Incomplete partitions get their own category (unless overwrite forces update)
             if incomplete and not self.overwrite:
                 rows.append({
                     "date": d, "file_exists": True, "file_age_minutes": age_min,
@@ -433,9 +331,10 @@ class UpdatePlanner(ManagedResource):
                 })
                 continue

-            # Fall back to
+            # Fall back to the standard policy logic (overwrite / history / staleness / missing)
             rows.append(self._make_row(d, history_start, exists, age_min))

+        # --- Finalize DataFrame ---
         df = pd.DataFrame.from_records(rows)
         if not df.empty:
             df["date"] = pd.to_datetime(df["date"]).dt.date
@@ -444,27 +343,120 @@ class UpdatePlanner(ManagedResource):
             df = df.sort_values(
                 by=["update_priority", "date"],
                 ascending=[True, not self.reverse_order],
-                kind="mergesort",
+                kind="mergesort",  # Stable sort
             ).reset_index(drop=True)

         self.plan = df
         self.df_req = df[df["update_required"]].copy()
         self._printed_this_run = False

-    # ---------------------
+    # --------------------- File System Interaction ---------------------
+    def _list_prefix(self, prefix: str) -> Dict[dt.date, Dict[str, Any]]:
+        """
+        Return {date: {'files': [paths], 'has_success': bool, 'newest_ts': datetime|None}} under prefix.
+        Uses fsspec.find(detail=True) for one-shot listing with metadata (mtime).
+        """
+        try:
+            # Returns {path: info_dict} when detail=True
+            items: Dict[str, Any] = self.fs.find(prefix, withdirs=False, detail=True)
+        except Exception as e:
+            self.logger.warning(f"Listing failed for {prefix}: {e}", extra=self._log_extra())
+            return {}
+
+        out: Dict[dt.date, Dict[str, Any]] = {}
+        for path, info in items.items():
+            # Extract date from path structure (e.g., .../YYYY/MM/DD/file)
+            parts = path.strip("/").split("/")
+            if len(parts) < 3:  # Need at least year, month, day
+                continue
+            try:
+                y, m, dd = int(parts[-3]), int(parts[-2]), int(parts[-1])
+                d = dt.date(y, m, dd)
+            except (ValueError, IndexError):
+                # Not a date-partitioned path, skip
+                continue
+
+            # Initialize or get the record for this date
+            rec = out.setdefault(d, {"files": [], "has_success": False, "newest_ts": None})
+            base_name = path.rsplit("/", 1)[-1]
+
+            # Check for _SUCCESS marker
+            if base_name == "_SUCCESS":
+                rec["has_success"] = True
+
+            # Check if it's a relevant data file
+            if self._is_data_file(path):
+                rec["files"].append(path)
+                # Determine the modification time
+                mtime = info.get("mtime") or info.get("LastModified") or info.get("last_modified")
+                ts = None
+                if isinstance(mtime, (int, float)):
+                    ts = dt.datetime.utcfromtimestamp(mtime)
+                elif isinstance(mtime, str):
+                    try:
+                        ts = pd.to_datetime(mtime, utc=True).to_pydatetime()
+                    except Exception:
+                        ts = None
+                elif isinstance(mtime, dt.datetime):
+                    # Ensure timezone awareness for comparison
+                    ts = mtime if mtime.tzinfo else mtime.replace(tzinfo=dt.timezone.utc)
+
+                # Update the newest timestamp for this partition
+                if ts:
+                    current_newest = rec["newest_ts"]
+                    # Naive comparison after ensuring tz awareness
+                    ts_naive = ts.replace(tzinfo=None) if ts.tzinfo else ts
+                    current_naive = current_newest.replace(tzinfo=None) if current_newest and current_newest.tzinfo else current_newest
+                    if current_naive is None or ts_naive > current_naive:
+                        rec["newest_ts"] = ts
+
+        return out
+
+    def _summarize_partition(
+        self, d: dt.date, cache: Dict[dt.date, Dict[str, Any]]
+    ) -> Tuple[bool, Optional[float], bool]:
+        """
+        Summarize the state of a partition for a given date.
+
+        Returns:
+            Tuple[bool, Optional[float], bool]: (exists, age_minutes, incomplete)
+            - exists: True iff at least one *data* file is present for day `d`
+            - age_minutes: minutes since the NEWEST data file (UTC 'now'), or None if not determinable
+            - incomplete: True if files exist but required _SUCCESS is missing (and checks are enabled)
+        """
+        rec = cache.get(d, {})
+        files = rec.get("files", [])
+        has_success = bool(rec.get("has_success", False))
+        exists = len(files) > 0
+
+        if not exists:
+            return False, None, False
+
+        newest_ts = rec.get("newest_ts")
+        age_min: Optional[float] = None
+        if newest_ts:
+            now_utc = self._utcnow().replace(tzinfo=None)  # Get current UTC time (naive)
+            ts_naive = newest_ts.replace(tzinfo=None) if newest_ts.tzinfo else newest_ts  # Make mtime naive
+            age_min = max(0.0, (now_utc - ts_naive).total_seconds() / 60.0)

+        incomplete = self.check_completeness and self.require_success_marker and not has_success
+        return exists, age_min, incomplete
+
+    # --------------------- Policy Logic ---------------------
     def _make_row(
         self,
         date: dt.date,
         history_start: dt.date,
         file_exists: bool,
         file_age: Optional[float],
-    ) -> Dict:
+    ) -> Dict[str, Any]:
         """
         Build a single plan row based on flags and thresholds.
-        (Categories 'future'/'incomplete' are injected earlier.)
+        (Categories 'future'/'incomplete' are injected earlier by _generate_plan.)
         """
         within_history = history_start <= date <= self.reference_date
         update_required = False
+        category = "unknown"

         if self.overwrite:
             category = "overwrite_forced"
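The relocated `_list_prefix` keeps the one-shot `fs.find(..., detail=True)` listing that replaces per-date `exists()` roundtrips. A small sketch of that fsspec call on a local filesystem (the path is hypothetical; any fsspec backend returns the same `{path: info}` shape, though the mtime key varies by backend):

    import fsspec

    # Hypothetical local layout: /tmp/warehouse/YYYY/MM/DD/part-0.parquet
    fs = fsspec.filesystem("file")
    items = fs.find("/tmp/warehouse/2025/01/", withdirs=False, detail=True)
    for path, info in items.items():
        # The modification-time key differs per backend ("mtime" locally,
        # "LastModified" on S3), which is why the planner probes several names.
        print(path, info.get("size"), info.get("mtime") or info.get("LastModified"))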
@@ -486,16 +478,67 @@ class UpdatePlanner(ManagedResource):

         return {
             "date": date,
-            "file_exists":
+            "file_exists": file_exists,
             "file_age_minutes": file_age,
             "update_category": category,
             "update_priority": self.priority_map.get(category, 99),
-            "update_required":
+            "update_required": update_required,
             "description": self.description,
         }

+    # --------------------- Utilities ---------------------
+    @staticmethod
+    def _ensure_trailing_slash(path: str) -> str:
+        return path.rstrip("/") + "/"
+
+    @staticmethod
+    def _month_floor(d: dt.date) -> dt.date:
+        return d.replace(day=1)
+
+    @staticmethod
+    def _iter_month_starts(start: dt.date, end: dt.date) -> Iterator[dt.date]:
+        cur = start.replace(day=1)
+        while cur <= end:
+            yield cur
+            y, m = cur.year, cur.month
+            # Move to the first day of the next month
+            if m == 12:
+                cur = dt.date(y + 1, 1, 1)
+            else:
+                cur = dt.date(y, m + 1, 1)
+
+    def _month_prefix(self, month_start: dt.date) -> str:
+        return f"{self.data_path}{month_start.year}/{month_start.month:02d}/"
+
+    def _day_prefix(self, d: dt.date) -> str:
+        return f"{self.data_path}{d.year}/{d.month:02d}/{d.day:02d}/"
+
+    def _is_data_file(self, path: str) -> bool:
+        base = path.rsplit("/", 1)[-1]
+        # Skip hidden files, directories, and control files
+        if not base or base.startswith(".") or base in self.CONTROL_BASENAMES:
+            return False
+        lower_base = base.lower()
+        return any(lower_base.endswith(suf) for suf in self.data_file_suffixes)
+
+    def _is_skipped(self, d: dt.date) -> bool:
+        """True if the date or its canonical path is in the skip config."""
+        canonical_path = f"{self.data_path}{d.year}/{d.month:02d}/{d.day:02d}/"
+        return (d in self.skipped_dates) or (canonical_path in self.skipped_paths)
+
+    def _log_extra(self, **overrides) -> Dict[str, Any]:
+        base = {
+            "sibi_dst_component": self.logger_extra.get("sibi_dst_component", "warehouse.update_planner"),
+            "date_of_update": self.reference_date.strftime("%Y-%m-%d"),
+            "dataclass": self.description,
+            "action_module_name": "update_plan",
+        }
+        base.update(overrides)
+        return base
+
+
 # import datetime as dt
-# from concurrent.futures import ThreadPoolExecutor,
+# from concurrent.futures import ThreadPoolExecutor, wait
 # from typing import List, Optional, Dict, Union, Tuple, Set, Iterator, ClassVar
 #
 # import pandas as pd
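The active class ends with the utilities above; everything below this point is the previous version of the module, retained as comments. For orientation, a hedged usage sketch: the constructor keywords are taken from this diff, but the exact wiring of ManagedResource kwargs such as `fs` and `logger` may differ in practice.

    import fsspec
    from sibi_dst.utils.update_planner import UpdatePlanner

    fs = fsspec.filesystem("s3")  # hypothetical: any fsspec filesystem works
    planner = UpdatePlanner(
        description="daily_sales",
        parquet_storage_path="s3://bucket/daily_sales/",
        parquet_filename="data.parquet",
        fs=fs,
        parquet_start_date="2025-01-01",
        parquet_end_date="2025-01-31",
        list_granularity="month",   # or "day"
        total_timeout=60.0,
        list_timeout=10.0,
    )
    df_req = planner.generate_plan()   # DataFrame of dates needing an update
    planner.show_update_plan()
    for priority, dates in planner.get_tasks_by_priority():
        ...  # dispatch work, lowest priority number first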
@@ -507,21 +550,35 @@ class UpdatePlanner(ManagedResource):
 # class UpdatePlanner(ManagedResource):
 #     """
 #     Scans date-partitioned storage and builds an 'update plan' for dates that need processing.
-#
-#
+#     Backward compatible: public API and legacy attributes preserved; enhancements are opt-in via kwargs.
+#
+#     Enhancements:
+#       - Batch listings via fsspec.find(..., detail=True) to avoid N×exists() roundtrips.
+#       - Age computed from the NEWEST data file (ignoring control files).
+#       - Optional completeness check: partitions with files but no _SUCCESS => 'incomplete'.
+#       - Real timeouts using concurrent.futures.wait(...).
+#       - Future dates marked as 'future' (not actionable).
 #     """
 #
+#     # -------- Defaults (extended, but original keys retained) --------
 #     DEFAULT_PRIORITY_MAP: ClassVar[Dict[str, int]] = {
 #         "file_is_recent": 0,
 #         "missing_ignored": 0,
 #         "overwrite_forced": 1,
+#         "incomplete": 1,  # new: prioritize just under overwrite
 #         "create_missing": 2,
 #         "missing_in_history": 3,
 #         "stale_in_history": 4,
+#         "future": 99,  # new: not actionable
 #     }
 #
 #     DEFAULT_MAX_AGE_MINUTES: int = 1440
 #     DEFAULT_HISTORY_DAYS_THRESHOLD: int = 30
+#
+#     # Data/Control file heuristics (can be overridden)
+#     DATA_FILE_PATTERNS: ClassVar[Tuple[str, ...]] = (".parquet", ".orc", ".csv", ".json")
+#     CONTROL_BASENAMES: ClassVar[Set[str]] = {"_SUCCESS", "_metadata", "_common_metadata"}
+#
 #     logger_extra = {"sibi_dst_component": __name__}
 #
 #     def __init__(
@@ -537,12 +594,12 @@ class UpdatePlanner(ManagedResource):
 #         custom_priority_map: Optional[Dict[str, int]] = None,
 #         reverse_order: bool = False,
 #         show_progress: bool = False,
-#         skipped: Optional[List[str]] = None,
+#         skipped: Optional[List[Union[str, dt.date]]] = None,
 #         **kwargs,
 #     ):
 #         super().__init__(**kwargs)
 #
-#         #
+#         # ---- Existing public-ish attributes (unchanged) ----
 #         self.description = description
 #         self.data_path = self._ensure_trailing_slash(parquet_storage_path)
 #         self.filename = parquet_filename
@@ -552,71 +609,113 @@ class UpdatePlanner(ManagedResource):
 #         self.ignore_missing = ignore_missing
 #         self.history_days_threshold = history_days_threshold
 #         self.max_age_minutes = max_age_minutes
-#
-#         self.
+#         # copy to avoid shared mutation
+#         self.priority_map = dict(custom_priority_map) if custom_priority_map else dict(self.DEFAULT_PRIORITY_MAP)
 #
-#         # Execution knobs from kwargs (
+#         # Execution knobs from kwargs (kept)
 #         self.max_threads: int = int(kwargs.get("max_threads", 3))
-#         self.timeout: float = float(kwargs.get("timeout", 30.0))
+#         self.timeout: float = float(kwargs.get("timeout", 30.0))  # legacy overall timeout
 #
-#         # Date window
+#         # Date window (kept)
 #         self.start_date = kwargs.get("parquet_start_date")
 #         self.end_date = kwargs.get("parquet_end_date")
 #
-#         # Reference
-#         if reference_date is None
-#             self.reference_date = dt.date.today()
-#         else:
-#             self.reference_date = pd.to_datetime(reference_date).date()
+#         # Reference date (kept; tolerant)
+#         self.reference_date = pd.to_datetime(reference_date).date() if reference_date is not None else dt.date.today()
 #
-#         # Helpers & state
+#         # Helpers & state (kept)
 #         self.age_checker = FileAgeChecker(debug=self.debug, logger=self.logger)
 #         self.plan: pd.DataFrame = pd.DataFrame()
 #         self.df_req: pd.DataFrame = pd.DataFrame()
-#
-#         # internal run flag to print once per run if caller reuses instance
 #         self._printed_this_run: bool = False
 #
-#
+#         # ---- New feature flags / knobs (all default to safe choices) ----
+#         # Completeness check via _SUCCESS
+#         self.check_completeness: bool = bool(kwargs.get("check_completeness", True))
+#         self.require_success_marker: bool = bool(kwargs.get("require_success_marker", True))
+#         # Listing granularity: 'month' (default) or 'day'
+#         self.list_granularity: str = str(kwargs.get("list_granularity", "month"))
+#         # Data file suffixes to consider for age (default common formats)
+#         self.data_file_suffixes: Tuple[str, ...] = tuple(kwargs.get("data_file_suffixes", self.DATA_FILE_PATTERNS))
+#         # Timeouts
+#         self.list_timeout: float = float(kwargs.get("list_timeout", self.timeout))  # per-future
+#         self.total_timeout: float = float(kwargs.get("total_timeout", self.timeout))  # across all listings
+#         # Dependency-injected clock (UTC) for tests
+#         self._utcnow = kwargs.get("utcnow_func", None) or (lambda: dt.datetime.utcnow())
+#
+#         # ------------ Backward-compatible skip handling ------------
+#         # Keep legacy attribute and derive new internal canonical sets.
+#         self.skipped = list(skipped or kwargs.get("skipped", []) or [])
+#         self.skipped_paths = {p.rstrip("/") + "/" for p in self.skipped if isinstance(p, str)}
+#         self.skipped_dates = {p for p in self.skipped if isinstance(p, dt.date)}
+#
+#         # Validate fs presence (you rely on it)
+#         if not getattr(self, "fs", None):
+#             raise ValueError("UpdatePlanner requires a valid fsspec filesystem (fs).")
+#
+#     # --------------------- Back-compat property bridge ---------------------
+#     @property
+#     def skipped(self) -> List[Union[str, dt.date]]:  # type: ignore[override]
+#         """
+#         Backward-compatible view of skip configuration.
+#         Returns a merged list of path-strings and dates.
+#         """
+#         paths = sorted(self.skipped_paths)
+#         dates = sorted(self.skipped_dates)
+#         return [*paths, *dates]
+#
+#     @skipped.setter
+#     def skipped(self, value: List[Union[str, dt.date]]) -> None:  # type: ignore[override]
+#         """
+#         Accepts legacy assignments like:
+#             planner.skipped = ["s3://.../2025/01/03/", date(2025,1,4)]
+#         and keeps new internals in sync.
+#         """
+#         value = list(value or [])
+#         self.skipped_paths = {p.rstrip("/") + "/" for p in value if isinstance(p, str)}
+#         self.skipped_dates = {p for p in value if isinstance(p, dt.date)}
+#
+#     # --------------------- public helpers (kept) ---------------------
 #     def has_plan(self) -> bool:
-#         """Safe truthiness for plan existence."""
 #         return isinstance(self.plan, pd.DataFrame) and not self.plan.empty
 #
 #     def required_count(self) -> int:
 #         return 0 if not isinstance(self.df_req, pd.DataFrame) else len(self.df_req)
 #
-#     # --------------------- core API ---------------------
+#     # --------------------- core API (kept) ---------------------
 #     def generate_plan(
 #         self,
 #         start: Union[str, dt.date, None] = None,
 #         end: Union[str, dt.date, None] = None,
 #         freq: str = "D",
 #     ) -> pd.DataFrame:
-#         """
-#         Build a plan for [start, end]. Returns rows that require update (df_req).
-#         """
+#         """Build a plan for [start, end]. Returns rows that require update (df_req)."""
 #         start = start or self.start_date
 #         end = end or self.end_date
+#         if start is None or end is None:
+#             raise ValueError("start and end must be provided (or set via parquet_* kwargs).")
+#
 #         sd = pd.to_datetime(start).date()
 #         ed = pd.to_datetime(end).date()
 #         if sd > ed:
 #             raise ValueError(f"Start date ({sd}) must be on or before end date ({ed}).")
 #
-#         self.logger.info(
+#         self.logger.info(
+#             f"Generating update plan for {self.description} from {sd} to {ed}",
+#             extra=self._log_extra(),
+#         )
 #         self._generate_plan(sd, ed, freq=freq)
 #         self.logger.info(
 #             f"Plan built for {self.description}: {len(self.plan)} dates evaluated, "
 #             f"{len(self.df_req)} require update",
-#             extra=self.
+#             extra=self._log_extra(),
 #         )
 #         return self.df_req
 #
 #     def show_update_plan(self) -> None:
-#
-#
-#         """Pretty-print the current plan once per run."""
+#         """Pretty-print the current plan once per run, now respecting terminal width fully."""
 #         if not self.has_plan():
-#             self.logger.info("No update plan to show.")
+#             self.logger.info("No update plan to show.", extra=self._log_extra())
 #             return
 #         if self._printed_this_run:
 #             return
@@ -624,33 +723,43 @@ class UpdatePlanner(ManagedResource):
 #         try:
 #             from rich.console import Console
 #             from rich.table import Table
-#         except Exception:
-#             # Fallback: plain text
-#             self.logger.info(f"Update Plan (plain list):\n{self.plan.to_string(index=False)}", extra=logger_extra)
-#             self._printed_this_run = True
-#             return
 #
-#
-#
-#
-#
-#
-#
-#
+#             console = Console()  # auto-detect terminal size
+#             terminal_width = console.size.width
+#
+#             table = Table(
+#                 title=f"Update Plan for {self.data_path}",
+#                 show_header=True,
+#                 header_style="bold magenta",
+#                 expand=True,  # fill available width
+#                 pad_edge=False,
+#             )
+#             max_w = max(terminal_width - 50, 640)
+#             for col in self.plan.columns:
+#                 if col in {"date", "update_category", "update_priority", "update_required", "file_exists"}:
+#                     table.add_column(col, justify="left", no_wrap=True, overflow="fold", max_width=max_w)
+#                 elif col == "description":
+#                     # Let description wrap, but set a max width to avoid huge columns
+#                     table.add_column(col, justify="left", overflow="fold", max_width=max_w)
+#                 else:
+#                     table.add_column(col, justify="left", overflow="fold")
+#
+#             for _, row in self.plan.iterrows():
+#                 table.add_row(*(str(row[c]) for c in self.plan.columns))
 #
-#
-#
+#             # Capture with the same console so width stays consistent
+#             with console.capture() as cap:
+#                 console.print(table)
+#             self.logger.info(f"Full Update Plan:\n{cap.get().strip()}", extra=self._log_extra())
+#
+#         except Exception:
+#             preview = self.plan.head(200).to_string(index=False)
+#             self.logger.info(f"Update Plan (first 200 rows):\n{preview}", extra=self._log_extra())
 #
-#             console = Console()
-#             with console.capture() as capture:
-#                 console.print(table)
-#             self.logger.info(f"Full Update Plan:\n{capture.get().strip()}", extra=logger_extra)
 #         self._printed_this_run = True
 #
 #     def get_tasks_by_priority(self) -> Iterator[Tuple[int, List[dt.date]]]:
-#         """
-#         Yield (priority, [dates...]) batches, smallest priority first.
-#         """
+#         """Yield (priority, [dates...]) batches, smallest priority first."""
 #         if not self.has_plan():
 #             return
 #         req = self.plan[self.plan["update_required"]]
@@ -658,7 +767,6 @@ class UpdatePlanner(ManagedResource):
 #             return
 #         for priority in sorted(req["update_priority"].unique()):
 #             dates_df = req[req["update_priority"] == priority]
-#             # sort within group
 #             dates_df = dates_df.sort_values(by="date", ascending=not self.reverse_order)
 #             dates = dates_df["date"].tolist()
 #             if dates:
@@ -669,42 +777,205 @@ class UpdatePlanner(ManagedResource):
 #     def _ensure_trailing_slash(path: str) -> str:
 #         return path.rstrip("/") + "/"
 #
+#     @staticmethod
+#     def _month_floor(d: dt.date) -> dt.date:
+#         return d.replace(day=1)
+#
+#     @staticmethod
+#     def _iter_month_starts(start: dt.date, end: dt.date) -> Iterator[dt.date]:
+#         cur = start.replace(day=1)
+#         while cur <= end:
+#             yield cur
+#             y, m = cur.year, cur.month
+#             cur = dt.date(y + (m == 12), 1 if m == 12 else m + 1, 1)
+#
+#     def _month_prefix(self, month_start: dt.date) -> str:
+#         return f"{self.data_path}{month_start.year}/{month_start.month:02d}/"
+#
+#     def _day_prefix(self, d: dt.date) -> str:
+#         return f"{self.data_path}{d.year}/{d.month:02d}/{d.day:02d}/"
+#
+#     def _log_extra(self, **overrides) -> dict:
+#         base = {
+#             "sibi_dst_component": __name__,
+#             "date_of_update": self.reference_date.strftime("%Y-%m-%d"),
+#             "dataclass": self.description,
+#             "action_module_name": "update_plan",
+#         }
+#         base.update(overrides)
+#         return base
+#
+#     def _is_data_file(self, path: str) -> bool:
+#         base = path.rsplit("/", 1)[-1]
+#         if not base or base.startswith(".") or base in self.CONTROL_BASENAMES:
+#             return False
+#         lower = base.lower()
+#         return any(lower.endswith(suf) for suf in self.data_file_suffixes)
+#
+#     def _is_skipped(self, d: dt.date) -> bool:
+#         """True if the date or its canonical path is in the skip config."""
+#         just_path = f"{self.data_path}{d.year}/{d.month:02d}/{d.day:02d}/"
+#         return (d in self.skipped_dates) or (just_path in self.skipped_paths)
+#
+#     def _list_prefix(self, prefix: str) -> Dict[dt.date, Dict[str, object]]:
+#         """
+#         Return {date: {'files': [paths], 'has_success': bool, 'newest_ts': datetime|None}} under prefix.
+#         Uses fsspec.find(detail=True) for one-shot listing with metadata (mtime).
+#         """
+#         try:
+#             items = self.fs.find(prefix, withdirs=False, detail=True)  # returns {path: info} when detail=True
+#         except Exception as e:
+#             self.logger.warning(f"Listing failed for {prefix}: {e}", extra=self._log_extra())
+#             return {}
+#
+#         out: Dict[dt.date, Dict[str, object]] = {}
+#         for path, info in items.items():
+#             parts = path.strip("/").split("/")
+#             if len(parts) < 4:
+#                 continue
+#             try:
+#                 y, m, dd = int(parts[-4]), int(parts[-3]), int(parts[-2])
+#                 d = dt.date(y, m, dd)
+#             except Exception:
+#                 continue
+#
+#             rec = out.setdefault(d, {"files": [], "has_success": False, "newest_ts": None})
+#             base = path.rsplit("/", 1)[-1]
+#             if base == "_SUCCESS":
+#                 rec["has_success"] = True
+#
+#             if self._is_data_file(path):
+#                 rec["files"].append(path)
+#                 mtime = info.get("mtime") or info.get("LastModified") or info.get("last_modified")
+#                 ts = None
+#                 if isinstance(mtime, (int, float)):
+#                     ts = dt.datetime.utcfromtimestamp(mtime)
+#                 elif isinstance(mtime, str):
+#                     try:
+#                         ts = pd.to_datetime(mtime, utc=True).to_pydatetime()
+#                     except Exception:
+#                         ts = None
+#                 elif isinstance(mtime, dt.datetime):
+#                     ts = mtime if mtime.tzinfo else mtime.replace(tzinfo=dt.timezone.utc)
+#                 if ts:
+#                     cur = rec["newest_ts"]
+#                     rec["newest_ts"] = ts if (cur is None or ts > cur) else cur
+#         return out
+#
+#     def _summarize_partition(
+#         self, d: dt.date, cache: Dict[dt.date, Dict[str, object]]
+#     ) -> Tuple[bool, Optional[float], bool]:
+#         """
+#         (exists, age_minutes, incomplete)
+#         - exists: True iff at least one *data* file is present for day `d`
+#         - age_minutes: minutes since the NEWEST data file (UTC 'now')
+#         - incomplete: True if files exist but required _SUCCESS is missing
+#         """
+#         rec = cache.get(d, {})
+#         files = rec.get("files", [])
+#         has_success = bool(rec.get("has_success", False))
+#         exists = len(files) > 0
+#         if not exists:
+#             return False, None, False
+#         newest_ts = rec.get("newest_ts")
+#         if newest_ts:
+#             now_utc = self._utcnow().replace(tzinfo=None)
+#             ts_naive = newest_ts.replace(tzinfo=None) if newest_ts.tzinfo else newest_ts
+#             age_min = max(0.0, (now_utc - ts_naive).total_seconds() / 60.0)
+#         else:
+#             age_min = None
+#         incomplete = self.check_completeness and self.require_success_marker and not has_success
+#         return True, age_min, incomplete
+#
 #     def _generate_plan(self, start: dt.date, end: dt.date, freq: str = "D") -> None:
 #         """
 #         Populate self.plan with all dates and self.df_req with the subset to update.
+#         - Pre-lists months or days (configurable) with timeouts that actually apply
+#         - Computes staleness from newest *data* file
+#         - Flags partitions without _SUCCESS as 'incomplete' (unless disabled)
+#         - Marks future dates as 'future' (not actionable)
 #         """
-#         dates = pd.date_range(start=start, end=end, freq=freq).date.tolist()
+#         dates: List[dt.date] = pd.date_range(start=start, end=end, freq=freq).date.tolist()
 #         history_start = self.reference_date - dt.timedelta(days=self.history_days_threshold)
 #         rows: List[Dict] = []
 #
-#
-#
+#         def is_future(d: dt.date) -> bool:
+#             return d > self.reference_date
 #
-#
-#
-#
-#
-#
-#
-#
-#
-#
-#
-#
-#
-#
-#
+#         # Choose listing units
+#         if self.list_granularity == "day":
+#             units: List[Tuple[str, dt.date]] = [("day", d) for d in dates]
+#         else:
+#             months = list(self._iter_month_starts(self._month_floor(start), self._month_floor(end)))
+#             units = [("month", m) for m in months]
+#
+#         self.logger.info(
+#             f"Pre-listing {len(units)} {'days' if self.list_granularity=='day' else 'month prefixes'} for {self.description}",
+#             extra=self._log_extra(),
+#         )
+#
+#         # Parallel listing with real timeout (uses futures.wait)
+#         caches: Dict[dt.date, Dict[dt.date, Dict[str, object]]] = {}
+#         max_workers = max(1, int(self.max_threads))
+#         with ThreadPoolExecutor(max_workers=max_workers) as ex:
+#             futs = {}
+#             for kind, val in units:
+#                 prefix = self._day_prefix(val) if kind == "day" else self._month_prefix(val)
+#                 futs[ex.submit(self._list_prefix, prefix)] = (kind, val)
+#             done, not_done = wait(futs, timeout=self.total_timeout or None)
+#             for f in done:
+#                 kind, val = futs[f]
 #                 try:
-#
-#
-#
-#
-#
-#
-#
-#
+#                     cache = f.result(timeout=self.list_timeout or None)
+#                 except Exception as e:
+#                     self.logger.warning(f"Listing failed for {kind}:{val} — {e}", extra=self._log_extra())
+#                     cache = {}
+#                 if kind == "month":
+#                     caches[val] = cache
+#                 else:
+#                     # day → store into its month bucket for summarization reuse
+#                     mk = val.replace(day=1)
+#                     caches.setdefault(mk, {}).update(cache)
+#             for f in not_done:
+#                 kind, val = futs[f]
+#                 self.logger.error(f"Listing timed out for {kind}:{val}", extra=self._log_extra())
+#                 if kind == "month":
+#                     caches[val] = {}
+#                 else:
+#                     caches.setdefault(val.replace(day=1), {})
+#
+#         # Summarize each date
+#         for d in dates:
+#             if is_future(d):
+#                 rows.append({
+#                     "date": d, "file_exists": False, "file_age_minutes": None,
+#                     "update_category": "future", "update_priority": self.priority_map.get("future", 99),
+#                     "update_required": False, "description": self.description,
+#                 })
+#                 continue
+#
+#             if self._is_skipped(d):
+#                 self.logger.debug(f"Skipping {d}: in skipped set.", extra=self._log_extra())
+#                 rows.append(self._make_row(d, history_start, False, None))
+#                 continue
+#
+#             month_key = d.replace(day=1)
+#             cache = caches.get(month_key, {})
+#             exists, age_min, incomplete = self._summarize_partition(d, cache)
+#
+#             # Incomplete partitions get their own category (unless overwrite)
+#             if incomplete and not self.overwrite:
+#                 rows.append({
+#                     "date": d, "file_exists": True, "file_age_minutes": age_min,
+#                     "update_category": "incomplete", "update_priority": self.priority_map.get("incomplete", 1),
+#                     "update_required": True, "description": self.description,
+#                 })
+#                 continue
+#
+#             # Fall back to your existing policy (overwrite / history / staleness / missing)
+#             rows.append(self._make_row(d, history_start, exists, age_min))
+#
+#         df = pd.DataFrame.from_records(rows)
 #         if not df.empty:
 #             df["date"] = pd.to_datetime(df["date"]).dt.date
 #             df["update_priority"] = df["update_priority"].astype(int)
@@ -712,31 +983,14 @@ class UpdatePlanner(ManagedResource):
 #             df = df.sort_values(
 #                 by=["update_priority", "date"],
 #                 ascending=[True, not self.reverse_order],
-#                 kind="mergesort",
+#                 kind="mergesort",
 #             ).reset_index(drop=True)
 #
 #         self.plan = df
 #         self.df_req = df[df["update_required"]].copy()
 #         self._printed_this_run = False
 #
-#
-#         """
-#         Check file existence and age for the given date.
-#         """
-#         just_path = f"{self.data_path}{date.year}/{date.month:02d}/{date.day:02d}/"
-#         if just_path in self.skipped:
-#             self.logger.debug(f"Skipping {date}: path in skipped list.", extra=self.logger_extra)
-#             return False, None
-#
-#         path = f"{just_path}{self.filename}"
-#         try:
-#             exists = self.fs.exists(path)
-#             age = self.age_checker.get_file_or_dir_age_minutes(path, self.fs) if exists else None
-#             return bool(exists), age
-#         except Exception as e:
-#             self.logger.warning(f"exists/age check failed for {path}: {e}", extra=self.logger_extra)
-#             return False, None
-#
+#     # --------------------- original policy (kept) ---------------------
 #     def _make_row(
 #         self,
 #         date: dt.date,
@@ -746,15 +1000,14 @@ class UpdatePlanner(ManagedResource):
 #     ) -> Dict:
 #         """
 #         Build a single plan row based on flags and thresholds.
+#         (Categories 'future'/'incomplete' are injected earlier.)
 #         """
 #         within_history = history_start <= date <= self.reference_date
 #         update_required = False
 #
-#         # 1) Overwrite forces update
 #         if self.overwrite:
 #             category = "overwrite_forced"
 #             update_required = True
-#         # 2) Inside history window
 #         elif within_history:
 #             if not file_exists:
 #                 category = "missing_in_history"
@@ -764,11 +1017,9 @@ class UpdatePlanner(ManagedResource):
 #                 update_required = True
 #             else:
 #                 category = "file_is_recent"
-#         # 3) Outside history, missing file (and not ignoring)
 #         elif not file_exists and not self.ignore_missing:
 #             category = "create_missing"
 #             update_required = True
-#         # 4) Everything else
 #         else:
 #             category = "missing_ignored" if not file_exists else "file_is_recent"
 #
@@ -782,20 +1033,3 @@ class UpdatePlanner(ManagedResource):
 #             "description": self.description,
 #         }
 #
-#     def exclude_dates(self, dates: Set[dt.date]) -> None:
-#         """
-#         Exclude specific dates from the update plan.
-#         """
-#         if not isinstance(dates, set):
-#             raise ValueError("dates must be a set[date].")
-#         if not self.has_plan():
-#             self.logger.info("No update plan to modify. Call generate_plan() first.", extra=self.logger_extra)
-#             return
-#
-#         before = len(self.plan)
-#         self.plan = self.plan[~self.plan["date"].isin(dates)]
-#         self.df_req = self.plan[self.plan["update_required"]].copy()
-#         self.logger.info(
-#             f"Excluded {len(dates)} dates from the update plan (from {before} to {len(self.plan)} rows).",
-#             extra=self.logger_extra
-#         )