sibi-dst 2025.9.3-py3-none-any.whl → 2025.9.5-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,11 +1,14 @@
+ # update_planner.py
+ from __future__ import annotations
+
  import datetime as dt
  from concurrent.futures import ThreadPoolExecutor, wait
- from typing import List, Optional, Dict, Union, Tuple, Set, Iterator, ClassVar
+ from typing import List, Optional, Dict, Union, Tuple, Set, Iterator, ClassVar, Any, Callable
  
  import pandas as pd
  
  from sibi_dst.utils import ManagedResource
- from . import FileAgeChecker
+ from . import FileAgeChecker  # Assuming FileAgeChecker is in the same package
  
  
  class UpdatePlanner(ManagedResource):
@@ -26,11 +29,11 @@ class UpdatePlanner(ManagedResource):
          "file_is_recent": 0,
          "missing_ignored": 0,
          "overwrite_forced": 1,
-         "incomplete": 1, # new: prioritize just under overwrite
+         "incomplete": 1,  # new: prioritize just under overwrite
          "create_missing": 2,
          "missing_in_history": 3,
          "stale_in_history": 4,
-         "future": 99, # new: not actionable
+         "future": 99,  # new: not actionable
      }
  
      DEFAULT_MAX_AGE_MINUTES: int = 1440
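The priority map is what turns categories into scheduling order: generate_plan() tags each date with a category, the map converts it to a number, and get_tasks_by_priority() later drains batches smallest number first, with unlisted categories falling back to 99. A minimal sketch of a custom ordering (the dict values are illustrative; the commented constructor call assumes the signature shown further down):

    # Hypothetical override: repair stale history partitions before
    # creating brand-new missing ones. Categories omitted from the map
    # fall back to priority 99 in _make_row.
    custom_map = {
        "file_is_recent": 0,
        "missing_ignored": 0,
        "overwrite_forced": 1,
        "incomplete": 1,
        "stale_in_history": 2,   # promoted above create_missing
        "create_missing": 3,
        "missing_in_history": 3,
        "future": 99,
    }
    # planner = UpdatePlanner(..., custom_priority_map=custom_map)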
@@ -60,63 +63,60 @@ class UpdatePlanner(ManagedResource):
      ):
          super().__init__(**kwargs)
  
-         # ---- Existing public-ish attributes (unchanged) ----
-         self.description = description
-         self.data_path = self._ensure_trailing_slash(parquet_storage_path)
-         self.filename = parquet_filename
-         self.reverse_order = reverse_order
-         self.show_progress = show_progress
-         self.overwrite = overwrite
-         self.ignore_missing = ignore_missing
-         self.history_days_threshold = history_days_threshold
-         self.max_age_minutes = max_age_minutes
-         # copy to avoid shared mutation
-         self.priority_map = dict(custom_priority_map) if custom_priority_map else dict(self.DEFAULT_PRIORITY_MAP)
-
-         # Execution knobs from kwargs (kept)
+         # ---- Core Configuration ----
+         self.description: str = description
+         self.data_path: str = self._ensure_trailing_slash(parquet_storage_path)
+         self.filename: str = parquet_filename
+         self.reverse_order: bool = reverse_order
+         self.show_progress: bool = show_progress
+         self.overwrite: bool = overwrite
+         self.ignore_missing: bool = ignore_missing
+         self.history_days_threshold: int = history_days_threshold
+         self.max_age_minutes: int = max_age_minutes
+         # Copy to avoid shared mutation
+         self.priority_map: Dict[str, int] = dict(custom_priority_map) if custom_priority_map else dict(self.DEFAULT_PRIORITY_MAP)
+
+         # ---- Execution Parameters ----
          self.max_threads: int = int(kwargs.get("max_threads", 3))
          self.timeout: float = float(kwargs.get("timeout", 30.0))  # legacy overall timeout
  
-         # Date window (kept)
+         # ---- Date Window ----
          self.start_date = kwargs.get("parquet_start_date")
          self.end_date = kwargs.get("parquet_end_date")
  
-         # Reference date (kept; tolerant)
-         self.reference_date = pd.to_datetime(reference_date).date() if reference_date is not None else dt.date.today()
-
-         # Helpers & state (kept)
-         self.age_checker = FileAgeChecker(debug=self.debug, logger=self.logger)
-         self.plan: pd.DataFrame = pd.DataFrame()
-         self.df_req: pd.DataFrame = pd.DataFrame()
-         self._printed_this_run: bool = False
+         # ---- Reference Date ----
+         if reference_date is not None:
+             self.reference_date: dt.date = pd.to_datetime(reference_date).date()
+         else:
+             self.reference_date: dt.date = dt.date.today()
  
-         # ---- New feature flags / knobs (all default to safe choices) ----
-         # Completeness check via _SUCCESS
+         # ---- Feature Flags / Advanced Knobs ----
          self.check_completeness: bool = bool(kwargs.get("check_completeness", True))
          self.require_success_marker: bool = bool(kwargs.get("require_success_marker", True))
-         # Listing granularity: 'month' (default) or 'day'
          self.list_granularity: str = str(kwargs.get("list_granularity", "month"))
-         # Data file suffixes to consider for age (default common formats)
          self.data_file_suffixes: Tuple[str, ...] = tuple(kwargs.get("data_file_suffixes", self.DATA_FILE_PATTERNS))
-         # Timeouts
-         self.list_timeout: float = float(kwargs.get("list_timeout", self.timeout))  # per-future
-         self.total_timeout: float = float(kwargs.get("total_timeout", self.timeout))  # across all listings
+         self.list_timeout: float = float(kwargs.get("list_timeout", self.timeout))
+         self.total_timeout: float = float(kwargs.get("total_timeout", self.timeout))
          # Dependency-injected clock (UTC) for tests
-         self._utcnow = kwargs.get("utcnow_func", None) or (lambda: dt.datetime.utcnow())
+         self._utcnow: Callable[[], dt.datetime] = kwargs.get("utcnow_func", None) or (lambda: dt.datetime.utcnow())
  
-         # ------------ Backward-compatible skip handling ------------
+         # ---- Backward-Compatible Skip Handling ----
          # Keep legacy attribute and derive new internal canonical sets.
          self.skipped = list(skipped or kwargs.get("skipped", []) or [])
-         self.skipped_paths = {p.rstrip("/") + "/" for p in self.skipped if isinstance(p, str)}
-         self.skipped_dates = {p for p in self.skipped if isinstance(p, dt.date)}
+         self.skipped_paths: Set[str] = {p.rstrip("/") + "/" for p in self.skipped if isinstance(p, str)}
+         self.skipped_dates: Set[dt.date] = {p for p in self.skipped if isinstance(p, dt.date)}
  
-         # Validate fs presence (you rely on it)
+         # ---- Helpers & State ----
          if not getattr(self, "fs", None):
              raise ValueError("UpdatePlanner requires a valid fsspec filesystem (fs).")
+         self.age_checker = FileAgeChecker(debug=self.debug, logger=self.logger)
+         self.plan: pd.DataFrame = pd.DataFrame()
+         self.df_req: pd.DataFrame = pd.DataFrame()
+         self._printed_this_run: bool = False
  
      # --------------------- Back-compat property bridge ---------------------
      @property
-     def skipped(self) -> List[Union[str, dt.date]]:  # type: ignore[override]
+     def skipped(self) -> List[Union[str, dt.date]]:
          """
          Backward-compatible view of skip configuration.
          Returns a merged list of path-strings and dates.
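Taken together, the reworked constructor makes every new behavior opt-in through kwargs. A construction sketch, assuming ManagedResource wires fs, logger, and debug from **kwargs (the attribute access above implies it) and that argument names match the assignments shown; treat the exact call as illustrative rather than the package's documented API:

    import datetime as dt
    import fsspec

    fs = fsspec.filesystem("memory")  # any fsspec-compatible filesystem

    planner = UpdatePlanner(
        description="orders",
        parquet_storage_path="memory://warehouse/orders",
        parquet_filename="data.parquet",
        fs=fs,
        parquet_start_date="2025-01-01",
        parquet_end_date="2025-01-31",
        # Opt-in knobs introduced in this release:
        check_completeness=True,      # flag partitions lacking _SUCCESS
        list_granularity="month",     # one listing call per month prefix
        list_timeout=15.0,            # per-listing result timeout (seconds)
        total_timeout=60.0,           # overall budget across all listings
        utcnow_func=lambda: dt.datetime(2025, 2, 1, 12, 0),  # injected test clock
    )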
@@ -126,7 +126,7 @@ class UpdatePlanner(ManagedResource):
          return [*paths, *dates]
  
      @skipped.setter
-     def skipped(self, value: List[Union[str, dt.date]]) -> None:  # type: ignore[override]
+     def skipped(self, value: List[Union[str, dt.date]]) -> None:
          """
          Accepts legacy assignments like:
              planner.skipped = ["s3://.../2025/01/03/", date(2025,1,4)]
@@ -136,14 +136,15 @@ class UpdatePlanner(ManagedResource):
          self.skipped_paths = {p.rstrip("/") + "/" for p in value if isinstance(p, str)}
          self.skipped_dates = {p for p in value if isinstance(p, dt.date)}
  
-     # --------------------- public helpers (kept) ---------------------
+     # --------------------- Public API ---------------------
      def has_plan(self) -> bool:
+         """Check if a plan DataFrame exists and is not empty."""
          return isinstance(self.plan, pd.DataFrame) and not self.plan.empty
  
      def required_count(self) -> int:
-         return 0 if not isinstance(self.df_req, pd.DataFrame) else len(self.df_req)
+         """Get the number of dates that require an update."""
+         return len(self.df_req) if isinstance(self.df_req, pd.DataFrame) else 0
  
-     # --------------------- core API (kept) ---------------------
      def generate_plan(
          self,
          start: Union[str, dt.date, None] = None,
@@ -161,20 +162,18 @@ class UpdatePlanner(ManagedResource):
          if sd > ed:
              raise ValueError(f"Start date ({sd}) must be on or before end date ({ed}).")
  
-         self.logger.info(
-             f"Generating update plan for {self.description} from {sd} to {ed}",
-             extra=self._log_extra(),
-         )
+         log_extra = self._log_extra()
+         self.logger.info(f"Generating update plan for {self.description} from {sd} to {ed}", extra=log_extra)
          self._generate_plan(sd, ed, freq=freq)
          self.logger.info(
              f"Plan built for {self.description}: {len(self.plan)} dates evaluated, "
              f"{len(self.df_req)} require update",
-             extra=self._log_extra(),
+             extra=log_extra,
          )
          return self.df_req
  
      def show_update_plan(self) -> None:
-         """Pretty-print the current plan once per run, now respecting terminal width fully."""
+         """Pretty-print the current plan once per run."""
          if not self.has_plan():
              self.logger.info("No update plan to show.", extra=self._log_extra())
              return
@@ -185,14 +184,14 @@ class UpdatePlanner(ManagedResource):
              from rich.console import Console
              from rich.table import Table
  
-             console = Console()  # auto-detect terminal size
+             console = Console()
              terminal_width = console.size.width
  
              table = Table(
                  title=f"Update Plan for {self.data_path}",
                  show_header=True,
                  header_style="bold magenta",
-                 expand=True,  # fill available width
+                 expand=True,
                  pad_edge=False,
              )
              max_w = max(terminal_width - 50, 640)
@@ -200,7 +199,6 @@ class UpdatePlanner(ManagedResource):
                  if col in {"date", "update_category", "update_priority", "update_required", "file_exists"}:
                      table.add_column(col, justify="left", no_wrap=True, overflow="fold", max_width=max_w)
                  elif col == "description":
-                     # Let description wrap, but set a max width to avoid huge columns
                      table.add_column(col, justify="left", overflow="fold", max_width=max_w)
                  else:
                      table.add_column(col, justify="left", overflow="fold")
@@ -208,12 +206,12 @@ class UpdatePlanner(ManagedResource):
              for _, row in self.plan.iterrows():
                  table.add_row(*(str(row[c]) for c in self.plan.columns))
  
-             # Capture with the same console so width stays consistent
              with console.capture() as cap:
                  console.print(table)
              self.logger.info(f"Full Update Plan:\n{cap.get().strip()}", extra=self._log_extra())
  
-         except Exception:
+         except Exception as e:
+             self.logger.debug(f"Falling back to plain text plan display due to: {e}", extra=self._log_extra())
              preview = self.plan.head(200).to_string(index=False)
              self.logger.info(f"Update Plan (first 200 rows):\n{preview}", extra=self._log_extra())
  
@@ -233,121 +231,7 @@ class UpdatePlanner(ManagedResource):
          if dates:
              yield int(priority), dates
  
-     # --------------------- internals ---------------------
-     @staticmethod
-     def _ensure_trailing_slash(path: str) -> str:
-         return path.rstrip("/") + "/"
-
-     @staticmethod
-     def _month_floor(d: dt.date) -> dt.date:
-         return d.replace(day=1)
-
-     @staticmethod
-     def _iter_month_starts(start: dt.date, end: dt.date) -> Iterator[dt.date]:
-         cur = start.replace(day=1)
-         while cur <= end:
-             yield cur
-             y, m = cur.year, cur.month
-             cur = dt.date(y + (m == 12), 1 if m == 12 else m + 1, 1)
-
-     def _month_prefix(self, month_start: dt.date) -> str:
-         return f"{self.data_path}{month_start.year}/{month_start.month:02d}/"
-
-     def _day_prefix(self, d: dt.date) -> str:
-         return f"{self.data_path}{d.year}/{d.month:02d}/{d.day:02d}/"
-
-     def _log_extra(self, **overrides) -> dict:
-         base = {
-             "sibi_dst_component": __name__,
-             "date_of_update": self.reference_date.strftime("%Y-%m-%d"),
-             "dataclass": self.description,
-             "action_module_name": "update_plan",
-         }
-         base.update(overrides)
-         return base
-
-     def _is_data_file(self, path: str) -> bool:
-         base = path.rsplit("/", 1)[-1]
-         if not base or base.startswith(".") or base in self.CONTROL_BASENAMES:
-             return False
-         lower = base.lower()
-         return any(lower.endswith(suf) for suf in self.data_file_suffixes)
-
-     def _is_skipped(self, d: dt.date) -> bool:
-         """True if the date or its canonical path is in the skip config."""
-         just_path = f"{self.data_path}{d.year}/{d.month:02d}/{d.day:02d}/"
-         return (d in self.skipped_dates) or (just_path in self.skipped_paths)
-
-     def _list_prefix(self, prefix: str) -> Dict[dt.date, Dict[str, object]]:
-         """
-         Return {date: {'files': [paths], 'has_success': bool, 'newest_ts': datetime|None}} under prefix.
-         Uses fsspec.find(detail=True) for one-shot listing with metadata (mtime).
-         """
-         try:
-             items = self.fs.find(prefix, withdirs=False, detail=True)  # returns {path: info} when detail=True
-         except Exception as e:
-             self.logger.warning(f"Listing failed for {prefix}: {e}", extra=self._log_extra())
-             return {}
-
-         out: Dict[dt.date, Dict[str, object]] = {}
-         for path, info in items.items():
-             parts = path.strip("/").split("/")
-             if len(parts) < 4:
-                 continue
-             try:
-                 y, m, dd = int(parts[-4]), int(parts[-3]), int(parts[-2])
-                 d = dt.date(y, m, dd)
-             except Exception:
-                 continue
-
-             rec = out.setdefault(d, {"files": [], "has_success": False, "newest_ts": None})
-             base = path.rsplit("/", 1)[-1]
-             if base == "_SUCCESS":
-                 rec["has_success"] = True
-
-             if self._is_data_file(path):
-                 rec["files"].append(path)
-                 mtime = info.get("mtime") or info.get("LastModified") or info.get("last_modified")
-                 ts = None
-                 if isinstance(mtime, (int, float)):
-                     ts = dt.datetime.utcfromtimestamp(mtime)
-                 elif isinstance(mtime, str):
-                     try:
-                         ts = pd.to_datetime(mtime, utc=True).to_pydatetime()
-                     except Exception:
-                         ts = None
-                 elif isinstance(mtime, dt.datetime):
-                     ts = mtime if mtime.tzinfo else mtime.replace(tzinfo=dt.timezone.utc)
-                 if ts:
-                     cur = rec["newest_ts"]
-                     rec["newest_ts"] = ts if (cur is None or ts > cur) else cur
-         return out
-
-     def _summarize_partition(
-         self, d: dt.date, cache: Dict[dt.date, Dict[str, object]]
-     ) -> Tuple[bool, Optional[float], bool]:
-         """
-         (exists, age_minutes, incomplete)
-         - exists: True iff at least one *data* file is present for day `d`
-         - age_minutes: minutes since the NEWEST data file (UTC 'now')
-         - incomplete: True if files exist but required _SUCCESS is missing
-         """
-         rec = cache.get(d, {})
-         files = rec.get("files", [])
-         has_success = bool(rec.get("has_success", False))
-         exists = len(files) > 0
-         if not exists:
-             return False, None, False
-         newest_ts = rec.get("newest_ts")
-         if newest_ts:
-             now_utc = self._utcnow().replace(tzinfo=None)
-             ts_naive = newest_ts.replace(tzinfo=None) if newest_ts.tzinfo else newest_ts
-             age_min = max(0.0, (now_utc - ts_naive).total_seconds() / 60.0)
-         else:
-             age_min = None
-         incomplete = self.check_completeness and self.require_success_marker and not has_success
-         return True, age_min, incomplete
-
+     # --------------------- Plan Generation Internals ---------------------
      def _generate_plan(self, start: dt.date, end: dt.date, freq: str = "D") -> None:
          """
          Populate self.plan with all dates and self.df_req with the subset to update.
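Downstream callers consume the plan through the (priority, dates) batches yielded above. Continuing the construction sketch from earlier, with process_partition standing in for a hypothetical worker:

    df_req = planner.generate_plan(start="2025-01-01", end="2025-01-31")
    for priority, dates in planner.get_tasks_by_priority():
        # Batches arrive smallest priority first; within a batch, dates are
        # already sorted according to reverse_order.
        for d in dates:
            process_partition(d)  # hypothetical downstream worker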
@@ -358,15 +242,16 @@ class UpdatePlanner(ManagedResource):
          """
          dates: List[dt.date] = pd.date_range(start=start, end=end, freq=freq).date.tolist()
          history_start = self.reference_date - dt.timedelta(days=self.history_days_threshold)
-         rows: List[Dict] = []
+         rows: List[Dict[str, Any]] = []
  
          def is_future(d: dt.date) -> bool:
              return d > self.reference_date
  
          # Choose listing units
+         units: List[Tuple[str, dt.date]] = []
          if self.list_granularity == "day":
-             units: List[Tuple[str, dt.date]] = [("day", d) for d in dates]
-         else:
+             units = [("day", d) for d in dates]
+         else:  # Default to month
              months = list(self._iter_month_starts(self._month_floor(start), self._month_floor(end)))
              units = [("month", m) for m in months]
  
@@ -375,37 +260,48 @@ class UpdatePlanner(ManagedResource):
              extra=self._log_extra(),
          )
  
-         # Parallel listing with real timeout (uses futures.wait)
-         caches: Dict[dt.date, Dict[dt.date, Dict[str, object]]] = {}
-         max_workers = max(1, int(self.max_threads))
-         with ThreadPoolExecutor(max_workers=max_workers) as ex:
-             futs = {}
+         # --- Parallel File Listing with Realistic Timeouts ---
+         caches: Dict[dt.date, Dict[dt.date, Dict[str, Any]]] = {}
+         max_workers = max(1, self.max_threads)  # Ensure at least 1 worker
+
+         with ThreadPoolExecutor(max_workers=max_workers, thread_name_prefix="update_planner") as executor:
+             future_to_unit: Dict[Any, Tuple[str, dt.date]] = {}
              for kind, val in units:
                  prefix = self._day_prefix(val) if kind == "day" else self._month_prefix(val)
-                 futs[ex.submit(self._list_prefix, prefix)] = (kind, val)
-             done, not_done = wait(futs, timeout=self.total_timeout or None)
-             for f in done:
-                 kind, val = futs[f]
+                 future = executor.submit(self._list_prefix, prefix)
+                 future_to_unit[future] = (kind, val)
+
+             # Wait for all futures with a total timeout
+             done_futures, not_done_futures = wait(future_to_unit.keys(), timeout=self.total_timeout or None)
+
+             # Process completed futures
+             for future in done_futures:
+                 kind, val = future_to_unit[future]
                  try:
-                     cache = f.result(timeout=self.list_timeout or None)
+                     # Get the result with a per-listing timeout
+                     cache = future.result(timeout=self.list_timeout or None)
                  except Exception as e:
                      self.logger.warning(f"Listing failed for {kind}:{val} — {e}", extra=self._log_extra())
                      cache = {}
+
                  if kind == "month":
                      caches[val] = cache
-                 else:
-                     # day → store into its month bucket for summarization reuse
-                     mk = val.replace(day=1)
-                     caches.setdefault(mk, {}).update(cache)
-             for f in not_done:
-                 kind, val = futs[f]
+                 else:  # day
+                     # Store day listing results in its month's bucket for summarization
+                     month_key = val.replace(day=1)
+                     caches.setdefault(month_key, {}).update(cache)
+
+             # Handle timed-out futures
+             for future in not_done_futures:
+                 kind, val = future_to_unit[future]
                  self.logger.error(f"Listing timed out for {kind}:{val}", extra=self._log_extra())
                  if kind == "month":
                      caches[val] = {}
-                 else:
-                     caches.setdefault(val.replace(day=1), {})
+                 else:  # day
+                     month_key = val.replace(day=1)
+                     caches.setdefault(month_key, {})
  
-         # Summarize each date
+         # --- Summarize Each Date and Build Plan ---
          for d in dates:
              if is_future(d):
                  rows.append({
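The rewritten listing loop is the part that makes total_timeout a real deadline: concurrent.futures.wait() blocks until every future finishes or the timeout elapses, then hands back done and not-done sets. The pattern in isolation, as a standalone sketch independent of this module:

    import time
    from concurrent.futures import ThreadPoolExecutor, wait

    def list_prefix(prefix: str) -> str:
        time.sleep(5.0 if "slow" in prefix else 0.1)  # simulate I/O latency
        return f"listed {prefix}"

    with ThreadPoolExecutor(max_workers=4) as ex:
        futs = {ex.submit(list_prefix, p): p for p in ("a/", "b/", "slow/c/")}
        # Returns once ALL futures finish or 1.0s elapses, whichever is first.
        done, not_done = wait(futs, timeout=1.0)
        for f in done:
            print(futs[f], "->", f.result())  # result is already available
        for f in not_done:
            print(futs[f], "-> timed out; treated as an empty listing")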
@@ -417,14 +313,16 @@ class UpdatePlanner(ManagedResource):
  
              if self._is_skipped(d):
                  self.logger.debug(f"Skipping {d}: in skipped set.", extra=self._log_extra())
+                 # Append a row even for skipped dates, using default policy logic
                  rows.append(self._make_row(d, history_start, False, None))
                  continue
  
+             # Get the cache for the month containing this date
              month_key = d.replace(day=1)
              cache = caches.get(month_key, {})
              exists, age_min, incomplete = self._summarize_partition(d, cache)
  
-             # Incomplete partitions get their own category (unless overwrite)
+             # Incomplete partitions get their own category (unless overwrite forces update)
              if incomplete and not self.overwrite:
                  rows.append({
@@ -433,9 +331,10 @@ class UpdatePlanner(ManagedResource):
                  })
                  continue
  
-             # Fall back to your existing policy (overwrite / history / staleness / missing)
+             # Fall back to the standard policy logic (overwrite / history / staleness / missing)
              rows.append(self._make_row(d, history_start, exists, age_min))
  
+         # --- Finalize DataFrame ---
          df = pd.DataFrame.from_records(rows)
          if not df.empty:
              df["date"] = pd.to_datetime(df["date"]).dt.date
@@ -444,27 +343,120 @@ class UpdatePlanner(ManagedResource):
              df = df.sort_values(
                  by=["update_priority", "date"],
                  ascending=[True, not self.reverse_order],
-                 kind="mergesort",
+                 kind="mergesort",  # Stable sort
              ).reset_index(drop=True)
  
          self.plan = df
          self.df_req = df[df["update_required"]].copy()
          self._printed_this_run = False
  
-     # --------------------- original policy (kept) ---------------------
+     # --------------------- File System Interaction ---------------------
+     def _list_prefix(self, prefix: str) -> Dict[dt.date, Dict[str, Any]]:
+         """
+         Return {date: {'files': [paths], 'has_success': bool, 'newest_ts': datetime|None}} under prefix.
+         Uses fsspec.find(detail=True) for one-shot listing with metadata (mtime).
+         """
+         try:
+             # Returns {path: info_dict} when detail=True
+             items: Dict[str, Any] = self.fs.find(prefix, withdirs=False, detail=True)
+         except Exception as e:
+             self.logger.warning(f"Listing failed for {prefix}: {e}", extra=self._log_extra())
+             return {}
+
+         out: Dict[dt.date, Dict[str, Any]] = {}
+         for path, info in items.items():
+             # Extract date from path structure (e.g., .../YYYY/MM/DD/file)
+             parts = path.strip("/").split("/")
+             if len(parts) < 4:  # Need year/month/day segments plus a filename
+                 continue
+             try:
+                 y, m, dd = int(parts[-4]), int(parts[-3]), int(parts[-2])
+                 d = dt.date(y, m, dd)
+             except (ValueError, IndexError):
+                 # Not a date-partitioned path, skip
+                 continue
+
+             # Initialize or get the record for this date
+             rec = out.setdefault(d, {"files": [], "has_success": False, "newest_ts": None})
+             base_name = path.rsplit("/", 1)[-1]
+
+             # Check for _SUCCESS marker
+             if base_name == "_SUCCESS":
+                 rec["has_success"] = True
+
+             # Check if it's a relevant data file
+             if self._is_data_file(path):
+                 rec["files"].append(path)
+                 # Determine the modification time
+                 mtime = info.get("mtime") or info.get("LastModified") or info.get("last_modified")
+                 ts = None
+                 if isinstance(mtime, (int, float)):
+                     ts = dt.datetime.utcfromtimestamp(mtime)
+                 elif isinstance(mtime, str):
+                     try:
+                         ts = pd.to_datetime(mtime, utc=True).to_pydatetime()
+                     except Exception:
+                         ts = None
+                 elif isinstance(mtime, dt.datetime):
+                     # Ensure timezone awareness for comparison
+                     ts = mtime if mtime.tzinfo else mtime.replace(tzinfo=dt.timezone.utc)
+
+                 # Update the newest timestamp for this partition
+                 if ts:
+                     current_newest = rec["newest_ts"]
+                     # Naive comparison after ensuring tz awareness
+                     ts_naive = ts.replace(tzinfo=None) if ts.tzinfo else ts
+                     current_naive = current_newest.replace(tzinfo=None) if current_newest and current_newest.tzinfo else current_newest
+                     if current_naive is None or ts_naive > current_naive:
+                         rec["newest_ts"] = ts
+
+         return out
+
+     def _summarize_partition(
+         self, d: dt.date, cache: Dict[dt.date, Dict[str, Any]]
+     ) -> Tuple[bool, Optional[float], bool]:
+         """
+         Summarize the state of a partition for a given date.
+
+         Returns:
+             Tuple[bool, Optional[float], bool]: (exists, age_minutes, incomplete)
+             - exists: True iff at least one *data* file is present for day `d`
+             - age_minutes: minutes since the NEWEST data file (UTC 'now'), or None if not determinable
+             - incomplete: True if files exist but required _SUCCESS is missing (and checks are enabled)
+         """
+         rec = cache.get(d, {})
+         files = rec.get("files", [])
+         has_success = bool(rec.get("has_success", False))
+         exists = len(files) > 0
+
+         if not exists:
+             return False, None, False
+
+         newest_ts = rec.get("newest_ts")
+         age_min: Optional[float] = None
+         if newest_ts:
+             now_utc = self._utcnow().replace(tzinfo=None)  # Get current UTC time (naive)
+             ts_naive = newest_ts.replace(tzinfo=None) if newest_ts.tzinfo else newest_ts  # Make mtime naive
+             age_min = max(0.0, (now_utc - ts_naive).total_seconds() / 60.0)
+
+         incomplete = self.check_completeness and self.require_success_marker and not has_success
+         return exists, age_min, incomplete
+
+     # --------------------- Policy Logic ---------------------
      def _make_row(
          self,
          date: dt.date,
          history_start: dt.date,
          file_exists: bool,
          file_age: Optional[float],
-     ) -> Dict:
+     ) -> Dict[str, Any]:
          """
          Build a single plan row based on flags and thresholds.
-         (Categories 'future'/'incomplete' are injected earlier.)
+         (Categories 'future'/'incomplete' are injected earlier by _generate_plan.)
          """
          within_history = history_start <= date <= self.reference_date
          update_required = False
+         category = "unknown"
  
          if self.overwrite:
              category = "overwrite_forced"
@@ -486,16 +478,67 @@ class UpdatePlanner(ManagedResource):
  
          return {
              "date": date,
-             "file_exists": bool(file_exists),
+             "file_exists": file_exists,
              "file_age_minutes": file_age,
              "update_category": category,
              "update_priority": self.priority_map.get(category, 99),
-             "update_required": bool(update_required),
+             "update_required": update_required,
              "description": self.description,
          }
  
+     # --------------------- Utilities ---------------------
+     @staticmethod
+     def _ensure_trailing_slash(path: str) -> str:
+         return path.rstrip("/") + "/"
+
+     @staticmethod
+     def _month_floor(d: dt.date) -> dt.date:
+         return d.replace(day=1)
+
+     @staticmethod
+     def _iter_month_starts(start: dt.date, end: dt.date) -> Iterator[dt.date]:
+         cur = start.replace(day=1)
+         while cur <= end:
+             yield cur
+             y, m = cur.year, cur.month
+             # Move to the first day of the next month
+             if m == 12:
+                 cur = dt.date(y + 1, 1, 1)
+             else:
+                 cur = dt.date(y, m + 1, 1)
+
+     def _month_prefix(self, month_start: dt.date) -> str:
+         return f"{self.data_path}{month_start.year}/{month_start.month:02d}/"
+
+     def _day_prefix(self, d: dt.date) -> str:
+         return f"{self.data_path}{d.year}/{d.month:02d}/{d.day:02d}/"
+
+     def _is_data_file(self, path: str) -> bool:
+         base = path.rsplit("/", 1)[-1]
+         # Skip hidden files, directories, and control files
+         if not base or base.startswith(".") or base in self.CONTROL_BASENAMES:
+             return False
+         lower_base = base.lower()
+         return any(lower_base.endswith(suf) for suf in self.data_file_suffixes)
+
+     def _is_skipped(self, d: dt.date) -> bool:
+         """True if the date or its canonical path is in the skip config."""
+         canonical_path = f"{self.data_path}{d.year}/{d.month:02d}/{d.day:02d}/"
+         return (d in self.skipped_dates) or (canonical_path in self.skipped_paths)
+
+     def _log_extra(self, **overrides) -> Dict[str, Any]:
+         base = {
+             "sibi_dst_component": self.logger_extra.get("sibi_dst_component", "warehouse.update_planner"),
+             "date_of_update": self.reference_date.strftime("%Y-%m-%d"),
+             "dataclass": self.description,
+             "action_module_name": "update_plan",
+         }
+         base.update(overrides)
+         return base
+
+
  # import datetime as dt
- # from concurrent.futures import ThreadPoolExecutor, as_completed
+ # from concurrent.futures import ThreadPoolExecutor, wait
  # from typing import List, Optional, Dict, Union, Tuple, Set, Iterator, ClassVar
  #
  # import pandas as pd
@@ -507,21 +550,35 @@ class UpdatePlanner(ManagedResource):
  # class UpdatePlanner(ManagedResource):
  #     """
  #     Scans date-partitioned storage and builds an 'update plan' for dates that need processing.
- #     Produces a Pandas DataFrame plan; it does *not* load data frames, so Dask-vs-Pandas
- #     concerns do not apply here.
+ #     Backward compatible: public API and legacy attributes preserved; enhancements are opt-in via kwargs.
+ #
+ #     Enhancements:
+ #       - Batch listings via fsspec.find(..., detail=True) to avoid N×exists() roundtrips.
+ #       - Age computed from the NEWEST data file (ignoring control files).
+ #       - Optional completeness check: partitions with files but no _SUCCESS => 'incomplete'.
+ #       - Real timeouts using concurrent.futures.wait(...).
+ #       - Future dates marked as 'future' (not actionable).
  #     """
  #
+ #     # -------- Defaults (extended, but original keys retained) --------
  #     DEFAULT_PRIORITY_MAP: ClassVar[Dict[str, int]] = {
  #         "file_is_recent": 0,
  #         "missing_ignored": 0,
  #         "overwrite_forced": 1,
+ #         "incomplete": 1,  # new: prioritize just under overwrite
  #         "create_missing": 2,
  #         "missing_in_history": 3,
  #         "stale_in_history": 4,
+ #         "future": 99,  # new: not actionable
  #     }
  #
  #     DEFAULT_MAX_AGE_MINUTES: int = 1440
  #     DEFAULT_HISTORY_DAYS_THRESHOLD: int = 30
+ #
+ #     # Data/Control file heuristics (can be overridden)
+ #     DATA_FILE_PATTERNS: ClassVar[Tuple[str, ...]] = (".parquet", ".orc", ".csv", ".json")
+ #     CONTROL_BASENAMES: ClassVar[Set[str]] = {"_SUCCESS", "_metadata", "_common_metadata"}
+ #
  #     logger_extra = {"sibi_dst_component": __name__}
  #
  #     def __init__(
@@ -537,12 +594,12 @@ class UpdatePlanner(ManagedResource):
  #         custom_priority_map: Optional[Dict[str, int]] = None,
  #         reverse_order: bool = False,
  #         show_progress: bool = False,
- #         skipped: Optional[List[str]] = None,
+ #         skipped: Optional[List[Union[str, dt.date]]] = None,
  #         **kwargs,
  #     ):
  #         super().__init__(**kwargs)
  #
- #         # Public-ish attributes
+ #         # ---- Existing public-ish attributes (unchanged) ----
  #         self.description = description
  #         self.data_path = self._ensure_trailing_slash(parquet_storage_path)
  #         self.filename = parquet_filename
@@ -552,71 +609,113 @@ class UpdatePlanner(ManagedResource):
  #         self.ignore_missing = ignore_missing
  #         self.history_days_threshold = history_days_threshold
  #         self.max_age_minutes = max_age_minutes
- #         self.priority_map = custom_priority_map or self.DEFAULT_PRIORITY_MAP
- #         self.skipped = set(skipped or [])
+ #         # copy to avoid shared mutation
+ #         self.priority_map = dict(custom_priority_map) if custom_priority_map else dict(self.DEFAULT_PRIORITY_MAP)
  #
- #         # Execution knobs from kwargs (fed by upstream config)
+ #         # Execution knobs from kwargs (kept)
  #         self.max_threads: int = int(kwargs.get("max_threads", 3))
- #         self.timeout: float = float(kwargs.get("timeout", 30.0))
+ #         self.timeout: float = float(kwargs.get("timeout", 30.0))  # legacy overall timeout
  #
- #         # Date window
+ #         # Date window (kept)
  #         self.start_date = kwargs.get("parquet_start_date")
  #         self.end_date = kwargs.get("parquet_end_date")
  #
- #         # Reference "today"
- #         if reference_date is None:
- #             self.reference_date = dt.date.today()
- #         else:
- #             self.reference_date = pd.to_datetime(reference_date).date()
+ #         # Reference date (kept; tolerant)
+ #         self.reference_date = pd.to_datetime(reference_date).date() if reference_date is not None else dt.date.today()
  #
- #         # Helpers & state
+ #         # Helpers & state (kept)
  #         self.age_checker = FileAgeChecker(debug=self.debug, logger=self.logger)
  #         self.plan: pd.DataFrame = pd.DataFrame()
  #         self.df_req: pd.DataFrame = pd.DataFrame()
- #
- #         # internal run flag to print once per run if caller reuses instance
  #         self._printed_this_run: bool = False
  #
- #     # --------------------- public helpers ---------------------
+ #         # ---- New feature flags / knobs (all default to safe choices) ----
+ #         # Completeness check via _SUCCESS
+ #         self.check_completeness: bool = bool(kwargs.get("check_completeness", True))
+ #         self.require_success_marker: bool = bool(kwargs.get("require_success_marker", True))
+ #         # Listing granularity: 'month' (default) or 'day'
+ #         self.list_granularity: str = str(kwargs.get("list_granularity", "month"))
+ #         # Data file suffixes to consider for age (default common formats)
+ #         self.data_file_suffixes: Tuple[str, ...] = tuple(kwargs.get("data_file_suffixes", self.DATA_FILE_PATTERNS))
+ #         # Timeouts
+ #         self.list_timeout: float = float(kwargs.get("list_timeout", self.timeout))  # per-future
+ #         self.total_timeout: float = float(kwargs.get("total_timeout", self.timeout))  # across all listings
+ #         # Dependency-injected clock (UTC) for tests
+ #         self._utcnow = kwargs.get("utcnow_func", None) or (lambda: dt.datetime.utcnow())
+ #
+ #         # ------------ Backward-compatible skip handling ------------
+ #         # Keep legacy attribute and derive new internal canonical sets.
+ #         self.skipped = list(skipped or kwargs.get("skipped", []) or [])
+ #         self.skipped_paths = {p.rstrip("/") + "/" for p in self.skipped if isinstance(p, str)}
+ #         self.skipped_dates = {p for p in self.skipped if isinstance(p, dt.date)}
+ #
+ #         # Validate fs presence (you rely on it)
+ #         if not getattr(self, "fs", None):
+ #             raise ValueError("UpdatePlanner requires a valid fsspec filesystem (fs).")
+ #
+ #     # --------------------- Back-compat property bridge ---------------------
+ #     @property
+ #     def skipped(self) -> List[Union[str, dt.date]]:  # type: ignore[override]
+ #         """
+ #         Backward-compatible view of skip configuration.
+ #         Returns a merged list of path-strings and dates.
+ #         """
+ #         paths = sorted(self.skipped_paths)
+ #         dates = sorted(self.skipped_dates)
+ #         return [*paths, *dates]
+ #
+ #     @skipped.setter
+ #     def skipped(self, value: List[Union[str, dt.date]]) -> None:  # type: ignore[override]
+ #         """
+ #         Accepts legacy assignments like:
+ #             planner.skipped = ["s3://.../2025/01/03/", date(2025,1,4)]
+ #         and keeps new internals in sync.
+ #         """
+ #         value = list(value or [])
+ #         self.skipped_paths = {p.rstrip("/") + "/" for p in value if isinstance(p, str)}
+ #         self.skipped_dates = {p for p in value if isinstance(p, dt.date)}
+ #
+ #     # --------------------- public helpers (kept) ---------------------
  #     def has_plan(self) -> bool:
- #         """Safe truthiness for plan existence."""
  #         return isinstance(self.plan, pd.DataFrame) and not self.plan.empty
  #
  #     def required_count(self) -> int:
  #         return 0 if not isinstance(self.df_req, pd.DataFrame) else len(self.df_req)
  #
- #     # --------------------- core API ---------------------
+ #     # --------------------- core API (kept) ---------------------
  #     def generate_plan(
  #         self,
  #         start: Union[str, dt.date, None] = None,
  #         end: Union[str, dt.date, None] = None,
  #         freq: str = "D",
  #     ) -> pd.DataFrame:
- #         """
- #         Build a plan for [start, end]. Returns rows that require update (df_req).
- #         """
+ #         """Build a plan for [start, end]. Returns rows that require update (df_req)."""
  #         start = start or self.start_date
  #         end = end or self.end_date
+ #         if start is None or end is None:
+ #             raise ValueError("start and end must be provided (or set via parquet_* kwargs).")
+ #
  #         sd = pd.to_datetime(start).date()
  #         ed = pd.to_datetime(end).date()
  #         if sd > ed:
  #             raise ValueError(f"Start date ({sd}) must be on or before end date ({ed}).")
  #
- #         self.logger.info(f"Generating update plan for {self.description} from {sd} to {ed}", extra=self.logger_extra)
+ #         self.logger.info(
+ #             f"Generating update plan for {self.description} from {sd} to {ed}",
+ #             extra=self._log_extra(),
+ #         )
  #         self._generate_plan(sd, ed, freq=freq)
  #         self.logger.info(
  #             f"Plan built for {self.description}: {len(self.plan)} dates evaluated, "
  #             f"{len(self.df_req)} require update",
- #             extra=self.logger_extra
+ #             extra=self._log_extra(),
  #         )
  #         return self.df_req
  #
  #     def show_update_plan(self) -> None:
- #         logger_extra = self.logger_extra.update({"date_of_update": self.reference_date.strftime('%Y-%m-%d'), "dataclass": self.description, "action_module_name": "update_plan"})
- #
- #         """Pretty-print the current plan once per run."""
+ #         """Pretty-print the current plan once per run, now respecting terminal width fully."""
  #         if not self.has_plan():
- #             self.logger.info("No update plan to show.")
+ #             self.logger.info("No update plan to show.", extra=self._log_extra())
  #             return
  #         if self._printed_this_run:
  #             return
@@ -624,33 +723,43 @@ class UpdatePlanner(ManagedResource):
  #         try:
  #             from rich.console import Console
  #             from rich.table import Table
- #         except Exception:
- #             # Fallback: plain text
- #             self.logger.info(f"Update Plan (plain list):\n{self.plan.to_string(index=False)}", extra=logger_extra)
- #             self._printed_this_run = True
- #             return
  #
- #         table = Table(
- #             title=f"Update Plan for {self.data_path}",
- #             show_header=True,
- #             header_style="bold magenta",
- #         )
- #         for column in self.plan.columns:
- #             table.add_column(column, justify="left")
+ #             console = Console()  # auto-detect terminal size
+ #             terminal_width = console.size.width
+ #
+ #             table = Table(
+ #                 title=f"Update Plan for {self.data_path}",
+ #                 show_header=True,
+ #                 header_style="bold magenta",
+ #                 expand=True,  # fill available width
+ #                 pad_edge=False,
+ #             )
+ #             max_w = max(terminal_width - 50, 640)
+ #             for col in self.plan.columns:
+ #                 if col in {"date", "update_category", "update_priority", "update_required", "file_exists"}:
+ #                     table.add_column(col, justify="left", no_wrap=True, overflow="fold", max_width=max_w)
+ #                 elif col == "description":
+ #                     # Let description wrap, but set a max width to avoid huge columns
+ #                     table.add_column(col, justify="left", overflow="fold", max_width=max_w)
+ #                 else:
+ #                     table.add_column(col, justify="left", overflow="fold")
+ #
+ #             for _, row in self.plan.iterrows():
+ #                 table.add_row(*(str(row[c]) for c in self.plan.columns))
  #
- #         for _, row in self.plan.iterrows():
- #             table.add_row(*(str(row[col]) for col in self.plan.columns))
+ #             # Capture with the same console so width stays consistent
+ #             with console.capture() as cap:
+ #                 console.print(table)
+ #             self.logger.info(f"Full Update Plan:\n{cap.get().strip()}", extra=self._log_extra())
+ #
+ #         except Exception:
+ #             preview = self.plan.head(200).to_string(index=False)
+ #             self.logger.info(f"Update Plan (first 200 rows):\n{preview}", extra=self._log_extra())
  #
- #         console = Console()
- #         with console.capture() as capture:
- #             console.print(table)
- #         self.logger.info(f"Full Update Plan:\n{capture.get().strip()}", extra=logger_extra)
  #         self._printed_this_run = True
  #
  #     def get_tasks_by_priority(self) -> Iterator[Tuple[int, List[dt.date]]]:
- #         """
- #         Yield (priority, [dates...]) batches, smallest priority first.
- #         """
+ #         """Yield (priority, [dates...]) batches, smallest priority first."""
  #         if not self.has_plan():
  #             return
  #         req = self.plan[self.plan["update_required"]]
@@ -658,7 +767,6 @@ class UpdatePlanner(ManagedResource):
  #             return
  #         for priority in sorted(req["update_priority"].unique()):
  #             dates_df = req[req["update_priority"] == priority]
- #             # sort within group
  #             dates_df = dates_df.sort_values(by="date", ascending=not self.reverse_order)
  #             dates = dates_df["date"].tolist()
  #             if dates:
@@ -669,42 +777,205 @@ class UpdatePlanner(ManagedResource):
  #     def _ensure_trailing_slash(path: str) -> str:
  #         return path.rstrip("/") + "/"
  #
+ #     @staticmethod
+ #     def _month_floor(d: dt.date) -> dt.date:
+ #         return d.replace(day=1)
+ #
+ #     @staticmethod
+ #     def _iter_month_starts(start: dt.date, end: dt.date) -> Iterator[dt.date]:
+ #         cur = start.replace(day=1)
+ #         while cur <= end:
+ #             yield cur
+ #             y, m = cur.year, cur.month
+ #             cur = dt.date(y + (m == 12), 1 if m == 12 else m + 1, 1)
+ #
+ #     def _month_prefix(self, month_start: dt.date) -> str:
+ #         return f"{self.data_path}{month_start.year}/{month_start.month:02d}/"
+ #
+ #     def _day_prefix(self, d: dt.date) -> str:
+ #         return f"{self.data_path}{d.year}/{d.month:02d}/{d.day:02d}/"
+ #
+ #     def _log_extra(self, **overrides) -> dict:
+ #         base = {
+ #             "sibi_dst_component": __name__,
+ #             "date_of_update": self.reference_date.strftime("%Y-%m-%d"),
+ #             "dataclass": self.description,
+ #             "action_module_name": "update_plan",
+ #         }
+ #         base.update(overrides)
+ #         return base
+ #
+ #     def _is_data_file(self, path: str) -> bool:
+ #         base = path.rsplit("/", 1)[-1]
+ #         if not base or base.startswith(".") or base in self.CONTROL_BASENAMES:
+ #             return False
+ #         lower = base.lower()
+ #         return any(lower.endswith(suf) for suf in self.data_file_suffixes)
+ #
+ #     def _is_skipped(self, d: dt.date) -> bool:
+ #         """True if the date or its canonical path is in the skip config."""
+ #         just_path = f"{self.data_path}{d.year}/{d.month:02d}/{d.day:02d}/"
+ #         return (d in self.skipped_dates) or (just_path in self.skipped_paths)
+ #
+ #     def _list_prefix(self, prefix: str) -> Dict[dt.date, Dict[str, object]]:
+ #         """
+ #         Return {date: {'files': [paths], 'has_success': bool, 'newest_ts': datetime|None}} under prefix.
+ #         Uses fsspec.find(detail=True) for one-shot listing with metadata (mtime).
+ #         """
+ #         try:
+ #             items = self.fs.find(prefix, withdirs=False, detail=True)  # returns {path: info} when detail=True
+ #         except Exception as e:
+ #             self.logger.warning(f"Listing failed for {prefix}: {e}", extra=self._log_extra())
+ #             return {}
+ #
+ #         out: Dict[dt.date, Dict[str, object]] = {}
+ #         for path, info in items.items():
+ #             parts = path.strip("/").split("/")
+ #             if len(parts) < 4:
+ #                 continue
+ #             try:
+ #                 y, m, dd = int(parts[-4]), int(parts[-3]), int(parts[-2])
+ #                 d = dt.date(y, m, dd)
+ #             except Exception:
+ #                 continue
+ #
+ #             rec = out.setdefault(d, {"files": [], "has_success": False, "newest_ts": None})
+ #             base = path.rsplit("/", 1)[-1]
+ #             if base == "_SUCCESS":
+ #                 rec["has_success"] = True
+ #
+ #             if self._is_data_file(path):
+ #                 rec["files"].append(path)
+ #                 mtime = info.get("mtime") or info.get("LastModified") or info.get("last_modified")
+ #                 ts = None
+ #                 if isinstance(mtime, (int, float)):
+ #                     ts = dt.datetime.utcfromtimestamp(mtime)
+ #                 elif isinstance(mtime, str):
+ #                     try:
+ #                         ts = pd.to_datetime(mtime, utc=True).to_pydatetime()
+ #                     except Exception:
+ #                         ts = None
+ #                 elif isinstance(mtime, dt.datetime):
+ #                     ts = mtime if mtime.tzinfo else mtime.replace(tzinfo=dt.timezone.utc)
+ #                 if ts:
+ #                     cur = rec["newest_ts"]
+ #                     rec["newest_ts"] = ts if (cur is None or ts > cur) else cur
+ #         return out
+ #
+ #     def _summarize_partition(
+ #         self, d: dt.date, cache: Dict[dt.date, Dict[str, object]]
+ #     ) -> Tuple[bool, Optional[float], bool]:
+ #         """
+ #         (exists, age_minutes, incomplete)
+ #         - exists: True iff at least one *data* file is present for day `d`
+ #         - age_minutes: minutes since the NEWEST data file (UTC 'now')
+ #         - incomplete: True if files exist but required _SUCCESS is missing
+ #         """
+ #         rec = cache.get(d, {})
+ #         files = rec.get("files", [])
+ #         has_success = bool(rec.get("has_success", False))
+ #         exists = len(files) > 0
+ #         if not exists:
+ #             return False, None, False
+ #         newest_ts = rec.get("newest_ts")
+ #         if newest_ts:
+ #             now_utc = self._utcnow().replace(tzinfo=None)
+ #             ts_naive = newest_ts.replace(tzinfo=None) if newest_ts.tzinfo else newest_ts
+ #             age_min = max(0.0, (now_utc - ts_naive).total_seconds() / 60.0)
+ #         else:
+ #             age_min = None
+ #         incomplete = self.check_completeness and self.require_success_marker and not has_success
+ #         return True, age_min, incomplete
+ #
  #     def _generate_plan(self, start: dt.date, end: dt.date, freq: str = "D") -> None:
  #         """
  #         Populate self.plan with all dates and self.df_req with the subset to update.
+ #         - Pre-lists months or days (configurable) with timeouts that actually apply
+ #         - Computes staleness from newest *data* file
+ #         - Flags partitions without _SUCCESS as 'incomplete' (unless disabled)
+ #         - Marks future dates as 'future' (not actionable)
  #         """
- #         dates = pd.date_range(start=start, end=end, freq=freq).date.tolist()
+ #         dates: List[dt.date] = pd.date_range(start=start, end=end, freq=freq).date.tolist()
  #         history_start = self.reference_date - dt.timedelta(days=self.history_days_threshold)
  #         rows: List[Dict] = []
  #
- #         # bound threads
- #         max_workers = max(1, int(self.max_threads))
+ #         def is_future(d: dt.date) -> bool:
+ #             return d > self.reference_date
  #
- #         with ThreadPoolExecutor(max_workers=max_workers) as executor:
- #             futures = {executor.submit(self._get_file_status, d): d for d in dates}
- #             iterator = as_completed(futures)
- #             if self.show_progress:
- #                 try:
- #                     from tqdm import tqdm
- #                     iterator = tqdm(
- #                         iterator, total=len(futures),
- #                         desc=f"Scanning dates for {self.description}",
- #                         unit="date", leave=False
- #                     )
- #                 except Exception:
- #                     pass  # no tqdm; proceed without progress bar
- #
- #             for future in iterator:
- #                 d = futures[future]
+ #         # Choose listing units
+ #         if self.list_granularity == "day":
+ #             units: List[Tuple[str, dt.date]] = [("day", d) for d in dates]
+ #         else:
+ #             months = list(self._iter_month_starts(self._month_floor(start), self._month_floor(end)))
+ #             units = [("month", m) for m in months]
+ #
+ #         self.logger.info(
+ #             f"Pre-listing {len(units)} {'days' if self.list_granularity=='day' else 'month prefixes'} for {self.description}",
+ #             extra=self._log_extra(),
+ #         )
+ #
+ #         # Parallel listing with real timeout (uses futures.wait)
+ #         caches: Dict[dt.date, Dict[dt.date, Dict[str, object]]] = {}
+ #         max_workers = max(1, int(self.max_threads))
+ #         with ThreadPoolExecutor(max_workers=max_workers) as ex:
+ #             futs = {}
+ #             for kind, val in units:
+ #                 prefix = self._day_prefix(val) if kind == "day" else self._month_prefix(val)
+ #                 futs[ex.submit(self._list_prefix, prefix)] = (kind, val)
+ #             done, not_done = wait(futs, timeout=self.total_timeout or None)
+ #             for f in done:
+ #                 kind, val = futs[f]
  #                 try:
- #                     exists, age = future.result(timeout=self.timeout)
- #                     rows.append(self._make_row(d, history_start, exists, age))
- #                 except Exception as exc:
- #                     self.logger.error(f"Error processing date {d}: {exc}", extra=self.logger_extra)
- #                     rows.append(self._make_row(d, history_start, False, None))
- #
- #         df = pd.DataFrame(rows)
- #         # consistent types
+ #                     cache = f.result(timeout=self.list_timeout or None)
+ #                 except Exception as e:
+ #                     self.logger.warning(f"Listing failed for {kind}:{val} — {e}", extra=self._log_extra())
+ #                     cache = {}
+ #                 if kind == "month":
+ #                     caches[val] = cache
+ #                 else:
+ #                     # day → store into its month bucket for summarization reuse
+ #                     mk = val.replace(day=1)
+ #                     caches.setdefault(mk, {}).update(cache)
+ #             for f in not_done:
+ #                 kind, val = futs[f]
+ #                 self.logger.error(f"Listing timed out for {kind}:{val}", extra=self._log_extra())
+ #                 if kind == "month":
+ #                     caches[val] = {}
+ #                 else:
+ #                     caches.setdefault(val.replace(day=1), {})
+ #
+ #         # Summarize each date
+ #         for d in dates:
+ #             if is_future(d):
+ #                 rows.append({
+ #                     "date": d, "file_exists": False, "file_age_minutes": None,
+ #                     "update_category": "future", "update_priority": self.priority_map.get("future", 99),
+ #                     "update_required": False, "description": self.description,
+ #                 })
+ #                 continue
+ #
+ #             if self._is_skipped(d):
+ #                 self.logger.debug(f"Skipping {d}: in skipped set.", extra=self._log_extra())
+ #                 rows.append(self._make_row(d, history_start, False, None))
+ #                 continue
+ #
+ #             month_key = d.replace(day=1)
+ #             cache = caches.get(month_key, {})
+ #             exists, age_min, incomplete = self._summarize_partition(d, cache)
+ #
+ #             # Incomplete partitions get their own category (unless overwrite)
+ #             if incomplete and not self.overwrite:
+ #                 rows.append({
+ #                     "date": d, "file_exists": True, "file_age_minutes": age_min,
+ #                     "update_category": "incomplete", "update_priority": self.priority_map.get("incomplete", 1),
+ #                     "update_required": True, "description": self.description,
+ #                 })
+ #                 continue
+ #
+ #             # Fall back to your existing policy (overwrite / history / staleness / missing)
+ #             rows.append(self._make_row(d, history_start, exists, age_min))
+ #
+ #         df = pd.DataFrame.from_records(rows)
  #         if not df.empty:
  #             df["date"] = pd.to_datetime(df["date"]).dt.date
  #             df["update_priority"] = df["update_priority"].astype(int)
@@ -712,31 +983,14 @@ class UpdatePlanner(ManagedResource):
  #             df = df.sort_values(
  #                 by=["update_priority", "date"],
  #                 ascending=[True, not self.reverse_order],
- #                 kind="mergesort",  # stable
+ #                 kind="mergesort",
  #             ).reset_index(drop=True)
  #
  #         self.plan = df
  #         self.df_req = df[df["update_required"]].copy()
  #         self._printed_this_run = False
  #
- #     def _get_file_status(self, date: dt.date) -> Tuple[bool, Optional[float]]:
- #         """
- #         Check file existence and age for the given date.
- #         """
- #         just_path = f"{self.data_path}{date.year}/{date.month:02d}/{date.day:02d}/"
- #         if just_path in self.skipped:
- #             self.logger.debug(f"Skipping {date}: path in skipped list.", extra=self.logger_extra)
- #             return False, None
- #
- #         path = f"{just_path}{self.filename}"
- #         try:
- #             exists = self.fs.exists(path)
- #             age = self.age_checker.get_file_or_dir_age_minutes(path, self.fs) if exists else None
- #             return bool(exists), age
- #         except Exception as e:
- #             self.logger.warning(f"exists/age check failed for {path}: {e}", extra=self.logger_extra)
- #             return False, None
- #
+ #     # --------------------- original policy (kept) ---------------------
  #     def _make_row(
  #         self,
  #         date: dt.date,
@@ -746,15 +1000,14 @@ class UpdatePlanner(ManagedResource):
  #     ) -> Dict:
  #         """
  #         Build a single plan row based on flags and thresholds.
+ #         (Categories 'future'/'incomplete' are injected earlier.)
  #         """
  #         within_history = history_start <= date <= self.reference_date
  #         update_required = False
  #
- #         # 1) Overwrite forces update
  #         if self.overwrite:
  #             category = "overwrite_forced"
  #             update_required = True
- #         # 2) Inside history window
  #         elif within_history:
  #             if not file_exists:
  #                 category = "missing_in_history"
@@ -764,11 +1017,9 @@ class UpdatePlanner(ManagedResource):
  #                 update_required = True
  #             else:
  #                 category = "file_is_recent"
- #         # 3) Outside history, missing file (and not ignoring)
  #         elif not file_exists and not self.ignore_missing:
  #             category = "create_missing"
  #             update_required = True
- #         # 4) Everything else
  #         else:
  #             category = "missing_ignored" if not file_exists else "file_is_recent"
  #
@@ -782,20 +1033,3 @@ class UpdatePlanner(ManagedResource):
  #             "description": self.description,
  #         }
  #
- #     def exclude_dates(self, dates: Set[dt.date]) -> None:
- #         """
- #         Exclude specific dates from the update plan.
- #         """
- #         if not isinstance(dates, set):
- #             raise ValueError("dates must be a set[date].")
- #         if not self.has_plan():
- #             self.logger.info("No update plan to modify. Call generate_plan() first.", extra=self.logger_extra)
- #             return
- #
- #         before = len(self.plan)
- #         self.plan = self.plan[~self.plan["date"].isin(dates)]
- #         self.df_req = self.plan[self.plan["update_required"]].copy()
- #         self.logger.info(
- #             f"Excluded {len(dates)} dates from the update plan (from {before} to {len(self.plan)} rows).",
- #             extra=self.logger_extra
- #         )
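One closing caveat on the completeness check: the 'incomplete' category is only meaningful if writers emit a _SUCCESS marker after a partition's data files, in the Hadoop/Spark convention this module assumes. For writers that do not do this automatically, a sketch of the convention (fs and parquet_bytes are illustrative placeholders):

    # Write the partition's data first, then the marker, so any reader that
    # observes _SUCCESS can trust the partition is complete.
    day_prefix = "warehouse/orders/2025/01/02/"
    with fs.open(day_prefix + "data.parquet", "wb") as f:
        f.write(parquet_bytes)           # assumed to be produced upstream
    fs.touch(day_prefix + "_SUCCESS")    # zero-byte completion marker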