sibi-dst 2025.8.7__py3-none-any.whl → 2025.8.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,5 @@
  import datetime as dt
- from concurrent.futures import ThreadPoolExecutor, as_completed
+ from concurrent.futures import ThreadPoolExecutor, wait
  from typing import List, Optional, Dict, Union, Tuple, Set, Iterator, ClassVar

  import pandas as pd
@@ -11,22 +11,37 @@ from . import FileAgeChecker
  class UpdatePlanner(ManagedResource):
      """
      Scans date-partitioned storage and builds an 'update plan' for dates that need processing.
-     Produces a Pandas DataFrame plan; it does *not* load data frames, so Dask-vs-Pandas
-     concerns do not apply here.
+     Backward compatible: public API and legacy attributes preserved; enhancements are opt-in via kwargs.
+
+     Enhancements:
+     - Batch listings via fsspec.find(..., detail=True) to avoid N×exists() roundtrips.
+     - Age computed from the NEWEST data file (ignoring control files).
+     - Optional completeness check: partitions with files but no _SUCCESS => 'incomplete'.
+     - Real timeouts using concurrent.futures.wait(...).
+     - Future dates marked as 'future' (not actionable).
      """

+     # -------- Defaults (extended, but original keys retained) --------
      DEFAULT_PRIORITY_MAP: ClassVar[Dict[str, int]] = {
          "file_is_recent": 0,
          "missing_ignored": 0,
          "overwrite_forced": 1,
+         "incomplete": 1,  # new: prioritize just under overwrite
          "create_missing": 2,
          "missing_in_history": 3,
          "stale_in_history": 4,
+         "future": 99,  # new: not actionable
      }

      DEFAULT_MAX_AGE_MINUTES: int = 1440
      DEFAULT_HISTORY_DAYS_THRESHOLD: int = 30

+     # Data/Control file heuristics (can be overridden)
+     DATA_FILE_PATTERNS: ClassVar[Tuple[str, ...]] = (".parquet", ".orc", ".csv", ".json")
+     CONTROL_BASENAMES: ClassVar[Set[str]] = {"_SUCCESS", "_metadata", "_common_metadata"}
+
+     logger_extra = {"sibi_dst_component": __name__}
+
      def __init__(
          self,
          parquet_storage_path: str,
@@ -40,12 +55,12 @@ class UpdatePlanner(ManagedResource):
          custom_priority_map: Optional[Dict[str, int]] = None,
          reverse_order: bool = False,
          show_progress: bool = False,
-         skipped: Optional[List[str]] = None,
+         skipped: Optional[List[Union[str, dt.date]]] = None,
          **kwargs,
      ):
          super().__init__(**kwargs)

-         # Public-ish attributes
+         # ---- Existing public-ish attributes (unchanged) ----
          self.description = description
          self.data_path = self._ensure_trailing_slash(parquet_storage_path)
          self.filename = parquet_filename
@@ -55,68 +70,113 @@ class UpdatePlanner(ManagedResource):
          self.ignore_missing = ignore_missing
          self.history_days_threshold = history_days_threshold
          self.max_age_minutes = max_age_minutes
-         self.priority_map = custom_priority_map or self.DEFAULT_PRIORITY_MAP
-         self.skipped = set(skipped or [])
+         # copy to avoid shared mutation
+         self.priority_map = dict(custom_priority_map) if custom_priority_map else dict(self.DEFAULT_PRIORITY_MAP)

-         # Execution knobs from kwargs (fed by upstream config)
+         # Execution knobs from kwargs (kept)
          self.max_threads: int = int(kwargs.get("max_threads", 3))
-         self.timeout: float = float(kwargs.get("timeout", 30.0))
+         self.timeout: float = float(kwargs.get("timeout", 30.0))  # legacy overall timeout

-         # Date window
+         # Date window (kept)
          self.start_date = kwargs.get("parquet_start_date")
          self.end_date = kwargs.get("parquet_end_date")

-         # Reference "today"
-         if reference_date is None:
-             self.reference_date = dt.date.today()
-         else:
-             self.reference_date = pd.to_datetime(reference_date).date()
+         # Reference date (kept; tolerant)
+         self.reference_date = pd.to_datetime(reference_date).date() if reference_date is not None else dt.date.today()

-         # Helpers & state
+         # Helpers & state (kept)
          self.age_checker = FileAgeChecker(debug=self.debug, logger=self.logger)
          self.plan: pd.DataFrame = pd.DataFrame()
          self.df_req: pd.DataFrame = pd.DataFrame()
-
-         # internal run flag to print once per run if caller reuses instance
          self._printed_this_run: bool = False

-         # --------------------- public helpers ---------------------
+         # ---- New feature flags / knobs (all default to safe choices) ----
+         # Completeness check via _SUCCESS
+         self.check_completeness: bool = bool(kwargs.get("check_completeness", True))
+         self.require_success_marker: bool = bool(kwargs.get("require_success_marker", True))
+         # Listing granularity: 'month' (default) or 'day'
+         self.list_granularity: str = str(kwargs.get("list_granularity", "month"))
+         # Data file suffixes to consider for age (default common formats)
+         self.data_file_suffixes: Tuple[str, ...] = tuple(kwargs.get("data_file_suffixes", self.DATA_FILE_PATTERNS))
+         # Timeouts
+         self.list_timeout: float = float(kwargs.get("list_timeout", self.timeout))  # per-future
+         self.total_timeout: float = float(kwargs.get("total_timeout", self.timeout))  # across all listings
+         # Dependency-injected clock (UTC) for tests
+         self._utcnow = kwargs.get("utcnow_func", None) or (lambda: dt.datetime.utcnow())
+
+         # ------------ Backward-compatible skip handling ------------
+         # Keep legacy attribute and derive new internal canonical sets.
+         self.skipped = list(skipped or kwargs.get("skipped", []) or [])
+         self.skipped_paths = {p.rstrip("/") + "/" for p in self.skipped if isinstance(p, str)}
+         self.skipped_dates = {p for p in self.skipped if isinstance(p, dt.date)}
+
+         # Validate fs presence (you rely on it)
+         if not getattr(self, "fs", None):
+             raise ValueError("UpdatePlanner requires a valid fsspec filesystem (fs).")
+
+     # --------------------- Back-compat property bridge ---------------------
+     @property
+     def skipped(self) -> List[Union[str, dt.date]]:  # type: ignore[override]
+         """
+         Backward-compatible view of skip configuration.
+         Returns a merged list of path-strings and dates.
+         """
+         paths = sorted(self.skipped_paths)
+         dates = sorted(self.skipped_dates)
+         return [*paths, *dates]
+
+     @skipped.setter
+     def skipped(self, value: List[Union[str, dt.date]]) -> None:  # type: ignore[override]
+         """
+         Accepts legacy assignments like:
+             planner.skipped = ["s3://.../2025/01/03/", date(2025,1,4)]
+         and keeps new internals in sync.
+         """
+         value = list(value or [])
+         self.skipped_paths = {p.rstrip("/") + "/" for p in value if isinstance(p, str)}
+         self.skipped_dates = {p for p in value if isinstance(p, dt.date)}
+
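
Given an UpdatePlanner instance `planner`, the property bridge above keeps legacy assignments working while populating the new canonical sets; a quick illustration (the bucket path is a made-up placeholder):

    import datetime as dt

    # Mixed strings and dates are accepted, exactly as before.
    planner.skipped = ["s3://bucket/data/2025/01/03/", dt.date(2025, 1, 4)]
    assert "s3://bucket/data/2025/01/03/" in planner.skipped_paths
    assert dt.date(2025, 1, 4) in planner.skipped_dates
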
+     # --------------------- public helpers (kept) ---------------------
      def has_plan(self) -> bool:
-         """Safe truthiness for plan existence."""
          return isinstance(self.plan, pd.DataFrame) and not self.plan.empty

      def required_count(self) -> int:
          return 0 if not isinstance(self.df_req, pd.DataFrame) else len(self.df_req)

-     # --------------------- core API ---------------------
+     # --------------------- core API (kept) ---------------------
      def generate_plan(
          self,
          start: Union[str, dt.date, None] = None,
          end: Union[str, dt.date, None] = None,
          freq: str = "D",
      ) -> pd.DataFrame:
-         """
-         Build a plan for [start, end]. Returns rows that require update (df_req).
-         """
+         """Build a plan for [start, end]. Returns rows that require update (df_req)."""
          start = start or self.start_date
          end = end or self.end_date
+         if start is None or end is None:
+             raise ValueError("start and end must be provided (or set via parquet_* kwargs).")
+
          sd = pd.to_datetime(start).date()
          ed = pd.to_datetime(end).date()
          if sd > ed:
              raise ValueError(f"Start date ({sd}) must be on or before end date ({ed}).")

-         self.logger.info(f"Generating update plan for {self.description} from {sd} to {ed}")
+         self.logger.info(
+             f"Generating update plan for {self.description} from {sd} to {ed}",
+             extra=self._log_extra(),
+         )
          self._generate_plan(sd, ed, freq=freq)
          self.logger.info(
              f"Plan built for {self.description}: {len(self.plan)} dates evaluated, "
-             f"{len(self.df_req)} require update"
+             f"{len(self.df_req)} require update",
+             extra=self._log_extra(),
          )
          return self.df_req

      def show_update_plan(self) -> None:
-         """Pretty-print the current plan once per run."""
+         """Pretty-print the current plan once per run, now respecting terminal width fully."""
          if not self.has_plan():
-             self.logger.info("No update plan to show.")
+             self.logger.info("No update plan to show.", extra=self._log_extra())
              return
          if self._printed_this_run:
              return
@@ -124,33 +184,43 @@ class UpdatePlanner(ManagedResource):
          try:
              from rich.console import Console
              from rich.table import Table
-         except Exception:
-             # Fallback: plain text
-             self.logger.info(f"Update Plan (plain list):\n{self.plan.to_string(index=False)}")
-             self._printed_this_run = True
-             return

-         table = Table(
-             title=f"Update Plan for {self.data_path}",
-             show_header=True,
-             header_style="bold magenta",
-         )
-         for column in self.plan.columns:
-             table.add_column(column, justify="left")
+             console = Console()  # auto-detect terminal size
+             terminal_width = console.size.width
+
+             table = Table(
+                 title=f"Update Plan for {self.data_path}",
+                 show_header=True,
+                 header_style="bold magenta",
+                 expand=True,  # fill available width
+                 pad_edge=False,
+             )
+             max_w = max(terminal_width - 50, 640)
+             for col in self.plan.columns:
+                 if col in {"date", "update_category", "update_priority", "update_required", "file_exists"}:
+                     table.add_column(col, justify="left", no_wrap=True, overflow="fold", max_width=max_w)
+                 elif col == "description":
+                     # Let description wrap, but set a max width to avoid huge columns
+                     table.add_column(col, justify="left", overflow="fold", max_width=max_w)
+                 else:
+                     table.add_column(col, justify="left", overflow="fold")
+
+             for _, row in self.plan.iterrows():
+                 table.add_row(*(str(row[c]) for c in self.plan.columns))
+
+             # Capture with the same console so width stays consistent
+             with console.capture() as cap:
+                 console.print(table)
+             self.logger.info(f"Full Update Plan:\n{cap.get().strip()}", extra=self._log_extra())

-         for _, row in self.plan.iterrows():
-             table.add_row(*(str(row[col]) for col in self.plan.columns))
+         except Exception:
+             preview = self.plan.head(200).to_string(index=False)
+             self.logger.info(f"Update Plan (first 200 rows):\n{preview}", extra=self._log_extra())

-         console = Console()
-         with console.capture() as capture:
-             console.print(table)
-         self.logger.info(f"Full Update Plan:\n{capture.get().strip()}", extra={"date_of_update": self.reference_date.strftime('%Y-%m-%d'), "dataclass": self.description, "action_module_name": "update_plan"})
          self._printed_this_run = True

      def get_tasks_by_priority(self) -> Iterator[Tuple[int, List[dt.date]]]:
-         """
-         Yield (priority, [dates...]) batches, smallest priority first.
-         """
+         """Yield (priority, [dates...]) batches, smallest priority first."""
          if not self.has_plan():
              return
          req = self.plan[self.plan["update_required"]]
@@ -158,7 +228,6 @@ class UpdatePlanner(ManagedResource):
              return
          for priority in sorted(req["update_priority"].unique()):
              dates_df = req[req["update_priority"] == priority]
-             # sort within group
              dates_df = dates_df.sort_values(by="date", ascending=not self.reverse_order)
              dates = dates_df["date"].tolist()
              if dates:
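
The closing yield of get_tasks_by_priority falls between hunks, but its contract is unchanged: batches of dates, smallest priority value first. A sketch of a typical consumer, where process_date is a hypothetical per-date work function:

    # Drain batches in priority order; date order within a batch follows reverse_order.
    for priority, dates in planner.get_tasks_by_priority():
        for d in dates:
            process_date(d)  # hypothetical callback, not part of the package
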
@@ -169,42 +238,205 @@ class UpdatePlanner(ManagedResource):
      def _ensure_trailing_slash(path: str) -> str:
          return path.rstrip("/") + "/"

+     @staticmethod
+     def _month_floor(d: dt.date) -> dt.date:
+         return d.replace(day=1)
+
+     @staticmethod
+     def _iter_month_starts(start: dt.date, end: dt.date) -> Iterator[dt.date]:
+         cur = start.replace(day=1)
+         while cur <= end:
+             yield cur
+             y, m = cur.year, cur.month
+             cur = dt.date(y + (m == 12), 1 if m == 12 else m + 1, 1)
+
+     def _month_prefix(self, month_start: dt.date) -> str:
+         return f"{self.data_path}{month_start.year}/{month_start.month:02d}/"
+
+     def _day_prefix(self, d: dt.date) -> str:
+         return f"{self.data_path}{d.year}/{d.month:02d}/{d.day:02d}/"
+
+     def _log_extra(self, **overrides) -> dict:
+         base = {
+             "sibi_dst_component": __name__,
+             "date_of_update": self.reference_date.strftime("%Y-%m-%d"),
+             "dataclass": self.description,
+             "action_module_name": "update_plan",
+         }
+         base.update(overrides)
+         return base
+
+     def _is_data_file(self, path: str) -> bool:
+         base = path.rsplit("/", 1)[-1]
+         if not base or base.startswith(".") or base in self.CONTROL_BASENAMES:
+             return False
+         lower = base.lower()
+         return any(lower.endswith(suf) for suf in self.data_file_suffixes)
+
+     def _is_skipped(self, d: dt.date) -> bool:
+         """True if the date or its canonical path is in the skip config."""
+         just_path = f"{self.data_path}{d.year}/{d.month:02d}/{d.day:02d}/"
+         return (d in self.skipped_dates) or (just_path in self.skipped_paths)
+
+     def _list_prefix(self, prefix: str) -> Dict[dt.date, Dict[str, object]]:
+         """
+         Return {date: {'files': [paths], 'has_success': bool, 'newest_ts': datetime|None}} under prefix.
+         Uses fsspec.find(detail=True) for one-shot listing with metadata (mtime);
+         see https://filesystem-spec.readthedocs.io/en/latest/api.html
+         """
+         try:
+             items = self.fs.find(prefix, withdirs=False, detail=True)  # returns {path: info} when detail=True
+         except Exception as e:
+             self.logger.warning(f"Listing failed for {prefix}: {e}", extra=self._log_extra())
+             return {}
+
+         out: Dict[dt.date, Dict[str, object]] = {}
+         for path, info in items.items():
+             parts = path.strip("/").split("/")
+             if len(parts) < 4:
+                 continue
+             try:
+                 y, m, dd = int(parts[-4]), int(parts[-3]), int(parts[-2])
+                 d = dt.date(y, m, dd)
+             except Exception:
+                 continue
+
+             rec = out.setdefault(d, {"files": [], "has_success": False, "newest_ts": None})
+             base = path.rsplit("/", 1)[-1]
+             if base == "_SUCCESS":
+                 rec["has_success"] = True
+
+             if self._is_data_file(path):
+                 rec["files"].append(path)
+                 mtime = info.get("mtime") or info.get("LastModified") or info.get("last_modified")
+                 ts = None
+                 if isinstance(mtime, (int, float)):
+                     ts = dt.datetime.utcfromtimestamp(mtime)
+                 elif isinstance(mtime, str):
+                     try:
+                         ts = pd.to_datetime(mtime, utc=True).to_pydatetime()
+                     except Exception:
+                         ts = None
+                 elif isinstance(mtime, dt.datetime):
+                     ts = mtime if mtime.tzinfo else mtime.replace(tzinfo=dt.timezone.utc)
+                 if ts:
+                     cur = rec["newest_ts"]
+                     rec["newest_ts"] = ts if (cur is None or ts > cur) else cur
+         return out
+
+     def _summarize_partition(
+         self, d: dt.date, cache: Dict[dt.date, Dict[str, object]]
+     ) -> Tuple[bool, Optional[float], bool]:
+         """
+         (exists, age_minutes, incomplete)
+         - exists: True iff at least one *data* file is present for day `d`
+         - age_minutes: minutes since the NEWEST data file (UTC 'now')
+         - incomplete: True if files exist but required _SUCCESS is missing
+         """
+         rec = cache.get(d, {})
+         files = rec.get("files", [])
+         has_success = bool(rec.get("has_success", False))
+         exists = len(files) > 0
+         if not exists:
+             return False, None, False
+         newest_ts = rec.get("newest_ts")
+         if newest_ts:
+             now_utc = self._utcnow().replace(tzinfo=None)
+             ts_naive = newest_ts.replace(tzinfo=None) if newest_ts.tzinfo else newest_ts
+             age_min = max(0.0, (now_utc - ts_naive).total_seconds() / 60.0)
+         else:
+             age_min = None
+         incomplete = self.check_completeness and self.require_success_marker and not has_success
+         return True, age_min, incomplete
+
      def _generate_plan(self, start: dt.date, end: dt.date, freq: str = "D") -> None:
          """
          Populate self.plan with all dates and self.df_req with the subset to update.
+         - Pre-lists months or days (configurable) with timeouts that actually apply
+         - Computes staleness from newest *data* file
+         - Flags partitions without _SUCCESS as 'incomplete' (unless disabled)
+         - Marks future dates as 'future' (not actionable)
          """
-         dates = pd.date_range(start=start, end=end, freq=freq).date.tolist()
+         dates: List[dt.date] = pd.date_range(start=start, end=end, freq=freq).date.tolist()
          history_start = self.reference_date - dt.timedelta(days=self.history_days_threshold)
          rows: List[Dict] = []

-         # bound threads
-         max_workers = max(1, int(self.max_threads))
+         def is_future(d: dt.date) -> bool:
+             return d > self.reference_date

-         with ThreadPoolExecutor(max_workers=max_workers) as executor:
-             futures = {executor.submit(self._get_file_status, d): d for d in dates}
-             iterator = as_completed(futures)
-             if self.show_progress:
-                 try:
-                     from tqdm import tqdm
-                     iterator = tqdm(
-                         iterator, total=len(futures),
-                         desc=f"Scanning dates for {self.description}",
-                         unit="date", leave=False
-                     )
-                 except Exception:
-                     pass  # no tqdm → proceed without progress bar
-
-             for future in iterator:
-                 d = futures[future]
+         # Choose listing units
+         if self.list_granularity == "day":
+             units: List[Tuple[str, dt.date]] = [("day", d) for d in dates]
+         else:
+             months = list(self._iter_month_starts(self._month_floor(start), self._month_floor(end)))
+             units = [("month", m) for m in months]
+
+         self.logger.info(
+             f"Pre-listing {len(units)} {'days' if self.list_granularity=='day' else 'month prefixes'} for {self.description}",
+             extra=self._log_extra(),
+         )
+
+         # Parallel listing with a real timeout (uses concurrent.futures.wait);
+         # see https://docs.python.org/3/library/concurrent.futures.html
+         caches: Dict[dt.date, Dict[dt.date, Dict[str, object]]] = {}
+         max_workers = max(1, int(self.max_threads))
+         with ThreadPoolExecutor(max_workers=max_workers) as ex:
+             futs = {}
+             for kind, val in units:
+                 prefix = self._day_prefix(val) if kind == "day" else self._month_prefix(val)
+                 futs[ex.submit(self._list_prefix, prefix)] = (kind, val)
+             done, not_done = wait(futs, timeout=self.total_timeout or None)
+             for f in done:
+                 kind, val = futs[f]
                  try:
-                     exists, age = future.result(timeout=self.timeout)
-                     rows.append(self._make_row(d, history_start, exists, age))
-                 except Exception as exc:
-                     self.logger.error(f"Error processing date {d}: {exc}")
-                     rows.append(self._make_row(d, history_start, False, None))
-
-         df = pd.DataFrame(rows)
-         # consistent types
+                     cache = f.result(timeout=self.list_timeout or None)
+                 except Exception as e:
+                     self.logger.warning(f"Listing failed for {kind}:{val} — {e}", extra=self._log_extra())
+                     cache = {}
+                 if kind == "month":
+                     caches[val] = cache
+                 else:
+                     # day → store into its month bucket for summarization reuse
+                     mk = val.replace(day=1)
+                     caches.setdefault(mk, {}).update(cache)
+             for f in not_done:
+                 kind, val = futs[f]
+                 self.logger.error(f"Listing timed out for {kind}:{val}", extra=self._log_extra())
+                 if kind == "month":
+                     caches[val] = {}
+                 else:
+                     caches.setdefault(val.replace(day=1), {})
+
+         # Summarize each date
+         for d in dates:
+             if is_future(d):
+                 rows.append({
+                     "date": d, "file_exists": False, "file_age_minutes": None,
+                     "update_category": "future", "update_priority": self.priority_map.get("future", 99),
+                     "update_required": False, "description": self.description,
+                 })
+                 continue
+
+             if self._is_skipped(d):
+                 self.logger.debug(f"Skipping {d}: in skipped set.", extra=self._log_extra())
+                 rows.append(self._make_row(d, history_start, False, None))
+                 continue
+
+             month_key = d.replace(day=1)
+             cache = caches.get(month_key, {})
+             exists, age_min, incomplete = self._summarize_partition(d, cache)
+
+             # Incomplete partitions get their own category (unless overwrite)
+             if incomplete and not self.overwrite:
+                 rows.append({
+                     "date": d, "file_exists": True, "file_age_minutes": age_min,
+                     "update_category": "incomplete", "update_priority": self.priority_map.get("incomplete", 1),
+                     "update_required": True, "description": self.description,
+                 })
+                 continue
+
+             # Fall back to your existing policy (overwrite / history / staleness / missing)
+             rows.append(self._make_row(d, history_start, exists, age_min))
+
+         df = pd.DataFrame.from_records(rows)
          if not df.empty:
              df["date"] = pd.to_datetime(df["date"]).dt.date
              df["update_priority"] = df["update_priority"].astype(int)
@@ -212,31 +444,14 @@ class UpdatePlanner(ManagedResource):
              df = df.sort_values(
                  by=["update_priority", "date"],
                  ascending=[True, not self.reverse_order],
-                 kind="mergesort",  # stable
+                 kind="mergesort",
              ).reset_index(drop=True)

          self.plan = df
          self.df_req = df[df["update_required"]].copy()
          self._printed_this_run = False

-     def _get_file_status(self, date: dt.date) -> Tuple[bool, Optional[float]]:
-         """
-         Check file existence and age for the given date.
-         """
-         just_path = f"{self.data_path}{date.year}/{date.month:02d}/{date.day:02d}/"
-         if just_path in self.skipped:
-             self.logger.debug(f"Skipping {date}: path in skipped list.")
-             return False, None
-
-         path = f"{just_path}{self.filename}"
-         try:
-             exists = self.fs.exists(path)
-             age = self.age_checker.get_file_or_dir_age_minutes(path, self.fs) if exists else None
-             return bool(exists), age
-         except Exception as e:
-             self.logger.warning(f"exists/age check failed for {path}: {e}")
-             return False, None
-
+     # --------------------- original policy (kept) ---------------------
      def _make_row(
          self,
          date: dt.date,
@@ -246,15 +461,14 @@ class UpdatePlanner(ManagedResource):
      ) -> Dict:
          """
          Build a single plan row based on flags and thresholds.
+         (Categories 'future'/'incomplete' are injected earlier.)
          """
          within_history = history_start <= date <= self.reference_date
          update_required = False

-         # 1) Overwrite forces update
          if self.overwrite:
              category = "overwrite_forced"
              update_required = True
-         # 2) Inside history window
          elif within_history:
              if not file_exists:
                  category = "missing_in_history"
@@ -264,11 +478,9 @@ class UpdatePlanner(ManagedResource):
                  update_required = True
              else:
                  category = "file_is_recent"
-         # 3) Outside history, missing file (and not ignoring)
          elif not file_exists and not self.ignore_missing:
              category = "create_missing"
              update_required = True
-         # 4) Everything else
          else:
              category = "missing_ignored" if not file_exists else "file_is_recent"

@@ -282,19 +494,308 @@ class UpdatePlanner(ManagedResource):
              "description": self.description,
          }

-     def exclude_dates(self, dates: Set[dt.date]) -> None:
-         """
-         Exclude specific dates from the update plan.
-         """
-         if not isinstance(dates, set):
-             raise ValueError("dates must be a set[date].")
-         if not self.has_plan():
-             self.logger.info("No update plan to modify. Call generate_plan() first.")
-             return
-
-         before = len(self.plan)
-         self.plan = self.plan[~self.plan["date"].isin(dates)]
-         self.df_req = self.plan[self.plan["update_required"]].copy()
-         self.logger.info(
-             f"Excluded {len(dates)} dates from the update plan (from {before} to {len(self.plan)} rows)."
-         )

(The new file ends with the entire previous implementation appended as one large commented-out block, a near-verbatim legacy reference copy.)