sibi-dst 2025.9.9-py3-none-any.whl → 2025.9.10-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,54 +1,52 @@
1
- # update_planner.py
2
1
  from __future__ import annotations
3
2
 
4
3
  import datetime as dt
4
+ import re
5
5
  from concurrent.futures import ThreadPoolExecutor, wait
6
6
  from typing import List, Optional, Dict, Union, Tuple, Set, Iterator, ClassVar, Any, Callable
7
7
 
8
8
  import pandas as pd
9
9
 
10
10
  from sibi_dst.utils import ManagedResource
11
- from . import FileAgeChecker # Assuming FileAgeChecker is in the same package
11
+ from . import FileAgeChecker
12
12
 
13
13
 
14
14
  class UpdatePlanner(ManagedResource):
15
15
  """
16
- Scans date-partitioned storage and builds an 'update plan' for dates that need processing.
17
- Backward compatible: public API and legacy attributes preserved; enhancements are opt-in via kwargs.
16
+ Update planner for datasets organized either as:
18
17
 
19
- Enhancements:
20
- - Batch listings via fsspec.find(..., detail=True) to avoid N×exists() roundtrips.
21
- - Age computed from the NEWEST data file (ignoring control files).
22
- - Optional completeness check: partitions with files but no _SUCCESS => 'incomplete'.
23
- - Real timeouts using concurrent.futures.wait(...).
24
- - Future dates marked as 'future' (not actionable).
18
+ - Legacy layout: /YYYY/MM/DD/file.parquet
19
+ - Hive layout: /partition_date=YYYY-MM-DD/[other=val]/file.parquet
20
+
21
+ Public API is unchanged (`generate_plan`, `show_update_plan`, etc.).
25
22
  """
26
23
 
27
- # -------- Defaults (extended, but original keys retained) --------
28
24
  DEFAULT_PRIORITY_MAP: ClassVar[Dict[str, int]] = {
29
25
  "file_is_recent": 0,
30
26
  "missing_ignored": 0,
31
27
  "overwrite_forced": 1,
32
- "incomplete": 1, # new: prioritize just under overwrite
28
+ "incomplete": 1,
33
29
  "create_missing": 2,
34
30
  "missing_in_history": 3,
35
31
  "stale_in_history": 4,
36
- "future": 99, # new: not actionable
32
+ "future": 99,
37
33
  }
38
34
 
39
35
  DEFAULT_MAX_AGE_MINUTES: int = 1440
40
36
  DEFAULT_HISTORY_DAYS_THRESHOLD: int = 30
41
37
 
42
- # Data/Control file heuristics (can be overridden)
43
38
  DATA_FILE_PATTERNS: ClassVar[Tuple[str, ...]] = (".parquet", ".orc", ".csv", ".json")
44
39
  CONTROL_BASENAMES: ClassVar[Set[str]] = {"_SUCCESS", "_metadata", "_common_metadata"}
45
40
 
41
+ HIVE_PARTITION_RE: ClassVar[re.Pattern] = re.compile(r"([^/=]+)=([^/]+)")
42
+
46
43
  logger_extra = {"sibi_dst_component": __name__}
47
44
 
48
45
  def __init__(
49
46
  self,
50
47
  parquet_storage_path: str,
51
- parquet_filename: str,
48
+ *,
49
+ partition_on: Optional[List[str]] = None,
52
50
  description: str = "Update Planner",
53
51
  reference_date: Union[str, dt.date, None] = None,
54
52
  history_days_threshold: int = DEFAULT_HISTORY_DAYS_THRESHOLD,
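The class docstring above names the two supported layouts, and the new HIVE_PARTITION_RE class attribute is what parses the Hive one. A minimal standalone sketch of both parses, using illustrative paths that do not come from the package:

import datetime as dt
import re

HIVE_PARTITION_RE = re.compile(r"([^/=]+)=([^/]+)")  # same pattern as the class attribute

# Hive layout: every "key=value" path segment is one partition column.
hive_path = "s3://bucket/dataset/partition_date=2025-09-01/region=us/part-0.parquet"
parts = {}
for seg in hive_path.strip("/").split("/"):
    m = HIVE_PARTITION_RE.match(seg)
    if m:
        parts[m.group(1)] = m.group(2)
print(parts)                                           # {'partition_date': '2025-09-01', 'region': 'us'}
print(dt.date.fromisoformat(parts["partition_date"]))  # 2025-09-01

# Legacy layout: the date is positional, .../YYYY/MM/DD/<file>.
legacy_path = "s3://bucket/dataset/2025/09/01/part-0.parquet"
y, m_, d_ = legacy_path.strip("/").split("/")[-4:-1]
print(dt.date(int(y), int(m_), int(d_)))               # 2025-09-01

Note that the legacy branch of _list_prefix below reads int(segs[-3]), int(segs[-2]), int(segs[-1]), i.e. the last three segments of each listed path; a non-numeric final segment (such as a filename) raises and the path is skipped.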
@@ -58,57 +56,61 @@ class UpdatePlanner(ManagedResource):
58
56
  custom_priority_map: Optional[Dict[str, int]] = None,
59
57
  reverse_order: bool = False,
60
58
  show_progress: bool = False,
59
+ hive_style: bool = False,
61
60
  skipped: Optional[List[Union[str, dt.date]]] = None,
62
61
  **kwargs,
63
62
  ):
64
63
  super().__init__(**kwargs)
65
64
 
66
- # ---- Core Configuration ----
67
- self.description: str = description
65
+ # ---- core config ----
68
66
  self.data_path: str = self._ensure_trailing_slash(parquet_storage_path)
69
- self.filename: str = parquet_filename
67
+ self.description: str = description
70
68
  self.reverse_order: bool = reverse_order
71
69
  self.show_progress: bool = show_progress
72
70
  self.overwrite: bool = overwrite
73
71
  self.ignore_missing: bool = ignore_missing
74
72
  self.history_days_threshold: int = history_days_threshold
75
73
  self.max_age_minutes: int = max_age_minutes
76
- # Copy to avoid shared mutation
77
74
  self.priority_map: Dict[str, int] = dict(custom_priority_map) if custom_priority_map else dict(self.DEFAULT_PRIORITY_MAP)
78
75
 
79
- # ---- Execution Parameters ----
76
+ # ---- NEW: Hive partition support ----
77
+ self.hive_style: bool = hive_style
78
+ self.partition_on: List[str] = list(partition_on or ["partition_date"] if self.hive_style else ["year", "month", "day"])
79
+
80
+ # ---- execution knobs ----
80
81
  self.max_threads: int = int(kwargs.get("max_threads", 3))
81
- self.timeout: float = float(kwargs.get("timeout", 30.0)) # legacy overall timeout
82
+ self.timeout: float = float(kwargs.get("timeout", 30.0))
83
+ self.list_timeout: float = float(kwargs.get("list_timeout", self.timeout))
84
+ self.total_timeout: float = float(kwargs.get("total_timeout", self.timeout))
82
85
 
83
- # ---- Date Window ----
86
+ # ---- date window ----
84
87
  self.start_date = kwargs.get("parquet_start_date")
85
88
  self.end_date = kwargs.get("parquet_end_date")
86
89
 
87
- # ---- Reference Date ----
90
+ # ---- reference date ----
88
91
  if reference_date is not None:
89
92
  self.reference_date: dt.date = pd.to_datetime(reference_date).date()
90
93
  else:
91
94
  self.reference_date: dt.date = dt.date.today()
92
95
 
93
- # ---- Feature Flags / Advanced Knobs ----
94
- self.check_completeness: bool = bool(kwargs.get("check_completeness", True))
95
- self.require_success_marker: bool = bool(kwargs.get("require_success_marker", True))
96
+ # ---- completeness/heuristics ----
97
+ self.check_completeness: bool = bool(kwargs.get("check_completeness", False))
98
+ self.require_success_marker: bool = bool(kwargs.get("require_success_marker", False))
96
99
  self.list_granularity: str = str(kwargs.get("list_granularity", "month"))
97
100
  self.data_file_suffixes: Tuple[str, ...] = tuple(kwargs.get("data_file_suffixes", self.DATA_FILE_PATTERNS))
98
- self.list_timeout: float = float(kwargs.get("list_timeout", self.timeout))
99
- self.total_timeout: float = float(kwargs.get("total_timeout", self.timeout))
100
- # Dependency-injected clock (UTC) for tests
101
+
102
+ # ---- clock for tests ----
101
103
  self._utcnow: Callable[[], dt.datetime] = kwargs.get("utcnow_func", None) or (lambda: dt.datetime.utcnow())
102
104
 
103
- # ---- Backward-Compatible Skip Handling ----
104
- # Keep legacy attribute and derive new internal canonical sets.
105
+ # ---- skipped (back-compat) ----
105
106
  self.skipped = list(skipped or kwargs.get("skipped", []) or [])
106
107
  self.skipped_paths: Set[str] = {p.rstrip("/") + "/" for p in self.skipped if isinstance(p, str)}
107
108
  self.skipped_dates: Set[dt.date] = {p for p in self.skipped if isinstance(p, dt.date)}
108
109
 
109
- # ---- Helpers & State ----
110
110
  if not getattr(self, "fs", None):
111
111
  raise ValueError("UpdatePlanner requires a valid fsspec filesystem (fs).")
112
+
113
+ # ---- state ----
112
114
  self.age_checker = FileAgeChecker(debug=self.debug, logger=self.logger)
113
115
  self.plan: pd.DataFrame = pd.DataFrame()
114
116
  self.df_req: pd.DataFrame = pd.DataFrame()
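Given the reworked signature above, a sketch of typical construction. It assumes ManagedResource accepts fs (plus debug/logger) through **kwargs, which the fs validation at the end of __init__ implies, and that UpdatePlanner is importable from its sibi_dst subpackage; paths and dates are illustrative:

import fsspec
# from sibi_dst... import UpdatePlanner  (exact module path is not shown in this diff)

planner = UpdatePlanner(
    "/data/lake/orders",              # parquet_storage_path; a trailing slash is added
    hive_style=True,                  # expect partition_date=YYYY-MM-DD directories
    reference_date="2025-09-10",
    parquet_start_date="2025-08-01",  # read from **kwargs as the default date window
    parquet_end_date="2025-09-10",
    max_threads=4,
    fs=fsspec.filesystem("file"),     # any fsspec filesystem; s3/gcs in practice
)

One parsing subtlety in the partition_on default: a conditional expression binds looser than or, so partition_on or ["partition_date"] if self.hive_style else ["year", "month", "day"] evaluates as (partition_on or ["partition_date"]) if self.hive_style else ["year", "month", "day"], meaning an explicit partition_on takes effect only when hive_style=True.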
@@ -117,41 +119,20 @@ class UpdatePlanner(ManagedResource):
117
119
  # --------------------- Back-compat property bridge ---------------------
118
120
  @property
119
121
  def skipped(self) -> List[Union[str, dt.date]]:
120
- """
121
- Backward-compatible view of skip configuration.
122
- Returns a merged list of path-strings and dates.
123
- """
124
- paths = sorted(self.skipped_paths)
125
- dates = sorted(self.skipped_dates)
126
- return [*paths, *dates]
122
+ return [*sorted(self.skipped_paths), *sorted(self.skipped_dates)]
127
123
 
128
124
  @skipped.setter
129
125
  def skipped(self, value: List[Union[str, dt.date]]) -> None:
130
- """
131
- Accepts legacy assignments like:
132
- planner.skipped = ["s3://.../2025/01/03/", date(2025,1,4)]
133
- and keeps new internals in sync.
134
- """
135
- value = list(value or [])
136
126
  self.skipped_paths = {p.rstrip("/") + "/" for p in value if isinstance(p, str)}
137
127
  self.skipped_dates = {p for p in value if isinstance(p, dt.date)}
138
128
 
139
129
  # --------------------- Public API ---------------------
140
- def has_plan(self) -> bool:
141
- """Check if a plan DataFrame exists and is not empty."""
142
- return isinstance(self.plan, pd.DataFrame) and not self.plan.empty
143
-
144
- def required_count(self) -> int:
145
- """Get the number of dates that require an update."""
146
- return len(self.df_req) if isinstance(self.df_req, pd.DataFrame) else 0
147
-
148
130
  def generate_plan(
149
131
  self,
150
132
  start: Union[str, dt.date, None] = None,
151
133
  end: Union[str, dt.date, None] = None,
152
134
  freq: str = "D",
153
135
  ) -> pd.DataFrame:
154
- """Build a plan for [start, end]. Returns rows that require update (df_req)."""
155
136
  start = start or self.start_date
156
137
  end = end or self.end_date
157
138
  if start is None or end is None:
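The trimmed property bridge above still normalizes legacy skipped assignments into the two canonical sets. A short round-trip sketch, continuing the planner from the construction sketch:

import datetime as dt

planner.skipped = ["/data/lake/orders/2025/01/03", dt.date(2025, 1, 4)]
print(planner.skipped_paths)  # {'/data/lake/orders/2025/01/03/'}  (trailing slash enforced)
print(planner.skipped_dates)  # {datetime.date(2025, 1, 4)}
print(planner.skipped)        # merged view: sorted paths first, then sorted dates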
@@ -162,330 +143,196 @@ class UpdatePlanner(ManagedResource):
162
143
  if sd > ed:
163
144
  raise ValueError(f"Start date ({sd}) must be on or before end date ({ed}).")
164
145
 
165
- log_extra = self._log_extra()
166
- self.logger.info(f"Generating update plan for {self.description} from {sd} to {ed}", extra=log_extra)
146
+ self.logger.info(f"Generating update plan for {self.description} from {sd} to {ed}", extra=self._log_extra())
167
147
  self._generate_plan(sd, ed, freq=freq)
168
- self.logger.info(
169
- f"Plan built for {self.description}: {len(self.plan)} dates evaluated, "
170
- f"{len(self.df_req)} require update",
171
- extra=log_extra,
172
- )
173
148
  return self.df_req
174
149
 
175
150
  def show_update_plan(self) -> None:
176
- """Pretty-print the current plan once per run."""
177
- if not self.has_plan():
178
- self.logger.info("No update plan to show.", extra=self._log_extra())
151
+ if not self.has_plan() or self._printed_this_run:
179
152
  return
180
- if self._printed_this_run:
181
- return
182
-
183
153
  try:
184
154
  from rich.console import Console
185
155
  from rich.table import Table
186
-
187
156
  console = Console()
188
- terminal_width = console.size.width
189
-
190
157
  table = Table(
191
- title=f"Update Plan for {self.data_path}",
192
- show_header=True,
193
- header_style="bold magenta",
194
- expand=True,
195
- pad_edge=False,
158
+ title=f"Update Plan for {self.data_path} [{'Hive' if 'partition_date' in self.partition_on else 'Legacy'}]",
159
+ show_header=True, header_style="bold magenta", expand=True, pad_edge=False,
196
160
  )
197
- max_w = max(terminal_width - 50, 640)
198
161
  for col in self.plan.columns:
199
- if col in {"date", "update_category", "update_priority", "update_required", "file_exists"}:
200
- table.add_column(col, justify="left", no_wrap=True, overflow="fold", max_width=max_w)
201
- elif col == "description":
202
- table.add_column(col, justify="left", overflow="fold", max_width=max_w)
203
- else:
204
- table.add_column(col, justify="left", overflow="fold")
205
-
162
+ table.add_column(col, justify="left", overflow="fold")
206
163
  for _, row in self.plan.iterrows():
207
164
  table.add_row(*(str(row[c]) for c in self.plan.columns))
208
-
209
- with console.capture() as cap:
210
- console.print(table)
211
- self.logger.info(f"Full Update Plan:\n{cap.get().strip()}", extra=self._log_extra())
212
-
213
- except Exception as e:
214
- self.logger.debug(f"Falling back to plain text plan display due to: {e}", extra=self._log_extra())
215
- preview = self.plan.head(200).to_string(index=False)
216
- self.logger.info(f"Update Plan (first 200 rows):\n{preview}", extra=self._log_extra())
217
-
165
+ console.print(table)
166
+ except Exception:
167
+ self.logger.info(f"Update Plan:\n{self.plan.head(50)}", extra=self._log_extra())
218
168
  self._printed_this_run = True
219
169
 
220
170
  def get_tasks_by_priority(self) -> Iterator[Tuple[int, List[dt.date]]]:
221
- """Yield (priority, [dates...]) batches, smallest priority first."""
222
171
  if not self.has_plan():
223
172
  return
224
173
  req = self.plan[self.plan["update_required"]]
225
- if req.empty:
226
- return
227
174
  for priority in sorted(req["update_priority"].unique()):
228
- dates_df = req[req["update_priority"] == priority]
229
- dates_df = dates_df.sort_values(by="date", ascending=not self.reverse_order)
230
- dates = dates_df["date"].tolist()
175
+ dates = req[req["update_priority"] == priority].sort_values(
176
+ by="date", ascending=not self.reverse_order
177
+ )["date"].tolist()
231
178
  if dates:
232
179
  yield int(priority), dates
233
180
 
234
- # --------------------- Plan Generation Internals ---------------------
181
+ def has_plan(self) -> bool:
182
+ return not self.plan.empty
183
+
184
+ def required_count(self) -> int:
185
+ return len(self.df_req)
186
+
187
+ # --------------------- Internals ---------------------
235
188
  def _generate_plan(self, start: dt.date, end: dt.date, freq: str = "D") -> None:
236
- """
237
- Populate self.plan with all dates and self.df_req with the subset to update.
238
- - Pre-lists months or days (configurable) with timeouts that actually apply
239
- - Computes staleness from newest *data* file
240
- - Flags partitions without _SUCCESS as 'incomplete' (unless disabled)
241
- - Marks future dates as 'future' (not actionable)
242
- """
243
189
  dates: List[dt.date] = pd.date_range(start=start, end=end, freq=freq).date.tolist()
244
190
  history_start = self.reference_date - dt.timedelta(days=self.history_days_threshold)
245
191
  rows: List[Dict[str, Any]] = []
246
192
 
247
- def is_future(d: dt.date) -> bool:
248
- return d > self.reference_date
249
-
250
- # Choose listing units
251
- units: List[Tuple[str, dt.date]] = []
252
- if self.list_granularity == "day":
253
- units = [("day", d) for d in dates]
254
- else: # Default to month
193
+ if "partition_date" in self.partition_on:
194
+ caches: Dict[dt.date, Dict[str, Any]] = self._list_prefix(self.data_path)
195
+ else:
196
+ caches: Dict[dt.date, Dict[str, Any]] = {}
255
197
  months = list(self._iter_month_starts(self._month_floor(start), self._month_floor(end)))
256
- units = [("month", m) for m in months]
257
-
258
- self.logger.info(
259
- f"Pre-listing {len(units)} {'days' if self.list_granularity=='day' else 'month prefixes'} for {self.description}",
260
- extra=self._log_extra(),
261
- )
262
-
263
- # --- Parallel File Listing with Realistic Timeouts ---
264
- caches: Dict[dt.date, Dict[dt.date, Dict[str, Any]]] = {}
265
- max_workers = max(1, self.max_threads) # Ensure at least 1 worker
266
-
267
- with ThreadPoolExecutor(max_workers=max_workers, thread_name_prefix="update_planner") as executor:
268
- future_to_unit: Dict[Any, Tuple[str, dt.date]] = {}
269
- for kind, val in units:
270
- prefix = self._day_prefix(val) if kind == "day" else self._month_prefix(val)
271
- future = executor.submit(self._list_prefix, prefix)
272
- future_to_unit[future] = (kind, val)
273
-
274
- # Wait for all futures with a total timeout
275
- done_futures, not_done_futures = wait(future_to_unit.keys(), timeout=self.total_timeout or None)
276
-
277
- # Process completed futures
278
- for future in done_futures:
279
- kind, val = future_to_unit[future]
280
- try:
281
- # Get the result with a per-listing timeout
282
- cache = future.result(timeout=self.list_timeout or None)
283
- except Exception as e:
284
- self.logger.warning(f"Listing failed for {kind}:{val} — {e}", extra=self._log_extra())
285
- cache = {}
286
-
287
- if kind == "month":
288
- caches[val] = cache
289
- else: # day
290
- # Store day listing results in its month's bucket for summarization
291
- month_key = val.replace(day=1)
292
- caches.setdefault(month_key, {}).update(cache)
293
-
294
- # Handle timed-out futures
295
- for future in not_done_futures:
296
- kind, val = future_to_unit[future]
297
- self.logger.error(f"Listing timed out for {kind}:{val}", extra=self._log_extra())
298
- if kind == "month":
299
- caches[val] = {}
300
- else: # day
301
- month_key = val.replace(day=1)
302
- caches.setdefault(month_key, {})
198
+ with ThreadPoolExecutor(max_workers=max(1, self.max_threads)) as ex:
199
+ future_to_unit = {ex.submit(self._list_prefix, self._month_prefix(m)): m for m in months}
200
+ done, _ = wait(future_to_unit.keys(), timeout=self.total_timeout or None)
201
+ for fut in done:
202
+ m = future_to_unit[fut]
203
+ try:
204
+ caches[m] = fut.result(timeout=self.list_timeout or None)
205
+ except Exception:
206
+ caches[m] = {}
303
207
 
304
- # --- Summarize Each Date and Build Plan ---
305
208
  for d in dates:
306
- if is_future(d):
307
- rows.append({
308
- "date": d, "file_exists": False, "file_age_minutes": None,
309
- "update_category": "future", "update_priority": self.priority_map.get("future", 99),
310
- "update_required": False, "description": self.description,
311
- })
209
+ if d > self.reference_date:
210
+ rows.append(self._row_future(d))
312
211
  continue
313
-
314
212
  if self._is_skipped(d):
315
- self.logger.debug(f"Skipping {d}: in skipped set.", extra=self._log_extra())
316
- # Append a row even for skipped dates, using default policy logic
317
213
  rows.append(self._make_row(d, history_start, False, None))
318
214
  continue
319
215
 
320
- # Get the cache for the month containing this date
321
- month_key = d.replace(day=1)
322
- cache = caches.get(month_key, {})
216
+ cache = caches if "partition_date" in self.partition_on else caches.get(d.replace(day=1), {})
323
217
  exists, age_min, incomplete = self._summarize_partition(d, cache)
324
-
325
- # Incomplete partitions get their own category (unless overwrite forces update)
326
218
  if incomplete and not self.overwrite:
327
- rows.append({
328
- "date": d, "file_exists": True, "file_age_minutes": age_min,
329
- "update_category": "incomplete", "update_priority": self.priority_map.get("incomplete", 1),
330
- "update_required": True, "description": self.description,
331
- })
332
- continue
333
-
334
- # Fall back to the standard policy logic (overwrite / history / staleness / missing)
335
- rows.append(self._make_row(d, history_start, exists, age_min))
219
+ rows.append(self._row_incomplete(d, age_min))
220
+ else:
221
+ rows.append(self._make_row(d, history_start, exists, age_min))
336
222
 
337
- # --- Finalize DataFrame ---
338
223
  df = pd.DataFrame.from_records(rows)
339
224
  if not df.empty:
340
225
  df["date"] = pd.to_datetime(df["date"]).dt.date
341
226
  df["update_priority"] = df["update_priority"].astype(int)
342
-
343
- df = df.sort_values(
227
+ self.plan = df.sort_values(
344
228
  by=["update_priority", "date"],
345
229
  ascending=[True, not self.reverse_order],
346
- kind="mergesort", # Stable sort
230
+ kind="mergesort",
347
231
  ).reset_index(drop=True)
232
+ self.df_req = self.plan[self.plan["update_required"]].copy()
348
233
 
349
- self.plan = df
350
- self.df_req = df[df["update_required"]].copy()
351
- self._printed_this_run = False
352
-
353
- # --------------------- File System Interaction ---------------------
354
234
  def _list_prefix(self, prefix: str) -> Dict[dt.date, Dict[str, Any]]:
355
- """
356
- Return {date: {'files': [paths], 'has_success': bool, 'newest_ts': datetime|None}} under prefix.
357
- Uses fsspec.find(detail=True) for one-shot listing with metadata (mtime).
358
- """
359
235
  try:
360
- # Returns {path: info_dict} when detail=True
361
236
  items: Dict[str, Any] = self.fs.find(prefix, withdirs=False, detail=True)
362
- except Exception as e:
363
- self.logger.warning(f"Listing failed for {prefix}: {e}", extra=self._log_extra())
237
+ except Exception:
364
238
  return {}
365
239
 
366
240
  out: Dict[dt.date, Dict[str, Any]] = {}
367
241
  for path, info in items.items():
368
- # Extract date from path structure (e.g., .../YYYY/MM/DD/file)
369
- parts = path.strip("/").split("/")
370
- if len(parts) < 3: # Need at least year, month, day
371
- continue
372
- try:
373
- y, m, dd = int(parts[-3]), int(parts[-2]), int(parts[-1])
374
- d = dt.date(y, m, dd)
375
- except (ValueError, IndexError):
376
- # Not a date-partitioned path, skip
242
+ d: Optional[dt.date] = None
243
+ if "partition_date" in self.partition_on:
244
+ parts = self._extract_partitions(path)
245
+ if "partition_date" in parts:
246
+ try:
247
+ d = dt.date.fromisoformat(parts["partition_date"])
248
+ except Exception:
249
+ continue
250
+ else:
251
+ segs = path.strip("/").split("/")
252
+ if len(segs) >= 3:
253
+ try:
254
+ y, m, dd = int(segs[-3]), int(segs[-2]), int(segs[-1])
255
+ d = dt.date(y, m, dd)
256
+ except Exception:
257
+ continue
258
+ if d is None:
377
259
  continue
378
260
 
379
- # Initialize or get the record for this date
380
261
  rec = out.setdefault(d, {"files": [], "has_success": False, "newest_ts": None})
381
- base_name = path.rsplit("/", 1)[-1]
382
-
383
- # Check for _SUCCESS marker
384
- if base_name == "_SUCCESS":
262
+ base = path.rsplit("/", 1)[-1]
263
+ if base == "_SUCCESS":
385
264
  rec["has_success"] = True
386
-
387
- # Check if it's a relevant data file
388
265
  if self._is_data_file(path):
389
266
  rec["files"].append(path)
390
- # Determine the modification time
391
- mtime = info.get("mtime") or info.get("LastModified") or info.get("last_modified")
392
- ts = None
393
- if isinstance(mtime, (int, float)):
394
- ts = dt.datetime.utcfromtimestamp(mtime)
395
- elif isinstance(mtime, str):
396
- try:
397
- ts = pd.to_datetime(mtime, utc=True).to_pydatetime()
398
- except Exception:
399
- ts = None
400
- elif isinstance(mtime, dt.datetime):
401
- # Ensure timezone awareness for comparison
402
- ts = mtime if mtime.tzinfo else mtime.replace(tzinfo=dt.timezone.utc)
403
-
404
- # Update the newest timestamp for this partition
405
- if ts:
406
- current_newest = rec["newest_ts"]
407
- # Naive comparison after ensuring tz awareness
408
- ts_naive = ts.replace(tzinfo=None) if ts.tzinfo else ts
409
- current_naive = current_newest.replace(tzinfo=None) if current_newest and current_newest.tzinfo else current_newest
410
- if current_naive is None or ts_naive > current_naive:
411
- rec["newest_ts"] = ts
412
-
267
+ ts = self._extract_mtime(info)
268
+ if ts and (rec["newest_ts"] is None or ts > rec["newest_ts"]):
269
+ rec["newest_ts"] = ts
413
270
  return out
414
271
 
415
- def _summarize_partition(
416
- self, d: dt.date, cache: Dict[dt.date, Dict[str, Any]]
417
- ) -> Tuple[bool, Optional[float], bool]:
418
- """
419
- Summarize the state of a partition for a given date.
272
+ def _extract_partitions(self, path: str) -> Dict[str, str]:
273
+ out: Dict[str, str] = {}
274
+ for seg in path.strip("/").split("/"):
275
+ m = self.HIVE_PARTITION_RE.match(seg)
276
+ if m:
277
+ out[m.group(1)] = m.group(2)
278
+ return out
420
279
 
421
- Returns:
422
- Tuple[bool, Optional[float], bool]: (exists, age_minutes, incomplete)
423
- - exists: True iff at least one *data* file is present for day `d`
424
- - age_minutes: minutes since the NEWEST data file (UTC 'now'), or None if not determinable
425
- - incomplete: True if files exist but required _SUCCESS is missing (and checks are enabled)
426
- """
280
+ def _summarize_partition(self, d: dt.date, cache: Dict[dt.date, Dict[str, Any]]) -> Tuple[bool, Optional[float], bool]:
427
281
  rec = cache.get(d, {})
428
282
  files = rec.get("files", [])
429
- has_success = bool(rec.get("has_success", False))
430
- exists = len(files) > 0
431
-
283
+ exists = bool(files)
432
284
  if not exists:
433
285
  return False, None, False
434
-
286
+ has_success = rec.get("has_success", False)
435
287
  newest_ts = rec.get("newest_ts")
436
- age_min: Optional[float] = None
288
+ age_min = None
437
289
  if newest_ts:
438
- now_utc = self._utcnow().replace(tzinfo=None) # Get current UTC time (naive)
439
- ts_naive = newest_ts.replace(tzinfo=None) if newest_ts.tzinfo else newest_ts # Make mtime naive
440
- age_min = max(0.0, (now_utc - ts_naive).total_seconds() / 60.0)
441
-
290
+ now = self._utcnow().replace(tzinfo=None)
291
+ ts = newest_ts.replace(tzinfo=None) if newest_ts.tzinfo else newest_ts
292
+ age_min = max(0.0, (now - ts).total_seconds() / 60.0)
442
293
  incomplete = self.check_completeness and self.require_success_marker and not has_success
443
294
  return exists, age_min, incomplete
444
295
 
445
- # --------------------- Policy Logic ---------------------
446
- def _make_row(
447
- self,
448
- date: dt.date,
449
- history_start: dt.date,
450
- file_exists: bool,
451
- file_age: Optional[float],
452
- ) -> Dict[str, Any]:
453
- """
454
- Build a single plan row based on flags and thresholds.
455
- (Categories 'future'/'incomplete' are injected earlier by _generate_plan.)
456
- """
457
- within_history = history_start <= date <= self.reference_date
458
- update_required = False
459
- category = "unknown"
460
-
296
+ def _make_row(self, d: dt.date, history_start: dt.date, exists: bool, age_min: Optional[float]) -> Dict[str, Any]:
297
+ within_history = history_start <= d <= self.reference_date
298
+ category, update_required = "unknown", False
461
299
  if self.overwrite:
462
- category = "overwrite_forced"
463
- update_required = True
300
+ category, update_required = "overwrite_forced", True
464
301
  elif within_history:
465
- if not file_exists:
466
- category = "missing_in_history"
467
- update_required = True
468
- elif file_age is not None and file_age > self.max_age_minutes:
469
- category = "stale_in_history"
470
- update_required = True
302
+ if not exists:
303
+ category, update_required = "missing_in_history", True
304
+ elif age_min is not None and age_min > self.max_age_minutes:
305
+ category, update_required = "stale_in_history", True
471
306
  else:
472
307
  category = "file_is_recent"
473
- elif not file_exists and not self.ignore_missing:
474
- category = "create_missing"
475
- update_required = True
308
+ elif not exists and not self.ignore_missing:
309
+ category, update_required = "create_missing", True
476
310
  else:
477
- category = "missing_ignored" if not file_exists else "file_is_recent"
478
-
311
+ category = "missing_ignored" if not exists else "file_is_recent"
479
312
  return {
480
- "date": date,
481
- "file_exists": file_exists,
482
- "file_age_minutes": file_age,
313
+ "date": d,
314
+ "file_exists": exists,
315
+ "file_age_minutes": age_min,
483
316
  "update_category": category,
484
317
  "update_priority": self.priority_map.get(category, 99),
485
318
  "update_required": update_required,
486
319
  "description": self.description,
487
320
  }
488
321
 
322
+ def _row_future(self, d: dt.date) -> Dict[str, Any]:
323
+ return {
324
+ "date": d, "file_exists": False, "file_age_minutes": None,
325
+ "update_category": "future", "update_priority": self.priority_map.get("future", 99),
326
+ "update_required": False, "description": self.description,
327
+ }
328
+
329
+ def _row_incomplete(self, d: dt.date, age_min: Optional[float]) -> Dict[str, Any]:
330
+ return {
331
+ "date": d, "file_exists": True, "file_age_minutes": age_min,
332
+ "update_category": "incomplete", "update_priority": self.priority_map.get("incomplete", 1),
333
+ "update_required": True, "description": self.description,
334
+ }
335
+
489
336
  # --------------------- Utilities ---------------------
490
337
  @staticmethod
491
338
  def _ensure_trailing_slash(path: str) -> str:
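Earlier in this hunk the month pre-listing collapsed into a compact ThreadPoolExecutor plus wait() pattern. A standalone sketch of that timeout behavior with stand-in work (fake_list is a placeholder, not the package's listing function):

import time
from concurrent.futures import ThreadPoolExecutor, wait

def fake_list(prefix):
    time.sleep(5.0 if "slow" in prefix else 0.1)  # simulate one slow backend listing
    return {prefix: {"files": []}}

caches = {}
with ThreadPoolExecutor(max_workers=3) as ex:
    future_to_unit = {ex.submit(fake_list, p): p for p in ["2025/07/", "2025/08/", "slow/2025/09/"]}
    done, not_done = wait(future_to_unit.keys(), timeout=1.0)       # total_timeout
    for fut in done:
        try:
            caches[future_to_unit[fut]] = fut.result(timeout=1.0)   # list_timeout
        except Exception:
            caches[future_to_unit[fut]] = {}
    # the context manager still blocks on the slow worker at exit (shutdown waits),
    # matching the package's behavior
print(sorted(caches))  # ['2025/07/', '2025/08/']

Prefixes still pending when the total timeout expires simply get no cache entry, which the rewritten _generate_plan treats the same as an empty listing via caches.get(..., {}).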
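The same hunk compresses _make_row into tuple assignments without changing the decision tree. A self-contained restatement of that tree, convenient for checking expected categories (same branches, simplified signature; not the package's API):

import datetime as dt

def categorize(d, exists, age_min, *, reference_date, history_start,
               overwrite=False, ignore_missing=False, max_age_minutes=1440):
    # mirrors _make_row: overwrite wins, then the history window, then missing handling
    if overwrite:
        return "overwrite_forced", True
    if history_start <= d <= reference_date:
        if not exists:
            return "missing_in_history", True
        if age_min is not None and age_min > max_age_minutes:
            return "stale_in_history", True
        return "file_is_recent", False
    if not exists and not ignore_missing:
        return "create_missing", True
    return ("missing_ignored" if not exists else "file_is_recent"), False

ref = dt.date(2025, 9, 10)
hist = ref - dt.timedelta(days=30)  # history window opens 2025-08-11
print(categorize(dt.date(2025, 9, 1), True, 2000.0, reference_date=ref, history_start=hist))
# ('stale_in_history', True)
print(categorize(dt.date(2025, 6, 1), False, None, reference_date=ref, history_start=hist))
# ('create_missing', True)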
@@ -501,29 +348,36 @@ class UpdatePlanner(ManagedResource):
501
348
  while cur <= end:
502
349
  yield cur
503
350
  y, m = cur.year, cur.month
504
- # Move to the first day of the next month
505
- if m == 12:
506
- cur = dt.date(y + 1, 1, 1)
507
- else:
508
- cur = dt.date(y, m + 1, 1)
351
+ cur = dt.date(y + 1, 1, 1) if m == 12 else dt.date(y, m + 1, 1)
509
352
 
510
353
  def _month_prefix(self, month_start: dt.date) -> str:
511
354
  return f"{self.data_path}{month_start.year}/{month_start.month:02d}/"
512
355
 
513
- def _day_prefix(self, d: dt.date) -> str:
514
- return f"{self.data_path}{d.year}/{d.month:02d}/{d.day:02d}/"
515
-
516
356
  def _is_data_file(self, path: str) -> bool:
517
357
  base = path.rsplit("/", 1)[-1]
518
- # Skip hidden files, directories, and control files
519
358
  if not base or base.startswith(".") or base in self.CONTROL_BASENAMES:
520
359
  return False
521
- lower_base = base.lower()
522
- return any(lower_base.endswith(suf) for suf in self.data_file_suffixes)
360
+ return any(base.lower().endswith(suf) for suf in self.data_file_suffixes)
361
+
362
+ @staticmethod
363
+ def _extract_mtime(info: Dict[str, Any]) -> Optional[dt.datetime]:
364
+ mtime = info.get("mtime") or info.get("LastModified") or info.get("last_modified")
365
+ if isinstance(mtime, (int, float)):
366
+ return dt.datetime.utcfromtimestamp(mtime)
367
+ if isinstance(mtime, str):
368
+ try:
369
+ return pd.to_datetime(mtime, utc=True).to_pydatetime()
370
+ except Exception:
371
+ return None
372
+ if isinstance(mtime, dt.datetime):
373
+ return mtime if mtime.tzinfo else mtime.replace(tzinfo=dt.timezone.utc)
374
+ return None
523
375
 
524
376
  def _is_skipped(self, d: dt.date) -> bool:
525
- """True if the date or its canonical path is in the skip config."""
526
- canonical_path = f"{self.data_path}{d.year}/{d.month:02d}/{d.day:02d}/"
377
+ if "partition_date" in self.partition_on:
378
+ canonical_path = f"{self.data_path}partition_date={d.isoformat()}/"
379
+ else:
380
+ canonical_path = f"{self.data_path}{d.year}/{d.month:02d}/{d.day:02d}/"
527
381
  return (d in self.skipped_dates) or (canonical_path in self.skipped_paths)
528
382
 
529
383
  def _log_extra(self, **overrides) -> Dict[str, Any]:
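The new _extract_mtime staticmethod above centralizes the mtime normalization that _list_prefix previously did inline. Being a staticmethod it can be exercised directly; the dictionaries below mimic three metadata shapes fsspec backends commonly return (illustrative values, assuming UpdatePlanner is imported):

print(UpdatePlanner._extract_mtime({"mtime": 1757462400.0}))
# naive UTC datetime built from an epoch number (e.g. local filesystems)
print(UpdatePlanner._extract_mtime({"LastModified": "2025-09-10T00:00:00Z"}))
# tz-aware datetime parsed via pandas (e.g. S3-style string metadata)
print(UpdatePlanner._extract_mtime({}))
# None: no recognized key, so the partition simply gets no age

Because epoch inputs come back naive while string and datetime inputs come back tz-aware, the ts > rec["newest_ts"] comparison in _list_prefix relies on a single backend reporting one consistent shape; _summarize_partition strips tzinfo before computing the age either way.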
@@ -534,502 +388,4 @@ class UpdatePlanner(ManagedResource):
534
388
  "action_module_name": "update_plan",
535
389
  }
536
390
  base.update(overrides)
537
- return base
538
-
539
-
540 … 1035
- # … (≈500 lines of commented-out legacy implementation: a verbatim duplicate of the pre-change code shown in the hunks above, plus stray chat-assistant citation links; all deleted in 2025.9.10) …
391
+ return base
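End to end, the planner is consumed in priority order. A closing sketch (process_date is a hypothetical downstream writer, not part of the package):

df_req = planner.generate_plan()  # also populates planner.plan
planner.show_update_plan()        # rich table when available, logged head(50) otherwise
for priority, dates in planner.get_tasks_by_priority():
    for d in dates:               # date order already honors reverse_order
        process_date(d)           # hypothetical: rewrite the partition for date d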