sibi-dst 2025.8.7__py3-none-any.whl → 2025.8.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sibi_dst/df_helper/_df_helper.py +105 -89
- sibi_dst/df_helper/_parquet_artifact.py +11 -10
- sibi_dst/df_helper/_parquet_reader.py +4 -0
- sibi_dst/df_helper/backends/parquet/_parquet_options.py +504 -214
- sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +11 -10
- sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +9 -8
- sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +4 -76
- sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +0 -104
- sibi_dst/utils/boilerplate/__init__.py +6 -0
- sibi_dst/utils/boilerplate/base_data_artifact.py +110 -0
- sibi_dst/utils/boilerplate/base_data_cube.py +79 -0
- sibi_dst/utils/data_wrapper.py +22 -263
- sibi_dst/utils/iceberg_saver.py +126 -0
- sibi_dst/utils/log_utils.py +108 -529
- sibi_dst/utils/parquet_saver.py +110 -9
- sibi_dst/utils/progress/__init__.py +5 -0
- sibi_dst/utils/progress/jobs.py +82 -0
- sibi_dst/utils/progress/sse_runner.py +82 -0
- sibi_dst/utils/storage_hive.py +38 -1
- sibi_dst/utils/update_planner.py +617 -116
- {sibi_dst-2025.8.7.dist-info → sibi_dst-2025.8.9.dist-info}/METADATA +3 -2
- {sibi_dst-2025.8.7.dist-info → sibi_dst-2025.8.9.dist-info}/RECORD +23 -16
- {sibi_dst-2025.8.7.dist-info → sibi_dst-2025.8.9.dist-info}/WHEEL +0 -0
sibi_dst/utils/update_planner.py
CHANGED
@@ -1,5 +1,5 @@
 import datetime as dt
-from concurrent.futures import ThreadPoolExecutor, as_completed
+from concurrent.futures import ThreadPoolExecutor, wait
 from typing import List, Optional, Dict, Union, Tuple, Set, Iterator, ClassVar

 import pandas as pd
@@ -11,22 +11,37 @@ from . import FileAgeChecker
 class UpdatePlanner(ManagedResource):
     """
     Scans date-partitioned storage and builds an 'update plan' for dates that need processing.
-    Produces a Pandas DataFrame plan; it does *not* load data frames, so Dask-vs-Pandas
-    concerns do not apply here.
+    Backward compatible: public API and legacy attributes preserved; enhancements are opt-in via kwargs.
+
+    Enhancements:
+      - Batch listings via fsspec.find(..., detail=True) to avoid N×exists() roundtrips.
+      - Age computed from the NEWEST data file (ignoring control files).
+      - Optional completeness check: partitions with files but no _SUCCESS => 'incomplete'.
+      - Real timeouts using concurrent.futures.wait(...).
+      - Future dates marked as 'future' (not actionable).
     """

+    # -------- Defaults (extended, but original keys retained) --------
     DEFAULT_PRIORITY_MAP: ClassVar[Dict[str, int]] = {
         "file_is_recent": 0,
         "missing_ignored": 0,
         "overwrite_forced": 1,
+        "incomplete": 1,  # new: prioritize just under overwrite
         "create_missing": 2,
         "missing_in_history": 3,
         "stale_in_history": 4,
+        "future": 99,  # new: not actionable
     }

     DEFAULT_MAX_AGE_MINUTES: int = 1440
     DEFAULT_HISTORY_DAYS_THRESHOLD: int = 30

+    # Data/Control file heuristics (can be overridden)
+    DATA_FILE_PATTERNS: ClassVar[Tuple[str, ...]] = (".parquet", ".orc", ".csv", ".json")
+    CONTROL_BASENAMES: ClassVar[Set[str]] = {"_SUCCESS", "_metadata", "_common_metadata"}
+
+    logger_extra = {"sibi_dst_component": __name__}
+
     def __init__(
         self,
         parquet_storage_path: str,
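
A note on the first docstring bullet: the batch-listing idea replaces one exists() roundtrip per date with a single recursive listing per prefix. A minimal, self-contained sketch of the pattern (the /data paths are illustrative; requires fsspec):

import fsspec

fs = fsspec.filesystem("file")  # any fsspec-compatible filesystem behaves the same way

# Before: one roundtrip per partition.
checks = [fs.exists(f"/data/2025/01/{day:02d}/part.parquet") for day in range(1, 32)]

# After: one recursive find(detail=True) returns {path: info}, mtimes included,
# for every file under the month prefix.
listing = fs.find("/data/2025/01/", withdirs=False, detail=True)
for path, info in listing.items():
    print(path, info.get("mtime"))
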
@@ -40,12 +55,12 @@ class UpdatePlanner(ManagedResource):
         custom_priority_map: Optional[Dict[str, int]] = None,
         reverse_order: bool = False,
         show_progress: bool = False,
-        skipped: Optional[List[str]] = None,
+        skipped: Optional[List[Union[str, dt.date]]] = None,
         **kwargs,
     ):
         super().__init__(**kwargs)

-        # Public-ish attributes
+        # ---- Existing public-ish attributes (unchanged) ----
         self.description = description
         self.data_path = self._ensure_trailing_slash(parquet_storage_path)
         self.filename = parquet_filename
@@ -55,68 +70,113 @@ class UpdatePlanner(ManagedResource):
         self.ignore_missing = ignore_missing
         self.history_days_threshold = history_days_threshold
         self.max_age_minutes = max_age_minutes
-        self.priority_map = custom_priority_map or self.DEFAULT_PRIORITY_MAP
-        self.skipped = set(skipped or [])
+        # copy to avoid shared mutation
+        self.priority_map = dict(custom_priority_map) if custom_priority_map else dict(self.DEFAULT_PRIORITY_MAP)

-        # Execution knobs from kwargs (fed by upstream config)
+        # Execution knobs from kwargs (kept)
         self.max_threads: int = int(kwargs.get("max_threads", 3))
-        self.timeout: float = float(kwargs.get("timeout", 30.0))
+        self.timeout: float = float(kwargs.get("timeout", 30.0))  # legacy overall timeout

-        # Date window
+        # Date window (kept)
         self.start_date = kwargs.get("parquet_start_date")
         self.end_date = kwargs.get("parquet_end_date")

-        # Reference "today"
-        if reference_date is None:
-            self.reference_date = dt.date.today()
-        else:
-            self.reference_date = pd.to_datetime(reference_date).date()
+        # Reference date (kept; tolerant)
+        self.reference_date = pd.to_datetime(reference_date).date() if reference_date is not None else dt.date.today()

-        # Helpers & state
+        # Helpers & state (kept)
         self.age_checker = FileAgeChecker(debug=self.debug, logger=self.logger)
         self.plan: pd.DataFrame = pd.DataFrame()
         self.df_req: pd.DataFrame = pd.DataFrame()
-
-        # internal run flag to print once per run if caller reuses instance
         self._printed_this_run: bool = False

-
+        # ---- New feature flags / knobs (all default to safe choices) ----
+        # Completeness check via _SUCCESS
+        self.check_completeness: bool = bool(kwargs.get("check_completeness", True))
+        self.require_success_marker: bool = bool(kwargs.get("require_success_marker", True))
+        # Listing granularity: 'month' (default) or 'day'
+        self.list_granularity: str = str(kwargs.get("list_granularity", "month"))
+        # Data file suffixes to consider for age (default common formats)
+        self.data_file_suffixes: Tuple[str, ...] = tuple(kwargs.get("data_file_suffixes", self.DATA_FILE_PATTERNS))
+        # Timeouts
+        self.list_timeout: float = float(kwargs.get("list_timeout", self.timeout))    # per-future
+        self.total_timeout: float = float(kwargs.get("total_timeout", self.timeout))  # across all listings
+        # Dependency-injected clock (UTC) for tests
+        self._utcnow = kwargs.get("utcnow_func", None) or (lambda: dt.datetime.utcnow())
+
+        # ------------ Backward-compatible skip handling ------------
+        # Keep legacy attribute and derive new internal canonical sets.
+        self.skipped = list(skipped or kwargs.get("skipped", []) or [])
+        self.skipped_paths = {p.rstrip("/") + "/" for p in self.skipped if isinstance(p, str)}
+        self.skipped_dates = {p for p in self.skipped if isinstance(p, dt.date)}
+
+        # Validate fs presence (you rely on it)
+        if not getattr(self, "fs", None):
+            raise ValueError("UpdatePlanner requires a valid fsspec filesystem (fs).")
+
+    # --------------------- Back-compat property bridge ---------------------
+    @property
+    def skipped(self) -> List[Union[str, dt.date]]:  # type: ignore[override]
+        """
+        Backward-compatible view of skip configuration.
+        Returns a merged list of path-strings and dates.
+        """
+        paths = sorted(self.skipped_paths)
+        dates = sorted(self.skipped_dates)
+        return [*paths, *dates]
+
+    @skipped.setter
+    def skipped(self, value: List[Union[str, dt.date]]) -> None:  # type: ignore[override]
+        """
+        Accepts legacy assignments like:
+            planner.skipped = ["s3://.../2025/01/03/", date(2025,1,4)]
+        and keeps new internals in sync.
+        """
+        value = list(value or [])
+        self.skipped_paths = {p.rstrip("/") + "/" for p in value if isinstance(p, str)}
+        self.skipped_dates = {p for p in value if isinstance(p, dt.date)}
+
+    # --------------------- public helpers (kept) ---------------------
     def has_plan(self) -> bool:
-        """Safe truthiness for plan existence."""
         return isinstance(self.plan, pd.DataFrame) and not self.plan.empty

     def required_count(self) -> int:
         return 0 if not isinstance(self.df_req, pd.DataFrame) else len(self.df_req)

-    # --------------------- core API ---------------------
+    # --------------------- core API (kept) ---------------------
     def generate_plan(
         self,
         start: Union[str, dt.date, None] = None,
         end: Union[str, dt.date, None] = None,
         freq: str = "D",
     ) -> pd.DataFrame:
-        """
-        Build a plan for [start, end]. Returns rows that require update (df_req).
-        """
+        """Build a plan for [start, end]. Returns rows that require update (df_req)."""
         start = start or self.start_date
         end = end or self.end_date
+        if start is None or end is None:
+            raise ValueError("start and end must be provided (or set via parquet_* kwargs).")
+
         sd = pd.to_datetime(start).date()
         ed = pd.to_datetime(end).date()
         if sd > ed:
             raise ValueError(f"Start date ({sd}) must be on or before end date ({ed}).")

-        self.logger.info(f"Generating update plan for {self.description} from {sd} to {ed}", extra=self.logger_extra)
+        self.logger.info(
+            f"Generating update plan for {self.description} from {sd} to {ed}",
+            extra=self._log_extra(),
+        )
         self._generate_plan(sd, ed, freq=freq)
         self.logger.info(
             f"Plan built for {self.description}: {len(self.plan)} dates evaluated, "
-            f"{len(self.df_req)} require update"
+            f"{len(self.df_req)} require update",
+            extra=self._log_extra(),
         )
         return self.df_req

     def show_update_plan(self) -> None:
-        """Pretty-print the current plan once per run."""
+        """Pretty-print the current plan once per run, now respecting terminal width fully."""
         if not self.has_plan():
-            self.logger.info("No update plan to show.")
+            self.logger.info("No update plan to show.", extra=self._log_extra())
             return
         if self._printed_this_run:
             return
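
The new property bridge keeps legacy assignments to planner.skipped working while the planner internally consults skipped_paths and skipped_dates. A hypothetical usage sketch (assumes an already-constructed UpdatePlanner named planner):

import datetime as dt

planner.skipped = ["s3://bucket/data/2025/01/03/", dt.date(2025, 1, 4)]  # legacy mixed list
assert "s3://bucket/data/2025/01/03/" in planner.skipped_paths           # normalized path set
assert dt.date(2025, 1, 4) in planner.skipped_dates                      # date set
print(planner.skipped)  # merged read-back view: paths first, then dates
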
@@ -124,33 +184,43 @@ class UpdatePlanner(ManagedResource):
         try:
             from rich.console import Console
             from rich.table import Table
-        except Exception:
-            # Fallback: plain text
-            self.logger.info(f"Update Plan (plain list):\n{self.plan.to_string(index=False)}")
-            self._printed_this_run = True
-            return

-        table = Table(
-            title=f"Update Plan for {self.data_path}",
-            show_header=True,
-            header_style="bold magenta",
-        )
-        for column in self.plan.columns:
-            table.add_column(column, justify="left")
+            console = Console()  # auto-detect terminal size
+            terminal_width = console.size.width
+
+            table = Table(
+                title=f"Update Plan for {self.data_path}",
+                show_header=True,
+                header_style="bold magenta",
+                expand=True,  # fill available width
+                pad_edge=False,
+            )
+            max_w = max(terminal_width - 50, 640)
+            for col in self.plan.columns:
+                if col in {"date", "update_category", "update_priority", "update_required", "file_exists"}:
+                    table.add_column(col, justify="left", no_wrap=True, overflow="fold", max_width=max_w)
+                elif col == "description":
+                    # Let description wrap, but set a max width to avoid huge columns
+                    table.add_column(col, justify="left", overflow="fold", max_width=max_w)
+                else:
+                    table.add_column(col, justify="left", overflow="fold")
+
+            for _, row in self.plan.iterrows():
+                table.add_row(*(str(row[c]) for c in self.plan.columns))
+
+            # Capture with the same console so width stays consistent
+            with console.capture() as cap:
+                console.print(table)
+            self.logger.info(f"Full Update Plan:\n{cap.get().strip()}", extra=self._log_extra())

-        for _, row in self.plan.iterrows():
-            table.add_row(*(str(row[col]) for col in self.plan.columns))
+        except Exception:
+            preview = self.plan.head(200).to_string(index=False)
+            self.logger.info(f"Update Plan (first 200 rows):\n{preview}", extra=self._log_extra())

-        console = Console()
-        with console.capture() as capture:
-            console.print(table)
-        self.logger.info(f"Full Update Plan:\n{capture.get().strip()}", extra={"date_of_update": self.reference_date.strftime('%Y-%m-%d'), "dataclass": self.description, "action_module_name": "update_plan"})
         self._printed_this_run = True

     def get_tasks_by_priority(self) -> Iterator[Tuple[int, List[dt.date]]]:
-        """
-        Yield (priority, [dates...]) batches, smallest priority first.
-        """
+        """Yield (priority, [dates...]) batches, smallest priority first."""
         if not self.has_plan():
             return
         req = self.plan[self.plan["update_required"]]
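
The rewritten show_update_plan renders through rich and routes the result to the logger rather than stdout. A standalone sketch of the Console.capture() pattern it relies on (requires rich; the column names here are illustrative):

import logging
from rich.console import Console
from rich.table import Table

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("plan")

console = Console()
table = Table(title="Update Plan", show_header=True, expand=True)
table.add_column("date", justify="left", no_wrap=True)
table.add_column("update_category", justify="left", overflow="fold")
table.add_row("2025-01-03", "stale_in_history")

with console.capture() as cap:  # rendering goes to the capture buffer, not the terminal
    console.print(table)
logger.info("Full Update Plan:\n%s", cap.get().strip())
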
@@ -158,7 +228,6 @@ class UpdatePlanner(ManagedResource):
             return
         for priority in sorted(req["update_priority"].unique()):
             dates_df = req[req["update_priority"] == priority]
-            # sort within group
             dates_df = dates_df.sort_values(by="date", ascending=not self.reverse_order)
             dates = dates_df["date"].tolist()
             if dates:
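
For context, a hypothetical consumer of get_tasks_by_priority (planner and process_partition are placeholders, not part of this diff):

for priority, dates in planner.get_tasks_by_priority():
    # Lower numbers run first: overwrite_forced/incomplete before create_missing,
    # then missing_in_history, then stale_in_history.
    for d in dates:
        process_partition(d)  # hypothetical downstream worker
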
@@ -169,42 +238,205 @@ class UpdatePlanner(ManagedResource):
     def _ensure_trailing_slash(path: str) -> str:
         return path.rstrip("/") + "/"

+    @staticmethod
+    def _month_floor(d: dt.date) -> dt.date:
+        return d.replace(day=1)
+
+    @staticmethod
+    def _iter_month_starts(start: dt.date, end: dt.date) -> Iterator[dt.date]:
+        cur = start.replace(day=1)
+        while cur <= end:
+            yield cur
+            y, m = cur.year, cur.month
+            cur = dt.date(y + (m == 12), 1 if m == 12 else m + 1, 1)
+
+    def _month_prefix(self, month_start: dt.date) -> str:
+        return f"{self.data_path}{month_start.year}/{month_start.month:02d}/"
+
+    def _day_prefix(self, d: dt.date) -> str:
+        return f"{self.data_path}{d.year}/{d.month:02d}/{d.day:02d}/"
+
+    def _log_extra(self, **overrides) -> dict:
+        base = {
+            "sibi_dst_component": __name__,
+            "date_of_update": self.reference_date.strftime("%Y-%m-%d"),
+            "dataclass": self.description,
+            "action_module_name": "update_plan",
+        }
+        base.update(overrides)
+        return base
+
+    def _is_data_file(self, path: str) -> bool:
+        base = path.rsplit("/", 1)[-1]
+        if not base or base.startswith(".") or base in self.CONTROL_BASENAMES:
+            return False
+        lower = base.lower()
+        return any(lower.endswith(suf) for suf in self.data_file_suffixes)
+
+    def _is_skipped(self, d: dt.date) -> bool:
+        """True if the date or its canonical path is in the skip config."""
+        just_path = f"{self.data_path}{d.year}/{d.month:02d}/{d.day:02d}/"
+        return (d in self.skipped_dates) or (just_path in self.skipped_paths)
+
+    def _list_prefix(self, prefix: str) -> Dict[dt.date, Dict[str, object]]:
+        """
+        Return {date: {'files': [paths], 'has_success': bool, 'newest_ts': datetime|None}} under prefix.
+        Uses fsspec.find(detail=True) for one-shot listing with metadata (mtime); see
+        https://filesystem-spec.readthedocs.io/en/latest/api.html
+        """
+        try:
+            items = self.fs.find(prefix, withdirs=False, detail=True)  # returns {path: info} when detail=True
+        except Exception as e:
+            self.logger.warning(f"Listing failed for {prefix}: {e}", extra=self._log_extra())
+            return {}
+
+        out: Dict[dt.date, Dict[str, object]] = {}
+        for path, info in items.items():
+            parts = path.strip("/").split("/")
+            if len(parts) < 4:
+                continue
+            try:
+                y, m, dd = int(parts[-4]), int(parts[-3]), int(parts[-2])
+                d = dt.date(y, m, dd)
+            except Exception:
+                continue
+
+            rec = out.setdefault(d, {"files": [], "has_success": False, "newest_ts": None})
+            base = path.rsplit("/", 1)[-1]
+            if base == "_SUCCESS":
+                rec["has_success"] = True
+
+            if self._is_data_file(path):
+                rec["files"].append(path)
+                mtime = info.get("mtime") or info.get("LastModified") or info.get("last_modified")
+                ts = None
+                if isinstance(mtime, (int, float)):
+                    ts = dt.datetime.utcfromtimestamp(mtime)
+                elif isinstance(mtime, str):
+                    try:
+                        ts = pd.to_datetime(mtime, utc=True).to_pydatetime()
+                    except Exception:
+                        ts = None
+                elif isinstance(mtime, dt.datetime):
+                    ts = mtime if mtime.tzinfo else mtime.replace(tzinfo=dt.timezone.utc)
+                if ts:
+                    cur = rec["newest_ts"]
+                    rec["newest_ts"] = ts if (cur is None or ts > cur) else cur
+        return out
+
+    def _summarize_partition(
+        self, d: dt.date, cache: Dict[dt.date, Dict[str, object]]
+    ) -> Tuple[bool, Optional[float], bool]:
+        """
+        (exists, age_minutes, incomplete)
+          - exists: True iff at least one *data* file is present for day `d`
+          - age_minutes: minutes since the NEWEST data file (UTC 'now')
+          - incomplete: True if files exist but required _SUCCESS is missing
+        """
+        rec = cache.get(d, {})
+        files = rec.get("files", [])
+        has_success = bool(rec.get("has_success", False))
+        exists = len(files) > 0
+        if not exists:
+            return False, None, False
+        newest_ts = rec.get("newest_ts")
+        if newest_ts:
+            now_utc = self._utcnow().replace(tzinfo=None)
+            ts_naive = newest_ts.replace(tzinfo=None) if newest_ts.tzinfo else newest_ts
+            age_min = max(0.0, (now_utc - ts_naive).total_seconds() / 60.0)
+        else:
+            age_min = None
+        incomplete = self.check_completeness and self.require_success_marker and not has_success
+        return True, age_min, incomplete
+
     def _generate_plan(self, start: dt.date, end: dt.date, freq: str = "D") -> None:
         """
         Populate self.plan with all dates and self.df_req with the subset to update.
+          - Pre-lists months or days (configurable) with timeouts that actually apply
+          - Computes staleness from newest *data* file
+          - Flags partitions without _SUCCESS as 'incomplete' (unless disabled)
+          - Marks future dates as 'future' (not actionable)
         """
-        dates = pd.date_range(start=start, end=end, freq=freq).date.tolist()
+        dates: List[dt.date] = pd.date_range(start=start, end=end, freq=freq).date.tolist()
         history_start = self.reference_date - dt.timedelta(days=self.history_days_threshold)
         rows: List[Dict] = []

-        # bound threads
-        max_workers = max(1, int(self.max_threads))
+        def is_future(d: dt.date) -> bool:
+            return d > self.reference_date

-        with ThreadPoolExecutor(max_workers=max_workers) as executor:
-            futures = {executor.submit(self._get_file_status, d): d for d in dates}
-            iterator = as_completed(futures)
-            if self.show_progress:
-                try:
-                    from tqdm import tqdm
-                    iterator = tqdm(
-                        iterator, total=len(futures),
-                        desc=f"Scanning dates for {self.description}",
-                        unit="date", leave=False
-                    )
-                except Exception:
-                    pass  # no tqdm → proceed without progress bar
-
-            for future in iterator:
-                d = futures[future]
+        # Choose listing units
+        if self.list_granularity == "day":
+            units: List[Tuple[str, dt.date]] = [("day", d) for d in dates]
+        else:
+            months = list(self._iter_month_starts(self._month_floor(start), self._month_floor(end)))
+            units = [("month", m) for m in months]
+
+        self.logger.info(
+            f"Pre-listing {len(units)} {'days' if self.list_granularity=='day' else 'month prefixes'} for {self.description}",
+            extra=self._log_extra(),
+        )
+
+        # Parallel listing with real timeout (uses futures.wait); see
+        # https://docs.python.org/3/library/concurrent.futures.html
+        caches: Dict[dt.date, Dict[dt.date, Dict[str, object]]] = {}
+        max_workers = max(1, int(self.max_threads))
+        with ThreadPoolExecutor(max_workers=max_workers) as ex:
+            futs = {}
+            for kind, val in units:
+                prefix = self._day_prefix(val) if kind == "day" else self._month_prefix(val)
+                futs[ex.submit(self._list_prefix, prefix)] = (kind, val)
+            done, not_done = wait(futs, timeout=self.total_timeout or None)
+            for f in done:
+                kind, val = futs[f]
                 try:
-                    exists, age = future.result(timeout=self.timeout)
-                    rows.append(self._make_row(d, history_start, exists, age))
-                except Exception as exc:
-                    self.logger.error(f"Error processing date {d}: {exc}", extra=self.logger_extra)
-                    rows.append(self._make_row(d, history_start, False, None))
-
-        df = pd.DataFrame(rows)
-        # consistent types
+                    cache = f.result(timeout=self.list_timeout or None)
+                except Exception as e:
+                    self.logger.warning(f"Listing failed for {kind}:{val} — {e}", extra=self._log_extra())
+                    cache = {}
+                if kind == "month":
+                    caches[val] = cache
+                else:
+                    # day → store into its month bucket for summarization reuse
+                    mk = val.replace(day=1)
+                    caches.setdefault(mk, {}).update(cache)
+            for f in not_done:
+                kind, val = futs[f]
+                self.logger.error(f"Listing timed out for {kind}:{val}", extra=self._log_extra())
+                if kind == "month":
+                    caches[val] = {}
+                else:
+                    caches.setdefault(val.replace(day=1), {})
+
+        # Summarize each date
+        for d in dates:
+            if is_future(d):
+                rows.append({
+                    "date": d, "file_exists": False, "file_age_minutes": None,
+                    "update_category": "future", "update_priority": self.priority_map.get("future", 99),
+                    "update_required": False, "description": self.description,
+                })
+                continue
+
+            if self._is_skipped(d):
+                self.logger.debug(f"Skipping {d}: in skipped set.", extra=self._log_extra())
+                rows.append(self._make_row(d, history_start, False, None))
+                continue
+
+            month_key = d.replace(day=1)
+            cache = caches.get(month_key, {})
+            exists, age_min, incomplete = self._summarize_partition(d, cache)
+
+            # Incomplete partitions get their own category (unless overwrite)
+            if incomplete and not self.overwrite:
+                rows.append({
+                    "date": d, "file_exists": True, "file_age_minutes": age_min,
+                    "update_category": "incomplete", "update_priority": self.priority_map.get("incomplete", 1),
+                    "update_required": True, "description": self.description,
+                })
+                continue
+
+            # Fall back to your existing policy (overwrite / history / staleness / missing)
+            rows.append(self._make_row(d, history_start, exists, age_min))
+
+        df = pd.DataFrame.from_records(rows)
         if not df.empty:
             df["date"] = pd.to_datetime(df["date"]).dt.date
             df["update_priority"] = df["update_priority"].astype(int)
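
The listing phase above is why the import changed from as_completed to wait: one deadline now bounds the whole batch, and finished and unfinished futures are handled separately. A self-contained sketch of the pattern:

import time
from concurrent.futures import ThreadPoolExecutor, wait

def fake_listing(seconds: float) -> str:
    time.sleep(seconds)
    return f"listed in {seconds}s"

with ThreadPoolExecutor(max_workers=4) as ex:
    futs = {ex.submit(fake_listing, s): s for s in (0.1, 0.2, 5.0)}
    done, not_done = wait(futs, timeout=1.0)  # single overall deadline
    for f in done:
        print("ok:", f.result())
    for f in not_done:
        print("timed out:", futs[f])  # still running; its result is abandoned
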
@@ -212,31 +444,14 @@ class UpdatePlanner(ManagedResource):
             df = df.sort_values(
                 by=["update_priority", "date"],
                 ascending=[True, not self.reverse_order],
-                kind="mergesort",  # stable
+                kind="mergesort",
             ).reset_index(drop=True)

         self.plan = df
         self.df_req = df[df["update_required"]].copy()
         self._printed_this_run = False

-    def _get_file_status(self, date: dt.date) -> Tuple[bool, Optional[float]]:
-        """
-        Check file existence and age for the given date.
-        """
-        just_path = f"{self.data_path}{date.year}/{date.month:02d}/{date.day:02d}/"
-        if just_path in self.skipped:
-            self.logger.debug(f"Skipping {date}: path in skipped list.")
-            return False, None
-
-        path = f"{just_path}{self.filename}"
-        try:
-            exists = self.fs.exists(path)
-            age = self.age_checker.get_file_or_dir_age_minutes(path, self.fs) if exists else None
-            return bool(exists), age
-        except Exception as e:
-            self.logger.warning(f"exists/age check failed for {path}: {e}")
-            return False, None
-
+    # --------------------- original policy (kept) ---------------------
     def _make_row(
         self,
         date: dt.date,
@@ -246,15 +461,14 @@ class UpdatePlanner(ManagedResource):
     ) -> Dict:
         """
         Build a single plan row based on flags and thresholds.
+        (Categories 'future'/'incomplete' are injected earlier.)
         """
         within_history = history_start <= date <= self.reference_date
         update_required = False

-        # 1) Overwrite forces update
         if self.overwrite:
             category = "overwrite_forced"
             update_required = True
-        # 2) Inside history window
         elif within_history:
             if not file_exists:
                 category = "missing_in_history"
@@ -264,11 +478,9 @@ class UpdatePlanner(ManagedResource):
                 update_required = True
             else:
                 category = "file_is_recent"
-        # 3) Outside history, missing file (and not ignoring)
         elif not file_exists and not self.ignore_missing:
             category = "create_missing"
             update_required = True
-        # 4) Everything else
         else:
             category = "missing_ignored" if not file_exists else "file_is_recent"

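
For reference, the decision ladder in _make_row compresses to the following standalone restatement (simplified signature, same rules; not the package's function):

from typing import Optional

def categorize(within_history: bool, exists: bool, age_min: Optional[float],
               overwrite: bool = False, ignore_missing: bool = False,
               max_age: int = 1440) -> str:
    # overwrite wins; then the history-window rules; then the outside-window rules
    if overwrite:
        return "overwrite_forced"
    if within_history:
        if not exists:
            return "missing_in_history"
        if age_min is not None and age_min > max_age:
            return "stale_in_history"
        return "file_is_recent"
    if not exists and not ignore_missing:
        return "create_missing"
    return "missing_ignored" if not exists else "file_is_recent"

assert categorize(True, False, None) == "missing_in_history"
assert categorize(True, True, 2000.0) == "stale_in_history"
assert categorize(True, True, 30.0) == "file_is_recent"
assert categorize(False, False, None) == "create_missing"
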
@@ -282,19 +494,308 @@ class UpdatePlanner(ManagedResource):
             "description": self.description,
         }

-    def exclude_dates(self, dates: Set[dt.date]) -> None:
-        """
-        Exclude specific dates from the update plan.
-        """
-        if not isinstance(dates, set):
-            raise ValueError("dates must be a set[date].")
-        if not self.has_plan():
-            self.logger.info("No update plan to modify. Call generate_plan() first.")
-            return
-
-        before = len(self.plan)
-        self.plan = self.plan[~self.plan["date"].isin(dates)]
-        self.df_req = self.plan[self.plan["update_required"]].copy()
-        self.logger.info(
-            f"Excluded {len(dates)} dates from the update plan (from {before} to {len(self.plan)} rows)."
-        )
+    # import datetime as dt
+    # from concurrent.futures import ThreadPoolExecutor, as_completed
+    # from typing import List, Optional, Dict, Union, Tuple, Set, Iterator, ClassVar
+    #
+    # import pandas as pd
+    #
+    # from sibi_dst.utils import ManagedResource
+    # from . import FileAgeChecker
+    #
+    #
+    # class UpdatePlanner(ManagedResource):
+    #     """
+    #     Scans date-partitioned storage and builds an 'update plan' for dates that need processing.
+    #     Produces a Pandas DataFrame plan; it does *not* load data frames, so Dask-vs-Pandas
+    #     concerns do not apply here.
+    #     """
+    #
+    #     DEFAULT_PRIORITY_MAP: ClassVar[Dict[str, int]] = {
+    #         "file_is_recent": 0,
+    #         "missing_ignored": 0,
+    #         "overwrite_forced": 1,
+    #         "create_missing": 2,
+    #         "missing_in_history": 3,
+    #         "stale_in_history": 4,
+    #     }
+    #
+    #     DEFAULT_MAX_AGE_MINUTES: int = 1440
+    #     DEFAULT_HISTORY_DAYS_THRESHOLD: int = 30
+    #     logger_extra = {"sibi_dst_component": __name__}
+    #
+    #     def __init__(
+    #         self,
+    #         parquet_storage_path: str,
+    #         parquet_filename: str,
+    #         description: str = "Update Planner",
+    #         reference_date: Union[str, dt.date, None] = None,
+    #         history_days_threshold: int = DEFAULT_HISTORY_DAYS_THRESHOLD,
+    #         max_age_minutes: int = DEFAULT_MAX_AGE_MINUTES,
+    #         overwrite: bool = False,
+    #         ignore_missing: bool = False,
+    #         custom_priority_map: Optional[Dict[str, int]] = None,
+    #         reverse_order: bool = False,
+    #         show_progress: bool = False,
+    #         skipped: Optional[List[str]] = None,
+    #         **kwargs,
+    #     ):
+    #         super().__init__(**kwargs)
+    #
+    #         # Public-ish attributes
+    #         self.description = description
+    #         self.data_path = self._ensure_trailing_slash(parquet_storage_path)
+    #         self.filename = parquet_filename
+    #         self.reverse_order = reverse_order
+    #         self.show_progress = show_progress
+    #         self.overwrite = overwrite
+    #         self.ignore_missing = ignore_missing
+    #         self.history_days_threshold = history_days_threshold
+    #         self.max_age_minutes = max_age_minutes
+    #         self.priority_map = custom_priority_map or self.DEFAULT_PRIORITY_MAP
+    #         self.skipped = set(skipped or [])
+    #
+    #         # Execution knobs from kwargs (fed by upstream config)
+    #         self.max_threads: int = int(kwargs.get("max_threads", 3))
+    #         self.timeout: float = float(kwargs.get("timeout", 30.0))
+    #
+    #         # Date window
+    #         self.start_date = kwargs.get("parquet_start_date")
+    #         self.end_date = kwargs.get("parquet_end_date")
+    #
+    #         # Reference "today"
+    #         if reference_date is None:
+    #             self.reference_date = dt.date.today()
+    #         else:
+    #             self.reference_date = pd.to_datetime(reference_date).date()
+    #
+    #         # Helpers & state
+    #         self.age_checker = FileAgeChecker(debug=self.debug, logger=self.logger)
+    #         self.plan: pd.DataFrame = pd.DataFrame()
+    #         self.df_req: pd.DataFrame = pd.DataFrame()
+    #
+    #         # internal run flag to print once per run if caller reuses instance
+    #         self._printed_this_run: bool = False
+    #
+    #     # --------------------- public helpers ---------------------
+    #     def has_plan(self) -> bool:
+    #         """Safe truthiness for plan existence."""
+    #         return isinstance(self.plan, pd.DataFrame) and not self.plan.empty
+    #
+    #     def required_count(self) -> int:
+    #         return 0 if not isinstance(self.df_req, pd.DataFrame) else len(self.df_req)
+    #
+    #     # --------------------- core API ---------------------
+    #     def generate_plan(
+    #         self,
+    #         start: Union[str, dt.date, None] = None,
+    #         end: Union[str, dt.date, None] = None,
+    #         freq: str = "D",
+    #     ) -> pd.DataFrame:
+    #         """
+    #         Build a plan for [start, end]. Returns rows that require update (df_req).
+    #         """
+    #         start = start or self.start_date
+    #         end = end or self.end_date
+    #         sd = pd.to_datetime(start).date()
+    #         ed = pd.to_datetime(end).date()
+    #         if sd > ed:
+    #             raise ValueError(f"Start date ({sd}) must be on or before end date ({ed}).")
+    #
+    #         self.logger.info(f"Generating update plan for {self.description} from {sd} to {ed}", extra=self.logger_extra)
+    #         self._generate_plan(sd, ed, freq=freq)
+    #         self.logger.info(
+    #             f"Plan built for {self.description}: {len(self.plan)} dates evaluated, "
+    #             f"{len(self.df_req)} require update",
+    #             extra=self.logger_extra
+    #         )
+    #         return self.df_req
+    #
+    #     def show_update_plan(self) -> None:
+    #         logger_extra = self.logger_extra.update({"date_of_update": self.reference_date.strftime('%Y-%m-%d'), "dataclass": self.description,"action_module_name": "update_plan"})
+    #
+    #         """Pretty-print the current plan once per run."""
+    #         if not self.has_plan():
+    #             self.logger.info("No update plan to show.")
+    #             return
+    #         if self._printed_this_run:
+    #             return
+    #
+    #         try:
+    #             from rich.console import Console
+    #             from rich.table import Table
+    #         except Exception:
+    #             # Fallback: plain text
+    #             self.logger.info(f"Update Plan (plain list):\n{self.plan.to_string(index=False)}", extra=logger_extra)
+    #             self._printed_this_run = True
+    #             return
+    #
+    #         table = Table(
+    #             title=f"Update Plan for {self.data_path}",
+    #             show_header=True,
+    #             header_style="bold magenta",
+    #         )
+    #         for column in self.plan.columns:
+    #             table.add_column(column, justify="left")
+    #
+    #         for _, row in self.plan.iterrows():
+    #             table.add_row(*(str(row[col]) for col in self.plan.columns))
+    #
+    #         console = Console()
+    #         with console.capture() as capture:
+    #             console.print(table)
+    #         self.logger.info(f"Full Update Plan:\n{capture.get().strip()}", extra=logger_extra)
+    #         self._printed_this_run = True
+    #
+    #     def get_tasks_by_priority(self) -> Iterator[Tuple[int, List[dt.date]]]:
+    #         """
+    #         Yield (priority, [dates...]) batches, smallest priority first.
+    #         """
+    #         if not self.has_plan():
+    #             return
+    #         req = self.plan[self.plan["update_required"]]
+    #         if req.empty:
+    #             return
+    #         for priority in sorted(req["update_priority"].unique()):
+    #             dates_df = req[req["update_priority"] == priority]
+    #             # sort within group
+    #             dates_df = dates_df.sort_values(by="date", ascending=not self.reverse_order)
+    #             dates = dates_df["date"].tolist()
+    #             if dates:
+    #                 yield int(priority), dates
+    #
+    #     # --------------------- internals ---------------------
+    #     @staticmethod
+    #     def _ensure_trailing_slash(path: str) -> str:
+    #         return path.rstrip("/") + "/"
+    #
+    #     def _generate_plan(self, start: dt.date, end: dt.date, freq: str = "D") -> None:
+    #         """
+    #         Populate self.plan with all dates and self.df_req with the subset to update.
+    #         """
+    #         dates = pd.date_range(start=start, end=end, freq=freq).date.tolist()
+    #         history_start = self.reference_date - dt.timedelta(days=self.history_days_threshold)
+    #         rows: List[Dict] = []
+    #
+    #         # bound threads
+    #         max_workers = max(1, int(self.max_threads))
+    #
+    #         with ThreadPoolExecutor(max_workers=max_workers) as executor:
+    #             futures = {executor.submit(self._get_file_status, d): d for d in dates}
+    #             iterator = as_completed(futures)
+    #             if self.show_progress:
+    #                 try:
+    #                     from tqdm import tqdm
+    #                     iterator = tqdm(
+    #                         iterator, total=len(futures),
+    #                         desc=f"Scanning dates for {self.description}",
+    #                         unit="date", leave=False
+    #                     )
+    #                 except Exception:
+    #                     pass  # no tqdm → proceed without progress bar
+    #
+    #             for future in iterator:
+    #                 d = futures[future]
+    #                 try:
+    #                     exists, age = future.result(timeout=self.timeout)
+    #                     rows.append(self._make_row(d, history_start, exists, age))
+    #                 except Exception as exc:
+    #                     self.logger.error(f"Error processing date {d}: {exc}", extra=self.logger_extra)
+    #                     rows.append(self._make_row(d, history_start, False, None))
+    #
+    #         df = pd.DataFrame(rows)
+    #         # consistent types
+    #         if not df.empty:
+    #             df["date"] = pd.to_datetime(df["date"]).dt.date
+    #             df["update_priority"] = df["update_priority"].astype(int)
+    #
+    #             df = df.sort_values(
+    #                 by=["update_priority", "date"],
+    #                 ascending=[True, not self.reverse_order],
+    #                 kind="mergesort",  # stable
+    #             ).reset_index(drop=True)
+    #
+    #         self.plan = df
+    #         self.df_req = df[df["update_required"]].copy()
+    #         self._printed_this_run = False
+    #
+    #     def _get_file_status(self, date: dt.date) -> Tuple[bool, Optional[float]]:
+    #         """
+    #         Check file existence and age for the given date.
+    #         """
+    #         just_path = f"{self.data_path}{date.year}/{date.month:02d}/{date.day:02d}/"
+    #         if just_path in self.skipped:
+    #             self.logger.debug(f"Skipping {date}: path in skipped list.", extra=self.logger_extra)
+    #             return False, None
+    #
+    #         path = f"{just_path}{self.filename}"
+    #         try:
+    #             exists = self.fs.exists(path)
+    #             age = self.age_checker.get_file_or_dir_age_minutes(path, self.fs) if exists else None
+    #             return bool(exists), age
+    #         except Exception as e:
+    #             self.logger.warning(f"exists/age check failed for {path}: {e}", extra=self.logger_extra)
+    #             return False, None
+    #
+    #     def _make_row(
+    #         self,
+    #         date: dt.date,
+    #         history_start: dt.date,
+    #         file_exists: bool,
+    #         file_age: Optional[float],
+    #     ) -> Dict:
+    #         """
+    #         Build a single plan row based on flags and thresholds.
+    #         """
+    #         within_history = history_start <= date <= self.reference_date
+    #         update_required = False
+    #
+    #         # 1) Overwrite forces update
+    #         if self.overwrite:
+    #             category = "overwrite_forced"
+    #             update_required = True
+    #         # 2) Inside history window
+    #         elif within_history:
+    #             if not file_exists:
+    #                 category = "missing_in_history"
+    #                 update_required = True
+    #             elif file_age is not None and file_age > self.max_age_minutes:
+    #                 category = "stale_in_history"
+    #                 update_required = True
+    #             else:
+    #                 category = "file_is_recent"
+    #         # 3) Outside history, missing file (and not ignoring)
+    #         elif not file_exists and not self.ignore_missing:
+    #             category = "create_missing"
+    #             update_required = True
+    #         # 4) Everything else
+    #         else:
+    #             category = "missing_ignored" if not file_exists else "file_is_recent"
+    #
+    #         return {
+    #             "date": date,
+    #             "file_exists": bool(file_exists),
+    #             "file_age_minutes": file_age,
+    #             "update_category": category,
+    #             "update_priority": self.priority_map.get(category, 99),
+    #             "update_required": bool(update_required),
+    #             "description": self.description,
+    #         }
+    #
+    #     def exclude_dates(self, dates: Set[dt.date]) -> None:
+    #         """
+    #         Exclude specific dates from the update plan.
+    #         """
+    #         if not isinstance(dates, set):
+    #             raise ValueError("dates must be a set[date].")
+    #         if not self.has_plan():
+    #             self.logger.info("No update plan to modify. Call generate_plan() first.", extra=self.logger_extra)
+    #             return
+    #
+    #         before = len(self.plan)
+    #         self.plan = self.plan[~self.plan["date"].isin(dates)]
+    #         self.df_req = self.plan[self.plan["update_required"]].copy()
+    #         self.logger.info(
+    #             f"Excluded {len(dates)} dates from the update plan (from {before} to {len(self.plan)} rows).",
+    #             extra=self.logger_extra
+    #         )