sibi-dst 0.3.58__py3-none-any.whl → 0.3.59__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,17 +1,16 @@
  import datetime
  import logging
- from concurrent.futures import ThreadPoolExecutor, as_completed
- from typing import Type, Any, Dict, Optional, Union, List, Tuple
  import threading
+ import time
+ from concurrent.futures import ThreadPoolExecutor, as_completed
+ from typing import Type, Any, Dict, Optional, Union, List
+
  import fsspec
  import pandas as pd
- from IPython.display import display
  from tqdm import tqdm

  from .log_utils import Logger
- from .date_utils import FileAgeChecker
  from .parquet_saver import ParquetSaver
- from .update_planner import UpdatePlanner


  class DataWrapper:
@@ -25,77 +24,80 @@ class DataWrapper:
      DEFAULT_MAX_AGE_MINUTES = 1440
      DEFAULT_HISTORY_DAYS_THRESHOLD = 30

-     def __init__(self,
-                  dataclass: Type,
-                  date_field: str,
-                  data_path: str,
-                  parquet_filename: str,
-                  start_date: Any,
-                  end_date: Any,
-                  fs: Optional[fsspec.AbstractFileSystem] = None,
-                  filesystem_type: str = "file",
-                  filesystem_options: Optional[Dict] = None,
-                  debug: bool = False,
-                  verbose: bool = False,
-                  class_params: Optional[Dict] = None,
-                  load_params: Optional[Dict] = None,
-                  reverse_order: bool = False,
-                  overwrite: bool = False,
-                  ignore_missing: bool = False,
-                  logger: Logger = None,
-                  max_age_minutes: int = DEFAULT_MAX_AGE_MINUTES,
-                  history_days_threshold: int = DEFAULT_HISTORY_DAYS_THRESHOLD,
-                  show_progress: bool = False,
-                  timeout: float = 60,
-                  reference_date: datetime.date = None,
-                  custom_priority_map: Dict[str, int] = None,
-                  max_threads: int = 3):
+     def __init__(
+         self,
+         dataclass: Type,
+         date_field: str,
+         data_path: str,
+         parquet_filename: str,
+         #start_date: Any,
+         #end_date: Any,
+         fs: Optional[fsspec.AbstractFileSystem] = None,
+         #filesystem_type: str = "file",
+         #filesystem_options: Optional[Dict] = None,
+         debug: bool = False,
+         verbose: bool = False,
+         class_params: Optional[Dict] = None,
+         load_params: Optional[Dict] = None,
+         #reverse_order: bool = False,
+         #overwrite: bool = False,
+         #ignore_missing: bool = False,
+         logger: Logger = None,
+         #max_age_minutes: int = DEFAULT_MAX_AGE_MINUTES,
+         #history_days_threshold: int = DEFAULT_HISTORY_DAYS_THRESHOLD,
+         show_progress: bool = False,
+         timeout: float = 60,
+         #reference_date: datetime.date = None,
+         #custom_priority_map: Dict[str, int] = None,
+         max_threads: int = 3,
+         **kwargs: Any,
+     ):
          self.dataclass = dataclass
          self.date_field = date_field
          self.data_path = self._ensure_forward_slash(data_path)
          self.parquet_filename = parquet_filename
-         self.filesystem_type = filesystem_type
-         self.filesystem_options = filesystem_options or {}
-         self.fs = fs or self._init_filesystem()
+         #self.filesystem_type = filesystem_type
+         #self.filesystem_options = filesystem_options or {}
+         self.fs = fs or None
          self.debug = debug
          self.verbose = verbose
-         self.class_params = class_params or {}
-         self.load_params = load_params or {}
-         self.reverse_order = reverse_order
-         self.overwrite = overwrite
-         self.ignore_missing = ignore_missing
+         # self.reverse_order = reverse_order
+         # self.overwrite = overwrite
+         # self.ignore_missing = ignore_missing
          self.logger = logger or Logger.default_logger(logger_name=self.dataclass.__name__)
          self.logger.set_level(logging.DEBUG if debug else logging.INFO)
-         self.max_age_minutes = max_age_minutes
-         self.history_days_threshold = history_days_threshold
+         # self.max_age_minutes = max_age_minutes
+         # self.history_days_threshold = history_days_threshold
          self.show_progress = show_progress
          self.timeout = timeout
-         self.reference_date = reference_date or datetime.date.today()
-         self.priority_map = custom_priority_map or self.DEFAULT_PRIORITY_MAP
+         #self.reference_date = reference_date or datetime.date.today()
+         #self.priority_map = custom_priority_map or self.DEFAULT_PRIORITY_MAP
          self.max_threads = max_threads
-
-         self.start_date = self._convert_to_date(start_date)
-         self.end_date = self._convert_to_date(end_date)
-         self._lock = threading.Lock()
-         self.processed_dates = []
-         self.age_checker = FileAgeChecker(logger=self.logger)
-
-         self.update_planner_params = {
-             "data_path": self.data_path,
-             "filename": self.parquet_filename,
-             "fs": self.fs,
-             "debug": self.debug,
-             "logger": self.logger,
-             "reverse_order": self.reverse_order,
-             "overwrite": self.overwrite,
-             "ignore_missing": self.ignore_missing,
-             "history_days_threshold": history_days_threshold,
-             "max_age_minutes": max_age_minutes,
-             "show_progress": self.show_progress,
-             "description": f"{self.dataclass.__name__}"
+         self.class_params = class_params or {
+             'debug': self.debug,
+             'logger': self.logger,
+             'fs': self.fs,
+             'verbose': self.verbose,
          }
-         self.update_plan = UpdatePlanner(**self.update_planner_params).generate_plan(self.start_date, self.end_date)
+         self.load_params = load_params or {}

+         self._lock = threading.Lock()
+         self.processed_dates: List[datetime.date] = []
+         self.benchmarks: Dict[datetime.date, Dict[str, float]] = {}
+         self.mmanifest = kwargs.get("mmanifest", None)
+         self.update_planner = kwargs.get("update_planner", None)
+
+     def __enter__(self):
+         """Context manager entry"""
+         return self
+
+     def __exit__(self, exc_type, exc_val, exc_tb):
+         """Context manager exit"""
+         if self.mmanifest and self.mmanifest._new_records:
+             self.mmanifest.save()
+         if exc_type is not None:
+             self.logger.error(f"Exception occurred: {exc_val}")
+         return False

      def _init_filesystem(self) -> fsspec.AbstractFileSystem:
          with self._lock:
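
The rewritten constructor above no longer builds its own filesystem, date range, or update plan; it expects a ready-made fsspec filesystem plus an update planner and a missing-file manifest passed through **kwargs, and the new __enter__/__exit__ pair turns the wrapper into a context manager that flushes that manifest on exit. A minimal usage sketch of that wiring, where MyDataset, planner, manifest and fs are hypothetical placeholders for objects the caller builds elsewhere (they are not defined in this diff):

    # Sketch only: MyDataset, planner, manifest and fs are stand-ins built by the caller.
    with DataWrapper(
        dataclass=MyDataset,              # used as: with MyDataset(**class_params) as data: data.load_period(...)
        date_field="created_at",
        data_path="s3://bucket/dataset/",
        parquet_filename="data.parquet",
        fs=fs,                            # pre-configured fsspec filesystem
        show_progress=True,
        mmanifest=manifest,               # MissingManifestManager, see the new module at the end of this diff
        update_planner=planner,           # object exposing .plan and .show_update_plan()
    ) as wrapper:
        wrapper.process()
    # On exit, __exit__ saves the manifest if any missing partitions were recorded.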
@@ -114,100 +116,86 @@ class DataWrapper:
      def _ensure_forward_slash(path: str) -> str:
          return path.rstrip('/') + '/'

-     def generate_date_range(self) -> List[datetime.date]:
-         """Generate ordered date range with future date handling"""
-         date_range = pd.date_range(
-             start=self.start_date,
-             end=self.end_date,
-             freq='D'
-         ).date.tolist()
-
-         if self.reverse_order:
-             date_range.reverse()
-
-         return [
-             d for d in date_range
-             if d <= self.reference_date or self.overwrite
-         ]
-
      def process(self, max_retries: int = 3):
-         """Process updates with priority-based execution and retries"""
-         #update_plan = self.generate_update_plan()
-         update_plan = self.update_plan
-         if update_plan.empty:
+         """Process updates with priority-based execution, retries, benchmarking and progress updates"""
+         overall_start = time.perf_counter()
+         plan = self.update_planner.plan
+         # Use len(plan.index) instead of plan.empty for Dask compatibility
+         plan_count = len(plan.index)
+         if plan_count == 0:
              self.logger.info("No updates required")
              return
-         # Filter for required updates first
-         #update_plan = update_plan[update_plan["update_required"] == True]
-
-         if self.show_progress:
-             #display(self._enhanced_display_table(update_plan))
-             display(update_plan)
-
-         for priority in sorted(update_plan["update_priority"].unique()):
-             self._process_priority_group(update_plan, priority, max_retries)
-
-     def _process_priority_group(self,
-                                 update_plan: pd.DataFrame,
-                                 priority: int,
-                                 max_retries: int):
-         """Process a single priority group with parallel execution"""
-         dates = update_plan[update_plan["update_priority"] == priority]["date"].tolist()
+         self.logger.info(f"Update plan for {self.dataclass.__name__} includes {plan_count} items for update")
+
+         if self.verbose:
+             self.update_planner.show_update_plan()
+
+         for priority in sorted(plan["update_priority"].unique()):
+             self._process_priority_group(plan, priority, max_retries)
+
+         total_time = time.perf_counter() - overall_start
+         processed = len(self.processed_dates)
+         if processed:
+             self.logger.info(
+                 f"Processed {processed} dates in {total_time:.1f}s "
+                 f"(avg {total_time / processed:.1f}s per date)"
+             )
+         if self.show_progress or self.verbose:
+             self.show_benchmark_summary()
+
+     def _process_priority_group(
+         self,
+         plan: pd.DataFrame,
+         priority: int,
+         max_retries: int
+     ):
+         """Process a single priority group with parallel execution and timing"""
+         dates = plan[plan["update_priority"] == priority]["date"].tolist()
          if not dates:
              return
-
-         desc = f"Processing {self.dataclass.__name__}, task: {self._priority_label(priority)}"
+         desc = f"Processing {self.dataclass.__name__}, priority: {priority}"
          self.logger.debug(f"Starting {desc.lower()}")
-         max_threads = min(len(dates), self.max_threads)
-         self.logger.debug(f"DataWrapper Max threads set at: {max_threads}")
-         with ThreadPoolExecutor(max_workers=max_threads) as executor:
-             futures = {
-                 executor.submit(self._process_date_with_retry, date, max_retries): date
-                 for date in dates
-             }
-
-             for future in tqdm(as_completed(futures),
-                                total=len(futures),
-                                desc=desc,
-                                disable=not self.show_progress):
+         group_start = time.perf_counter()
+         max_thr = min(len(dates), self.max_threads)
+         self.logger.debug(f"Max threads for priority {priority}: {max_thr}")
+         with ThreadPoolExecutor(max_workers=max_thr) as executor:
+             futures = {executor.submit(self._process_date_with_retry, date, max_retries): date for date in dates}
+             for future in tqdm(as_completed(futures), total=len(futures), desc=desc, disable=not self.show_progress):
                  date = futures[future]
                  try:
                      future.result(timeout=self.timeout)
                  except Exception as e:
-                     self.logger.error(f"Permanent failure processing {date}: {str(e)}")
-
-     def _priority_label(self, priority: int) -> str:
-         """Get human-readable label for priority level"""
-         return next(
-             (k for k, v in self.priority_map.items() if v == priority),
-             f"Unknown Priority {priority}"
-         )
+                     self.logger.error(f"Permanent failure processing {date}: {e}")
+         group_time = time.perf_counter() - group_start
+         self.logger.info(f"Priority {priority} group processed {len(dates)} dates in {group_time:.1f}s")

      def _process_date_with_retry(self, date: datetime.date, max_retries: int):
-         """Process a date with retry logic"""
          for attempt in range(1, max_retries + 1):
              try:
                  self._process_single_date(date)
                  return
              except Exception as e:
                  if attempt < max_retries:
-                     self.logger.warning(f"Retry {attempt}/{max_retries} for {date}: {str(e)}")
+                     self.logger.warning(f"Retry {attempt}/{max_retries} for {date}: {e}")
                  else:
                      raise RuntimeError(f"Failed processing {date} after {max_retries} attempts") from e

      def _process_single_date(self, date: datetime.date):
-         """Core date processing logic"""
+         """Core date processing logic with load/save timing and thread reporting"""
          path = f"{self.data_path}{date.year}/{date.month:02d}/{date.day:02d}/"
+         self.logger.info(f"Processing date {date.isoformat()} for {path}")
+         # self.logger.info(f"Path {path} in {self.skipped}: {path in self.skipped}")
+         #if path in self.skipped:
+         #    self.logger.info(f"Skipping {date} as it exists in the skipped list")
+         #    return
          full_path = f"{path}{self.parquet_filename}"

-         self.logger.info(f"Processing {date} ({full_path})")
-         start_time = datetime.datetime.now()
+         thread_name = threading.current_thread().name
+         self.logger.info(f"[{thread_name}] Executing date: {date} -> saving to: {full_path}")

+         overall_start = time.perf_counter()
          try:
-             self.logger.debug(f"Class Params: {self.class_params}")
-             self.logger.debug(f"Load Params: {self.load_params}")
-
-             df = pd.DataFrame()
+             load_start = time.perf_counter()
              with self.dataclass(**self.class_params) as data:
                  df = data.load_period(
                      dt_field=self.date_field,
@@ -215,11 +203,22 @@ class DataWrapper:
                      end=date,
                      **self.load_params
                  )
-
-             if len(df.index)==0:
-                 self.logger.warning(f"No data found for {date}")
+             load_time = time.perf_counter() - load_start
+
+             if df.head(1, compute=True).empty:
+                 if self.mmanifest:
+                     schema = df._meta.dtypes.astype(str).to_dict()
+                     self.mmanifest.record(
+                         full_path=path
+                     )
+                 self.logger.info(f"No data found for {date}. Logged to missing manifest.")
                  return
+             # Dask-compatible empty check
+             # if len(df.index) == 0:
+             #     self.logger.warning(f"No data found for {date}")
+             #     return

+             save_start = time.perf_counter()
              with self._lock:
                  ParquetSaver(
                      df_result=df,
@@ -227,23 +226,33 @@ class DataWrapper:
                      fs=self.fs,
                      logger=self.logger
                  ).save_to_parquet(self.parquet_filename)
+             save_time = time.perf_counter() - save_start

-             duration = (datetime.datetime.now() - start_time).total_seconds()
-             self._log_success(date, duration, full_path)
-
+             total_time = time.perf_counter() - overall_start
+             self.benchmarks[date] = {
+                 "load_duration": load_time,
+                 "save_duration": save_time,
+                 "total_duration": total_time
+             }
+             self._log_success(date, total_time, full_path)
          except Exception as e:
              self._log_failure(date, e)
              raise

      def _log_success(self, date: datetime.date, duration: float, path: str):
-         """Handle successful processing logging"""
          msg = f"Completed {date} in {duration:.1f}s | Saved to {path}"
          self.logger.info(msg)
          self.processed_dates.append(date)

      def _log_failure(self, date: datetime.date, error: Exception):
-         """Handle error logging"""
-         msg = f"Failed processing {date}: {str(error)}"
+         msg = f"Failed processing {date}: {error}"
          self.logger.error(msg)

-
+     def show_benchmark_summary(self):
+         """Display a summary of load/save timings per date"""
+         if not self.benchmarks:
+             self.logger.info("No benchmarking data to show")
+             return
+         df_bench = pd.DataFrame.from_records([{"date": d, **m} for d, m in self.benchmarks.items()])
+         df_bench = df_bench.set_index("date").sort_index(ascending=not self.reverse_order)
+         self.logger.info("Benchmark Summary:\n" + df_bench.to_string())
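
Several of the new checks are written with Dask-backed frames in mind: the update plan is sized with len(plan.index) rather than .empty, and a freshly loaded partition is declared empty by materialising at most one row with df.head(1, compute=True). A small, self-contained illustration of that probe, assuming dask.dataframe is installed (the frames here are stand-ins for whatever load_period returns):

    import pandas as pd
    import dask.dataframe as dd

    empty_ddf = dd.from_pandas(pd.DataFrame({"id": pd.Series(dtype="int64")}), npartitions=1)
    full_ddf = dd.from_pandas(pd.DataFrame({"id": [1, 2, 3]}), npartitions=1)

    # head(1, compute=True) pulls at most one row, whereas len(df.index) would trigger
    # a count across every partition just to find out whether the frame is empty.
    print(empty_ddf.head(1, compute=True).empty)  # True
    print(full_ddf.head(1, compute=True).empty)   # False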
@@ -149,9 +149,9 @@ class DateUtils:


  class FileAgeChecker:
-     def __init__(self, logger=None):
+     def __init__(self, debug=False, logger=None):
          self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
-
+         self.logger.set_level(Logger.DEBUG if debug else Logger.INFO)
      def is_file_older_than(
          self,
          file_path: str,
@@ -171,15 +171,15 @@ class FileAgeChecker:
          :return: True if older than max_age_minutes, False otherwise.
          """
          fs = fs or fsspec.filesystem("file")
-         self.logger.info(f"Checking age for {file_path}...")
+         self.logger.debug(f"Checking age for {file_path}...")

          try:
              if not fs.exists(file_path):
-                 self.logger.info(f"Path not found: {file_path}.")
+                 self.logger.debug(f"Path not found: {file_path}.")
                  return not ignore_missing

              if fs.isdir(file_path):
-                 self.logger.info(f"Found directory: {file_path}")
+                 self.logger.debug(f"Found directory: {file_path}")
                  age = self._get_directory_age_minutes(file_path, fs, verbose)
              elif fs.isfile(file_path):
                  age = self._get_file_age_minutes(file_path, fs, verbose)
@@ -208,7 +208,7 @@ class FileAgeChecker:
          fs = fs or fsspec.filesystem("file")
          try:
              if not fs.exists(file_path):
-                 self.logger.info(f"Path not found: {file_path}")
+                 self.logger.debug(f"Path not found: {file_path}")
                  return float("inf")

              if fs.isdir(file_path):
@@ -237,7 +237,7 @@ class FileAgeChecker:
                  return float("inf")

              if not all_files:
-                 self.logger.info(f"Empty directory: {dir_path}")
+                 self.logger.debug(f"Empty directory: {dir_path}")
                  return float("inf")

              modification_times = []
@@ -255,7 +255,7 @@ class FileAgeChecker:

          oldest = min(modification_times)
          age = (datetime.datetime.now(datetime.timezone.utc) - oldest).total_seconds() / 60
-         self.logger.info(f"Oldest in {dir_path}: {age:.2f} minutes")
+         self.logger.debug(f"Oldest in {dir_path}: {age:.2f} minutes")

          return age

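
The FileAgeChecker changes are purely about verbosity: the constructor gains a debug flag that sets the logger level, and the per-path messages above drop from info to debug, so routine age checks stay quiet unless debugging is enabled. A short sketch, assuming the class lives in sibi_dst.utils.date_utils (the import path is inferred from the removed `from .date_utils import FileAgeChecker` line, not stated in this diff):

    from sibi_dst.utils.date_utils import FileAgeChecker

    quiet_checker = FileAgeChecker()             # INFO level: per-path age messages stay hidden
    chatty_checker = FileAgeChecker(debug=True)  # DEBUG level: restores the previous verbose output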
@@ -77,7 +77,7 @@ class Logger:
          formatter.converter = time.localtime  # << Set local time explicitly

          # Create a file handler
-         file_handler = logging.FileHandler(log_file_path)
+         file_handler = logging.FileHandler(log_file_path, delay=True)
          file_handler.setFormatter(formatter)
          self.logger.addHandler(file_handler)

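
The only change to Logger is that its file handler is now created with delay=True, standard-library behaviour that defers opening (and creating) the log file until the first record is emitted, so loggers that never write leave no empty files behind. A stdlib-only illustration:

    import logging
    import os

    handler = logging.FileHandler("delay_demo.log", delay=True)
    logger = logging.getLogger("delay_demo")
    logger.addHandler(handler)

    print(os.path.exists("delay_demo.log"))  # False: the file has not been opened yet
    logger.warning("first record")           # the first emit opens and creates the file
    print(os.path.exists("delay_demo.log"))  # True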
@@ -0,0 +1,154 @@
+ import pandas as pd
+ import fsspec
+ import threading
+ import uuid
+ from typing import List, Optional, Set, Dict, Any
+
+ from sibi_dst.utils import Logger
+
+
+ class MissingManifestManager:
+     """
+     Thread-safe manager for a “missing-partitions” manifest (Parquet file).
+     """
+
+     def __init__(
+         self,
+         fs: fsspec.AbstractFileSystem,
+         manifest_path: str,
+         clear_existing: bool = False,
+         **kwargs: Any,
+     ):
+         self.fs = fs
+         self.manifest_path = manifest_path.rstrip("/")
+         self.clear_existing = clear_existing
+
+         self.debug: bool = kwargs.get("debug", False)
+         self.logger = kwargs.get(
+             "logger",
+             Logger.default_logger(logger_name="missing_manifest_manager")
+         )
+         self.logger.set_level(Logger.DEBUG if self.debug else Logger.INFO)
+
+         # In-memory list for new paths
+         self._new_records: List[Dict[str, str]] = []
+         # Cached set of existing paths
+         self._loaded_paths: Optional[Set[str]] = None
+
+         # Use a reentrant lock so save() can call load_existing() safely
+         self._lock = threading.RLock()
+
+     def _safe_exists(self, path: str) -> bool:
+         try:
+             return self.fs.exists(path)
+         except PermissionError:
+             if self.debug:
+                 self.logger.debug(f"Permission denied checking existence of '{path}'")
+             return False
+         except Exception as e:
+             self.logger.warning(f"Error checking existence of '{path}': {e}")
+             return False
+
+     def load_existing(self) -> Set[str]:
+         """
+         Load and cache existing manifest paths.
+         """
+         with self._lock:
+             if self._loaded_paths is not None:
+                 return self._loaded_paths
+
+             if not self._safe_exists(self.manifest_path):
+                 self._loaded_paths = set()
+                 return self._loaded_paths
+
+             try:
+                 df = pd.read_parquet(self.manifest_path, filesystem=self.fs)
+                 paths = (
+                     df.get("path", pd.Series(dtype=str))
+                     .dropna().astype(str)
+                     .loc[lambda s: s.str.strip().astype(bool)]
+                 )
+                 self._loaded_paths = set(paths.tolist())
+             except Exception as e:
+                 self.logger.warning(f"Failed to load manifest '{self.manifest_path}': {e}")
+                 self._loaded_paths = set()
+
+             return self._loaded_paths
+
+     def record(self, full_path: str) -> None:
+         """
+         Register a missing file path.
+         """
+         if not full_path or not isinstance(full_path, str):
+             return
+         with self._lock:
+             self._new_records.append({"path": full_path})
+
+     def save(self) -> None:
+         """
+         Merge new records into the manifest and write it out atomically.
+         """
+         with self._lock:
+             # Build DataFrame of new entries
+             new_df = pd.DataFrame(self._new_records)
+             should_overwrite = self.clear_existing or not self._safe_exists(self.manifest_path)
+             if new_df.empty and not should_overwrite:
+                 return
+
+             # Clean new_df
+             new_df = (
+                 new_df.get("path", pd.Series(dtype=str))
+                 .dropna().astype(str)
+                 .loc[lambda s: s.str.strip().astype(bool)]
+                 .to_frame()
+             )
+
+             # Merge or overwrite
+             if should_overwrite:
+                 out_df = new_df
+             else:
+                 try:
+                     old_df = pd.read_parquet(self.manifest_path, filesystem=self.fs)
+                     old_paths = (
+                         old_df.get("path", pd.Series(dtype=str))
+                         .dropna().astype(str)
+                         .loc[lambda s: s.str.strip().astype(bool)]
+                         .to_frame()
+                     )
+                     out_df = pd.concat([old_paths, new_df], ignore_index=True)
+                 except Exception as e:
+                     self.logger.warning(f"Could not merge manifest, overwriting: {e}")
+                     out_df = new_df
+
+             out_df = out_df.drop_duplicates(subset=["path"]).reset_index(drop=True)
+
+             # Ensure parent dir
+             parent = self.manifest_path.rsplit("/", 1)[0]
+             try:
+                 self.fs.makedirs(parent, exist_ok=True)
+             except Exception as e:
+                 self.logger.warning(f"Could not create manifest directory '{parent}': {e}")
+
+             # Write atomically: temp file + rename
+             temp_path = f"{self.manifest_path}.tmp-{uuid.uuid4().hex}"
+             try:
+                 out_df.to_parquet(
+                     temp_path,
+                     filesystem=self.fs,
+                     index=False
+                 )
+                 # rename into place (atomic in most filesystems)
+                 self.fs.mv(temp_path, self.manifest_path, recursive=False)
+             except Exception as e:
+                 self.logger.error(f"Failed to write or rename manifest: {e}")
+                 # Clean up temp if it exists
+                 try:
+                     if self.fs.exists(temp_path):
+                         self.fs.rm(temp_path, recursive=True)
+                 except Exception:
+                     pass
+                 raise
+
+             # Reset memory & cache
+             self._new_records.clear()
+             self._loaded_paths = set(out_df["path"].tolist())
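
A short usage sketch for the new manager, mirroring how DataWrapper now records empty partitions and saves the manifest on exit; the local filesystem, the /tmp paths and the import location are illustrative assumptions (the diff only shows the module body):

    import fsspec
    from sibi_dst.utils import MissingManifestManager  # assumed export, like Logger

    fs = fsspec.filesystem("file")
    manifest = MissingManifestManager(
        fs=fs,
        manifest_path="/tmp/manifests/missing.parquet",
        clear_existing=False,
    )

    manifest.record("/tmp/data/2024/01/02/")   # note a partition that produced no rows
    manifest.record("/tmp/data/2024/01/03/")
    manifest.save()                            # de-duplicate, write to a temp file, then rename into place

    print(manifest.load_existing())            # cached set of recorded paths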