spiderforce4ai 2.5.9__py3-none-any.whl → 2.6.3__py3-none-any.whl

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
@@ -156,19 +156,34 @@ class PostExtractionAgent:
156
156
  self.buffer = PostExtractionBuffer(config.buffer_file)
157
157
  self.results: Dict[str, Any] = {}
158
158
  self.rate_limiter = RateLimiter()
159
+ # Convert string path to Path object if needed
160
+ if isinstance(self.config.output_file, str):
161
+ self.config.output_file = Path(self.config.output_file)
159
162
  self._setup_output()
160
163
 
161
164
  def _setup_output(self) -> None:
162
165
  """Setup output file if combining results."""
163
166
  if self.config.combine_output and self.config.output_file:
167
+ # Ensure parent directory exists
164
168
  self.config.output_file.parent.mkdir(parents=True, exist_ok=True)
169
+
170
+ # Load existing results if file exists
165
171
  if self.config.output_file.exists():
166
- # Backup existing file
167
- backup_path = self.config.output_file.with_suffix(f".bak_{int(time.time())}")
168
- self.config.output_file.rename(backup_path)
169
- self.config.output_file.touch()
170
- # Initialize empty results dictionary
171
- self.results = {}
172
+ try:
173
+ with open(self.config.output_file, 'r') as f:
174
+ self.results = json.load(f)
175
+ except json.JSONDecodeError:
176
+ # If file is corrupted, backup and start fresh
177
+ backup_path = self.config.output_file.with_suffix(f".bak_{int(time.time())}")
178
+ self.config.output_file.rename(backup_path)
179
+ self.results = {}
180
+
181
+ # Create file if it doesn't exist
182
+ if not self.config.output_file.exists():
183
+ self.config.output_file.touch()
184
+ self.results = {}
185
+
186
+ logger.info(f"Initialized output file at {self.config.output_file}")
172
187
 
173
188
  def _process_single_content(self, url: str, content: str) -> Optional[Dict]:
174
189
  """Process a single piece of content through the LLM."""
@@ -260,22 +275,44 @@ class PostExtractionAgent:
260
275
  """Save individual or combined results synchronously."""
261
276
  try:
262
277
  if self.config.combine_output and self.config.output_file:
263
- # Update the results dictionary
264
- self.results[url] = result
278
+ # Convert Path to string if needed
279
+ output_file = str(self.config.output_file) if isinstance(self.config.output_file, Path) else self.config.output_file
265
280
 
266
281
  # Ensure output directory exists
267
- self.config.output_file.parent.mkdir(parents=True, exist_ok=True)
282
+ Path(output_file).parent.mkdir(parents=True, exist_ok=True)
283
+
284
+ # Load existing results or create new
285
+ try:
286
+ if Path(output_file).exists():
287
+ with open(output_file, 'r', encoding='utf-8') as f:
288
+ self.results = json.load(f)
289
+ else:
290
+ self.results = {}
291
+ except (json.JSONDecodeError, FileNotFoundError):
292
+ self.results = {}
293
+
294
+ # Update results with new data
295
+ self.results[url] = result
268
296
 
269
297
  # Save combined results atomically
270
- temp_file = self.config.output_file.with_suffix('.tmp')
271
- with open(temp_file, 'w') as f:
272
- json.dump(self.results, f, indent=2)
273
- temp_file.replace(self.config.output_file)
298
+ temp_file = f"{output_file}.tmp"
299
+ with open(temp_file, 'w', encoding='utf-8') as f:
300
+ json.dump(self.results, f, indent=2, ensure_ascii=False)
274
301
 
275
- logger.info(f"Updated combined results file with {url}")
302
+ # Atomic replace and cleanup backup files
303
+ Path(temp_file).replace(output_file)
304
+ logger.info(f"Updated combined results file with {url} in {output_file}")
305
+
306
+ # Cleanup all backup files
307
+ for backup_file in Path(output_file).parent.glob(f"{Path(output_file).stem}.bak_*"):
308
+ try:
309
+ backup_file.unlink()
310
+ logger.info(f"Cleaned up backup file: {backup_file}")
311
+ except Exception as e:
312
+ logger.warning(f"Failed to remove backup file {backup_file}: {e}")
276
313
 
277
- # Cleanup backup files after successful save
278
- for backup_file in self.config.output_file.parent.glob(f"{self.config.output_file.stem}.bak_*"):
314
+ # Cleanup backup files
315
+ for backup_file in Path(output_file).parent.glob(f"{Path(output_file).stem}.bak_*"):
279
316
  try:
280
317
  backup_file.unlink()
281
318
  logger.info(f"Cleaned up backup file: {backup_file}")
@@ -333,18 +370,20 @@ class PostExtractionAgent:
333
370
 
334
371
  # Save result synchronously
335
372
  try:
336
- # Save both original and transformed result
337
- if self.config.custom_transform_function:
338
- self._save_result_sync(url, transformed_result)
339
- logger.info(f"Saved transformed results for {url}")
373
+ # Always save the result, whether transformed or original
374
+ result_to_save = transformed_result if self.config.custom_transform_function else result
375
+ if self.config.combine_output and self.config.output_file:
376
+ self._save_result_sync(url, result_to_save)
377
+ logger.info(f"Saved results for {url} to combined output")
340
378
  else:
341
- self._save_result_sync(url, result)
342
- logger.info(f"Saved original results for {url}")
379
+ self._save_result_sync(url, result_to_save)
380
+ logger.info(f"Saved individual result for {url}")
343
381
  except Exception as e:
344
382
  error_msg = f"Error saving results for {url}: {str(e)}"
345
383
  logger.error(error_msg)
346
384
  console.print(f"[red]{error_msg}[/red]")
347
385
 
386
+ # Return the appropriate result
348
387
  return transformed_result if self.config.custom_transform_function else result
349
388
 
350
389
  # Wait before retry
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: spiderforce4ai
3
- Version: 2.5.9
3
+ Version: 2.6.3
4
4
  Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service with LLM post-processing
5
5
  Home-page: https://petertam.pro
6
6
  Author: Piotr Tamulewicz
@@ -0,0 +1,7 @@
1
+ spiderforce4ai/__init__.py,sha256=BlrRIrtpDUHjmDedqgXP1KbAAsAH9vwFPncUR5VGGyM,44804
2
+ spiderforce4ai/post_extraction_agent.py,sha256=3HB54YrPCbQTUoZWINr7yHvwXwQywmq0f_RBJwKr2gg,19355
3
+ spiderforce4ai-2.6.3.dist-info/METADATA,sha256=L5GCJHggqks18Z31ru5DbDdXT3mdS8pYEDhfdR9igms,9012
4
+ spiderforce4ai-2.6.3.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
5
+ spiderforce4ai-2.6.3.dist-info/entry_points.txt,sha256=ibARQxOlDiL1ho12zbDZt4Uq5RKSIk_qk159ZlZ46hc,59
6
+ spiderforce4ai-2.6.3.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
7
+ spiderforce4ai-2.6.3.dist-info/RECORD,,
@@ -1,7 +0,0 @@
1
- spiderforce4ai/__init__.py,sha256=BlrRIrtpDUHjmDedqgXP1KbAAsAH9vwFPncUR5VGGyM,44804
2
- spiderforce4ai/post_extraction_agent.py,sha256=K6AGqeDO_MZ4pQMlkmnfK6Y5Sa1BWkUWv9u7_LMxsuM,17314
3
- spiderforce4ai-2.5.9.dist-info/METADATA,sha256=4qXFZ6sEYnqsjULabDNc0ez0ZTuTPa1FuUTXpGuXG0I,9012
4
- spiderforce4ai-2.5.9.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
5
- spiderforce4ai-2.5.9.dist-info/entry_points.txt,sha256=ibARQxOlDiL1ho12zbDZt4Uq5RKSIk_qk159ZlZ46hc,59
6
- spiderforce4ai-2.5.9.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
7
- spiderforce4ai-2.5.9.dist-info/RECORD,,