spiderforce4ai 2.6__py3-none-any.whl → 2.6.3__py3-none-any.whl

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -156,19 +156,34 @@ class PostExtractionAgent:
156
156
  self.buffer = PostExtractionBuffer(config.buffer_file)
157
157
  self.results: Dict[str, Any] = {}
158
158
  self.rate_limiter = RateLimiter()
159
+ # Convert string path to Path object if needed
160
+ if isinstance(self.config.output_file, str):
161
+ self.config.output_file = Path(self.config.output_file)
159
162
  self._setup_output()
160
163
 
161
164
  def _setup_output(self) -> None:
162
165
  """Setup output file if combining results."""
163
166
  if self.config.combine_output and self.config.output_file:
167
+ # Ensure parent directory exists
164
168
  self.config.output_file.parent.mkdir(parents=True, exist_ok=True)
169
+
170
+ # Load existing results if file exists
165
171
  if self.config.output_file.exists():
166
- # Backup existing file
167
- backup_path = self.config.output_file.with_suffix(f".bak_{int(time.time())}")
168
- self.config.output_file.rename(backup_path)
169
- self.config.output_file.touch()
170
- # Initialize empty results dictionary
171
- self.results = {}
172
+ try:
173
+ with open(self.config.output_file, 'r') as f:
174
+ self.results = json.load(f)
175
+ except json.JSONDecodeError:
176
+ # If file is corrupted, backup and start fresh
177
+ backup_path = self.config.output_file.with_suffix(f".bak_{int(time.time())}")
178
+ self.config.output_file.rename(backup_path)
179
+ self.results = {}
180
+
181
+ # Create file if it doesn't exist
182
+ if not self.config.output_file.exists():
183
+ self.config.output_file.touch()
184
+ self.results = {}
185
+
186
+ logger.info(f"Initialized output file at {self.config.output_file}")
172
187
 
173
188
  def _process_single_content(self, url: str, content: str) -> Optional[Dict]:
174
189
  """Process a single piece of content through the LLM."""
@@ -263,28 +278,38 @@ class PostExtractionAgent:
263
278
  # Convert Path to string if needed
264
279
  output_file = str(self.config.output_file) if isinstance(self.config.output_file, Path) else self.config.output_file
265
280
 
266
- # Load existing results if file exists
267
- if Path(output_file).exists():
268
- try:
269
- with open(output_file, 'r') as f:
281
+ # Ensure output directory exists
282
+ Path(output_file).parent.mkdir(parents=True, exist_ok=True)
283
+
284
+ # Load existing results or create new
285
+ try:
286
+ if Path(output_file).exists():
287
+ with open(output_file, 'r', encoding='utf-8') as f:
270
288
  self.results = json.load(f)
271
- except json.JSONDecodeError:
289
+ else:
272
290
  self.results = {}
291
+ except (json.JSONDecodeError, FileNotFoundError):
292
+ self.results = {}
273
293
 
274
294
  # Update results with new data
275
295
  self.results[url] = result
276
296
 
277
- # Ensure output directory exists
278
- Path(output_file).parent.mkdir(parents=True, exist_ok=True)
279
-
280
297
  # Save combined results atomically
281
298
  temp_file = f"{output_file}.tmp"
282
- with open(temp_file, 'w') as f:
283
- json.dump(self.results, f, indent=2)
299
+ with open(temp_file, 'w', encoding='utf-8') as f:
300
+ json.dump(self.results, f, indent=2, ensure_ascii=False)
284
301
 
285
- # Atomic replace
302
+ # Atomic replace and cleanup backup files
286
303
  Path(temp_file).replace(output_file)
287
- logger.info(f"Updated combined results file with {url}")
304
+ logger.info(f"Updated combined results file with {url} in {output_file}")
305
+
306
+ # Cleanup all backup files
307
+ for backup_file in Path(output_file).parent.glob(f"{Path(output_file).stem}.bak_*"):
308
+ try:
309
+ backup_file.unlink()
310
+ logger.info(f"Cleaned up backup file: {backup_file}")
311
+ except Exception as e:
312
+ logger.warning(f"Failed to remove backup file {backup_file}: {e}")
288
313
 
289
314
  # Cleanup backup files
290
315
  for backup_file in Path(output_file).parent.glob(f"{Path(output_file).stem}.bak_*"):
@@ -345,18 +370,20 @@ class PostExtractionAgent:
345
370
 
346
371
  # Save result synchronously
347
372
  try:
348
- # Save both original and transformed result
349
- if self.config.custom_transform_function:
350
- self._save_result_sync(url, transformed_result)
351
- logger.info(f"Saved transformed results for {url}")
373
+ # Always save the result, whether transformed or original
374
+ result_to_save = transformed_result if self.config.custom_transform_function else result
375
+ if self.config.combine_output and self.config.output_file:
376
+ self._save_result_sync(url, result_to_save)
377
+ logger.info(f"Saved results for {url} to combined output")
352
378
  else:
353
- self._save_result_sync(url, result)
354
- logger.info(f"Saved original results for {url}")
379
+ self._save_result_sync(url, result_to_save)
380
+ logger.info(f"Saved individual result for {url}")
355
381
  except Exception as e:
356
382
  error_msg = f"Error saving results for {url}: {str(e)}"
357
383
  logger.error(error_msg)
358
384
  console.print(f"[red]{error_msg}[/red]")
359
385
 
386
+ # Return the appropriate result
360
387
  return transformed_result if self.config.custom_transform_function else result
361
388
 
362
389
  # Wait before retry
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: spiderforce4ai
3
- Version: 2.6
3
+ Version: 2.6.3
4
4
  Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service with LLM post-processing
5
5
  Home-page: https://petertam.pro
6
6
  Author: Piotr Tamulewicz
@@ -0,0 +1,7 @@
1
+ spiderforce4ai/__init__.py,sha256=BlrRIrtpDUHjmDedqgXP1KbAAsAH9vwFPncUR5VGGyM,44804
2
+ spiderforce4ai/post_extraction_agent.py,sha256=3HB54YrPCbQTUoZWINr7yHvwXwQywmq0f_RBJwKr2gg,19355
3
+ spiderforce4ai-2.6.3.dist-info/METADATA,sha256=L5GCJHggqks18Z31ru5DbDdXT3mdS8pYEDhfdR9igms,9012
4
+ spiderforce4ai-2.6.3.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
5
+ spiderforce4ai-2.6.3.dist-info/entry_points.txt,sha256=ibARQxOlDiL1ho12zbDZt4Uq5RKSIk_qk159ZlZ46hc,59
6
+ spiderforce4ai-2.6.3.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
7
+ spiderforce4ai-2.6.3.dist-info/RECORD,,
@@ -1,7 +0,0 @@
1
- spiderforce4ai/__init__.py,sha256=BlrRIrtpDUHjmDedqgXP1KbAAsAH9vwFPncUR5VGGyM,44804
2
- spiderforce4ai/post_extraction_agent.py,sha256=AysrHIoD-IreqbvWqCDxyN7v8EPSdLOG9yxABamTZSg,17827
3
- spiderforce4ai-2.6.dist-info/METADATA,sha256=JtIZ1-ojRvfm773-yF1a_M_x6eB5kbnb6WT5XT04KDA,9010
4
- spiderforce4ai-2.6.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
5
- spiderforce4ai-2.6.dist-info/entry_points.txt,sha256=ibARQxOlDiL1ho12zbDZt4Uq5RKSIk_qk159ZlZ46hc,59
6
- spiderforce4ai-2.6.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
7
- spiderforce4ai-2.6.dist-info/RECORD,,