spiderforce4ai 2.6.3__py3-none-any.whl → 2.6.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -159,6 +159,12 @@ class PostExtractionAgent:
159
159
  # Convert string path to Path object if needed
160
160
  if isinstance(self.config.output_file, str):
161
161
  self.config.output_file = Path(self.config.output_file)
162
+ # Ensure parent directory exists
163
+ self.config.output_file.parent.mkdir(parents=True, exist_ok=True)
164
+ # Create empty JSON file if it doesn't exist
165
+ if not self.config.output_file.exists():
166
+ with open(self.config.output_file, 'w') as f:
167
+ json.dump({}, f)
162
168
  self._setup_output()
163
169
 
164
170
  def _setup_output(self) -> None:
@@ -272,58 +278,35 @@ class PostExtractionAgent:
272
278
  return None
273
279
 
274
280
  def _save_result_sync(self, url: str, result: Dict) -> None:
275
- """Save individual or combined results synchronously."""
281
+ """Save results synchronously to combined output file."""
276
282
  try:
277
- if self.config.combine_output and self.config.output_file:
278
- # Convert Path to string if needed
279
- output_file = str(self.config.output_file) if isinstance(self.config.output_file, Path) else self.config.output_file
280
-
281
- # Ensure output directory exists
282
- Path(output_file).parent.mkdir(parents=True, exist_ok=True)
283
-
284
- # Load existing results or create new
283
+ if self.config.output_file:
284
+ # Load existing results
285
285
  try:
286
- if Path(output_file).exists():
287
- with open(output_file, 'r', encoding='utf-8') as f:
288
- self.results = json.load(f)
289
- else:
290
- self.results = {}
286
+ with open(self.config.output_file, 'r', encoding='utf-8') as f:
287
+ current_results = json.load(f)
291
288
  except (json.JSONDecodeError, FileNotFoundError):
292
- self.results = {}
293
-
294
- # Update results with new data
295
- self.results[url] = result
296
-
297
- # Save combined results atomically
298
- temp_file = f"{output_file}.tmp"
289
+ current_results = {}
290
+
291
+ # Update with new result
292
+ current_results[url] = result
293
+
294
+ # Save atomically using temporary file
295
+ temp_file = self.config.output_file.with_suffix('.tmp')
299
296
  with open(temp_file, 'w', encoding='utf-8') as f:
300
- json.dump(self.results, f, indent=2, ensure_ascii=False)
301
-
302
- # Atomic replace and cleanup backup files
303
- Path(temp_file).replace(output_file)
304
- logger.info(f"Updated combined results file with {url} in {output_file}")
305
-
306
- # Cleanup all backup files
307
- for backup_file in Path(output_file).parent.glob(f"{Path(output_file).stem}.bak_*"):
308
- try:
309
- backup_file.unlink()
310
- logger.info(f"Cleaned up backup file: {backup_file}")
311
- except Exception as e:
312
- logger.warning(f"Failed to remove backup file {backup_file}: {e}")
313
-
297
+ json.dump(current_results, f, indent=2, ensure_ascii=False)
298
+
299
+ # Atomic replace
300
+ temp_file.replace(self.config.output_file)
301
+ logger.info(f"Updated combined results file with {url}")
302
+
314
303
  # Cleanup backup files
315
- for backup_file in Path(output_file).parent.glob(f"{Path(output_file).stem}.bak_*"):
304
+ for backup_file in self.config.output_file.parent.glob(f"{self.config.output_file.stem}.bak_*"):
316
305
  try:
317
306
  backup_file.unlink()
318
307
  logger.info(f"Cleaned up backup file: {backup_file}")
319
308
  except Exception as e:
320
309
  logger.warning(f"Failed to remove backup file {backup_file}: {e}")
321
-
322
- elif not self.config.combine_output and self.config.output_file:
323
- individual_file = self.config.output_file.parent / f"{url.replace('/', '_')}.json"
324
- with open(individual_file, 'w') as f:
325
- json.dump(result, f, indent=2)
326
- logger.info(f"Saved individual result file for {url}")
327
310
  except Exception as e:
328
311
  logger.error(f"Error saving results for {url}: {str(e)}")
329
312
 
@@ -372,12 +355,9 @@ class PostExtractionAgent:
372
355
  try:
373
356
  # Always save the result, whether transformed or original
374
357
  result_to_save = transformed_result if self.config.custom_transform_function else result
375
- if self.config.combine_output and self.config.output_file:
376
- self._save_result_sync(url, result_to_save)
377
- logger.info(f"Saved results for {url} to combined output")
378
- else:
358
+ if self.config.output_file:
379
359
  self._save_result_sync(url, result_to_save)
380
- logger.info(f"Saved individual result for {url}")
360
+ logger.info(f"Saved results for {url} to {self.config.output_file}")
381
361
  except Exception as e:
382
362
  error_msg = f"Error saving results for {url}: {str(e)}"
383
363
  logger.error(error_msg)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: spiderforce4ai
3
- Version: 2.6.3
3
+ Version: 2.6.4
4
4
  Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service with LLM post-processing
5
5
  Home-page: https://petertam.pro
6
6
  Author: Piotr Tamulewicz
@@ -0,0 +1,7 @@
1
+ spiderforce4ai/__init__.py,sha256=BlrRIrtpDUHjmDedqgXP1KbAAsAH9vwFPncUR5VGGyM,44804
2
+ spiderforce4ai/post_extraction_agent.py,sha256=GJXV-qTi9xAwil8YSUBaB0OUDDPDzWfcYScldxKsenw,18121
3
+ spiderforce4ai-2.6.4.dist-info/METADATA,sha256=pzdSGVryHkPAOuf3UcBrimThEi4paBzYKFoWCM-wVvY,9012
4
+ spiderforce4ai-2.6.4.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
5
+ spiderforce4ai-2.6.4.dist-info/entry_points.txt,sha256=ibARQxOlDiL1ho12zbDZt4Uq5RKSIk_qk159ZlZ46hc,59
6
+ spiderforce4ai-2.6.4.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
7
+ spiderforce4ai-2.6.4.dist-info/RECORD,,
@@ -1,7 +0,0 @@
1
- spiderforce4ai/__init__.py,sha256=BlrRIrtpDUHjmDedqgXP1KbAAsAH9vwFPncUR5VGGyM,44804
2
- spiderforce4ai/post_extraction_agent.py,sha256=3HB54YrPCbQTUoZWINr7yHvwXwQywmq0f_RBJwKr2gg,19355
3
- spiderforce4ai-2.6.3.dist-info/METADATA,sha256=L5GCJHggqks18Z31ru5DbDdXT3mdS8pYEDhfdR9igms,9012
4
- spiderforce4ai-2.6.3.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
5
- spiderforce4ai-2.6.3.dist-info/entry_points.txt,sha256=ibARQxOlDiL1ho12zbDZt4Uq5RKSIk_qk159ZlZ46hc,59
6
- spiderforce4ai-2.6.3.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
7
- spiderforce4ai-2.6.3.dist-info/RECORD,,