spiderforce4ai 2.6.tar.gz → 2.6.4.tar.gz
- {spiderforce4ai-2.6 → spiderforce4ai-2.6.4}/PKG-INFO +1 -1
- {spiderforce4ai-2.6 → spiderforce4ai-2.6.4}/pyproject.toml +1 -1
- {spiderforce4ai-2.6 → spiderforce4ai-2.6.4}/setup.py +1 -1
- {spiderforce4ai-2.6 → spiderforce4ai-2.6.4}/spiderforce4ai/post_extraction_agent.py +53 -46
- {spiderforce4ai-2.6 → spiderforce4ai-2.6.4}/spiderforce4ai.egg-info/PKG-INFO +1 -1
- {spiderforce4ai-2.6 → spiderforce4ai-2.6.4}/README.md +0 -0
- {spiderforce4ai-2.6 → spiderforce4ai-2.6.4}/setup.cfg +0 -0
- {spiderforce4ai-2.6 → spiderforce4ai-2.6.4}/spiderforce4ai/__init__.py +0 -0
- {spiderforce4ai-2.6 → spiderforce4ai-2.6.4}/spiderforce4ai.egg-info/SOURCES.txt +0 -0
- {spiderforce4ai-2.6 → spiderforce4ai-2.6.4}/spiderforce4ai.egg-info/dependency_links.txt +0 -0
- {spiderforce4ai-2.6 → spiderforce4ai-2.6.4}/spiderforce4ai.egg-info/entry_points.txt +0 -0
- {spiderforce4ai-2.6 → spiderforce4ai-2.6.4}/spiderforce4ai.egg-info/not-zip-safe +0 -0
- {spiderforce4ai-2.6 → spiderforce4ai-2.6.4}/spiderforce4ai.egg-info/requires.txt +0 -0
- {spiderforce4ai-2.6 → spiderforce4ai-2.6.4}/spiderforce4ai.egg-info/top_level.txt +0 -0
{spiderforce4ai-2.6 → spiderforce4ai-2.6.4}/pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "spiderforce4ai"
-version = "2.6"
+version = "2.6.4"
 description = "Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service with LLM post-processing"
 readme = "README.md"
 authors = [
{spiderforce4ai-2.6 → spiderforce4ai-2.6.4}/setup.py

@@ -7,7 +7,7 @@ with open("README.md", encoding="utf-8") as f:
 
 setup(
     name="spiderforce4ai",
-    version="2.6",
+    version="2.6.4",
     author="Piotr Tamulewicz",
     author_email="pt@petertam.pro",
     description="Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service with LLM post-processing",
{spiderforce4ai-2.6 → spiderforce4ai-2.6.4}/spiderforce4ai/post_extraction_agent.py

@@ -156,19 +156,40 @@ class PostExtractionAgent:
         self.buffer = PostExtractionBuffer(config.buffer_file)
         self.results: Dict[str, Any] = {}
         self.rate_limiter = RateLimiter()
+        # Convert string path to Path object if needed
+        if isinstance(self.config.output_file, str):
+            self.config.output_file = Path(self.config.output_file)
+        # Ensure parent directory exists
+        self.config.output_file.parent.mkdir(parents=True, exist_ok=True)
+        # Create empty JSON file if it doesn't exist
+        if not self.config.output_file.exists():
+            with open(self.config.output_file, 'w') as f:
+                json.dump({}, f)
         self._setup_output()
 
     def _setup_output(self) -> None:
         """Setup output file if combining results."""
         if self.config.combine_output and self.config.output_file:
+            # Ensure parent directory exists
             self.config.output_file.parent.mkdir(parents=True, exist_ok=True)
+
+            # Load existing results if file exists
             if self.config.output_file.exists():
-
-
-
-
-
-
+                try:
+                    with open(self.config.output_file, 'r') as f:
+                        self.results = json.load(f)
+                except json.JSONDecodeError:
+                    # If file is corrupted, backup and start fresh
+                    backup_path = self.config.output_file.with_suffix(f".bak_{int(time.time())}")
+                    self.config.output_file.rename(backup_path)
+                    self.results = {}
+
+            # Create file if it doesn't exist
+            if not self.config.output_file.exists():
+                self.config.output_file.touch()
+                self.results = {}
+
+            logger.info(f"Initialized output file at {self.config.output_file}")
 
     def _process_single_content(self, url: str, content: str) -> Optional[Dict]:
         """Process a single piece of content through the LLM."""
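Taken together, the new lines make output-file setup defensive: a plain string path is coerced to Path, parent directories are created, an empty JSON object seeds the file, and a corrupted results file is quarantined under a timestamped .bak_* name instead of crashing the agent. A minimal standalone sketch of that pattern (function names here are illustrative, not part of the spiderforce4ai API):

import json
import time
from pathlib import Path
from typing import Any, Dict, Union

def init_results_file(output_file: Union[str, Path]) -> Path:
    # Accept str or Path, create parent directories, and seed a valid
    # (empty) JSON document so later loads never hit a missing file.
    path = Path(output_file)
    path.parent.mkdir(parents=True, exist_ok=True)
    if not path.exists():
        path.write_text("{}")
    return path

def load_or_reset(path: Path) -> Dict[str, Any]:
    # Load existing results; on corruption, rename the bad file to a
    # timestamped backup (e.g. results.bak_1700000000) and start fresh.
    try:
        with open(path, "r") as f:
            return json.load(f)
    except json.JSONDecodeError:
        backup = path.with_suffix(f".bak_{int(time.time())}")
        path.rename(backup)
        return {}

The timestamped suffix matters later: _save_result_sync globs for *.bak_* files and deletes them once a good write has succeeded.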
@@ -257,48 +278,35 @@ class PostExtractionAgent:
         return None
 
     def _save_result_sync(self, url: str, result: Dict) -> None:
-        """Save
+        """Save results synchronously to combined output file."""
         try:
-            if self.config.
-                #
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                Path(output_file).parent.mkdir(parents=True, exist_ok=True)
-
-                # Save combined results atomically
-                temp_file = f"{output_file}.tmp"
-                with open(temp_file, 'w') as f:
-                    json.dump(self.results, f, indent=2)
-
+            if self.config.output_file:
+                # Load existing results
+                try:
+                    with open(self.config.output_file, 'r', encoding='utf-8') as f:
+                        current_results = json.load(f)
+                except (json.JSONDecodeError, FileNotFoundError):
+                    current_results = {}
+
+                # Update with new result
+                current_results[url] = result
+
+                # Save atomically using temporary file
+                temp_file = self.config.output_file.with_suffix('.tmp')
+                with open(temp_file, 'w', encoding='utf-8') as f:
+                    json.dump(current_results, f, indent=2, ensure_ascii=False)
+
                 # Atomic replace
-
+                temp_file.replace(self.config.output_file)
                 logger.info(f"Updated combined results file with {url}")
-
+
                 # Cleanup backup files
-                for backup_file in
+                for backup_file in self.config.output_file.parent.glob(f"{self.config.output_file.stem}.bak_*"):
                     try:
                         backup_file.unlink()
                         logger.info(f"Cleaned up backup file: {backup_file}")
                     except Exception as e:
                         logger.warning(f"Failed to remove backup file {backup_file}: {e}")
-
-            elif not self.config.combine_output and self.config.output_file:
-                individual_file = self.config.output_file.parent / f"{url.replace('/', '_')}.json"
-                with open(individual_file, 'w') as f:
-                    json.dump(result, f, indent=2)
-                logger.info(f"Saved individual result file for {url}")
         except Exception as e:
             logger.error(f"Error saving results for {url}: {str(e)}")
 
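The rewritten body replaces the old dump-everything approach with a read-modify-write cycle that is atomic at the filesystem level: the updated results go to a sibling .tmp file first, and Path.replace then swaps it over the real output file in a single rename. A condensed sketch of the pattern (standalone function, illustrative name):

import json
from pathlib import Path
from typing import Any, Dict

def save_result_atomically(output_file: Path, url: str, result: Dict[str, Any]) -> None:
    # Read the current combined results, tolerating a missing or corrupt file.
    try:
        with open(output_file, "r", encoding="utf-8") as f:
            current: Dict[str, Any] = json.load(f)
    except (json.JSONDecodeError, FileNotFoundError):
        current = {}

    current[url] = result

    # Write to a temp file next to the target, then rename over it.
    # The rename is a single operation, so readers never observe a
    # half-written JSON file.
    temp = output_file.with_suffix(".tmp")
    with open(temp, "w", encoding="utf-8") as f:
        json.dump(current, f, indent=2, ensure_ascii=False)
    temp.replace(output_file)

Keeping the temp file in the same directory keeps the rename on one filesystem, which is what makes the final step atomic on POSIX; Path.replace also overwrites an existing target on Windows.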
@@ -345,18 +353,17 @@ class PostExtractionAgent:
 
                 # Save result synchronously
                 try:
-                    #
-                    if self.config.custom_transform_function
-
-
-
-                    self._save_result_sync(url, result)
-                    logger.info(f"Saved original results for {url}")
+                    # Always save the result, whether transformed or original
+                    result_to_save = transformed_result if self.config.custom_transform_function else result
+                    if self.config.output_file:
+                        self._save_result_sync(url, result_to_save)
+                        logger.info(f"Saved results for {url} to {self.config.output_file}")
                 except Exception as e:
                     error_msg = f"Error saving results for {url}: {str(e)}"
                     logger.error(error_msg)
                     console.print(f"[red]{error_msg}[/red]")
 
+                # Return the appropriate result
                 return transformed_result if self.config.custom_transform_function else result
 
                 # Wait before retry
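Net effect of this hunk: 2.6 saved the untransformed result (and logged "Saved original results") even when a custom_transform_function was configured; 2.6.4 persists the transformed payload and skips saving entirely when no output_file is set. A hedged usage sketch — ConfigSketch below is a hypothetical stand-in for the package's config class, but output_file and custom_transform_function are the field names this diff actually reads:

from dataclasses import dataclass
from pathlib import Path
from typing import Callable, Optional, Union

@dataclass
class ConfigSketch:
    # Hypothetical stand-in; the real spiderforce4ai config class may differ.
    output_file: Optional[Union[str, Path]] = None
    custom_transform_function: Optional[Callable[[dict], dict]] = None

def keep_title_only(result: dict) -> dict:
    # Custom transform: reduce each LLM result to the fields wanted on disk.
    return {"title": result.get("title", "")}

config = ConfigSketch(
    output_file="results/combined.json",       # a str is accepted as of 2.6.4
    custom_transform_function=keep_title_only,
)
# With 2.6.4, the combined JSON maps each URL to keep_title_only(result);
# 2.6 wrote the untransformed result despite the configured transform.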