spiderforce4ai 0.1.8__py3-none-any.whl → 1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- spiderforce4ai/__init__.py +158 -30
- {spiderforce4ai-0.1.8.dist-info → spiderforce4ai-1.0.dist-info}/METADATA +18 -5
- spiderforce4ai-1.0.dist-info/RECORD +5 -0
- spiderforce4ai-0.1.8.dist-info/RECORD +0 -5
- {spiderforce4ai-0.1.8.dist-info → spiderforce4ai-1.0.dist-info}/WHEEL +0 -0
- {spiderforce4ai-0.1.8.dist-info → spiderforce4ai-1.0.dist-info}/top_level.txt +0 -0
spiderforce4ai/__init__.py
CHANGED
@@ -57,22 +57,27 @@ class CrawlConfig:
     output_dir: Path = Path("spiderforce_reports")  # Default to spiderforce_reports in current directory
     webhook_url: Optional[str] = None  # Optional webhook endpoint
     webhook_timeout: int = 10  # Webhook timeout
-
+    webhook_headers: Optional[Dict[str, str]] = None  # Optional webhook headers
+    webhook_payload_template: Optional[str] = None  # Optional custom webhook payload template
+    save_reports: bool = False  # Whether to save crawl reports
+    report_file: Optional[Path] = None  # Optional report file location (used only if save_reports is True)
 
     def __post_init__(self):
-        # Initialize empty lists for
+        # Initialize empty lists/dicts for None values
         self.remove_selectors = self.remove_selectors or []
         self.remove_selectors_regex = self.remove_selectors_regex or []
+        self.webhook_headers = self.webhook_headers or {}
 
         # Ensure output_dir is a Path and exists
         self.output_dir = Path(self.output_dir)
         self.output_dir.mkdir(parents=True, exist_ok=True)
 
-        #
-        if self.
-        self.report_file
-
-
+        # Only setup report file if save_reports is True
+        if self.save_reports:
+            if self.report_file is None:
+                self.report_file = self.output_dir / "crawl_report.json"
+            else:
+                self.report_file = Path(self.report_file)
 
     def to_dict(self) -> Dict:
         """Convert config to dictionary for API requests."""
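In 1.0 the report path is resolved only when report saving is enabled; otherwise `report_file` is left untouched. A minimal sketch of that rule, rewritten as a standalone helper purely for illustration (the helper name is hypothetical, not part of the package):

```python
from pathlib import Path
from typing import Optional

def resolve_report_file(save_reports: bool, report_file: Optional[Path], output_dir: Path) -> Optional[Path]:
    # Mirrors the __post_init__ logic above: no report file unless save_reports is True.
    if not save_reports:
        return report_file
    return Path(report_file) if report_file is not None else output_dir / "crawl_report.json"

print(resolve_report_file(False, None, Path("spiderforce_reports")))  # None
print(resolve_report_file(True, None, Path("spiderforce_reports")))   # spiderforce_reports/crawl_report.json
```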
@@ -92,19 +97,34 @@ def _send_webhook_sync(result: CrawlResult, config: CrawlConfig) -> None:
     if not config.webhook_url:
         return
 
-    payload
-
-
-
-
-
-
-
+    # Use custom payload template if provided, otherwise use default
+    if config.webhook_payload_template:
+        # Replace variables in the template
+        payload_str = config.webhook_payload_template.format(
+            url=result.url,
+            status=result.status,
+            markdown=result.markdown if result.status == "success" else None,
+            error=result.error if result.status == "failed" else None,
+            timestamp=result.timestamp,
+            config=config.to_dict()
+        )
+        payload = json.loads(payload_str)  # Parse the formatted JSON string
+    else:
+        # Use default payload format
+        payload = {
+            "url": result.url,
+            "status": result.status,
+            "markdown": result.markdown if result.status == "success" else None,
+            "error": result.error if result.status == "failed" else None,
+            "timestamp": result.timestamp,
+            "config": config.to_dict()
+        }
 
     try:
         response = requests.post(
             config.webhook_url,
             json=payload,
+            headers=config.webhook_headers,
             timeout=config.webhook_timeout
         )
         response.raise_for_status()
@@ -276,8 +296,8 @@ class SpiderForce4AI:
 
             results.append(result)
 
-        # Save report if
-        if config.
+        # Save report if enabled
+        if config.save_reports:
             self._save_report_sync(results, config)
             print(f"\nReport saved to: {config.report_file}")
 
@@ -420,6 +440,55 @@ class SpiderForce4AI:
         """Synchronous version of crawl_url_async."""
         return asyncio.run(self.crawl_url_async(url, config))
 
+    async def _retry_failed_urls(self, failed_results: List[CrawlResult], config: CrawlConfig, progress=None) -> List[CrawlResult]:
+        """Retry failed URLs once."""
+        if not failed_results:
+            return []
+
+        console.print("\n[yellow]Retrying failed URLs...[/yellow]")
+        retry_results = []
+
+        # Create a new progress bar if one wasn't provided
+        should_close_progress = progress is None
+        if progress is None:
+            progress = Progress(
+                SpinnerColumn(),
+                TextColumn("[progress.description]{task.description}"),
+                BarColumn(),
+                TaskProgressColumn(),
+                console=console
+            )
+            progress.start()
+
+        retry_task = progress.add_task("[yellow]Retrying failed URLs...", total=len(failed_results))
+
+        for result in failed_results:
+            progress.update(retry_task, description=f"[yellow]Retrying: {result.url}")
+
+            try:
+                new_result = await self.crawl_url_async(result.url, config)
+                if new_result.status == "success":
+                    console.print(f"[green]✓ Retry successful: {result.url}[/green]")
+                else:
+                    console.print(f"[red]✗ Retry failed: {result.url} - {new_result.error}[/red]")
+                retry_results.append(new_result)
+            except Exception as e:
+                console.print(f"[red]✗ Retry error: {result.url} - {str(e)}[/red]")
+                retry_results.append(CrawlResult(
+                    url=result.url,
+                    status="failed",
+                    error=f"Retry error: {str(e)}",
+                    config=config.to_dict()
+                ))
+
+            progress.update(retry_task, advance=1)
+            await asyncio.sleep(config.request_delay)
+
+        if should_close_progress:
+            progress.stop()
+
+        return retry_results
+
     async def crawl_urls_async(self, urls: List[str], config: CrawlConfig) -> List[CrawlResult]:
         """Crawl multiple URLs asynchronously with progress bar."""
         await self._ensure_session()
@@ -445,17 +514,46 @@ class SpiderForce4AI:
             await asyncio.sleep(config.request_delay)
             return result
 
-
+        initial_results = await asyncio.gather(*[crawl_with_semaphore(url) for url in urls])
+
+        # Identify failed URLs
+        failed_results = [r for r in initial_results if r.status == "failed"]
+
+        # Calculate initial failure ratio
+        initial_failed = len(failed_results)
+        total_urls = len(urls)
+        failure_ratio = (initial_failed / total_urls) * 100
+
+        # Retry failed URLs if ratio is acceptable
+        if failed_results:
+            if failure_ratio > 20:
+                console.print(f"\n[red]Failure ratio too high ({failure_ratio:.1f}%) - aborting retry due to possible server overload[/red]")
+                results = initial_results
+            else:
+                retry_results = await self._retry_failed_urls(failed_results, config, progress)
+                # Replace failed results with retry results
+                results = [r for r in initial_results if r.status == "success"] + retry_results
+        else:
+            results = initial_results
 
         # Save final report
         await self._save_report(config)
 
-        #
-
-
-
-
-        console.print(f"
+        # Calculate final statistics
+        final_successful = len([r for r in results if r.status == "success"])
+        final_failed = len([r for r in results if r.status == "failed"])
+
+        # Print detailed summary
+        console.print(f"\n[green]Crawling Summary:[/green]")
+        console.print(f"Total URLs processed: {total_urls}")
+        console.print(f"Initial failures: {initial_failed} ({failure_ratio:.1f}%)")
+        console.print(f"Final results:")
+        console.print(f"  ✓ Successful: {final_successful}")
+        console.print(f"  ✗ Failed: {final_failed}")
+
+        if initial_failed > 0:
+            retry_successful = initial_failed - final_failed
+            console.print(f"Retry success rate: {retry_successful}/{initial_failed} ({(retry_successful/initial_failed)*100:.1f}%)")
 
         if config.report_file:
             console.print(f"📊 Report saved to: {config.report_file}")
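The batch retry is gated on the share of URLs that failed on the first pass: above 20% the retry is skipped on the assumption that the server itself is overloaded. A tiny standalone sketch of that gate (the helper name is hypothetical, not part of the package):

```python
def should_retry(total_urls: int, failed: int, max_failure_pct: float = 20.0) -> bool:
    # Mirrors the gate in crawl_urls_async: skip retries when too much of the batch failed.
    if failed == 0 or total_urls == 0:
        return False
    return (failed / total_urls) * 100 <= max_failure_pct

print(should_retry(total_urls=50, failed=5))   # True  -> 10.0% failed, retry once
print(should_retry(total_urls=50, failed=15))  # False -> 30.0% failed, possible server overload
```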
@@ -543,12 +641,42 @@ class SpiderForce4AI:
             self._save_report_sync(results, config)
             print(f"\nReport saved to: {config.report_file}")
 
-        #
-
-
-
-
-
+        # Calculate initial failure statistics
+        failed_results = [r for r in results if r.status == "failed"]
+        initial_failed = len(failed_results)
+        total_urls = len(urls)
+        failure_ratio = (initial_failed / total_urls) * 100
+
+        # Retry failed URLs if ratio is acceptable
+        if failed_results:
+            if failure_ratio > 20:
+                console.print(f"\n[red]Failure ratio too high ({failure_ratio:.1f}%) - aborting retry due to possible server overload[/red]")
+            else:
+                console.print("\n[yellow]Retrying failed URLs...[/yellow]")
+                for result in failed_results:
+                    new_result = _process_url_parallel((result.url, self.base_url, config))
+                    if new_result.status == "success":
+                        console.print(f"[green]✓ Retry successful: {result.url}[/green]")
+                        # Replace the failed result with the successful retry
+                        results[results.index(result)] = new_result
+                    else:
+                        console.print(f"[red]✗ Retry failed: {result.url} - {new_result.error}[/red]")
+
+        # Calculate final statistics
+        final_successful = len([r for r in results if r.status == "success"])
+        final_failed = len([r for r in results if r.status == "failed"])
+
+        # Print detailed summary
+        console.print(f"\n[green]Crawling Summary:[/green]")
+        console.print(f"Total URLs processed: {total_urls}")
+        console.print(f"Initial failures: {initial_failed} ({failure_ratio:.1f}%)")
+        console.print(f"Final results:")
+        console.print(f"  ✓ Successful: {final_successful}")
+        console.print(f"  ✗ Failed: {final_failed}")
+
+        if initial_failed > 0:
+            retry_successful = initial_failed - final_failed
+            console.print(f"Retry success rate: {retry_successful}/{initial_failed} ({(retry_successful/initial_failed)*100:.1f}%)")
 
         return results
 
{spiderforce4ai-0.1.8.dist-info → spiderforce4ai-1.0.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: spiderforce4ai
-Version:
+Version: 1.0
 Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service
 Home-page: https://petertam.pro
 Author: Piotr Tamulewicz
@@ -117,10 +117,23 @@ config = CrawlConfig(
     timeout=30,  # Request timeout (seconds)
 
     # Output Settings
-    output_dir=Path("spiderforce_reports"),
-    webhook_url="https://your-webhook.com",
-    webhook_timeout=10,
-
+    output_dir=Path("spiderforce_reports"),  # Default directory for files
+    webhook_url="https://your-webhook.com",  # Real-time notifications
+    webhook_timeout=10,  # Webhook timeout
+    webhook_headers={  # Optional custom headers for webhook
+        "Authorization": "Bearer your-token",
+        "X-Custom-Header": "value"
+    },
+    webhook_payload_template='''{  # Optional custom webhook payload template
+        "crawled_url": "{url}",
+        "content": "{markdown}",
+        "crawl_status": "{status}",
+        "crawl_error": "{error}",
+        "crawl_time": "{timestamp}",
+        "custom_field": "your-value"
+    }''',
+    save_reports=False,  # Whether to save crawl reports (default: False)
+    report_file=Path("crawl_report.json")  # Report location (used only if save_reports=True)
 )
 ```
 
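For reference, a minimal receiver for the webhook configured above could look like the sketch below, using only the standard library; the port, the bearer-token check, and the payload field names (which follow the custom template in the README snippet) are assumptions for illustration, not part of the package.

```python
import json
from http.server import BaseHTTPRequestHandler, HTTPServer

class WebhookHandler(BaseHTTPRequestHandler):
    def do_POST(self):
        # Reject requests that don't carry the bearer token set in webhook_headers.
        if self.headers.get("Authorization") != "Bearer your-token":
            self.send_response(401)
            self.end_headers()
            return
        length = int(self.headers.get("Content-Length", 0))
        payload = json.loads(self.rfile.read(length))
        # Field names match the custom payload template shown above.
        print(payload.get("crawl_status"), payload.get("crawled_url"))
        self.send_response(200)
        self.end_headers()

if __name__ == "__main__":
    HTTPServer(("0.0.0.0", 8000), WebhookHandler).serve_forever()
```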
spiderforce4ai-1.0.dist-info/RECORD
@@ -0,0 +1,5 @@
+spiderforce4ai/__init__.py,sha256=8WEcryB8fckf5yIvH55s7a5FtxvK_AhXdi_dyaqqing,27929
+spiderforce4ai-1.0.dist-info/METADATA,sha256=VqydJoQcHkzvIhYTPeH3j8ZSHK-lGbo1xmZwQZk6w2s,7769
+spiderforce4ai-1.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+spiderforce4ai-1.0.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
+spiderforce4ai-1.0.dist-info/RECORD,,
spiderforce4ai-0.1.8.dist-info/RECORD
@@ -1,5 +0,0 @@
-spiderforce4ai/__init__.py,sha256=Y_7CfRVYQ2ssH67YexwCV12J14tB125U7WIhVTQfYwU,21652
-spiderforce4ai-0.1.8.dist-info/METADATA,sha256=kXn_kUTsFZm8wtdMt0lTo85Jr3SYAZQzZn_3VL4KkeU,7169
-spiderforce4ai-0.1.8.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
-spiderforce4ai-0.1.8.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
-spiderforce4ai-0.1.8.dist-info/RECORD,,
{spiderforce4ai-0.1.8.dist-info → spiderforce4ai-1.0.dist-info}/WHEEL
File without changes
{spiderforce4ai-0.1.8.dist-info → spiderforce4ai-1.0.dist-info}/top_level.txt
File without changes