spiderforce4ai 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- spiderforce4ai/__init__.py +223 -22
- {spiderforce4ai-0.1.7.dist-info → spiderforce4ai-0.1.9.dist-info}/METADATA +106 -75
- spiderforce4ai-0.1.9.dist-info/RECORD +5 -0
- spiderforce4ai-0.1.7.dist-info/RECORD +0 -5
- {spiderforce4ai-0.1.7.dist-info → spiderforce4ai-0.1.9.dist-info}/WHEEL +0 -0
- {spiderforce4ai-0.1.7.dist-info → spiderforce4ai-0.1.9.dist-info}/top_level.txt +0 -0
spiderforce4ai/__init__.py
CHANGED
@@ -57,22 +57,27 @@ class CrawlConfig:
     output_dir: Path = Path("spiderforce_reports") # Default to spiderforce_reports in current directory
     webhook_url: Optional[str] = None # Optional webhook endpoint
     webhook_timeout: int = 10 # Webhook timeout
-
+    webhook_headers: Optional[Dict[str, str]] = None # Optional webhook headers
+    webhook_payload_template: Optional[str] = None # Optional custom webhook payload template
+    save_reports: bool = False # Whether to save crawl reports
+    report_file: Optional[Path] = None # Optional report file location (used only if save_reports is True)

     def __post_init__(self):
-        # Initialize empty lists for
+        # Initialize empty lists/dicts for None values
         self.remove_selectors = self.remove_selectors or []
         self.remove_selectors_regex = self.remove_selectors_regex or []
+        self.webhook_headers = self.webhook_headers or {}

         # Ensure output_dir is a Path and exists
         self.output_dir = Path(self.output_dir)
         self.output_dir.mkdir(parents=True, exist_ok=True)

-        #
-        if self.
-        self.report_file
-
-
+        # Only setup report file if save_reports is True
+        if self.save_reports:
+            if self.report_file is None:
+                self.report_file = self.output_dir / "crawl_report.json"
+            else:
+                self.report_file = Path(self.report_file)

     def to_dict(self) -> Dict:
         """Convert config to dictionary for API requests."""
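Taken together, the new fields make report writing opt-in and webhook delivery configurable. A minimal usage sketch, not part of the diff; the field names are the ones added above, the values are placeholders:

```python
from pathlib import Path
from spiderforce4ai import CrawlConfig

# Reports are only written when save_reports is True; report_file then
# defaults to <output_dir>/crawl_report.json unless overridden.
config = CrawlConfig(
    save_reports=True,
    report_file=Path("reports/my_crawl.json"),            # placeholder path
    webhook_headers={"Authorization": "Bearer <token>"},  # placeholder token
)
```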
@@ -92,19 +97,34 @@ def _send_webhook_sync(result: CrawlResult, config: CrawlConfig) -> None:
     if not config.webhook_url:
         return

-    payload
-
-
-
-
-
-
-
+    # Use custom payload template if provided, otherwise use default
+    if config.webhook_payload_template:
+        # Replace variables in the template
+        payload_str = config.webhook_payload_template.format(
+            url=result.url,
+            status=result.status,
+            markdown=result.markdown if result.status == "success" else None,
+            error=result.error if result.status == "failed" else None,
+            timestamp=result.timestamp,
+            config=config.to_dict()
+        )
+        payload = json.loads(payload_str) # Parse the formatted JSON string
+    else:
+        # Use default payload format
+        payload = {
+            "url": result.url,
+            "status": result.status,
+            "markdown": result.markdown if result.status == "success" else None,
+            "error": result.error if result.status == "failed" else None,
+            "timestamp": result.timestamp,
+            "config": config.to_dict()
+        }

     try:
         response = requests.post(
             config.webhook_url,
             json=payload,
+            headers=config.webhook_headers,
             timeout=config.webhook_timeout
         )
         response.raise_for_status()
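The custom template is run through Python's str.format() with url, status, markdown, error, timestamp and config, then parsed with json.loads(), so after substitution it has to be valid JSON. A hedged sketch of a template (endpoint and JSON field names are placeholders); note that in this sketch the literal JSON braces are doubled so str.format() does not treat them as replacement fields:

```python
from spiderforce4ai import CrawlConfig

# {url}, {status}, {timestamp} are substituted by str.format() in the
# branch added above; {{ and }} render as literal braces in the JSON.
config = CrawlConfig(
    webhook_url="https://example.com/hooks/crawl",  # placeholder endpoint
    webhook_payload_template='''{{
        "page": "{url}",
        "state": "{status}",
        "finished_at": "{timestamp}"
    }}''',
)
```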
@@ -196,6 +216,113 @@ class SpiderForce4AI:
             await f.write(markdown)
         return filepath

+
+
+    def crawl_sitemap_server_parallel(self, sitemap_url: str, config: CrawlConfig) -> List[CrawlResult]:
+        """
+        Crawl sitemap URLs using server-side parallel processing.
+        """
+        print(f"Fetching sitemap from {sitemap_url}...")
+
+        # Fetch sitemap
+        try:
+            response = requests.get(sitemap_url, timeout=config.timeout)
+            response.raise_for_status()
+            sitemap_text = response.text
+        except Exception as e:
+            print(f"Error fetching sitemap: {str(e)}")
+            raise
+
+        # Parse sitemap
+        try:
+            root = ET.fromstring(sitemap_text)
+            namespace = {'ns': root.tag.split('}')[0].strip('{')}
+            urls = [loc.text for loc in root.findall('.//ns:loc', namespace)]
+            print(f"Found {len(urls)} URLs in sitemap")
+        except Exception as e:
+            print(f"Error parsing sitemap: {str(e)}")
+            raise
+
+        # Process URLs using server-side parallel endpoint
+        return self.crawl_urls_server_parallel(urls, config)
+
+
+    def crawl_urls_server_parallel(self, urls: List[str], config: CrawlConfig) -> List[CrawlResult]:
+        """
+        Crawl multiple URLs using server-side parallel processing.
+        This uses the /convert_parallel endpoint which handles parallelization on the server.
+        """
+        print(f"Sending {len(urls)} URLs for parallel processing...")
+
+        try:
+            endpoint = f"{self.base_url}/convert_parallel"
+
+            # Prepare payload
+            payload = {
+                "urls": urls,
+                **config.to_dict()
+            }
+
+            # Send request
+            response = requests.post(
+                endpoint,
+                json=payload,
+                timeout=config.timeout
+            )
+            response.raise_for_status()
+
+            # Process results
+            results = []
+            server_results = response.json()  # Assuming server returns JSON array of results
+
+            for url_result in server_results:
+                result = CrawlResult(
+                    url=url_result["url"],
+                    status=url_result.get("status", "failed"),
+                    markdown=url_result.get("markdown"),
+                    error=url_result.get("error"),
+                    config=config.to_dict()
+                )
+
+                # Save markdown if successful and output dir is configured
+                if result.status == "success" and config.output_dir and result.markdown:
+                    filepath = config.output_dir / f"{slugify(result.url)}.md"
+                    with open(filepath, 'w', encoding='utf-8') as f:
+                        f.write(result.markdown)
+
+                # Send webhook if configured
+                if config.webhook_url:
+                    _send_webhook_sync(result, config)
+
+                results.append(result)
+
+            # Save report if enabled
+            if config.save_reports:
+                self._save_report_sync(results, config)
+                print(f"\nReport saved to: {config.report_file}")
+
+            # Print summary
+            successful = len([r for r in results if r.status == "success"])
+            failed = len([r for r in results if r.status == "failed"])
+            print(f"\nParallel processing completed:")
+            print(f"✓ Successful: {successful}")
+            print(f"✗ Failed: {failed}")
+
+            return results
+
+        except Exception as e:
+            print(f"Error during parallel processing: {str(e)}")
+            # Create failed results for all URLs
+            return [
+                CrawlResult(
+                    url=url,
+                    status="failed",
+                    error=str(e),
+                    config=config.to_dict()
+                ) for url in urls
+            ]
+
+
     async def _send_webhook(self, result: CrawlResult, config: CrawlConfig):
         """Send webhook with crawl results."""
         if not config.webhook_url:
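Both new methods send a single POST to the service's /convert_parallel endpoint instead of fanning requests out from the client. A rough usage sketch (service URL and page URLs are placeholders):

```python
from spiderforce4ai import SpiderForce4AI, CrawlConfig

spider = SpiderForce4AI("http://localhost:3004")  # placeholder service URL
config = CrawlConfig(save_reports=True)

# One request carries every URL; the server handles the parallelism.
results = spider.crawl_urls_server_parallel(
    ["https://example.com/a", "https://example.com/b"], config
)

# Or let the sitemap variant collect the URLs first.
results = spider.crawl_sitemap_server_parallel("https://example.com/sitemap.xml", config)
```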
@@ -313,6 +440,55 @@ class SpiderForce4AI:
         """Synchronous version of crawl_url_async."""
         return asyncio.run(self.crawl_url_async(url, config))

+    async def _retry_failed_urls(self, failed_results: List[CrawlResult], config: CrawlConfig, progress=None) -> List[CrawlResult]:
+        """Retry failed URLs once."""
+        if not failed_results:
+            return []
+
+        console.print("\n[yellow]Retrying failed URLs...[/yellow]")
+        retry_results = []
+
+        # Create a new progress bar if one wasn't provided
+        should_close_progress = progress is None
+        if progress is None:
+            progress = Progress(
+                SpinnerColumn(),
+                TextColumn("[progress.description]{task.description}"),
+                BarColumn(),
+                TaskProgressColumn(),
+                console=console
+            )
+            progress.start()
+
+        retry_task = progress.add_task("[yellow]Retrying failed URLs...", total=len(failed_results))
+
+        for result in failed_results:
+            progress.update(retry_task, description=f"[yellow]Retrying: {result.url}")
+
+            try:
+                new_result = await self.crawl_url_async(result.url, config)
+                if new_result.status == "success":
+                    console.print(f"[green]✓ Retry successful: {result.url}[/green]")
+                else:
+                    console.print(f"[red]✗ Retry failed: {result.url} - {new_result.error}[/red]")
+                retry_results.append(new_result)
+            except Exception as e:
+                console.print(f"[red]✗ Retry error: {result.url} - {str(e)}[/red]")
+                retry_results.append(CrawlResult(
+                    url=result.url,
+                    status="failed",
+                    error=f"Retry error: {str(e)}",
+                    config=config.to_dict()
+                ))
+
+            progress.update(retry_task, advance=1)
+            await asyncio.sleep(config.request_delay)
+
+        if should_close_progress:
+            progress.stop()
+
+        return retry_results
+
     async def crawl_urls_async(self, urls: List[str], config: CrawlConfig) -> List[CrawlResult]:
         """Crawl multiple URLs asynchronously with progress bar."""
         await self._ensure_session()
@@ -338,15 +514,27 @@ class SpiderForce4AI:
             await asyncio.sleep(config.request_delay)
             return result

-
+        initial_results = await asyncio.gather(*[crawl_with_semaphore(url) for url in urls])
+
+        # Identify failed URLs
+        failed_results = [r for r in initial_results if r.status == "failed"]
+
+        # Retry failed URLs
+        if failed_results:
+            retry_results = await self._retry_failed_urls(failed_results, config, progress)
+
+            # Replace failed results with retry results
+            results = [r for r in initial_results if r.status == "success"] + retry_results
+        else:
+            results = initial_results

         # Save final report
         await self._save_report(config)

-        # Print summary
+        # Print final summary
         successful = len([r for r in results if r.status == "success"])
         failed = len([r for r in results if r.status == "failed"])
-        console.print(f"\n[green]
+        console.print(f"\n[green]Final crawling results:[/green]")
         console.print(f"✓ Successful: {successful}")
         console.print(f"✗ Failed: {failed}")

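With this change a URL that fails in crawl_urls_async is retried once via _retry_failed_urls above before the final summary is printed; callers do not need to do anything extra. A small async sketch (service URL and page URLs are placeholders, the async-with usage follows the README):

```python
import asyncio
from spiderforce4ai import SpiderForce4AI, CrawlConfig

async def main():
    spider = SpiderForce4AI("http://localhost:3004")  # placeholder service URL
    config = CrawlConfig()
    async with spider:
        # Failed URLs are retried once automatically before the summary prints.
        return await spider.crawl_urls_async(
            ["https://example.com/a", "https://example.com/b"], config
        )

results = asyncio.run(main())
```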
@@ -436,12 +624,25 @@ class SpiderForce4AI:
             self._save_report_sync(results, config)
             print(f"\nReport saved to: {config.report_file}")

-        #
+        # Identify failed URLs and retry them
+        failed_results = [r for r in results if r.status == "failed"]
+        if failed_results:
+            console.print("\n[yellow]Retrying failed URLs...[/yellow]")
+            for result in failed_results:
+                new_result = _process_url_parallel((result.url, self.base_url, config))
+                if new_result.status == "success":
+                    console.print(f"[green]✓ Retry successful: {result.url}[/green]")
+                    # Replace the failed result with the successful retry
+                    results[results.index(result)] = new_result
+                else:
+                    console.print(f"[red]✗ Retry failed: {result.url} - {new_result.error}[/red]")
+
+        # Print final summary
         successful = len([r for r in results if r.status == "success"])
         failed = len([r for r in results if r.status == "failed"])
-        print(f"\
-        print(f"✓ Successful: {successful}")
-        print(f"✗ Failed: {failed}")
+        console.print(f"\n[green]Final crawling results:[/green]")
+        console.print(f"✓ Successful: {successful}")
+        console.print(f"✗ Failed: {failed}")

         return results

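The multiprocessing path gets the same one-shot retry, so from the caller's side nothing changes. A sketch, assuming this hunk sits in the client-side parallel crawl that the README's crawl_urls_parallel() exposes (URLs and service address are placeholders):

```python
from spiderforce4ai import SpiderForce4AI, CrawlConfig

spider = SpiderForce4AI("http://localhost:3004")  # placeholder service URL
config = CrawlConfig(max_concurrent_requests=5)

# Client-side (multiprocessing) crawl; failed URLs are retried once in-process.
results = spider.crawl_urls_parallel(
    ["https://example.com/a", "https://example.com/b"], config
)
failed = [r for r in results if r.status == "failed"]
```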
{spiderforce4ai-0.1.7.dist-info → spiderforce4ai-0.1.9.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: spiderforce4ai
-Version: 0.1.7
+Version: 0.1.9
 Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service
 Home-page: https://petertam.pro
 Author: Piotr Tamulewicz
@@ -24,75 +24,73 @@ Dynamic: requires-python

 # SpiderForce4AI Python Wrapper

-A Python
-
-## Installation
-
-```bash
-pip install spiderforce4ai
-```
+A Python package for web content crawling and HTML-to-Markdown conversion. Built for seamless integration with SpiderForce4AI service.

 ## Quick Start (Minimal Setup)

 ```python
 from spiderforce4ai import SpiderForce4AI, CrawlConfig

-# Initialize with your
+# Initialize with your service URL
 spider = SpiderForce4AI("http://localhost:3004")

-#
+# Create default config
 config = CrawlConfig()

 # Crawl a single URL
 result = spider.crawl_url("https://example.com", config)
 ```

+## Installation
+
+```bash
+pip install spiderforce4ai
+```
+
 ## Crawling Methods

-### 1. Single URL
+### 1. Single URL

 ```python
-#
+# Basic usage
 result = spider.crawl_url("https://example.com", config)

-#
+# Async version
 async def crawl():
     result = await spider.crawl_url_async("https://example.com", config)
 ```

-### 2. Multiple URLs
+### 2. Multiple URLs

 ```python
-# List of URLs
 urls = [
     "https://example.com/page1",
-    "https://example.com/page2"
-    "https://example.com/page3"
+    "https://example.com/page2"
 ]

-#
-results = spider.
+# Client-side parallel (using multiprocessing)
+results = spider.crawl_urls_parallel(urls, config)
+
+# Server-side parallel (single request)
+results = spider.crawl_urls_server_parallel(urls, config)

-#
+# Async version
 async def crawl():
     results = await spider.crawl_urls_async(urls, config)
-
-# Parallel (using multiprocessing)
-results = spider.crawl_urls_parallel(urls, config)
 ```

 ### 3. Sitemap Crawling

 ```python
-#
-results = spider.
+# Server-side parallel (recommended)
+results = spider.crawl_sitemap_server_parallel("https://example.com/sitemap.xml", config)
+
+# Client-side parallel
+results = spider.crawl_sitemap_parallel("https://example.com/sitemap.xml", config)

-#
+# Async version
 async def crawl():
     results = await spider.crawl_sitemap_async("https://example.com/sitemap.xml", config)
-
-# Parallel (using multiprocessing)
-results = spider.crawl_sitemap_parallel("https://example.com/sitemap.xml", config)
 ```

 ## Configuration Options
@@ -100,9 +98,11 @@ results = spider.crawl_sitemap_parallel("https://example.com/sitemap.xml", config)
 All configuration options are optional with sensible defaults:

 ```python
+from pathlib import Path
+
 config = CrawlConfig(
     # Content Selection (all optional)
-    target_selector="article", # Specific element to
+    target_selector="article", # Specific element to extract
     remove_selectors=[ # Elements to remove
         ".ads",
         "#popup",
@@ -112,21 +112,34 @@ config = CrawlConfig(
     remove_selectors_regex=["modal-\\d+"], # Regex patterns for removal

     # Processing Settings
-    max_concurrent_requests=1, #
-    request_delay=0.5, # Delay between requests
-    timeout=30, # Request timeout
+    max_concurrent_requests=1, # For client-side parallel processing
+    request_delay=0.5, # Delay between requests (seconds)
+    timeout=30, # Request timeout (seconds)

     # Output Settings
-    output_dir="
-
-
-
+    output_dir=Path("spiderforce_reports"), # Default directory for files
+    webhook_url="https://your-webhook.com", # Real-time notifications
+    webhook_timeout=10, # Webhook timeout
+    webhook_headers={ # Optional custom headers for webhook
+        "Authorization": "Bearer your-token",
+        "X-Custom-Header": "value"
+    },
+    webhook_payload_template='''{ # Optional custom webhook payload template
+        "crawled_url": "{url}",
+        "content": "{markdown}",
+        "crawl_status": "{status}",
+        "crawl_error": "{error}",
+        "crawl_time": "{timestamp}",
+        "custom_field": "your-value"
+    }''',
+    save_reports=False, # Whether to save crawl reports (default: False)
+    report_file=Path("crawl_report.json") # Report location (used only if save_reports=True)
 )
 ```

 ## Real-World Examples

-### 1. Basic
+### 1. Basic Blog Crawling

 ```python
 from spiderforce4ai import SpiderForce4AI, CrawlConfig
@@ -134,78 +147,77 @@ from pathlib import Path

 spider = SpiderForce4AI("http://localhost:3004")
 config = CrawlConfig(
+    target_selector="article.post-content",
     output_dir=Path("blog_content")
 )

-result = spider.crawl_url("https://example.com/blog", config)
-print(f"Content saved to: {result.url}.md")
+result = spider.crawl_url("https://example.com/blog-post", config)
 ```

-### 2.
+### 2. Parallel Website Crawling

 ```python
 config = CrawlConfig(
-    max_concurrent_requests=5,
-    output_dir=Path("website_content"),
     remove_selectors=[
         ".navigation",
         ".footer",
         ".ads",
         "#cookie-notice"
     ],
+    max_concurrent_requests=5,
+    output_dir=Path("website_content"),
     webhook_url="https://your-webhook.com/endpoint"
 )

-
-
-
-
+# Using server-side parallel processing
+results = spider.crawl_urls_server_parallel([
+    "https://example.com/page1",
+    "https://example.com/page2",
+    "https://example.com/page3"
+], config)
 ```

-### 3.
+### 3. Full Sitemap Processing

 ```python
-
-
-
-
-
-
-)
-
-async with spider:
-    results = await spider.crawl_urls_async([
-        "https://example.com/1",
-        "https://example.com/2",
-        "https://example.com/3"
-    ], config)
-
-    return results
+config = CrawlConfig(
+    target_selector="main",
+    remove_selectors=[".sidebar", ".comments"],
+    output_dir=Path("site_content"),
+    report_file=Path("crawl_report.json")
+)

-results =
+results = spider.crawl_sitemap_server_parallel(
+    "https://example.com/sitemap.xml",
+    config
+)
 ```

 ## Output Structure

-### 1.
+### 1. Directory Layout
 ```
-
-├── example-com-page1.md
+spiderforce_reports/ # Default output directory
+├── example-com-page1.md # Converted markdown files
 ├── example-com-page2.md
-└── crawl_report.json
+└── crawl_report.json # Crawl report
 ```

 ### 2. Markdown Files
-Each
+Each file is named using a slugified version of the URL:
+```markdown
+# Page Title
+
+Content converted to clean markdown...
+```

-### 3. Report
+### 3. Crawl Report
 ```json
 {
   "timestamp": "2025-02-15T10:30:00.123456",
   "config": {
     "target_selector": "article",
-    "remove_selectors": [".ads", "#popup"]
-    "remove_selectors_regex": ["modal-\\d+"]
+    "remove_selectors": [".ads", "#popup"]
   },
   "results": {
     "successful": [
@@ -234,7 +246,7 @@ Each markdown file is named using a slugified version of the URL and contains th
 ```

 ### 4. Webhook Notifications
-If configured,
+If configured, real-time updates are sent for each processed URL:
 ```json
 {
   "url": "https://example.com/page1",
@@ -250,7 +262,7 @@ If configured, webhooks receive real-time updates in JSON format:

 ## Error Handling

-The package handles various types of errors:
+The package handles various types of errors gracefully:
 - Network errors
 - Timeout errors
 - Invalid URLs
@@ -269,6 +281,25 @@ All errors are:
 - Running SpiderForce4AI service
 - Internet connection

+## Performance Considerations
+
+1. Server-side Parallel Processing
+   - Best for most cases
+   - Single HTTP request for multiple URLs
+   - Less network overhead
+   - Use: `crawl_urls_server_parallel()` or `crawl_sitemap_server_parallel()`
+
+2. Client-side Parallel Processing
+   - Good for special cases requiring local control
+   - Uses Python multiprocessing
+   - More network overhead
+   - Use: `crawl_urls_parallel()` or `crawl_sitemap_parallel()`
+
+3. Async Processing
+   - Best for integration with async applications
+   - Good for real-time processing
+   - Use: `crawl_url_async()`, `crawl_urls_async()`, or `crawl_sitemap_async()`
+
 ## License

 MIT License
spiderforce4ai-0.1.9.dist-info/RECORD
ADDED
@@ -0,0 +1,5 @@
+spiderforce4ai/__init__.py,sha256=oU_UIdzsQxExaVgD7NCaVm4G-9zMtKGnREfY6xL1uFY,26041
+spiderforce4ai-0.1.9.dist-info/METADATA,sha256=poV1i_-H3AgzFhs9juRDJSfaWO0gVePb5JXN7ynL4Y4,7771
+spiderforce4ai-0.1.9.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+spiderforce4ai-0.1.9.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
+spiderforce4ai-0.1.9.dist-info/RECORD,,
spiderforce4ai-0.1.7.dist-info/RECORD
DELETED
@@ -1,5 +0,0 @@
-spiderforce4ai/__init__.py,sha256=qLYHahjvFutdGmibbVZ7cfTd1mMM1FZNd_7nv-EMPtQ,17649
-spiderforce4ai-0.1.7.dist-info/METADATA,sha256=-eWd9exoMxMAYClp6rWHaX_H3md4hBlRq6CHhTJ1ACg,6575
-spiderforce4ai-0.1.7.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
-spiderforce4ai-0.1.7.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
-spiderforce4ai-0.1.7.dist-info/RECORD,,
{spiderforce4ai-0.1.7.dist-info → spiderforce4ai-0.1.9.dist-info}/WHEEL
File without changes

{spiderforce4ai-0.1.7.dist-info → spiderforce4ai-0.1.9.dist-info}/top_level.txt
File without changes