spiderforce4ai-1.1-py3-none-any.whl → spiderforce4ai-1.3-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- spiderforce4ai/__init__.py +43 -12
- spiderforce4ai-1.3.dist-info/METADATA +298 -0
- spiderforce4ai-1.3.dist-info/RECORD +5 -0
- spiderforce4ai-1.1.dist-info/METADATA +0 -309
- spiderforce4ai-1.1.dist-info/RECORD +0 -5
- {spiderforce4ai-1.1.dist-info → spiderforce4ai-1.3.dist-info}/WHEEL +0 -0
- {spiderforce4ai-1.1.dist-info → spiderforce4ai-1.3.dist-info}/top_level.txt +0 -0
spiderforce4ai/__init__.py
CHANGED
@@ -350,17 +350,23 @@ class SpiderForce4AI:
 
     def _save_report_sync(self, results: List[CrawlResult], config: CrawlConfig) -> None:
         """Save crawl report synchronously."""
+        # Separate successful and failed results
+        successful_results = [r for r in results if r.status == "success"]
+        failed_results = [r for r in results if r.status == "failed"]
+
+        # Create report with only final state
         report = {
             "timestamp": datetime.now().isoformat(),
             "config": config.to_dict(),
             "results": {
-                "successful": [asdict(r) for r in
-                "failed": [asdict(r) for r in
+                "successful": [asdict(r) for r in successful_results],
+                "failed": [asdict(r) for r in failed_results]  # Only truly failed URLs after retries
             },
             "summary": {
                 "total": len(results),
-                "successful": len(
-                "failed": len(
+                "successful": len(successful_results),
+                "failed": len(failed_results),
+                "retry_info": getattr(self, '_retry_stats', {})  # Include retry statistics if available
             }
         }
@@ -372,17 +378,22 @@ class SpiderForce4AI:
         if not config.report_file:
             return
 
+        # Separate successful and failed results
+        successful_results = [r for r in self.crawl_results if r.status == "success"]
+        failed_results = [r for r in self.crawl_results if r.status == "failed"]
+
         report = {
             "timestamp": datetime.now().isoformat(),
             "config": config.to_dict(),
             "results": {
-                "successful": [asdict(r) for r in
-                "failed": [asdict(r) for r in
+                "successful": [asdict(r) for r in successful_results],
+                "failed": [asdict(r) for r in failed_results]  # Only truly failed URLs after retries
             },
             "summary": {
                 "total": len(self.crawl_results),
-                "successful": len(
-                "failed": len(
+                "successful": len(successful_results),
+                "failed": len(failed_results),
+                "retry_info": getattr(self, '_retry_stats', {})  # Include retry statistics if available
             }
         }
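The two hunks above make the same change in the synchronous and asynchronous report writers: results are split by status once, the two lists are reused for both the detailed listing and the summary counts, and the summary gains a `retry_info` field sourced from `self._retry_stats` when it exists. Below is a minimal standalone sketch of that report shape; the `CrawlResult` fields are assumed from the diff context and the package README, not taken verbatim from the package source.

```python
# Illustrative sketch only; CrawlResult fields are assumptions, not the package's actual class.
from dataclasses import dataclass, asdict, field
from datetime import datetime
from typing import List, Optional


@dataclass
class CrawlResult:
    url: str
    status: str                    # "success" or "failed"
    markdown: Optional[str] = None
    error: Optional[str] = None
    timestamp: str = field(default_factory=lambda: datetime.now().isoformat())


def build_report(results: List[CrawlResult], config_dict: dict, retry_stats: dict) -> dict:
    # Split once, then reuse the two lists for both sections of the report
    successful = [r for r in results if r.status == "success"]
    failed = [r for r in results if r.status == "failed"]
    return {
        "timestamp": datetime.now().isoformat(),
        "config": config_dict,
        "results": {
            "successful": [asdict(r) for r in successful],
            "failed": [asdict(r) for r in failed],  # only URLs still failed after retries
        },
        "summary": {
            "total": len(results),
            "successful": len(successful),
            "failed": len(failed),
            "retry_info": retry_stats,  # mirrors getattr(self, '_retry_stats', {})
        },
    }
```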
@@ -535,8 +546,13 @@ class SpiderForce4AI:
                 results = initial_results
             else:
                 retry_results = await self._retry_failed_urls(failed_results, config, progress)
-                #
-                results =
+                # Update results list by replacing failed results with successful retries
+                results = initial_results.copy()
+                for retry_result in retry_results:
+                    for i, result in enumerate(results):
+                        if result.url == retry_result.url:
+                            results[i] = retry_result
+                            break
         else:
             results = initial_results
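The retry merge above walks the full results list once per retried URL. The same behaviour in isolation, written as a URL-keyed lookup, looks like the sketch below; it assumes each URL appears at most once in the initial results, which is what makes the two forms equivalent.

```python
# Sketch of the retry merge in isolation (not the package's code).
# Assumes each URL occurs at most once in initial_results.
def merge_retries(initial_results, retry_results):
    retried_by_url = {r.url: r for r in retry_results}
    # Preserve the original ordering; swap in the retried outcome where one exists
    return [retried_by_url.get(r.url, r) for r in initial_results]
```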
@@ -661,12 +677,27 @@ class SpiderForce4AI:
             console.print(f"\n[yellow]Retrying failed URLs: {failed_count} ({failure_ratio:.1f}% failed)[/yellow]")
             for result in failed_results:
                 new_result = _process_url_parallel((result.url, self.base_url, config))
+
+                # Save markdown and trigger webhook for successful retries
                 if new_result.status == "success":
                     console.print(f"[green]✓ Retry successful: {result.url}[/green]")
-                    #
-
+                    # Save markdown if output directory is configured
+                    if config.output_dir and new_result.markdown:
+                        filepath = config.output_dir / f"{slugify(new_result.url)}.md"
+                        with open(filepath, 'w', encoding='utf-8') as f:
+                            f.write(new_result.markdown)
+                    # Send webhook for successful retry
+                    _send_webhook_sync(new_result, config)
                 else:
                     console.print(f"[red]✗ Retry failed: {result.url} - {new_result.error}[/red]")
+                    # Send webhook for failed retry
+                    _send_webhook_sync(new_result, config)
+
+                # Update results list
+                for i, r in enumerate(results):
+                    if r.url == new_result.url:
+                        results[i] = new_result
+                        break
 
             # Calculate final statistics
             final_successful = len([r for r in results if r.status == "success"])
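Taken together, these hunks mean a retried URL ends up exactly once in the final results and in the saved report, whose summary now also exposes whatever the crawler stored in `_retry_stats`. A hedged example of consuming such a report follows; the path is an assumption based on the defaults shown in the package README, and the keys inside `retry_info` depend on the crawler, so they are printed generically.

```python
# Read a saved crawl report and print the summary, including retry_info.
# The report path is an assumption based on the README defaults, not a fixed location.
import json
from pathlib import Path

report_path = Path("spiderforce_reports/crawl_report.json")
report = json.loads(report_path.read_text(encoding="utf-8"))

summary = report["summary"]
print(f"total={summary['total']} successful={summary['successful']} failed={summary['failed']}")
for key, value in summary.get("retry_info", {}).items():  # keys depend on the crawler's _retry_stats
    print(f"retry stat {key}: {value}")
```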
spiderforce4ai-1.3.dist-info/METADATA
ADDED
@@ -0,0 +1,298 @@
Metadata-Version: 2.2
Name: spiderforce4ai
Version: 1.3
Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service
Home-page: https://petertam.pro
Author: Piotr Tamulewicz
Author-email: Piotr Tamulewicz <pt@petertam.pro>
License: MIT
Classifier: Development Status :: 4 - Beta
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: MIT License
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Requires-Python: >=3.11
Description-Content-Type: text/markdown
Requires-Dist: aiohttp>=3.8.0
Requires-Dist: asyncio>=3.4.3
Requires-Dist: rich>=10.0.0
Requires-Dist: aiofiles>=0.8.0
Requires-Dist: httpx>=0.24.0
Dynamic: author
Dynamic: home-page
Dynamic: requires-python

# SpiderForce4AI Python Wrapper

A Python package for web content crawling and HTML-to-Markdown conversion. Built for seamless integration with SpiderForce4AI service.

## Features

- HTML to Markdown conversion
- Parallel and async crawling support
- Sitemap processing
- Custom content selection
- Automatic retry mechanism
- Detailed progress tracking
- Webhook notifications
- Customizable reporting

## Installation

```bash
pip install spiderforce4ai
```

## Quick Start

```python
from spiderforce4ai import SpiderForce4AI, CrawlConfig
from pathlib import Path

# Initialize crawler
spider = SpiderForce4AI("http://localhost:3004")

# Configure crawling options
config = CrawlConfig(
    target_selector="article",
    remove_selectors=[".ads", ".navigation"],
    max_concurrent_requests=5,
    save_reports=True
)

# Crawl a sitemap
results = spider.crawl_sitemap_server_parallel("https://example.com/sitemap.xml", config)
```

## Key Features

### 1. Smart Retry Mechanism
- Automatically retries failed URLs
- Monitors failure ratio to prevent server overload
- Detailed retry statistics and progress tracking
- Aborts retries if failure rate exceeds 20%

```python
# Retry behavior is automatic
config = CrawlConfig(
    max_concurrent_requests=5,
    request_delay=1.0  # Delay between retries
)
results = spider.crawl_urls_async(urls, config)
```

### 2. Custom Webhook Integration
- Flexible payload formatting
- Custom headers support
- Variable substitution in templates

```python
config = CrawlConfig(
    webhook_url="https://your-webhook.com",
    webhook_headers={
        "Authorization": "Bearer token",
        "X-Custom-Header": "value"
    },
    webhook_payload_template='''{
        "url": "{url}",
        "content": "{markdown}",
        "status": "{status}",
        "custom_field": "value"
    }'''
)
```

### 3. Flexible Report Generation
- Optional report saving
- Customizable report location
- Detailed success/failure statistics

```python
config = CrawlConfig(
    save_reports=True,
    report_file=Path("custom_report.json"),
    output_dir=Path("content")
)
```

## Crawling Methods

### 1. Single URL Processing

```python
# Synchronous
result = spider.crawl_url("https://example.com", config)

# Asynchronous
async def crawl():
    result = await spider.crawl_url_async("https://example.com", config)
```

### 2. Multiple URLs

```python
urls = ["https://example.com/page1", "https://example.com/page2"]

# Server-side parallel (recommended)
results = spider.crawl_urls_server_parallel(urls, config)

# Client-side parallel
results = spider.crawl_urls_parallel(urls, config)

# Asynchronous
async def crawl():
    results = await spider.crawl_urls_async(urls, config)
```

### 3. Sitemap Processing

```python
# Server-side parallel (recommended)
results = spider.crawl_sitemap_server_parallel("https://example.com/sitemap.xml", config)

# Client-side parallel
results = spider.crawl_sitemap_parallel("https://example.com/sitemap.xml", config)

# Asynchronous
async def crawl():
    results = await spider.crawl_sitemap_async("https://example.com/sitemap.xml", config)
```

## Configuration Options

```python
config = CrawlConfig(
    # Content Selection
    target_selector="article",              # Target element to extract
    remove_selectors=[".ads", "#popup"],    # Elements to remove
    remove_selectors_regex=["modal-\\d+"],  # Regex patterns for removal

    # Processing
    max_concurrent_requests=5,              # Parallel processing limit
    request_delay=0.5,                      # Delay between requests
    timeout=30,                             # Request timeout

    # Output
    output_dir=Path("content"),             # Output directory
    save_reports=False,                     # Enable/disable report saving
    report_file=Path("report.json"),        # Report location

    # Webhook
    webhook_url="https://webhook.com",      # Webhook endpoint
    webhook_timeout=10,                     # Webhook timeout
    webhook_headers={                       # Custom headers
        "Authorization": "Bearer token"
    },
    webhook_payload_template='''            # Custom payload format
    {
        "url": "{url}",
        "content": "{markdown}",
        "status": "{status}",
        "error": "{error}",
        "time": "{timestamp}"
    }'''
)
```

## Progress Tracking

The package provides detailed progress information:

```
Fetching sitemap from https://example.com/sitemap.xml...
Found 156 URLs in sitemap
[━━━━━━━━━━━━━━━━━━━━━━━━━━━━] 100% • 156/156 URLs

Retrying failed URLs: 18 (11.5% failed)
[━━━━━━━━━━━━━━━━━━━━━━━━━━━━] 100% • 18/18 retries

Crawling Summary:
Total URLs processed: 156
Initial failures: 18 (11.5%)
Final results:
  ✓ Successful: 150
  ✗ Failed: 6
Retry success rate: 12/18 (66.7%)
```

## Output Structure

### 1. Directory Layout
```
content/                   # Output directory
├── example-com-page1.md   # Markdown files
├── example-com-page2.md
└── report.json            # Crawl report
```

### 2. Report Format
```json
{
  "timestamp": "2025-02-15T10:30:00",
  "config": {
    "target_selector": "article",
    "remove_selectors": [".ads"]
  },
  "results": {
    "successful": [...],
    "failed": [...]
  },
  "summary": {
    "total": 156,
    "successful": 150,
    "failed": 6
  }
}
```

## Performance Optimization

1. Server-side Parallel Processing
   - Recommended for most cases
   - Single HTTP request
   - Reduced network overhead
   - Built-in load balancing

2. Client-side Parallel Processing
   - Better control over processing
   - Customizable concurrency
   - Progress tracking per URL
   - Automatic retry handling

3. Asynchronous Processing
   - Ideal for async applications
   - Non-blocking operation
   - Real-time progress updates
   - Efficient resource usage

## Error Handling

The package provides comprehensive error handling:

- Automatic retry for failed URLs
- Failure ratio monitoring
- Detailed error reporting
- Webhook error notifications
- Progress tracking during retries

## Requirements

- Python 3.11+
- Running SpiderForce4AI service
- Internet connection

## Dependencies

- aiohttp
- asyncio
- rich
- aiofiles
- httpx

## License

MIT License

## Credits

Created by [Peter Tam](https://petertam.pro)
spiderforce4ai-1.3.dist-info/RECORD
ADDED
@@ -0,0 +1,5 @@
spiderforce4ai/__init__.py,sha256=Fbgqu9uPg0wuWZgiVYNTv6CkkcOHgU_f5-uoXRKhgn4,29922
spiderforce4ai-1.3.dist-info/METADATA,sha256=ybuwcVE62JFnWJKcCdHDrOOqmbuh8PEzF69_yFK-eV0,7183
spiderforce4ai-1.3.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
spiderforce4ai-1.3.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
spiderforce4ai-1.3.dist-info/RECORD,,
spiderforce4ai-1.1.dist-info/METADATA
REMOVED
@@ -1,309 +0,0 @@
Metadata-Version: 2.2
Name: spiderforce4ai
Version: 1.1
Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service
Home-page: https://petertam.pro
Author: Piotr Tamulewicz
Author-email: Piotr Tamulewicz <pt@petertam.pro>
License: MIT
Classifier: Development Status :: 4 - Beta
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: MIT License
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Requires-Python: >=3.11
Description-Content-Type: text/markdown
Requires-Dist: aiohttp>=3.8.0
Requires-Dist: asyncio>=3.4.3
Requires-Dist: rich>=10.0.0
Requires-Dist: aiofiles>=0.8.0
Requires-Dist: httpx>=0.24.0
Dynamic: author
Dynamic: home-page
Dynamic: requires-python

# SpiderForce4AI Python Wrapper

A Python package for web content crawling and HTML-to-Markdown conversion. Built for seamless integration with SpiderForce4AI service.

## Quick Start (Minimal Setup)

```python
from spiderforce4ai import SpiderForce4AI, CrawlConfig

# Initialize with your service URL
spider = SpiderForce4AI("http://localhost:3004")

# Create default config
config = CrawlConfig()

# Crawl a single URL
result = spider.crawl_url("https://example.com", config)
```

## Installation

```bash
pip install spiderforce4ai
```

## Crawling Methods

### 1. Single URL

```python
# Basic usage
result = spider.crawl_url("https://example.com", config)

# Async version
async def crawl():
    result = await spider.crawl_url_async("https://example.com", config)
```

### 2. Multiple URLs

```python
urls = [
    "https://example.com/page1",
    "https://example.com/page2"
]

# Client-side parallel (using multiprocessing)
results = spider.crawl_urls_parallel(urls, config)

# Server-side parallel (single request)
results = spider.crawl_urls_server_parallel(urls, config)

# Async version
async def crawl():
    results = await spider.crawl_urls_async(urls, config)
```

### 3. Sitemap Crawling

```python
# Server-side parallel (recommended)
results = spider.crawl_sitemap_server_parallel("https://example.com/sitemap.xml", config)

# Client-side parallel
results = spider.crawl_sitemap_parallel("https://example.com/sitemap.xml", config)

# Async version
async def crawl():
    results = await spider.crawl_sitemap_async("https://example.com/sitemap.xml", config)
```

## Configuration Options

All configuration options are optional with sensible defaults:

```python
from pathlib import Path

config = CrawlConfig(
    # Content Selection (all optional)
    target_selector="article",              # Specific element to extract
    remove_selectors=[                      # Elements to remove
        ".ads",
        "#popup",
        ".navigation",
        ".footer"
    ],
    remove_selectors_regex=["modal-\\d+"],  # Regex patterns for removal

    # Processing Settings
    max_concurrent_requests=1,              # For client-side parallel processing
    request_delay=0.5,                      # Delay between requests (seconds)
    timeout=30,                             # Request timeout (seconds)

    # Output Settings
    output_dir=Path("spiderforce_reports"), # Default directory for files
    webhook_url="https://your-webhook.com", # Real-time notifications
    webhook_timeout=10,                     # Webhook timeout
    webhook_headers={                       # Optional custom headers for webhook
        "Authorization": "Bearer your-token",
        "X-Custom-Header": "value"
    },
    webhook_payload_template='''{           # Optional custom webhook payload template
        "crawled_url": "{url}",
        "content": "{markdown}",
        "crawl_status": "{status}",
        "crawl_error": "{error}",
        "crawl_time": "{timestamp}",
        "custom_field": "your-value"
    }''',
    save_reports=False,                     # Whether to save crawl reports (default: False)
    report_file=Path("crawl_report.json")   # Report location (used only if save_reports=True)
)
```

## Real-World Examples

### 1. Basic Blog Crawling

```python
from spiderforce4ai import SpiderForce4AI, CrawlConfig
from pathlib import Path

spider = SpiderForce4AI("http://localhost:3004")
config = CrawlConfig(
    target_selector="article.post-content",
    output_dir=Path("blog_content")
)

result = spider.crawl_url("https://example.com/blog-post", config)
```

### 2. Parallel Website Crawling

```python
config = CrawlConfig(
    remove_selectors=[
        ".navigation",
        ".footer",
        ".ads",
        "#cookie-notice"
    ],
    max_concurrent_requests=5,
    output_dir=Path("website_content"),
    webhook_url="https://your-webhook.com/endpoint"
)

# Using server-side parallel processing
results = spider.crawl_urls_server_parallel([
    "https://example.com/page1",
    "https://example.com/page2",
    "https://example.com/page3"
], config)
```

### 3. Full Sitemap Processing

```python
config = CrawlConfig(
    target_selector="main",
    remove_selectors=[".sidebar", ".comments"],
    output_dir=Path("site_content"),
    report_file=Path("crawl_report.json")
)

results = spider.crawl_sitemap_server_parallel(
    "https://example.com/sitemap.xml",
    config
)
```

## Output Structure

### 1. Directory Layout
```
spiderforce_reports/          # Default output directory
├── example-com-page1.md      # Converted markdown files
├── example-com-page2.md
└── crawl_report.json         # Crawl report
```

### 2. Markdown Files
Each file is named using a slugified version of the URL:
```markdown
# Page Title

Content converted to clean markdown...
```

### 3. Crawl Report
```json
{
  "timestamp": "2025-02-15T10:30:00.123456",
  "config": {
    "target_selector": "article",
    "remove_selectors": [".ads", "#popup"]
  },
  "results": {
    "successful": [
      {
        "url": "https://example.com/page1",
        "status": "success",
        "markdown": "# Page Title\n\nContent...",
        "timestamp": "2025-02-15T10:30:00.123456"
      }
    ],
    "failed": [
      {
        "url": "https://example.com/page2",
        "status": "failed",
        "error": "HTTP 404: Not Found",
        "timestamp": "2025-02-15T10:30:01.123456"
      }
    ]
  },
  "summary": {
    "total": 2,
    "successful": 1,
    "failed": 1
  }
}
```

### 4. Webhook Notifications
If configured, real-time updates are sent for each processed URL:
```json
{
  "url": "https://example.com/page1",
  "status": "success",
  "markdown": "# Page Title\n\nContent...",
  "timestamp": "2025-02-15T10:30:00.123456",
  "config": {
    "target_selector": "article",
    "remove_selectors": [".ads", "#popup"]
  }
}
```

## Error Handling

The package handles various types of errors gracefully:
- Network errors
- Timeout errors
- Invalid URLs
- Missing content
- Service errors

All errors are:
1. Logged in the console
2. Included in the JSON report
3. Sent via webhook (if configured)
4. Available in the results list

## Requirements

- Python 3.11 or later
- Running SpiderForce4AI service
- Internet connection

## Performance Considerations

1. Server-side Parallel Processing
   - Best for most cases
   - Single HTTP request for multiple URLs
   - Less network overhead
   - Use: `crawl_urls_server_parallel()` or `crawl_sitemap_server_parallel()`

2. Client-side Parallel Processing
   - Good for special cases requiring local control
   - Uses Python multiprocessing
   - More network overhead
   - Use: `crawl_urls_parallel()` or `crawl_sitemap_parallel()`

3. Async Processing
   - Best for integration with async applications
   - Good for real-time processing
   - Use: `crawl_url_async()`, `crawl_urls_async()`, or `crawl_sitemap_async()`

## License

MIT License

## Credits

Created by [Peter Tam](https://petertam.pro)
spiderforce4ai-1.1.dist-info/RECORD
REMOVED
@@ -1,5 +0,0 @@
spiderforce4ai/__init__.py,sha256=lCviRhfLngSMehFJZwyK4LirPwbWEyZ0RJjCt5FkBcY,28304
spiderforce4ai-1.1.dist-info/METADATA,sha256=lQfqXn0ifJOmOmLkgr8YTSYUFiu6-HS3YsRD0togylo,7769
spiderforce4ai-1.1.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
spiderforce4ai-1.1.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
spiderforce4ai-1.1.dist-info/RECORD,,
{spiderforce4ai-1.1.dist-info → spiderforce4ai-1.3.dist-info}/WHEEL
File without changes

{spiderforce4ai-1.1.dist-info → spiderforce4ai-1.3.dist-info}/top_level.txt
File without changes