spiderforce4ai-0.1.6-py3-none-any.whl → spiderforce4ai-0.1.8-py3-none-any.whl

spiderforce4ai/__init__.py

@@ -86,6 +86,31 @@ class CrawlConfig:
             payload["remove_selectors_regex"] = self.remove_selectors_regex
         return payload
 
+
+def _send_webhook_sync(result: CrawlResult, config: CrawlConfig) -> None:
+    """Synchronous version of webhook sender for parallel processing."""
+    if not config.webhook_url:
+        return
+
+    payload = {
+        "url": result.url,
+        "status": result.status,
+        "markdown": result.markdown if result.status == "success" else None,
+        "error": result.error if result.status == "failed" else None,
+        "timestamp": result.timestamp,
+        "config": config.to_dict()
+    }
+
+    try:
+        response = requests.post(
+            config.webhook_url,
+            json=payload,
+            timeout=config.webhook_timeout
+        )
+        response.raise_for_status()
+    except Exception as e:
+        print(f"Warning: Failed to send webhook for {result.url}: {str(e)}")
+
 # Module level function for multiprocessing
 def _process_url_parallel(args: Tuple[str, str, CrawlConfig]) -> CrawlResult:
     """Process a single URL for parallel processing."""
@@ -99,12 +124,15 @@ def _process_url_parallel(args: Tuple[str, str, CrawlConfig]) -> CrawlResult:
 
         response = requests.post(endpoint, json=payload, timeout=config.timeout)
         if response.status_code != 200:
-            return CrawlResult(
+            result = CrawlResult(
                 url=url,
                 status="failed",
                 error=f"HTTP {response.status_code}: {response.text}",
                 config=config.to_dict()
             )
+            # Send webhook for failed result
+            _send_webhook_sync(result, config)
+            return result
 
         markdown = response.text
 
@@ -114,24 +142,32 @@ def _process_url_parallel(args: Tuple[str, str, CrawlConfig]) -> CrawlResult:
             with open(filepath, 'w', encoding='utf-8') as f:
                 f.write(markdown)
 
-        # Add delay if configured
-        if config.request_delay:
-            time.sleep(config.request_delay)
-
-        return CrawlResult(
+        result = CrawlResult(
             url=url,
             status="success",
             markdown=markdown,
             config=config.to_dict()
         )
+
+        # Send webhook for successful result
+        _send_webhook_sync(result, config)
+
+        # Add delay if configured
+        if config.request_delay:
+            time.sleep(config.request_delay)
+
+        return result
 
     except Exception as e:
-        return CrawlResult(
+        result = CrawlResult(
             url=url,
             status="failed",
             error=str(e),
             config=config.to_dict()
         )
+        # Send webhook for error result
+        _send_webhook_sync(result, config)
+        return result
 
 class SpiderForce4AI:
     """Main class for interacting with SpiderForce4AI service."""
@@ -160,6 +196,113 @@ class SpiderForce4AI:
             await f.write(markdown)
         return filepath
 
+
+
+    def crawl_sitemap_server_parallel(self, sitemap_url: str, config: CrawlConfig) -> List[CrawlResult]:
+        """
+        Crawl sitemap URLs using server-side parallel processing.
+        """
+        print(f"Fetching sitemap from {sitemap_url}...")
+
+        # Fetch sitemap
+        try:
+            response = requests.get(sitemap_url, timeout=config.timeout)
+            response.raise_for_status()
+            sitemap_text = response.text
+        except Exception as e:
+            print(f"Error fetching sitemap: {str(e)}")
+            raise
+
+        # Parse sitemap
+        try:
+            root = ET.fromstring(sitemap_text)
+            namespace = {'ns': root.tag.split('}')[0].strip('{')}
+            urls = [loc.text for loc in root.findall('.//ns:loc', namespace)]
+            print(f"Found {len(urls)} URLs in sitemap")
+        except Exception as e:
+            print(f"Error parsing sitemap: {str(e)}")
+            raise
+
+        # Process URLs using server-side parallel endpoint
+        return self.crawl_urls_server_parallel(urls, config)
+
+
+    def crawl_urls_server_parallel(self, urls: List[str], config: CrawlConfig) -> List[CrawlResult]:
+        """
+        Crawl multiple URLs using server-side parallel processing.
+        This uses the /convert_parallel endpoint which handles parallelization on the server.
+        """
+        print(f"Sending {len(urls)} URLs for parallel processing...")
+
+        try:
+            endpoint = f"{self.base_url}/convert_parallel"
+
+            # Prepare payload
+            payload = {
+                "urls": urls,
+                **config.to_dict()
+            }
+
+            # Send request
+            response = requests.post(
+                endpoint,
+                json=payload,
+                timeout=config.timeout
+            )
+            response.raise_for_status()
+
+            # Process results
+            results = []
+            server_results = response.json()  # Assuming server returns JSON array of results
+
+            for url_result in server_results:
+                result = CrawlResult(
+                    url=url_result["url"],
+                    status=url_result.get("status", "failed"),
+                    markdown=url_result.get("markdown"),
+                    error=url_result.get("error"),
+                    config=config.to_dict()
+                )
+
+                # Save markdown if successful and output dir is configured
+                if result.status == "success" and config.output_dir and result.markdown:
+                    filepath = config.output_dir / f"{slugify(result.url)}.md"
+                    with open(filepath, 'w', encoding='utf-8') as f:
+                        f.write(result.markdown)
+
+                # Send webhook if configured
+                if config.webhook_url:
+                    _send_webhook_sync(result, config)
+
+                results.append(result)
+
+            # Save report if configured
+            if config.report_file:
+                self._save_report_sync(results, config)
+                print(f"\nReport saved to: {config.report_file}")
+
+            # Print summary
+            successful = len([r for r in results if r.status == "success"])
+            failed = len([r for r in results if r.status == "failed"])
+            print(f"\nParallel processing completed:")
+            print(f"✓ Successful: {successful}")
+            print(f"✗ Failed: {failed}")
+
+            return results
+
+        except Exception as e:
+            print(f"Error during parallel processing: {str(e)}")
+            # Create failed results for all URLs
+            return [
+                CrawlResult(
+                    url=url,
+                    status="failed",
+                    error=str(e),
+                    config=config.to_dict()
+                ) for url in urls
+            ]
+
+
 
     async def _send_webhook(self, result: CrawlResult, config: CrawlConfig):
         """Send webhook with crawl results."""
         if not config.webhook_url:
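As the inline comment in `crawl_urls_server_parallel` notes, the code assumes `/convert_parallel` returns a JSON array with one object per URL. An illustrative sketch of that assumed shape (the exact server contract is not shown in this diff); each object maps directly onto the `.get()` calls in the loop above:

```python
# Assumed /convert_parallel response consumed by crawl_urls_server_parallel
# (illustrative only; field names follow the .get() calls in the method above).
server_results = [
    {
        "url": "https://example.com/page1",
        "status": "success",
        "markdown": "# Page 1\n\nConverted content...",
        "error": None,
    },
    {
        "url": "https://example.com/page2",
        "status": "failed",
        "markdown": None,
        "error": "HTTP 404: Not Found",
    },
]
```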
@@ -424,4 +567,5 @@ class SpiderForce4AI:
 
     def __exit__(self, exc_type, exc_val, exc_tb):
         """Sync context manager exit."""
-        self._executor.shutdown(wait=True)
+        self._executor.shutdown(wait=True)
+
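The only change to `__exit__` is a trailing newline; it remains the hook that shuts down the process pool behind client-side parallel crawling. A small usage sketch, assuming `__enter__` is defined symmetrically (it is not shown in this diff):

```python
from spiderforce4ai import SpiderForce4AI, CrawlConfig

# The with-block ensures _executor.shutdown(wait=True) runs via __exit__,
# assuming __enter__ returns the spider instance (not shown in this diff).
with SpiderForce4AI("http://localhost:3004") as spider:
    results = spider.crawl_urls_parallel(
        ["https://example.com/page1", "https://example.com/page2"],
        CrawlConfig(),
    )
```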
spiderforce4ai-0.1.6.dist-info/METADATA → spiderforce4ai-0.1.8.dist-info/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: spiderforce4ai
-Version: 0.1.6
+Version: 0.1.8
 Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service
 Home-page: https://petertam.pro
 Author: Piotr Tamulewicz
@@ -24,75 +24,73 @@ Dynamic: requires-python
 
 # SpiderForce4AI Python Wrapper
 
-A Python wrapper for SpiderForce4AI - a powerful HTML-to-Markdown conversion service. This package provides an easy-to-use interface for crawling websites and converting their content to clean Markdown format.
-
-## Installation
-
-```bash
-pip install spiderforce4ai
-```
+A Python package for web content crawling and HTML-to-Markdown conversion. Built for seamless integration with SpiderForce4AI service.
 
 ## Quick Start (Minimal Setup)
 
 ```python
 from spiderforce4ai import SpiderForce4AI, CrawlConfig
 
-# Initialize with your SpiderForce4AI service URL
+# Initialize with your service URL
 spider = SpiderForce4AI("http://localhost:3004")
 
-# Use default configuration (will save in ./spiderforce_reports)
+# Create default config
 config = CrawlConfig()
 
 # Crawl a single URL
 result = spider.crawl_url("https://example.com", config)
 ```
 
+## Installation
+
+```bash
+pip install spiderforce4ai
+```
+
 ## Crawling Methods
 
-### 1. Single URL Crawling
+### 1. Single URL
 
 ```python
-# Synchronous
+# Basic usage
 result = spider.crawl_url("https://example.com", config)
 
-# Asynchronous
+# Async version
 async def crawl():
     result = await spider.crawl_url_async("https://example.com", config)
 ```
 
-### 2. Multiple URLs Crawling
+### 2. Multiple URLs
 
 ```python
-# List of URLs
 urls = [
     "https://example.com/page1",
-    "https://example.com/page2",
-    "https://example.com/page3"
+    "https://example.com/page2"
 ]
 
-# Synchronous
-results = spider.crawl_urls(urls, config)
+# Client-side parallel (using multiprocessing)
+results = spider.crawl_urls_parallel(urls, config)
+
+# Server-side parallel (single request)
+results = spider.crawl_urls_server_parallel(urls, config)
 
-# Asynchronous
+# Async version
 async def crawl():
     results = await spider.crawl_urls_async(urls, config)
-
-# Parallel (using multiprocessing)
-results = spider.crawl_urls_parallel(urls, config)
 ```
 
 ### 3. Sitemap Crawling
 
 ```python
-# Synchronous
-results = spider.crawl_sitemap("https://example.com/sitemap.xml", config)
+# Server-side parallel (recommended)
+results = spider.crawl_sitemap_server_parallel("https://example.com/sitemap.xml", config)
+
+# Client-side parallel
+results = spider.crawl_sitemap_parallel("https://example.com/sitemap.xml", config)
 
-# Asynchronous
+# Async version
 async def crawl():
     results = await spider.crawl_sitemap_async("https://example.com/sitemap.xml", config)
-
-# Parallel (using multiprocessing)
-results = spider.crawl_sitemap_parallel("https://example.com/sitemap.xml", config)
 ```
 
 ## Configuration Options
@@ -100,9 +98,11 @@ results = spider.crawl_sitemap_parallel("https://example.com/sitemap.xml", confi
 All configuration options are optional with sensible defaults:
 
 ```python
+from pathlib import Path
+
 config = CrawlConfig(
     # Content Selection (all optional)
-    target_selector="article",  # Specific element to target
+    target_selector="article",  # Specific element to extract
     remove_selectors=[  # Elements to remove
         ".ads",
         "#popup",
@@ -112,21 +112,21 @@ config = CrawlConfig(
     remove_selectors_regex=["modal-\\d+"],  # Regex patterns for removal
 
     # Processing Settings
-    max_concurrent_requests=1,  # Default: 1 (parallel processing)
-    request_delay=0.5,  # Delay between requests in seconds
-    timeout=30,  # Request timeout in seconds
+    max_concurrent_requests=1,  # For client-side parallel processing
+    request_delay=0.5,  # Delay between requests (seconds)
+    timeout=30,  # Request timeout (seconds)
 
     # Output Settings
-    output_dir="custom_output",  # Default: "spiderforce_reports"
-    report_file="custom_report.json",  # Default: "crawl_report.json"
-    webhook_url="https://your-webhook.com",  # Optional webhook endpoint
-    webhook_timeout=10  # Webhook timeout in seconds
+    output_dir=Path("spiderforce_reports"),  # Default directory for files
+    webhook_url="https://your-webhook.com",  # Real-time notifications
+    webhook_timeout=10,  # Webhook timeout
+    report_file=Path("crawl_report.json")  # Final report location
 )
 ```
 
 ## Real-World Examples
 
-### 1. Basic Website Crawling
+### 1. Basic Blog Crawling
 
 ```python
 from spiderforce4ai import SpiderForce4AI, CrawlConfig
@@ -134,78 +134,77 @@ from pathlib import Path
 
 spider = SpiderForce4AI("http://localhost:3004")
 config = CrawlConfig(
+    target_selector="article.post-content",
     output_dir=Path("blog_content")
 )
 
-result = spider.crawl_url("https://example.com/blog", config)
-print(f"Content saved to: {result.url}.md")
+result = spider.crawl_url("https://example.com/blog-post", config)
 ```
 
-### 2. Advanced Parallel Sitemap Crawling
+### 2. Parallel Website Crawling
 
 ```python
 config = CrawlConfig(
-    max_concurrent_requests=5,
-    output_dir=Path("website_content"),
     remove_selectors=[
         ".navigation",
         ".footer",
         ".ads",
        "#cookie-notice"
     ],
+    max_concurrent_requests=5,
+    output_dir=Path("website_content"),
     webhook_url="https://your-webhook.com/endpoint"
 )
 
-results = spider.crawl_sitemap_parallel(
-    "https://example.com/sitemap.xml",
-    config
-)
+# Using server-side parallel processing
+results = spider.crawl_urls_server_parallel([
+    "https://example.com/page1",
+    "https://example.com/page2",
+    "https://example.com/page3"
+], config)
 ```
 
-### 3. Async Crawling with Progress
+### 3. Full Sitemap Processing
 
 ```python
-import asyncio
-
-async def main():
-    config = CrawlConfig(
-        max_concurrent_requests=3,
-        request_delay=1.0
-    )
-
-    async with spider:
-        results = await spider.crawl_urls_async([
-            "https://example.com/1",
-            "https://example.com/2",
-            "https://example.com/3"
-        ], config)
-
-    return results
+config = CrawlConfig(
+    target_selector="main",
+    remove_selectors=[".sidebar", ".comments"],
+    output_dir=Path("site_content"),
+    report_file=Path("crawl_report.json")
+)
 
-results = asyncio.run(main())
+results = spider.crawl_sitemap_server_parallel(
+    "https://example.com/sitemap.xml",
+    config
+)
 ```
 
 ## Output Structure
 
-### 1. File Organization
+### 1. Directory Layout
 ```
-output_dir/
-├── example-com-page1.md
+spiderforce_reports/        # Default output directory
+├── example-com-page1.md    # Converted markdown files
 ├── example-com-page2.md
-└── crawl_report.json
+└── crawl_report.json       # Crawl report
 ```
 
 ### 2. Markdown Files
-Each markdown file is named using a slugified version of the URL and contains the converted content.
+Each file is named using a slugified version of the URL:
+```markdown
+# Page Title
+
+Content converted to clean markdown...
+```
 
-### 3. Report JSON Structure
+### 3. Crawl Report
 ```json
 {
   "timestamp": "2025-02-15T10:30:00.123456",
   "config": {
     "target_selector": "article",
-    "remove_selectors": [".ads", "#popup"],
-    "remove_selectors_regex": ["modal-\\d+"]
+    "remove_selectors": [".ads", "#popup"]
   },
   "results": {
     "successful": [
@@ -234,7 +233,7 @@ Each markdown file is named using a slugified version of the URL and contains th
 ```
 
 ### 4. Webhook Notifications
-If configured, webhooks receive real-time updates in JSON format:
+If configured, real-time updates are sent for each processed URL:
 ```json
 {
   "url": "https://example.com/page1",
@@ -250,7 +249,7 @@
 
 ## Error Handling
 
-The package handles various types of errors:
+The package handles various types of errors gracefully:
 - Network errors
 - Timeout errors
 - Invalid URLs
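Because failures come back as `CrawlResult` objects with `status="failed"` rather than raised exceptions (see the `_process_url_parallel` and `crawl_urls_server_parallel` changes above), callers can inspect errors after a batch finishes. A brief sketch, assuming the 0.1.8 API shown in this diff:

```python
from spiderforce4ai import SpiderForce4AI, CrawlConfig

spider = SpiderForce4AI("http://localhost:3004")
config = CrawlConfig()

results = spider.crawl_urls_server_parallel(
    ["https://example.com/page1", "https://example.com/broken-page"], config
)

# Failed URLs are reported, not raised; log or retry them as needed.
for r in results:
    if r.status == "failed":
        print(f"{r.url} failed: {r.error}")
```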
@@ -269,6 +268,25 @@ All errors are:
 - Running SpiderForce4AI service
 - Internet connection
 
+## Performance Considerations
+
+1. Server-side Parallel Processing
+   - Best for most cases
+   - Single HTTP request for multiple URLs
+   - Less network overhead
+   - Use: `crawl_urls_server_parallel()` or `crawl_sitemap_server_parallel()`
+
+2. Client-side Parallel Processing
+   - Good for special cases requiring local control
+   - Uses Python multiprocessing
+   - More network overhead
+   - Use: `crawl_urls_parallel()` or `crawl_sitemap_parallel()`
+
+3. Async Processing
+   - Best for integration with async applications
+   - Good for real-time processing
+   - Use: `crawl_url_async()`, `crawl_urls_async()`, or `crawl_sitemap_async()`
+
 ## License
 
 MIT License
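The three modes in the new Performance Considerations section above map directly onto methods shown earlier in this diff. A hedged sketch of choosing between the two parallel modes by batch size (the helper and its threshold are illustrative, not a recommendation from the package):

```python
# Illustrative mode selection (hypothetical helper; threshold chosen arbitrarily).
from typing import List

from spiderforce4ai import SpiderForce4AI, CrawlConfig, CrawlResult


def crawl_batch(spider: SpiderForce4AI, urls: List[str], config: CrawlConfig) -> List[CrawlResult]:
    if len(urls) > 10:
        # Large batches: one HTTP request, parallelized on the server.
        return spider.crawl_urls_server_parallel(urls, config)
    # Small batches: client-side multiprocessing overhead is negligible.
    return spider.crawl_urls_parallel(urls, config)


results = crawl_batch(
    SpiderForce4AI("http://localhost:3004"),
    ["https://example.com/page1", "https://example.com/page2"],
    CrawlConfig(),
)
```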
spiderforce4ai-0.1.8.dist-info/RECORD (new file)

@@ -0,0 +1,5 @@
+spiderforce4ai/__init__.py,sha256=Y_7CfRVYQ2ssH67YexwCV12J14tB125U7WIhVTQfYwU,21652
+spiderforce4ai-0.1.8.dist-info/METADATA,sha256=kXn_kUTsFZm8wtdMt0lTo85Jr3SYAZQzZn_3VL4KkeU,7169
+spiderforce4ai-0.1.8.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+spiderforce4ai-0.1.8.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
+spiderforce4ai-0.1.8.dist-info/RECORD,,
spiderforce4ai-0.1.6.dist-info/RECORD (deleted file)

@@ -1,5 +0,0 @@
-spiderforce4ai/__init__.py,sha256=i1lHYILqFG_Eld0ZCbBdK5F_Jk0zYr_60vS46AYZfTM,16496
-spiderforce4ai-0.1.6.dist-info/METADATA,sha256=7rcL1OGqYeF1QHWUIB9xHaKYxGGegs2zHNz0UTu-ego,6575
-spiderforce4ai-0.1.6.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
-spiderforce4ai-0.1.6.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
-spiderforce4ai-0.1.6.dist-info/RECORD,,