spiderforce4ai 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- spiderforce4ai/__init__.py +89 -87
- spiderforce4ai-0.1.6.dist-info/METADATA +278 -0
- spiderforce4ai-0.1.6.dist-info/RECORD +5 -0
- spiderforce4ai-0.1.4.dist-info/METADATA +0 -239
- spiderforce4ai-0.1.4.dist-info/RECORD +0 -5
- {spiderforce4ai-0.1.4.dist-info → spiderforce4ai-0.1.6.dist-info}/WHEEL +0 -0
- {spiderforce4ai-0.1.4.dist-info → spiderforce4ai-0.1.6.dist-info}/top_level.txt +0 -0
spiderforce4ai/__init__.py
CHANGED
```diff
@@ -1,13 +1,10 @@
-"""
-SpiderForce4AI Python Wrapper
-A Python package for interacting with SpiderForce4AI HTML-to-Markdown conversion service.
-"""
+# spiderforce4ai/__init__.py
 
 import asyncio
 import aiohttp
 import json
 import logging
-from typing import List, Dict, Union, Optional
+from typing import List, Dict, Union, Optional, Tuple
 from dataclasses import dataclass, asdict
 from urllib.parse import urljoin, urlparse
 from pathlib import Path
@@ -20,6 +17,7 @@ from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, TaskPr
 from rich.console import Console
 import aiofiles
 import httpx
+import requests
 from multiprocessing import Pool
 
 console = Console()
@@ -88,6 +86,53 @@ class CrawlConfig:
             payload["remove_selectors_regex"] = self.remove_selectors_regex
         return payload
 
+# Module level function for multiprocessing
+def _process_url_parallel(args: Tuple[str, str, CrawlConfig]) -> CrawlResult:
+    """Process a single URL for parallel processing."""
+    url, base_url, config = args
+    try:
+        endpoint = f"{base_url}/convert"
+        payload = {
+            "url": url,
+            **config.to_dict()
+        }
+
+        response = requests.post(endpoint, json=payload, timeout=config.timeout)
+        if response.status_code != 200:
+            return CrawlResult(
+                url=url,
+                status="failed",
+                error=f"HTTP {response.status_code}: {response.text}",
+                config=config.to_dict()
+            )
+
+        markdown = response.text
+
+        # Save markdown if output directory is configured
+        if config.output_dir:
+            filepath = config.output_dir / f"{slugify(url)}.md"
+            with open(filepath, 'w', encoding='utf-8') as f:
+                f.write(markdown)
+
+        # Add delay if configured
+        if config.request_delay:
+            time.sleep(config.request_delay)
+
+        return CrawlResult(
+            url=url,
+            status="success",
+            markdown=markdown,
+            config=config.to_dict()
+        )
+
+    except Exception as e:
+        return CrawlResult(
+            url=url,
+            status="failed",
+            error=str(e),
+            config=config.to_dict()
+        )
+
 class SpiderForce4AI:
     """Main class for interacting with SpiderForce4AI service."""
 
```
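For context on the pattern used above: `multiprocessing.Pool` pickles the callable and its arguments before shipping them to worker processes, so a top-level function that takes a plain tuple serializes cleanly, whereas dispatching a method would also drag the whole client object (and any open sessions or executors it holds) through the pickler. A minimal, self-contained sketch of the same dispatch pattern; the URLs, port, and `fake_convert` name are placeholders for illustration, not code from the package:

```python
from multiprocessing import Pool
from typing import Tuple


def fake_convert(args: Tuple[str, str]) -> str:
    """Top-level worker: picklable, so Pool can send it to child processes."""
    url, base_url = args
    # A real worker would POST to f"{base_url}/convert"; here we only build the endpoint string.
    return f"{base_url}/convert <- {url}"


if __name__ == "__main__":
    jobs = [(u, "http://localhost:3004")
            for u in ("https://example.com/a", "https://example.com/b")]
    with Pool(processes=2) as pool:
        # imap_unordered yields results as workers finish, like the crawler's progress loop.
        for line in pool.imap_unordered(fake_convert, jobs):
            print(line)
```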
```diff
@@ -140,6 +185,25 @@ class SpiderForce4AI:
         except Exception as e:
             console.print(f"[yellow]Warning: Failed to send webhook for {result.url}: {str(e)}[/yellow]")
 
+    def _save_report_sync(self, results: List[CrawlResult], config: CrawlConfig) -> None:
+        """Save crawl report synchronously."""
+        report = {
+            "timestamp": datetime.now().isoformat(),
+            "config": config.to_dict(),
+            "results": {
+                "successful": [asdict(r) for r in results if r.status == "success"],
+                "failed": [asdict(r) for r in results if r.status == "failed"]
+            },
+            "summary": {
+                "total": len(results),
+                "successful": len([r for r in results if r.status == "success"]),
+                "failed": len([r for r in results if r.status == "failed"])
+            }
+        }
+
+        with open(config.report_file, 'w', encoding='utf-8') as f:
+            json.dump(report, f, indent=2)
+
     async def _save_report(self, config: CrawlConfig):
         """Save crawl report to JSON file."""
         if not config.report_file:
@@ -286,28 +350,8 @@ class SpiderForce4AI:
         """Synchronous version of crawl_sitemap_async."""
         return asyncio.run(self.crawl_sitemap_async(sitemap_url, config))
 
-    async def __aenter__(self):
-        """Async context manager entry."""
-        await self._ensure_session()
-        return self
-
-    async def __aexit__(self, exc_type, exc_val, exc_tb):
-        """Async context manager exit."""
-        await self._close_session()
-
-    def __enter__(self):
-        """Sync context manager entry."""
-        return self
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        """Sync context manager exit."""
-        self._executor.shutdown(wait=True)
-
-
     def crawl_sitemap_parallel(self, sitemap_url: str, config: CrawlConfig) -> List[CrawlResult]:
-        """
-        Crawl sitemap URLs in parallel using multiprocessing (no asyncio required).
-        """
+        """Crawl sitemap URLs in parallel using multiprocessing (no asyncio required)."""
         print(f"Fetching sitemap from {sitemap_url}...")
 
         # Fetch sitemap
@@ -329,52 +373,12 @@ class SpiderForce4AI:
             print(f"Error parsing sitemap: {str(e)}")
             raise
 
-
-
-            endpoint = f"{self.base_url}/convert"
-            payload = {
-                "url": url,
-                **config.to_dict()
-            }
-
-            response = requests.post(endpoint, json=payload, timeout=config.timeout)
-            if response.status_code != 200:
-                return CrawlResult(
-                    url=url,
-                    status="failed",
-                    error=f"HTTP {response.status_code}: {response.text}",
-                    config=config.to_dict()
-                )
-
-            markdown = response.text
-
-            # Save markdown if output directory is configured
-            if config.output_dir:
-                filepath = config.output_dir / f"{slugify(url)}.md"
-                with open(filepath, 'w', encoding='utf-8') as f:
-                    f.write(markdown)
-
-            # Add delay if configured
-            if config.request_delay:
-                time.sleep(config.request_delay)
-
-            return CrawlResult(
-                url=url,
-                status="success",
-                markdown=markdown,
-                config=config.to_dict()
-            )
-
-        except Exception as e:
-            return CrawlResult(
-                url=url,
-                status="failed",
-                error=str(e),
-                config=config.to_dict()
-            )
+        # Prepare arguments for parallel processing
+        process_args = [(url, self.base_url, config) for url in urls]
 
         # Create process pool and execute crawls
         results = []
+
         with Pool(processes=config.max_concurrent_requests) as pool:
             with Progress(
                 SpinnerColumn(),
@@ -385,7 +389,7 @@ class SpiderForce4AI:
             ) as progress:
                 task = progress.add_task("Crawling URLs...", total=len(urls))
 
-                for result in pool.imap_unordered(
+                for result in pool.imap_unordered(_process_url_parallel, process_args):
                     results.append(result)
                     progress.update(task, advance=1)
                     status = "✓" if result.status == "success" else "✗"
@@ -405,21 +409,19 @@ class SpiderForce4AI:
 
         return results
 
-    def
-        """
-
-
-            "config": config.to_dict(),
-            "results": {
-                "successful": [asdict(r) for r in results if r.status == "success"],
-                "failed": [asdict(r) for r in results if r.status == "failed"]
-            },
-            "summary": {
-                "total": len(results),
-                "successful": len([r for r in results if r.status == "success"]),
-                "failed": len([r for r in results if r.status == "failed"])
-            }
-        }
+    async def __aenter__(self):
+        """Async context manager entry."""
+        await self._ensure_session()
+        return self
 
-
-
+    async def __aexit__(self, exc_type, exc_val, exc_tb):
+        """Async context manager exit."""
+        await self._close_session()
+
+    def __enter__(self):
+        """Sync context manager entry."""
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        """Sync context manager exit."""
+        self._executor.shutdown(wait=True)
```
spiderforce4ai-0.1.6.dist-info/METADATA
ADDED

```
Metadata-Version: 2.2
Name: spiderforce4ai
Version: 0.1.6
Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service
Home-page: https://petertam.pro
Author: Piotr Tamulewicz
Author-email: Piotr Tamulewicz <pt@petertam.pro>
License: MIT
Classifier: Development Status :: 4 - Beta
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: MIT License
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Requires-Python: >=3.11
Description-Content-Type: text/markdown
Requires-Dist: aiohttp>=3.8.0
Requires-Dist: asyncio>=3.4.3
Requires-Dist: rich>=10.0.0
Requires-Dist: aiofiles>=0.8.0
Requires-Dist: httpx>=0.24.0
Dynamic: author
Dynamic: home-page
Dynamic: requires-python
```

# SpiderForce4AI Python Wrapper

A Python wrapper for SpiderForce4AI - a powerful HTML-to-Markdown conversion service. This package provides an easy-to-use interface for crawling websites and converting their content to clean Markdown format.

## Installation

```bash
pip install spiderforce4ai
```

## Quick Start (Minimal Setup)

```python
from spiderforce4ai import SpiderForce4AI, CrawlConfig

# Initialize with your SpiderForce4AI service URL
spider = SpiderForce4AI("http://localhost:3004")

# Use default configuration (will save in ./spiderforce_reports)
config = CrawlConfig()

# Crawl a single URL
result = spider.crawl_url("https://example.com", config)
```

## Crawling Methods

### 1. Single URL Crawling

```python
# Synchronous
result = spider.crawl_url("https://example.com", config)

# Asynchronous
async def crawl():
    result = await spider.crawl_url_async("https://example.com", config)
```

### 2. Multiple URLs Crawling

```python
# List of URLs
urls = [
    "https://example.com/page1",
    "https://example.com/page2",
    "https://example.com/page3"
]

# Synchronous
results = spider.crawl_urls(urls, config)

# Asynchronous
async def crawl():
    results = await spider.crawl_urls_async(urls, config)

# Parallel (using multiprocessing)
results = spider.crawl_urls_parallel(urls, config)
```

### 3. Sitemap Crawling

```python
# Synchronous
results = spider.crawl_sitemap("https://example.com/sitemap.xml", config)

# Asynchronous
async def crawl():
    results = await spider.crawl_sitemap_async("https://example.com/sitemap.xml", config)

# Parallel (using multiprocessing)
results = spider.crawl_sitemap_parallel("https://example.com/sitemap.xml", config)
```

## Configuration Options

All configuration options are optional with sensible defaults:

```python
config = CrawlConfig(
    # Content Selection (all optional)
    target_selector="article",              # Specific element to target
    remove_selectors=[                      # Elements to remove
        ".ads",
        "#popup",
        ".navigation",
        ".footer"
    ],
    remove_selectors_regex=["modal-\\d+"],  # Regex patterns for removal

    # Processing Settings
    max_concurrent_requests=1,              # Default: 1 (parallel processing)
    request_delay=0.5,                      # Delay between requests in seconds
    timeout=30,                             # Request timeout in seconds

    # Output Settings
    output_dir="custom_output",             # Default: "spiderforce_reports"
    report_file="custom_report.json",       # Default: "crawl_report.json"
    webhook_url="https://your-webhook.com", # Optional webhook endpoint
    webhook_timeout=10                      # Webhook timeout in seconds
)
```

## Real-World Examples

### 1. Basic Website Crawling

```python
from spiderforce4ai import SpiderForce4AI, CrawlConfig
from pathlib import Path

spider = SpiderForce4AI("http://localhost:3004")
config = CrawlConfig(
    output_dir=Path("blog_content")
)

result = spider.crawl_url("https://example.com/blog", config)
print(f"Content saved to: {result.url}.md")
```

### 2. Advanced Parallel Sitemap Crawling

```python
config = CrawlConfig(
    max_concurrent_requests=5,
    output_dir=Path("website_content"),
    remove_selectors=[
        ".navigation",
        ".footer",
        ".ads",
        "#cookie-notice"
    ],
    webhook_url="https://your-webhook.com/endpoint"
)

results = spider.crawl_sitemap_parallel(
    "https://example.com/sitemap.xml",
    config
)
```

### 3. Async Crawling with Progress

```python
import asyncio

async def main():
    config = CrawlConfig(
        max_concurrent_requests=3,
        request_delay=1.0
    )

    async with spider:
        results = await spider.crawl_urls_async([
            "https://example.com/1",
            "https://example.com/2",
            "https://example.com/3"
        ], config)

    return results

results = asyncio.run(main())
```

## Output Structure

### 1. File Organization
```
output_dir/
├── example-com-page1.md
├── example-com-page2.md
└── crawl_report.json
```

### 2. Markdown Files
Each markdown file is named using a slugified version of the URL and contains the converted content.
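The package's `slugify` helper itself is not shown in this diff; for illustration only, a rough equivalent that maps a URL to a filesystem-safe name could look like the following (an assumption, not the published implementation):

```python
import re

def slugify(url: str) -> str:
    """Reduce a URL to a filesystem-safe slug, e.g. 'https://example.com/page1' -> 'example-com-page1'."""
    slug = re.sub(r"^https?://", "", url.lower())  # drop the scheme
    slug = re.sub(r"[^a-z0-9]+", "-", slug)        # collapse everything else to hyphens
    return slug.strip("-") or "index"

print(slugify("https://example.com/page1"))  # example-com-page1
```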
### 3. Report JSON Structure
```json
{
  "timestamp": "2025-02-15T10:30:00.123456",
  "config": {
    "target_selector": "article",
    "remove_selectors": [".ads", "#popup"],
    "remove_selectors_regex": ["modal-\\d+"]
  },
  "results": {
    "successful": [
      {
        "url": "https://example.com/page1",
        "status": "success",
        "markdown": "# Page Title\n\nContent...",
        "timestamp": "2025-02-15T10:30:00.123456"
      }
    ],
    "failed": [
      {
        "url": "https://example.com/page2",
        "status": "failed",
        "error": "HTTP 404: Not Found",
        "timestamp": "2025-02-15T10:30:01.123456"
      }
    ]
  },
  "summary": {
    "total": 2,
    "successful": 1,
    "failed": 1
  }
}
```

### 4. Webhook Notifications
If configured, webhooks receive real-time updates in JSON format:
```json
{
  "url": "https://example.com/page1",
  "status": "success",
  "markdown": "# Page Title\n\nContent...",
  "timestamp": "2025-02-15T10:30:00.123456",
  "config": {
    "target_selector": "article",
    "remove_selectors": [".ads", "#popup"]
  }
}
```
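To watch these notifications during local testing, any HTTP endpoint that accepts JSON will do; a minimal stand-in receiver might look like this (the port and printed fields are illustrative assumptions):

```python
import json
from http.server import BaseHTTPRequestHandler, HTTPServer

class WebhookHandler(BaseHTTPRequestHandler):
    def do_POST(self):
        # Read the JSON payload the crawler POSTs and echo a short summary.
        length = int(self.headers.get("Content-Length", 0))
        payload = json.loads(self.rfile.read(length) or b"{}")
        print(f"{payload.get('status', '?'):7} {payload.get('url', '')}")
        self.send_response(200)
        self.end_headers()

if __name__ == "__main__":
    # Point webhook_url at http://localhost:9999 in CrawlConfig to see notifications here.
    HTTPServer(("localhost", 9999), WebhookHandler).serve_forever()
```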
## Error Handling

The package handles various types of errors:
- Network errors
- Timeout errors
- Invalid URLs
- Missing content
- Service errors

All errors are:
1. Logged in the console
2. Included in the JSON report
3. Sent via webhook (if configured)
4. Available in the results list
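Since failed URLs still come back as regular results (status `"failed"` plus an error message), they can be filtered and, if desired, retried straight from the returned list; a brief sketch under those assumptions (the retry pass is illustrative, not a built-in feature):

```python
from spiderforce4ai import SpiderForce4AI, CrawlConfig

spider = SpiderForce4AI("http://localhost:3004")
config = CrawlConfig()
urls = ["https://example.com/page1", "https://example.com/page2"]

results = spider.crawl_urls(urls, config)

# Failures are ordinary results carrying status "failed" and an error message.
for r in results:
    if r.status == "failed":
        print(f"✗ {r.url}: {r.error}")

# Retrying just the failed URLs is a matter of filtering and calling crawl_urls again.
failed_urls = [r.url for r in results if r.status == "failed"]
if failed_urls:
    retry_results = spider.crawl_urls(failed_urls, CrawlConfig(request_delay=2.0, timeout=60))
```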
## Requirements

- Python 3.11 or later
- Running SpiderForce4AI service
- Internet connection

## License

MIT License

## Credits

Created by [Peter Tam](https://petertam.pro)
spiderforce4ai-0.1.6.dist-info/RECORD
ADDED

```
spiderforce4ai/__init__.py,sha256=i1lHYILqFG_Eld0ZCbBdK5F_Jk0zYr_60vS46AYZfTM,16496
spiderforce4ai-0.1.6.dist-info/METADATA,sha256=7rcL1OGqYeF1QHWUIB9xHaKYxGGegs2zHNz0UTu-ego,6575
spiderforce4ai-0.1.6.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
spiderforce4ai-0.1.6.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
spiderforce4ai-0.1.6.dist-info/RECORD,,
```
spiderforce4ai-0.1.4.dist-info/METADATA
REMOVED

```
Metadata-Version: 2.2
Name: spiderforce4ai
Version: 0.1.4
Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service
Home-page: https://petertam.pro
Author: Piotr Tamulewicz
Author-email: Piotr Tamulewicz <pt@petertam.pro>
License: MIT
Classifier: Development Status :: 4 - Beta
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: MIT License
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Requires-Python: >=3.11
Description-Content-Type: text/markdown
Requires-Dist: aiohttp>=3.8.0
Requires-Dist: asyncio>=3.4.3
Requires-Dist: rich>=10.0.0
Requires-Dist: aiofiles>=0.8.0
Requires-Dist: httpx>=0.24.0
Dynamic: author
Dynamic: home-page
Dynamic: requires-python
```

# SpiderForce4AI Python Wrapper (Jina ai reader, fFrecrawl alternative)

A Python wrapper for SpiderForce4AI - a powerful HTML-to-Markdown conversion service. This package provides an easy-to-use interface for crawling websites and converting their content to clean Markdown format.

## Features

- 🔄 Simple synchronous and asynchronous APIs
- 📁 Automatic Markdown file saving with URL-based filenames
- 📊 Real-time progress tracking in console
- 🪝 Webhook support for real-time notifications
- 📝 Detailed crawl reports in JSON format
- ⚡ Concurrent crawling with rate limiting
- 🔍 Support for sitemap.xml crawling
- 🛡️ Comprehensive error handling

## Installation

```bash
pip install spiderforce4ai
```

## Quick Start

```python
from spiderforce4ai import SpiderForce4AI, CrawlConfig

# Initialize the client
spider = SpiderForce4AI("http://localhost:3004")

# Use default configuration
config = CrawlConfig()

# Crawl a single URL
result = spider.crawl_url("https://example.com", config)

# Crawl multiple URLs
urls = [
    "https://example.com/page1",
    "https://example.com/page2"
]
results = spider.crawl_urls(urls, config)

# Crawl from sitemap
results = spider.crawl_sitemap("https://example.com/sitemap.xml", config)
```

## Configuration

The `CrawlConfig` class provides various configuration options. All parameters are optional with sensible defaults:

```python
config = CrawlConfig(
    # Content Selection (all optional)
    target_selector="article",              # Specific element to target
    remove_selectors=[".ads", "#popup"],    # Elements to remove
    remove_selectors_regex=["modal-\\d+"],  # Regex patterns for removal

    # Processing Settings
    max_concurrent_requests=1,              # Default: 1
    request_delay=0.5,                      # Delay between requests in seconds
    timeout=30,                             # Request timeout in seconds

    # Output Settings
    output_dir="spiderforce_reports",       # Default output directory
    webhook_url="https://your-webhook.com", # Optional webhook endpoint
    webhook_timeout=10,                     # Webhook timeout in seconds
    report_file=None                        # Optional custom report location
)
```

### Default Directory Structure

```
./
└── spiderforce_reports/
    ├── example-com-page1.md
    ├── example-com-page2.md
    └── crawl_report.json
```

## Webhook Notifications

If `webhook_url` is configured, the crawler sends POST requests with the following JSON structure:

```json
{
  "url": "https://example.com/page1",
  "status": "success",
  "markdown": "# Page Title\n\nContent...",
  "timestamp": "2025-02-15T10:30:00.123456",
  "config": {
    "target_selector": "article",
    "remove_selectors": [".ads", "#popup"],
    "remove_selectors_regex": ["modal-\\d+"]
  }
}
```

## Crawl Report

A comprehensive JSON report is automatically generated in the output directory:

```json
{
  "timestamp": "2025-02-15T10:30:00.123456",
  "config": {
    "target_selector": "article",
    "remove_selectors": [".ads", "#popup"],
    "remove_selectors_regex": ["modal-\\d+"]
  },
  "results": {
    "successful": [
      {
        "url": "https://example.com/page1",
        "status": "success",
        "markdown": "# Page Title\n\nContent...",
        "timestamp": "2025-02-15T10:30:00.123456"
      }
    ],
    "failed": [
      {
        "url": "https://example.com/page2",
        "status": "failed",
        "error": "HTTP 404: Not Found",
        "timestamp": "2025-02-15T10:30:01.123456"
      }
    ]
  },
  "summary": {
    "total": 2,
    "successful": 1,
    "failed": 1
  }
}
```

## Async Usage

```python
import asyncio
from spiderforce4ai import SpiderForce4AI, CrawlConfig

async def main():
    config = CrawlConfig()
    spider = SpiderForce4AI("http://localhost:3004")

    async with spider:
        results = await spider.crawl_urls_async(
            ["https://example.com/page1", "https://example.com/page2"],
            config
        )

    return results

if __name__ == "__main__":
    results = asyncio.run(main())
```

## Error Handling

The crawler is designed to be resilient:
- Continues processing even if some URLs fail
- Records all errors in the crawl report
- Sends error notifications via webhook if configured
- Provides clear error messages in console output

## Progress Tracking

The crawler provides real-time progress tracking in the console:

```
🔄 Crawling URLs... [####################] 100%
✓ Successful: 95
✗ Failed: 5
📊 Report saved to: ./spiderforce_reports/crawl_report.json
```

## Usage with AI Agents

The package is designed to be easily integrated with AI agents and chat systems:

```python
from spiderforce4ai import SpiderForce4AI, CrawlConfig

def fetch_content_for_ai(urls):
    spider = SpiderForce4AI("http://localhost:3004")
    config = CrawlConfig()

    # Crawl content
    results = spider.crawl_urls(urls, config)

    # Return successful results
    return {
        result.url: result.markdown
        for result in results
        if result.status == "success"
    }

# Use with AI agent
urls = ["https://example.com/article1", "https://example.com/article2"]
content = fetch_content_for_ai(urls)
```

## Requirements

- Python 3.11 or later
- Docker (for running SpiderForce4AI service)

## License

MIT License

## Credits

Created by [Peter Tam](https://petertam.pro)
spiderforce4ai-0.1.4.dist-info/RECORD
REMOVED

```
spiderforce4ai/__init__.py,sha256=ZWt8m5r5tWmjHNE4x45yI-k522_tVCUvEPth-3Yulfg,16633
spiderforce4ai-0.1.4.dist-info/METADATA,sha256=olJX54IVWgw92JpagtLnH_wOERNSuBWXbOjw8uSTFq4,6214
spiderforce4ai-0.1.4.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
spiderforce4ai-0.1.4.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
spiderforce4ai-0.1.4.dist-info/RECORD,,
```
{spiderforce4ai-0.1.4.dist-info → spiderforce4ai-0.1.6.dist-info}/WHEEL
File without changes

{spiderforce4ai-0.1.4.dist-info → spiderforce4ai-0.1.6.dist-info}/top_level.txt
File without changes