spiderforce4ai-0.1.0-py3-none-any.whl

@@ -0,0 +1,303 @@
+ """
+ SpiderForce4AI Python Wrapper
+ A Python package for interacting with the SpiderForce4AI HTML-to-Markdown conversion service.
+ """
+
+ import asyncio
+ import aiohttp
+ import json
+ import logging
+ from typing import List, Dict, Union, Optional
+ from dataclasses import dataclass, asdict
+ from urllib.parse import urljoin, urlparse
+ from pathlib import Path
+ import time
+ import xml.etree.ElementTree as ET
+ from concurrent.futures import ThreadPoolExecutor
+ from datetime import datetime
+ import re
+ from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, TaskProgressColumn
+ from rich.console import Console
+ import aiofiles
+ import httpx
+
+ console = Console()
+
+ def slugify(url: str) -> str:
+     """Convert URL to a valid filename."""
+     parsed = urlparse(url)
+     # Combine domain and path, remove scheme and special characters
+     slug = f"{parsed.netloc}{parsed.path}"
+     slug = re.sub(r'[^\w\-]', '_', slug)
+     slug = re.sub(r'_+', '_', slug)  # Replace multiple underscores with single
+     return slug.strip('_')
+
+ @dataclass
+ class CrawlResult:
+     """Store results of a crawl operation."""
+     url: str
+     status: str  # 'success' or 'failed'
+     markdown: Optional[str] = None
+     error: Optional[str] = None
+     timestamp: Optional[str] = None
+     config: Optional[Dict] = None
+
+     def __post_init__(self):
+         if not self.timestamp:
+             self.timestamp = datetime.now().isoformat()
+
+ @dataclass
+ class CrawlConfig:
+     """Configuration for crawling settings."""
+     target_selector: Optional[str] = None  # Optional - specific element to target
+     remove_selectors: Optional[List[str]] = None  # Optional - elements to remove
+     remove_selectors_regex: Optional[List[str]] = None  # Optional - regex patterns for removal
+     max_concurrent_requests: int = 1  # Default to one request at a time
+     request_delay: float = 0.5  # Delay between requests in seconds
+     timeout: int = 30  # Request timeout in seconds
+     output_dir: Path = Path("spiderforce_reports")  # Default to spiderforce_reports in the current directory
+     webhook_url: Optional[str] = None  # Optional webhook endpoint
+     webhook_timeout: int = 10  # Webhook timeout in seconds
+     report_file: Optional[Path] = None  # Optional report file location
+
+     def __post_init__(self):
+         # Initialize empty lists for selectors if None
+         self.remove_selectors = self.remove_selectors or []
+         self.remove_selectors_regex = self.remove_selectors_regex or []
+
+         # Ensure output_dir is a Path and exists
+         self.output_dir = Path(self.output_dir)
+         self.output_dir.mkdir(parents=True, exist_ok=True)
+
+         # If report_file is not specified, create it in output_dir
+         if self.report_file is None:
+             self.report_file = self.output_dir / "crawl_report.json"
+         else:
+             self.report_file = Path(self.report_file)
+
+     def to_dict(self) -> Dict:
+         """Convert config to dictionary for API requests."""
+         payload = {}
+         # Only include selectors if they are set
+         if self.target_selector:
+             payload["target_selector"] = self.target_selector
+         if self.remove_selectors:
+             payload["remove_selectors"] = self.remove_selectors
+         if self.remove_selectors_regex:
+             payload["remove_selectors_regex"] = self.remove_selectors_regex
+         return payload
+
+ class SpiderForce4AI:
+     """Main class for interacting with SpiderForce4AI service."""
+
+     def __init__(self, base_url: str):
+         self.base_url = base_url.rstrip('/')
+         self.session = None
+         self._executor = ThreadPoolExecutor()
+         self.crawl_results: List[CrawlResult] = []
+
+     async def _ensure_session(self):
+         """Ensure aiohttp session exists."""
+         if self.session is None or self.session.closed:
+             self.session = aiohttp.ClientSession()
+
+     async def _close_session(self):
+         """Close aiohttp session."""
+         if self.session and not self.session.closed:
+             await self.session.close()
+
+     async def _save_markdown(self, url: str, markdown: str, output_dir: Path):
+         """Save markdown content to file."""
+         filename = f"{slugify(url)}.md"
+         filepath = output_dir / filename
+         async with aiofiles.open(filepath, 'w', encoding='utf-8') as f:
+             await f.write(markdown)
+         return filepath
+
+     async def _send_webhook(self, result: CrawlResult, config: CrawlConfig):
+         """Send webhook with crawl results."""
+         if not config.webhook_url:
+             return
+
+         payload = {
+             "url": result.url,
+             "status": result.status,
+             "markdown": result.markdown if result.status == "success" else None,
+             "error": result.error if result.status == "failed" else None,
+             "timestamp": result.timestamp,
+             "config": config.to_dict()
+         }
+
+         try:
+             async with httpx.AsyncClient() as client:
+                 response = await client.post(
+                     config.webhook_url,
+                     json=payload,
+                     timeout=config.webhook_timeout
+                 )
+                 response.raise_for_status()
+         except Exception as e:
+             console.print(f"[yellow]Warning: Failed to send webhook for {result.url}: {str(e)}[/yellow]")
+
+     async def _save_report(self, config: CrawlConfig):
+         """Save crawl report to JSON file."""
+         if not config.report_file:
+             return
+
+         report = {
+             "timestamp": datetime.now().isoformat(),
+             "config": config.to_dict(),
+             "results": {
+                 "successful": [asdict(r) for r in self.crawl_results if r.status == "success"],
+                 "failed": [asdict(r) for r in self.crawl_results if r.status == "failed"]
+             },
+             "summary": {
+                 "total": len(self.crawl_results),
+                 "successful": len([r for r in self.crawl_results if r.status == "success"]),
+                 "failed": len([r for r in self.crawl_results if r.status == "failed"])
+             }
+         }
+
+         async with aiofiles.open(config.report_file, 'w', encoding='utf-8') as f:
+             await f.write(json.dumps(report, indent=2))
+
+     async def crawl_url_async(self, url: str, config: CrawlConfig) -> CrawlResult:
+         """Crawl a single URL asynchronously."""
+         await self._ensure_session()
+
+         try:
+             endpoint = f"{self.base_url}/convert"
+             payload = {
+                 "url": url,
+                 **config.to_dict()
+             }
+
+             async with self.session.post(endpoint, json=payload, timeout=config.timeout) as response:
+                 if response.status != 200:
+                     error_text = await response.text()
+                     result = CrawlResult(
+                         url=url,
+                         status="failed",
+                         error=f"HTTP {response.status}: {error_text}",
+                         config=config.to_dict()
+                     )
+                 else:
+                     markdown = await response.text()
+                     result = CrawlResult(
+                         url=url,
+                         status="success",
+                         markdown=markdown,
+                         config=config.to_dict()
+                     )
+
+                     if config.output_dir:
+                         await self._save_markdown(url, markdown, config.output_dir)
+
+             await self._send_webhook(result, config)
+
+             self.crawl_results.append(result)
+             return result
+
+         except Exception as e:
+             result = CrawlResult(
+                 url=url,
+                 status="failed",
+                 error=str(e),
+                 config=config.to_dict()
+             )
+             self.crawl_results.append(result)
+             return result
+
+     def crawl_url(self, url: str, config: CrawlConfig) -> CrawlResult:
+         """Synchronous version of crawl_url_async."""
+         return asyncio.run(self.crawl_url_async(url, config))
+
+     async def crawl_urls_async(self, urls: List[str], config: CrawlConfig) -> List[CrawlResult]:
+         """Crawl multiple URLs asynchronously with progress bar."""
+         await self._ensure_session()
+
+         with Progress(
+             SpinnerColumn(),
+             TextColumn("[progress.description]{task.description}"),
+             BarColumn(),
+             TaskProgressColumn(),
+             console=console
+         ) as progress:
+             task = progress.add_task("[cyan]Crawling URLs...", total=len(urls))
+
+             async def crawl_with_progress(url):
+                 result = await self.crawl_url_async(url, config)
+                 progress.update(task, advance=1, description=f"[cyan]Crawled: {url}")
+                 return result
+
+             semaphore = asyncio.Semaphore(config.max_concurrent_requests)
+             async def crawl_with_semaphore(url):
+                 async with semaphore:
+                     result = await crawl_with_progress(url)
+                     await asyncio.sleep(config.request_delay)
+                     return result
+
+             results = await asyncio.gather(*[crawl_with_semaphore(url) for url in urls])
+
+         # Save final report
+         await self._save_report(config)
+
+         # Print summary
+         successful = len([r for r in results if r.status == "success"])
+         failed = len([r for r in results if r.status == "failed"])
+         console.print(f"\n[green]Crawling completed:[/green]")
+         console.print(f"✓ Successful: {successful}")
+         console.print(f"✗ Failed: {failed}")
+
+         if config.report_file:
+             console.print(f"📊 Report saved to: {config.report_file}")
+
+         return results
+
+     def crawl_urls(self, urls: List[str], config: CrawlConfig) -> List[CrawlResult]:
+         """Synchronous version of crawl_urls_async."""
+         return asyncio.run(self.crawl_urls_async(urls, config))
+
+     async def crawl_sitemap_async(self, sitemap_url: str, config: CrawlConfig) -> List[CrawlResult]:
+         """Crawl URLs from a sitemap asynchronously."""
+         await self._ensure_session()
+
+         try:
+             console.print(f"[cyan]Fetching sitemap from {sitemap_url}...[/cyan]")
+             async with self.session.get(sitemap_url, timeout=config.timeout) as response:
+                 sitemap_text = await response.text()
+         except Exception as e:
+             console.print(f"[red]Error fetching sitemap: {str(e)}[/red]")
+             raise
+
+         try:
+             root = ET.fromstring(sitemap_text)
+             namespace = {'ns': root.tag.split('}')[0].strip('{')}
+             urls = [loc.text for loc in root.findall('.//ns:loc', namespace)]
+             console.print(f"[green]Found {len(urls)} URLs in sitemap[/green]")
+         except Exception as e:
+             console.print(f"[red]Error parsing sitemap: {str(e)}[/red]")
+             raise
+
+         return await self.crawl_urls_async(urls, config)
+
+     def crawl_sitemap(self, sitemap_url: str, config: CrawlConfig) -> List[CrawlResult]:
+         """Synchronous version of crawl_sitemap_async."""
+         return asyncio.run(self.crawl_sitemap_async(sitemap_url, config))
+
+     async def __aenter__(self):
+         """Async context manager entry."""
+         await self._ensure_session()
+         return self
+
+     async def __aexit__(self, exc_type, exc_val, exc_tb):
+         """Async context manager exit."""
+         await self._close_session()
+
+     def __enter__(self):
+         """Sync context manager entry."""
+         return self
+
+     def __exit__(self, exc_type, exc_val, exc_tb):
+         """Sync context manager exit."""
+         self._executor.shutdown(wait=True)
@@ -0,0 +1,239 @@
+ Metadata-Version: 2.2
+ Name: spiderforce4ai
+ Version: 0.1.0
+ Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service
+ Home-page: https://petertam.pro
+ Author: Piotr Tamulewicz
+ Author-email: Piotr Tamulewicz <pt@petertam.pro>
+ License: MIT
+ Classifier: Development Status :: 4 - Beta
+ Classifier: Intended Audience :: Developers
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
+ Requires-Python: >=3.11
+ Description-Content-Type: text/markdown
+ Requires-Dist: aiohttp>=3.8.0
+ Requires-Dist: asyncio>=3.4.3
+ Requires-Dist: rich>=10.0.0
+ Requires-Dist: aiofiles>=0.8.0
+ Requires-Dist: httpx>=0.24.0
+ Dynamic: author
+ Dynamic: home-page
+ Dynamic: requires-python
+
+ # SpiderForce4AI Python Wrapper
+
+ A Python wrapper for SpiderForce4AI - a powerful HTML-to-Markdown conversion service. This package provides an easy-to-use interface for crawling websites and converting their content to clean Markdown format.
+
+ ## Features
+
+ - 🔄 Simple synchronous and asynchronous APIs
+ - 📁 Automatic Markdown file saving with URL-based filenames
+ - 📊 Real-time progress tracking in console
+ - 🪝 Webhook support for real-time notifications
+ - 📝 Detailed crawl reports in JSON format
+ - ⚡ Concurrent crawling with rate limiting
+ - 🔍 Support for sitemap.xml crawling
+ - 🛡️ Comprehensive error handling
+
+ ## Installation
+
+ ```bash
+ pip install spiderforce4ai
+ ```
+
+ ## Quick Start
+
+ ```python
+ from spiderforce4ai import SpiderForce4AI, CrawlConfig
+
+ # Initialize the client
+ spider = SpiderForce4AI("http://localhost:3004")
+
+ # Use default configuration
+ config = CrawlConfig()
+
+ # Crawl a single URL
+ result = spider.crawl_url("https://example.com", config)
+
+ # Crawl multiple URLs
+ urls = [
+     "https://example.com/page1",
+     "https://example.com/page2"
+ ]
+ results = spider.crawl_urls(urls, config)
+
+ # Crawl from sitemap
+ results = spider.crawl_sitemap("https://example.com/sitemap.xml", config)
+ ```
+
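+ Each call returns `CrawlResult` objects (a single result for `crawl_url`, a list for `crawl_urls` and `crawl_sitemap`), so the converted Markdown and the crawl status are available directly. A short sketch using only the fields defined on `CrawlResult`:
+
+ ```python
+ result = spider.crawl_url("https://example.com", config)
+
+ if result.status == "success":
+     print(result.markdown[:200])  # the Markdown is also saved under config.output_dir
+ else:
+     print(f"Conversion failed: {result.error}")
+ ```
+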
+ ## Configuration
+
+ The `CrawlConfig` class provides various configuration options. All parameters are optional with sensible defaults:
+
+ ```python
+ config = CrawlConfig(
+     # Content Selection (all optional)
+     target_selector="article",              # Specific element to target
+     remove_selectors=[".ads", "#popup"],    # Elements to remove
+     remove_selectors_regex=["modal-\\d+"],  # Regex patterns for removal
+
+     # Processing Settings
+     max_concurrent_requests=1,              # Default: 1
+     request_delay=0.5,                      # Delay between requests in seconds
+     timeout=30,                             # Request timeout in seconds
+
+     # Output Settings
+     output_dir="spiderforce_reports",       # Default output directory
+     webhook_url="https://your-webhook.com", # Optional webhook endpoint
+     webhook_timeout=10,                     # Webhook timeout in seconds
+     report_file=None                        # Optional custom report location
+ )
+ ```
+
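+ Note that only the content-selection options are forwarded to the conversion service; concurrency, delays, and output settings are applied on the client side. This mirrors `CrawlConfig.to_dict()` in the package source:
+
+ ```python
+ config = CrawlConfig(target_selector="article", request_delay=1.0)
+ print(config.to_dict())  # {'target_selector': 'article'} - only selector options are sent
+ ```
+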
+ ### Default Directory Structure
+
+ ```
+ ./
+ └── spiderforce_reports/
+     ├── example_com_page1.md
+     ├── example_com_page2.md
+     └── crawl_report.json
+ ```
+
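+ Output filenames are derived from the crawled URL: the scheme is dropped and every character other than letters, digits, underscores, and hyphens is replaced with an underscore (see `slugify()` in the package source). If you need to predict a filename, the helper can be imported directly:
+
+ ```python
+ from spiderforce4ai import slugify
+
+ print(slugify("https://example.com/page1"))  # example_com_page1 -> saved as example_com_page1.md
+ ```
+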
+ ## Webhook Notifications
+
+ If `webhook_url` is configured, the crawler sends a POST request for every crawled URL with the following JSON structure:
+
+ ```json
+ {
+   "url": "https://example.com/page1",
+   "status": "success",
+   "markdown": "# Page Title\n\nContent...",
+   "error": null,
+   "timestamp": "2025-02-15T10:30:00.123456",
+   "config": {
+     "target_selector": "article",
+     "remove_selectors": [".ads", "#popup"],
+     "remove_selectors_regex": ["modal-\\d+"]
+   }
+ }
+ ```
+
+ ## Crawl Report
+
+ A comprehensive JSON report is automatically generated in the output directory:
+
+ ```json
+ {
+   "timestamp": "2025-02-15T10:30:00.123456",
+   "config": {
+     "target_selector": "article",
+     "remove_selectors": [".ads", "#popup"],
+     "remove_selectors_regex": ["modal-\\d+"]
+   },
+   "results": {
+     "successful": [
+       {
+         "url": "https://example.com/page1",
+         "status": "success",
+         "markdown": "# Page Title\n\nContent...",
+         "timestamp": "2025-02-15T10:30:00.123456"
+       }
+     ],
+     "failed": [
+       {
+         "url": "https://example.com/page2",
+         "status": "failed",
+         "error": "HTTP 404: Not Found",
+         "timestamp": "2025-02-15T10:30:01.123456"
+       }
+     ]
+   },
+   "summary": {
+     "total": 2,
+     "successful": 1,
+     "failed": 1
+   }
+ }
+ ```
+
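+ Because the report is plain JSON, it can be post-processed with the standard library; for example (the path assumes the default configuration):
+
+ ```python
+ import json
+ from pathlib import Path
+
+ report = json.loads(Path("spiderforce_reports/crawl_report.json").read_text(encoding="utf-8"))
+ print(report["summary"])  # e.g. {'total': 2, 'successful': 1, 'failed': 1}
+ for failure in report["results"]["failed"]:
+     print(failure["url"], "->", failure["error"])
+ ```
+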
+ ## Async Usage
+
+ ```python
+ import asyncio
+ from spiderforce4ai import SpiderForce4AI, CrawlConfig
+
+ async def main():
+     config = CrawlConfig()
+     spider = SpiderForce4AI("http://localhost:3004")
+
+     async with spider:
+         results = await spider.crawl_urls_async(
+             ["https://example.com/page1", "https://example.com/page2"],
+             config
+         )
+
+     return results
+
+ if __name__ == "__main__":
+     results = asyncio.run(main())
+ ```
+
+ ## Error Handling
+
+ The crawler is designed to be resilient:
+ - Continues processing even if some URLs fail
+ - Records all errors in the crawl report and in the returned results (see the example below)
+ - Sends error notifications via webhook if configured
+ - Provides clear error messages in console output
+
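+ For example, failed URLs can be separated from successful ones after a crawl, using only the fields defined on `CrawlResult`:
+
+ ```python
+ from spiderforce4ai import SpiderForce4AI, CrawlConfig
+
+ spider = SpiderForce4AI("http://localhost:3004")
+ results = spider.crawl_urls(
+     ["https://example.com/page1", "https://example.com/missing-page"],  # illustrative URLs
+     CrawlConfig()
+ )
+
+ failures = [r for r in results if r.status == "failed"]
+ for r in failures:
+     print(f"{r.url}: {r.error}")  # e.g. "HTTP 404: Not Found"
+ ```
+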
+ ## Progress Tracking
+
+ The crawler provides real-time progress tracking in the console:
+
+ ```
+ 🔄 Crawling URLs... [####################] 100%
+ ✓ Successful: 95
+ ✗ Failed: 5
+ 📊 Report saved to: ./spiderforce_reports/crawl_report.json
+ ```
+
+ ## Usage with AI Agents
+
+ The package is designed to be easily integrated with AI agents and chat systems:
+
+ ```python
+ from spiderforce4ai import SpiderForce4AI, CrawlConfig
+
+ def fetch_content_for_ai(urls):
+     spider = SpiderForce4AI("http://localhost:3004")
+     config = CrawlConfig()
+
+     # Crawl content
+     results = spider.crawl_urls(urls, config)
+
+     # Return successful results
+     return {
+         result.url: result.markdown
+         for result in results
+         if result.status == "success"
+     }
+
+ # Use with AI agent
+ urls = ["https://example.com/article1", "https://example.com/article2"]
+ content = fetch_content_for_ai(urls)
+ ```
+
+ ## Requirements
+
+ - Python 3.11 or later
+ - Docker (for running the SpiderForce4AI service)
+
+ ## License
+
+ MIT License
+
+ ## Credits
+
+ Created by [Peter Tam](https://petertam.pro)
@@ -0,0 +1,5 @@
+ spiderforce4ai/__init__.py,sha256=TTUtXHp4QvFLhh4vgh0bCvYAyJEAZ-8xguoBNVcQUZI,11815
+ spiderforce4ai-0.1.0.dist-info/METADATA,sha256=X2Y8tb-sgJ_8fnilV9yHA_qM3xE1OQmTZPtXohT2nsg,6174
+ spiderforce4ai-0.1.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+ spiderforce4ai-0.1.0.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
+ spiderforce4ai-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
+ Wheel-Version: 1.0
+ Generator: setuptools (75.8.0)
+ Root-Is-Purelib: true
+ Tag: py3-none-any
+
@@ -0,0 +1 @@
+ spiderforce4ai