spiderforce4ai 0.1.5__tar.gz → 0.1.7__tar.gz
- spiderforce4ai-0.1.7/PKG-INFO +278 -0
- spiderforce4ai-0.1.7/README.md +254 -0
- {spiderforce4ai-0.1.5 → spiderforce4ai-0.1.7}/pyproject.toml +1 -1
- {spiderforce4ai-0.1.5 → spiderforce4ai-0.1.7}/setup.py +1 -1
- {spiderforce4ai-0.1.5 → spiderforce4ai-0.1.7}/spiderforce4ai/__init__.py +45 -8
- spiderforce4ai-0.1.7/spiderforce4ai.egg-info/PKG-INFO +278 -0
- spiderforce4ai-0.1.5/PKG-INFO +0 -239
- spiderforce4ai-0.1.5/README.md +0 -215
- spiderforce4ai-0.1.5/spiderforce4ai.egg-info/PKG-INFO +0 -239
- {spiderforce4ai-0.1.5 → spiderforce4ai-0.1.7}/setup.cfg +0 -0
- {spiderforce4ai-0.1.5 → spiderforce4ai-0.1.7}/spiderforce4ai.egg-info/SOURCES.txt +0 -0
- {spiderforce4ai-0.1.5 → spiderforce4ai-0.1.7}/spiderforce4ai.egg-info/dependency_links.txt +0 -0
- {spiderforce4ai-0.1.5 → spiderforce4ai-0.1.7}/spiderforce4ai.egg-info/requires.txt +0 -0
- {spiderforce4ai-0.1.5 → spiderforce4ai-0.1.7}/spiderforce4ai.egg-info/top_level.txt +0 -0
spiderforce4ai-0.1.7/PKG-INFO (+278 lines, new file)

@@ -0,0 +1,278 @@

Metadata-Version: 2.2
Name: spiderforce4ai
Version: 0.1.7
Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service
Home-page: https://petertam.pro
Author: Piotr Tamulewicz
Author-email: Piotr Tamulewicz <pt@petertam.pro>
License: MIT
Classifier: Development Status :: 4 - Beta
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: MIT License
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Requires-Python: >=3.11
Description-Content-Type: text/markdown
Requires-Dist: aiohttp>=3.8.0
Requires-Dist: asyncio>=3.4.3
Requires-Dist: rich>=10.0.0
Requires-Dist: aiofiles>=0.8.0
Requires-Dist: httpx>=0.24.0
Dynamic: author
Dynamic: home-page
Dynamic: requires-python

# SpiderForce4AI Python Wrapper

A Python wrapper for SpiderForce4AI - a powerful HTML-to-Markdown conversion service. This package provides an easy-to-use interface for crawling websites and converting their content to clean Markdown format.

## Installation

```bash
pip install spiderforce4ai
```

## Quick Start (Minimal Setup)

```python
from spiderforce4ai import SpiderForce4AI, CrawlConfig

# Initialize with your SpiderForce4AI service URL
spider = SpiderForce4AI("http://localhost:3004")

# Use default configuration (will save in ./spiderforce_reports)
config = CrawlConfig()

# Crawl a single URL
result = spider.crawl_url("https://example.com", config)
```
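The returned object is a `CrawlResult`; based on the report structure below and the `__init__.py` changes further down this diff, it carries `url`, `status`, `markdown`, and `error` fields. A quick way to inspect a single crawl, assuming those attribute names, is:

```python
result = spider.crawl_url("https://example.com", config)

if result.status == "success":
    print(result.markdown[:200])  # first 200 characters of the converted Markdown
else:
    print(f"Crawl failed for {result.url}: {result.error}")
```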
## Crawling Methods

### 1. Single URL Crawling

```python
# Synchronous
result = spider.crawl_url("https://example.com", config)

# Asynchronous
async def crawl():
    result = await spider.crawl_url_async("https://example.com", config)
```

### 2. Multiple URLs Crawling

```python
# List of URLs
urls = [
    "https://example.com/page1",
    "https://example.com/page2",
    "https://example.com/page3"
]

# Synchronous
results = spider.crawl_urls(urls, config)

# Asynchronous
async def crawl():
    results = await spider.crawl_urls_async(urls, config)

# Parallel (using multiprocessing)
results = spider.crawl_urls_parallel(urls, config)
```

### 3. Sitemap Crawling

```python
# Synchronous
results = spider.crawl_sitemap("https://example.com/sitemap.xml", config)

# Asynchronous
async def crawl():
    results = await spider.crawl_sitemap_async("https://example.com/sitemap.xml", config)

# Parallel (using multiprocessing)
results = spider.crawl_sitemap_parallel("https://example.com/sitemap.xml", config)
```

## Configuration Options

All configuration options are optional with sensible defaults:

```python
config = CrawlConfig(
    # Content Selection (all optional)
    target_selector="article",              # Specific element to target
    remove_selectors=[                      # Elements to remove
        ".ads",
        "#popup",
        ".navigation",
        ".footer"
    ],
    remove_selectors_regex=["modal-\\d+"],  # Regex patterns for removal

    # Processing Settings
    max_concurrent_requests=1,              # Default: 1 (parallel processing)
    request_delay=0.5,                      # Delay between requests in seconds
    timeout=30,                             # Request timeout in seconds

    # Output Settings
    output_dir="custom_output",             # Default: "spiderforce_reports"
    report_file="custom_report.json",       # Default: "crawl_report.json"
    webhook_url="https://your-webhook.com", # Optional webhook endpoint
    webhook_timeout=10                      # Webhook timeout in seconds
)
```

## Real-World Examples

### 1. Basic Website Crawling

```python
from spiderforce4ai import SpiderForce4AI, CrawlConfig
from pathlib import Path

spider = SpiderForce4AI("http://localhost:3004")
config = CrawlConfig(
    output_dir=Path("blog_content")
)

result = spider.crawl_url("https://example.com/blog", config)
print(f"Content saved to: {result.url}.md")
```

### 2. Advanced Parallel Sitemap Crawling

```python
config = CrawlConfig(
    max_concurrent_requests=5,
    output_dir=Path("website_content"),
    remove_selectors=[
        ".navigation",
        ".footer",
        ".ads",
        "#cookie-notice"
    ],
    webhook_url="https://your-webhook.com/endpoint"
)

results = spider.crawl_sitemap_parallel(
    "https://example.com/sitemap.xml",
    config
)
```

### 3. Async Crawling with Progress

```python
import asyncio

async def main():
    config = CrawlConfig(
        max_concurrent_requests=3,
        request_delay=1.0
    )

    async with spider:
        results = await spider.crawl_urls_async([
            "https://example.com/1",
            "https://example.com/2",
            "https://example.com/3"
        ], config)

    return results

results = asyncio.run(main())
```

## Output Structure

### 1. File Organization
```
output_dir/
├── example-com-page1.md
├── example-com-page2.md
└── crawl_report.json
```

### 2. Markdown Files
Each markdown file is named using a slugified version of the URL and contains the converted content.
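The package's exact slug rules are not shown in this diff; purely as an illustration of the `example-com-page1.md` pattern above, a URL-to-filename mapping could look roughly like this (hypothetical helper, not the package's own implementation):

```python
import re
from urllib.parse import urlparse

def illustrative_slug(url: str) -> str:
    # "https://example.com/page1" -> "example-com-page1.md" (approximation only)
    parsed = urlparse(url)
    raw = f"{parsed.netloc}{parsed.path}"
    slug = re.sub(r"[^a-zA-Z0-9]+", "-", raw).strip("-").lower()
    return f"{slug}.md"
```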
### 3. Report JSON Structure
```json
{
  "timestamp": "2025-02-15T10:30:00.123456",
  "config": {
    "target_selector": "article",
    "remove_selectors": [".ads", "#popup"],
    "remove_selectors_regex": ["modal-\\d+"]
  },
  "results": {
    "successful": [
      {
        "url": "https://example.com/page1",
        "status": "success",
        "markdown": "# Page Title\n\nContent...",
        "timestamp": "2025-02-15T10:30:00.123456"
      }
    ],
    "failed": [
      {
        "url": "https://example.com/page2",
        "status": "failed",
        "error": "HTTP 404: Not Found",
        "timestamp": "2025-02-15T10:30:01.123456"
      }
    ]
  },
  "summary": {
    "total": 2,
    "successful": 1,
    "failed": 1
  }
}
```
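A minimal sketch for consuming this report after a crawl, assuming the default `spiderforce_reports/crawl_report.json` location and the structure shown above:

```python
import json
from pathlib import Path

report = json.loads(
    Path("spiderforce_reports/crawl_report.json").read_text(encoding="utf-8")
)

# Print the overall summary, then every failed URL with its error message.
print(f"{report['summary']['successful']}/{report['summary']['total']} pages converted")
for failure in report["results"]["failed"]:
    print(f"FAILED {failure['url']}: {failure['error']}")
```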
### 4. Webhook Notifications
If configured, webhooks receive real-time updates in JSON format:
```json
{
  "url": "https://example.com/page1",
  "status": "success",
  "markdown": "# Page Title\n\nContent...",
  "timestamp": "2025-02-15T10:30:00.123456",
  "config": {
    "target_selector": "article",
    "remove_selectors": [".ads", "#popup"]
  }
}
```
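Any HTTP endpoint that accepts JSON POSTs can serve as the receiver. A minimal sketch using Flask (an assumption for illustration, not a dependency of this package; the `/endpoint` path is a placeholder):

```python
from flask import Flask, request

app = Flask(__name__)

@app.post("/endpoint")
def spiderforce_webhook():
    # Each notification carries url, status, markdown or error, timestamp, and config.
    payload = request.get_json(force=True)
    print(f"{payload['status']}: {payload['url']}")
    return "", 204

if __name__ == "__main__":
    app.run(port=8000)
```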
## Error Handling

The package handles various types of errors:
- Network errors
- Timeout errors
- Invalid URLs
- Missing content
- Service errors

All errors are:
1. Logged in the console
2. Included in the JSON report
3. Sent via webhook (if configured)
4. Available in the results list (see the sketch below)
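A minimal sketch for acting on those per-URL results, assuming the same `CrawlResult` fields (`url`, `status`, `error`) shown in the report structure:

```python
results = spider.crawl_urls(urls, config)

# Separate failures from successes and report them.
failed = [r for r in results if r.status == "failed"]
for r in failed:
    print(f"Retry candidate {r.url}: {r.error}")

print(f"{len(results) - len(failed)} of {len(results)} URLs converted successfully")
```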
## Requirements

- Python 3.11 or later
- Running SpiderForce4AI service
- Internet connection

## License

MIT License

## Credits

Created by [Peter Tam](https://petertam.pro)
spiderforce4ai-0.1.7/README.md (+254 lines, new file)

@@ -0,0 +1,254 @@

The new README.md is identical, line for line, to the Markdown body embedded in PKG-INFO above (from "# SpiderForce4AI Python Wrapper" through the Credits section).
{spiderforce4ai-0.1.5 → spiderforce4ai-0.1.7}/pyproject.toml

```diff
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "spiderforce4ai"
-version = "0.1.5"
+version = "0.1.7"
 description = "Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service"
 readme = "README.md"
 authors = [{name = "Piotr Tamulewicz", email = "pt@petertam.pro"}]
```
{spiderforce4ai-0.1.5 → spiderforce4ai-0.1.7}/spiderforce4ai/__init__.py

```diff
@@ -86,6 +86,31 @@ class CrawlConfig:
             payload["remove_selectors_regex"] = self.remove_selectors_regex
         return payload
 
+
+def _send_webhook_sync(result: CrawlResult, config: CrawlConfig) -> None:
+    """Synchronous version of webhook sender for parallel processing."""
+    if not config.webhook_url:
+        return
+
+    payload = {
+        "url": result.url,
+        "status": result.status,
+        "markdown": result.markdown if result.status == "success" else None,
+        "error": result.error if result.status == "failed" else None,
+        "timestamp": result.timestamp,
+        "config": config.to_dict()
+    }
+
+    try:
+        response = requests.post(
+            config.webhook_url,
+            json=payload,
+            timeout=config.webhook_timeout
+        )
+        response.raise_for_status()
+    except Exception as e:
+        print(f"Warning: Failed to send webhook for {result.url}: {str(e)}")
+
 # Module level function for multiprocessing
 def _process_url_parallel(args: Tuple[str, str, CrawlConfig]) -> CrawlResult:
     """Process a single URL for parallel processing."""
@@ -99,12 +124,15 @@ def _process_url_parallel(args: Tuple[str, str, CrawlConfig]) -> CrawlResult:
 
         response = requests.post(endpoint, json=payload, timeout=config.timeout)
         if response.status_code != 200:
-            return CrawlResult(
+            result = CrawlResult(
                 url=url,
                 status="failed",
                 error=f"HTTP {response.status_code}: {response.text}",
                 config=config.to_dict()
             )
+            # Send webhook for failed result
+            _send_webhook_sync(result, config)
+            return result
 
         markdown = response.text
 
@@ -114,24 +142,32 @@ def _process_url_parallel(args: Tuple[str, str, CrawlConfig]) -> CrawlResult:
         with open(filepath, 'w', encoding='utf-8') as f:
             f.write(markdown)
 
-
-        if config.request_delay:
-            time.sleep(config.request_delay)
-
-        return CrawlResult(
+        result = CrawlResult(
             url=url,
             status="success",
             markdown=markdown,
             config=config.to_dict()
         )
+
+        # Send webhook for successful result
+        _send_webhook_sync(result, config)
+
+        # Add delay if configured
+        if config.request_delay:
+            time.sleep(config.request_delay)
+
+        return result
 
     except Exception as e:
-        return CrawlResult(
+        result = CrawlResult(
             url=url,
             status="failed",
             error=str(e),
             config=config.to_dict()
         )
+        # Send webhook for error result
+        _send_webhook_sync(result, config)
+        return result
 
 class SpiderForce4AI:
     """Main class for interacting with SpiderForce4AI service."""
@@ -424,4 +460,5 @@ class SpiderForce4AI:
 
     def __exit__(self, exc_type, exc_val, exc_tb):
         """Sync context manager exit."""
-        self._executor.shutdown(wait=True)
+        self._executor.shutdown(wait=True)
+
```
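The practical effect of these `__init__.py` changes is that parallel crawls now send the same per-URL webhook notification as the other code paths, for successes, HTTP failures, and exceptions alike. A short usage sketch (the URLs and webhook endpoint are placeholders):

```python
from spiderforce4ai import SpiderForce4AI, CrawlConfig

spider = SpiderForce4AI("http://localhost:3004")
config = CrawlConfig(
    webhook_url="https://your-webhook.com/endpoint",  # one JSON POST per crawled URL
    request_delay=0.5,
)

# Each URL processed here triggers _send_webhook_sync with the
# success or failure payload documented in the README above.
results = spider.crawl_urls_parallel(
    ["https://example.com/page1", "https://example.com/page2"],
    config,
)
```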