spiderforce4ai 0.1.3__tar.gz → 0.1.5__tar.gz
- {spiderforce4ai-0.1.3 → spiderforce4ai-0.1.5}/PKG-INFO +1 -1
- {spiderforce4ai-0.1.3 → spiderforce4ai-0.1.5}/pyproject.toml +1 -1
- {spiderforce4ai-0.1.3 → spiderforce4ai-0.1.5}/setup.py +1 -1
- {spiderforce4ai-0.1.3 → spiderforce4ai-0.1.5}/spiderforce4ai/__init__.py +89 -87
- {spiderforce4ai-0.1.3 → spiderforce4ai-0.1.5}/spiderforce4ai.egg-info/PKG-INFO +1 -1
- {spiderforce4ai-0.1.3 → spiderforce4ai-0.1.5}/README.md +0 -0
- {spiderforce4ai-0.1.3 → spiderforce4ai-0.1.5}/setup.cfg +0 -0
- {spiderforce4ai-0.1.3 → spiderforce4ai-0.1.5}/spiderforce4ai.egg-info/SOURCES.txt +0 -0
- {spiderforce4ai-0.1.3 → spiderforce4ai-0.1.5}/spiderforce4ai.egg-info/dependency_links.txt +0 -0
- {spiderforce4ai-0.1.3 → spiderforce4ai-0.1.5}/spiderforce4ai.egg-info/requires.txt +0 -0
- {spiderforce4ai-0.1.3 → spiderforce4ai-0.1.5}/spiderforce4ai.egg-info/top_level.txt +0 -0
--- spiderforce4ai-0.1.3/pyproject.toml
+++ spiderforce4ai-0.1.5/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "spiderforce4ai"
-version = "0.1.3"
+version = "0.1.5"
 description = "Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service"
 readme = "README.md"
 authors = [{name = "Piotr Tamulewicz", email = "pt@petertam.pro"}]
--- spiderforce4ai-0.1.3/spiderforce4ai/__init__.py
+++ spiderforce4ai-0.1.5/spiderforce4ai/__init__.py
@@ -1,13 +1,10 @@
-"""
-SpiderForce4AI Python Wrapper
-A Python package for interacting with SpiderForce4AI HTML-to-Markdown conversion service.
-"""
+# spiderforce4ai/__init__.py
 
 import asyncio
 import aiohttp
 import json
 import logging
-from typing import List, Dict, Union, Optional
+from typing import List, Dict, Union, Optional, Tuple
 from dataclasses import dataclass, asdict
 from urllib.parse import urljoin, urlparse
 from pathlib import Path
@@ -20,6 +17,7 @@ from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, TaskPr
 from rich.console import Console
 import aiofiles
 import httpx
+import requests
 from multiprocessing import Pool
 
 console = Console()
@@ -88,6 +86,53 @@ class CrawlConfig:
             payload["remove_selectors_regex"] = self.remove_selectors_regex
         return payload
 
+# Module level function for multiprocessing
+def _process_url_parallel(args: Tuple[str, str, CrawlConfig]) -> CrawlResult:
+    """Process a single URL for parallel processing."""
+    url, base_url, config = args
+    try:
+        endpoint = f"{base_url}/convert"
+        payload = {
+            "url": url,
+            **config.to_dict()
+        }
+
+        response = requests.post(endpoint, json=payload, timeout=config.timeout)
+        if response.status_code != 200:
+            return CrawlResult(
+                url=url,
+                status="failed",
+                error=f"HTTP {response.status_code}: {response.text}",
+                config=config.to_dict()
+            )
+
+        markdown = response.text
+
+        # Save markdown if output directory is configured
+        if config.output_dir:
+            filepath = config.output_dir / f"{slugify(url)}.md"
+            with open(filepath, 'w', encoding='utf-8') as f:
+                f.write(markdown)
+
+        # Add delay if configured
+        if config.request_delay:
+            time.sleep(config.request_delay)
+
+        return CrawlResult(
+            url=url,
+            status="success",
+            markdown=markdown,
+            config=config.to_dict()
+        )
+
+    except Exception as e:
+        return CrawlResult(
+            url=url,
+            status="failed",
+            error=str(e),
+            config=config.to_dict()
+        )
+
 class SpiderForce4AI:
     """Main class for interacting with SpiderForce4AI service."""
 
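Note on the new `_process_url_parallel` helper above: it is defined at module level rather than as a method on `SpiderForce4AI`, most likely because `multiprocessing.Pool` has to pickle the callable and its arguments when dispatching work to child processes, and a bound method on an object holding live network sessions would not pickle cleanly. Below is a minimal, self-contained sketch of that pattern; the worker function, URLs, and service address are illustrative stand-ins, not part of the package.

```python
# Minimal sketch of the module-level-worker pattern used by _process_url_parallel.
# Everything handed to the pool (the function and each args tuple) must be picklable,
# which is why the worker lives at module scope. Names here are illustrative only.
from multiprocessing import Pool
from typing import Tuple


def _convert_one(args: Tuple[str, str]) -> str:
    """Unpack (url, base_url) and return a result string (stand-in for the HTTP call)."""
    url, base_url = args
    return f"would POST {{'url': '{url}'}} to {base_url}/convert"


if __name__ == "__main__":
    base_url = "http://localhost:3004"  # assumed service address, adjust as needed
    urls = ["https://example.com/a", "https://example.com/b"]
    tasks = [(u, base_url) for u in urls]

    with Pool(processes=2) as pool:
        # imap_unordered yields results as soon as each worker finishes,
        # mirroring how crawl_sitemap_parallel consumes them in the diff above.
        for result in pool.imap_unordered(_convert_one, tasks):
            print(result)
```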
@@ -140,6 +185,25 @@ class SpiderForce4AI:
         except Exception as e:
             console.print(f"[yellow]Warning: Failed to send webhook for {result.url}: {str(e)}[/yellow]")
 
+    def _save_report_sync(self, results: List[CrawlResult], config: CrawlConfig) -> None:
+        """Save crawl report synchronously."""
+        report = {
+            "timestamp": datetime.now().isoformat(),
+            "config": config.to_dict(),
+            "results": {
+                "successful": [asdict(r) for r in results if r.status == "success"],
+                "failed": [asdict(r) for r in results if r.status == "failed"]
+            },
+            "summary": {
+                "total": len(results),
+                "successful": len([r for r in results if r.status == "success"]),
+                "failed": len([r for r in results if r.status == "failed"])
+            }
+        }
+
+        with open(config.report_file, 'w', encoding='utf-8') as f:
+            json.dump(report, f, indent=2)
+
     async def _save_report(self, config: CrawlConfig):
         """Save crawl report to JSON file."""
         if not config.report_file:
@@ -286,28 +350,8 @@ class SpiderForce4AI:
         """Synchronous version of crawl_sitemap_async."""
         return asyncio.run(self.crawl_sitemap_async(sitemap_url, config))
 
-    async def __aenter__(self):
-        """Async context manager entry."""
-        await self._ensure_session()
-        return self
-
-    async def __aexit__(self, exc_type, exc_val, exc_tb):
-        """Async context manager exit."""
-        await self._close_session()
-
-    def __enter__(self):
-        """Sync context manager entry."""
-        return self
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        """Sync context manager exit."""
-        self._executor.shutdown(wait=True)
-
-
     def crawl_sitemap_parallel(self, sitemap_url: str, config: CrawlConfig) -> List[CrawlResult]:
-        """
-        Crawl sitemap URLs in parallel using multiprocessing (no asyncio required).
-        """
+        """Crawl sitemap URLs in parallel using multiprocessing (no asyncio required)."""
         print(f"Fetching sitemap from {sitemap_url}...")
 
         # Fetch sitemap
@@ -329,52 +373,12 @@ class SpiderForce4AI:
             print(f"Error parsing sitemap: {str(e)}")
             raise
 
-
-
-            endpoint = f"{self.base_url}/convert"
-            payload = {
-                "url": url,
-                **config.to_dict()
-            }
-
-            response = requests.post(endpoint, json=payload, timeout=config.timeout)
-            if response.status_code != 200:
-                return CrawlResult(
-                    url=url,
-                    status="failed",
-                    error=f"HTTP {response.status_code}: {response.text}",
-                    config=config.to_dict()
-                )
-
-            markdown = response.text
-
-            # Save markdown if output directory is configured
-            if config.output_dir:
-                filepath = config.output_dir / f"{slugify(url)}.md"
-                with open(filepath, 'w', encoding='utf-8') as f:
-                    f.write(markdown)
-
-            # Add delay if configured
-            if config.request_delay:
-                time.sleep(config.request_delay)
-
-            return CrawlResult(
-                url=url,
-                status="success",
-                markdown=markdown,
-                config=config.to_dict()
-            )
-
-        except Exception as e:
-            return CrawlResult(
-                url=url,
-                status="failed",
-                error=str(e),
-                config=config.to_dict()
-            )
+        # Prepare arguments for parallel processing
+        process_args = [(url, self.base_url, config) for url in urls]
 
         # Create process pool and execute crawls
         results = []
+
         with Pool(processes=config.max_concurrent_requests) as pool:
             with Progress(
                 SpinnerColumn(),
@@ -385,7 +389,7 @@ class SpiderForce4AI:
             ) as progress:
                 task = progress.add_task("Crawling URLs...", total=len(urls))
 
-                for result in pool.imap_unordered(
+                for result in pool.imap_unordered(_process_url_parallel, process_args):
                     results.append(result)
                     progress.update(task, advance=1)
                     status = "✓" if result.status == "success" else "✗"
@@ -405,21 +409,19 @@ class SpiderForce4AI:
 
         return results
 
-    def
-        """
-
-
-            "config": config.to_dict(),
-            "results": {
-                "successful": [asdict(r) for r in results if r.status == "success"],
-                "failed": [asdict(r) for r in results if r.status == "failed"]
-            },
-            "summary": {
-                "total": len(results),
-                "successful": len([r for r in results if r.status == "success"]),
-                "failed": len([r for r in results if r.status == "failed"])
-            }
-        }
+    async def __aenter__(self):
+        """Async context manager entry."""
+        await self._ensure_session()
+        return self
 
-
-
+    async def __aexit__(self, exc_type, exc_val, exc_tb):
+        """Async context manager exit."""
+        await self._close_session()
+
+    def __enter__(self):
+        """Sync context manager entry."""
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        """Sync context manager exit."""
+        self._executor.shutdown(wait=True)
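For context, a hedged usage sketch of the parallel crawl path touched by this release follows. The class, method, and config field names all appear in the diff above, but the exact `CrawlConfig` constructor signature, the `SpiderForce4AI` constructor argument, and the service URL are assumptions, not confirmed by this diff.

```python
# Hedged usage sketch for spiderforce4ai 0.1.5's multiprocessing-based sitemap crawl.
# CrawlConfig is a dataclass in the diff above; the keyword arguments below are assumed
# to be its init fields because the code reads them (config.max_concurrent_requests,
# config.output_dir, config.report_file, config.request_delay, config.timeout).
from pathlib import Path

from spiderforce4ai import SpiderForce4AI, CrawlConfig

config = CrawlConfig(
    max_concurrent_requests=4,          # size of the multiprocessing Pool
    output_dir=Path("./markdown"),      # one <slug>.md file per converted URL
    report_file=Path("./report.json"),  # JSON report of successful/failed results
    request_delay=0.5,                  # optional pause after each request (seconds)
    timeout=30,                         # per-request timeout passed to requests.post
)

# Base URL of a running SpiderForce4AI service (placeholder address).
client = SpiderForce4AI("http://localhost:3004")

# Multiprocessing-based crawl: no asyncio event loop required.
results = client.crawl_sitemap_parallel("https://example.com/sitemap.xml", config)
print(sum(r.status == "success" for r in results), "of", len(results), "pages converted")
```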