spiderforce4ai 0.1.4__py3-none-any.whl → 0.1.5__py3-none-any.whl
This diff compares two publicly available versions of the package as released to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in the public registry.
- spiderforce4ai/__init__.py +89 -87
- {spiderforce4ai-0.1.4.dist-info → spiderforce4ai-0.1.5.dist-info}/METADATA +1 -1
- spiderforce4ai-0.1.5.dist-info/RECORD +5 -0
- spiderforce4ai-0.1.4.dist-info/RECORD +0 -5
- {spiderforce4ai-0.1.4.dist-info → spiderforce4ai-0.1.5.dist-info}/WHEEL +0 -0
- {spiderforce4ai-0.1.4.dist-info → spiderforce4ai-0.1.5.dist-info}/top_level.txt +0 -0
spiderforce4ai/__init__.py
CHANGED
@@ -1,13 +1,10 @@
-"""
-SpiderForce4AI Python Wrapper
-A Python package for interacting with SpiderForce4AI HTML-to-Markdown conversion service.
-"""
+# spiderforce4ai/__init__.py
 
 import asyncio
 import aiohttp
 import json
 import logging
-from typing import List, Dict, Union, Optional
+from typing import List, Dict, Union, Optional, Tuple
 from dataclasses import dataclass, asdict
 from urllib.parse import urljoin, urlparse
 from pathlib import Path
@@ -20,6 +17,7 @@ from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, TaskPr
 from rich.console import Console
 import aiofiles
 import httpx
+import requests
 from multiprocessing import Pool
 
 console = Console()
@@ -88,6 +86,53 @@ class CrawlConfig:
             payload["remove_selectors_regex"] = self.remove_selectors_regex
         return payload
 
+# Module level function for multiprocessing
+def _process_url_parallel(args: Tuple[str, str, CrawlConfig]) -> CrawlResult:
+    """Process a single URL for parallel processing."""
+    url, base_url, config = args
+    try:
+        endpoint = f"{base_url}/convert"
+        payload = {
+            "url": url,
+            **config.to_dict()
+        }
+
+        response = requests.post(endpoint, json=payload, timeout=config.timeout)
+        if response.status_code != 200:
+            return CrawlResult(
+                url=url,
+                status="failed",
+                error=f"HTTP {response.status_code}: {response.text}",
+                config=config.to_dict()
+            )
+
+        markdown = response.text
+
+        # Save markdown if output directory is configured
+        if config.output_dir:
+            filepath = config.output_dir / f"{slugify(url)}.md"
+            with open(filepath, 'w', encoding='utf-8') as f:
+                f.write(markdown)
+
+        # Add delay if configured
+        if config.request_delay:
+            time.sleep(config.request_delay)
+
+        return CrawlResult(
+            url=url,
+            status="success",
+            markdown=markdown,
+            config=config.to_dict()
+        )
+
+    except Exception as e:
+        return CrawlResult(
+            url=url,
+            status="failed",
+            error=str(e),
+            config=config.to_dict()
+        )
+
 class SpiderForce4AI:
     """Main class for interacting with SpiderForce4AI service."""
 
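The new `_process_url_parallel` helper sits at module level rather than on the class because `multiprocessing.Pool` pickles the callable and its arguments before sending them to worker processes; a lambda, a nested function, or a method bound to an instance that holds open sessions would not pickle cleanly. A minimal, self-contained sketch of the same pattern, independent of this package (the worker, base URL, and tuple layout below are illustrative only):

```python
# Illustrative sketch only, not code from spiderforce4ai: a module-level worker
# that takes a single tuple, dispatched through Pool.imap_unordered.
from multiprocessing import Pool
from typing import Tuple

def fetch_one(args: Tuple[str, str, int]) -> str:
    """Toy worker: unpack (url, base_url, timeout) and report what it would do."""
    url, base_url, timeout = args
    return f"would POST {base_url}/convert for {url} (timeout={timeout}s)"

if __name__ == "__main__":
    urls = ["https://example.com/a", "https://example.com/b"]
    tasks = [(u, "http://sf4ai.local", 30) for u in urls]  # hypothetical base URL
    with Pool(processes=2) as pool:
        for line in pool.imap_unordered(fetch_one, tasks):
            print(line)
```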
@@ -140,6 +185,25 @@ class SpiderForce4AI:
         except Exception as e:
             console.print(f"[yellow]Warning: Failed to send webhook for {result.url}: {str(e)}[/yellow]")
 
+    def _save_report_sync(self, results: List[CrawlResult], config: CrawlConfig) -> None:
+        """Save crawl report synchronously."""
+        report = {
+            "timestamp": datetime.now().isoformat(),
+            "config": config.to_dict(),
+            "results": {
+                "successful": [asdict(r) for r in results if r.status == "success"],
+                "failed": [asdict(r) for r in results if r.status == "failed"]
+            },
+            "summary": {
+                "total": len(results),
+                "successful": len([r for r in results if r.status == "success"]),
+                "failed": len([r for r in results if r.status == "failed"])
+            }
+        }
+
+        with open(config.report_file, 'w', encoding='utf-8') as f:
+            json.dump(report, f, indent=2)
+
     async def _save_report(self, config: CrawlConfig):
         """Save crawl report to JSON file."""
         if not config.report_file:
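The synchronous report writer mirrors the async `_save_report`, so the JSON layout on disk is the one built above: top-level `timestamp`, `config`, `results.successful` / `results.failed` (each entry an `asdict(CrawlResult)`), and a `summary` block. A small sketch of reading such a report back; the filename is an illustrative stand-in for whatever `config.report_file` points at:

```python
import json

# "crawl_report.json" is a hypothetical value for config.report_file.
with open("crawl_report.json", "r", encoding="utf-8") as f:
    report = json.load(f)

print(report["timestamp"], report["summary"])  # totals per the "summary" block
for failed in report["results"]["failed"]:
    print(failed["url"], failed["error"])      # each entry is an asdict(CrawlResult)
```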
@@ -286,28 +350,8 @@ class SpiderForce4AI:
         """Synchronous version of crawl_sitemap_async."""
         return asyncio.run(self.crawl_sitemap_async(sitemap_url, config))
 
-    async def __aenter__(self):
-        """Async context manager entry."""
-        await self._ensure_session()
-        return self
-
-    async def __aexit__(self, exc_type, exc_val, exc_tb):
-        """Async context manager exit."""
-        await self._close_session()
-
-    def __enter__(self):
-        """Sync context manager entry."""
-        return self
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        """Sync context manager exit."""
-        self._executor.shutdown(wait=True)
-
-
     def crawl_sitemap_parallel(self, sitemap_url: str, config: CrawlConfig) -> List[CrawlResult]:
-        """
-        Crawl sitemap URLs in parallel using multiprocessing (no asyncio required).
-        """
+        """Crawl sitemap URLs in parallel using multiprocessing (no asyncio required)."""
         print(f"Fetching sitemap from {sitemap_url}...")
 
         # Fetch sitemap
@@ -329,52 +373,12 @@ class SpiderForce4AI:
             print(f"Error parsing sitemap: {str(e)}")
             raise
 
-
-
-        endpoint = f"{self.base_url}/convert"
-        payload = {
-            "url": url,
-            **config.to_dict()
-        }
-
-        response = requests.post(endpoint, json=payload, timeout=config.timeout)
-        if response.status_code != 200:
-            return CrawlResult(
-                url=url,
-                status="failed",
-                error=f"HTTP {response.status_code}: {response.text}",
-                config=config.to_dict()
-            )
-
-        markdown = response.text
-
-        # Save markdown if output directory is configured
-        if config.output_dir:
-            filepath = config.output_dir / f"{slugify(url)}.md"
-            with open(filepath, 'w', encoding='utf-8') as f:
-                f.write(markdown)
-
-        # Add delay if configured
-        if config.request_delay:
-            time.sleep(config.request_delay)
-
-        return CrawlResult(
-            url=url,
-            status="success",
-            markdown=markdown,
-            config=config.to_dict()
-        )
-
-        except Exception as e:
-            return CrawlResult(
-                url=url,
-                status="failed",
-                error=str(e),
-                config=config.to_dict()
-            )
+        # Prepare arguments for parallel processing
+        process_args = [(url, self.base_url, config) for url in urls]
 
         # Create process pool and execute crawls
         results = []
+
         with Pool(processes=config.max_concurrent_requests) as pool:
             with Progress(
                 SpinnerColumn(),
@@ -385,7 +389,7 @@ class SpiderForce4AI:
             ) as progress:
                 task = progress.add_task("Crawling URLs...", total=len(urls))
 
-                for result in pool.imap_unordered(
+                for result in pool.imap_unordered(_process_url_parallel, process_args):
                     results.append(result)
                     progress.update(task, advance=1)
                     status = "✓" if result.status == "success" else "✗"
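This is the substantive fix in 0.1.5: the 0.1.4 file shipped a truncated `pool.imap_unordered(` call (alongside the orphaned request-handling code removed in the previous hunk), which 0.1.5 replaces with a complete call to the new module-level worker over the `(url, base_url, config)` tuples prepared above. A hedged usage sketch of the repaired synchronous path; the constructor argument and the `CrawlConfig` field names below are assumptions based on the attributes referenced in this diff, not signatures shown here:

```python
# Minimal usage sketch under stated assumptions; not a documented example.
from pathlib import Path
from spiderforce4ai import SpiderForce4AI, CrawlConfig

spider = SpiderForce4AI("http://localhost:3004")   # hypothetical service base URL
config = CrawlConfig(
    max_concurrent_requests=4,                     # size of the multiprocessing Pool
    timeout=30,
    output_dir=Path("markdown"),                   # one slugified .md file per URL
    report_file=Path("crawl_report.json"),
)

# Runs on multiprocessing + requests only; no asyncio event loop is needed.
results = spider.crawl_sitemap_parallel("https://example.com/sitemap.xml", config)
print(sum(r.status == "success" for r in results), "pages converted")
```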
@@ -405,21 +409,19 @@ class SpiderForce4AI:
 
         return results
 
-    def
-        """
-
-
-            "config": config.to_dict(),
-            "results": {
-                "successful": [asdict(r) for r in results if r.status == "success"],
-                "failed": [asdict(r) for r in results if r.status == "failed"]
-            },
-            "summary": {
-                "total": len(results),
-                "successful": len([r for r in results if r.status == "success"]),
-                "failed": len([r for r in results if r.status == "failed"])
-            }
-        }
+    async def __aenter__(self):
+        """Async context manager entry."""
+        await self._ensure_session()
+        return self
 
-
-
+    async def __aexit__(self, exc_type, exc_val, exc_tb):
+        """Async context manager exit."""
+        await self._close_session()
+
+    def __enter__(self):
+        """Sync context manager entry."""
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        """Sync context manager exit."""
+        self._executor.shutdown(wait=True)
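The context-manager methods removed in the earlier hunk (where they had been followed by a truncated `def` and leftover report-dict fragments) reappear here unchanged, now placed after `crawl_sitemap_parallel` at the end of the class. A sketch of the async usage they support; the constructor argument and a default-constructed `CrawlConfig()` are assumptions, since neither signature appears in this diff:

```python
# Illustrative sketch only, assuming the constructor takes the service base URL.
import asyncio
from spiderforce4ai import SpiderForce4AI, CrawlConfig

async def main() -> None:
    # __aenter__ opens the shared HTTP session, __aexit__ closes it on the way out.
    async with SpiderForce4AI("http://localhost:3004") as spider:   # hypothetical URL
        results = await spider.crawl_sitemap_async(
            "https://example.com/sitemap.xml", CrawlConfig()        # defaults assumed
        )
        print(len(results), "results")

if __name__ == "__main__":
    asyncio.run(main())
```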
spiderforce4ai-0.1.5.dist-info/RECORD
ADDED
@@ -0,0 +1,5 @@
+spiderforce4ai/__init__.py,sha256=i1lHYILqFG_Eld0ZCbBdK5F_Jk0zYr_60vS46AYZfTM,16496
+spiderforce4ai-0.1.5.dist-info/METADATA,sha256=Fm5H-qr4CBfJAVKXyJXsABYib_Vhvn2iUb6T6qSidHg,6214
+spiderforce4ai-0.1.5.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+spiderforce4ai-0.1.5.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
+spiderforce4ai-0.1.5.dist-info/RECORD,,
spiderforce4ai-0.1.4.dist-info/RECORD
DELETED
@@ -1,5 +0,0 @@
-spiderforce4ai/__init__.py,sha256=ZWt8m5r5tWmjHNE4x45yI-k522_tVCUvEPth-3Yulfg,16633
-spiderforce4ai-0.1.4.dist-info/METADATA,sha256=olJX54IVWgw92JpagtLnH_wOERNSuBWXbOjw8uSTFq4,6214
-spiderforce4ai-0.1.4.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
-spiderforce4ai-0.1.4.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
-spiderforce4ai-0.1.4.dist-info/RECORD,,
{spiderforce4ai-0.1.4.dist-info → spiderforce4ai-0.1.5.dist-info}/WHEEL
File without changes
{spiderforce4ai-0.1.4.dist-info → spiderforce4ai-0.1.5.dist-info}/top_level.txt
File without changes