spiderforce4ai 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl
This diff shows the changes between publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
- spiderforce4ai/__init__.py +123 -1
- {spiderforce4ai-0.1.0.dist-info → spiderforce4ai-0.1.2.dist-info}/METADATA +2 -2
- spiderforce4ai-0.1.2.dist-info/RECORD +5 -0
- spiderforce4ai-0.1.0.dist-info/RECORD +0 -5
- {spiderforce4ai-0.1.0.dist-info → spiderforce4ai-0.1.2.dist-info}/WHEEL +0 -0
- {spiderforce4ai-0.1.0.dist-info → spiderforce4ai-0.1.2.dist-info}/top_level.txt +0 -0
spiderforce4ai/__init__.py
CHANGED
@@ -20,6 +20,7 @@ from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, TaskPr
 from rich.console import Console
 import aiofiles
 import httpx
+from multiprocessing import Pool

 console = Console()

@@ -300,4 +301,125 @@ class SpiderForce4AI:

     def __exit__(self, exc_type, exc_val, exc_tb):
         """Sync context manager exit."""
-        self._executor.shutdown(wait=True)
+        self._executor.shutdown(wait=True)
+
+
+    def crawl_sitemap_parallel(self, sitemap_url: str, config: CrawlConfig) -> List[CrawlResult]:
+        """
+        Crawl sitemap URLs in parallel using multiprocessing (no asyncio required).
+        """
+        print(f"Fetching sitemap from {sitemap_url}...")
+
+        # Fetch sitemap
+        try:
+            response = requests.get(sitemap_url, timeout=config.timeout)
+            response.raise_for_status()
+            sitemap_text = response.text
+        except Exception as e:
+            print(f"Error fetching sitemap: {str(e)}")
+            raise
+
+        # Parse sitemap
+        try:
+            root = ET.fromstring(sitemap_text)
+            namespace = {'ns': root.tag.split('}')[0].strip('{')}
+            urls = [loc.text for loc in root.findall('.//ns:loc', namespace)]
+            print(f"Found {len(urls)} URLs in sitemap")
+        except Exception as e:
+            print(f"Error parsing sitemap: {str(e)}")
+            raise
+
+        def _crawl_single(url: str) -> CrawlResult:
+            try:
+                endpoint = f"{self.base_url}/convert"
+                payload = {
+                    "url": url,
+                    **config.to_dict()
+                }
+
+                response = requests.post(endpoint, json=payload, timeout=config.timeout)
+                if response.status_code != 200:
+                    return CrawlResult(
+                        url=url,
+                        status="failed",
+                        error=f"HTTP {response.status_code}: {response.text}",
+                        config=config.to_dict()
+                    )
+
+                markdown = response.text
+
+                # Save markdown if output directory is configured
+                if config.output_dir:
+                    filepath = config.output_dir / f"{slugify(url)}.md"
+                    with open(filepath, 'w', encoding='utf-8') as f:
+                        f.write(markdown)
+
+                # Add delay if configured
+                if config.request_delay:
+                    time.sleep(config.request_delay)
+
+                return CrawlResult(
+                    url=url,
+                    status="success",
+                    markdown=markdown,
+                    config=config.to_dict()
+                )
+
+            except Exception as e:
+                return CrawlResult(
+                    url=url,
+                    status="failed",
+                    error=str(e),
+                    config=config.to_dict()
+                )
+
+        # Create process pool and execute crawls
+        results = []
+        with Pool(processes=config.max_concurrent_requests) as pool:
+            with Progress(
+                SpinnerColumn(),
+                TextColumn("[progress.description]{task.description}"),
+                BarColumn(),
+                TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
+                TextColumn("({task.completed}/{task.total})"),
+            ) as progress:
+                task = progress.add_task("Crawling URLs...", total=len(urls))
+
+                for result in pool.imap_unordered(_crawl_single, urls):
+                    results.append(result)
+                    progress.update(task, advance=1)
+                    status = "✓" if result.status == "success" else "✗"
+                    progress.description = f"Last: {status} {result.url}"
+
+        # Save final report
+        if config.report_file:
+            self._save_report_sync(results, config)
+            print(f"\nReport saved to: {config.report_file}")
+
+        # Print summary
+        successful = len([r for r in results if r.status == "success"])
+        failed = len([r for r in results if r.status == "failed"])
+        print(f"\nCrawling completed:")
+        print(f"✓ Successful: {successful}")
+        print(f"✗ Failed: {failed}")
+
+        return results
+
+    def _save_report_sync(self, results: List[CrawlResult], config: CrawlConfig) -> None:
+        """Save crawl report synchronously."""
+        report = {
+            "timestamp": datetime.now().isoformat(),
+            "config": config.to_dict(),
+            "results": {
+                "successful": [asdict(r) for r in results if r.status == "success"],
+                "failed": [asdict(r) for r in results if r.status == "failed"]
+            },
+            "summary": {
+                "total": len(results),
+                "successful": len([r for r in results if r.status == "success"]),
+                "failed": len([r for r in results if r.status == "failed"])
+            }
+        }
+
+        with open(config.report_file, 'w', encoding='utf-8') as f:
+            json.dump(report, f, indent=2)
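Note on the added code: crawl_sitemap_parallel fetches the sitemap, extracts every <loc> URL, and POSTs each one to the service's /convert endpoint from a multiprocessing Pool, optionally writing per-URL Markdown files and a JSON report. A minimal usage sketch follows; the SpiderForce4AI and CrawlConfig constructor signatures are not part of this diff, so the arguments shown (base URL, max_concurrent_requests, output_dir, report_file, request_delay, timeout) are assumptions based on the attributes the new method reads.

from pathlib import Path
from spiderforce4ai import SpiderForce4AI, CrawlConfig

# Assumed constructor arguments -- not shown in this diff.
service = SpiderForce4AI("http://localhost:3004")   # base_url of the conversion service
config = CrawlConfig(
    max_concurrent_requests=4,                # number of worker processes in the Pool
    output_dir=Path("./markdown"),            # one <slug>.md file per successfully crawled URL
    report_file=Path("./crawl_report.json"),  # written by _save_report_sync
    request_delay=0.5,                        # per-request pause inside each worker, in seconds
    timeout=30,                               # used for the sitemap fetch and each /convert call
)

results = service.crawl_sitemap_parallel("https://example.com/sitemap.xml", config)
for r in results:
    if r.status == "failed":
        print(f"{r.url}: {r.error}")

One caveat worth flagging: _crawl_single is defined inside the method, and multiprocessing.Pool pickles the callable it dispatches to worker processes; locally defined functions are generally not picklable, so this path may raise a pickling error at runtime and should be verified against the published wheel.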
{spiderforce4ai-0.1.0.dist-info → spiderforce4ai-0.1.2.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: spiderforce4ai
-Version: 0.1.0
+Version: 0.1.2
 Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service
 Home-page: https://petertam.pro
 Author: Piotr Tamulewicz
@@ -22,7 +22,7 @@ Dynamic: author
 Dynamic: home-page
 Dynamic: requires-python

-# SpiderForce4AI Python Wrapper
+# SpiderForce4AI Python Wrapper (Jina ai reader, fFrecrawl alternative)

 A Python wrapper for SpiderForce4AI - a powerful HTML-to-Markdown conversion service. This package provides an easy-to-use interface for crawling websites and converting their content to clean Markdown format.

spiderforce4ai-0.1.2.dist-info/RECORD
ADDED
@@ -0,0 +1,5 @@
+spiderforce4ai/__init__.py,sha256=ZWt8m5r5tWmjHNE4x45yI-k522_tVCUvEPth-3Yulfg,16633
+spiderforce4ai-0.1.2.dist-info/METADATA,sha256=DmzqJ_eAXf4XEL3b8ZjOgaqIFwwz2DVHQheyBwXTSGY,6214
+spiderforce4ai-0.1.2.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+spiderforce4ai-0.1.2.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
+spiderforce4ai-0.1.2.dist-info/RECORD,,
spiderforce4ai-0.1.0.dist-info/RECORD
REMOVED
@@ -1,5 +0,0 @@
-spiderforce4ai/__init__.py,sha256=TTUtXHp4QvFLhh4vgh0bCvYAyJEAZ-8xguoBNVcQUZI,11815
-spiderforce4ai-0.1.0.dist-info/METADATA,sha256=X2Y8tb-sgJ_8fnilV9yHA_qM3xE1OQmTZPtXohT2nsg,6174
-spiderforce4ai-0.1.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
-spiderforce4ai-0.1.0.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
-spiderforce4ai-0.1.0.dist-info/RECORD,,
{spiderforce4ai-0.1.0.dist-info → spiderforce4ai-0.1.2.dist-info}/WHEEL
File without changes

{spiderforce4ai-0.1.0.dist-info → spiderforce4ai-0.1.2.dist-info}/top_level.txt
File without changes