spiderforce4ai 0.1.0.tar.gz → 0.1.3.tar.gz

{spiderforce4ai-0.1.0 → spiderforce4ai-0.1.3}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: spiderforce4ai
-Version: 0.1.0
+Version: 0.1.3
 Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service
 Home-page: https://petertam.pro
 Author: Piotr Tamulewicz
@@ -22,7 +22,7 @@ Dynamic: author
 Dynamic: home-page
 Dynamic: requires-python
 
-# SpiderForce4AI Python Wrapper
+# SpiderForce4AI Python Wrapper (Jina AI Reader, Firecrawl alternative)
 
 A Python wrapper for SpiderForce4AI - a powerful HTML-to-Markdown conversion service. This package provides an easy-to-use interface for crawling websites and converting their content to clean Markdown format.
 
{spiderforce4ai-0.1.0 → spiderforce4ai-0.1.3}/README.md
@@ -1,4 +1,4 @@
-# SpiderForce4AI Python Wrapper
+# SpiderForce4AI Python Wrapper (Jina AI Reader, Firecrawl alternative)
 
 A Python wrapper for SpiderForce4AI - a powerful HTML-to-Markdown conversion service. This package provides an easy-to-use interface for crawling websites and converting their content to clean Markdown format.
 
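For context, here is a minimal usage sketch of the parallel sitemap crawl that 0.1.3 adds (shown in the module hunk below). Only `crawl_sitemap_parallel(sitemap_url, config)` and the `CrawlConfig` fields it reads are visible in this diff; the `SpiderForce4AI` constructor argument and the `CrawlConfig` keyword names are assumptions inferred from how the code uses them.

```python
# Hypothetical usage sketch; the constructor argument and CrawlConfig keyword
# names are inferred from the 0.1.3 diff, not from documented API.
from pathlib import Path
from spiderforce4ai import SpiderForce4AI, CrawlConfig

spider = SpiderForce4AI("http://localhost:3004")  # assumed service base URL

output_dir = Path("./markdown")
output_dir.mkdir(parents=True, exist_ok=True)  # the crawler writes into output_dir but does not create it

config = CrawlConfig(
    max_concurrent_requests=4,        # worker pool size
    request_delay=0.5,                # seconds slept after each request
    timeout=30,                       # per-request timeout in seconds
    output_dir=output_dir,            # one slugified .md file per URL
    report_file=Path("./crawl_report.json"),
)

results = spider.crawl_sitemap_parallel("https://example.com/sitemap.xml", config)
for r in results:
    print(r.status, r.url)
```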
{spiderforce4ai-0.1.0 → spiderforce4ai-0.1.3}/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "spiderforce4ai"
-version = "0.1.0"
+version = "0.1.3"
 description = "Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service"
 readme = "README.md"
 authors = [{name = "Piotr Tamulewicz", email = "pt@petertam.pro"}]
{spiderforce4ai-0.1.0 → spiderforce4ai-0.1.3}/spiderforce4ai/__init__.py
@@ -20,6 +20,7 @@ from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, TaskPr
 from rich.console import Console
 import aiofiles
 import httpx
+from multiprocessing.pool import ThreadPool
 
 console = Console()
 
@@ -300,4 +301,126 @@ class SpiderForce4AI:
 
     def __exit__(self, exc_type, exc_val, exc_tb):
         """Sync context manager exit."""
-        self._executor.shutdown(wait=True)
+        self._executor.shutdown(wait=True)
+
+
+    def crawl_sitemap_parallel(self, sitemap_url: str, config: CrawlConfig) -> List[CrawlResult]:
+        """
+        Crawl sitemap URLs in parallel using a thread pool (no asyncio required).
+        """
+        print(f"Fetching sitemap from {sitemap_url}...")
+
+        # Fetch sitemap
+        try:
+            response = requests.get(sitemap_url, timeout=config.timeout)
+            response.raise_for_status()
+            sitemap_text = response.text
+        except Exception as e:
+            print(f"Error fetching sitemap: {str(e)}")
+            raise
+
+        # Parse sitemap, deriving the XML namespace from the root tag
+        try:
+            root = ET.fromstring(sitemap_text)
+            namespace = {'ns': root.tag.split('}')[0].strip('{')}
+            urls = [loc.text for loc in root.findall('.//ns:loc', namespace)]
+            print(f"Found {len(urls)} URLs in sitemap")
+        except Exception as e:
+            print(f"Error parsing sitemap: {str(e)}")
+            raise
+
+        def _crawl_single(url: str) -> CrawlResult:
+            try:
+                endpoint = f"{self.base_url}/convert"
+                payload = {
+                    "url": url,
+                    **config.to_dict()
+                }
+
+                response = requests.post(endpoint, json=payload, timeout=config.timeout)
+                if response.status_code != 200:
+                    return CrawlResult(
+                        url=url,
+                        status="failed",
+                        error=f"HTTP {response.status_code}: {response.text}",
+                        config=config.to_dict()
+                    )
+
+                markdown = response.text
+
+                # Save markdown if output directory is configured
+                if config.output_dir:
+                    filepath = config.output_dir / f"{slugify(url)}.md"
+                    with open(filepath, 'w', encoding='utf-8') as f:
+                        f.write(markdown)
+
+                # Add delay if configured
+                if config.request_delay:
+                    time.sleep(config.request_delay)
+
+                return CrawlResult(
+                    url=url,
+                    status="success",
+                    markdown=markdown,
+                    config=config.to_dict()
+                )
+
+            except Exception as e:
+                return CrawlResult(
+                    url=url,
+                    status="failed",
+                    error=str(e),
+                    config=config.to_dict()
+                )
+
+        # Crawl in parallel. A thread pool (not a process Pool) is used here
+        # because _crawl_single is a local closure, which multiprocessing
+        # cannot pickle; the work is I/O-bound, so threads parallelize it well.
+        results = []
+        with ThreadPool(processes=config.max_concurrent_requests) as pool:
+            with Progress(
+                SpinnerColumn(),
+                TextColumn("[progress.description]{task.description}"),
+                BarColumn(),
+                TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
+                TextColumn("({task.completed}/{task.total})"),
+            ) as progress:
+                task = progress.add_task("Crawling URLs...", total=len(urls))
+
+                for result in pool.imap_unordered(_crawl_single, urls):
+                    results.append(result)
+                    status = "✓" if result.status == "success" else "✗"
+                    progress.update(task, advance=1, description=f"Last: {status} {result.url}")
+
+        # Save final report
+        if config.report_file:
+            self._save_report_sync(results, config)
+            print(f"\nReport saved to: {config.report_file}")
+
+        # Print summary
+        successful = len([r for r in results if r.status == "success"])
+        failed = len([r for r in results if r.status == "failed"])
+        print("\nCrawling completed:")
+        print(f"✓ Successful: {successful}")
+        print(f"✗ Failed: {failed}")
+
+        return results
+
+    def _save_report_sync(self, results: List[CrawlResult], config: CrawlConfig) -> None:
+        """Save crawl report synchronously."""
+        report = {
+            "timestamp": datetime.now().isoformat(),
+            "config": config.to_dict(),
+            "results": {
+                "successful": [asdict(r) for r in results if r.status == "success"],
+                "failed": [asdict(r) for r in results if r.status == "failed"]
+            },
+            "summary": {
+                "total": len(results),
+                "successful": len([r for r in results if r.status == "success"]),
+                "failed": len([r for r in results if r.status == "failed"])
+            }
+        }
+
+        with open(config.report_file, 'w', encoding='utf-8') as f:
+            json.dump(report, f, indent=2)
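Two details of the code above are worth unpacking. First, the sitemap parser derives the XML namespace from the root tag instead of hard-coding the sitemaps.org URI, so it tolerates whatever namespace the server declares. A self-contained sketch of that technique (the sitemap literal is illustrative):

```python
import xml.etree.ElementTree as ET

# Illustrative sitemap; any namespace URI declared by the server would work.
sitemap_text = """<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url><loc>https://example.com/</loc></url>
  <url><loc>https://example.com/about</loc></url>
</urlset>"""

root = ET.fromstring(sitemap_text)
# root.tag is '{http://www.sitemaps.org/schemas/sitemap/0.9}urlset', so
# splitting on '}' and stripping '{' recovers the namespace URI.
namespace = {'ns': root.tag.split('}')[0].strip('{')}
urls = [loc.text for loc in root.findall('.//ns:loc', namespace)]
print(urls)  # ['https://example.com/', 'https://example.com/about']
```

Second, `pool.imap_unordered` yields results as workers finish rather than in submission order, which is what lets the progress bar advance steadily even when individual URLs are slow.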
{spiderforce4ai-0.1.0 → spiderforce4ai-0.1.3}/spiderforce4ai.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: spiderforce4ai
-Version: 0.1.0
+Version: 0.1.3
 Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service
 Home-page: https://petertam.pro
 Author: Piotr Tamulewicz
@@ -22,7 +22,7 @@ Dynamic: author
 Dynamic: home-page
 Dynamic: requires-python
 
-# SpiderForce4AI Python Wrapper
+# SpiderForce4AI Python Wrapper (Jina AI Reader, Firecrawl alternative)
 
 A Python wrapper for SpiderForce4AI - a powerful HTML-to-Markdown conversion service. This package provides an easy-to-use interface for crawling websites and converting their content to clean Markdown format.
 
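Finally, a sketch of consuming the JSON report that `_save_report_sync` writes. The key structure mirrors the dict built in the module hunk above; the filename is an assumption (it is whatever `report_file` was configured to).

```python
import json

with open("crawl_report.json", encoding="utf-8") as f:  # assumed report_file value
    report = json.load(f)

# Keys mirror the report dict assembled in _save_report_sync.
print(report["timestamp"], report["summary"])
for failed in report["results"]["failed"]:
    print(failed["url"], failed["error"])
```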