spiderforce4ai 2.1-py3-none-any.whl → 2.4-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- spiderforce4ai/__init__.py +449 -408
- spiderforce4ai/post_extraction_agent.py +259 -0
- {spiderforce4ai-2.1.dist-info → spiderforce4ai-2.4.dist-info}/METADATA +41 -3
- spiderforce4ai-2.4.dist-info/RECORD +7 -0
- spiderforce4ai-2.4.dist-info/entry_points.txt +2 -0
- spiderforce4ai-2.1.dist-info/RECORD +0 -5
- {spiderforce4ai-2.1.dist-info → spiderforce4ai-2.4.dist-info}/WHEEL +0 -0
- {spiderforce4ai-2.1.dist-info → spiderforce4ai-2.4.dist-info}/top_level.txt +0 -0
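The headline change in 2.4 is the post-extraction agent: `CrawlConfig` gains `post_extraction_agent`, `post_extraction_agent_save_to_file`, and `post_agent_transformer_function`, and validation requires the `model`, `messages`, and `api_key` keys. A minimal usage sketch follows, based only on the fields visible in the diff below; the service URL, model name, API key, sitemap URL, and the transformer signature are placeholders or assumptions, not documented API.

```python
# Hypothetical usage sketch for the 2.4 post-extraction agent.
# Field names come from the diff below; URLs, model name, API key, and the
# transformer signature are placeholders/assumptions.
from pathlib import Path
from spiderforce4ai import SpiderForce4AI, CrawlConfig

def add_source_tag(url, extraction):
    # custom transformer (assumed signature: url + extracted dict)
    extraction["source_url"] = url
    return extraction

config = CrawlConfig(
    output_dir=Path("./crawl_output"),
    post_extraction_agent={              # validated keys: model, messages, api_key
        "model": "gpt-4o-mini",          # placeholder model name
        "api_key": "sk-...",             # placeholder key
        "messages": [
            {"role": "system", "content": "Extract title and topics as JSON."}
        ],
        "max_tokens": 1000,              # optional; defaults shown in the diff
        "temperature": 0.7,
    },
    post_extraction_agent_save_to_file="extraction_results.json",
    post_agent_transformer_function=add_source_tag,
)

spider = SpiderForce4AI("http://localhost:3004")   # placeholder service URL
results = spider.crawl_sitemap_parallel("https://example.com/sitemap.xml", config)
```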
spiderforce4ai/__init__.py
CHANGED
@@ -1,10 +1,11 @@
 # spiderforce4ai/__init__.py
 
+from .post_extraction_agent import PostExtractionAgent, PostExtractionConfig, ExtractionTemplate
 import asyncio
 import aiohttp
 import json
 import logging
-from typing import List, Dict, Union, Optional, Tuple
+from typing import List, Dict, Union, Optional, Tuple, Callable, Any
 from dataclasses import dataclass, asdict
 from urllib.parse import urljoin, urlparse
 from pathlib import Path
@@ -23,75 +24,55 @@ from multiprocessing import Pool
 console = Console()
 
 def extract_metadata_headers(markdown: str, url: str = '') -> str:
-    """Extract metadata and headers from markdown content
+    """Extract metadata and headers from markdown content."""
     lines = markdown.split('\n')
-
-
-    metadata = {
-        'title': '',
-        'description': '',
-        'canonical_url': '',
-        'language': ''
-    }
-    first_paragraph = ''
+    metadata = {}
+    headers = []
 
-
-
-
-    if
-
-
-
-
-            in_metadata = False
-            break
+    def parse_metadata_line(line):
+        """Parse a single metadata line correctly."""
+        first_colon = line.find(':')
+        if first_colon == -1:
+            return None, None
+
+        key = line[:first_colon].strip()
+        value = line[first_colon + 1:].strip()
 
-        #
-        if
-
-
-
-
-            # Handle multi-line values
-            if value.startswith('>'):
-                value = value[1:].strip()
-                j = i + 1
-                while j < len(lines) and lines[j].strip() and not lines[j].strip() == '---':
-                    value += ' ' + lines[j].strip()
-                    j += 1
-
-            if key == 'title':
-                metadata['title'] = value
-            elif key in ['description', 'meta_description', 'og:description', 'meta-description']:
-                metadata['description'] = value
-            elif key in ['canonical_url', 'canonical']:
-                metadata['canonical_url'] = value
-            elif key in ['language', 'lang']:
-                metadata['language'] = value
-        elif not in_metadata and not first_paragraph and line.strip() and not line.startswith('#'):
-            first_paragraph = line.strip()
-
-    # Use first paragraph as fallback description if none found
-    if not metadata['description'] and first_paragraph:
-        metadata['description'] = first_paragraph[:160] + ('...' if len(first_paragraph) > 160 else '')
-
-    # Add formatted metadata section
-    extracted.append(f"URL: {url}")
-    extracted.append(f"Title: {metadata['title'] or url.split('/')[-2].replace('-', ' ').title()}")
-    extracted.append(f"Description: {metadata['description']}")
-    extracted.append(f"CanonicalUrl: {metadata['canonical_url'] or url}")
-    extracted.append(f"Language: {metadata['language'] or 'en'}")
-    extracted.append("")  # Empty line after metadata
+        # Handle the case where value starts with "URL:" - this means it's a missing description
+        if value.startswith('URL:'):
+            return key, ''
+
+        return key, value
 
-    #
+    # Process each line
     for line in lines:
-
+        line = line.strip()
+        if not line:
+            continue
+
+        # Check if it's a metadata line (contains : but isn't a header)
+        if ':' in line and not line.startswith('#'):
+            key, value = parse_metadata_line(line)
+            if key:
+                metadata[key] = value
+        # Check if it's a header
+        elif line.startswith('#'):
             level = len(line) - len(line.lstrip('#'))
             text = line.lstrip('#').strip()
             if 1 <= level <= 6:
-
+                headers.append(f"H{level}: {text}")
 
-
+    # Construct output
+    output = []
+    output.append(f"URL: {url}")
+    output.append(f"Title: {metadata.get('Title', url.split('/')[-2].replace('-', ' ').title())}")
+    output.append(f"Description: {metadata.get('Description', '')}")
+    output.append(f"CanonicalUrl: {metadata.get('CanonicalUrl', url)}")
+    output.append(f"Language: {metadata.get('Language', 'en')}")
+    output.append("")  # Empty line
+    output.extend(headers)
+
+    return '\n'.join(output)
 
 def slugify(url: str) -> str:
     """Convert URL to a valid filename."""
@@ -111,6 +92,7 @@ class CrawlResult:
     error: Optional[str] = None
     timestamp: str = None
     config: Dict = None
+    extraction_result: Optional[Dict] = None  # Store post-extraction results
 
     def __post_init__(self):
         if not self.timestamp:
@@ -131,9 +113,14 @@ class CrawlConfig:
     webhook_headers: Optional[Dict[str, str]] = None  # Optional webhook headers
     webhook_payload_template: Optional[str] = None  # Optional custom webhook payload template
     save_reports: bool = False  # Whether to save crawl reports
-    report_file: Optional[Path] = None  # Optional report file location
-    combine_to_one_markdown: Optional[str] = None  # 'full' or 'metadata_headers'
-    combined_markdown_file: Optional[Path] = None  # Optional path for combined
+    report_file: Optional[Path] = None  # Optional report file location
+    combine_to_one_markdown: Optional[str] = None  # 'full' or 'metadata_headers'
+    combined_markdown_file: Optional[Path] = None  # Optional path for combined file
+
+    # Post-extraction settings
+    post_extraction_agent: Optional[Dict[str, Any]] = None  # LLM configuration
+    post_extraction_agent_save_to_file: Optional[str] = None  # Extraction output file
+    post_agent_transformer_function: Optional[Callable] = None  # Custom transformer
 
     def __post_init__(self):
         # Initialize empty lists/dicts for None values
@@ -161,6 +148,15 @@ class CrawlConfig:
             # Create or clear the combined file
             self.combined_markdown_file.write_text('')
 
+        # Validate post-extraction agent configuration if provided
+        if self.post_extraction_agent:
+            if "messages" not in self.post_extraction_agent:
+                raise ValueError("Post-extraction agent configuration must include 'messages'")
+            if "model" not in self.post_extraction_agent:
+                raise ValueError("Post-extraction agent configuration must include 'model'")
+            if "api_key" not in self.post_extraction_agent:
+                raise ValueError("Post-extraction agent configuration must include 'api_key'")
+
     def to_dict(self) -> Dict:
         """Convert config to dictionary for API requests."""
         payload = {}
@@ -172,52 +168,120 @@ class CrawlConfig:
         if self.remove_selectors_regex:
             payload["remove_selectors_regex"] = self.remove_selectors_regex
         return payload
-
-
+
 def _send_webhook_sync(result: CrawlResult, config: CrawlConfig) -> None:
     """Synchronous version of webhook sender for parallel processing."""
     if not config.webhook_url:
         return
 
-
-
-
-
-
-
-
-
-
-
+    try:
+        # Use custom payload template if provided, otherwise use default
+        if config.webhook_payload_template:
+            # Replace variables in the template
+            payload_str = config.webhook_payload_template.format(
+                url=result.url,
+                status=result.status,
+                markdown=result.markdown if result.status == "success" else None,
+                error=result.error if result.status == "failed" else None,
+                timestamp=result.timestamp,
+                config=config.to_dict(),
+                extraction_result=result.extraction_result if result.extraction_result else None
+            )
+            payload = json.loads(payload_str)  # Parse the formatted JSON string
+        else:
+            # Use default payload format
+            payload = {
+                "url": result.url,
+                "status": result.status,
+                "markdown": result.markdown if result.status == "success" else None,
+                "error": result.error if result.status == "failed" else None,
+                "timestamp": result.timestamp,
+                "config": config.to_dict(),
+                "extraction_result": result.extraction_result if result.extraction_result else None
+            }
+
+        response = requests.post(
+            config.webhook_url,
+            json=payload,
+            headers=config.webhook_headers,
+            timeout=config.webhook_timeout
         )
-
-
-
+        response.raise_for_status()
+    except Exception as e:
+        console.print(f"[yellow]Warning: Failed to send webhook for {result.url}: {str(e)}[/yellow]")
+
+async def _send_webhook_async(result: CrawlResult, config: CrawlConfig):
+    """Asynchronous webhook sender."""
+    if not config.webhook_url:
+        return
+
+    try:
+        # Prepare payload similar to sync version
         payload = {
             "url": result.url,
             "status": result.status,
             "markdown": result.markdown if result.status == "success" else None,
             "error": result.error if result.status == "failed" else None,
             "timestamp": result.timestamp,
-            "config": config.to_dict()
+            "config": config.to_dict(),
+            "extraction_result": result.extraction_result if result.extraction_result else None
         }
 
+        async with httpx.AsyncClient() as client:
+            response = await client.post(
+                config.webhook_url,
+                json=payload,
+                headers=config.webhook_headers,
+                timeout=config.webhook_timeout
+            )
+            response.raise_for_status()
+    except Exception as e:
+        console.print(f"[yellow]Warning: Failed to send webhook for {result.url}: {str(e)}[/yellow]")
+
+async def _save_markdown_async(url: str, markdown: str, config: CrawlConfig):
+    """Save markdown content to file and/or append to combined file asynchronously."""
     try:
-
-
-
-
-
-
-
+        # Save individual file if not combining or if combining in full mode
+        if not config.combine_to_one_markdown or config.combine_to_one_markdown == 'full':
+            filename = f"{slugify(url)}.md"
+            filepath = config.output_dir / filename
+            async with aiofiles.open(filepath, 'w', encoding='utf-8') as f:
+                await f.write(markdown)
+
+        # Handle combined markdown file
+        if config.combine_to_one_markdown:
+            content = markdown if config.combine_to_one_markdown == 'full' else extract_metadata_headers(markdown, url)
+            combined_content = f"\n----PAGE----\n{url}\n\n{content}\n----PAGE END----\n"
+
+            async with aiofiles.open(config.combined_markdown_file, 'a', encoding='utf-8') as f:
+                await f.write(combined_content)
+    except Exception as e:
+        console.print(f"[red]Error saving markdown for {url}: {str(e)}[/red]")
+
+def _save_markdown_sync(url: str, markdown: str, config: CrawlConfig) -> None:
+    """Synchronous version of markdown saver for parallel processing."""
+    try:
+        # Save individual file if not combining or if combining in full mode
+        if not config.combine_to_one_markdown or config.combine_to_one_markdown == 'full':
+            filepath = config.output_dir / f"{slugify(url)}.md"
+            with open(filepath, 'w', encoding='utf-8') as f:
+                f.write(markdown)
+
+        # Handle combined markdown file
+        if config.combine_to_one_markdown:
+            content = markdown if config.combine_to_one_markdown == 'full' else extract_metadata_headers(markdown, url)
+            combined_content = f"\n----PAGE----\n{url}\n\n{content}\n----PAGE END----\n"
+
+            with open(config.combined_markdown_file, 'a', encoding='utf-8') as f:
+                f.write(combined_content)
     except Exception as e:
-        print(f"
+        console.print(f"[red]Error saving markdown for {url}: {str(e)}[/red]")
 
-# Module level function for multiprocessing
 def _process_url_parallel(args: Tuple[str, str, CrawlConfig]) -> CrawlResult:
     """Process a single URL for parallel processing."""
     url, base_url, config = args
     try:
+        # Make the conversion request
         endpoint = f"{base_url}/convert"
         payload = {
             "url": url,
@@ -232,7 +296,6 @@ def _process_url_parallel(args: Tuple[str, str, CrawlConfig]) -> CrawlResult:
                 error=f"HTTP {response.status_code}: {response.text}",
                 config=config.to_dict()
             )
-            # Send webhook for failed result
             _send_webhook_sync(result, config)
             return result
 
@@ -240,19 +303,7 @@ def _process_url_parallel(args: Tuple[str, str, CrawlConfig]) -> CrawlResult:
 
         # Save markdown if output directory is configured
         if config.output_dir:
-
-            if not config.combine_to_one_markdown or config.combine_to_one_markdown == 'full':
-                filepath = config.output_dir / f"{slugify(url)}.md"
-                with open(filepath, 'w', encoding='utf-8') as f:
-                    f.write(markdown)
-
-            # Handle combined markdown file
-            if config.combine_to_one_markdown:
-                content = markdown if config.combine_to_one_markdown == 'full' else extract_metadata_headers(markdown, url)
-                combined_content = f"\n----PAGE----\n{url}\n\n{content}\n----PAGE END----\n"
-
-                with open(config.combined_markdown_file, 'a', encoding='utf-8') as f:
-                    f.write(combined_content)
+            _save_markdown_sync(url, markdown, config)
 
         result = CrawlResult(
             url=url,
@@ -261,6 +312,28 @@ def _process_url_parallel(args: Tuple[str, str, CrawlConfig]) -> CrawlResult:
             config=config.to_dict()
         )
 
+        # Handle post-extraction if configured
+        if config.post_extraction_agent:
+            try:
+                post_config = PostExtractionConfig(
+                    model=config.post_extraction_agent["model"],
+                    messages=config.post_extraction_agent["messages"],
+                    api_key=config.post_extraction_agent["api_key"],
+                    max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
+                    temperature=config.post_extraction_agent.get("temperature", 0.7),
+                    base_url=config.post_extraction_agent.get("base_url"),
+                    combine_output=bool(config.post_extraction_agent_save_to_file),
+                    output_file=config.post_extraction_agent_save_to_file,
+                    custom_transform_function=config.post_agent_transformer_function
+                )
+
+                agent = PostExtractionAgent(post_config)
+                extraction_result = asyncio.run(agent.process_content(url, markdown))
+                if extraction_result:
+                    result.extraction_result = extraction_result
+            except Exception as e:
+                console.print(f"[red]Error in post-extraction processing for {url}: {str(e)}[/red]")
+
         # Send webhook for successful result
         _send_webhook_sync(result, config)
 
@@ -281,6 +354,60 @@ def _process_url_parallel(args: Tuple[str, str, CrawlConfig]) -> CrawlResult:
         _send_webhook_sync(result, config)
         return result
 
+async def _save_report_async(results: List[CrawlResult], config: CrawlConfig, retry_stats: Dict = None):
+    """Save crawl report to JSON file asynchronously."""
+    if not config.report_file:
+        return
+
+    # Separate successful and failed results
+    successful_results = [r for r in results if r.status == "success"]
+    failed_results = [r for r in results if r.status == "failed"]
+
+    report = {
+        "timestamp": datetime.now().isoformat(),
+        "config": config.to_dict(),
+        "results": {
+            "successful": [asdict(r) for r in successful_results],
+            "failed": [asdict(r) for r in failed_results]
+        },
+        "summary": {
+            "total": len(results),
+            "successful": len(successful_results),
+            "failed": len(failed_results),
+            "retry_info": retry_stats or {}
+        }
+    }
+
+    async with aiofiles.open(config.report_file, 'w', encoding='utf-8') as f:
+        await f.write(json.dumps(report, indent=2))
+
+def _save_report_sync(results: List[CrawlResult], config: CrawlConfig, retry_stats: Dict = None) -> None:
+    """Synchronous version of report saver."""
+    if not config.report_file:
+        return
+
+    # Create report similar to async version
+    successful_results = [r for r in results if r.status == "success"]
+    failed_results = [r for r in results if r.status == "failed"]
+
+    report = {
+        "timestamp": datetime.now().isoformat(),
+        "config": config.to_dict(),
+        "results": {
+            "successful": [asdict(r) for r in successful_results],
+            "failed": [asdict(r) for r in failed_results]
+        },
+        "summary": {
+            "total": len(results),
+            "successful": len(successful_results),
+            "failed": len(failed_results),
+            "retry_info": retry_stats or {}
+        }
+    }
+
+    with open(config.report_file, 'w', encoding='utf-8') as f:
+        json.dump(report, f, indent=2)
+
 class SpiderForce4AI:
     """Main class for interacting with SpiderForce4AI service."""
 
@@ -289,6 +416,7 @@ class SpiderForce4AI:
         self.session = None
         self._executor = ThreadPoolExecutor()
         self.crawl_results: List[CrawlResult] = []
+        self._retry_stats = {}
 
     async def _ensure_session(self):
        """Ensure aiohttp session exists."""
@@ -300,215 +428,6 @@ class SpiderForce4AI:
         if self.session and not self.session.closed:
             await self.session.close()
 
-    async def _save_markdown(self, url: str, markdown: str, output_dir: Path):
-        """Save markdown content to file and/or append to combined file."""
-        # Save individual file if not combining or if combining in full mode
-        if not self.config.combine_to_one_markdown or self.config.combine_to_one_markdown == 'full':
-            filename = f"{slugify(url)}.md"
-            filepath = output_dir / filename
-            async with aiofiles.open(filepath, 'w', encoding='utf-8') as f:
-                await f.write(markdown)
-
-        # Handle combined markdown file
-        if self.config.combine_to_one_markdown:
-            content = markdown if config.combine_to_one_markdown == 'full' else extract_metadata_headers(markdown, url)
-            combined_content = f"\n----PAGE----\n{url}\n\n{content}\n----PAGE END----\n"
-
-            async with aiofiles.open(self.config.combined_markdown_file, 'a', encoding='utf-8') as f:
-                await f.write(combined_content)
-
-
-
-    def crawl_sitemap_server_parallel(self, sitemap_url: str, config: CrawlConfig) -> List[CrawlResult]:
-        """
-        Crawl sitemap URLs using server-side parallel processing.
-        """
-        print(f"Fetching sitemap from {sitemap_url}...")
-
-        # Fetch sitemap
-        try:
-            response = requests.get(sitemap_url, timeout=config.timeout)
-            response.raise_for_status()
-            sitemap_text = response.text
-        except Exception as e:
-            print(f"Error fetching sitemap: {str(e)}")
-            raise
-
-        # Parse sitemap
-        try:
-            root = ET.fromstring(sitemap_text)
-            namespace = {'ns': root.tag.split('}')[0].strip('{')}
-            urls = [loc.text for loc in root.findall('.//ns:loc', namespace)]
-            print(f"Found {len(urls)} URLs in sitemap")
-        except Exception as e:
-            print(f"Error parsing sitemap: {str(e)}")
-            raise
-
-        # Process URLs using server-side parallel endpoint
-        return self.crawl_urls_server_parallel(urls, config)
-
-
-    def crawl_urls_server_parallel(self, urls: List[str], config: CrawlConfig) -> List[CrawlResult]:
-        """
-        Crawl multiple URLs using server-side parallel processing.
-        This uses the /convert_parallel endpoint which handles parallelization on the server.
-        """
-        print(f"Sending {len(urls)} URLs for parallel processing...")
-
-        try:
-            endpoint = f"{self.base_url}/convert_parallel"
-
-            # Prepare payload
-            payload = {
-                "urls": urls,
-                **config.to_dict()
-            }
-
-            # Send request
-            response = requests.post(
-                endpoint,
-                json=payload,
-                timeout=config.timeout
-            )
-            response.raise_for_status()
-
-            # Process results
-            results = []
-            server_results = response.json()  # Assuming server returns JSON array of results
-
-            for url_result in server_results:
-                result = CrawlResult(
-                    url=url_result["url"],
-                    status=url_result.get("status", "failed"),
-                    markdown=url_result.get("markdown"),
-                    error=url_result.get("error"),
-                    config=config.to_dict()
-                )
-
-                # Save markdown if successful and output dir is configured
-                if result.status == "success" and config.output_dir and result.markdown:
-                    filepath = config.output_dir / f"{slugify(result.url)}.md"
-                    with open(filepath, 'w', encoding='utf-8') as f:
-                        f.write(result.markdown)
-
-                # Send webhook if configured
-                if config.webhook_url:
-                    _send_webhook_sync(result, config)
-
-                results.append(result)
-
-            # Calculate statistics
-            successful = len([r for r in results if r.status == "success"])
-            failed = len([r for r in results if r.status == "failed"])
-
-            # Print summary
-            print(f"\nParallel processing completed:")
-            print(f"✓ Successful: {successful}")
-            print(f"✗ Failed: {failed}")
-
-            # Save report if enabled
-            if config.save_reports and config.report_file:
-                self._retry_stats = {
-                    "initial_failures": failed,
-                    "failure_ratio": (failed / len(urls)) * 100,
-                    "retry_successful": 0,  # No retries in server parallel mode
-                    "retry_failed": failed
-                }
-                self._save_report_sync(results, config)
-                console.print(f"📊 Report saved to: {config.report_file}")
-
-            return results
-
-        except Exception as e:
-            print(f"Error during parallel processing: {str(e)}")
-            # Create failed results for all URLs
-            return [
-                CrawlResult(
-                    url=url,
-                    status="failed",
-                    error=str(e),
-                    config=config.to_dict()
-                ) for url in urls
-            ]
-
-
-    async def _send_webhook(self, result: CrawlResult, config: CrawlConfig):
-        """Send webhook with crawl results."""
-        if not config.webhook_url:
-            return
-
-        payload = {
-            "url": result.url,
-            "status": result.status,
-            "markdown": result.markdown if result.status == "success" else None,
-            "error": result.error if result.status == "failed" else None,
-            "timestamp": result.timestamp,
-            "config": config.to_dict()
-        }
-
-        try:
-            async with httpx.AsyncClient() as client:
-                response = await client.post(
-                    config.webhook_url,
-                    json=payload,
-                    timeout=config.webhook_timeout
-                )
-                response.raise_for_status()
-        except Exception as e:
-            console.print(f"[yellow]Warning: Failed to send webhook for {result.url}: {str(e)}[/yellow]")
-
-    def _save_report_sync(self, results: List[CrawlResult], config: CrawlConfig) -> None:
-        """Save crawl report synchronously."""
-        # Separate successful and failed results
-        successful_results = [r for r in results if r.status == "success"]
-        failed_results = [r for r in results if r.status == "failed"]
-
-        # Create report with only final state
-        report = {
-            "timestamp": datetime.now().isoformat(),
-            "config": config.to_dict(),
-            "results": {
-                "successful": [asdict(r) for r in successful_results],
-                "failed": [asdict(r) for r in failed_results]  # Only truly failed URLs after retries
-            },
-            "summary": {
-                "total": len(results),
-                "successful": len(successful_results),
-                "failed": len(failed_results),
-                "retry_info": getattr(self, '_retry_stats', {})  # Include retry statistics if available
-            }
-        }
-
-        with open(config.report_file, 'w', encoding='utf-8') as f:
-            json.dump(report, f, indent=2)
-
-    async def _save_report(self, config: CrawlConfig):
-        """Save crawl report to JSON file."""
-        if not config.report_file:
-            return
-
-        # Separate successful and failed results
-        successful_results = [r for r in self.crawl_results if r.status == "success"]
-        failed_results = [r for r in self.crawl_results if r.status == "failed"]
-
-        report = {
-            "timestamp": datetime.now().isoformat(),
-            "config": config.to_dict(),
-            "results": {
-                "successful": [asdict(r) for r in successful_results],
-                "failed": [asdict(r) for r in failed_results]  # Only truly failed URLs after retries
-            },
-            "summary": {
-                "total": len(self.crawl_results),
-                "successful": len(successful_results),
-                "failed": len(failed_results),
-                "retry_info": getattr(self, '_retry_stats', {})  # Include retry statistics if available
-            }
-        }
-
-        async with aiofiles.open(config.report_file, 'w', encoding='utf-8') as f:
-            await f.write(json.dumps(report, indent=2))
-
     async def crawl_url_async(self, url: str, config: CrawlConfig) -> CrawlResult:
         """Crawl a single URL asynchronously."""
         await self._ensure_session()
@@ -539,9 +458,31 @@ class SpiderForce4AI:
         )
 
         if config.output_dir:
-            await
+            await _save_markdown_async(url, markdown, config)
 
-
+        # Handle post-extraction if configured
+        if config.post_extraction_agent and result.status == "success":
+            try:
+                post_config = PostExtractionConfig(
+                    model=config.post_extraction_agent["model"],
+                    messages=config.post_extraction_agent["messages"],
+                    api_key=config.post_extraction_agent["api_key"],
+                    max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
+                    temperature=config.post_extraction_agent.get("temperature", 0.7),
+                    base_url=config.post_extraction_agent.get("base_url"),
+                    combine_output=bool(config.post_extraction_agent_save_to_file),
+                    output_file=config.post_extraction_agent_save_to_file,
+                    custom_transform_function=config.post_agent_transformer_function
+                )
+
+                agent = PostExtractionAgent(post_config)
+                extraction_result = await agent.process_content(url, markdown)
+                if extraction_result:
+                    result.extraction_result = extraction_result
+            except Exception as e:
+                console.print(f"[red]Error in post-extraction processing for {url}: {str(e)}[/red]")
+
+        await _send_webhook_async(result, config)
 
         self.crawl_results.append(result)
         return result
@@ -561,18 +502,18 @@ class SpiderForce4AI:
         return asyncio.run(self.crawl_url_async(url, config))
 
     async def _retry_failed_urls(self, failed_results: List[CrawlResult], config: CrawlConfig, progress=None) -> List[CrawlResult]:
-        """Retry failed URLs
+        """Retry failed URLs with optional progress tracking."""
         if not failed_results:
             return []
 
        failed_count = len(failed_results)
-        total_count = len(
+        total_count = len(self.crawl_results)
         failure_ratio = (failed_count / total_count) * 100
 
         console.print(f"\n[yellow]Retrying failed URLs: {failed_count} ({failure_ratio:.1f}% failed)[/yellow]")
         retry_results = []
 
-        # Create
+        # Create or use provided progress bar
         should_close_progress = progress is None
         if progress is None:
             progress = Progress(
@@ -616,6 +557,7 @@ class SpiderForce4AI:
     async def crawl_urls_async(self, urls: List[str], config: CrawlConfig) -> List[CrawlResult]:
         """Crawl multiple URLs asynchronously with progress bar."""
         await self._ensure_session()
+        post_extraction_results = {}
 
         with Progress(
             SpinnerColumn(),
@@ -624,52 +566,60 @@ class SpiderForce4AI:
             TaskProgressColumn(),
             console=console
         ) as progress:
-
+            crawl_task = progress.add_task("[cyan]Crawling URLs...", total=len(urls))
 
             async def crawl_with_progress(url):
                 result = await self.crawl_url_async(url, config)
-                progress.update(
+                progress.update(crawl_task, advance=1, description=f"[cyan]Crawled: {url}")
                 return result
 
+            # Set up concurrency control
             semaphore = asyncio.Semaphore(config.max_concurrent_requests)
+
             async def crawl_with_semaphore(url):
                 async with semaphore:
                     result = await crawl_with_progress(url)
                     await asyncio.sleep(config.request_delay)
                     return result
 
+            # Perform initial crawl
             initial_results = await asyncio.gather(*[crawl_with_semaphore(url) for url in urls])
 
-            #
+            # Handle failed URLs
             failed_results = [r for r in initial_results if r.status == "failed"]
-
-            # Calculate initial failure ratio
             initial_failed = len(failed_results)
             total_urls = len(urls)
             failure_ratio = (initial_failed / total_urls) * 100
 
             # Retry failed URLs if ratio is acceptable
-
-
-                console.print(f"\n[red]Failure ratio too high ({failure_ratio:.1f}%) - aborting retry due to possible server overload[/red]")
-                results = initial_results
-            else:
-                retry_results = await self._retry_failed_urls(failed_results, config, progress)
-                # Update results list by replacing failed results with successful retries
-                results = initial_results.copy()
-                for retry_result in retry_results:
-                    for i, result in enumerate(results):
-                        if result.url == retry_result.url:
-                            results[i] = retry_result
-                            break
-            else:
-                results = initial_results
+            results = initial_results
+            retry_successful = 0
 
-
+            if failed_results and failure_ratio <= 20:
+                retry_results = await self._retry_failed_urls(failed_results, config, progress)
+                retry_successful = len([r for r in retry_results if r.status == "success"])
+
+                # Update results list
+                for retry_result in retry_results:
+                    for i, result in enumerate(results):
+                        if result.url == retry_result.url:
+                            results[i] = retry_result
+                            break
+
+            # Calculate final statistics
             final_successful = len([r for r in results if r.status == "success"])
             final_failed = len([r for r in results if r.status == "failed"])
 
-            #
+            # Update retry stats
+            self._retry_stats = {
+                "initial_failures": initial_failed,
+                "failure_ratio": failure_ratio,
+                "retry_successful": retry_successful if initial_failed > 0 else 0,
+                "retry_failed": final_failed,
+                "post_extraction_successful": len(post_extraction_results) if post_extraction_results else 0
+            }
+
+            # Print summary
             console.print(f"\n[green]Crawling Summary:[/green]")
             console.print(f"Total URLs processed: {total_urls}")
             console.print(f"Initial failures: {initial_failed} ({failure_ratio:.1f}%)")
@@ -678,18 +628,11 @@ class SpiderForce4AI:
             console.print(f" ✗ Failed: {final_failed}")
 
             if initial_failed > 0:
-                retry_successful = initial_failed - final_failed
                 console.print(f"Retry success rate: {retry_successful}/{initial_failed} ({(retry_successful/initial_failed)*100:.1f}%)")
 
-            # Save final report
+            # Save final report
             if config.save_reports:
-                self._retry_stats
-                    "initial_failures": initial_failed,
-                    "failure_ratio": failure_ratio,
-                    "retry_successful": retry_successful if initial_failed > 0 else 0,
-                    "retry_failed": final_failed
-                }
-                await self._save_report(config)
+                await _save_report_async(results, config, self._retry_stats)
                 console.print(f"📊 Report saved to: {config.report_file}")
 
             return results
@@ -726,32 +669,21 @@ class SpiderForce4AI:
         return asyncio.run(self.crawl_sitemap_async(sitemap_url, config))
 
     def crawl_sitemap_parallel(self, sitemap_url: str, config: CrawlConfig) -> List[CrawlResult]:
-        """Crawl sitemap URLs in parallel using multiprocessing
-
-
-        # Fetch sitemap
+        """Crawl sitemap URLs in parallel using multiprocessing."""
+        # Fetch and parse sitemap
         try:
             response = requests.get(sitemap_url, timeout=config.timeout)
             response.raise_for_status()
-
-        except Exception as e:
-            print(f"Error fetching sitemap: {str(e)}")
-            raise
-
-        # Parse sitemap
-        try:
-            root = ET.fromstring(sitemap_text)
+            root = ET.fromstring(response.text)
             namespace = {'ns': root.tag.split('}')[0].strip('{')}
             urls = [loc.text for loc in root.findall('.//ns:loc', namespace)]
-            print(f"Found {len(urls)} URLs in sitemap")
+            console.print(f"[green]Found {len(urls)} URLs in sitemap[/green]")
         except Exception as e:
-            print(f"Error
+            console.print(f"[red]Error processing sitemap: {str(e)}[/red]")
             raise
 
-        #
+        # Process URLs in parallel
         process_args = [(url, self.base_url, config) for url in urls]
-
-        # Create process pool and execute crawls
         results = []
 
         with Pool(processes=config.max_concurrent_requests) as pool:
@@ -762,81 +694,186 @@ class SpiderForce4AI:
                 TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
                 TextColumn("({task.completed}/{task.total})"),
             ) as progress:
-                task = progress.add_task("Crawling URLs...", total=len(urls))
+                task = progress.add_task("[cyan]Crawling URLs...", total=len(urls))
 
                 for result in pool.imap_unordered(_process_url_parallel, process_args):
                     results.append(result)
                     progress.update(task, advance=1)
                     status = "✓" if result.status == "success" else "✗"
-                    progress.description = f"Last: {status} {result.url}"
+                    progress.description = f"[cyan]Last: {status} {result.url}"
 
-        # Calculate
+        # Calculate statistics and handle retries
        failed_results = [r for r in results if r.status == "failed"]
        initial_failed = len(failed_results)
-
-
+        failure_ratio = (initial_failed / len(urls)) * 100
+        retry_successful = 0
 
-
-
-
-
-
-
-
-
-
-
-
-
-            if
-
-
-                if config.output_dir and new_result.markdown:
-                    filepath = config.output_dir / f"{slugify(new_result.url)}.md"
-                    with open(filepath, 'w', encoding='utf-8') as f:
-                        f.write(new_result.markdown)
-                    # Send webhook for successful retry
-                    _send_webhook_sync(new_result, config)
-                else:
-                    console.print(f"[red]✗ Retry failed: {result.url} - {new_result.error}[/red]")
-                    # Send webhook for failed retry
-                    _send_webhook_sync(new_result, config)
-
-                # Update results list
-                for i, r in enumerate(results):
-                    if r.url == new_result.url:
-                        results[i] = new_result
-                        break
+        if failed_results and failure_ratio <= 20:
+            console.print(f"\n[yellow]Retrying {initial_failed} failed URLs...[/yellow]")
+            for result in failed_results:
+                new_result = _process_url_parallel((result.url, self.base_url, config))
+                if new_result.status == "success":
+                    retry_successful += 1
+                    console.print(f"[green]✓ Retry successful: {result.url}[/green]")
+                else:
+                    console.print(f"[red]✗ Retry failed: {result.url}[/red]")
+
+                # Update results list
+                for i, r in enumerate(results):
+                    if r.url == new_result.url:
+                        results[i] = new_result
+                        break
 
         # Calculate final statistics
         final_successful = len([r for r in results if r.status == "success"])
         final_failed = len([r for r in results if r.status == "failed"])
 
-        # Print
+        # Print summary
         console.print(f"\n[green]Crawling Summary:[/green]")
-        console.print(f"Total URLs processed: {
+        console.print(f"Total URLs processed: {len(urls)}")
         console.print(f"Initial failures: {initial_failed} ({failure_ratio:.1f}%)")
         console.print(f"Final results:")
         console.print(f" ✓ Successful: {final_successful}")
         console.print(f" ✗ Failed: {final_failed}")
-
+
         if initial_failed > 0:
-            retry_successful = initial_failed - final_failed
             console.print(f"Retry success rate: {retry_successful}/{initial_failed} ({(retry_successful/initial_failed)*100:.1f}%)")
 
-
+        # Save report
         if config.save_reports:
             self._retry_stats = {
                 "initial_failures": initial_failed,
                 "failure_ratio": failure_ratio,
-                "retry_successful": retry_successful
+                "retry_successful": retry_successful,
                 "retry_failed": final_failed
             }
-
+            _save_report_sync(results, config, self._retry_stats)
             console.print(f"📊 Report saved to: {config.report_file}")
 
         return results
 
+    def crawl_urls_server_parallel(self, urls: List[str], config: CrawlConfig) -> List[CrawlResult]:
+        """
+        Crawl multiple URLs using server-side parallel processing.
+        This uses the /convert_parallel endpoint which handles parallelization on the server.
+        """
+        console.print(f"[cyan]Sending {len(urls)} URLs for parallel processing...[/cyan]")
+
+        try:
+            endpoint = f"{self.base_url}/convert_parallel"
+
+            # Prepare payload
+            payload = {
+                "urls": urls,
+                **config.to_dict()
+            }
+
+            # Send request
+            response = requests.post(
+                endpoint,
+                json=payload,
+                timeout=config.timeout
+            )
+            response.raise_for_status()
+
+            # Process results
+            results = []
+            server_results = response.json()
+
+            for url_result in server_results:
+                result = CrawlResult(
+                    url=url_result["url"],
+                    status=url_result.get("status", "failed"),
+                    markdown=url_result.get("markdown"),
+                    error=url_result.get("error"),
+                    config=config.to_dict()
+                )
+
+                # Save markdown if successful and output dir is configured
+                if result.status == "success" and config.output_dir and result.markdown:
+                    _save_markdown_sync(result.url, result.markdown, config)
+
+                # Handle post-extraction if configured
+                if config.post_extraction_agent and result.status == "success":
+                    try:
+                        post_config = PostExtractionConfig(
+                            model=config.post_extraction_agent["model"],
+                            messages=config.post_extraction_agent["messages"],
+                            api_key=config.post_extraction_agent["api_key"],
+                            max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
+                            temperature=config.post_extraction_agent.get("temperature", 0.7),
+                            base_url=config.post_extraction_agent.get("base_url"),
+                            combine_output=bool(config.post_extraction_agent_save_to_file),
+                            output_file=config.post_extraction_agent_save_to_file,
+                            custom_transform_function=config.post_agent_transformer_function
+                        )
+
+                        agent = PostExtractionAgent(post_config)
+                        extraction_result = asyncio.run(agent.process_content(result.url, result.markdown))
+                        if extraction_result:
+                            result.extraction_result = extraction_result
+                    except Exception as e:
+                        console.print(f"[red]Error in post-extraction processing for {result.url}: {str(e)}[/red]")
+
+                # Send webhook if configured
+                _send_webhook_sync(result, config)
+                results.append(result)
+
+            # Calculate statistics
+            successful = len([r for r in results if r.status == "success"])
+            failed = len([r for r in results if r.status == "failed"])
+
+            # Print summary
+            console.print("\n[green]Parallel processing completed:[/green]")
+            console.print(f"✓ Successful: {successful}")
+            console.print(f"✗ Failed: {failed}")
+
+            # Save report if enabled
+            if config.save_reports:
+                self._retry_stats = {
+                    "initial_failures": failed,
+                    "failure_ratio": (failed / len(urls)) * 100,
+                    "retry_successful": 0,  # No retries in server parallel mode
+                    "retry_failed": failed
+                }
+                _save_report_sync(results, config, self._retry_stats)
+                console.print(f"📊 Report saved to: {config.report_file}")
+
+            return results
+
+        except Exception as e:
+            console.print(f"[red]Error during parallel processing: {str(e)}[/red]")
+            # Create failed results for all URLs
+            return [
+                CrawlResult(
+                    url=url,
+                    status="failed",
+                    error=str(e),
+                    config=config.to_dict()
+                ) for url in urls
+            ]
+
+    def crawl_sitemap_server_parallel(self, sitemap_url: str, config: CrawlConfig) -> List[CrawlResult]:
+        """
+        Crawl sitemap URLs using server-side parallel processing.
+        """
+        console.print(f"[cyan]Fetching sitemap from {sitemap_url}...[/cyan]")
+
+        try:
+            response = requests.get(sitemap_url, timeout=config.timeout)
+            response.raise_for_status()
+            root = ET.fromstring(response.text)
+            namespace = {'ns': root.tag.split('}')[0].strip('{')}
+            urls = [loc.text for loc in root.findall('.//ns:loc', namespace)]
+            console.print(f"[green]Found {len(urls)} URLs in sitemap[/green]")
+
+            # Process URLs using server-side parallel endpoint
+            return self.crawl_urls_server_parallel(urls, config)
+
+        except Exception as e:
+            console.print(f"[red]Error processing sitemap: {str(e)}[/red]")
+            raise
+
     async def __aenter__(self):
         """Async context manager entry."""
         await self._ensure_session()
@@ -854,3 +891,7 @@ class SpiderForce4AI:
         """Sync context manager exit."""
         self._executor.shutdown(wait=True)
 
+# Version info
+#__version__ = "2.3.1"
+#__author__ = "Piotr Tamulewicz"
+#__email__ = "pt@petertam.pro"