spiderforce4ai 2.6.6__tar.gz → 2.6.7__tar.gz
- {spiderforce4ai-2.6.6 → spiderforce4ai-2.6.7}/PKG-INFO +1 -1
- {spiderforce4ai-2.6.6 → spiderforce4ai-2.6.7}/pyproject.toml +1 -1
- {spiderforce4ai-2.6.6 → spiderforce4ai-2.6.7}/setup.py +1 -1
- {spiderforce4ai-2.6.6 → spiderforce4ai-2.6.7}/spiderforce4ai/__init__.py +89 -134
- {spiderforce4ai-2.6.6 → spiderforce4ai-2.6.7}/spiderforce4ai.egg-info/PKG-INFO +1 -1
- {spiderforce4ai-2.6.6 → spiderforce4ai-2.6.7}/README.md +0 -0
- {spiderforce4ai-2.6.6 → spiderforce4ai-2.6.7}/setup.cfg +0 -0
- {spiderforce4ai-2.6.6 → spiderforce4ai-2.6.7}/spiderforce4ai/post_extraction_agent.py +0 -0
- {spiderforce4ai-2.6.6 → spiderforce4ai-2.6.7}/spiderforce4ai.egg-info/SOURCES.txt +0 -0
- {spiderforce4ai-2.6.6 → spiderforce4ai-2.6.7}/spiderforce4ai.egg-info/dependency_links.txt +0 -0
- {spiderforce4ai-2.6.6 → spiderforce4ai-2.6.7}/spiderforce4ai.egg-info/entry_points.txt +0 -0
- {spiderforce4ai-2.6.6 → spiderforce4ai-2.6.7}/spiderforce4ai.egg-info/not-zip-safe +0 -0
- {spiderforce4ai-2.6.6 → spiderforce4ai-2.6.7}/spiderforce4ai.egg-info/requires.txt +0 -0
- {spiderforce4ai-2.6.6 → spiderforce4ai-2.6.7}/spiderforce4ai.egg-info/top_level.txt +0 -0
--- spiderforce4ai-2.6.6/pyproject.toml
+++ spiderforce4ai-2.6.7/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "spiderforce4ai"
-version = "2.6.6"
+version = "2.6.7"
 description = "Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service with LLM post-processing"
 readme = "README.md"
 authors = [
--- spiderforce4ai-2.6.6/setup.py
+++ spiderforce4ai-2.6.7/setup.py
@@ -7,7 +7,7 @@ with open("README.md", encoding="utf-8") as f:
 
 setup(
     name="spiderforce4ai",
-    version="2.6.6",
+    version="2.6.7",
     author="Piotr Tamulewicz",
     author_email="pt@petertam.pro",
     description="Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service with LLM post-processing",
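Both files carry the same bump to version 2.6.7. If it helps to confirm which release is installed after upgrading, the standard library can read the distribution metadata back; nothing below is specific to this package beyond its distribution name, which is assumed to match the name fields above.

import importlib.metadata

# Prints the installed distribution version, e.g. "2.6.7" after this release.
print(importlib.metadata.version("spiderforce4ai"))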
--- spiderforce4ai-2.6.6/spiderforce4ai/__init__.py
+++ spiderforce4ai-2.6.7/spiderforce4ai/__init__.py
@@ -302,7 +302,16 @@ def _process_url_parallel(args: Tuple[str, str, CrawlConfig]) -> CrawlResult:
         _send_webhook_sync(result, config)
         return result
 
-
+    # Parse JSON response - THIS IS WHERE THE ERROR LIKELY OCCURS
+    try:
+        response_data = response.json()
+        # Make sure we're accessing 'markdown' correctly
+        markdown = response_data.get('markdown', '')  # Use get() with default value
+        if not markdown and response.text:  # Fallback to raw text if no markdown
+            markdown = response.text
+    except json.JSONDecodeError:
+        # If response isn't JSON, use raw text
+        markdown = response.text
 
     # Save markdown if output directory is configured
     if config.output_dir:
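The hunk above makes the markdown extraction in _process_url_parallel tolerant of non-JSON responses and of a missing 'markdown' key. A minimal standalone sketch of the same pattern is shown below; fetch_markdown and its endpoint argument are illustrative names for this example, not part of the package API.

import json
import requests

def fetch_markdown(endpoint: str, timeout: int = 30) -> str:
    """Prefer the 'markdown' field of a JSON body, falling back to raw text."""
    response = requests.get(endpoint, timeout=timeout)
    response.raise_for_status()
    try:
        data = response.json()
        markdown = data.get("markdown", "")   # tolerate a missing 'markdown' key
        if not markdown and response.text:    # fall back to the raw body
            markdown = response.text
    except json.JSONDecodeError:              # body was not JSON at all
        markdown = response.text
    return markdown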
--- spiderforce4ai-2.6.6/spiderforce4ai/__init__.py
+++ spiderforce4ai-2.6.7/spiderforce4ai/__init__.py
@@ -811,144 +820,90 @@ class SpiderForce4AI:
         return results
 
     def crawl_urls_server_parallel(self, urls: List[str], config: CrawlConfig) -> List[CrawlResult]:
-        """
-
-        This uses the /convert_parallel endpoint which handles parallelization on the server.
-        """
-        console.print(f"[cyan]Sending {len(urls)} URLs for parallel processing...[/cyan]")
-
-        try:
-            endpoint = f"{self.base_url}/convert_parallel"
-
-            # Prepare payload
-            payload = {
-                "urls": urls,
-                **config.to_dict()
-            }
-
-            # Send request
-            response = requests.post(
-                endpoint,
-                json=payload,
-                timeout=config.timeout
-            )
-            response.raise_for_status()
-
-            # Process results
-            results = []
-            server_results = response.json()
-
-            for url_result in server_results:
-                result = CrawlResult(
-                    url=url_result["url"],
-                    status=url_result.get("status", "failed"),
-                    markdown=url_result.get("markdown"),
-                    error=url_result.get("error"),
-                    config=config.to_dict()
-                )
-
-                # Save markdown if successful and output dir is configured
-                if result.status == "success" and config.output_dir and result.markdown:
-                    _save_markdown_sync(result.url, result.markdown, config)
-
-                # Send webhook if configured
-                _send_webhook_sync(result, config)
-                results.append(result)
+        """Crawl URLs in parallel using multiprocessing."""
+        console.print(f"[cyan]Processing {len(urls)} URLs in parallel...[/cyan]")
 
-
-
-
-
-
-
-
-
-
-
-            )
-
-
-            post_config = PostExtractionConfig(
-                model=config.post_extraction_agent["model"],
-                messages=config.post_extraction_agent["messages"],
-                api_key=config.post_extraction_agent["api_key"],
-                max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
-                temperature=config.post_extraction_agent.get("temperature", 0.7),
-                base_url=config.post_extraction_agent.get("base_url"),
-                combine_output=bool(config.post_extraction_agent_save_to_file),
-                output_file=config.post_extraction_agent_save_to_file,
-                custom_transform_function=config.post_agent_transformer_function,
-                response_format=config.post_extraction_agent.get("response_format")
-            )
-            agent = PostExtractionAgent(post_config)
+        # Process URLs in parallel
+        process_args = [(url, self.base_url, config) for url in urls]
+        results = []
+
+        with Pool(processes=config.max_concurrent_requests) as pool:
+            with Progress(
+                SpinnerColumn(),
+                TextColumn("[progress.description]{task.description}"),
+                BarColumn(),
+                TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
+                TextColumn("({task.completed}/{task.total})"),
+            ) as progress:
+                task = progress.add_task("[cyan]Crawling URLs...", total=len(urls))
 
-
-
-
-
-
-                        # Add URL to the response before transformation
-                        llm_response['url'] = result.url
-                        # Apply transformation if provided
-                        if config.post_agent_transformer_function:
-                            try:
-                                logger.info(f"Starting transformer function execution for {result.url}")
-                                result.extraction_result = config.post_agent_transformer_function(llm_response)
-                                logger.info(f"Successfully executed transformer function for {result.url}")
-                            except KeyError as e:
-                                # Log missing field but continue with transformation
-                                missing_field = str(e).strip("'")
-                                logger.warning(f"Missing field '{missing_field}' in LLM response for {result.url}")
-                                # Add missing field with empty value
-                                llm_response[missing_field] = ""
-                                # Retry transformation with added field
-                                logger.info(f"Retrying transformer function for {result.url} after adding missing field")
-                                result.extraction_result = config.post_agent_transformer_function(llm_response)
-                                logger.info(f"Successfully executed transformer function on retry for {result.url}")
-                            except Exception as e:
-                                logger.error(f"Transformer error for {result.url}: {str(e)}")
-                                result.extraction_result = llm_response  # Use original response if transform fails
-                        else:
-                            result.extraction_result = llm_response
-                        progress.update(llm_task, advance=1)
-                    except Exception as e:
-                        console.print(f"[red]Error in post-extraction processing for {result.url}: {str(e)}[/red]")
-                        time.sleep(1)  # Add delay after error
-                    time.sleep(0.5)  # Rate limiting between requests
-
-            # Calculate statistics
-            successful = len([r for r in results if r.status == "success"])
-            failed = len([r for r in results if r.status == "failed"])
-
-            # Print summary
-            console.print("\n[green]Parallel processing completed:[/green]")
-            console.print(f"✓ Successful: {successful}")
-            console.print(f"✗ Failed: {failed}")
+                for result in pool.imap_unordered(_process_url_parallel, process_args):
+                    results.append(result)
+                    progress.update(task, advance=1)
+                    status = "✓" if result.status == "success" else "✗"
+                    progress.description = f"[cyan]Last: {status} {result.url}"
 
-
-
-
-
-                "failure_ratio": (failed / len(urls)) * 100,
-                "retry_successful": 0,  # No retries in server parallel mode
-                "retry_failed": failed
-            }
-            _save_report_sync(results, config, self._retry_stats)
-            console.print(f"📊 Report saved to: {config.report_file}")
+        # Process LLM requests sequentially after all crawling is complete
+        if config.post_extraction_agent:
+            console.print("\n[cyan]Starting post-extraction processing...[/cyan]")
+            successful_results = [r for r in results if r.status == "success"]
 
-
+            with Progress(
+                SpinnerColumn(),
+                TextColumn("[progress.description]{task.description}"),
+                BarColumn(),
+                TaskProgressColumn(),
+            ) as progress:
+                llm_task = progress.add_task("[cyan]Post-extraction processing...", total=len(successful_results))
 
-
-
-
-
-
-
-
-
-
-
-
+                post_config = PostExtractionConfig(
+                    model=config.post_extraction_agent["model"],
+                    messages=config.post_extraction_agent["messages"],
+                    api_key=config.post_extraction_agent["api_key"],
+                    max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
+                    temperature=config.post_extraction_agent.get("temperature", 0.7),
+                    base_url=config.post_extraction_agent.get("base_url"),
+                    combine_output=bool(config.post_extraction_agent_save_to_file),
+                    output_file=config.post_extraction_agent_save_to_file,
+                    custom_transform_function=config.post_agent_transformer_function
+                )
+                agent = PostExtractionAgent(post_config)
+
+                for result in successful_results:
+                    try:
+                        # Process content synchronously since it's not an async method
+                        extraction_result = agent.process_content(result.url, result.markdown)
+                        if extraction_result:
+                            result.extraction_result = extraction_result
+                            logger.info(f"Successfully processed and transformed content for {result.url}")
+                        progress.update(llm_task, advance=1)
+                    except Exception as e:
+                        console.print(f"[red]Error in post-extraction processing for {result.url}: {str(e)}[/red]")
+                        time.sleep(1)  # Add delay after error
+                    time.sleep(0.5)  # Rate limiting between requests
+
+        # Calculate statistics
+        successful = len([r for r in results if r.status == "success"])
+        failed = len([r for r in results if r.status == "failed"])
+
+        # Print summary
+        console.print(f"\n[green]Crawling Summary:[/green]")
+        console.print(f"Total URLs processed: {len(urls)}")
+        console.print(f"✓ Successful: {successful}")
+        console.print(f"✗ Failed: {failed}")
+
+        # Save report if enabled
+        if config.save_reports:
+            self._retry_stats = {
+                "initial_failures": failed,
+                "failure_ratio": (failed / len(urls)) * 100,
+                "retry_successful": 0,
+                "retry_failed": failed
+            }
+            _save_report_sync(results, config, self._retry_stats)
+            console.print(f"📊 Report saved to: {config.report_file}")
+
+        return results
 
     def crawl_sitemap_server_parallel(self, sitemap_url: str, config: CrawlConfig) -> List[CrawlResult]:
         """