spiderforce4ai 2.6.6__py3-none-any.whl → 2.6.7__py3-none-any.whl

@@ -302,7 +302,16 @@ def _process_url_parallel(args: Tuple[str, str, CrawlConfig]) -> CrawlResult:
             _send_webhook_sync(result, config)
             return result
 
-        markdown = response.text
+        # Parse JSON response - THIS IS WHERE THE ERROR LIKELY OCCURS
+        try:
+            response_data = response.json()
+            # Make sure we're accessing 'markdown' correctly
+            markdown = response_data.get('markdown', '')  # Use get() with default value
+            if not markdown and response.text:  # Fallback to raw text if no markdown
+                markdown = response.text
+        except json.JSONDecodeError:
+            # If response isn't JSON, use raw text
+            markdown = response.text
 
         # Save markdown if output directory is configured
         if config.output_dir:
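
For context, here is a minimal standalone sketch of the JSON-with-fallback parsing pattern the hunk above introduces. The helper name, endpoint, and payload below are illustrative assumptions, not part of the package's API:

```python
import json

import requests


def fetch_markdown(endpoint: str, payload: dict, timeout: int = 60) -> str:
    """Return the 'markdown' field of a JSON response, falling back to the raw body."""
    response = requests.post(endpoint, json=payload, timeout=timeout)
    response.raise_for_status()
    try:
        data = response.json()
        # Prefer the 'markdown' key; fall back to the raw text if it is missing or empty.
        markdown = data.get("markdown", "") if isinstance(data, dict) else ""
        return markdown or response.text
    except json.JSONDecodeError:
        # Non-JSON body: treat it as markdown directly.
        return response.text
```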
@@ -811,144 +820,90 @@ class SpiderForce4AI:
         return results
 
     def crawl_urls_server_parallel(self, urls: List[str], config: CrawlConfig) -> List[CrawlResult]:
-        """
-        Crawl multiple URLs using server-side parallel processing.
-        This uses the /convert_parallel endpoint which handles parallelization on the server.
-        """
-        console.print(f"[cyan]Sending {len(urls)} URLs for parallel processing...[/cyan]")
-
-        try:
-            endpoint = f"{self.base_url}/convert_parallel"
-
-            # Prepare payload
-            payload = {
-                "urls": urls,
-                **config.to_dict()
-            }
-
-            # Send request
-            response = requests.post(
-                endpoint,
-                json=payload,
-                timeout=config.timeout
-            )
-            response.raise_for_status()
-
-            # Process results
-            results = []
-            server_results = response.json()
-
-            for url_result in server_results:
-                result = CrawlResult(
-                    url=url_result["url"],
-                    status=url_result.get("status", "failed"),
-                    markdown=url_result.get("markdown"),
-                    error=url_result.get("error"),
-                    config=config.to_dict()
-                )
-
-                # Save markdown if successful and output dir is configured
-                if result.status == "success" and config.output_dir and result.markdown:
-                    _save_markdown_sync(result.url, result.markdown, config)
-
-                # Send webhook if configured
-                _send_webhook_sync(result, config)
-                results.append(result)
+        """Crawl URLs in parallel using multiprocessing."""
+        console.print(f"[cyan]Processing {len(urls)} URLs in parallel...[/cyan]")
 
-            # Process LLM requests sequentially after all crawling is complete
-            if config.post_extraction_agent:
-                console.print("\n[cyan]Starting post-extraction processing...[/cyan]")
-                successful_results = [r for r in results if r.status == "success"]
-
-                with Progress(
-                    SpinnerColumn(),
-                    TextColumn("[progress.description]{task.description}"),
-                    BarColumn(),
-                    TaskProgressColumn(),
-                ) as progress:
-                    llm_task = progress.add_task("[cyan]Post-extraction processing...", total=len(successful_results))
-
-                    post_config = PostExtractionConfig(
-                        model=config.post_extraction_agent["model"],
-                        messages=config.post_extraction_agent["messages"],
-                        api_key=config.post_extraction_agent["api_key"],
-                        max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
-                        temperature=config.post_extraction_agent.get("temperature", 0.7),
-                        base_url=config.post_extraction_agent.get("base_url"),
-                        combine_output=bool(config.post_extraction_agent_save_to_file),
-                        output_file=config.post_extraction_agent_save_to_file,
-                        custom_transform_function=config.post_agent_transformer_function,
-                        response_format=config.post_extraction_agent.get("response_format")
-                    )
-                    agent = PostExtractionAgent(post_config)
+        # Process URLs in parallel
+        process_args = [(url, self.base_url, config) for url in urls]
+        results = []
+
+        with Pool(processes=config.max_concurrent_requests) as pool:
+            with Progress(
+                SpinnerColumn(),
+                TextColumn("[progress.description]{task.description}"),
+                BarColumn(),
+                TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
+                TextColumn("({task.completed}/{task.total})"),
+            ) as progress:
+                task = progress.add_task("[cyan]Crawling URLs...", total=len(urls))
 
-                    for result in successful_results:
-                        try:
-                            # Get LLM response
-                            llm_response = agent.process_content(result.url, result.markdown)
-                            if llm_response:
-                                # Add URL to the response before transformation
-                                llm_response['url'] = result.url
-                                # Apply transformation if provided
-                                if config.post_agent_transformer_function:
-                                    try:
-                                        logger.info(f"Starting transformer function execution for {result.url}")
-                                        result.extraction_result = config.post_agent_transformer_function(llm_response)
-                                        logger.info(f"Successfully executed transformer function for {result.url}")
-                                    except KeyError as e:
-                                        # Log missing field but continue with transformation
-                                        missing_field = str(e).strip("'")
-                                        logger.warning(f"Missing field '{missing_field}' in LLM response for {result.url}")
-                                        # Add missing field with empty value
-                                        llm_response[missing_field] = ""
-                                        # Retry transformation with added field
-                                        logger.info(f"Retrying transformer function for {result.url} after adding missing field")
-                                        result.extraction_result = config.post_agent_transformer_function(llm_response)
-                                        logger.info(f"Successfully executed transformer function on retry for {result.url}")
-                                    except Exception as e:
-                                        logger.error(f"Transformer error for {result.url}: {str(e)}")
-                                        result.extraction_result = llm_response  # Use original response if transform fails
-                                else:
-                                    result.extraction_result = llm_response
-                            progress.update(llm_task, advance=1)
-                        except Exception as e:
-                            console.print(f"[red]Error in post-extraction processing for {result.url}: {str(e)}[/red]")
-                            time.sleep(1)  # Add delay after error
-                        time.sleep(0.5)  # Rate limiting between requests
-
-            # Calculate statistics
-            successful = len([r for r in results if r.status == "success"])
-            failed = len([r for r in results if r.status == "failed"])
-
-            # Print summary
-            console.print("\n[green]Parallel processing completed:[/green]")
-            console.print(f"✓ Successful: {successful}")
-            console.print(f"✗ Failed: {failed}")
+                for result in pool.imap_unordered(_process_url_parallel, process_args):
+                    results.append(result)
+                    progress.update(task, advance=1)
+                    status = "✓" if result.status == "success" else "✗"
+                    progress.description = f"[cyan]Last: {status} {result.url}"
 
-            # Save report if enabled
-            if config.save_reports:
-                self._retry_stats = {
-                    "initial_failures": failed,
-                    "failure_ratio": (failed / len(urls)) * 100,
-                    "retry_successful": 0,  # No retries in server parallel mode
-                    "retry_failed": failed
-                }
-                _save_report_sync(results, config, self._retry_stats)
-                console.print(f"📊 Report saved to: {config.report_file}")
+        # Process LLM requests sequentially after all crawling is complete
+        if config.post_extraction_agent:
+            console.print("\n[cyan]Starting post-extraction processing...[/cyan]")
+            successful_results = [r for r in results if r.status == "success"]
 
-            return results
+            with Progress(
+                SpinnerColumn(),
+                TextColumn("[progress.description]{task.description}"),
+                BarColumn(),
+                TaskProgressColumn(),
+            ) as progress:
+                llm_task = progress.add_task("[cyan]Post-extraction processing...", total=len(successful_results))
 
-        except Exception as e:
-            console.print(f"[red]Error during parallel processing: {str(e)}[/red]")
-            # Create failed results for all URLs
-            return [
-                CrawlResult(
-                    url=url,
-                    status="failed",
-                    error=str(e),
-                    config=config.to_dict()
-                ) for url in urls
-            ]
+                post_config = PostExtractionConfig(
+                    model=config.post_extraction_agent["model"],
+                    messages=config.post_extraction_agent["messages"],
+                    api_key=config.post_extraction_agent["api_key"],
+                    max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
+                    temperature=config.post_extraction_agent.get("temperature", 0.7),
+                    base_url=config.post_extraction_agent.get("base_url"),
+                    combine_output=bool(config.post_extraction_agent_save_to_file),
+                    output_file=config.post_extraction_agent_save_to_file,
+                    custom_transform_function=config.post_agent_transformer_function
+                )
+                agent = PostExtractionAgent(post_config)
+
+                for result in successful_results:
+                    try:
+                        # Process content synchronously since it's not an async method
+                        extraction_result = agent.process_content(result.url, result.markdown)
+                        if extraction_result:
+                            result.extraction_result = extraction_result
+                            logger.info(f"Successfully processed and transformed content for {result.url}")
+                        progress.update(llm_task, advance=1)
+                    except Exception as e:
+                        console.print(f"[red]Error in post-extraction processing for {result.url}: {str(e)}[/red]")
+                        time.sleep(1)  # Add delay after error
+                    time.sleep(0.5)  # Rate limiting between requests
+
+        # Calculate statistics
+        successful = len([r for r in results if r.status == "success"])
+        failed = len([r for r in results if r.status == "failed"])
+
+        # Print summary
+        console.print(f"\n[green]Crawling Summary:[/green]")
+        console.print(f"Total URLs processed: {len(urls)}")
+        console.print(f"✓ Successful: {successful}")
+        console.print(f"✗ Failed: {failed}")
+
+        # Save report if enabled
+        if config.save_reports:
+            self._retry_stats = {
+                "initial_failures": failed,
+                "failure_ratio": (failed / len(urls)) * 100,
+                "retry_successful": 0,
+                "retry_failed": failed
+            }
+            _save_report_sync(results, config, self._retry_stats)
+            console.print(f"📊 Report saved to: {config.report_file}")
+
+        return results
 
     def crawl_sitemap_server_parallel(self, sitemap_url: str, config: CrawlConfig) -> List[CrawlResult]:
         """
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: spiderforce4ai
-Version: 2.6.6
+Version: 2.6.7
 Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service with LLM post-processing
 Home-page: https://petertam.pro
 Author: Piotr Tamulewicz
@@ -0,0 +1,7 @@
+spiderforce4ai/__init__.py,sha256=p_ybuwvTD7bTelORBzAkomUQrc69WvOmu3owHKlzp0A,42231
+spiderforce4ai/post_extraction_agent.py,sha256=7N2VYCfsfIh-my-Sc0_lnhmsfb3nyIbDOpnI007M1DM,19075
+spiderforce4ai-2.6.7.dist-info/METADATA,sha256=5Tjsk-VHFD81TDxfh2LAClnoWa99BwBbNja9-681rZI,9012
+spiderforce4ai-2.6.7.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+spiderforce4ai-2.6.7.dist-info/entry_points.txt,sha256=ibARQxOlDiL1ho12zbDZt4Uq5RKSIk_qk159ZlZ46hc,59
+spiderforce4ai-2.6.7.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
+spiderforce4ai-2.6.7.dist-info/RECORD,,
@@ -1,7 +0,0 @@
-spiderforce4ai/__init__.py,sha256=BlrRIrtpDUHjmDedqgXP1KbAAsAH9vwFPncUR5VGGyM,44804
-spiderforce4ai/post_extraction_agent.py,sha256=7N2VYCfsfIh-my-Sc0_lnhmsfb3nyIbDOpnI007M1DM,19075
-spiderforce4ai-2.6.6.dist-info/METADATA,sha256=eoFT4zgeNK3TkBEF5pKnf5IducFbm1quZnndCuXPf-c,9012
-spiderforce4ai-2.6.6.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
-spiderforce4ai-2.6.6.dist-info/entry_points.txt,sha256=ibARQxOlDiL1ho12zbDZt4Uq5RKSIk_qk159ZlZ46hc,59
-spiderforce4ai-2.6.6.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
-spiderforce4ai-2.6.6.dist-info/RECORD,,