spiderforce4ai 2.6.6__py3-none-any.whl → 2.6.8__py3-none-any.whl

This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between these versions as they appear in their respective public registries.
@@ -302,7 +302,16 @@ def _process_url_parallel(args: Tuple[str, str, CrawlConfig]) -> CrawlResult:
         _send_webhook_sync(result, config)
         return result
 
-    markdown = response.text
+    # Parse JSON response - THIS IS WHERE THE ERROR LIKELY OCCURS
+    try:
+        response_data = response.json()
+        # Make sure we're accessing 'markdown' correctly
+        markdown = response_data.get('markdown', '')  # Use get() with default value
+        if not markdown and response.text:  # Fallback to raw text if no markdown
+            markdown = response.text
+    except json.JSONDecodeError:
+        # If response isn't JSON, use raw text
+        markdown = response.text
 
     # Save markdown if output directory is configured
    if config.output_dir:
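
For context, the replacement logic in this hunk amounts to "prefer the 'markdown' field of a JSON body, otherwise fall back to the raw response text". A minimal standalone sketch of that behaviour follows (the helper name is hypothetical; it assumes the response is a requests.Response and that the module imports json, as the json.JSONDecodeError handler implies):

    import json
    import requests

    def extract_markdown(response: requests.Response) -> str:
        """Return the 'markdown' field of a JSON body, falling back to the raw text."""
        try:
            data = response.json()          # assumes the body is a JSON object
            markdown = data.get('markdown', '')
        except json.JSONDecodeError:
            markdown = ''
        return markdown or response.text

Note that recent versions of requests raise requests.exceptions.JSONDecodeError from .json(); with the standard-library json backend this subclasses json.JSONDecodeError, so the except clause matches, but catching ValueError (or requests' own JSONDecodeError) would be slightly more robust across requests versions.
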
@@ -811,144 +820,90 @@ class SpiderForce4AI:
         return results
 
     def crawl_urls_server_parallel(self, urls: List[str], config: CrawlConfig) -> List[CrawlResult]:
-        """
-        Crawl multiple URLs using server-side parallel processing.
-        This uses the /convert_parallel endpoint which handles parallelization on the server.
-        """
-        console.print(f"[cyan]Sending {len(urls)} URLs for parallel processing...[/cyan]")
-
-        try:
-            endpoint = f"{self.base_url}/convert_parallel"
-
-            # Prepare payload
-            payload = {
-                "urls": urls,
-                **config.to_dict()
-            }
-
-            # Send request
-            response = requests.post(
-                endpoint,
-                json=payload,
-                timeout=config.timeout
-            )
-            response.raise_for_status()
-
-            # Process results
-            results = []
-            server_results = response.json()
-
-            for url_result in server_results:
-                result = CrawlResult(
-                    url=url_result["url"],
-                    status=url_result.get("status", "failed"),
-                    markdown=url_result.get("markdown"),
-                    error=url_result.get("error"),
-                    config=config.to_dict()
-                )
-
-                # Save markdown if successful and output dir is configured
-                if result.status == "success" and config.output_dir and result.markdown:
-                    _save_markdown_sync(result.url, result.markdown, config)
-
-                # Send webhook if configured
-                _send_webhook_sync(result, config)
-                results.append(result)
+        """Crawl URLs in parallel using multiprocessing."""
+        console.print(f"[cyan]Processing {len(urls)} URLs in parallel...[/cyan]")
 
-            # Process LLM requests sequentially after all crawling is complete
-            if config.post_extraction_agent:
-                console.print("\n[cyan]Starting post-extraction processing...[/cyan]")
-                successful_results = [r for r in results if r.status == "success"]
-
-                with Progress(
-                    SpinnerColumn(),
-                    TextColumn("[progress.description]{task.description}"),
-                    BarColumn(),
-                    TaskProgressColumn(),
-                ) as progress:
-                    llm_task = progress.add_task("[cyan]Post-extraction processing...", total=len(successful_results))
-
-                    post_config = PostExtractionConfig(
-                        model=config.post_extraction_agent["model"],
-                        messages=config.post_extraction_agent["messages"],
-                        api_key=config.post_extraction_agent["api_key"],
-                        max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
-                        temperature=config.post_extraction_agent.get("temperature", 0.7),
-                        base_url=config.post_extraction_agent.get("base_url"),
-                        combine_output=bool(config.post_extraction_agent_save_to_file),
-                        output_file=config.post_extraction_agent_save_to_file,
-                        custom_transform_function=config.post_agent_transformer_function,
-                        response_format=config.post_extraction_agent.get("response_format")
-                    )
-                    agent = PostExtractionAgent(post_config)
+        # Process URLs in parallel
+        process_args = [(url, self.base_url, config) for url in urls]
+        results = []
+
+        with Pool(processes=config.max_concurrent_requests) as pool:
+            with Progress(
+                SpinnerColumn(),
+                TextColumn("[progress.description]{task.description}"),
+                BarColumn(),
+                TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
+                TextColumn("({task.completed}/{task.total})"),
+            ) as progress:
+                task = progress.add_task("[cyan]Crawling URLs...", total=len(urls))
 
-                    for result in successful_results:
-                        try:
-                            # Get LLM response
-                            llm_response = agent.process_content(result.url, result.markdown)
-                            if llm_response:
-                                # Add URL to the response before transformation
-                                llm_response['url'] = result.url
-                                # Apply transformation if provided
-                                if config.post_agent_transformer_function:
-                                    try:
-                                        logger.info(f"Starting transformer function execution for {result.url}")
-                                        result.extraction_result = config.post_agent_transformer_function(llm_response)
-                                        logger.info(f"Successfully executed transformer function for {result.url}")
-                                    except KeyError as e:
-                                        # Log missing field but continue with transformation
-                                        missing_field = str(e).strip("'")
-                                        logger.warning(f"Missing field '{missing_field}' in LLM response for {result.url}")
-                                        # Add missing field with empty value
-                                        llm_response[missing_field] = ""
-                                        # Retry transformation with added field
-                                        logger.info(f"Retrying transformer function for {result.url} after adding missing field")
-                                        result.extraction_result = config.post_agent_transformer_function(llm_response)
-                                        logger.info(f"Successfully executed transformer function on retry for {result.url}")
-                                    except Exception as e:
-                                        logger.error(f"Transformer error for {result.url}: {str(e)}")
-                                        result.extraction_result = llm_response  # Use original response if transform fails
-                                else:
-                                    result.extraction_result = llm_response
-                            progress.update(llm_task, advance=1)
-                        except Exception as e:
-                            console.print(f"[red]Error in post-extraction processing for {result.url}: {str(e)}[/red]")
-                            time.sleep(1)  # Add delay after error
-                        time.sleep(0.5)  # Rate limiting between requests
-
-            # Calculate statistics
-            successful = len([r for r in results if r.status == "success"])
-            failed = len([r for r in results if r.status == "failed"])
-
-            # Print summary
-            console.print("\n[green]Parallel processing completed:[/green]")
-            console.print(f"✓ Successful: {successful}")
-            console.print(f"✗ Failed: {failed}")
+                for result in pool.imap_unordered(_process_url_parallel, process_args):
+                    results.append(result)
+                    progress.update(task, advance=1)
+                    status = "✓" if result.status == "success" else "✗"
+                    progress.description = f"[cyan]Last: {status} {result.url}"
 
-            # Save report if enabled
-            if config.save_reports:
-                self._retry_stats = {
-                    "initial_failures": failed,
-                    "failure_ratio": (failed / len(urls)) * 100,
-                    "retry_successful": 0,  # No retries in server parallel mode
-                    "retry_failed": failed
-                }
-                _save_report_sync(results, config, self._retry_stats)
-                console.print(f"📊 Report saved to: {config.report_file}")
+        # Process LLM requests sequentially after all crawling is complete
+        if config.post_extraction_agent:
+            console.print("\n[cyan]Starting post-extraction processing...[/cyan]")
+            successful_results = [r for r in results if r.status == "success"]
 
-            return results
+            with Progress(
+                SpinnerColumn(),
+                TextColumn("[progress.description]{task.description}"),
+                BarColumn(),
+                TaskProgressColumn(),
+            ) as progress:
+                llm_task = progress.add_task("[cyan]Post-extraction processing...", total=len(successful_results))
 
-        except Exception as e:
-            console.print(f"[red]Error during parallel processing: {str(e)}[/red]")
-            # Create failed results for all URLs
-            return [
-                CrawlResult(
-                    url=url,
-                    status="failed",
-                    error=str(e),
-                    config=config.to_dict()
-                ) for url in urls
-            ]
+                post_config = PostExtractionConfig(
+                    model=config.post_extraction_agent["model"],
+                    messages=config.post_extraction_agent["messages"],
+                    api_key=config.post_extraction_agent["api_key"],
+                    max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
+                    temperature=config.post_extraction_agent.get("temperature", 0.7),
+                    base_url=config.post_extraction_agent.get("base_url"),
+                    combine_output=bool(config.post_extraction_agent_save_to_file),
+                    output_file=config.post_extraction_agent_save_to_file,
+                    custom_transform_function=config.post_agent_transformer_function
+                )
+                agent = PostExtractionAgent(post_config)
+
+                for result in successful_results:
+                    try:
+                        # Process content synchronously since it's not an async method
+                        extraction_result = agent.process_content(result.url, result.markdown)
+                        if extraction_result:
+                            result.extraction_result = extraction_result
+                            logger.info(f"Successfully processed and transformed content for {result.url}")
+                        progress.update(llm_task, advance=1)
+                    except Exception as e:
+                        console.print(f"[red]Error in post-extraction processing for {result.url}: {str(e)}[/red]")
+                        time.sleep(1)  # Add delay after error
+                    time.sleep(0.5)  # Rate limiting between requests
+
+        # Calculate statistics
+        successful = len([r for r in results if r.status == "success"])
+        failed = len([r for r in results if r.status == "failed"])
+
+        # Print summary
+        console.print(f"\n[green]Crawling Summary:[/green]")
+        console.print(f"Total URLs processed: {len(urls)}")
+        console.print(f"✓ Successful: {successful}")
+        console.print(f"✗ Failed: {failed}")
+
+        # Save report if enabled
+        if config.save_reports:
+            self._retry_stats = {
+                "initial_failures": failed,
+                "failure_ratio": (failed / len(urls)) * 100,
+                "retry_successful": 0,
+                "retry_failed": failed
+            }
+            _save_report_sync(results, config, self._retry_stats)
+            console.print(f"📊 Report saved to: {config.report_file}")
+
+        return results
 
     def crawl_sitemap_server_parallel(self, sitemap_url: str, config: CrawlConfig) -> List[CrawlResult]:
        """