spiderforce4ai 2.6.6__py3-none-any.whl → 2.6.7__py3-none-any.whl

@@ -302,7 +302,16 @@ def _process_url_parallel(args: Tuple[str, str, CrawlConfig]) -> CrawlResult:
             _send_webhook_sync(result, config)
             return result
 
-        markdown = response.text
+        # Parse JSON response - THIS IS WHERE THE ERROR LIKELY OCCURS
+        try:
+            response_data = response.json()
+            # Make sure we're accessing 'markdown' correctly
+            markdown = response_data.get('markdown', '')  # Use get() with default value
+            if not markdown and response.text:  # Fallback to raw text if no markdown
+                markdown = response.text
+        except json.JSONDecodeError:
+            # If response isn't JSON, use raw text
+            markdown = response.text
 
         # Save markdown if output directory is configured
         if config.output_dir:
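
For context, here is a minimal standalone sketch of the JSON-with-fallback parsing pattern the hunk above introduces. The helper name, endpoint, and payload below are illustrative assumptions, not part of the package's API:

```python
import json

import requests


def fetch_markdown(endpoint: str, payload: dict, timeout: int = 60) -> str:
    """Return the 'markdown' field of a JSON response, falling back to the raw body."""
    response = requests.post(endpoint, json=payload, timeout=timeout)
    response.raise_for_status()
    try:
        data = response.json()
        # Prefer the 'markdown' key; fall back to the raw text if it is missing or empty.
        markdown = data.get("markdown", "") if isinstance(data, dict) else ""
        return markdown or response.text
    except json.JSONDecodeError:
        # Non-JSON body: treat it as markdown directly.
        return response.text
```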
@@ -811,144 +820,90 @@ class SpiderForce4AI:
         return results
 
     def crawl_urls_server_parallel(self, urls: List[str], config: CrawlConfig) -> List[CrawlResult]:
-        """
-        Crawl multiple URLs using server-side parallel processing.
-        This uses the /convert_parallel endpoint which handles parallelization on the server.
-        """
-        console.print(f"[cyan]Sending {len(urls)} URLs for parallel processing...[/cyan]")
-
-        try:
-            endpoint = f"{self.base_url}/convert_parallel"
-
-            # Prepare payload
-            payload = {
-                "urls": urls,
-                **config.to_dict()
-            }
-
-            # Send request
-            response = requests.post(
-                endpoint,
-                json=payload,
-                timeout=config.timeout
-            )
-            response.raise_for_status()
-
-            # Process results
-            results = []
-            server_results = response.json()
-
-            for url_result in server_results:
-                result = CrawlResult(
-                    url=url_result["url"],
-                    status=url_result.get("status", "failed"),
-                    markdown=url_result.get("markdown"),
-                    error=url_result.get("error"),
-                    config=config.to_dict()
-                )
-
-                # Save markdown if successful and output dir is configured
-                if result.status == "success" and config.output_dir and result.markdown:
-                    _save_markdown_sync(result.url, result.markdown, config)
-
-                # Send webhook if configured
-                _send_webhook_sync(result, config)
-                results.append(result)
+        """Crawl URLs in parallel using multiprocessing."""
+        console.print(f"[cyan]Processing {len(urls)} URLs in parallel...[/cyan]")
 
-            # Process LLM requests sequentially after all crawling is complete
-            if config.post_extraction_agent:
-                console.print("\n[cyan]Starting post-extraction processing...[/cyan]")
-                successful_results = [r for r in results if r.status == "success"]
-
-                with Progress(
-                    SpinnerColumn(),
-                    TextColumn("[progress.description]{task.description}"),
-                    BarColumn(),
-                    TaskProgressColumn(),
-                ) as progress:
-                    llm_task = progress.add_task("[cyan]Post-extraction processing...", total=len(successful_results))
-
-                    post_config = PostExtractionConfig(
-                        model=config.post_extraction_agent["model"],
-                        messages=config.post_extraction_agent["messages"],
-                        api_key=config.post_extraction_agent["api_key"],
-                        max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
-                        temperature=config.post_extraction_agent.get("temperature", 0.7),
-                        base_url=config.post_extraction_agent.get("base_url"),
-                        combine_output=bool(config.post_extraction_agent_save_to_file),
-                        output_file=config.post_extraction_agent_save_to_file,
-                        custom_transform_function=config.post_agent_transformer_function,
-                        response_format=config.post_extraction_agent.get("response_format")
-                    )
-                    agent = PostExtractionAgent(post_config)
+        # Process URLs in parallel
+        process_args = [(url, self.base_url, config) for url in urls]
+        results = []
+
+        with Pool(processes=config.max_concurrent_requests) as pool:
+            with Progress(
+                SpinnerColumn(),
+                TextColumn("[progress.description]{task.description}"),
+                BarColumn(),
+                TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
+                TextColumn("({task.completed}/{task.total})"),
+            ) as progress:
+                task = progress.add_task("[cyan]Crawling URLs...", total=len(urls))
 
-                    for result in successful_results:
-                        try:
-                            # Get LLM response
-                            llm_response = agent.process_content(result.url, result.markdown)
-                            if llm_response:
-                                # Add URL to the response before transformation
-                                llm_response['url'] = result.url
-                                # Apply transformation if provided
-                                if config.post_agent_transformer_function:
-                                    try:
-                                        logger.info(f"Starting transformer function execution for {result.url}")
-                                        result.extraction_result = config.post_agent_transformer_function(llm_response)
-                                        logger.info(f"Successfully executed transformer function for {result.url}")
-                                    except KeyError as e:
-                                        # Log missing field but continue with transformation
-                                        missing_field = str(e).strip("'")
-                                        logger.warning(f"Missing field '{missing_field}' in LLM response for {result.url}")
-                                        # Add missing field with empty value
-                                        llm_response[missing_field] = ""
-                                        # Retry transformation with added field
-                                        logger.info(f"Retrying transformer function for {result.url} after adding missing field")
-                                        result.extraction_result = config.post_agent_transformer_function(llm_response)
-                                        logger.info(f"Successfully executed transformer function on retry for {result.url}")
-                                    except Exception as e:
-                                        logger.error(f"Transformer error for {result.url}: {str(e)}")
-                                        result.extraction_result = llm_response  # Use original response if transform fails
-                                else:
-                                    result.extraction_result = llm_response
-                            progress.update(llm_task, advance=1)
-                        except Exception as e:
-                            console.print(f"[red]Error in post-extraction processing for {result.url}: {str(e)}[/red]")
-                            time.sleep(1)  # Add delay after error
-                        time.sleep(0.5)  # Rate limiting between requests
-
-            # Calculate statistics
-            successful = len([r for r in results if r.status == "success"])
-            failed = len([r for r in results if r.status == "failed"])
-
-            # Print summary
-            console.print("\n[green]Parallel processing completed:[/green]")
-            console.print(f"✓ Successful: {successful}")
-            console.print(f"✗ Failed: {failed}")
+                for result in pool.imap_unordered(_process_url_parallel, process_args):
+                    results.append(result)
+                    progress.update(task, advance=1)
+                    status = "✓" if result.status == "success" else "✗"
+                    progress.description = f"[cyan]Last: {status} {result.url}"
 
-            # Save report if enabled
-            if config.save_reports:
-                self._retry_stats = {
-                    "initial_failures": failed,
-                    "failure_ratio": (failed / len(urls)) * 100,
-                    "retry_successful": 0,  # No retries in server parallel mode
-                    "retry_failed": failed
-                }
-                _save_report_sync(results, config, self._retry_stats)
-                console.print(f"📊 Report saved to: {config.report_file}")
+        # Process LLM requests sequentially after all crawling is complete
+        if config.post_extraction_agent:
+            console.print("\n[cyan]Starting post-extraction processing...[/cyan]")
+            successful_results = [r for r in results if r.status == "success"]
 
-            return results
+            with Progress(
+                SpinnerColumn(),
+                TextColumn("[progress.description]{task.description}"),
+                BarColumn(),
+                TaskProgressColumn(),
+            ) as progress:
+                llm_task = progress.add_task("[cyan]Post-extraction processing...", total=len(successful_results))
 
-        except Exception as e:
-            console.print(f"[red]Error during parallel processing: {str(e)}[/red]")
-            # Create failed results for all URLs
-            return [
-                CrawlResult(
-                    url=url,
-                    status="failed",
-                    error=str(e),
-                    config=config.to_dict()
-                ) for url in urls
-            ]
+                post_config = PostExtractionConfig(
+                    model=config.post_extraction_agent["model"],
+                    messages=config.post_extraction_agent["messages"],
+                    api_key=config.post_extraction_agent["api_key"],
+                    max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
+                    temperature=config.post_extraction_agent.get("temperature", 0.7),
+                    base_url=config.post_extraction_agent.get("base_url"),
+                    combine_output=bool(config.post_extraction_agent_save_to_file),
+                    output_file=config.post_extraction_agent_save_to_file,
+                    custom_transform_function=config.post_agent_transformer_function
+                )
+                agent = PostExtractionAgent(post_config)
+
+                for result in successful_results:
+                    try:
+                        # Process content synchronously since it's not an async method
+                        extraction_result = agent.process_content(result.url, result.markdown)
+                        if extraction_result:
+                            result.extraction_result = extraction_result
+                            logger.info(f"Successfully processed and transformed content for {result.url}")
+                        progress.update(llm_task, advance=1)
+                    except Exception as e:
+                        console.print(f"[red]Error in post-extraction processing for {result.url}: {str(e)}[/red]")
+                        time.sleep(1)  # Add delay after error
+                    time.sleep(0.5)  # Rate limiting between requests
+
+        # Calculate statistics
+        successful = len([r for r in results if r.status == "success"])
+        failed = len([r for r in results if r.status == "failed"])
+
+        # Print summary
+        console.print(f"\n[green]Crawling Summary:[/green]")
+        console.print(f"Total URLs processed: {len(urls)}")
+        console.print(f"✓ Successful: {successful}")
+        console.print(f"✗ Failed: {failed}")
+
+        # Save report if enabled
+        if config.save_reports:
+            self._retry_stats = {
+                "initial_failures": failed,
+                "failure_ratio": (failed / len(urls)) * 100,
+                "retry_successful": 0,
+                "retry_failed": failed
+            }
+            _save_report_sync(results, config, self._retry_stats)
+            console.print(f"📊 Report saved to: {config.report_file}")
+
+        return results
 
     def crawl_sitemap_server_parallel(self, sitemap_url: str, config: CrawlConfig) -> List[CrawlResult]:
         """
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: spiderforce4ai
-Version: 2.6.6
+Version: 2.6.7
 Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service with LLM post-processing
 Home-page: https://petertam.pro
 Author: Piotr Tamulewicz
@@ -0,0 +1,7 @@
+spiderforce4ai/__init__.py,sha256=p_ybuwvTD7bTelORBzAkomUQrc69WvOmu3owHKlzp0A,42231
+spiderforce4ai/post_extraction_agent.py,sha256=7N2VYCfsfIh-my-Sc0_lnhmsfb3nyIbDOpnI007M1DM,19075
+spiderforce4ai-2.6.7.dist-info/METADATA,sha256=5Tjsk-VHFD81TDxfh2LAClnoWa99BwBbNja9-681rZI,9012
+spiderforce4ai-2.6.7.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+spiderforce4ai-2.6.7.dist-info/entry_points.txt,sha256=ibARQxOlDiL1ho12zbDZt4Uq5RKSIk_qk159ZlZ46hc,59
+spiderforce4ai-2.6.7.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
+spiderforce4ai-2.6.7.dist-info/RECORD,,
@@ -1,7 +0,0 @@
-spiderforce4ai/__init__.py,sha256=BlrRIrtpDUHjmDedqgXP1KbAAsAH9vwFPncUR5VGGyM,44804
-spiderforce4ai/post_extraction_agent.py,sha256=7N2VYCfsfIh-my-Sc0_lnhmsfb3nyIbDOpnI007M1DM,19075
-spiderforce4ai-2.6.6.dist-info/METADATA,sha256=eoFT4zgeNK3TkBEF5pKnf5IducFbm1quZnndCuXPf-c,9012
-spiderforce4ai-2.6.6.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
-spiderforce4ai-2.6.6.dist-info/entry_points.txt,sha256=ibARQxOlDiL1ho12zbDZt4Uq5RKSIk_qk159ZlZ46hc,59
-spiderforce4ai-2.6.6.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
-spiderforce4ai-2.6.6.dist-info/RECORD,,