spiderforce4ai-2.6.6-py3-none-any.whl → spiderforce4ai-2.6.8-py3-none-any.whl
This diff shows the changes between publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
- spiderforce4ai/__init__.py +89 -134
- spiderforce4ai-2.6.8.dist-info/METADATA +789 -0
- spiderforce4ai-2.6.8.dist-info/RECORD +7 -0
- {spiderforce4ai-2.6.6.dist-info → spiderforce4ai-2.6.8.dist-info}/WHEEL +1 -1
- spiderforce4ai-2.6.6.dist-info/METADATA +0 -336
- spiderforce4ai-2.6.6.dist-info/RECORD +0 -7
- {spiderforce4ai-2.6.6.dist-info → spiderforce4ai-2.6.8.dist-info}/entry_points.txt +0 -0
- {spiderforce4ai-2.6.6.dist-info → spiderforce4ai-2.6.8.dist-info}/top_level.txt +0 -0
spiderforce4ai/__init__.py
CHANGED
@@ -302,7 +302,16 @@ def _process_url_parallel(args: Tuple[str, str, CrawlConfig]) -> CrawlResult:
             _send_webhook_sync(result, config)
             return result
 
-
+        # Parse JSON response - THIS IS WHERE THE ERROR LIKELY OCCURS
+        try:
+            response_data = response.json()
+            # Make sure we're accessing 'markdown' correctly
+            markdown = response_data.get('markdown', '')  # Use get() with default value
+            if not markdown and response.text:  # Fallback to raw text if no markdown
+                markdown = response.text
+        except json.JSONDecodeError:
+            # If response isn't JSON, use raw text
+            markdown = response.text
 
         # Save markdown if output directory is configured
         if config.output_dir:
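The 2.6.8 change above is a reusable pattern: try the JSON body first, prefer its 'markdown' field, and fall back to the raw response text if the field is empty or the body is not JSON. A minimal standalone sketch of the same pattern; the helper name extract_markdown is hypothetical, not part of the package:

    import json
    import requests

    def extract_markdown(response: requests.Response) -> str:
        """Prefer the 'markdown' field of a JSON body; fall back to raw text."""
        try:
            data = response.json()
            markdown = data.get("markdown", "")  # missing key -> empty string
            if not markdown and response.text:
                markdown = response.text  # empty/absent field: use the raw body
        except json.JSONDecodeError:  # requests >= 2.27 raises a subclass of this
            markdown = response.text  # non-JSON body: use the raw text as-is
        return markdown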
@@ -811,144 +820,90 @@ class SpiderForce4AI:
         return results
 
     def crawl_urls_server_parallel(self, urls: List[str], config: CrawlConfig) -> List[CrawlResult]:
-        """
-
-        This uses the /convert_parallel endpoint which handles parallelization on the server.
-        """
-        console.print(f"[cyan]Sending {len(urls)} URLs for parallel processing...[/cyan]")
-
-        try:
-            endpoint = f"{self.base_url}/convert_parallel"
-
-            # Prepare payload
-            payload = {
-                "urls": urls,
-                **config.to_dict()
-            }
-
-            # Send request
-            response = requests.post(
-                endpoint,
-                json=payload,
-                timeout=config.timeout
-            )
-            response.raise_for_status()
-
-            # Process results
-            results = []
-            server_results = response.json()
-
-            for url_result in server_results:
-                result = CrawlResult(
-                    url=url_result["url"],
-                    status=url_result.get("status", "failed"),
-                    markdown=url_result.get("markdown"),
-                    error=url_result.get("error"),
-                    config=config.to_dict()
-                )
-
-                # Save markdown if successful and output dir is configured
-                if result.status == "success" and config.output_dir and result.markdown:
-                    _save_markdown_sync(result.url, result.markdown, config)
-
-                # Send webhook if configured
-                _send_webhook_sync(result, config)
-                results.append(result)
+        """Crawl URLs in parallel using multiprocessing."""
+        console.print(f"[cyan]Processing {len(urls)} URLs in parallel...[/cyan]")
 
-
-
-
-
-
-
-
-
-
-
-                )
-
-
-                post_config = PostExtractionConfig(
-                    model=config.post_extraction_agent["model"],
-                    messages=config.post_extraction_agent["messages"],
-                    api_key=config.post_extraction_agent["api_key"],
-                    max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
-                    temperature=config.post_extraction_agent.get("temperature", 0.7),
-                    base_url=config.post_extraction_agent.get("base_url"),
-                    combine_output=bool(config.post_extraction_agent_save_to_file),
-                    output_file=config.post_extraction_agent_save_to_file,
-                    custom_transform_function=config.post_agent_transformer_function,
-                    response_format=config.post_extraction_agent.get("response_format")
-                )
-                agent = PostExtractionAgent(post_config)
+        # Process URLs in parallel
+        process_args = [(url, self.base_url, config) for url in urls]
+        results = []
+
+        with Pool(processes=config.max_concurrent_requests) as pool:
+            with Progress(
+                SpinnerColumn(),
+                TextColumn("[progress.description]{task.description}"),
+                BarColumn(),
+                TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
+                TextColumn("({task.completed}/{task.total})"),
+            ) as progress:
+                task = progress.add_task("[cyan]Crawling URLs...", total=len(urls))
 
-
-
-
-
-                # Add URL to the response before transformation
-                llm_response['url'] = result.url
-                # Apply transformation if provided
-                if config.post_agent_transformer_function:
-                    try:
-                        logger.info(f"Starting transformer function execution for {result.url}")
-                        result.extraction_result = config.post_agent_transformer_function(llm_response)
-                        logger.info(f"Successfully executed transformer function for {result.url}")
-                    except KeyError as e:
-                        # Log missing field but continue with transformation
-                        missing_field = str(e).strip("'")
-                        logger.warning(f"Missing field '{missing_field}' in LLM response for {result.url}")
-                        # Add missing field with empty value
-                        llm_response[missing_field] = ""
-                        # Retry transformation with added field
-                        logger.info(f"Retrying transformer function for {result.url} after adding missing field")
-                        result.extraction_result = config.post_agent_transformer_function(llm_response)
-                        logger.info(f"Successfully executed transformer function on retry for {result.url}")
-                    except Exception as e:
-                        logger.error(f"Transformer error for {result.url}: {str(e)}")
-                        result.extraction_result = llm_response  # Use original response if transform fails
-                else:
-                    result.extraction_result = llm_response
-                progress.update(llm_task, advance=1)
-            except Exception as e:
-                console.print(f"[red]Error in post-extraction processing for {result.url}: {str(e)}[/red]")
-                time.sleep(1)  # Add delay after error
-            time.sleep(0.5)  # Rate limiting between requests
-
-        # Calculate statistics
-        successful = len([r for r in results if r.status == "success"])
-        failed = len([r for r in results if r.status == "failed"])
-
-        # Print summary
-        console.print("\n[green]Parallel processing completed:[/green]")
-        console.print(f"✓ Successful: {successful}")
-        console.print(f"✗ Failed: {failed}")
+                for result in pool.imap_unordered(_process_url_parallel, process_args):
+                    results.append(result)
+                    progress.update(task, advance=1)
+                    status = "✓" if result.status == "success" else "✗"
+                    progress.description = f"[cyan]Last: {status} {result.url}"
 
-
-
-
-
-                "failure_ratio": (failed / len(urls)) * 100,
-                "retry_successful": 0,  # No retries in server parallel mode
-                "retry_failed": failed
-            }
-            _save_report_sync(results, config, self._retry_stats)
-            console.print(f"📊 Report saved to: {config.report_file}")
+        # Process LLM requests sequentially after all crawling is complete
+        if config.post_extraction_agent:
+            console.print("\n[cyan]Starting post-extraction processing...[/cyan]")
+            successful_results = [r for r in results if r.status == "success"]
 
-
+            with Progress(
+                SpinnerColumn(),
+                TextColumn("[progress.description]{task.description}"),
+                BarColumn(),
+                TaskProgressColumn(),
+            ) as progress:
+                llm_task = progress.add_task("[cyan]Post-extraction processing...", total=len(successful_results))
 
-
-
-
-
-
-
-
-
-
-
-
+                post_config = PostExtractionConfig(
+                    model=config.post_extraction_agent["model"],
+                    messages=config.post_extraction_agent["messages"],
+                    api_key=config.post_extraction_agent["api_key"],
+                    max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
+                    temperature=config.post_extraction_agent.get("temperature", 0.7),
+                    base_url=config.post_extraction_agent.get("base_url"),
+                    combine_output=bool(config.post_extraction_agent_save_to_file),
+                    output_file=config.post_extraction_agent_save_to_file,
+                    custom_transform_function=config.post_agent_transformer_function
+                )
+                agent = PostExtractionAgent(post_config)
+
+                for result in successful_results:
+                    try:
+                        # Process content synchronously since it's not an async method
+                        extraction_result = agent.process_content(result.url, result.markdown)
+                        if extraction_result:
+                            result.extraction_result = extraction_result
+                            logger.info(f"Successfully processed and transformed content for {result.url}")
+                        progress.update(llm_task, advance=1)
+                    except Exception as e:
+                        console.print(f"[red]Error in post-extraction processing for {result.url}: {str(e)}[/red]")
+                        time.sleep(1)  # Add delay after error
+                    time.sleep(0.5)  # Rate limiting between requests
+
+        # Calculate statistics
+        successful = len([r for r in results if r.status == "success"])
+        failed = len([r for r in results if r.status == "failed"])
+
+        # Print summary
+        console.print(f"\n[green]Crawling Summary:[/green]")
+        console.print(f"Total URLs processed: {len(urls)}")
+        console.print(f"✓ Successful: {successful}")
+        console.print(f"✗ Failed: {failed}")
+
+        # Save report if enabled
+        if config.save_reports:
+            self._retry_stats = {
+                "initial_failures": failed,
+                "failure_ratio": (failed / len(urls)) * 100,
+                "retry_successful": 0,
+                "retry_failed": failed
+            }
+            _save_report_sync(results, config, self._retry_stats)
+            console.print(f"📊 Report saved to: {config.report_file}")
+
+        return results
 
     def crawl_sitemap_server_parallel(self, sitemap_url: str, config: CrawlConfig) -> List[CrawlResult]:
         """