spiderforce4ai 2.6.6__tar.gz → 2.6.7__tar.gz
- {spiderforce4ai-2.6.6 → spiderforce4ai-2.6.7}/PKG-INFO +1 -1
- {spiderforce4ai-2.6.6 → spiderforce4ai-2.6.7}/pyproject.toml +1 -1
- {spiderforce4ai-2.6.6 → spiderforce4ai-2.6.7}/setup.py +1 -1
- {spiderforce4ai-2.6.6 → spiderforce4ai-2.6.7}/spiderforce4ai/__init__.py +89 -134
- {spiderforce4ai-2.6.6 → spiderforce4ai-2.6.7}/spiderforce4ai.egg-info/PKG-INFO +1 -1
- {spiderforce4ai-2.6.6 → spiderforce4ai-2.6.7}/README.md +0 -0
- {spiderforce4ai-2.6.6 → spiderforce4ai-2.6.7}/setup.cfg +0 -0
- {spiderforce4ai-2.6.6 → spiderforce4ai-2.6.7}/spiderforce4ai/post_extraction_agent.py +0 -0
- {spiderforce4ai-2.6.6 → spiderforce4ai-2.6.7}/spiderforce4ai.egg-info/SOURCES.txt +0 -0
- {spiderforce4ai-2.6.6 → spiderforce4ai-2.6.7}/spiderforce4ai.egg-info/dependency_links.txt +0 -0
- {spiderforce4ai-2.6.6 → spiderforce4ai-2.6.7}/spiderforce4ai.egg-info/entry_points.txt +0 -0
- {spiderforce4ai-2.6.6 → spiderforce4ai-2.6.7}/spiderforce4ai.egg-info/not-zip-safe +0 -0
- {spiderforce4ai-2.6.6 → spiderforce4ai-2.6.7}/spiderforce4ai.egg-info/requires.txt +0 -0
- {spiderforce4ai-2.6.6 → spiderforce4ai-2.6.7}/spiderforce4ai.egg-info/top_level.txt +0 -0
--- spiderforce4ai-2.6.6/pyproject.toml
+++ spiderforce4ai-2.6.7/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "spiderforce4ai"
-version = "2.6.6"
+version = "2.6.7"
 description = "Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service with LLM post-processing"
 readme = "README.md"
 authors = [
--- spiderforce4ai-2.6.6/setup.py
+++ spiderforce4ai-2.6.7/setup.py
@@ -7,7 +7,7 @@ with open("README.md", encoding="utf-8") as f:
 
 setup(
     name="spiderforce4ai",
-    version="2.6.6",
+    version="2.6.7",
     author="Piotr Tamulewicz",
     author_email="pt@petertam.pro",
     description="Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service with LLM post-processing",
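Both files carry the same bump to version 2.6.7. If it helps to confirm which release is installed after upgrading, the standard library can read the distribution metadata back; nothing below is specific to this package beyond its distribution name, which is assumed to match the name fields above.

import importlib.metadata

# Prints the installed distribution version, e.g. "2.6.7" after this release.
print(importlib.metadata.version("spiderforce4ai"))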
--- spiderforce4ai-2.6.6/spiderforce4ai/__init__.py
+++ spiderforce4ai-2.6.7/spiderforce4ai/__init__.py
@@ -302,7 +302,16 @@ def _process_url_parallel(args: Tuple[str, str, CrawlConfig]) -> CrawlResult:
         _send_webhook_sync(result, config)
         return result
 
-
+    # Parse JSON response - THIS IS WHERE THE ERROR LIKELY OCCURS
+    try:
+        response_data = response.json()
+        # Make sure we're accessing 'markdown' correctly
+        markdown = response_data.get('markdown', '')  # Use get() with default value
+        if not markdown and response.text:  # Fallback to raw text if no markdown
+            markdown = response.text
+    except json.JSONDecodeError:
+        # If response isn't JSON, use raw text
+        markdown = response.text
 
     # Save markdown if output directory is configured
     if config.output_dir:
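The hunk above makes the markdown extraction in _process_url_parallel tolerant of non-JSON responses and of a missing 'markdown' key. A minimal standalone sketch of the same pattern is shown below; fetch_markdown and its endpoint argument are illustrative names for this example, not part of the package API.

import json
import requests

def fetch_markdown(endpoint: str, timeout: int = 30) -> str:
    """Prefer the 'markdown' field of a JSON body, falling back to raw text."""
    response = requests.get(endpoint, timeout=timeout)
    response.raise_for_status()
    try:
        data = response.json()
        markdown = data.get("markdown", "")   # tolerate a missing 'markdown' key
        if not markdown and response.text:    # fall back to the raw body
            markdown = response.text
    except json.JSONDecodeError:              # body was not JSON at all
        markdown = response.text
    return markdown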
--- spiderforce4ai-2.6.6/spiderforce4ai/__init__.py
+++ spiderforce4ai-2.6.7/spiderforce4ai/__init__.py
@@ -811,144 +820,90 @@ class SpiderForce4AI:
         return results
 
     def crawl_urls_server_parallel(self, urls: List[str], config: CrawlConfig) -> List[CrawlResult]:
-        """
-
-        This uses the /convert_parallel endpoint which handles parallelization on the server.
-        """
-        console.print(f"[cyan]Sending {len(urls)} URLs for parallel processing...[/cyan]")
-
-        try:
-            endpoint = f"{self.base_url}/convert_parallel"
-
-            # Prepare payload
-            payload = {
-                "urls": urls,
-                **config.to_dict()
-            }
-
-            # Send request
-            response = requests.post(
-                endpoint,
-                json=payload,
-                timeout=config.timeout
-            )
-            response.raise_for_status()
-
-            # Process results
-            results = []
-            server_results = response.json()
-
-            for url_result in server_results:
-                result = CrawlResult(
-                    url=url_result["url"],
-                    status=url_result.get("status", "failed"),
-                    markdown=url_result.get("markdown"),
-                    error=url_result.get("error"),
-                    config=config.to_dict()
-                )
-
-                # Save markdown if successful and output dir is configured
-                if result.status == "success" and config.output_dir and result.markdown:
-                    _save_markdown_sync(result.url, result.markdown, config)
-
-                # Send webhook if configured
-                _send_webhook_sync(result, config)
-                results.append(result)
+        """Crawl URLs in parallel using multiprocessing."""
+        console.print(f"[cyan]Processing {len(urls)} URLs in parallel...[/cyan]")
 
-
-
-
-
-
-
-
-
-
-
-            )
-
-
-            post_config = PostExtractionConfig(
-                model=config.post_extraction_agent["model"],
-                messages=config.post_extraction_agent["messages"],
-                api_key=config.post_extraction_agent["api_key"],
-                max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
-                temperature=config.post_extraction_agent.get("temperature", 0.7),
-                base_url=config.post_extraction_agent.get("base_url"),
-                combine_output=bool(config.post_extraction_agent_save_to_file),
-                output_file=config.post_extraction_agent_save_to_file,
-                custom_transform_function=config.post_agent_transformer_function,
-                response_format=config.post_extraction_agent.get("response_format")
-            )
-            agent = PostExtractionAgent(post_config)
+        # Process URLs in parallel
+        process_args = [(url, self.base_url, config) for url in urls]
+        results = []
+
+        with Pool(processes=config.max_concurrent_requests) as pool:
+            with Progress(
+                SpinnerColumn(),
+                TextColumn("[progress.description]{task.description}"),
+                BarColumn(),
+                TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
+                TextColumn("({task.completed}/{task.total})"),
+            ) as progress:
+                task = progress.add_task("[cyan]Crawling URLs...", total=len(urls))
 
-
-
-
-
-
-                        # Add URL to the response before transformation
-                        llm_response['url'] = result.url
-                        # Apply transformation if provided
-                        if config.post_agent_transformer_function:
-                            try:
-                                logger.info(f"Starting transformer function execution for {result.url}")
-                                result.extraction_result = config.post_agent_transformer_function(llm_response)
-                                logger.info(f"Successfully executed transformer function for {result.url}")
-                            except KeyError as e:
-                                # Log missing field but continue with transformation
-                                missing_field = str(e).strip("'")
-                                logger.warning(f"Missing field '{missing_field}' in LLM response for {result.url}")
-                                # Add missing field with empty value
-                                llm_response[missing_field] = ""
-                                # Retry transformation with added field
-                                logger.info(f"Retrying transformer function for {result.url} after adding missing field")
-                                result.extraction_result = config.post_agent_transformer_function(llm_response)
-                                logger.info(f"Successfully executed transformer function on retry for {result.url}")
-                            except Exception as e:
-                                logger.error(f"Transformer error for {result.url}: {str(e)}")
-                                result.extraction_result = llm_response  # Use original response if transform fails
-                        else:
-                            result.extraction_result = llm_response
-                        progress.update(llm_task, advance=1)
-                    except Exception as e:
-                        console.print(f"[red]Error in post-extraction processing for {result.url}: {str(e)}[/red]")
-                        time.sleep(1)  # Add delay after error
-                    time.sleep(0.5)  # Rate limiting between requests
-
-            # Calculate statistics
-            successful = len([r for r in results if r.status == "success"])
-            failed = len([r for r in results if r.status == "failed"])
-
-            # Print summary
-            console.print("\n[green]Parallel processing completed:[/green]")
-            console.print(f"✓ Successful: {successful}")
-            console.print(f"✗ Failed: {failed}")
+                for result in pool.imap_unordered(_process_url_parallel, process_args):
+                    results.append(result)
+                    progress.update(task, advance=1)
+                    status = "✓" if result.status == "success" else "✗"
+                    progress.description = f"[cyan]Last: {status} {result.url}"
 
-
-
-
-
-                "failure_ratio": (failed / len(urls)) * 100,
-                "retry_successful": 0,  # No retries in server parallel mode
-                "retry_failed": failed
-            }
-            _save_report_sync(results, config, self._retry_stats)
-            console.print(f"📊 Report saved to: {config.report_file}")
+        # Process LLM requests sequentially after all crawling is complete
+        if config.post_extraction_agent:
+            console.print("\n[cyan]Starting post-extraction processing...[/cyan]")
+            successful_results = [r for r in results if r.status == "success"]
 
-
+            with Progress(
+                SpinnerColumn(),
+                TextColumn("[progress.description]{task.description}"),
+                BarColumn(),
+                TaskProgressColumn(),
+            ) as progress:
+                llm_task = progress.add_task("[cyan]Post-extraction processing...", total=len(successful_results))
 
-
-
-
-
-
-
-
-
-
-
-
+                post_config = PostExtractionConfig(
+                    model=config.post_extraction_agent["model"],
+                    messages=config.post_extraction_agent["messages"],
+                    api_key=config.post_extraction_agent["api_key"],
+                    max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
+                    temperature=config.post_extraction_agent.get("temperature", 0.7),
+                    base_url=config.post_extraction_agent.get("base_url"),
+                    combine_output=bool(config.post_extraction_agent_save_to_file),
+                    output_file=config.post_extraction_agent_save_to_file,
+                    custom_transform_function=config.post_agent_transformer_function
+                )
+                agent = PostExtractionAgent(post_config)
+
+                for result in successful_results:
+                    try:
+                        # Process content synchronously since it's not an async method
+                        extraction_result = agent.process_content(result.url, result.markdown)
+                        if extraction_result:
+                            result.extraction_result = extraction_result
+                            logger.info(f"Successfully processed and transformed content for {result.url}")
+                        progress.update(llm_task, advance=1)
+                    except Exception as e:
+                        console.print(f"[red]Error in post-extraction processing for {result.url}: {str(e)}[/red]")
+                        time.sleep(1)  # Add delay after error
+                    time.sleep(0.5)  # Rate limiting between requests
+
+        # Calculate statistics
+        successful = len([r for r in results if r.status == "success"])
+        failed = len([r for r in results if r.status == "failed"])
+
+        # Print summary
+        console.print(f"\n[green]Crawling Summary:[/green]")
+        console.print(f"Total URLs processed: {len(urls)}")
+        console.print(f"✓ Successful: {successful}")
+        console.print(f"✗ Failed: {failed}")
+
+        # Save report if enabled
+        if config.save_reports:
+            self._retry_stats = {
+                "initial_failures": failed,
+                "failure_ratio": (failed / len(urls)) * 100,
+                "retry_successful": 0,
+                "retry_failed": failed
+            }
+            _save_report_sync(results, config, self._retry_stats)
+            console.print(f"📊 Report saved to: {config.report_file}")
+
+        return results
 
     def crawl_sitemap_server_parallel(self, sitemap_url: str, config: CrawlConfig) -> List[CrawlResult]:
         """