spiderforce4ai 2.4.1__py3-none-any.whl → 2.4.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- spiderforce4ai/__init__.py +102 -68
- {spiderforce4ai-2.4.1.dist-info → spiderforce4ai-2.4.3.dist-info}/METADATA +1 -1
- spiderforce4ai-2.4.3.dist-info/RECORD +7 -0
- spiderforce4ai-2.4.1.dist-info/RECORD +0 -7
- {spiderforce4ai-2.4.1.dist-info → spiderforce4ai-2.4.3.dist-info}/WHEEL +0 -0
- {spiderforce4ai-2.4.1.dist-info → spiderforce4ai-2.4.3.dist-info}/entry_points.txt +0 -0
- {spiderforce4ai-2.4.1.dist-info → spiderforce4ai-2.4.3.dist-info}/top_level.txt +0 -0
spiderforce4ai/__init__.py
CHANGED
@@ -312,28 +312,6 @@ def _process_url_parallel(args: Tuple[str, str, CrawlConfig]) -> CrawlResult:
|
|
312
312
|
config=config.to_dict()
|
313
313
|
)
|
314
314
|
|
315
|
-
# Handle post-extraction if configured
|
316
|
-
if config.post_extraction_agent:
|
317
|
-
try:
|
318
|
-
post_config = PostExtractionConfig(
|
319
|
-
model=config.post_extraction_agent["model"],
|
320
|
-
messages=config.post_extraction_agent["messages"],
|
321
|
-
api_key=config.post_extraction_agent["api_key"],
|
322
|
-
max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
|
323
|
-
temperature=config.post_extraction_agent.get("temperature", 0.7),
|
324
|
-
base_url=config.post_extraction_agent.get("base_url"),
|
325
|
-
combine_output=bool(config.post_extraction_agent_save_to_file),
|
326
|
-
output_file=config.post_extraction_agent_save_to_file,
|
327
|
-
custom_transform_function=config.post_agent_transformer_function
|
328
|
-
)
|
329
|
-
|
330
|
-
agent = PostExtractionAgent(post_config)
|
331
|
-
extraction_result = asyncio.run(agent.process_content(url, markdown))
|
332
|
-
if extraction_result:
|
333
|
-
result.extraction_result = extraction_result
|
334
|
-
except Exception as e:
|
335
|
-
console.print(f"[red]Error in post-extraction processing for {url}: {str(e)}[/red]")
|
336
|
-
|
337
315
|
# Send webhook for successful result
|
338
316
|
_send_webhook_sync(result, config)
|
339
317
|
|
@@ -460,28 +438,6 @@ class SpiderForce4AI:
|
|
460
438
|
if config.output_dir:
|
461
439
|
await _save_markdown_async(url, markdown, config)
|
462
440
|
|
463
|
-
# Handle post-extraction if configured
|
464
|
-
if config.post_extraction_agent and result.status == "success":
|
465
|
-
try:
|
466
|
-
post_config = PostExtractionConfig(
|
467
|
-
model=config.post_extraction_agent["model"],
|
468
|
-
messages=config.post_extraction_agent["messages"],
|
469
|
-
api_key=config.post_extraction_agent["api_key"],
|
470
|
-
max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
|
471
|
-
temperature=config.post_extraction_agent.get("temperature", 0.7),
|
472
|
-
base_url=config.post_extraction_agent.get("base_url"),
|
473
|
-
combine_output=bool(config.post_extraction_agent_save_to_file),
|
474
|
-
output_file=config.post_extraction_agent_save_to_file,
|
475
|
-
custom_transform_function=config.post_agent_transformer_function
|
476
|
-
)
|
477
|
-
|
478
|
-
agent = PostExtractionAgent(post_config)
|
479
|
-
extraction_result = await agent.process_content(url, markdown)
|
480
|
-
if extraction_result:
|
481
|
-
result.extraction_result = extraction_result
|
482
|
-
except Exception as e:
|
483
|
-
console.print(f"[red]Error in post-extraction processing for {url}: {str(e)}[/red]")
|
484
|
-
|
485
441
|
await _send_webhook_async(result, config)
|
486
442
|
|
487
443
|
self.crawl_results.append(result)
|
@@ -635,10 +591,40 @@ class SpiderForce4AI:
|
|
635
591
|
except Exception as e:
|
636
592
|
console.print(f"[red]Error in post-extraction processing for {result.url}: {str(e)}[/red]")
|
637
593
|
|
594
|
+
# Process LLM requests sequentially after all crawling is complete
|
595
|
+
llm_successful = 0
|
596
|
+
if config.post_extraction_agent:
|
597
|
+
console.print("\n[cyan]Starting post-extraction processing...[/cyan]")
|
598
|
+
successful_results = [r for r in results if r.status == "success"]
|
599
|
+
llm_task = progress.add_task("[cyan]Post-extraction processing...", total=len(successful_results))
|
600
|
+
|
601
|
+
post_config = PostExtractionConfig(
|
602
|
+
model=config.post_extraction_agent["model"],
|
603
|
+
messages=config.post_extraction_agent["messages"],
|
604
|
+
api_key=config.post_extraction_agent["api_key"],
|
605
|
+
max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
|
606
|
+
temperature=config.post_extraction_agent.get("temperature", 0.7),
|
607
|
+
base_url=config.post_extraction_agent.get("base_url"),
|
608
|
+
combine_output=bool(config.post_extraction_agent_save_to_file),
|
609
|
+
output_file=config.post_extraction_agent_save_to_file,
|
610
|
+
custom_transform_function=config.post_agent_transformer_function
|
611
|
+
)
|
612
|
+
agent = PostExtractionAgent(post_config)
|
613
|
+
|
614
|
+
for result in successful_results:
|
615
|
+
try:
|
616
|
+
result.extraction_result = await agent.process_content(result.url, result.markdown)
|
617
|
+
if result.extraction_result:
|
618
|
+
llm_successful += 1
|
619
|
+
progress.update(llm_task, advance=1)
|
620
|
+
except Exception as e:
|
621
|
+
console.print(f"[red]Error in post-extraction processing for {result.url}: {str(e)}[/red]")
|
622
|
+
await asyncio.sleep(1) # Add delay after error
|
623
|
+
await asyncio.sleep(0.5) # Rate limiting between requests
|
624
|
+
|
638
625
|
# Calculate final statistics
|
639
626
|
final_successful = len([r for r in results if r.status == "success"])
|
640
627
|
final_failed = len([r for r in results if r.status == "failed"])
|
641
|
-
llm_successful = len([r for r in results if r.extraction_result is not None])
|
642
628
|
|
643
629
|
# Update retry stats
|
644
630
|
self._retry_stats = {
|
@@ -725,13 +711,48 @@ class SpiderForce4AI:
|
|
725
711
|
TextColumn("({task.completed}/{task.total})"),
|
726
712
|
) as progress:
|
727
713
|
task = progress.add_task("[cyan]Crawling URLs...", total=len(urls))
|
728
|
-
|
714
|
+
|
729
715
|
for result in pool.imap_unordered(_process_url_parallel, process_args):
|
730
716
|
results.append(result)
|
731
717
|
progress.update(task, advance=1)
|
732
718
|
status = "✓" if result.status == "success" else "✗"
|
733
719
|
progress.description = f"[cyan]Last: {status} {result.url}"
|
734
720
|
|
721
|
+
# Process LLM requests sequentially after all crawling is complete
|
722
|
+
if config.post_extraction_agent:
|
723
|
+
console.print("\n[cyan]Starting post-extraction processing...[/cyan]")
|
724
|
+
successful_results = [r for r in results if r.status == "success"]
|
725
|
+
|
726
|
+
with Progress(
|
727
|
+
SpinnerColumn(),
|
728
|
+
TextColumn("[progress.description]{task.description}"),
|
729
|
+
BarColumn(),
|
730
|
+
TaskProgressColumn(),
|
731
|
+
) as progress:
|
732
|
+
llm_task = progress.add_task("[cyan]Post-extraction processing...", total=len(successful_results))
|
733
|
+
|
734
|
+
post_config = PostExtractionConfig(
|
735
|
+
model=config.post_extraction_agent["model"],
|
736
|
+
messages=config.post_extraction_agent["messages"],
|
737
|
+
api_key=config.post_extraction_agent["api_key"],
|
738
|
+
max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
|
739
|
+
temperature=config.post_extraction_agent.get("temperature", 0.7),
|
740
|
+
base_url=config.post_extraction_agent.get("base_url"),
|
741
|
+
combine_output=bool(config.post_extraction_agent_save_to_file),
|
742
|
+
output_file=config.post_extraction_agent_save_to_file,
|
743
|
+
custom_transform_function=config.post_agent_transformer_function
|
744
|
+
)
|
745
|
+
agent = PostExtractionAgent(post_config)
|
746
|
+
|
747
|
+
for result in successful_results:
|
748
|
+
try:
|
749
|
+
result.extraction_result = asyncio.run(agent.process_content(result.url, result.markdown))
|
750
|
+
progress.update(llm_task, advance=1)
|
751
|
+
except Exception as e:
|
752
|
+
console.print(f"[red]Error in post-extraction processing for {result.url}: {str(e)}[/red]")
|
753
|
+
time.sleep(1) # Add delay after error
|
754
|
+
time.sleep(0.5) # Rate limiting between requests
|
755
|
+
|
735
756
|
# Calculate statistics and handle retries
|
736
757
|
failed_results = [r for r in results if r.status == "failed"]
|
737
758
|
initial_failed = len(failed_results)
|
@@ -823,31 +844,44 @@ class SpiderForce4AI:
|
|
823
844
|
if result.status == "success" and config.output_dir and result.markdown:
|
824
845
|
_save_markdown_sync(result.url, result.markdown, config)
|
825
846
|
|
826
|
-
# Handle post-extraction if configured
|
827
|
-
if config.post_extraction_agent and result.status == "success":
|
828
|
-
try:
|
829
|
-
post_config = PostExtractionConfig(
|
830
|
-
model=config.post_extraction_agent["model"],
|
831
|
-
messages=config.post_extraction_agent["messages"],
|
832
|
-
api_key=config.post_extraction_agent["api_key"],
|
833
|
-
max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
|
834
|
-
temperature=config.post_extraction_agent.get("temperature", 0.7),
|
835
|
-
base_url=config.post_extraction_agent.get("base_url"),
|
836
|
-
combine_output=bool(config.post_extraction_agent_save_to_file),
|
837
|
-
output_file=config.post_extraction_agent_save_to_file,
|
838
|
-
custom_transform_function=config.post_agent_transformer_function
|
839
|
-
)
|
840
|
-
|
841
|
-
agent = PostExtractionAgent(post_config)
|
842
|
-
extraction_result = asyncio.run(agent.process_content(result.url, result.markdown))
|
843
|
-
if extraction_result:
|
844
|
-
result.extraction_result = extraction_result
|
845
|
-
except Exception as e:
|
846
|
-
console.print(f"[red]Error in post-extraction processing for {result.url}: {str(e)}[/red]")
|
847
|
-
|
848
847
|
# Send webhook if configured
|
849
848
|
_send_webhook_sync(result, config)
|
850
849
|
results.append(result)
|
850
|
+
|
851
|
+
# Process LLM requests sequentially after all crawling is complete
|
852
|
+
if config.post_extraction_agent:
|
853
|
+
console.print("\n[cyan]Starting post-extraction processing...[/cyan]")
|
854
|
+
successful_results = [r for r in results if r.status == "success"]
|
855
|
+
|
856
|
+
with Progress(
|
857
|
+
SpinnerColumn(),
|
858
|
+
TextColumn("[progress.description]{task.description}"),
|
859
|
+
BarColumn(),
|
860
|
+
TaskProgressColumn(),
|
861
|
+
) as progress:
|
862
|
+
llm_task = progress.add_task("[cyan]Post-extraction processing...", total=len(successful_results))
|
863
|
+
|
864
|
+
post_config = PostExtractionConfig(
|
865
|
+
model=config.post_extraction_agent["model"],
|
866
|
+
messages=config.post_extraction_agent["messages"],
|
867
|
+
api_key=config.post_extraction_agent["api_key"],
|
868
|
+
max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
|
869
|
+
temperature=config.post_extraction_agent.get("temperature", 0.7),
|
870
|
+
base_url=config.post_extraction_agent.get("base_url"),
|
871
|
+
combine_output=bool(config.post_extraction_agent_save_to_file),
|
872
|
+
output_file=config.post_extraction_agent_save_to_file,
|
873
|
+
custom_transform_function=config.post_agent_transformer_function
|
874
|
+
)
|
875
|
+
agent = PostExtractionAgent(post_config)
|
876
|
+
|
877
|
+
for result in successful_results:
|
878
|
+
try:
|
879
|
+
result.extraction_result = asyncio.run(agent.process_content(result.url, result.markdown))
|
880
|
+
progress.update(llm_task, advance=1)
|
881
|
+
except Exception as e:
|
882
|
+
console.print(f"[red]Error in post-extraction processing for {result.url}: {str(e)}[/red]")
|
883
|
+
time.sleep(1) # Add delay after error
|
884
|
+
time.sleep(0.5) # Rate limiting between requests
|
851
885
|
|
852
886
|
# Calculate statistics
|
853
887
|
successful = len([r for r in results if r.status == "success"])
|
@@ -0,0 +1,7 @@
|
|
1
|
+
spiderforce4ai/__init__.py,sha256=iwCLSvooHtFAo-rU52-nsFgyn99Dflpt_OpSrIW-PqA,42273
|
2
|
+
spiderforce4ai/post_extraction_agent.py,sha256=m00-y0SCoutUnxsMwHxPaW-qRm4o5alQWjggDStUSrg,11249
|
3
|
+
spiderforce4ai-2.4.3.dist-info/METADATA,sha256=-i_vH6DDs4xVFVdDfaFG_Xka0pqXCSQdCrKgym5r5b0,9012
|
4
|
+
spiderforce4ai-2.4.3.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
5
|
+
spiderforce4ai-2.4.3.dist-info/entry_points.txt,sha256=ibARQxOlDiL1ho12zbDZt4Uq5RKSIk_qk159ZlZ46hc,59
|
6
|
+
spiderforce4ai-2.4.3.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
|
7
|
+
spiderforce4ai-2.4.3.dist-info/RECORD,,
|
@@ -1,7 +0,0 @@
|
|
1
|
-
spiderforce4ai/__init__.py,sha256=IjoJSE-7PX8zxBF0Pl1ELQUraLU3agAtY_J6NvQSPf4,40533
|
2
|
-
spiderforce4ai/post_extraction_agent.py,sha256=m00-y0SCoutUnxsMwHxPaW-qRm4o5alQWjggDStUSrg,11249
|
3
|
-
spiderforce4ai-2.4.1.dist-info/METADATA,sha256=xVm-JdLz6Kx73Bi0DA1QG6D9Ya_OLqWd_80PNWHXLsA,9012
|
4
|
-
spiderforce4ai-2.4.1.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
5
|
-
spiderforce4ai-2.4.1.dist-info/entry_points.txt,sha256=ibARQxOlDiL1ho12zbDZt4Uq5RKSIk_qk159ZlZ46hc,59
|
6
|
-
spiderforce4ai-2.4.1.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
|
7
|
-
spiderforce4ai-2.4.1.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|