spiderforce4ai 2.4.1__py3-none-any.whl → 2.4.3__py3-none-any.whl
- spiderforce4ai/__init__.py +102 -68
- {spiderforce4ai-2.4.1.dist-info → spiderforce4ai-2.4.3.dist-info}/METADATA +1 -1
- spiderforce4ai-2.4.3.dist-info/RECORD +7 -0
- spiderforce4ai-2.4.1.dist-info/RECORD +0 -7
- {spiderforce4ai-2.4.1.dist-info → spiderforce4ai-2.4.3.dist-info}/WHEEL +0 -0
- {spiderforce4ai-2.4.1.dist-info → spiderforce4ai-2.4.3.dist-info}/entry_points.txt +0 -0
- {spiderforce4ai-2.4.1.dist-info → spiderforce4ai-2.4.3.dist-info}/top_level.txt +0 -0
spiderforce4ai/__init__.py
CHANGED
@@ -312,28 +312,6 @@ def _process_url_parallel(args: Tuple[str, str, CrawlConfig]) -> CrawlResult:
             config=config.to_dict()
         )

-        # Handle post-extraction if configured
-        if config.post_extraction_agent:
-            try:
-                post_config = PostExtractionConfig(
-                    model=config.post_extraction_agent["model"],
-                    messages=config.post_extraction_agent["messages"],
-                    api_key=config.post_extraction_agent["api_key"],
-                    max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
-                    temperature=config.post_extraction_agent.get("temperature", 0.7),
-                    base_url=config.post_extraction_agent.get("base_url"),
-                    combine_output=bool(config.post_extraction_agent_save_to_file),
-                    output_file=config.post_extraction_agent_save_to_file,
-                    custom_transform_function=config.post_agent_transformer_function
-                )
-
-                agent = PostExtractionAgent(post_config)
-                extraction_result = asyncio.run(agent.process_content(url, markdown))
-                if extraction_result:
-                    result.extraction_result = extraction_result
-            except Exception as e:
-                console.print(f"[red]Error in post-extraction processing for {url}: {str(e)}[/red]")
-
         # Send webhook for successful result
         _send_webhook_sync(result, config)
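The block removed here (and re-added in batched form later in this diff) builds a PostExtractionConfig from the plain dict stored in config.post_extraction_agent. For orientation, a hypothetical caller-side payload matching the keys read above; only the key names and defaults come from the diff, while the example values are assumptions:

# Hypothetical payload; key names and defaults mirror the lookups in the block above.
post_extraction_agent = {
    "model": "gpt-4o-mini",            # required ("model"); example value only
    "messages": [                      # required ("messages"); chat-style prompt for the agent
        {"role": "system", "content": "Return the page title and price as JSON."},
    ],
    "api_key": "YOUR_API_KEY",         # required ("api_key")
    "max_tokens": 1000,                # optional, default 1000
    "temperature": 0.7,                # optional, default 0.7
    "base_url": None,                  # optional, e.g. an OpenAI-compatible endpoint
}
# Passed on the crawl config together with the two related fields read above:
# post_extraction_agent_save_to_file (drives combine_output/output_file) and
# post_agent_transformer_function (becomes custom_transform_function).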
@@ -460,28 +438,6 @@ class SpiderForce4AI:
             if config.output_dir:
                 await _save_markdown_async(url, markdown, config)

-            # Handle post-extraction if configured
-            if config.post_extraction_agent and result.status == "success":
-                try:
-                    post_config = PostExtractionConfig(
-                        model=config.post_extraction_agent["model"],
-                        messages=config.post_extraction_agent["messages"],
-                        api_key=config.post_extraction_agent["api_key"],
-                        max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
-                        temperature=config.post_extraction_agent.get("temperature", 0.7),
-                        base_url=config.post_extraction_agent.get("base_url"),
-                        combine_output=bool(config.post_extraction_agent_save_to_file),
-                        output_file=config.post_extraction_agent_save_to_file,
-                        custom_transform_function=config.post_agent_transformer_function
-                    )
-
-                    agent = PostExtractionAgent(post_config)
-                    extraction_result = await agent.process_content(url, markdown)
-                    if extraction_result:
-                        result.extraction_result = extraction_result
-                except Exception as e:
-                    console.print(f"[red]Error in post-extraction processing for {url}: {str(e)}[/red]")
-
             await _send_webhook_async(result, config)

             self.crawl_results.append(result)
@@ -635,10 +591,40 @@ class SpiderForce4AI:
             except Exception as e:
                 console.print(f"[red]Error in post-extraction processing for {result.url}: {str(e)}[/red]")

+        # Process LLM requests sequentially after all crawling is complete
+        llm_successful = 0
+        if config.post_extraction_agent:
+            console.print("\n[cyan]Starting post-extraction processing...[/cyan]")
+            successful_results = [r for r in results if r.status == "success"]
+            llm_task = progress.add_task("[cyan]Post-extraction processing...", total=len(successful_results))
+
+            post_config = PostExtractionConfig(
+                model=config.post_extraction_agent["model"],
+                messages=config.post_extraction_agent["messages"],
+                api_key=config.post_extraction_agent["api_key"],
+                max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
+                temperature=config.post_extraction_agent.get("temperature", 0.7),
+                base_url=config.post_extraction_agent.get("base_url"),
+                combine_output=bool(config.post_extraction_agent_save_to_file),
+                output_file=config.post_extraction_agent_save_to_file,
+                custom_transform_function=config.post_agent_transformer_function
+            )
+            agent = PostExtractionAgent(post_config)
+
+            for result in successful_results:
+                try:
+                    result.extraction_result = await agent.process_content(result.url, result.markdown)
+                    if result.extraction_result:
+                        llm_successful += 1
+                    progress.update(llm_task, advance=1)
+                except Exception as e:
+                    console.print(f"[red]Error in post-extraction processing for {result.url}: {str(e)}[/red]")
+                    await asyncio.sleep(1)  # Add delay after error
+                await asyncio.sleep(0.5)  # Rate limiting between requests
+
         # Calculate final statistics
         final_successful = len([r for r in results if r.status == "success"])
         final_failed = len([r for r in results if r.status == "failed"])
-        llm_successful = len([r for r in results if r.extraction_result is not None])

         # Update retry stats
         self._retry_stats = {
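The block added above tracks the batched LLM calls with a rich progress bar: this async path reuses the crawl's existing progress instance, while the synchronous paths in the next two hunks open a fresh with Progress(...) block. A minimal, generic sketch of that rich.progress pattern, where the items list and do_work stand in for successful_results and the agent call:

from rich.progress import BarColumn, Progress, SpinnerColumn, TaskProgressColumn, TextColumn

items = ["https://example.com/a", "https://example.com/b"]  # stand-in for successful_results

def do_work(item: str) -> str:
    return f"processed {item}"  # stand-in for the real extraction call

with Progress(
    SpinnerColumn(),
    TextColumn("[progress.description]{task.description}"),
    BarColumn(),
    TaskProgressColumn(),
) as progress:
    task = progress.add_task("[cyan]Post-extraction processing...", total=len(items))
    for item in items:
        do_work(item)
        progress.update(task, advance=1)  # advance one step per processed item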
@@ -725,13 +711,48 @@ class SpiderForce4AI:
             TextColumn("({task.completed}/{task.total})"),
         ) as progress:
             task = progress.add_task("[cyan]Crawling URLs...", total=len(urls))
-
+
             for result in pool.imap_unordered(_process_url_parallel, process_args):
                 results.append(result)
                 progress.update(task, advance=1)
                 status = "✓" if result.status == "success" else "✗"
                 progress.description = f"[cyan]Last: {status} {result.url}"

+        # Process LLM requests sequentially after all crawling is complete
+        if config.post_extraction_agent:
+            console.print("\n[cyan]Starting post-extraction processing...[/cyan]")
+            successful_results = [r for r in results if r.status == "success"]
+
+            with Progress(
+                SpinnerColumn(),
+                TextColumn("[progress.description]{task.description}"),
+                BarColumn(),
+                TaskProgressColumn(),
+            ) as progress:
+                llm_task = progress.add_task("[cyan]Post-extraction processing...", total=len(successful_results))
+
+                post_config = PostExtractionConfig(
+                    model=config.post_extraction_agent["model"],
+                    messages=config.post_extraction_agent["messages"],
+                    api_key=config.post_extraction_agent["api_key"],
+                    max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
+                    temperature=config.post_extraction_agent.get("temperature", 0.7),
+                    base_url=config.post_extraction_agent.get("base_url"),
+                    combine_output=bool(config.post_extraction_agent_save_to_file),
+                    output_file=config.post_extraction_agent_save_to_file,
+                    custom_transform_function=config.post_agent_transformer_function
+                )
+                agent = PostExtractionAgent(post_config)
+
+                for result in successful_results:
+                    try:
+                        result.extraction_result = asyncio.run(agent.process_content(result.url, result.markdown))
+                        progress.update(llm_task, advance=1)
+                    except Exception as e:
+                        console.print(f"[red]Error in post-extraction processing for {result.url}: {str(e)}[/red]")
+                        time.sleep(1)  # Add delay after error
+                    time.sleep(0.5)  # Rate limiting between requests
+
         # Calculate statistics and handle retries
         failed_results = [r for r in results if r.status == "failed"]
         initial_failed = len(failed_results)
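Note the split between the two call styles in this diff: the async path (the -635 hunk above) awaits agent.process_content and rate-limits with asyncio.sleep, while this multiprocessing path and the serial path below run outside an event loop, so they drive the same coroutine with asyncio.run and block with time.sleep. A generic illustration of the difference, with process_content as a stand-in coroutine:

import asyncio
import time

async def process_content(url: str, markdown: str) -> str:
    await asyncio.sleep(0)  # stand-in for the real LLM request
    return f"extracted from {url}"

async def async_path() -> str:
    # Inside an event loop: await the coroutine and sleep without blocking.
    result = await process_content("https://example.com", "# page")
    await asyncio.sleep(0.5)
    return result

def sync_path() -> str:
    # No running event loop: asyncio.run() creates one per call, which is tolerable
    # here because the requests are deliberately processed one at a time.
    result = asyncio.run(process_content("https://example.com", "# page"))
    time.sleep(0.5)
    return result

print(sync_path())

The per-call event loop created by asyncio.run() is cheap relative to a sequential LLM request, which is presumably why the synchronous paths accept it rather than sharing a loop.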
@@ -823,31 +844,44 @@ class SpiderForce4AI:
             if result.status == "success" and config.output_dir and result.markdown:
                 _save_markdown_sync(result.url, result.markdown, config)

-            # Handle post-extraction if configured
-            if config.post_extraction_agent and result.status == "success":
-                try:
-                    post_config = PostExtractionConfig(
-                        model=config.post_extraction_agent["model"],
-                        messages=config.post_extraction_agent["messages"],
-                        api_key=config.post_extraction_agent["api_key"],
-                        max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
-                        temperature=config.post_extraction_agent.get("temperature", 0.7),
-                        base_url=config.post_extraction_agent.get("base_url"),
-                        combine_output=bool(config.post_extraction_agent_save_to_file),
-                        output_file=config.post_extraction_agent_save_to_file,
-                        custom_transform_function=config.post_agent_transformer_function
-                    )
-
-                    agent = PostExtractionAgent(post_config)
-                    extraction_result = asyncio.run(agent.process_content(result.url, result.markdown))
-                    if extraction_result:
-                        result.extraction_result = extraction_result
-                except Exception as e:
-                    console.print(f"[red]Error in post-extraction processing for {result.url}: {str(e)}[/red]")
-
             # Send webhook if configured
             _send_webhook_sync(result, config)
             results.append(result)
+
+        # Process LLM requests sequentially after all crawling is complete
+        if config.post_extraction_agent:
+            console.print("\n[cyan]Starting post-extraction processing...[/cyan]")
+            successful_results = [r for r in results if r.status == "success"]
+
+            with Progress(
+                SpinnerColumn(),
+                TextColumn("[progress.description]{task.description}"),
+                BarColumn(),
+                TaskProgressColumn(),
+            ) as progress:
+                llm_task = progress.add_task("[cyan]Post-extraction processing...", total=len(successful_results))
+
+                post_config = PostExtractionConfig(
+                    model=config.post_extraction_agent["model"],
+                    messages=config.post_extraction_agent["messages"],
+                    api_key=config.post_extraction_agent["api_key"],
+                    max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
+                    temperature=config.post_extraction_agent.get("temperature", 0.7),
+                    base_url=config.post_extraction_agent.get("base_url"),
+                    combine_output=bool(config.post_extraction_agent_save_to_file),
+                    output_file=config.post_extraction_agent_save_to_file,
+                    custom_transform_function=config.post_agent_transformer_function
+                )
+                agent = PostExtractionAgent(post_config)
+
+                for result in successful_results:
+                    try:
+                        result.extraction_result = asyncio.run(agent.process_content(result.url, result.markdown))
+                        progress.update(llm_task, advance=1)
+                    except Exception as e:
+                        console.print(f"[red]Error in post-extraction processing for {result.url}: {str(e)}[/red]")
+                        time.sleep(1)  # Add delay after error
+                    time.sleep(0.5)  # Rate limiting between requests

         # Calculate statistics
         successful = len([r for r in results if r.status == "success"])
spiderforce4ai-2.4.3.dist-info/RECORD
ADDED
@@ -0,0 +1,7 @@
+spiderforce4ai/__init__.py,sha256=iwCLSvooHtFAo-rU52-nsFgyn99Dflpt_OpSrIW-PqA,42273
+spiderforce4ai/post_extraction_agent.py,sha256=m00-y0SCoutUnxsMwHxPaW-qRm4o5alQWjggDStUSrg,11249
+spiderforce4ai-2.4.3.dist-info/METADATA,sha256=-i_vH6DDs4xVFVdDfaFG_Xka0pqXCSQdCrKgym5r5b0,9012
+spiderforce4ai-2.4.3.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+spiderforce4ai-2.4.3.dist-info/entry_points.txt,sha256=ibARQxOlDiL1ho12zbDZt4Uq5RKSIk_qk159ZlZ46hc,59
+spiderforce4ai-2.4.3.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
+spiderforce4ai-2.4.3.dist-info/RECORD,,
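Each RECORD line follows the standard wheel format path,sha256=<urlsafe-base64 SHA-256 digest with padding stripped>,<size in bytes>. A small sketch that recomputes an entry for an installed file so it can be compared against the 2.4.3 RECORD above (the local path is an assumption about where the file sits):

import base64
import hashlib
from pathlib import Path

def record_entry(path: str) -> str:
    # Rebuild a wheel RECORD line: path, urlsafe-base64 SHA-256 without '=' padding, byte size.
    data = Path(path).read_bytes()
    digest = base64.urlsafe_b64encode(hashlib.sha256(data).digest()).rstrip(b"=").decode()
    return f"{path},sha256={digest},{len(data)}"

# For the 2.4.3 wheel this should print:
# spiderforce4ai/__init__.py,sha256=iwCLSvooHtFAo-rU52-nsFgyn99Dflpt_OpSrIW-PqA,42273
print(record_entry("spiderforce4ai/__init__.py"))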
spiderforce4ai-2.4.1.dist-info/RECORD
DELETED
@@ -1,7 +0,0 @@
-spiderforce4ai/__init__.py,sha256=IjoJSE-7PX8zxBF0Pl1ELQUraLU3agAtY_J6NvQSPf4,40533
-spiderforce4ai/post_extraction_agent.py,sha256=m00-y0SCoutUnxsMwHxPaW-qRm4o5alQWjggDStUSrg,11249
-spiderforce4ai-2.4.1.dist-info/METADATA,sha256=xVm-JdLz6Kx73Bi0DA1QG6D9Ya_OLqWd_80PNWHXLsA,9012
-spiderforce4ai-2.4.1.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
-spiderforce4ai-2.4.1.dist-info/entry_points.txt,sha256=ibARQxOlDiL1ho12zbDZt4Uq5RKSIk_qk159ZlZ46hc,59
-spiderforce4ai-2.4.1.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
-spiderforce4ai-2.4.1.dist-info/RECORD,,
{spiderforce4ai-2.4.1.dist-info → spiderforce4ai-2.4.3.dist-info}/WHEEL
File without changes
{spiderforce4ai-2.4.1.dist-info → spiderforce4ai-2.4.3.dist-info}/entry_points.txt
File without changes
{spiderforce4ai-2.4.1.dist-info → spiderforce4ai-2.4.3.dist-info}/top_level.txt
File without changes