spiderforce4ai 2.4.2__py3-none-any.whl → 2.4.3__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- spiderforce4ai/__init__.py +71 -45
- {spiderforce4ai-2.4.2.dist-info → spiderforce4ai-2.4.3.dist-info}/METADATA +1 -1
- spiderforce4ai-2.4.3.dist-info/RECORD +7 -0
- spiderforce4ai-2.4.2.dist-info/RECORD +0 -7
- {spiderforce4ai-2.4.2.dist-info → spiderforce4ai-2.4.3.dist-info}/WHEEL +0 -0
- {spiderforce4ai-2.4.2.dist-info → spiderforce4ai-2.4.3.dist-info}/entry_points.txt +0 -0
- {spiderforce4ai-2.4.2.dist-info → spiderforce4ai-2.4.3.dist-info}/top_level.txt +0 -0
spiderforce4ai/__init__.py
CHANGED
@@ -312,28 +312,6 @@ def _process_url_parallel(args: Tuple[str, str, CrawlConfig]) -> CrawlResult:
|
|
312
312
|
config=config.to_dict()
|
313
313
|
)
|
314
314
|
|
315
|
-
# Handle post-extraction if configured
|
316
|
-
if config.post_extraction_agent:
|
317
|
-
try:
|
318
|
-
post_config = PostExtractionConfig(
|
319
|
-
model=config.post_extraction_agent["model"],
|
320
|
-
messages=config.post_extraction_agent["messages"],
|
321
|
-
api_key=config.post_extraction_agent["api_key"],
|
322
|
-
max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
|
323
|
-
temperature=config.post_extraction_agent.get("temperature", 0.7),
|
324
|
-
base_url=config.post_extraction_agent.get("base_url"),
|
325
|
-
combine_output=bool(config.post_extraction_agent_save_to_file),
|
326
|
-
output_file=config.post_extraction_agent_save_to_file,
|
327
|
-
custom_transform_function=config.post_agent_transformer_function
|
328
|
-
)
|
329
|
-
|
330
|
-
agent = PostExtractionAgent(post_config)
|
331
|
-
extraction_result = asyncio.run(agent.process_content(url, markdown))
|
332
|
-
if extraction_result:
|
333
|
-
result.extraction_result = extraction_result
|
334
|
-
except Exception as e:
|
335
|
-
console.print(f"[red]Error in post-extraction processing for {url}: {str(e)}[/red]")
|
336
|
-
|
337
315
|
# Send webhook for successful result
|
338
316
|
_send_webhook_sync(result, config)
|
339
317
|
|
@@ -733,13 +711,48 @@ class SpiderForce4AI:
|
|
733
711
|
TextColumn("({task.completed}/{task.total})"),
|
734
712
|
) as progress:
|
735
713
|
task = progress.add_task("[cyan]Crawling URLs...", total=len(urls))
|
736
|
-
|
714
|
+
|
737
715
|
for result in pool.imap_unordered(_process_url_parallel, process_args):
|
738
716
|
results.append(result)
|
739
717
|
progress.update(task, advance=1)
|
740
718
|
status = "✓" if result.status == "success" else "✗"
|
741
719
|
progress.description = f"[cyan]Last: {status} {result.url}"
|
742
720
|
|
721
|
+
# Process LLM requests sequentially after all crawling is complete
|
722
|
+
if config.post_extraction_agent:
|
723
|
+
console.print("\n[cyan]Starting post-extraction processing...[/cyan]")
|
724
|
+
successful_results = [r for r in results if r.status == "success"]
|
725
|
+
|
726
|
+
with Progress(
|
727
|
+
SpinnerColumn(),
|
728
|
+
TextColumn("[progress.description]{task.description}"),
|
729
|
+
BarColumn(),
|
730
|
+
TaskProgressColumn(),
|
731
|
+
) as progress:
|
732
|
+
llm_task = progress.add_task("[cyan]Post-extraction processing...", total=len(successful_results))
|
733
|
+
|
734
|
+
post_config = PostExtractionConfig(
|
735
|
+
model=config.post_extraction_agent["model"],
|
736
|
+
messages=config.post_extraction_agent["messages"],
|
737
|
+
api_key=config.post_extraction_agent["api_key"],
|
738
|
+
max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
|
739
|
+
temperature=config.post_extraction_agent.get("temperature", 0.7),
|
740
|
+
base_url=config.post_extraction_agent.get("base_url"),
|
741
|
+
combine_output=bool(config.post_extraction_agent_save_to_file),
|
742
|
+
output_file=config.post_extraction_agent_save_to_file,
|
743
|
+
custom_transform_function=config.post_agent_transformer_function
|
744
|
+
)
|
745
|
+
agent = PostExtractionAgent(post_config)
|
746
|
+
|
747
|
+
for result in successful_results:
|
748
|
+
try:
|
749
|
+
result.extraction_result = asyncio.run(agent.process_content(result.url, result.markdown))
|
750
|
+
progress.update(llm_task, advance=1)
|
751
|
+
except Exception as e:
|
752
|
+
console.print(f"[red]Error in post-extraction processing for {result.url}: {str(e)}[/red]")
|
753
|
+
time.sleep(1) # Add delay after error
|
754
|
+
time.sleep(0.5) # Rate limiting between requests
|
755
|
+
|
743
756
|
# Calculate statistics and handle retries
|
744
757
|
failed_results = [r for r in results if r.status == "failed"]
|
745
758
|
initial_failed = len(failed_results)
|
@@ -831,31 +844,44 @@ class SpiderForce4AI:
|
|
831
844
|
if result.status == "success" and config.output_dir and result.markdown:
|
832
845
|
_save_markdown_sync(result.url, result.markdown, config)
|
833
846
|
|
834
|
-
# Handle post-extraction if configured
|
835
|
-
if config.post_extraction_agent and result.status == "success":
|
836
|
-
try:
|
837
|
-
post_config = PostExtractionConfig(
|
838
|
-
model=config.post_extraction_agent["model"],
|
839
|
-
messages=config.post_extraction_agent["messages"],
|
840
|
-
api_key=config.post_extraction_agent["api_key"],
|
841
|
-
max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
|
842
|
-
temperature=config.post_extraction_agent.get("temperature", 0.7),
|
843
|
-
base_url=config.post_extraction_agent.get("base_url"),
|
844
|
-
combine_output=bool(config.post_extraction_agent_save_to_file),
|
845
|
-
output_file=config.post_extraction_agent_save_to_file,
|
846
|
-
custom_transform_function=config.post_agent_transformer_function
|
847
|
-
)
|
848
|
-
|
849
|
-
agent = PostExtractionAgent(post_config)
|
850
|
-
extraction_result = asyncio.run(agent.process_content(result.url, result.markdown))
|
851
|
-
if extraction_result:
|
852
|
-
result.extraction_result = extraction_result
|
853
|
-
except Exception as e:
|
854
|
-
console.print(f"[red]Error in post-extraction processing for {result.url}: {str(e)}[/red]")
|
855
|
-
|
856
847
|
# Send webhook if configured
|
857
848
|
_send_webhook_sync(result, config)
|
858
849
|
results.append(result)
|
850
|
+
|
851
|
+
# Process LLM requests sequentially after all crawling is complete
|
852
|
+
if config.post_extraction_agent:
|
853
|
+
console.print("\n[cyan]Starting post-extraction processing...[/cyan]")
|
854
|
+
successful_results = [r for r in results if r.status == "success"]
|
855
|
+
|
856
|
+
with Progress(
|
857
|
+
SpinnerColumn(),
|
858
|
+
TextColumn("[progress.description]{task.description}"),
|
859
|
+
BarColumn(),
|
860
|
+
TaskProgressColumn(),
|
861
|
+
) as progress:
|
862
|
+
llm_task = progress.add_task("[cyan]Post-extraction processing...", total=len(successful_results))
|
863
|
+
|
864
|
+
post_config = PostExtractionConfig(
|
865
|
+
model=config.post_extraction_agent["model"],
|
866
|
+
messages=config.post_extraction_agent["messages"],
|
867
|
+
api_key=config.post_extraction_agent["api_key"],
|
868
|
+
max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
|
869
|
+
temperature=config.post_extraction_agent.get("temperature", 0.7),
|
870
|
+
base_url=config.post_extraction_agent.get("base_url"),
|
871
|
+
combine_output=bool(config.post_extraction_agent_save_to_file),
|
872
|
+
output_file=config.post_extraction_agent_save_to_file,
|
873
|
+
custom_transform_function=config.post_agent_transformer_function
|
874
|
+
)
|
875
|
+
agent = PostExtractionAgent(post_config)
|
876
|
+
|
877
|
+
for result in successful_results:
|
878
|
+
try:
|
879
|
+
result.extraction_result = asyncio.run(agent.process_content(result.url, result.markdown))
|
880
|
+
progress.update(llm_task, advance=1)
|
881
|
+
except Exception as e:
|
882
|
+
console.print(f"[red]Error in post-extraction processing for {result.url}: {str(e)}[/red]")
|
883
|
+
time.sleep(1) # Add delay after error
|
884
|
+
time.sleep(0.5) # Rate limiting between requests
|
859
885
|
|
860
886
|
# Calculate statistics
|
861
887
|
successful = len([r for r in results if r.status == "success"])
|
@@ -0,0 +1,7 @@
|
|
1
|
+
spiderforce4ai/__init__.py,sha256=iwCLSvooHtFAo-rU52-nsFgyn99Dflpt_OpSrIW-PqA,42273
|
2
|
+
spiderforce4ai/post_extraction_agent.py,sha256=m00-y0SCoutUnxsMwHxPaW-qRm4o5alQWjggDStUSrg,11249
|
3
|
+
spiderforce4ai-2.4.3.dist-info/METADATA,sha256=-i_vH6DDs4xVFVdDfaFG_Xka0pqXCSQdCrKgym5r5b0,9012
|
4
|
+
spiderforce4ai-2.4.3.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
5
|
+
spiderforce4ai-2.4.3.dist-info/entry_points.txt,sha256=ibARQxOlDiL1ho12zbDZt4Uq5RKSIk_qk159ZlZ46hc,59
|
6
|
+
spiderforce4ai-2.4.3.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
|
7
|
+
spiderforce4ai-2.4.3.dist-info/RECORD,,
|
@@ -1,7 +0,0 @@
|
|
1
|
-
spiderforce4ai/__init__.py,sha256=6hqYztIqL_jRuKmQOGnap2-hP8Lq1YXarUQXTFwIVxY,40841
|
2
|
-
spiderforce4ai/post_extraction_agent.py,sha256=m00-y0SCoutUnxsMwHxPaW-qRm4o5alQWjggDStUSrg,11249
|
3
|
-
spiderforce4ai-2.4.2.dist-info/METADATA,sha256=hyIp437hoWVVkbN88P6yNcKwvkvf2NpP6fyOsWxhM_I,9012
|
4
|
-
spiderforce4ai-2.4.2.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
5
|
-
spiderforce4ai-2.4.2.dist-info/entry_points.txt,sha256=ibARQxOlDiL1ho12zbDZt4Uq5RKSIk_qk159ZlZ46hc,59
|
6
|
-
spiderforce4ai-2.4.2.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
|
7
|
-
spiderforce4ai-2.4.2.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|