spiderforce4ai 2.4.2__py3-none-any.whl → 2.4.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- spiderforce4ai/__init__.py +71 -45
- {spiderforce4ai-2.4.2.dist-info → spiderforce4ai-2.4.3.dist-info}/METADATA +1 -1
- spiderforce4ai-2.4.3.dist-info/RECORD +7 -0
- spiderforce4ai-2.4.2.dist-info/RECORD +0 -7
- {spiderforce4ai-2.4.2.dist-info → spiderforce4ai-2.4.3.dist-info}/WHEEL +0 -0
- {spiderforce4ai-2.4.2.dist-info → spiderforce4ai-2.4.3.dist-info}/entry_points.txt +0 -0
- {spiderforce4ai-2.4.2.dist-info → spiderforce4ai-2.4.3.dist-info}/top_level.txt +0 -0
spiderforce4ai/__init__.py
CHANGED
@@ -312,28 +312,6 @@ def _process_url_parallel(args: Tuple[str, str, CrawlConfig]) -> CrawlResult:
|
|
312
312
|
config=config.to_dict()
|
313
313
|
)
|
314
314
|
|
315
|
-
# Handle post-extraction if configured
|
316
|
-
if config.post_extraction_agent:
|
317
|
-
try:
|
318
|
-
post_config = PostExtractionConfig(
|
319
|
-
model=config.post_extraction_agent["model"],
|
320
|
-
messages=config.post_extraction_agent["messages"],
|
321
|
-
api_key=config.post_extraction_agent["api_key"],
|
322
|
-
max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
|
323
|
-
temperature=config.post_extraction_agent.get("temperature", 0.7),
|
324
|
-
base_url=config.post_extraction_agent.get("base_url"),
|
325
|
-
combine_output=bool(config.post_extraction_agent_save_to_file),
|
326
|
-
output_file=config.post_extraction_agent_save_to_file,
|
327
|
-
custom_transform_function=config.post_agent_transformer_function
|
328
|
-
)
|
329
|
-
|
330
|
-
agent = PostExtractionAgent(post_config)
|
331
|
-
extraction_result = asyncio.run(agent.process_content(url, markdown))
|
332
|
-
if extraction_result:
|
333
|
-
result.extraction_result = extraction_result
|
334
|
-
except Exception as e:
|
335
|
-
console.print(f"[red]Error in post-extraction processing for {url}: {str(e)}[/red]")
|
336
|
-
|
337
315
|
# Send webhook for successful result
|
338
316
|
_send_webhook_sync(result, config)
|
339
317
|
|
@@ -733,13 +711,48 @@ class SpiderForce4AI:
|
|
733
711
|
TextColumn("({task.completed}/{task.total})"),
|
734
712
|
) as progress:
|
735
713
|
task = progress.add_task("[cyan]Crawling URLs...", total=len(urls))
|
736
|
-
|
714
|
+
|
737
715
|
for result in pool.imap_unordered(_process_url_parallel, process_args):
|
738
716
|
results.append(result)
|
739
717
|
progress.update(task, advance=1)
|
740
718
|
status = "✓" if result.status == "success" else "✗"
|
741
719
|
progress.description = f"[cyan]Last: {status} {result.url}"
|
742
720
|
|
721
|
+
# Process LLM requests sequentially after all crawling is complete
|
722
|
+
if config.post_extraction_agent:
|
723
|
+
console.print("\n[cyan]Starting post-extraction processing...[/cyan]")
|
724
|
+
successful_results = [r for r in results if r.status == "success"]
|
725
|
+
|
726
|
+
with Progress(
|
727
|
+
SpinnerColumn(),
|
728
|
+
TextColumn("[progress.description]{task.description}"),
|
729
|
+
BarColumn(),
|
730
|
+
TaskProgressColumn(),
|
731
|
+
) as progress:
|
732
|
+
llm_task = progress.add_task("[cyan]Post-extraction processing...", total=len(successful_results))
|
733
|
+
|
734
|
+
post_config = PostExtractionConfig(
|
735
|
+
model=config.post_extraction_agent["model"],
|
736
|
+
messages=config.post_extraction_agent["messages"],
|
737
|
+
api_key=config.post_extraction_agent["api_key"],
|
738
|
+
max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
|
739
|
+
temperature=config.post_extraction_agent.get("temperature", 0.7),
|
740
|
+
base_url=config.post_extraction_agent.get("base_url"),
|
741
|
+
combine_output=bool(config.post_extraction_agent_save_to_file),
|
742
|
+
output_file=config.post_extraction_agent_save_to_file,
|
743
|
+
custom_transform_function=config.post_agent_transformer_function
|
744
|
+
)
|
745
|
+
agent = PostExtractionAgent(post_config)
|
746
|
+
|
747
|
+
for result in successful_results:
|
748
|
+
try:
|
749
|
+
result.extraction_result = asyncio.run(agent.process_content(result.url, result.markdown))
|
750
|
+
progress.update(llm_task, advance=1)
|
751
|
+
except Exception as e:
|
752
|
+
console.print(f"[red]Error in post-extraction processing for {result.url}: {str(e)}[/red]")
|
753
|
+
time.sleep(1) # Add delay after error
|
754
|
+
time.sleep(0.5) # Rate limiting between requests
|
755
|
+
|
743
756
|
# Calculate statistics and handle retries
|
744
757
|
failed_results = [r for r in results if r.status == "failed"]
|
745
758
|
initial_failed = len(failed_results)
|
@@ -831,31 +844,44 @@ class SpiderForce4AI:
|
|
831
844
|
if result.status == "success" and config.output_dir and result.markdown:
|
832
845
|
_save_markdown_sync(result.url, result.markdown, config)
|
833
846
|
|
834
|
-
# Handle post-extraction if configured
|
835
|
-
if config.post_extraction_agent and result.status == "success":
|
836
|
-
try:
|
837
|
-
post_config = PostExtractionConfig(
|
838
|
-
model=config.post_extraction_agent["model"],
|
839
|
-
messages=config.post_extraction_agent["messages"],
|
840
|
-
api_key=config.post_extraction_agent["api_key"],
|
841
|
-
max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
|
842
|
-
temperature=config.post_extraction_agent.get("temperature", 0.7),
|
843
|
-
base_url=config.post_extraction_agent.get("base_url"),
|
844
|
-
combine_output=bool(config.post_extraction_agent_save_to_file),
|
845
|
-
output_file=config.post_extraction_agent_save_to_file,
|
846
|
-
custom_transform_function=config.post_agent_transformer_function
|
847
|
-
)
|
848
|
-
|
849
|
-
agent = PostExtractionAgent(post_config)
|
850
|
-
extraction_result = asyncio.run(agent.process_content(result.url, result.markdown))
|
851
|
-
if extraction_result:
|
852
|
-
result.extraction_result = extraction_result
|
853
|
-
except Exception as e:
|
854
|
-
console.print(f"[red]Error in post-extraction processing for {result.url}: {str(e)}[/red]")
|
855
|
-
|
856
847
|
# Send webhook if configured
|
857
848
|
_send_webhook_sync(result, config)
|
858
849
|
results.append(result)
|
850
|
+
|
851
|
+
# Process LLM requests sequentially after all crawling is complete
|
852
|
+
if config.post_extraction_agent:
|
853
|
+
console.print("\n[cyan]Starting post-extraction processing...[/cyan]")
|
854
|
+
successful_results = [r for r in results if r.status == "success"]
|
855
|
+
|
856
|
+
with Progress(
|
857
|
+
SpinnerColumn(),
|
858
|
+
TextColumn("[progress.description]{task.description}"),
|
859
|
+
BarColumn(),
|
860
|
+
TaskProgressColumn(),
|
861
|
+
) as progress:
|
862
|
+
llm_task = progress.add_task("[cyan]Post-extraction processing...", total=len(successful_results))
|
863
|
+
|
864
|
+
post_config = PostExtractionConfig(
|
865
|
+
model=config.post_extraction_agent["model"],
|
866
|
+
messages=config.post_extraction_agent["messages"],
|
867
|
+
api_key=config.post_extraction_agent["api_key"],
|
868
|
+
max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
|
869
|
+
temperature=config.post_extraction_agent.get("temperature", 0.7),
|
870
|
+
base_url=config.post_extraction_agent.get("base_url"),
|
871
|
+
combine_output=bool(config.post_extraction_agent_save_to_file),
|
872
|
+
output_file=config.post_extraction_agent_save_to_file,
|
873
|
+
custom_transform_function=config.post_agent_transformer_function
|
874
|
+
)
|
875
|
+
agent = PostExtractionAgent(post_config)
|
876
|
+
|
877
|
+
for result in successful_results:
|
878
|
+
try:
|
879
|
+
result.extraction_result = asyncio.run(agent.process_content(result.url, result.markdown))
|
880
|
+
progress.update(llm_task, advance=1)
|
881
|
+
except Exception as e:
|
882
|
+
console.print(f"[red]Error in post-extraction processing for {result.url}: {str(e)}[/red]")
|
883
|
+
time.sleep(1) # Add delay after error
|
884
|
+
time.sleep(0.5) # Rate limiting between requests
|
859
885
|
|
860
886
|
# Calculate statistics
|
861
887
|
successful = len([r for r in results if r.status == "success"])
|
@@ -0,0 +1,7 @@
|
|
1
|
+
spiderforce4ai/__init__.py,sha256=iwCLSvooHtFAo-rU52-nsFgyn99Dflpt_OpSrIW-PqA,42273
|
2
|
+
spiderforce4ai/post_extraction_agent.py,sha256=m00-y0SCoutUnxsMwHxPaW-qRm4o5alQWjggDStUSrg,11249
|
3
|
+
spiderforce4ai-2.4.3.dist-info/METADATA,sha256=-i_vH6DDs4xVFVdDfaFG_Xka0pqXCSQdCrKgym5r5b0,9012
|
4
|
+
spiderforce4ai-2.4.3.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
5
|
+
spiderforce4ai-2.4.3.dist-info/entry_points.txt,sha256=ibARQxOlDiL1ho12zbDZt4Uq5RKSIk_qk159ZlZ46hc,59
|
6
|
+
spiderforce4ai-2.4.3.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
|
7
|
+
spiderforce4ai-2.4.3.dist-info/RECORD,,
|
@@ -1,7 +0,0 @@
|
|
1
|
-
spiderforce4ai/__init__.py,sha256=6hqYztIqL_jRuKmQOGnap2-hP8Lq1YXarUQXTFwIVxY,40841
|
2
|
-
spiderforce4ai/post_extraction_agent.py,sha256=m00-y0SCoutUnxsMwHxPaW-qRm4o5alQWjggDStUSrg,11249
|
3
|
-
spiderforce4ai-2.4.2.dist-info/METADATA,sha256=hyIp437hoWVVkbN88P6yNcKwvkvf2NpP6fyOsWxhM_I,9012
|
4
|
-
spiderforce4ai-2.4.2.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
5
|
-
spiderforce4ai-2.4.2.dist-info/entry_points.txt,sha256=ibARQxOlDiL1ho12zbDZt4Uq5RKSIk_qk159ZlZ46hc,59
|
6
|
-
spiderforce4ai-2.4.2.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
|
7
|
-
spiderforce4ai-2.4.2.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|