spiderforce4ai 2.4__py3-none-any.whl → 2.4.2__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- spiderforce4ai/__init__.py +63 -25
- spiderforce4ai/post_extraction_agent.py +32 -17
- {spiderforce4ai-2.4.dist-info → spiderforce4ai-2.4.2.dist-info}/METADATA +1 -1
- spiderforce4ai-2.4.2.dist-info/RECORD +7 -0
- spiderforce4ai-2.4.dist-info/RECORD +0 -7
- {spiderforce4ai-2.4.dist-info → spiderforce4ai-2.4.2.dist-info}/WHEEL +0 -0
- {spiderforce4ai-2.4.dist-info → spiderforce4ai-2.4.2.dist-info}/entry_points.txt +0 -0
- {spiderforce4ai-2.4.dist-info → spiderforce4ai-2.4.2.dist-info}/top_level.txt +0 -0
spiderforce4ai/__init__.py
CHANGED
@@ -460,28 +460,6 @@ class SpiderForce4AI:
|
|
460
460
|
if config.output_dir:
|
461
461
|
await _save_markdown_async(url, markdown, config)
|
462
462
|
|
463
|
-
# Handle post-extraction if configured
|
464
|
-
if config.post_extraction_agent and result.status == "success":
|
465
|
-
try:
|
466
|
-
post_config = PostExtractionConfig(
|
467
|
-
model=config.post_extraction_agent["model"],
|
468
|
-
messages=config.post_extraction_agent["messages"],
|
469
|
-
api_key=config.post_extraction_agent["api_key"],
|
470
|
-
max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
|
471
|
-
temperature=config.post_extraction_agent.get("temperature", 0.7),
|
472
|
-
base_url=config.post_extraction_agent.get("base_url"),
|
473
|
-
combine_output=bool(config.post_extraction_agent_save_to_file),
|
474
|
-
output_file=config.post_extraction_agent_save_to_file,
|
475
|
-
custom_transform_function=config.post_agent_transformer_function
|
476
|
-
)
|
477
|
-
|
478
|
-
agent = PostExtractionAgent(post_config)
|
479
|
-
extraction_result = await agent.process_content(url, markdown)
|
480
|
-
if extraction_result:
|
481
|
-
result.extraction_result = extraction_result
|
482
|
-
except Exception as e:
|
483
|
-
console.print(f"[red]Error in post-extraction processing for {url}: {str(e)}[/red]")
|
484
|
-
|
485
463
|
await _send_webhook_async(result, config)
|
486
464
|
|
487
465
|
self.crawl_results.append(result)
|
@@ -576,8 +554,11 @@ class SpiderForce4AI:
|
|
576
554
|
# Set up concurrency control
|
577
555
|
semaphore = asyncio.Semaphore(config.max_concurrent_requests)
|
578
556
|
|
557
|
+
# Semaphore for crawling
|
558
|
+
crawl_semaphore = asyncio.Semaphore(config.max_concurrent_requests)
|
559
|
+
|
579
560
|
async def crawl_with_semaphore(url):
|
580
|
-
async with
|
561
|
+
async with crawl_semaphore:
|
581
562
|
result = await crawl_with_progress(url)
|
582
563
|
await asyncio.sleep(config.request_delay)
|
583
564
|
return result
|
@@ -606,6 +587,63 @@ class SpiderForce4AI:
|
|
606
587
|
results[i] = retry_result
|
607
588
|
break
|
608
589
|
|
590
|
+
# Process LLM requests sequentially after all crawling is complete
|
591
|
+
if config.post_extraction_agent:
|
592
|
+
console.print("\n[cyan]Processing content with LLM...[/cyan]")
|
593
|
+
llm_task = progress.add_task("[cyan]LLM Processing...", total=len([r for r in results if r.status == "success"]))
|
594
|
+
|
595
|
+
post_config = PostExtractionConfig(
|
596
|
+
model=config.post_extraction_agent["model"],
|
597
|
+
messages=config.post_extraction_agent["messages"],
|
598
|
+
api_key=config.post_extraction_agent["api_key"],
|
599
|
+
max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
|
600
|
+
temperature=config.post_extraction_agent.get("temperature", 0.7),
|
601
|
+
base_url=config.post_extraction_agent.get("base_url"),
|
602
|
+
combine_output=bool(config.post_extraction_agent_save_to_file),
|
603
|
+
output_file=config.post_extraction_agent_save_to_file,
|
604
|
+
custom_transform_function=config.post_agent_transformer_function
|
605
|
+
)
|
606
|
+
agent = PostExtractionAgent(post_config)
|
607
|
+
|
608
|
+
for result in results:
|
609
|
+
if result.status == "success":
|
610
|
+
try:
|
611
|
+
result.extraction_result = await agent.process_content(result.url, result.markdown)
|
612
|
+
progress.update(llm_task, advance=1)
|
613
|
+
except Exception as e:
|
614
|
+
console.print(f"[red]Error in post-extraction processing for {result.url}: {str(e)}[/red]")
|
615
|
+
|
616
|
+
# Process LLM requests sequentially after all crawling is complete
|
617
|
+
llm_successful = 0
|
618
|
+
if config.post_extraction_agent:
|
619
|
+
console.print("\n[cyan]Starting post-extraction processing...[/cyan]")
|
620
|
+
successful_results = [r for r in results if r.status == "success"]
|
621
|
+
llm_task = progress.add_task("[cyan]Post-extraction processing...", total=len(successful_results))
|
622
|
+
|
623
|
+
post_config = PostExtractionConfig(
|
624
|
+
model=config.post_extraction_agent["model"],
|
625
|
+
messages=config.post_extraction_agent["messages"],
|
626
|
+
api_key=config.post_extraction_agent["api_key"],
|
627
|
+
max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
|
628
|
+
temperature=config.post_extraction_agent.get("temperature", 0.7),
|
629
|
+
base_url=config.post_extraction_agent.get("base_url"),
|
630
|
+
combine_output=bool(config.post_extraction_agent_save_to_file),
|
631
|
+
output_file=config.post_extraction_agent_save_to_file,
|
632
|
+
custom_transform_function=config.post_agent_transformer_function
|
633
|
+
)
|
634
|
+
agent = PostExtractionAgent(post_config)
|
635
|
+
|
636
|
+
for result in successful_results:
|
637
|
+
try:
|
638
|
+
result.extraction_result = await agent.process_content(result.url, result.markdown)
|
639
|
+
if result.extraction_result:
|
640
|
+
llm_successful += 1
|
641
|
+
progress.update(llm_task, advance=1)
|
642
|
+
except Exception as e:
|
643
|
+
console.print(f"[red]Error in post-extraction processing for {result.url}: {str(e)}[/red]")
|
644
|
+
await asyncio.sleep(1) # Add delay after error
|
645
|
+
await asyncio.sleep(0.5) # Rate limiting between requests
|
646
|
+
|
609
647
|
# Calculate final statistics
|
610
648
|
final_successful = len([r for r in results if r.status == "success"])
|
611
649
|
final_failed = len([r for r in results if r.status == "failed"])
|
@@ -616,7 +654,7 @@ class SpiderForce4AI:
|
|
616
654
|
"failure_ratio": failure_ratio,
|
617
655
|
"retry_successful": retry_successful if initial_failed > 0 else 0,
|
618
656
|
"retry_failed": final_failed,
|
619
|
-
"
|
657
|
+
"llm_successful": llm_successful
|
620
658
|
}
|
621
659
|
|
622
660
|
# Print summary
|
@@ -894,4 +932,4 @@ class SpiderForce4AI:
|
|
894
932
|
# Version info
|
895
933
|
#__version__ = "2.3.1"
|
896
934
|
#__author__ = "Piotr Tamulewicz"
|
897
|
-
#__email__ = "pt@petertam.pro"
|
935
|
+
#__email__ = "pt@petertam.pro"
|
@@ -176,23 +176,38 @@ class PostExtractionAgent:
|
|
176
176
|
for msg in self.config.messages
|
177
177
|
]
|
178
178
|
|
179
|
-
# Make LLM request
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
max_tokens=self.config.max_tokens,
|
184
|
-
temperature=self.config.temperature,
|
185
|
-
api_key=self.config.api_key,
|
186
|
-
api_base=self.config.base_url
|
187
|
-
)
|
179
|
+
# Make LLM request with retries
|
180
|
+
max_retries = 3
|
181
|
+
retry_delay = 1.0
|
182
|
+
last_error = None
|
188
183
|
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
184
|
+
for attempt in range(max_retries):
|
185
|
+
try:
|
186
|
+
response = await completion(
|
187
|
+
model=self.config.model,
|
188
|
+
messages=messages,
|
189
|
+
max_tokens=self.config.max_tokens,
|
190
|
+
temperature=self.config.temperature,
|
191
|
+
api_key=self.config.api_key,
|
192
|
+
api_base=self.config.base_url
|
193
|
+
)
|
194
|
+
|
195
|
+
# Parse response
|
196
|
+
extracted_data = json.loads(response.choices[0].message.content)
|
197
|
+
self.buffer.remove_request(url) # Remove from buffer if successful
|
198
|
+
return extracted_data
|
199
|
+
|
200
|
+
except json.JSONDecodeError as e:
|
201
|
+
last_error = f"Invalid JSON response from LLM: {e}"
|
202
|
+
if attempt < max_retries - 1:
|
203
|
+
await asyncio.sleep(retry_delay * (attempt + 1))
|
204
|
+
except Exception as e:
|
205
|
+
last_error = str(e)
|
206
|
+
if attempt < max_retries - 1:
|
207
|
+
await asyncio.sleep(retry_delay * (attempt + 1))
|
208
|
+
|
209
|
+
# If we get here, all retries failed
|
210
|
+
raise Exception(last_error)
|
196
211
|
|
197
212
|
except Exception as e:
|
198
213
|
logger.error(f"Error processing {url}: {str(e)}")
|
@@ -256,4 +271,4 @@ class PostExtractionAgent:
|
|
256
271
|
"failed_requests": len(self.buffer.get_failed_requests()),
|
257
272
|
"retryable_requests": len(self.buffer.get_retryable_requests(self.config.max_retries)),
|
258
273
|
"success_rate": len(self.results) / (len(self.results) + len(self.buffer.get_failed_requests())) * 100 if self.results else 0
|
259
|
-
}
|
274
|
+
}
|
@@ -0,0 +1,7 @@
|
|
1
|
+
spiderforce4ai/__init__.py,sha256=6hqYztIqL_jRuKmQOGnap2-hP8Lq1YXarUQXTFwIVxY,40841
|
2
|
+
spiderforce4ai/post_extraction_agent.py,sha256=m00-y0SCoutUnxsMwHxPaW-qRm4o5alQWjggDStUSrg,11249
|
3
|
+
spiderforce4ai-2.4.2.dist-info/METADATA,sha256=hyIp437hoWVVkbN88P6yNcKwvkvf2NpP6fyOsWxhM_I,9012
|
4
|
+
spiderforce4ai-2.4.2.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
5
|
+
spiderforce4ai-2.4.2.dist-info/entry_points.txt,sha256=ibARQxOlDiL1ho12zbDZt4Uq5RKSIk_qk159ZlZ46hc,59
|
6
|
+
spiderforce4ai-2.4.2.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
|
7
|
+
spiderforce4ai-2.4.2.dist-info/RECORD,,
|
@@ -1,7 +0,0 @@
|
|
1
|
-
spiderforce4ai/__init__.py,sha256=JClWyqGGCVC6yxuK4TpJ7a-7iP6ueD20oKc0ERHxnyU,38701
|
2
|
-
spiderforce4ai/post_extraction_agent.py,sha256=yZ17xdOtkNMDRGqqudNBZIb6N9bcsjOwbzPB6D5kJHg,10540
|
3
|
-
spiderforce4ai-2.4.dist-info/METADATA,sha256=UNtth74KAHCNOngozhN2es3z4vY6J7SiKfTaIi0fYTI,9010
|
4
|
-
spiderforce4ai-2.4.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
5
|
-
spiderforce4ai-2.4.dist-info/entry_points.txt,sha256=ibARQxOlDiL1ho12zbDZt4Uq5RKSIk_qk159ZlZ46hc,59
|
6
|
-
spiderforce4ai-2.4.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
|
7
|
-
spiderforce4ai-2.4.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|