spiderforce4ai-2.4-py3-none-any.whl → spiderforce4ai-2.4.2-py3-none-any.whl
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- spiderforce4ai/__init__.py +63 -25
- spiderforce4ai/post_extraction_agent.py +32 -17
- {spiderforce4ai-2.4.dist-info → spiderforce4ai-2.4.2.dist-info}/METADATA +1 -1
- spiderforce4ai-2.4.2.dist-info/RECORD +7 -0
- spiderforce4ai-2.4.dist-info/RECORD +0 -7
- {spiderforce4ai-2.4.dist-info → spiderforce4ai-2.4.2.dist-info}/WHEEL +0 -0
- {spiderforce4ai-2.4.dist-info → spiderforce4ai-2.4.2.dist-info}/entry_points.txt +0 -0
- {spiderforce4ai-2.4.dist-info → spiderforce4ai-2.4.2.dist-info}/top_level.txt +0 -0
spiderforce4ai/__init__.py
CHANGED
@@ -460,28 +460,6 @@ class SpiderForce4AI:
|
|
460
460
|
if config.output_dir:
|
461
461
|
await _save_markdown_async(url, markdown, config)
|
462
462
|
|
463
|
-
# Handle post-extraction if configured
|
464
|
-
if config.post_extraction_agent and result.status == "success":
|
465
|
-
try:
|
466
|
-
post_config = PostExtractionConfig(
|
467
|
-
model=config.post_extraction_agent["model"],
|
468
|
-
messages=config.post_extraction_agent["messages"],
|
469
|
-
api_key=config.post_extraction_agent["api_key"],
|
470
|
-
max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
|
471
|
-
temperature=config.post_extraction_agent.get("temperature", 0.7),
|
472
|
-
base_url=config.post_extraction_agent.get("base_url"),
|
473
|
-
combine_output=bool(config.post_extraction_agent_save_to_file),
|
474
|
-
output_file=config.post_extraction_agent_save_to_file,
|
475
|
-
custom_transform_function=config.post_agent_transformer_function
|
476
|
-
)
|
477
|
-
|
478
|
-
agent = PostExtractionAgent(post_config)
|
479
|
-
extraction_result = await agent.process_content(url, markdown)
|
480
|
-
if extraction_result:
|
481
|
-
result.extraction_result = extraction_result
|
482
|
-
except Exception as e:
|
483
|
-
console.print(f"[red]Error in post-extraction processing for {url}: {str(e)}[/red]")
|
484
|
-
|
485
463
|
await _send_webhook_async(result, config)
|
486
464
|
|
487
465
|
self.crawl_results.append(result)
|
@@ -576,8 +554,11 @@ class SpiderForce4AI:
|
|
576
554
|
# Set up concurrency control
|
577
555
|
semaphore = asyncio.Semaphore(config.max_concurrent_requests)
|
578
556
|
|
557
|
+
# Semaphore for crawling
|
558
|
+
crawl_semaphore = asyncio.Semaphore(config.max_concurrent_requests)
|
559
|
+
|
579
560
|
async def crawl_with_semaphore(url):
|
580
|
-
async with semaphore:
|
561
|
+
async with crawl_semaphore:
|
581
562
|
result = await crawl_with_progress(url)
|
582
563
|
await asyncio.sleep(config.request_delay)
|
583
564
|
return result
|
@@ -606,6 +587,63 @@ class SpiderForce4AI:
|
|
606
587
|
results[i] = retry_result
|
607
588
|
break
|
608
589
|
|
590
|
+
# Process LLM requests sequentially after all crawling is complete
|
591
|
+
if config.post_extraction_agent:
|
592
|
+
console.print("\n[cyan]Processing content with LLM...[/cyan]")
|
593
|
+
llm_task = progress.add_task("[cyan]LLM Processing...", total=len([r for r in results if r.status == "success"]))
|
594
|
+
|
595
|
+
post_config = PostExtractionConfig(
|
596
|
+
model=config.post_extraction_agent["model"],
|
597
|
+
messages=config.post_extraction_agent["messages"],
|
598
|
+
api_key=config.post_extraction_agent["api_key"],
|
599
|
+
max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
|
600
|
+
temperature=config.post_extraction_agent.get("temperature", 0.7),
|
601
|
+
base_url=config.post_extraction_agent.get("base_url"),
|
602
|
+
combine_output=bool(config.post_extraction_agent_save_to_file),
|
603
|
+
output_file=config.post_extraction_agent_save_to_file,
|
604
|
+
custom_transform_function=config.post_agent_transformer_function
|
605
|
+
)
|
606
|
+
agent = PostExtractionAgent(post_config)
|
607
|
+
|
608
|
+
for result in results:
|
609
|
+
if result.status == "success":
|
610
|
+
try:
|
611
|
+
result.extraction_result = await agent.process_content(result.url, result.markdown)
|
612
|
+
progress.update(llm_task, advance=1)
|
613
|
+
except Exception as e:
|
614
|
+
console.print(f"[red]Error in post-extraction processing for {result.url}: {str(e)}[/red]")
|
615
|
+
|
616
|
+
# Process LLM requests sequentially after all crawling is complete
|
617
|
+
llm_successful = 0
|
618
|
+
if config.post_extraction_agent:
|
619
|
+
console.print("\n[cyan]Starting post-extraction processing...[/cyan]")
|
620
|
+
successful_results = [r for r in results if r.status == "success"]
|
621
|
+
llm_task = progress.add_task("[cyan]Post-extraction processing...", total=len(successful_results))
|
622
|
+
|
623
|
+
post_config = PostExtractionConfig(
|
624
|
+
model=config.post_extraction_agent["model"],
|
625
|
+
messages=config.post_extraction_agent["messages"],
|
626
|
+
api_key=config.post_extraction_agent["api_key"],
|
627
|
+
max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
|
628
|
+
temperature=config.post_extraction_agent.get("temperature", 0.7),
|
629
|
+
base_url=config.post_extraction_agent.get("base_url"),
|
630
|
+
combine_output=bool(config.post_extraction_agent_save_to_file),
|
631
|
+
output_file=config.post_extraction_agent_save_to_file,
|
632
|
+
custom_transform_function=config.post_agent_transformer_function
|
633
|
+
)
|
634
|
+
agent = PostExtractionAgent(post_config)
|
635
|
+
|
636
|
+
for result in successful_results:
|
637
|
+
try:
|
638
|
+
result.extraction_result = await agent.process_content(result.url, result.markdown)
|
639
|
+
if result.extraction_result:
|
640
|
+
llm_successful += 1
|
641
|
+
progress.update(llm_task, advance=1)
|
642
|
+
except Exception as e:
|
643
|
+
console.print(f"[red]Error in post-extraction processing for {result.url}: {str(e)}[/red]")
|
644
|
+
await asyncio.sleep(1) # Add delay after error
|
645
|
+
await asyncio.sleep(0.5) # Rate limiting between requests
|
646
|
+
|
609
647
|
# Calculate final statistics
|
610
648
|
final_successful = len([r for r in results if r.status == "success"])
|
611
649
|
final_failed = len([r for r in results if r.status == "failed"])
|
@@ -616,7 +654,7 @@ class SpiderForce4AI:
|
|
616
654
|
"failure_ratio": failure_ratio,
|
617
655
|
"retry_successful": retry_successful if initial_failed > 0 else 0,
|
618
656
|
"retry_failed": final_failed,
|
619
|
-
"
|
657
|
+
"llm_successful": llm_successful
|
620
658
|
}
|
621
659
|
|
622
660
|
# Print summary
|
@@ -894,4 +932,4 @@ class SpiderForce4AI:
|
|
894
932
|
# Version info
|
895
933
|
#__version__ = "2.3.1"
|
896
934
|
#__author__ = "Piotr Tamulewicz"
|
897
|
-
#__email__ = "pt@petertam.pro"
|
935
|
+
#__email__ = "pt@petertam.pro"
|
spiderforce4ai/post_extraction_agent.py
CHANGED
@@ -176,23 +176,38 @@ class PostExtractionAgent:
|
|
176
176
|
for msg in self.config.messages
|
177
177
|
]
|
178
178
|
|
179
|
-
# Make LLM request
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
max_tokens=self.config.max_tokens,
|
184
|
-
temperature=self.config.temperature,
|
185
|
-
api_key=self.config.api_key,
|
186
|
-
api_base=self.config.base_url
|
187
|
-
)
|
179
|
+
# Make LLM request with retries
|
180
|
+
max_retries = 3
|
181
|
+
retry_delay = 1.0
|
182
|
+
last_error = None
|
188
183
|
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
184
|
+
for attempt in range(max_retries):
|
185
|
+
try:
|
186
|
+
response = await completion(
|
187
|
+
model=self.config.model,
|
188
|
+
messages=messages,
|
189
|
+
max_tokens=self.config.max_tokens,
|
190
|
+
temperature=self.config.temperature,
|
191
|
+
api_key=self.config.api_key,
|
192
|
+
api_base=self.config.base_url
|
193
|
+
)
|
194
|
+
|
195
|
+
# Parse response
|
196
|
+
extracted_data = json.loads(response.choices[0].message.content)
|
197
|
+
self.buffer.remove_request(url) # Remove from buffer if successful
|
198
|
+
return extracted_data
|
199
|
+
|
200
|
+
except json.JSONDecodeError as e:
|
201
|
+
last_error = f"Invalid JSON response from LLM: {e}"
|
202
|
+
if attempt < max_retries - 1:
|
203
|
+
await asyncio.sleep(retry_delay * (attempt + 1))
|
204
|
+
except Exception as e:
|
205
|
+
last_error = str(e)
|
206
|
+
if attempt < max_retries - 1:
|
207
|
+
await asyncio.sleep(retry_delay * (attempt + 1))
|
208
|
+
|
209
|
+
# If we get here, all retries failed
|
210
|
+
raise Exception(last_error)
|
196
211
|
|
197
212
|
except Exception as e:
|
198
213
|
logger.error(f"Error processing {url}: {str(e)}")
|
@@ -256,4 +271,4 @@ class PostExtractionAgent:
|
|
256
271
|
"failed_requests": len(self.buffer.get_failed_requests()),
|
257
272
|
"retryable_requests": len(self.buffer.get_retryable_requests(self.config.max_retries)),
|
258
273
|
"success_rate": len(self.results) / (len(self.results) + len(self.buffer.get_failed_requests())) * 100 if self.results else 0
|
259
|
-
}
|
274
|
+
}
|
@@ -0,0 +1,7 @@
|
|
1
|
+
spiderforce4ai/__init__.py,sha256=6hqYztIqL_jRuKmQOGnap2-hP8Lq1YXarUQXTFwIVxY,40841
|
2
|
+
spiderforce4ai/post_extraction_agent.py,sha256=m00-y0SCoutUnxsMwHxPaW-qRm4o5alQWjggDStUSrg,11249
|
3
|
+
spiderforce4ai-2.4.2.dist-info/METADATA,sha256=hyIp437hoWVVkbN88P6yNcKwvkvf2NpP6fyOsWxhM_I,9012
|
4
|
+
spiderforce4ai-2.4.2.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
5
|
+
spiderforce4ai-2.4.2.dist-info/entry_points.txt,sha256=ibARQxOlDiL1ho12zbDZt4Uq5RKSIk_qk159ZlZ46hc,59
|
6
|
+
spiderforce4ai-2.4.2.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
|
7
|
+
spiderforce4ai-2.4.2.dist-info/RECORD,,
|
@@ -1,7 +0,0 @@
|
|
1
|
-
spiderforce4ai/__init__.py,sha256=JClWyqGGCVC6yxuK4TpJ7a-7iP6ueD20oKc0ERHxnyU,38701
|
2
|
-
spiderforce4ai/post_extraction_agent.py,sha256=yZ17xdOtkNMDRGqqudNBZIb6N9bcsjOwbzPB6D5kJHg,10540
|
3
|
-
spiderforce4ai-2.4.dist-info/METADATA,sha256=UNtth74KAHCNOngozhN2es3z4vY6J7SiKfTaIi0fYTI,9010
|
4
|
-
spiderforce4ai-2.4.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
5
|
-
spiderforce4ai-2.4.dist-info/entry_points.txt,sha256=ibARQxOlDiL1ho12zbDZt4Uq5RKSIk_qk159ZlZ46hc,59
|
6
|
-
spiderforce4ai-2.4.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
|
7
|
-
spiderforce4ai-2.4.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|