spiderforce4ai 2.4__py3-none-any.whl → 2.4.1__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- spiderforce4ai/__init__.py +33 -3
- spiderforce4ai/post_extraction_agent.py +32 -17
- {spiderforce4ai-2.4.dist-info → spiderforce4ai-2.4.1.dist-info}/METADATA +1 -1
- spiderforce4ai-2.4.1.dist-info/RECORD +7 -0
- spiderforce4ai-2.4.dist-info/RECORD +0 -7
- {spiderforce4ai-2.4.dist-info → spiderforce4ai-2.4.1.dist-info}/WHEEL +0 -0
- {spiderforce4ai-2.4.dist-info → spiderforce4ai-2.4.1.dist-info}/entry_points.txt +0 -0
- {spiderforce4ai-2.4.dist-info → spiderforce4ai-2.4.1.dist-info}/top_level.txt +0 -0
spiderforce4ai/__init__.py
CHANGED
@@ -576,8 +576,11 @@ class SpiderForce4AI:
|
|
576
576
|
# Set up concurrency control
|
577
577
|
semaphore = asyncio.Semaphore(config.max_concurrent_requests)
|
578
578
|
|
579
|
+
# Semaphore for crawling
|
580
|
+
crawl_semaphore = asyncio.Semaphore(config.max_concurrent_requests)
|
581
|
+
|
579
582
|
async def crawl_with_semaphore(url):
|
580
|
-
async with
|
583
|
+
async with crawl_semaphore:
|
581
584
|
result = await crawl_with_progress(url)
|
582
585
|
await asyncio.sleep(config.request_delay)
|
583
586
|
return result
|
@@ -606,9 +609,36 @@ class SpiderForce4AI:
|
|
606
609
|
results[i] = retry_result
|
607
610
|
break
|
608
611
|
|
612
|
+
# Process LLM requests sequentially after all crawling is complete
|
613
|
+
if config.post_extraction_agent:
|
614
|
+
console.print("\n[cyan]Processing content with LLM...[/cyan]")
|
615
|
+
llm_task = progress.add_task("[cyan]LLM Processing...", total=len([r for r in results if r.status == "success"]))
|
616
|
+
|
617
|
+
post_config = PostExtractionConfig(
|
618
|
+
model=config.post_extraction_agent["model"],
|
619
|
+
messages=config.post_extraction_agent["messages"],
|
620
|
+
api_key=config.post_extraction_agent["api_key"],
|
621
|
+
max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
|
622
|
+
temperature=config.post_extraction_agent.get("temperature", 0.7),
|
623
|
+
base_url=config.post_extraction_agent.get("base_url"),
|
624
|
+
combine_output=bool(config.post_extraction_agent_save_to_file),
|
625
|
+
output_file=config.post_extraction_agent_save_to_file,
|
626
|
+
custom_transform_function=config.post_agent_transformer_function
|
627
|
+
)
|
628
|
+
agent = PostExtractionAgent(post_config)
|
629
|
+
|
630
|
+
for result in results:
|
631
|
+
if result.status == "success":
|
632
|
+
try:
|
633
|
+
result.extraction_result = await agent.process_content(result.url, result.markdown)
|
634
|
+
progress.update(llm_task, advance=1)
|
635
|
+
except Exception as e:
|
636
|
+
console.print(f"[red]Error in post-extraction processing for {result.url}: {str(e)}[/red]")
|
637
|
+
|
609
638
|
# Calculate final statistics
|
610
639
|
final_successful = len([r for r in results if r.status == "success"])
|
611
640
|
final_failed = len([r for r in results if r.status == "failed"])
|
641
|
+
llm_successful = len([r for r in results if r.extraction_result is not None])
|
612
642
|
|
613
643
|
# Update retry stats
|
614
644
|
self._retry_stats = {
|
@@ -616,7 +646,7 @@ class SpiderForce4AI:
|
|
616
646
|
"failure_ratio": failure_ratio,
|
617
647
|
"retry_successful": retry_successful if initial_failed > 0 else 0,
|
618
648
|
"retry_failed": final_failed,
|
619
|
-
"
|
649
|
+
"llm_successful": llm_successful
|
620
650
|
}
|
621
651
|
|
622
652
|
# Print summary
|
@@ -894,4 +924,4 @@ class SpiderForce4AI:
|
|
894
924
|
# Version info
|
895
925
|
#__version__ = "2.3.1"
|
896
926
|
#__author__ = "Piotr Tamulewicz"
|
897
|
-
#__email__ = "pt@petertam.pro"
|
927
|
+
#__email__ = "pt@petertam.pro"
|
@@ -176,23 +176,38 @@ class PostExtractionAgent:
|
|
176
176
|
for msg in self.config.messages
|
177
177
|
]
|
178
178
|
|
179
|
-
# Make LLM request
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
max_tokens=self.config.max_tokens,
|
184
|
-
temperature=self.config.temperature,
|
185
|
-
api_key=self.config.api_key,
|
186
|
-
api_base=self.config.base_url
|
187
|
-
)
|
179
|
+
# Make LLM request with retries
|
180
|
+
max_retries = 3
|
181
|
+
retry_delay = 1.0
|
182
|
+
last_error = None
|
188
183
|
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
184
|
+
for attempt in range(max_retries):
|
185
|
+
try:
|
186
|
+
response = await completion(
|
187
|
+
model=self.config.model,
|
188
|
+
messages=messages,
|
189
|
+
max_tokens=self.config.max_tokens,
|
190
|
+
temperature=self.config.temperature,
|
191
|
+
api_key=self.config.api_key,
|
192
|
+
api_base=self.config.base_url
|
193
|
+
)
|
194
|
+
|
195
|
+
# Parse response
|
196
|
+
extracted_data = json.loads(response.choices[0].message.content)
|
197
|
+
self.buffer.remove_request(url) # Remove from buffer if successful
|
198
|
+
return extracted_data
|
199
|
+
|
200
|
+
except json.JSONDecodeError as e:
|
201
|
+
last_error = f"Invalid JSON response from LLM: {e}"
|
202
|
+
if attempt < max_retries - 1:
|
203
|
+
await asyncio.sleep(retry_delay * (attempt + 1))
|
204
|
+
except Exception as e:
|
205
|
+
last_error = str(e)
|
206
|
+
if attempt < max_retries - 1:
|
207
|
+
await asyncio.sleep(retry_delay * (attempt + 1))
|
208
|
+
|
209
|
+
# If we get here, all retries failed
|
210
|
+
raise Exception(last_error)
|
196
211
|
|
197
212
|
except Exception as e:
|
198
213
|
logger.error(f"Error processing {url}: {str(e)}")
|
@@ -256,4 +271,4 @@ class PostExtractionAgent:
|
|
256
271
|
"failed_requests": len(self.buffer.get_failed_requests()),
|
257
272
|
"retryable_requests": len(self.buffer.get_retryable_requests(self.config.max_retries)),
|
258
273
|
"success_rate": len(self.results) / (len(self.results) + len(self.buffer.get_failed_requests())) * 100 if self.results else 0
|
259
|
-
}
|
274
|
+
}
|
@@ -0,0 +1,7 @@
|
|
1
|
+
spiderforce4ai/__init__.py,sha256=IjoJSE-7PX8zxBF0Pl1ELQUraLU3agAtY_J6NvQSPf4,40533
|
2
|
+
spiderforce4ai/post_extraction_agent.py,sha256=m00-y0SCoutUnxsMwHxPaW-qRm4o5alQWjggDStUSrg,11249
|
3
|
+
spiderforce4ai-2.4.1.dist-info/METADATA,sha256=xVm-JdLz6Kx73Bi0DA1QG6D9Ya_OLqWd_80PNWHXLsA,9012
|
4
|
+
spiderforce4ai-2.4.1.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
5
|
+
spiderforce4ai-2.4.1.dist-info/entry_points.txt,sha256=ibARQxOlDiL1ho12zbDZt4Uq5RKSIk_qk159ZlZ46hc,59
|
6
|
+
spiderforce4ai-2.4.1.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
|
7
|
+
spiderforce4ai-2.4.1.dist-info/RECORD,,
|
@@ -1,7 +0,0 @@
|
|
1
|
-
spiderforce4ai/__init__.py,sha256=JClWyqGGCVC6yxuK4TpJ7a-7iP6ueD20oKc0ERHxnyU,38701
|
2
|
-
spiderforce4ai/post_extraction_agent.py,sha256=yZ17xdOtkNMDRGqqudNBZIb6N9bcsjOwbzPB6D5kJHg,10540
|
3
|
-
spiderforce4ai-2.4.dist-info/METADATA,sha256=UNtth74KAHCNOngozhN2es3z4vY6J7SiKfTaIi0fYTI,9010
|
4
|
-
spiderforce4ai-2.4.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
5
|
-
spiderforce4ai-2.4.dist-info/entry_points.txt,sha256=ibARQxOlDiL1ho12zbDZt4Uq5RKSIk_qk159ZlZ46hc,59
|
6
|
-
spiderforce4ai-2.4.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
|
7
|
-
spiderforce4ai-2.4.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|