spiderforce4ai 2.4__py3-none-any.whl → 2.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- spiderforce4ai/__init__.py +33 -3
- spiderforce4ai/post_extraction_agent.py +32 -17
- {spiderforce4ai-2.4.dist-info → spiderforce4ai-2.4.1.dist-info}/METADATA +1 -1
- spiderforce4ai-2.4.1.dist-info/RECORD +7 -0
- spiderforce4ai-2.4.dist-info/RECORD +0 -7
- {spiderforce4ai-2.4.dist-info → spiderforce4ai-2.4.1.dist-info}/WHEEL +0 -0
- {spiderforce4ai-2.4.dist-info → spiderforce4ai-2.4.1.dist-info}/entry_points.txt +0 -0
- {spiderforce4ai-2.4.dist-info → spiderforce4ai-2.4.1.dist-info}/top_level.txt +0 -0
spiderforce4ai/__init__.py
CHANGED
@@ -576,8 +576,11 @@ class SpiderForce4AI:
|
|
576
576
|
# Set up concurrency control
|
577
577
|
semaphore = asyncio.Semaphore(config.max_concurrent_requests)
|
578
578
|
|
579
|
+
# Semaphore for crawling
|
580
|
+
crawl_semaphore = asyncio.Semaphore(config.max_concurrent_requests)
|
581
|
+
|
579
582
|
async def crawl_with_semaphore(url):
|
580
|
-
async with
|
583
|
+
async with crawl_semaphore:
|
581
584
|
result = await crawl_with_progress(url)
|
582
585
|
await asyncio.sleep(config.request_delay)
|
583
586
|
return result
|
@@ -606,9 +609,36 @@ class SpiderForce4AI:
|
|
606
609
|
results[i] = retry_result
|
607
610
|
break
|
608
611
|
|
612
|
+
# Process LLM requests sequentially after all crawling is complete
|
613
|
+
if config.post_extraction_agent:
|
614
|
+
console.print("\n[cyan]Processing content with LLM...[/cyan]")
|
615
|
+
llm_task = progress.add_task("[cyan]LLM Processing...", total=len([r for r in results if r.status == "success"]))
|
616
|
+
|
617
|
+
post_config = PostExtractionConfig(
|
618
|
+
model=config.post_extraction_agent["model"],
|
619
|
+
messages=config.post_extraction_agent["messages"],
|
620
|
+
api_key=config.post_extraction_agent["api_key"],
|
621
|
+
max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
|
622
|
+
temperature=config.post_extraction_agent.get("temperature", 0.7),
|
623
|
+
base_url=config.post_extraction_agent.get("base_url"),
|
624
|
+
combine_output=bool(config.post_extraction_agent_save_to_file),
|
625
|
+
output_file=config.post_extraction_agent_save_to_file,
|
626
|
+
custom_transform_function=config.post_agent_transformer_function
|
627
|
+
)
|
628
|
+
agent = PostExtractionAgent(post_config)
|
629
|
+
|
630
|
+
for result in results:
|
631
|
+
if result.status == "success":
|
632
|
+
try:
|
633
|
+
result.extraction_result = await agent.process_content(result.url, result.markdown)
|
634
|
+
progress.update(llm_task, advance=1)
|
635
|
+
except Exception as e:
|
636
|
+
console.print(f"[red]Error in post-extraction processing for {result.url}: {str(e)}[/red]")
|
637
|
+
|
609
638
|
# Calculate final statistics
|
610
639
|
final_successful = len([r for r in results if r.status == "success"])
|
611
640
|
final_failed = len([r for r in results if r.status == "failed"])
|
641
|
+
llm_successful = len([r for r in results if r.extraction_result is not None])
|
612
642
|
|
613
643
|
# Update retry stats
|
614
644
|
self._retry_stats = {
|
@@ -616,7 +646,7 @@ class SpiderForce4AI:
|
|
616
646
|
"failure_ratio": failure_ratio,
|
617
647
|
"retry_successful": retry_successful if initial_failed > 0 else 0,
|
618
648
|
"retry_failed": final_failed,
|
619
|
-
"
|
649
|
+
"llm_successful": llm_successful
|
620
650
|
}
|
621
651
|
|
622
652
|
# Print summary
|
@@ -894,4 +924,4 @@ class SpiderForce4AI:
|
|
894
924
|
# Version info
|
895
925
|
#__version__ = "2.3.1"
|
896
926
|
#__author__ = "Piotr Tamulewicz"
|
897
|
-
#__email__ = "pt@petertam.pro"
|
927
|
+
#__email__ = "pt@petertam.pro"
|
@@ -176,23 +176,38 @@ class PostExtractionAgent:
|
|
176
176
|
for msg in self.config.messages
|
177
177
|
]
|
178
178
|
|
179
|
-
# Make LLM request
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
max_tokens=self.config.max_tokens,
|
184
|
-
temperature=self.config.temperature,
|
185
|
-
api_key=self.config.api_key,
|
186
|
-
api_base=self.config.base_url
|
187
|
-
)
|
179
|
+
# Make LLM request with retries
|
180
|
+
max_retries = 3
|
181
|
+
retry_delay = 1.0
|
182
|
+
last_error = None
|
188
183
|
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
184
|
+
for attempt in range(max_retries):
|
185
|
+
try:
|
186
|
+
response = await completion(
|
187
|
+
model=self.config.model,
|
188
|
+
messages=messages,
|
189
|
+
max_tokens=self.config.max_tokens,
|
190
|
+
temperature=self.config.temperature,
|
191
|
+
api_key=self.config.api_key,
|
192
|
+
api_base=self.config.base_url
|
193
|
+
)
|
194
|
+
|
195
|
+
# Parse response
|
196
|
+
extracted_data = json.loads(response.choices[0].message.content)
|
197
|
+
self.buffer.remove_request(url) # Remove from buffer if successful
|
198
|
+
return extracted_data
|
199
|
+
|
200
|
+
except json.JSONDecodeError as e:
|
201
|
+
last_error = f"Invalid JSON response from LLM: {e}"
|
202
|
+
if attempt < max_retries - 1:
|
203
|
+
await asyncio.sleep(retry_delay * (attempt + 1))
|
204
|
+
except Exception as e:
|
205
|
+
last_error = str(e)
|
206
|
+
if attempt < max_retries - 1:
|
207
|
+
await asyncio.sleep(retry_delay * (attempt + 1))
|
208
|
+
|
209
|
+
# If we get here, all retries failed
|
210
|
+
raise Exception(last_error)
|
196
211
|
|
197
212
|
except Exception as e:
|
198
213
|
logger.error(f"Error processing {url}: {str(e)}")
|
@@ -256,4 +271,4 @@ class PostExtractionAgent:
|
|
256
271
|
"failed_requests": len(self.buffer.get_failed_requests()),
|
257
272
|
"retryable_requests": len(self.buffer.get_retryable_requests(self.config.max_retries)),
|
258
273
|
"success_rate": len(self.results) / (len(self.results) + len(self.buffer.get_failed_requests())) * 100 if self.results else 0
|
259
|
-
}
|
274
|
+
}
|
@@ -0,0 +1,7 @@
|
|
1
|
+
spiderforce4ai/__init__.py,sha256=IjoJSE-7PX8zxBF0Pl1ELQUraLU3agAtY_J6NvQSPf4,40533
|
2
|
+
spiderforce4ai/post_extraction_agent.py,sha256=m00-y0SCoutUnxsMwHxPaW-qRm4o5alQWjggDStUSrg,11249
|
3
|
+
spiderforce4ai-2.4.1.dist-info/METADATA,sha256=xVm-JdLz6Kx73Bi0DA1QG6D9Ya_OLqWd_80PNWHXLsA,9012
|
4
|
+
spiderforce4ai-2.4.1.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
5
|
+
spiderforce4ai-2.4.1.dist-info/entry_points.txt,sha256=ibARQxOlDiL1ho12zbDZt4Uq5RKSIk_qk159ZlZ46hc,59
|
6
|
+
spiderforce4ai-2.4.1.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
|
7
|
+
spiderforce4ai-2.4.1.dist-info/RECORD,,
|
@@ -1,7 +0,0 @@
|
|
1
|
-
spiderforce4ai/__init__.py,sha256=JClWyqGGCVC6yxuK4TpJ7a-7iP6ueD20oKc0ERHxnyU,38701
|
2
|
-
spiderforce4ai/post_extraction_agent.py,sha256=yZ17xdOtkNMDRGqqudNBZIb6N9bcsjOwbzPB6D5kJHg,10540
|
3
|
-
spiderforce4ai-2.4.dist-info/METADATA,sha256=UNtth74KAHCNOngozhN2es3z4vY6J7SiKfTaIi0fYTI,9010
|
4
|
-
spiderforce4ai-2.4.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
5
|
-
spiderforce4ai-2.4.dist-info/entry_points.txt,sha256=ibARQxOlDiL1ho12zbDZt4Uq5RKSIk_qk159ZlZ46hc,59
|
6
|
-
spiderforce4ai-2.4.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
|
7
|
-
spiderforce4ai-2.4.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|