entari-plugin-hyw 3.5.0rc1__py3-none-any.whl → 3.5.0rc2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of entari-plugin-hyw might be problematic.
- entari_plugin_hyw/__init__.py +77 -82
- entari_plugin_hyw/assets/card-dist/index.html +360 -99
- entari_plugin_hyw/card-ui/src/App.vue +246 -52
- entari_plugin_hyw/card-ui/src/components/MarkdownContent.vue +122 -67
- entari_plugin_hyw/card-ui/src/components/StageCard.vue +46 -26
- entari_plugin_hyw/card-ui/src/test_regex.js +103 -0
- entari_plugin_hyw/card-ui/src/types.ts +1 -0
- entari_plugin_hyw/{core/history.py → history.py} +25 -1
- entari_plugin_hyw/image_cache.py +283 -0
- entari_plugin_hyw/{core/pipeline.py → pipeline.py} +102 -27
- entari_plugin_hyw/{utils/prompts.py → prompts.py} +7 -24
- entari_plugin_hyw/render_vue.py +314 -0
- entari_plugin_hyw/{utils/search.py → search.py} +227 -10
- {entari_plugin_hyw-3.5.0rc1.dist-info → entari_plugin_hyw-3.5.0rc2.dist-info}/METADATA +1 -1
- {entari_plugin_hyw-3.5.0rc1.dist-info → entari_plugin_hyw-3.5.0rc2.dist-info}/RECORD +18 -29
- entari_plugin_hyw/core/__init__.py +0 -0
- entari_plugin_hyw/core/config.py +0 -35
- entari_plugin_hyw/core/hyw.py +0 -48
- entari_plugin_hyw/core/render_vue.py +0 -255
- entari_plugin_hyw/test_output/render_0.jpg +0 -0
- entari_plugin_hyw/test_output/render_1.jpg +0 -0
- entari_plugin_hyw/test_output/render_2.jpg +0 -0
- entari_plugin_hyw/test_output/render_3.jpg +0 -0
- entari_plugin_hyw/test_output/render_4.jpg +0 -0
- entari_plugin_hyw/tests/ui_test_output.jpg +0 -0
- entari_plugin_hyw/tests/verify_ui.py +0 -139
- entari_plugin_hyw/utils/__init__.py +0 -2
- entari_plugin_hyw/utils/browser.py +0 -40
- entari_plugin_hyw/utils/playwright_tool.py +0 -36
- /entari_plugin_hyw/{utils/misc.py → misc.py} +0 -0
- {entari_plugin_hyw-3.5.0rc1.dist-info → entari_plugin_hyw-3.5.0rc2.dist-info}/WHEEL +0 -0
- {entari_plugin_hyw-3.5.0rc1.dist-info → entari_plugin_hyw-3.5.0rc2.dist-info}/top_level.txt +0 -0
entari_plugin_hyw/image_cache.py (new file)

@@ -0,0 +1,283 @@
+"""
+Image Caching Module for Pre-downloading Images
+
+This module provides async image pre-download functionality to reduce render time.
+Images are downloaded in the background when search results are obtained,
+and cached as base64 data URLs for instant use during rendering.
+"""
+
+import asyncio
+import base64
+import hashlib
+from typing import Dict, List, Optional, Any
+from loguru import logger
+import httpx
+
+
+
+class ImageCache:
+    """
+    Async image cache that pre-downloads images as base64.
+
+    Usage:
+        cache = ImageCache()
+
+        # Start pre-downloading images (non-blocking)
+        cache.start_prefetch(image_urls)
+
+        # Later, get cached image (blocking if not ready)
+        cached_url = await cache.get_cached(url)  # Returns data:image/... or original URL
+    """
+
+    def __init__(
+        self,
+        max_size_kb: int = 500,   # Max image size to cache (KB)
+        timeout: float = 5.0,     # Download timeout per image
+        max_concurrent: int = 6,  # Max concurrent downloads
+    ):
+        self.max_size_bytes = max_size_kb * 1024
+        self.timeout = timeout
+        self.max_concurrent = max_concurrent
+
+        # Cache storage: url -> base64_data_url or None (if failed)
+        self._cache: Dict[str, Optional[str]] = {}
+        # Pending downloads: url -> asyncio.Task
+        self._pending: Dict[str, asyncio.Task] = {}
+        # Semaphore for concurrent downloads
+        self._semaphore = asyncio.Semaphore(max_concurrent)
+        # Lock for cache access
+        self._lock = asyncio.Lock()
+
+    def start_prefetch(self, urls: List[str]) -> None:
+        """
+        Start pre-downloading images in the background (non-blocking).
+
+        Args:
+            urls: List of image URLs to prefetch
+        """
+        if not httpx:
+            logger.warning("ImageCache: httpx not installed, prefetch disabled")
+            return
+
+        for url in urls:
+            if not url or not url.startswith("http"):
+                continue
+            if url in self._cache or url in self._pending:
+                continue
+
+            # Create background task
+            task = asyncio.create_task(self._download_image(url))
+            self._pending[url] = task
+
+    async def _download_image(self, url: str) -> Optional[str]:
+        """
+        Download a single image and convert to base64.
+
+        Returns:
+            Base64 data URL or None if failed/too large
+        """
+        async with self._semaphore:
+            try:
+                async with httpx.AsyncClient(timeout=self.timeout, follow_redirects=True) as client:
+                    resp = await client.get(url, headers={
+                        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36"
+                    })
+                    resp.raise_for_status()
+
+                    # Check content length
+                    content_length = resp.headers.get("content-length")
+                    if content_length and int(content_length) > self.max_size_bytes:
+                        logger.debug(f"ImageCache: Skipping {url} (too large: {content_length} bytes)")
+                        async with self._lock:
+                            self._cache[url] = None
+                            self._pending.pop(url, None)
+                        return None
+
+                    # Read content
+                    content = resp.content
+                    if len(content) > self.max_size_bytes:
+                        logger.debug(f"ImageCache: Skipping {url} (content too large: {len(content)} bytes)")
+                        async with self._lock:
+                            self._cache[url] = None
+                            self._pending.pop(url, None)
+                        return None
+
+                    # Determine MIME type
+                    content_type = resp.headers.get("content-type", "").lower()
+                    if "jpeg" in content_type or "jpg" in content_type:
+                        mime = "image/jpeg"
+                    elif "png" in content_type:
+                        mime = "image/png"
+                    elif "gif" in content_type:
+                        mime = "image/gif"
+                    elif "webp" in content_type:
+                        mime = "image/webp"
+                    elif "svg" in content_type:
+                        mime = "image/svg+xml"
+                    else:
+                        # Try to infer from URL
+                        url_lower = url.lower()
+                        if ".jpg" in url_lower or ".jpeg" in url_lower:
+                            mime = "image/jpeg"
+                        elif ".png" in url_lower:
+                            mime = "image/png"
+                        elif ".gif" in url_lower:
+                            mime = "image/gif"
+                        elif ".webp" in url_lower:
+                            mime = "image/webp"
+                        elif ".svg" in url_lower:
+                            mime = "image/svg+xml"
+                        else:
+                            mime = "image/jpeg"  # Default fallback
+
+                    # Encode to base64
+                    b64 = base64.b64encode(content).decode("utf-8")
+                    data_url = f"data:{mime};base64,{b64}"
+
+                    async with self._lock:
+                        self._cache[url] = data_url
+                        self._pending.pop(url, None)
+
+                    logger.debug(f"ImageCache: Cached {url} ({len(content)} bytes)")
+                    return data_url
+
+            except asyncio.TimeoutError:
+                logger.debug(f"ImageCache: Timeout downloading {url}")
+            except Exception as e:
+                logger.debug(f"ImageCache: Failed to download {url}: {e}")
+
+            async with self._lock:
+                self._cache[url] = None
+                self._pending.pop(url, None)
+            return None
+
+    async def get_cached(self, url: str, wait: bool = True, wait_timeout: float = 3.0) -> str:
+        """
+        Get cached image data URL, or original URL if not cached.
+
+        Args:
+            url: Original image URL
+            wait: If True, wait for pending download to complete
+            wait_timeout: Max time to wait for pending download
+
+        Returns:
+            Cached data URL or original URL
+        """
+        if not url:
+            return url
+
+        # Check if already cached
+        async with self._lock:
+            if url in self._cache:
+                cached = self._cache[url]
+                return cached if cached else url  # Return original if cached as None (failed)
+
+            pending_task = self._pending.get(url)
+
+        # Wait for pending download if requested
+        if pending_task and wait:
+            try:
+                await asyncio.wait_for(asyncio.shield(pending_task), timeout=wait_timeout)
+                async with self._lock:
+                    cached = self._cache.get(url)
+                return cached if cached else url
+            except asyncio.TimeoutError:
+                logger.debug(f"ImageCache: Timeout waiting for {url}")
+                return url
+            except Exception:
+                return url
+
+        return url
+
+    async def get_all_cached(self, urls: List[str], wait_timeout: float = 3.0) -> Dict[str, str]:
+        """
+        Get cached URLs for multiple images.
+
+        Args:
+            urls: List of original URLs
+            wait_timeout: Max time to wait for all pending downloads
+
+        Returns:
+            Dict mapping original URL to cached data URL (or original if not cached)
+        """
+        result = {}
+
+        # Wait for all pending downloads first
+        pending_tasks = []
+        async with self._lock:
+            for url in urls:
+                if url in self._pending:
+                    pending_tasks.append(self._pending[url])
+
+        if pending_tasks:
+            try:
+                await asyncio.wait_for(
+                    asyncio.gather(*pending_tasks, return_exceptions=True),
+                    timeout=wait_timeout
+                )
+            except asyncio.TimeoutError:
+                logger.debug(f"ImageCache: Timeout waiting for batch download")
+
+        # Collect results
+        for url in urls:
+            async with self._lock:
+                cached = self._cache.get(url)
+                result[url] = cached if cached else url
+
+        return result
+
+    def get_stats(self) -> Dict[str, Any]:
+        """Get cache statistics."""
+        cached_count = sum(1 for v in self._cache.values() if v is not None)
+        failed_count = sum(1 for v in self._cache.values() if v is None)
+        return {
+            "cached": cached_count,
+            "failed": failed_count,
+            "pending": len(self._pending),
+            "total": len(self._cache) + len(self._pending),
+        }
+
+    def clear(self) -> None:
+        """Clear all cached data."""
+        self._cache.clear()
+        for task in self._pending.values():
+            task.cancel()
+        self._pending.clear()
+
+
+# Global cache instance for reuse across requests
+_global_cache: Optional[ImageCache] = None
+
+
+def get_image_cache() -> ImageCache:
+    """Get or create the global image cache instance."""
+    global _global_cache
+    if _global_cache is None:
+        _global_cache = ImageCache()
+    return _global_cache
+
+
+async def prefetch_images(urls: List[str]) -> None:
+    """
+    Convenience function to start prefetching images.
+
+    Args:
+        urls: List of image URLs to prefetch
+    """
+    cache = get_image_cache()
+    cache.start_prefetch(urls)
+
+
+async def get_cached_images(urls: List[str], wait_timeout: float = 3.0) -> Dict[str, str]:
+    """
+    Convenience function to get cached images.
+
+    Args:
+        urls: List of original URLs
+        wait_timeout: Max time to wait
+
+    Returns:
+        Dict mapping original URL to cached data URL
+    """
+    cache = get_image_cache()
+    return await cache.get_all_cached(urls, wait_timeout=wait_timeout)
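
The module exposes prefetch_images() and get_cached_images() as its public entry points. A minimal usage sketch, assuming a caller that receives image URLs from a search step and resolves them just before rendering; the handle_query function and its call sites are illustrative, not part of the package:

```python
import asyncio
from typing import Dict, List

# Import path matches the module added in this release.
from entari_plugin_hyw.image_cache import prefetch_images, get_cached_images


async def handle_query(image_urls: List[str]) -> Dict[str, str]:
    """Illustrative caller: prefetch early, resolve just before rendering."""
    # Kick off background downloads as soon as search results are available
    # (start_prefetch only schedules tasks, so this returns immediately).
    await prefetch_images(image_urls)

    # ... LLM / agent steps would run here while downloads proceed ...
    await asyncio.sleep(0)  # placeholder for the rest of the pipeline

    # Resolve each URL to a base64 data URL if its download finished within
    # the timeout; otherwise the original URL is returned unchanged.
    return await get_cached_images(image_urls, wait_timeout=3.0)
```
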
entari_plugin_hyw/{core/pipeline.py → pipeline.py}

@@ -1,6 +1,7 @@
 import asyncio
 import html
 import json
+import re
 import time
 from contextlib import asynccontextmanager
 from typing import Any, Dict, List, Optional, Tuple
@@ -8,16 +9,14 @@ from typing import Any, Dict, List, Optional, Tuple
 from loguru import logger
 from openai import AsyncOpenAI
 
-from .
-from
-from
+from .search import SearchService
+from .image_cache import get_cached_images
+from .prompts import (
     AGENT_SP,
     AGENT_SP_INSTRUCT_VISION_ADD,
     AGENT_SP_TOOLS_STANDARD_ADD,
     AGENT_SP_TOOLS_AGENT_ADD,
     AGENT_SP_SEARCH_ADD,
-    AGENT_SP_PAGE_ADD,
-    AGENT_SP_IMAGE_SEARCH_ADD,
     INSTRUCT_SP,
     INSTRUCT_SP_VISION_ADD,
     VISION_SP,
@@ -33,7 +32,7 @@ class ProcessingPipeline:
     Core pipeline (vision -> instruct/search -> agent).
     """
 
-    def __init__(self, config:
+    def __init__(self, config: Any):
         self.config = config
         self.search_service = SearchService(config)
         self.client = AsyncOpenAI(base_url=self.config.base_url, api_key=self.config.api_key)
@@ -120,12 +119,9 @@ class ProcessingPipeline:
         final_response_content = ""
         structured: Dict[str, Any] = {}
 
-        # Reset search cache and ID
+        # Reset search cache and ID counter for this execution
         self.all_web_results = []
         self.global_id_counter = 0
-        self.search_id_counter = 0
-        self.page_id_counter = 0
-        self.image_id_counter = 0
 
         try:
             logger.info(f"Pipeline: Starting workflow for '{user_input}' using {active_model}")
@@ -189,7 +185,8 @@ class ProcessingPipeline:
             vision_text=vision_text,
             model=instruct_model,
         )
-
+        # Instruct time excludes search time (search_time is returned separately)
+        instruct_time = time.time() - instruct_start - search_time
 
         # Calculate Instruct Cost
         instruct_cost = 0.0
@@ -266,17 +263,18 @@ class ProcessingPipeline:
         if vision_text:
             system_prompt += AGENT_SP_INSTRUCT_VISION_ADD.format(vision_msgs=vision_text)
 
-        # Append search results
-        if has_search_results and search_msgs_text:
-            system_prompt += AGENT_SP_SEARCH_ADD.format(search_msgs=search_msgs_text)
-
-        # Append crawled page content
+        # Append all search results (text, page, image) in one block
         page_msgs_text = self._format_page_msgs()
+        all_search_parts = []
+        if has_search_results and search_msgs_text:
+            all_search_parts.append(search_msgs_text)
         if page_msgs_text:
-
-
+            all_search_parts.append(page_msgs_text)
         if has_image_results and image_msgs_text:
-
+            all_search_parts.append(image_msgs_text)
+
+        if all_search_parts:
+            system_prompt += AGENT_SP_SEARCH_ADD.format(search_msgs="\n".join(all_search_parts))
 
         last_system_prompt = system_prompt
@@ -332,6 +330,7 @@ class ProcessingPipeline:
                 "tool_results": [],
                 "tool_time": tool_exec_time,
                 "llm_time": step_llm_time,
+                "usage": step_usage,
             }
             for i, result in enumerate(results):
                 tc = tool_calls[i]
@@ -516,12 +515,18 @@ class ProcessingPipeline:
         for s in steps:
             if "tool_calls" in s:
                 # 1. Agent Thought Stage (with LLM time)
+                # Calculate step cost
+                step_usage = s.get("usage", {})
+                step_cost = 0.0
+                if a_in_price > 0 or a_out_price > 0:
+                    step_cost = (step_usage.get("input_tokens", 0) / 1_000_000 * a_in_price) + (step_usage.get("output_tokens", 0) / 1_000_000 * a_out_price)
+
                 stages_used.append({
                     "name": "Agent",
                     "model": a_model,
                     "icon_config": agent_icon,
                     "provider": agent_provider,
-                    "time": s.get("llm_time", 0), "cost":
+                    "time": s.get("llm_time", 0), "cost": step_cost
                 })
 
                 # 2. Grouped Tool Stages
@@ -602,21 +607,30 @@ class ProcessingPipeline:
             })
 
         # Assign total time/cost to last Agent stage
-
-
-            last_agent["time"] = a.get("time", 0)
-            last_agent["cost"] = a.get("cost", 0.0)
+        # Sum up total time/cost for UI/stats (implicit via loop above)
+        # No need to assign everything to last agent anymore as we distribute it.
 
         # --- Final Filter: Only show cited items in workflow cards ---
         cited_urls = {ref['url'] for ref in (structured.get("references", []) +
                                              structured.get("page_references", []) +
                                              structured.get("image_references", []))}
 
+        # Find images already rendered in markdown content (to avoid duplicate display)
+        markdown_image_urls = set()
+        md_img_pattern = re.compile(r'!\[.*?\]\((https?://[^)]+)\)')
+        for match in md_img_pattern.finditer(final_content):
+            markdown_image_urls.add(match.group(1))
+
         for s in stages_used:
             if "references" in s and s["references"]:
                 s["references"] = [r for r in s["references"] if r.get("url") in cited_urls]
-            #
-            #
+            # Filter out images already shown in markdown content
+            # Check both url AND thumbnail since either might be used in markdown
+            if "image_references" in s and s["image_references"]:
+                s["image_references"] = [
+                    r for r in s["image_references"]
+                    if r.get("url") not in markdown_image_urls and (r.get("thumbnail") or "") not in markdown_image_urls
+                ]
             if "crawled_pages" in s and s["crawled_pages"]:
                 s["crawled_pages"] = [r for r in s["crawled_pages"] if r.get("url") in cited_urls]
@@ -633,6 +647,67 @@ class ProcessingPipeline:
         # Update the reference (since it might be used by caller)
         current_history[:] = cleaned_history
 
+        # --- Apply cached images to reduce render time ---
+        # Collect all image URLs that need caching (avoid duplicates when thumbnail == url)
+        all_image_urls = set()
+        for img_ref in structured.get("image_references", []):
+            if img_ref.get("thumbnail"):
+                all_image_urls.add(img_ref["thumbnail"])
+            if img_ref.get("url"):
+                all_image_urls.add(img_ref["url"])
+
+        for stage in stages_used:
+            for img_ref in stage.get("image_references", []):
+                if img_ref.get("thumbnail"):
+                    all_image_urls.add(img_ref["thumbnail"])
+                if img_ref.get("url"):
+                    all_image_urls.add(img_ref["url"])
+
+        # Also collect image URLs from markdown content
+        markdown_img_pattern = re.compile(r'!\[.*?\]\((https?://[^)]+)\)')
+        markdown_urls = markdown_img_pattern.findall(final_content)
+        all_image_urls.update(markdown_urls)
+
+        # Get cached versions (waits for pending downloads, with timeout)
+        if all_image_urls:
+            try:
+                cached_map = await get_cached_images(list(all_image_urls), wait_timeout=3.0)
+
+                # Apply cached URLs to structured response
+                for img_ref in structured.get("image_references", []):
+                    if img_ref.get("thumbnail") and img_ref["thumbnail"] in cached_map:
+                        img_ref["thumbnail"] = cached_map[img_ref["thumbnail"]]
+                    if img_ref.get("url") and img_ref["url"] in cached_map:
+                        img_ref["url"] = cached_map[img_ref["url"]]
+
+                # Apply cached URLs to stages
+                for stage in stages_used:
+                    for img_ref in stage.get("image_references", []):
+                        if img_ref.get("thumbnail") and img_ref["thumbnail"] in cached_map:
+                            img_ref["thumbnail"] = cached_map[img_ref["thumbnail"]]
+                        if img_ref.get("url") and img_ref["url"] in cached_map:
+                            img_ref["url"] = cached_map[img_ref["url"]]
+
+                # Replace image URLs in markdown content with cached versions
+                def replace_markdown_img(match):
+                    full_match = match.group(0)
+                    url = match.group(1)
+                    cached_url = cached_map.get(url)
+                    if cached_url and cached_url != url:
+                        return full_match.replace(url, cached_url)
+                    return full_match
+
+                final_content = markdown_img_pattern.sub(replace_markdown_img, final_content)
+                structured["response"] = markdown_img_pattern.sub(replace_markdown_img, structured.get("response", ""))
+
+                # Log cache stats
+                from .image_cache import get_image_cache
+                cache_stats = get_image_cache().get_stats()
+                logger.info(f"ImageCache stats: {cache_stats}")
+
+            except Exception as e:
+                logger.warning(f"Failed to apply image cache: {e}")
+
         return {
             "llm_response": final_content,
             "structured_response": structured,
@@ -1128,4 +1203,4 @@ class ProcessingPipeline:
         except Exception:
             pass
         # Do NOT close shared crawler here, as pipeline instances are now per-request.
-        # Shared crawler lifecycle is managed
+        # Shared crawler lifecycle is managed globally.
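
The markdown rewrite above is the core of the pipeline change: once get_cached_images() has resolved URLs, every ![alt](http...) image whose URL maps to a base64 data URL is swapped in place before rendering. A standalone sketch of that substitution, using the same regex and replacement strategy as the diff; the sample markdown and cached_map contents are made up:

```python
import re

# Same pattern as in pipeline.py: captures the URL inside a markdown image.
md_img_pattern = re.compile(r'!\[.*?\]\((https?://[^)]+)\)')

final_content = "Intro text ![cat](https://example.com/cat.jpg) more text"
cached_map = {"https://example.com/cat.jpg": "data:image/jpeg;base64,/9j..."}  # made-up value


def replace_markdown_img(match: re.Match) -> str:
    full_match = match.group(0)
    url = match.group(1)
    cached_url = cached_map.get(url)
    if cached_url and cached_url != url:
        # Swap the remote URL for the pre-downloaded data URL.
        return full_match.replace(url, cached_url)
    return full_match


print(md_img_pattern.sub(replace_markdown_img, final_content))
# -> Intro text ![cat](data:image/jpeg;base64,/9j...) more text
```
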
entari_plugin_hyw/{utils/prompts.py → prompts.py}

@@ -34,7 +34,7 @@ INSTRUCT_SP = """# 你是一个专业的指导专家.
 {tools_desc}
 
 ## 你的回复
-调用工具后无需回复额外文本节省token.
+调用工具后无需回复额外文本节省 token.
 
 ## 用户消息
 ```
@@ -61,10 +61,11 @@ AGENT_SP = """# 你是一个 Agent 总控专家, 你需要理解用户意图,
 - 正文格式:
   - 先给出一个 `# `大标题约 8-10 个字, 不要有多余废话, 不要直接回答用户的提问.
   - 然后紧接着给出一个 <summary>...</summary>, 除了给出一个约 100 字的纯文本简介, 介绍本次输出的长文的清晰、重点概括.
-  -
-
-
-
+  - 随后开始详细二级标题 + markdown 正文, 语言描绘格式丰富多样, 简洁准确可信.
+  - 请不要给出过长的代码、表格列数等, 请控制字数在 600 字内, 只讲重点和准确的数据.
+  - 不支持渲染: 链接, 图片链接, mermaid
+  - 支持渲染: 公式, 代码高亮, 只在需要的时候给出.
+  - 图片链接、链接框架会自动渲染出, 你无需显式给出.
 - 引用:
   > 重要: 所有正文内容必须基于实际信息, 保证百分百真实度
   - 信息来源已按获取顺序编号为 [1], [2], [3]...
@@ -98,24 +99,6 @@ AGENT_SP_INSTRUCT_VISION_ADD = """
 """
 
 AGENT_SP_SEARCH_ADD = """
-##
-```text
+## 联网信息
 {search_msgs}
-```
-"""
-
-AGENT_SP_PAGE_ADD = """
-## 页面内容专家消息
-```text
-{page_msgs}
-```
-- 引用页面内容时, 必须使用 `page:id` 格式
-"""
-
-AGENT_SP_IMAGE_SEARCH_ADD = """
-## 图像搜索专家消息
-```text
-{image_search_msgs}
-```
-- 每进行一次 internal_image_search, 挑选 1 张图像插入正文
 """